CYF200127 committed
Commit 5e9bd47 · verified · 1 Parent(s): 23b9d28

Upload 116 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. app.py +131 -0
  2. examples/exp.png +0 -0
  3. examples/reaction1.png +0 -0
  4. examples/reaction2.png +0 -0
  5. examples/reaction3.png +0 -0
  6. examples/reaction4.png +0 -0
  7. getReaction.py +78 -0
  8. molscribe/__init__.py +1 -0
  9. molscribe/__pycache__/__init__.cpython-310.pyc +0 -0
  10. molscribe/__pycache__/augment.cpython-310.pyc +0 -0
  11. molscribe/__pycache__/chemistry.cpython-310.pyc +0 -0
  12. molscribe/__pycache__/constants.cpython-310.pyc +0 -0
  13. molscribe/__pycache__/dataset.cpython-310.pyc +0 -0
  14. molscribe/__pycache__/evaluate.cpython-310.pyc +0 -0
  15. molscribe/__pycache__/interface.cpython-310.pyc +0 -0
  16. molscribe/__pycache__/loss.cpython-310.pyc +0 -0
  17. molscribe/__pycache__/model.cpython-310.pyc +0 -0
  18. molscribe/__pycache__/tokenizer.cpython-310.pyc +0 -0
  19. molscribe/__pycache__/utils.cpython-310.pyc +0 -0
  20. molscribe/augment.py +282 -0
  21. molscribe/chemistry.py +649 -0
  22. molscribe/constants.py +130 -0
  23. molscribe/dataset.py +594 -0
  24. molscribe/evaluate.py +79 -0
  25. molscribe/indigo/__init__.py +0 -0
  26. molscribe/indigo/__pycache__/__init__.cpython-310.pyc +0 -0
  27. molscribe/indigo/__pycache__/bingo.cpython-310.pyc +0 -0
  28. molscribe/indigo/__pycache__/inchi.cpython-310.pyc +0 -0
  29. molscribe/indigo/__pycache__/renderer.cpython-310.pyc +0 -0
  30. molscribe/indigo/bingo.py +334 -0
  31. molscribe/indigo/inchi.py +84 -0
  32. molscribe/indigo/renderer.py +113 -0
  33. molscribe/inference/__init__.py +4 -0
  34. molscribe/inference/__pycache__/__init__.cpython-310.pyc +0 -0
  35. molscribe/inference/__pycache__/beam_search.cpython-310.pyc +0 -0
  36. molscribe/inference/__pycache__/decode_strategy.cpython-310.pyc +0 -0
  37. molscribe/inference/__pycache__/greedy_search.cpython-310.pyc +0 -0
  38. molscribe/inference/beam_search.py +190 -0
  39. molscribe/inference/decode_strategy.py +63 -0
  40. molscribe/inference/greedy_search.py +128 -0
  41. molscribe/interface.py +223 -0
  42. molscribe/loss.py +125 -0
  43. molscribe/model.py +397 -0
  44. molscribe/tokenizer.py +524 -0
  45. molscribe/transformer/__init__.py +3 -0
  46. molscribe/transformer/__pycache__/__init__.cpython-310.pyc +0 -0
  47. molscribe/transformer/__pycache__/decoder.cpython-310.pyc +0 -0
  48. molscribe/transformer/__pycache__/embedding.cpython-310.pyc +0 -0
  49. molscribe/transformer/__pycache__/swin_transformer.cpython-310.pyc +0 -0
  50. molscribe/transformer/decoder.py +487 -0
app.py ADDED
@@ -0,0 +1,131 @@
+ import os
+ import gradio as gr
+ import json
+ from rxnim import RXNIM
+ from getReaction import generate_combined_image
+ import torch
+ from rxn.reaction import Reaction
+
+ PROMPT_DIR = "prompts/"
+ ckpt_path = "./rxn/model/model.ckpt"
+ model = Reaction(ckpt_path, device=torch.device('cpu'))
+
+ # Map prompt file names to friendly display names
+ PROMPT_NAMES = {
+     "2_RxnOCR.txt": "Reaction Image Parsing Workflow",
+ }
+ example_diagram = "examples/exp.png"
+
+ def list_prompt_files_with_names():
+     """
+     List all .txt files in the prompts directory, generating a default name
+     for any file that has none. Returns a {friendly_name: filename} mapping.
+     """
+     prompt_files = {}
+     for f in os.listdir(PROMPT_DIR):
+         if f.endswith(".txt"):
+             # Use the predefined name if the file has one
+             friendly_name = PROMPT_NAMES.get(f, f"Task: {os.path.splitext(f)[0]}")
+             prompt_files[friendly_name] = f
+     return prompt_files
+
+ def parse_reactions(output_json):
+     """
+     Parse reaction data in JSON format and produce formatted output with custom colors.
+     """
+     reactions_data = json.loads(output_json)  # convert the JSON string to a dict
+     reactions_list = reactions_data.get("reactions", [])
+     detailed_output = []
+
+     for reaction in reactions_list:
+         reaction_id = reaction.get("reaction_id", "Unknown ID")
+         reactants = [r.get("smiles", "Unknown") for r in reaction.get("reactants", [])]
+         conditions = [
+             f"<span style='color:red'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>"
+             for c in reaction.get("conditions", [])
+         ]
+         conditions_1 = [
+             f"<span style='color:black'>{c.get('smiles', c.get('text', 'Unknown'))}[{c.get('role', 'Unknown')}]</span>"
+             for c in reaction.get("conditions", [])
+         ]
+         products = [f"<span style='color:orange'>{p.get('smiles', 'Unknown')}</span>" for p in reaction.get("products", [])]
+         products_1 = [f"<span style='color:black'>{p.get('smiles', 'Unknown')}</span>" for p in reaction.get("products", [])]
+
+         # Build the full reaction string with custom font colors
+         full_reaction = f"{'.'.join(reactants)}>>{'.'.join(products_1)} | {', '.join(conditions_1)}"
+         full_reaction = f"<span style='color:black'>{full_reaction}</span>"
+
+         # Detailed formatted output for the reaction
+         reaction_output = f"<b>Reaction: </b> {reaction_id}<br>"
+         reaction_output += f"  Reactants: <span style='color:blue'>{', '.join(reactants)}</span><br>"
+         reaction_output += f"  Conditions: {', '.join(conditions)}<br>"
+         reaction_output += f"  Products: {', '.join(products)}<br>"
+         reaction_output += f"  <b>Full Reaction:</b> {full_reaction}<br>"
+         reaction_output += "<br>"
+         detailed_output.append(reaction_output)
+
+     return detailed_output
+
+ def process_chem_image(image, selected_task):
+     chem_mllm = RXNIM()
+
+     # Convert the friendly name back to the actual prompt file name
+     prompt_path = os.path.join(PROMPT_DIR, prompts_with_names[selected_task])
+     image_path = "temp_image.png"
+     image.save(image_path)
+
+     # Run RXNIM processing
+     rxnim_result = chem_mllm.process(image_path, prompt_path)
+
+     # Parse the JSON result into structured output
+     detailed_reactions = parse_reactions(rxnim_result)
+
+     # Run the RxnScribe model and build the combined visualization
+     predictions = model.predict_image_file(image_path, molscribe=True, ocr=True)
+     combined_image_path = generate_combined_image(predictions, image_path)
+
+     json_file_path = "output.json"
+     with open(json_file_path, "w") as json_file:
+         json.dump(json.loads(rxnim_result), json_file, indent=4)
+
+     # Return the detailed reactions and the combined image
+     return "\n\n".join(detailed_reactions), combined_image_path, example_diagram, json_file_path
+
+
+ # Get the prompts and their friendly names
+ prompts_with_names = list_prompt_files_with_names()
+
+ # Example data: image path + task choice
+ examples = [
+     ["examples/reaction1.png", "Reaction Image Parsing Workflow"],
+     ["examples/reaction2.png", "Reaction Image Parsing Workflow"],
+     ["examples/reaction3.png", "Reaction Image Parsing Workflow"],
+     ["examples/reaction4.png", "Reaction Image Parsing Workflow"],
+ ]
+
+ # Define the Gradio interface
+ demo = gr.Interface(
+     fn=process_chem_image,
+     inputs=[
+         gr.Image(type="pil", label="Upload Reaction Image"),
+         gr.Radio(
+             choices=list(prompts_with_names.keys()),  # show the task names
+             label="Select a predefined task",
+         ),
+     ],
+     outputs=[
+         gr.HTML(label="Reaction outputs"),
+         gr.Image(label="Visualization"),  # show the combined image
+         gr.Image(value=example_diagram, label="Schematic Diagram"),
+         gr.File(label="Download JSON File"),
+     ],
+     title="Towards Large-scale Chemical Reaction Image Parsing via a Multimodal Large Language Model",
+     description="Upload a reaction image and select a predefined task prompt.",
+     examples=examples,  # use the nested list as examples
+     examples_per_page=20,
+ )
+
+ demo.launch()
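For reference, parse_reactions consumes a JSON string shaped like the hand-written sketch below. The field names are taken from the accesses in the function above; the values are purely illustrative, not actual model output:

    import json

    example = {
        "reactions": [{
            "reaction_id": "1",
            "reactants": [{"smiles": "CCO"}],
            "conditions": [{"text": "HCl", "role": "reagent"}],  # conditions may carry "smiles" or "text"
            "products": [{"smiles": "CCCl"}],
        }]
    }
    html_blocks = parse_reactions(json.dumps(example))  # list of HTML strings, one per reaction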
examples/exp.png ADDED
examples/reaction1.png ADDED
examples/reaction2.png ADDED
examples/reaction3.png ADDED
examples/reaction4.png ADDED
getReaction.py ADDED
@@ -0,0 +1,78 @@
+ import sys
+ sys.path.append('./rxn/')
+ import torch
+ from rxn.reaction import Reaction
+ import json
+ from matplotlib import pyplot as plt
+ import numpy as np
+
+ ckpt_path = "./rxn/model/model.ckpt"
+ model = Reaction(ckpt_path, device=torch.device('cpu'))
+ device = torch.device('cpu')
+
+ def get_reaction(image_path: str) -> str:
+     '''Returns a JSON string with the reactions extracted from the image.'''
+     image_file = image_path
+     return json.dumps(model.predict_image_file(image_file, molscribe=True, ocr=True))
+
+
+ def generate_combined_image(predictions, image_file):
+     """
+     Combine the prediction images into a single, symmetric layout.
+     """
+     output = model.draw_predictions(predictions, image_file=image_file)
+     n_images = len(output)
+     if n_images == 1:
+         n_cols = 1
+     elif n_images == 2:
+         n_cols = 2
+     else:
+         n_cols = 3
+     n_rows = (n_images + n_cols - 1) // n_cols  # number of rows needed
+
+     # Make sure every image has the expected format
+     processed_images = []
+     for img in output:
+         if len(img.shape) == 2:  # grayscale image
+             img = np.stack([img] * 3, axis=-1)  # convert to RGB
+         elif img.shape[2] > 3:  # RGBA image
+             img = img[:, :, :3]  # keep only the RGB channels
+         if img.dtype == np.float32 or img.dtype == np.float64:
+             img = (img * 255).astype(np.uint8)  # convert to uint8
+         processed_images.append(img)
+     output = processed_images
+
+     # Add white placeholders for the unused subplot slots
+     if n_images < n_rows * n_cols:
+         blank_image = np.ones_like(output[0]) * 255  # white placeholder image
+         while len(output) < n_rows * n_cols:
+             output.append(blank_image)
+
+     # Create the subplot canvas
+     fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
+
+     # Make sure axes is a flat, one-dimensional array
+     if isinstance(axes, np.ndarray):
+         axes = axes.flatten()
+     else:
+         axes = [axes]  # single-subplot case
+
+     # Draw each image
+     for idx, img in enumerate(output):
+         ax = axes[idx]
+         ax.imshow(img)
+         ax.axis('off')
+         if idx < n_images:
+             ax.set_title(f"Reaction {idx + 1}")
+
+     # Delete the leftover subplots
+     for idx in range(n_images, len(axes)):
+         fig.delaxes(axes[idx])
+
+     # Save the combined image
+     combined_image_path = "combined_output.png"
+     plt.tight_layout()
+     plt.savefig(combined_image_path)
+     plt.close(fig)
+     return combined_image_path
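A minimal usage sketch (not part of the commit; assumes the checkpoint at ./rxn/model/model.ckpt is in place):

    from getReaction import model, generate_combined_image

    predictions = model.predict_image_file("examples/reaction1.png", molscribe=True, ocr=True)
    path = generate_combined_image(predictions, "examples/reaction1.png")
    print(path)  # "combined_output.png"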
molscribe/__init__.py ADDED
@@ -0,0 +1 @@
+ from .interface import MolScribe
molscribe/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes).
molscribe/__pycache__/augment.cpython-310.pyc ADDED
Binary file (8.98 kB).
molscribe/__pycache__/chemistry.cpython-310.pyc ADDED
Binary file (17.5 kB).
molscribe/__pycache__/constants.cpython-310.pyc ADDED
Binary file (6.18 kB).
molscribe/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (17.6 kB).
molscribe/__pycache__/evaluate.cpython-310.pyc ADDED
Binary file (3.48 kB).
molscribe/__pycache__/interface.cpython-310.pyc ADDED
Binary file (8.95 kB).
molscribe/__pycache__/loss.cpython-310.pyc ADDED
Binary file (4.25 kB).
molscribe/__pycache__/model.cpython-310.pyc ADDED
Binary file (13.2 kB).
molscribe/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (16.8 kB).
molscribe/__pycache__/utils.cpython-310.pyc ADDED
Binary file (6.33 kB).
molscribe/augment.py ADDED
@@ -0,0 +1,282 @@
+ import albumentations as A
+ from albumentations.augmentations.geometric.functional import safe_rotate_enlarged_img_size, _maybe_process_in_chunks, \
+     keypoint_rotate
+ import cv2
+ import math
+ import random
+ import numpy as np
+
+
+ def safe_rotate(
+         img: np.ndarray,
+         angle: int = 0,
+         interpolation: int = cv2.INTER_LINEAR,
+         value: int = None,
+         border_mode: int = cv2.BORDER_REFLECT_101,
+ ):
+
+     old_rows, old_cols = img.shape[:2]
+
+     # getRotationMatrix2D needs coordinates in reverse order (width, height) compared to shape
+     image_center = (old_cols / 2, old_rows / 2)
+
+     # Rows and columns of the rotated image (not cropped)
+     new_rows, new_cols = safe_rotate_enlarged_img_size(angle=angle, rows=old_rows, cols=old_cols)
+
+     # Rotation Matrix
+     rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
+
+     # Shift the image to create padding
+     rotation_mat[0, 2] += new_cols / 2 - image_center[0]
+     rotation_mat[1, 2] += new_rows / 2 - image_center[1]
+
+     # CV2 Transformation function
+     warp_affine_fn = _maybe_process_in_chunks(
+         cv2.warpAffine,
+         M=rotation_mat,
+         dsize=(new_cols, new_rows),
+         flags=interpolation,
+         borderMode=border_mode,
+         borderValue=value,
+     )
+
+     # rotate image with the new bounds
+     rotated_img = warp_affine_fn(img)
+
+     return rotated_img
+
+
+ def keypoint_safe_rotate(keypoint, angle, rows, cols):
+     old_rows = rows
+     old_cols = cols
+
+     # Rows and columns of the rotated image (not cropped)
+     new_rows, new_cols = safe_rotate_enlarged_img_size(angle=angle, rows=old_rows, cols=old_cols)
+
+     col_diff = (new_cols - old_cols) / 2
+     row_diff = (new_rows - old_rows) / 2
+
+     # Shift keypoint
+     shifted_keypoint = (int(keypoint[0] + col_diff), int(keypoint[1] + row_diff), keypoint[2], keypoint[3])
+
+     # Rotate keypoint
+     rotated_keypoint = keypoint_rotate(shifted_keypoint, angle, rows=new_rows, cols=new_cols)
+
+     return rotated_keypoint
+
+
+ class SafeRotate(A.SafeRotate):
+
+     def __init__(
+             self,
+             limit=90,
+             interpolation=cv2.INTER_LINEAR,
+             border_mode=cv2.BORDER_REFLECT_101,
+             value=None,
+             mask_value=None,
+             always_apply=False,
+             p=0.5,
+     ):
+         super(SafeRotate, self).__init__(
+             limit=limit,
+             interpolation=interpolation,
+             border_mode=border_mode,
+             value=value,
+             mask_value=mask_value,
+             always_apply=always_apply,
+             p=p)
+
+     def apply(self, img, angle=0, interpolation=cv2.INTER_LINEAR, **params):
+         return safe_rotate(
+             img=img, value=self.value, angle=angle, interpolation=interpolation, border_mode=self.border_mode)
+
+     def apply_to_keypoint(self, keypoint, angle=0, **params):
+         return keypoint_safe_rotate(keypoint, angle=angle, rows=params["rows"], cols=params["cols"])
+
+
+ class CropWhite(A.DualTransform):
+
+     def __init__(self, value=(255, 255, 255), pad=0, p=1.0):
+         super(CropWhite, self).__init__(p=p)
+         self.value = value
+         self.pad = pad
+         assert pad >= 0
+
+     def update_params(self, params, **kwargs):
+         super().update_params(params, **kwargs)
+         assert "image" in kwargs
+         img = kwargs["image"]
+         height, width, _ = img.shape
+         x = (img != self.value).sum(axis=2)
+         if x.sum() == 0:
+             return params
+         row_sum = x.sum(axis=1)
+         top = 0
+         while row_sum[top] == 0 and top + 1 < height:
+             top += 1
+         bottom = height
+         while row_sum[bottom - 1] == 0 and bottom - 1 > top:
+             bottom -= 1
+         col_sum = x.sum(axis=0)
+         left = 0
+         while col_sum[left] == 0 and left + 1 < width:
+             left += 1
+         right = width
+         while col_sum[right - 1] == 0 and right - 1 > left:
+             right -= 1
+         # crop_top = max(0, top - self.pad)
+         # crop_bottom = max(0, height - bottom - self.pad)
+         # crop_left = max(0, left - self.pad)
+         # crop_right = max(0, width - right - self.pad)
+         # params.update({"crop_top": crop_top, "crop_bottom": crop_bottom,
+         #                "crop_left": crop_left, "crop_right": crop_right})
+         params.update({"crop_top": top, "crop_bottom": height - bottom,
+                        "crop_left": left, "crop_right": width - right})
+         return params
+
+     def apply(self, img, crop_top=0, crop_bottom=0, crop_left=0, crop_right=0, **params):
+         height, width, _ = img.shape
+         img = img[crop_top:height - crop_bottom, crop_left:width - crop_right]
+         img = A.augmentations.pad_with_params(
+             img, self.pad, self.pad, self.pad, self.pad, border_mode=cv2.BORDER_CONSTANT, value=self.value)
+         return img
+
+     def apply_to_keypoint(self, keypoint, crop_top=0, crop_bottom=0, crop_left=0, crop_right=0, **params):
+         x, y, angle, scale = keypoint[:4]
+         return x - crop_left + self.pad, y - crop_top + self.pad, angle, scale
+
+     def get_transform_init_args_names(self):
+         return ('value', 'pad')
+
+
+ class PadWhite(A.DualTransform):
+
+     def __init__(self, pad_ratio=0.2, p=0.5, value=(255, 255, 255)):
+         super(PadWhite, self).__init__(p=p)
+         self.pad_ratio = pad_ratio
+         self.value = value
+
+     def update_params(self, params, **kwargs):
+         super().update_params(params, **kwargs)
+         assert "image" in kwargs
+         img = kwargs["image"]
+         height, width, _ = img.shape
+         side = random.randrange(4)
+         if side == 0:
+             params['pad_top'] = int(height * self.pad_ratio * random.random())
+         elif side == 1:
+             params['pad_bottom'] = int(height * self.pad_ratio * random.random())
+         elif side == 2:
+             params['pad_left'] = int(width * self.pad_ratio * random.random())
+         elif side == 3:
+             params['pad_right'] = int(width * self.pad_ratio * random.random())
+         return params
+
+     def apply(self, img, pad_top=0, pad_bottom=0, pad_left=0, pad_right=0, **params):
+         height, width, _ = img.shape
+         img = A.augmentations.pad_with_params(
+             img, pad_top, pad_bottom, pad_left, pad_right, border_mode=cv2.BORDER_CONSTANT, value=self.value)
+         return img
+
+     def apply_to_keypoint(self, keypoint, pad_top=0, pad_bottom=0, pad_left=0, pad_right=0, **params):
+         x, y, angle, scale = keypoint[:4]
+         return x + pad_left, y + pad_top, angle, scale
+
+     def get_transform_init_args_names(self):
+         return ('value', 'pad_ratio')
+
+
+ class SaltAndPepperNoise(A.DualTransform):
+
+     def __init__(self, num_dots, value=(0, 0, 0), p=0.5):
+         super().__init__(p)
+         self.num_dots = num_dots
+         self.value = value
+
+     def apply(self, img, **params):
+         height, width, _ = img.shape
+         num_dots = random.randrange(self.num_dots + 1)
+         for i in range(num_dots):
+             x = random.randrange(height)
+             y = random.randrange(width)
+             img[x, y] = self.value
+         return img
+
+     def apply_to_keypoint(self, keypoint, **params):
+         return keypoint
+
+     def get_transform_init_args_names(self):
+         return ('value', 'num_dots')
+
+
+ class ResizePad(A.DualTransform):
+
+     def __init__(self, height, width, interpolation=cv2.INTER_LINEAR, value=(255, 255, 255)):
+         super(ResizePad, self).__init__(always_apply=True)
+         self.height = height
+         self.width = width
+         self.interpolation = interpolation
+         self.value = value
+
+     def apply(self, img, interpolation=cv2.INTER_LINEAR, **params):
+         h, w, _ = img.shape
+         img = A.augmentations.geometric.functional.resize(
+             img,
+             height=min(h, self.height),
+             width=min(w, self.width),
+             interpolation=interpolation
+         )
+         h, w, _ = img.shape
+         pad_top = (self.height - h) // 2
+         pad_bottom = (self.height - h) - pad_top
+         pad_left = (self.width - w) // 2
+         pad_right = (self.width - w) - pad_left
+         img = A.augmentations.pad_with_params(
+             img,
+             pad_top,
+             pad_bottom,
+             pad_left,
+             pad_right,
+             border_mode=cv2.BORDER_CONSTANT,
+             value=self.value,
+         )
+         return img
+
+
+ def normalized_grid_distortion(
+         img,
+         num_steps=10,
+         xsteps=(),
+         ysteps=(),
+         *args,
+         **kwargs
+ ):
+     height, width = img.shape[:2]
+
+     # compensate for smaller last steps in source image.
+     x_step = width // num_steps
+     last_x_step = min(width, ((num_steps + 1) * x_step)) - (num_steps * x_step)
+     xsteps[-1] *= last_x_step / x_step
+
+     y_step = height // num_steps
+     last_y_step = min(height, ((num_steps + 1) * y_step)) - (num_steps * y_step)
+     ysteps[-1] *= last_y_step / y_step
+
+     # now normalize such that distortion never leaves image bounds.
+     tx = width / math.floor(width / num_steps)
+     ty = height / math.floor(height / num_steps)
+     xsteps = np.array(xsteps) * (tx / np.sum(xsteps))
+     ysteps = np.array(ysteps) * (ty / np.sum(ysteps))
+
+     # do actual distortion.
+     return A.augmentations.functional.grid_distortion(img, num_steps, xsteps, ysteps, *args, **kwargs)
+
+
+ class NormalizedGridDistortion(A.augmentations.transforms.GridDistortion):
+     def apply(self, img, stepsx=(), stepsy=(), interpolation=cv2.INTER_LINEAR, **params):
+         return normalized_grid_distortion(img, self.num_steps, stepsx, stepsy, interpolation, self.border_mode,
+                                           self.value)
+
+     def apply_to_mask(self, img, stepsx=(), stepsy=(), **params):
+         return normalized_grid_distortion(
+             img, self.num_steps, stepsx, stepsy, cv2.INTER_NEAREST, self.border_mode, self.mask_value)
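These custom transforms compose like any other albumentations transform; a minimal sketch (assumed usage, mirroring how get_transforms in dataset.py below composes them):

    import numpy as np
    import albumentations as A
    from molscribe.augment import CropWhite, PadWhite, SaltAndPepperNoise

    aug = A.Compose([
        CropWhite(pad=5),                        # trim white margins, keep a 5px border
        PadWhite(pad_ratio=0.4, p=0.2),          # randomly pad one side with white
        SaltAndPepperNoise(num_dots=20, p=0.5),  # sprinkle up to 20 black dots
    ])
    white = np.full((64, 64, 3), 255, dtype=np.uint8)  # blank white canvas
    augmented = aug(image=white)["image"]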
molscribe/chemistry.py ADDED
@@ -0,0 +1,649 @@
+ import copy
+ import traceback
+ import numpy as np
+ import multiprocessing
+
+ import rdkit
+ import rdkit.Chem as Chem
+
+ rdkit.RDLogger.DisableLog('rdApp.*')
+
+ from SmilesPE.pretokenizer import atomwise_tokenizer
+
+ from .constants import RGROUP_SYMBOLS, ABBREVIATIONS, VALENCES, FORMULA_REGEX
+
+
+ def is_valid_mol(s, format_='atomtok'):
+     if format_ == 'atomtok':
+         mol = Chem.MolFromSmiles(s)
+     elif format_ == 'inchi':
+         if not s.startswith('InChI=1S'):
+             s = f"InChI=1S/{s}"
+         mol = Chem.MolFromInchi(s)
+     else:
+         raise NotImplementedError
+     return mol is not None
+
+
+ def _convert_smiles_to_inchi(smiles):
+     try:
+         mol = Chem.MolFromSmiles(smiles)
+         inchi = Chem.MolToInchi(mol)
+     except:
+         inchi = None
+     return inchi
+
+
+ def convert_smiles_to_inchi(smiles_list, num_workers=16):
+     with multiprocessing.Pool(num_workers) as p:
+         inchi_list = p.map(_convert_smiles_to_inchi, smiles_list, chunksize=128)
+     n_success = sum([x is not None for x in inchi_list])
+     r_success = n_success / len(inchi_list)
+     inchi_list = [x if x else 'InChI=1S/H2O/h1H2' for x in inchi_list]
+     return inchi_list, r_success
+
+
+ def merge_inchi(inchi1, inchi2):
+     replaced = 0
+     inchi1 = copy.deepcopy(inchi1)
+     for i in range(len(inchi1)):
+         if inchi1[i] == 'InChI=1S/H2O/h1H2':
+             inchi1[i] = inchi2[i]
+             replaced += 1
+     return inchi1, replaced
+
+
+ def _get_num_atoms(smiles):
+     try:
+         return Chem.MolFromSmiles(smiles).GetNumAtoms()
+     except:
+         return 0
+
+
+ def get_num_atoms(smiles, num_workers=16):
+     if type(smiles) is str:
+         return _get_num_atoms(smiles)
+     with multiprocessing.Pool(num_workers) as p:
+         num_atoms = p.map(_get_num_atoms, smiles)
+     return num_atoms
+
+
+ def normalize_nodes(nodes, flip_y=True):
+     x, y = nodes[:, 0], nodes[:, 1]
+     minx, maxx = min(x), max(x)
+     miny, maxy = min(y), max(y)
+     x = (x - minx) / max(maxx - minx, 1e-6)
+     if flip_y:
+         y = (maxy - y) / max(maxy - miny, 1e-6)
+     else:
+         y = (y - miny) / max(maxy - miny, 1e-6)
+     return np.stack([x, y], axis=1)
+
+
+ def _verify_chirality(mol, coords, symbols, edges, debug=False):
+     try:
+         n = mol.GetNumAtoms()
+         # Make a temp mol to find chiral centers
+         mol_tmp = mol.GetMol()
+         Chem.SanitizeMol(mol_tmp)
+
+         chiral_centers = Chem.FindMolChiralCenters(
+             mol_tmp, includeUnassigned=True, includeCIP=False, useLegacyImplementation=False)
+         chiral_center_ids = [idx for idx, _ in chiral_centers]  # List[Tuple[int, any]] -> List[int]
+
+         # correction to clear pre-condition violation (for some corner cases)
+         for bond in mol.GetBonds():
+             if bond.GetBondType() == Chem.BondType.SINGLE:
+                 bond.SetBondDir(Chem.BondDir.NONE)
+
+         # Create conformer from 2D coordinates
+         conf = Chem.Conformer(n)
+         conf.Set3D(True)
+         for i, (x, y) in enumerate(coords):
+             conf.SetAtomPosition(i, (x, 1 - y, 0))
+         mol.AddConformer(conf)
+         Chem.SanitizeMol(mol)
+         Chem.AssignStereochemistryFrom3D(mol)
+         # NOTE: seems that only AssignStereochemistryFrom3D can handle double bond E/Z,
+         # so we do this first, remove the conformer and add back the 2D conformer for chiral correction
+
+         mol.RemoveAllConformers()
+         conf = Chem.Conformer(n)
+         conf.Set3D(False)
+         for i, (x, y) in enumerate(coords):
+             conf.SetAtomPosition(i, (x, 1 - y, 0))
+         mol.AddConformer(conf)
+
+         # Magic, inferring chirality from coordinates and BondDir. DO NOT CHANGE.
+         Chem.SanitizeMol(mol)
+         Chem.AssignChiralTypesFromBondDirs(mol)
+         Chem.AssignStereochemistry(mol, force=True)
+
+         # Second loop to reset any wedge/dash bond to be starting from the chiral center
+         for i in chiral_center_ids:
+             for j in range(n):
+                 if edges[i][j] == 5:
+                     # assert edges[j][i] == 6
+                     mol.RemoveBond(i, j)
+                     mol.AddBond(i, j, Chem.BondType.SINGLE)
+                     mol.GetBondBetweenAtoms(i, j).SetBondDir(Chem.BondDir.BEGINWEDGE)
+                 elif edges[i][j] == 6:
+                     # assert edges[j][i] == 5
+                     mol.RemoveBond(i, j)
+                     mol.AddBond(i, j, Chem.BondType.SINGLE)
+                     mol.GetBondBetweenAtoms(i, j).SetBondDir(Chem.BondDir.BEGINDASH)
+         Chem.AssignChiralTypesFromBondDirs(mol)
+         Chem.AssignStereochemistry(mol, force=True)
+
+         # reset chiral tags for non-carbon atoms
+         for atom in mol.GetAtoms():
+             if atom.GetSymbol() != "C":
+                 atom.SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
+         mol = mol.GetMol()
+
+     except Exception as e:
+         if debug:
+             raise e
+         pass
+     return mol
+
+
+ def _parse_tokens(tokens: list):
+     """
+     Parse tokens of condensed formula into list of pairs `(elt, num)`
+     where `num` is the multiplicity of the atom (or nested condensed formula) `elt`
+     Used by `_parse_formula`, which does the same thing but takes a formula in string form as input
+     """
+     elements = []
+     i = 0
+     j = 0
+     while i < len(tokens):
+         if tokens[i] == '(':
+             while j < len(tokens) and tokens[j] != ')':
+                 j += 1
+             elt = _parse_tokens(tokens[i + 1:j])
+         else:
+             elt = tokens[i]
+         j += 1
+         if j < len(tokens) and tokens[j].isnumeric():
+             num = int(tokens[j])
+             j += 1
+         else:
+             num = 1
+         elements.append((elt, num))
+         i = j
+     return elements
+
+
+ def _parse_formula(formula: str):
+     """
+     Parse condensed formula into list of pairs `(elt, num)`
+     where `num` is the subscript to the atom (or nested condensed formula) `elt`
+     Example: "C2H4O" -> [('C', 2), ('H', 4), ('O', 1)]
+     """
+     tokens = FORMULA_REGEX.findall(formula)
+     # if ''.join(tokens) != formula:
+     #     tokens = FORMULA_REGEX_BACKUP.findall(formula)
+     return _parse_tokens(tokens)
+
+
+ def _expand_carbon(elements: list):
+     """
+     Given list of pairs `(elt, num)`, output single list of all atoms in order,
+     expanding carbon sequences (CaXb where a > 1 and X is halogen) if necessary
+     Example: [('C', 2), ('H', 4), ('O', 1)] -> ['C', 'H', 'H', 'C', 'H', 'H', 'O']
+     """
+     expanded = []
+     i = 0
+     while i < len(elements):
+         elt, num = elements[i]
+         # expand carbon sequence
+         if elt == 'C' and num > 1 and i + 1 < len(elements):
+             next_elt, next_num = elements[i + 1]
+             quotient, remainder = next_num // num, next_num % num
+             for _ in range(num):
+                 expanded.append('C')
+                 for _ in range(quotient):
+                     expanded.append(next_elt)
+             for _ in range(remainder):
+                 expanded.append(next_elt)
+             i += 2
+         # recurse if `elt` itself is a list (nested formula)
+         elif isinstance(elt, list):
+             new_elt = _expand_carbon(elt)
+             for _ in range(num):
+                 expanded.append(new_elt)
+             i += 1
+         # simplest case: simply append `elt` `num` times
+         else:
+             for _ in range(num):
+                 expanded.append(elt)
+             i += 1
+     return expanded
+
+
+ def _expand_abbreviation(abbrev):
+     """
+     Expand abbreviation into its SMILES; also converts [Rn] to [n*]
+     Used in `_condensed_formula_list_to_smiles` when encountering abbrev. in condensed formula
+     """
+     if abbrev in ABBREVIATIONS:
+         return ABBREVIATIONS[abbrev].smiles
+     if abbrev in RGROUP_SYMBOLS or (abbrev[0] == 'R' and abbrev[1:].isdigit()):
+         if abbrev[1:].isdigit():
+             return f'[{abbrev[1:]}*]'
+         return '*'
+     return f'[{abbrev}]'
+
+
+ def _get_bond_symb(bond_num):
+     """
+     Get SMILES symbol for a bond given bond order
+     Used in `_condensed_formula_list_to_smiles` while writing the SMILES string
+     """
+     if bond_num == 0:
+         return '.'
+     if bond_num == 1:
+         return ''
+     if bond_num == 2:
+         return '='
+     if bond_num == 3:
+         return '#'
+     return ''
+
+
+ def _condensed_formula_list_to_smiles(formula_list, start_bond, end_bond=None, direction=None):
+     """
+     Converts condensed formula (in the form of a list of symbols) to smiles
+     Input:
+         `formula_list`: e.g. ['C', 'H', 'H', 'N', ['C', 'H', 'H', 'H'], ['C', 'H', 'H', 'H']] for CH2N(CH3)2
+         `start_bond`: # bonds attached to beginning of formula
+         `end_bond`: # bonds attached to end of formula (deduce automatically if None)
+         `direction` (1, -1, or None): direction in which to process the list (1: left to right; -1: right to left; None: deduce automatically)
+     Returns:
+         `smiles`: smiles corresponding to input condensed formula
+         `bonds_left`: bonds remaining at the end of the formula (for connecting back to main molecule); should equal `end_bond` if specified
+         `num_trials`: number of trials
+         `success` (bool): whether conversion was successful
+     """
+     # `direction` not specified: try left to right; if it fails, try right to left
+     if direction is None:
+         num_trials = 1
+         for dir_choice in [1, -1]:
+             smiles, bonds_left, trials, success = _condensed_formula_list_to_smiles(formula_list, start_bond, end_bond, dir_choice)
+             num_trials += trials
+             if success:
+                 return smiles, bonds_left, num_trials, success
+         return None, None, num_trials, False
+     assert direction == 1 or direction == -1
+
+     def dfs(smiles, bonds_left, cur_idx, add_idx):
+         """
+         `smiles`: SMILES string so far
+         `cur_idx`: index (in list `formula`) of current atom (i.e. atom to which subsequent atoms are being attached)
+         `bonds_left`: bonds remaining on current atom for subsequent atoms to be attached to
+         `add_idx`: index (in list `formula`) of atom to be attached to current atom
+         Note: "atom" could refer to nested condensed formula (e.g. CH3 in CH2N(CH3)2)
+         """
+         num_trials = 1
+         # end of formula: return result
+         if (direction == 1 and add_idx == len(formula_list)) or (direction == -1 and add_idx == -1):
+             if end_bond is not None and end_bond != bonds_left:
+                 return smiles, bonds_left, num_trials, False
+             return smiles, bonds_left, num_trials, True
+
+         # no more bonds but there are atoms remaining: conversion failed
+         if bonds_left <= 0:
+             return smiles, bonds_left, num_trials, False
+         to_add = formula_list[add_idx]  # atom to be added to current atom
+
+         if isinstance(to_add, list):  # "atom" added is a list (i.e. nested condensed formula): assume valence of 1
+             if bonds_left > 1:
+                 # "atom" added does not use up remaining bonds of current atom
+                 # get smiles of "atom" (which is itself a condensed formula)
+                 add_str, val, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
+                 if val > 0:
+                     add_str = _get_bond_symb(val + 1) + add_str
+                 num_trials += trials
+                 if not success:
+                     return smiles, bonds_left, num_trials, False
+                 # put smiles of "atom" in parentheses and append to smiles; go to next atom to add to current atom
+                 result = dfs(smiles + f'({add_str})', bonds_left - 1, cur_idx, add_idx + direction)
+             else:
+                 # "atom" added uses up remaining bonds of current atom
+                 # get smiles of "atom" and bonds left on it
+                 add_str, bonds_left, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
+                 num_trials += trials
+                 if not success:
+                     return smiles, bonds_left, num_trials, False
+                 # append smiles of "atom" (without parentheses) to smiles; it becomes the new current atom
+                 result = dfs(smiles + add_str, bonds_left, add_idx, add_idx + direction)
+             smiles, bonds_left, trials, success = result
+             num_trials += trials
+             return smiles, bonds_left, num_trials, success
+
+         # atom added is a single symbol (as opposed to nested condensed formula)
+         for val in VALENCES.get(to_add, [1]):  # try all possible valences of atom added
+             add_str = _expand_abbreviation(to_add)  # expand to smiles if symbol is abbreviation
+             if bonds_left > val:  # atom added does not use up remaining bonds of current atom; go to next atom to add to current atom
+                 if cur_idx >= 0:
+                     add_str = _get_bond_symb(val) + add_str
+                 result = dfs(smiles + f'({add_str})', bonds_left - val, cur_idx, add_idx + direction)
+             else:  # atom added uses up remaining bonds of current atom; it becomes the new current atom
+                 if cur_idx >= 0:
+                     add_str = _get_bond_symb(bonds_left) + add_str
+                 result = dfs(smiles + add_str, val - bonds_left, add_idx, add_idx + direction)
+             trials, success = result[2:]
+             num_trials += trials
+             if success:
+                 return result[0], result[1], num_trials, success
+             if num_trials > 10000:
+                 break
+         return smiles, bonds_left, num_trials, False
+
+     cur_idx = -1 if direction == 1 else len(formula_list)
+     add_idx = 0 if direction == 1 else len(formula_list) - 1
+     return dfs('', start_bond, cur_idx, add_idx)
+
+
+ def get_smiles_from_symbol(symbol, mol, atom, bonds):
+     """
+     Convert symbol (abbrev. or condensed formula) to smiles
+     If condensed formula, determine parsing direction and num. bonds on each side using coordinates
+     """
+     print(symbol)
+     if symbol in ABBREVIATIONS:
+         return ABBREVIATIONS[symbol].smiles
+     if len(symbol) > 20:
+         return None
+
+     # mol_check = Chem.MolFromSmiles(symbol)
+     # if mol_check:
+     #     print(symbol)  # print the symbol to debug
+     #     return symbol
+
+     total_bonds = int(sum([bond.GetBondTypeAsDouble() for bond in bonds]))
+     formula_list = _expand_carbon(_parse_formula(symbol))
+     smiles, bonds_left, num_trials, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None)
+     if success:
+         mol_check = Chem.MolFromSmiles(smiles)  # check if the SMILES is valid
+         if mol_check:
+             print(f"smiles:{smiles}")  # print to debug
+             return smiles
+
+     mol_check = Chem.MolFromSmiles(symbol)
+     if mol_check:
+         print(f"symbol:{symbol}")  # print to debug
+         return symbol
+
+     return None
+
+
+ def _replace_functional_group(smiles):
+     smiles = smiles.replace('<unk>', 'C')
+     for i, r in enumerate(RGROUP_SYMBOLS):
+         symbol = f'[{r}]'
+         if symbol in smiles:
+             if r[0] == 'R' and r[1:].isdigit():
+                 smiles = smiles.replace(symbol, f'[{int(r[1:])}*]')
+             else:
+                 smiles = smiles.replace(symbol, '*')
+     # For unknown tokens (i.e. rdkit cannot parse), replace them with [{isotope}*], where isotope is an identifier.
+     tokens = atomwise_tokenizer(smiles)
+     new_tokens = []
+     mappings = {}  # isotope : symbol
+     isotope = 50
+     for token in tokens:
+         if token[0] == '[':
+             if token[1:-1] in ABBREVIATIONS or Chem.AtomFromSmiles(token) is None:
+                 while f'[{isotope}*]' in smiles or f'[{isotope}*]' in new_tokens:
+                     isotope += 1
+                 placeholder = f'[{isotope}*]'
+                 mappings[isotope] = token[1:-1]
+                 new_tokens.append(placeholder)
+                 continue
+         new_tokens.append(token)
+     smiles = ''.join(new_tokens)
+     return smiles, mappings
+
+
+ def convert_smiles_to_mol(smiles):
+     if smiles is None or smiles == '':
+         return None
+     try:
+         mol = Chem.MolFromSmiles(smiles)
+     except:
+         return None
+     return mol
+
+
+ BOND_TYPES = {1: Chem.rdchem.BondType.SINGLE, 2: Chem.rdchem.BondType.DOUBLE, 3: Chem.rdchem.BondType.TRIPLE}
+
+
+ def _expand_functional_group(mol, mappings, debug=False):
+     def _need_expand(mol, mappings):
+         return any([len(Chem.GetAtomAlias(atom)) > 0 for atom in mol.GetAtoms()]) or len(mappings) > 0
+
+     if _need_expand(mol, mappings):
+         mol_w = Chem.RWMol(mol)
+         num_atoms = mol_w.GetNumAtoms()
+         for i, atom in enumerate(mol_w.GetAtoms()):  # reset radical electrons
+             atom.SetNumRadicalElectrons(0)
+
+         atoms_to_remove = []
+         for i in range(num_atoms):
+             atom = mol_w.GetAtomWithIdx(i)
+             if atom.GetSymbol() == '*':
+                 symbol = Chem.GetAtomAlias(atom)
+                 isotope = atom.GetIsotope()
+                 if isotope > 0 and isotope in mappings:
+                     symbol = mappings[isotope]
+                 if not (isinstance(symbol, str) and len(symbol) > 0):
+                     continue
+                 # rgroups do not need to be expanded
+                 if symbol in RGROUP_SYMBOLS:
+                     continue
+
+                 bonds = atom.GetBonds()
+                 sub_smiles = get_smiles_from_symbol(symbol, mol_w, atom, bonds)
+
+                 # create mol object for abbreviation/condensed formula from its SMILES
+                 mol_r = convert_smiles_to_mol(sub_smiles)
+
+                 if mol_r is None:
+                     # atom.SetAtomicNum(6)
+                     atom.SetIsotope(0)
+                     continue
+
+                 # remove bonds connected to the abbreviation/condensed formula
+                 adjacent_indices = [bond.GetOtherAtomIdx(i) for bond in bonds]
+                 for adjacent_idx in adjacent_indices:
+                     mol_w.RemoveBond(i, adjacent_idx)
+
+                 adjacent_atoms = [mol_w.GetAtomWithIdx(adjacent_idx) for adjacent_idx in adjacent_indices]
+                 for adjacent_atom, bond in zip(adjacent_atoms, bonds):
+                     adjacent_atom.SetNumRadicalElectrons(int(bond.GetBondTypeAsDouble()))
+
+                 # get indices of atoms of main body that connect to substituent
+                 bonding_atoms_w = adjacent_indices
+                 # assume indices are concatenated after combining mol_w and mol_r
+                 bonding_atoms_r = [mol_w.GetNumAtoms()]
+                 for atm in mol_r.GetAtoms():
+                     if atm.GetNumRadicalElectrons() and atm.GetIdx() > 0:
+                         bonding_atoms_r.append(mol_w.GetNumAtoms() + atm.GetIdx())
+
+                 # combine main body and substituent into a single molecule object
+                 combo = Chem.CombineMols(mol_w, mol_r)
+
+                 # connect substituent to main body with bonds
+                 mol_w = Chem.RWMol(combo)
+                 # if len(bonding_atoms_r) == 1:  # substituent uses one atom to bond to main body
+                 for atm in bonding_atoms_w:
+                     bond_order = mol_w.GetAtomWithIdx(atm).GetNumRadicalElectrons()
+                     mol_w.AddBond(atm, bonding_atoms_r[0], order=BOND_TYPES[bond_order])
+
+                 # reset radical electrons
+                 for atm in bonding_atoms_w:
+                     mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0)
+                 for atm in bonding_atoms_r:
+                     mol_w.GetAtomWithIdx(atm).SetNumRadicalElectrons(0)
+                 atoms_to_remove.append(i)
+
+         # Remove atoms at the end, otherwise the ids will change;
+         # reverse the order and remove atoms with larger id first
+         atoms_to_remove.sort(reverse=True)
+         for i in atoms_to_remove:
+             mol_w.RemoveAtom(i)
+         smiles = Chem.MolToSmiles(mol_w)
+         mol = mol_w.GetMol()
+     else:
+         smiles = Chem.MolToSmiles(mol)
+     return smiles, mol
+
+
+ def _convert_graph_to_smiles(coords, symbols, edges, image=None, debug=False):
+     mol = Chem.RWMol()
+     n = len(symbols)
+     ids = []
+     for i in range(n):
+         symbol = symbols[i]
+         if symbol[0] == '[':
+             symbol = symbol[1:-1]
+         if symbol in RGROUP_SYMBOLS:
+             atom = Chem.Atom("*")
+             if symbol[0] == 'R' and symbol[1:].isdigit():
+                 atom.SetIsotope(int(symbol[1:]))
+             Chem.SetAtomAlias(atom, symbol)
+         elif symbol in ABBREVIATIONS:
+             atom = Chem.Atom("*")
+             Chem.SetAtomAlias(atom, symbol)
+         else:
+             try:  # try to get SMILES of atom
+                 atom = Chem.AtomFromSmiles(symbols[i])
+                 atom.SetChiralTag(Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
+             except:  # otherwise, abbreviation or condensed formula
+                 atom = Chem.Atom("*")
+                 Chem.SetAtomAlias(atom, symbol)
+
+         if atom.GetSymbol() == '*':
+             atom.SetProp('molFileAlias', symbol)
+
+         idx = mol.AddAtom(atom)
+         assert idx == i
+         ids.append(idx)
+
+     for i in range(n):
+         for j in range(i + 1, n):
+             if edges[i][j] == 1:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.SINGLE)
+             elif edges[i][j] == 2:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.DOUBLE)
+             elif edges[i][j] == 3:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.TRIPLE)
+             elif edges[i][j] == 4:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.AROMATIC)
+             elif edges[i][j] == 5:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.SINGLE)
+                 mol.GetBondBetweenAtoms(ids[i], ids[j]).SetBondDir(Chem.BondDir.BEGINWEDGE)
+             elif edges[i][j] == 6:
+                 mol.AddBond(ids[i], ids[j], Chem.BondType.SINGLE)
+                 mol.GetBondBetweenAtoms(ids[i], ids[j]).SetBondDir(Chem.BondDir.BEGINDASH)
+
+     pred_smiles = '<invalid>'
+
+     try:
+         # TODO: move to a util function
+         if image is not None:
+             height, width, _ = image.shape
+             ratio = width / height
+             coords = [[x * ratio * 10, y * 10] for x, y in coords]
+         mol = _verify_chirality(mol, coords, symbols, edges, debug)
+         # molblock is obtained before expanding func groups, otherwise the expanded group won't have coordinates.
+         # TODO: make sure molblock has the abbreviation information
+         pred_molblock = Chem.MolToMolBlock(mol)
+         pred_smiles, mol = _expand_functional_group(mol, {}, debug)
+         success = True
+     except Exception as e:
+         if debug:
+             print(traceback.format_exc())
+         pred_molblock = ''
+         success = False
+
+     if debug:
+         return pred_smiles, pred_molblock, mol, success
+     return pred_smiles, pred_molblock, success
+
+
+ def convert_graph_to_smiles(coords, symbols, edges, images=None, num_workers=16):
+     with multiprocessing.Pool(num_workers) as p:
+         if images is None:
+             results = p.starmap(_convert_graph_to_smiles, zip(coords, symbols, edges), chunksize=128)
+         else:
+             results = p.starmap(_convert_graph_to_smiles, zip(coords, symbols, edges, images), chunksize=128)
+     smiles_list, molblock_list, success = zip(*results)
+     r_success = np.mean(success)
+     return smiles_list, molblock_list, r_success
+
+
+ def _postprocess_smiles(smiles, coords=None, symbols=None, edges=None, molblock=False, debug=False):
+     if type(smiles) is not str or smiles == '':
+         return '', '', False  # keep arity consistent with the normal return below
+     mol = None
+     pred_molblock = ''
+     try:
+         pred_smiles = smiles
+         pred_smiles, mappings = _replace_functional_group(pred_smiles)
+         if coords is not None and symbols is not None and edges is not None:
+             pred_smiles = pred_smiles.replace('@', '').replace('/', '').replace('\\', '')
+             mol = Chem.RWMol(Chem.MolFromSmiles(pred_smiles, sanitize=False))
+             mol = _verify_chirality(mol, coords, symbols, edges, debug)
+         else:
+             mol = Chem.MolFromSmiles(pred_smiles, sanitize=False)
+         # pred_smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
+         if molblock:
+             pred_molblock = Chem.MolToMolBlock(mol)
+         pred_smiles, mol = _expand_functional_group(mol, mappings)
+         success = True
+     except Exception as e:
+         if debug:
+             print(traceback.format_exc())
+         pred_smiles = smiles
+         pred_molblock = ''
+         success = False
+     if debug:
+         return pred_smiles, pred_molblock, mol, success
+     return pred_smiles, pred_molblock, success
+
+
+ def postprocess_smiles(smiles, coords=None, symbols=None, edges=None, molblock=False, num_workers=16):
+     with multiprocessing.Pool(num_workers) as p:
+         if coords is not None and symbols is not None and edges is not None:
+             results = p.starmap(_postprocess_smiles, zip(smiles, coords, symbols, edges), chunksize=128)
+         else:
+             results = p.map(_postprocess_smiles, smiles, chunksize=128)
+     smiles_list, molblock_list, success = zip(*results)
+     r_success = np.mean(success)
+     return smiles_list, molblock_list, r_success
+
+
+ def _keep_main_molecule(smiles, debug=False):
+     try:
+         mol = Chem.MolFromSmiles(smiles)
+         frags = Chem.GetMolFrags(mol, asMols=True)
+         if len(frags) > 1:
+             num_atoms = [m.GetNumAtoms() for m in frags]
+             main_mol = frags[np.argmax(num_atoms)]
+             smiles = Chem.MolToSmiles(main_mol)
+     except Exception as e:
+         if debug:
+             print(traceback.format_exc())
+     return smiles
+
+
+ def keep_main_molecule(smiles, num_workers=16):
+     with multiprocessing.Pool(num_workers) as p:
+         results = p.map(_keep_main_molecule, smiles, chunksize=128)
+     return results
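A worked example of the condensed-formula helpers, using the values given in the docstrings above:

    from molscribe.chemistry import _parse_formula, _expand_carbon

    elements = _parse_formula("C2H4O")   # [('C', 2), ('H', 4), ('O', 1)]
    atoms = _expand_carbon(elements)     # ['C', 'H', 'H', 'C', 'H', 'H', 'O']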
molscribe/constants.py ADDED
@@ -0,0 +1,130 @@
+ from typing import List
+ import re
+
+ ORGANIC_SET = {'B', 'C', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'}
+
+ RGROUP_SYMBOLS = ['R', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12', "R'",
+                   'Ra', 'Rb', 'Rc', 'Rd', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar']
+
+ PLACEHOLDER_ATOMS = ["Lv", "Lu", "Nd", "Yb", "At", "Fm", "Er"]
+
+
+ class Substitution(object):
+     '''Define common substitutions for chemical shorthand'''
+     def __init__(self, abbrvs, smarts, smiles, probability):
+         assert type(abbrvs) is list
+         self.abbrvs = abbrvs
+         self.smarts = smarts
+         self.smiles = smiles
+         self.probability = probability
+
+
+ SUBSTITUTIONS: List[Substitution] = [
+     Substitution(['NO2', 'O2N'], '[N+](=O)[O-]', "[N+](=O)[O-]", 0.5),
+     Substitution(['OCOCH3'], '[#8]-[#6](=[#8])-[#6]', "[O]C(=O)C", 0.5),
+     Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5),
+     Substitution(['CO2Et', 'COOEt', 'EtO2C'], 'C(=O)[OH0;D2][CH2;D2][CH3]', "[C](=O)OCC", 0.5),
+
+     Substitution(['OAc'], '[OH0;X2]C(=O)[CH3]', "[O]C(=O)C", 0.7),
+     Substitution(['NHAc'], '[NH1;D2]C(=O)[CH3]', "[NH]C(=O)C", 0.7),
+     Substitution(['Ac'], 'C(=O)[CH3]', "[C](=O)C", 0.1),
+
+     Substitution(['OBz'], '[OH0;D2]C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[O]C(=O)c1ccccc1", 0.7),  # Benzoyl
+     Substitution(['Bz'], 'C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[C](=O)c1ccccc1", 0.2),  # Benzoyl
+
+     Substitution(['OBn'], '[OH0;D2][CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[O]Cc1ccccc1", 0.7),  # Benzyl
+     Substitution(['Bn'], '[CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[CH2]c1ccccc1", 0.2),  # Benzyl
+
+     Substitution(['NHBoc'], '[NH1;D2]C(=O)OC([CH3])([CH3])[CH3]', "[NH1]C(=O)OC(C)(C)C", 0.6),
+     Substitution(['NBoc'], '[NH0;D3]C(=O)OC([CH3])([CH3])[CH3]', "[NH1]C(=O)OC(C)(C)C", 0.6),
+     Substitution(['Boc'], 'C(=O)OC([CH3])([CH3])[CH3]', "[C](=O)OC(C)(C)C", 0.2),
+
+     Substitution(['Cbm'], 'C(=O)[NH2;D1]', "[C](=O)N", 0.2),
+     Substitution(['Cbz'], 'C(=O)OC[cH]1[cH][cH][cH1][cH][cH]1', "[C](=O)OCc1ccccc1", 0.4),
+     Substitution(['Cy'], '[CH1;X3]1[CH2][CH2][CH2][CH2][CH2]1', "[CH1]1CCCCC1", 0.3),
+     Substitution(['Fmoc'], 'C(=O)O[CH2][CH1]1c([cH1][cH1][cH1][cH1]2)c2c3c1[cH1][cH1][cH1][cH1]3',
+                  "[C](=O)OCC1c(cccc2)c2c3c1cccc3", 0.6),
+     Substitution(['Mes'], '[cH0]1c([CH3])cc([CH3])cc([CH3])1', "[c]1c(C)cc(C)cc(C)1", 0.5),
+     Substitution(['OMs'], '[OH0;D2]S(=O)(=O)[CH3]', "[O]S(=O)(=O)C", 0.7),
+     Substitution(['Ms'], 'S(=O)(=O)[CH3]', "[S](=O)(=O)C", 0.2),
+     Substitution(['Ph'], '[cH0]1[cH][cH][cH1][cH][cH]1', "[c]1ccccc1", 0.5),
+     Substitution(['PMB'], '[CH2;D2][cH0]1[cH1][cH1][cH0](O[CH3])[cH1][cH1]1', "[CH2]c1ccc(OC)cc1", 0.2),
+     Substitution(['Py'], '[cH0]1[n;+0][cH1][cH1][cH1][cH1]1', "[c]1ncccc1", 0.1),
+     Substitution(['SEM'], '[CH2;D2][CH2][Si]([CH3])([CH3])[CH3]', "[CH2]CSi(C)(C)C", 0.2),
+     Substitution(['Suc'], 'C(=O)[CH2][CH2]C(=O)[OH]', "[C](=O)CCC(=O)O", 0.2),
+     Substitution(['TBS'], '[Si]([CH3])([CH3])C([CH3])([CH3])[CH3]', "[Si](C)(C)C(C)(C)C", 0.5),
+     Substitution(['TBZ'], 'C(=S)[cH]1[cH][cH][cH1][cH][cH]1', "[C](=S)c1ccccc1", 0.2),
+     Substitution(['OTf'], '[OH0;D2]S(=O)(=O)C(F)(F)F', "[O]S(=O)(=O)C(F)(F)F", 0.7),
+     Substitution(['Tf'], 'S(=O)(=O)C(F)(F)F', "[S](=O)(=O)C(F)(F)F", 0.2),
+     Substitution(['TFA'], 'C(=O)C(F)(F)F', "[C](=O)C(F)(F)F", 0.3),
+     Substitution(['TMS'], '[Si]([CH3])([CH3])[CH3]', "[Si](C)(C)C", 0.5),
+     Substitution(['Ts'], 'S(=O)(=O)c1[cH1][cH1][cH0]([CH3])[cH1][cH1]1', "[S](=O)(=O)c1ccc(C)cc1", 0.6),  # Tos
+
+     # Alkyl chains
+     Substitution(['OMe', 'MeO'], '[OH0;D2][CH3;D1]', "[O]C", 0.3),
+     Substitution(['SMe', 'MeS'], '[SH0;D2][CH3;D1]', "[S]C", 0.3),
+     Substitution(['NMe', 'MeN'], '[N;X3][CH3;D1]', "[NH]C", 0.3),
+     Substitution(['Me'], '[CH3;D1]', "[CH3]", 0.1),
+     Substitution(['OEt', 'EtO'], '[OH0;D2][CH2;D2][CH3]', "[O]CC", 0.5),
+     Substitution(['Et', 'C2H5'], '[CH2;D2][CH3]', "[CH2]C", 0.3),
+     Substitution(['Pr', 'nPr', 'n-Pr'], '[CH2;D2][CH2;D2][CH3]', "[CH2]CC", 0.3),
+     Substitution(['Bu', 'nBu', 'n-Bu'], '[CH2;D2][CH2;D2][CH2;D2][CH3]', "[CH2]CCC", 0.3),
+
+     # Branched
+     Substitution(['iPr', 'i-Pr'], '[CH1;D3]([CH3])[CH3]', "[CH1](C)C", 0.2),
+     Substitution(['iBu', 'i-Bu'], '[CH2;D2][CH1;D3]([CH3])[CH3]', "[CH2]C(C)C", 0.2),
+     Substitution(['OiBu'], '[OH0;D2][CH2;D2][CH1;D3]([CH3])[CH3]', "[O]CC(C)C", 0.2),
+     Substitution(['OtBu'], '[OH0;D2][CH0]([CH3])([CH3])[CH3]', "[O]C(C)(C)C", 0.6),
+     Substitution(['tBu', 't-Bu'], '[CH0]([CH3])([CH3])[CH3]', "[C](C)(C)C", 0.3),
+
+     # Other shorthands (MIGHT NOT WANT ALL OF THESE)
+     Substitution(['CF3', 'F3C'], '[CH0;D4](F)(F)F', "[C](F)(F)F", 0.5),
+     Substitution(['NCF3', 'F3CN'], '[N;X3][CH0;D4](F)(F)F', "[NH]C(F)(F)F", 0.5),
+     Substitution(['OCF3', 'F3CO'], '[OH0;X2][CH0;D4](F)(F)F', "[O]C(F)(F)F", 0.5),
+     Substitution(['CCl3'], '[CH0;D4](Cl)(Cl)Cl', "[C](Cl)(Cl)Cl", 0.5),
+     Substitution(['CO2H', 'HO2C', 'COOH'], 'C(=O)[OH]', "[C](=O)O", 0.5),  # COOH
+     Substitution(['CN', 'NC'], 'C#[ND1]', "[C]#N", 0.5),
+     Substitution(['OCH3', 'H3CO'], '[OH0;D2][CH3]', "[O]C", 0.4),
+     Substitution(['SO3H'], 'S(=O)(=O)[OH]', "[S](=O)(=O)O", 0.4),
+     Substitution(['CH3O'], '[OH0;D2][CH3]', "[O]C", 0),
+     Substitution(['PhCH2CH2'], '[OH0;D2][CH3]', "C1=CC=CC=C1CC", 0),
+     Substitution(['SO2ToI', 'SO2Tol'], '[OH0;D2][CH3]', "CS(=O)(=O)C1=CC=CC=C1", 0),
+ ]
+
+ ABBREVIATIONS = {abbrv: sub for sub in SUBSTITUTIONS for abbrv in sub.abbrvs}
+
+ VALENCES = {
+     "H": [1], "Li": [1], "Be": [2], "B": [3], "C": [4], "N": [3, 5], "O": [2], "F": [1],
+     "Na": [1], "Mg": [2], "Al": [3], "Si": [4], "P": [5, 3], "S": [6, 2, 4], "Cl": [1], "K": [1], "Ca": [2],
+     "Br": [1], "I": [1]
+ }
+
+ ELEMENTS = [
+     "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
+     "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
+     "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
+     "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
+     "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
+     "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
+     "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
+     "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
+     "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
+     "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
+     "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
+     "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
+ ]
+
+ COLORS = {
+     u'c': '0.0,0.75,0.75', u'b': '0.0,0.0,1.0', u'g': '0.0,0.5,0.0', u'y': '0.75,0.75,0',
+     u'k': '0.0,0.0,0.0', u'r': '1.0,0.0,0.0', u'm': '0.75,0,0.75'
+ }
+
+ # tokens of condensed formula
+ FORMULA_REGEX = re.compile(
+     r'(' + '|'.join(list(ABBREVIATIONS.keys())) + r'|R[0-9]*|[A-Z][a-z]+|[A-Z]|[0-9]+|\(|\))')
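A short sketch of how these tables behave (expected outputs reasoned from the definitions above, not from running the commit):

    from molscribe.constants import ABBREVIATIONS, FORMULA_REGEX

    ABBREVIATIONS['Ph'].smiles       # "[c]1ccccc1"
    FORMULA_REGEX.findall('C2H4O')   # ['C', '2', 'H', '4', 'O']
    FORMULA_REGEX.findall('CO2Et')   # ['CO2Et'] -- abbreviations match as single tokens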
molscribe/dataset.py ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import cv2
import time
import random
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import albumentations as A
from albumentations.pytorch import ToTensorV2

from .indigo import Indigo
from .indigo.renderer import IndigoRenderer

from .augment import SafeRotate, CropWhite, PadWhite, SaltAndPepperNoise
from .utils import FORMAT_INFO
from .tokenizer import PAD_ID
from .chemistry import get_num_atoms, normalize_nodes
from .constants import RGROUP_SYMBOLS, SUBSTITUTIONS, ELEMENTS, COLORS

cv2.setNumThreads(1)

INDIGO_HYDROGEN_PROB = 0.2
INDIGO_FUNCTIONAL_GROUP_PROB = 0.8
INDIGO_CONDENSED_PROB = 0.5
INDIGO_RGROUP_PROB = 0.5
INDIGO_COMMENT_PROB = 0.3
INDIGO_DEAROMATIZE_PROB = 0.8
INDIGO_COLOR_PROB = 0.2


def get_transforms(input_size, augment=True, rotate=True, debug=False):
    trans_list = []
    if augment and rotate:
        trans_list.append(SafeRotate(limit=90, border_mode=cv2.BORDER_CONSTANT, value=(255, 255, 255)))
    trans_list.append(CropWhite(pad=5))
    if augment:
        trans_list += [
            # NormalizedGridDistortion(num_steps=10, distort_limit=0.3),
            A.CropAndPad(percent=[-0.01, 0.00], keep_size=False, p=0.5),
            PadWhite(pad_ratio=0.4, p=0.2),
            A.Downscale(scale_min=0.2, scale_max=0.5, interpolation=3),
            A.Blur(),
            A.GaussNoise(),
            SaltAndPepperNoise(num_dots=20, p=0.5)
        ]
    trans_list.append(A.Resize(input_size, input_size))
    if not debug:
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        trans_list += [
            A.ToGray(p=1),
            A.Normalize(mean=mean, std=std),
            ToTensorV2(),
        ]
    return A.Compose(trans_list, keypoint_params=A.KeypointParams(format='xy', remove_invisible=False))


def add_functional_group(indigo, mol, debug=False):
    if random.random() > INDIGO_FUNCTIONAL_GROUP_PROB:
        return mol
    # Delete a functional group and add a pseudo atom with its abbreviation
    substitutions = [sub for sub in SUBSTITUTIONS]
    random.shuffle(substitutions)
    for sub in substitutions:
        query = indigo.loadSmarts(sub.smarts)
        matcher = indigo.substructureMatcher(mol)
        matched_atoms_ids = set()
        for match in matcher.iterateMatches(query):
            if random.random() < sub.probability or debug:
                atoms = []
                atoms_ids = set()
                for item in query.iterateAtoms():
                    atom = match.mapAtom(item)
                    atoms.append(atom)
                    atoms_ids.add(atom.index())
                if len(matched_atoms_ids.intersection(atoms_ids)) > 0:
                    continue
                abbrv = random.choice(sub.abbrvs)
                superatom = mol.addAtom(abbrv)
                for atom in atoms:
                    for nei in atom.iterateNeighbors():
                        if nei.index() not in atoms_ids:
                            if nei.symbol() == 'H':
                                # indigo won't match explicit hydrogens, so remove them explicitly
                                atoms_ids.add(nei.index())
                            else:
                                superatom.addBond(nei, nei.bond().bondOrder())
                for id in atoms_ids:
                    mol.getAtom(id).remove()
                matched_atoms_ids = matched_atoms_ids.union(atoms_ids)
    return mol


def add_explicit_hydrogen(indigo, mol):
    atoms = []
    for atom in mol.iterateAtoms():
        try:
            hs = atom.countImplicitHydrogens()
            if hs > 0:
                atoms.append((atom, hs))
        except Exception:
            continue
    if len(atoms) > 0 and random.random() < INDIGO_HYDROGEN_PROB:
        atom, hs = random.choice(atoms)
        for i in range(hs):
            h = mol.addAtom('H')
            h.addBond(atom, 1)
    return mol


def add_rgroup(indigo, mol, smiles):
    atoms = []
    for atom in mol.iterateAtoms():
        try:
            hs = atom.countImplicitHydrogens()
            if hs > 0:
                atoms.append(atom)
        except Exception:
            continue
    if len(atoms) > 0 and '*' not in smiles:
        if random.random() < INDIGO_RGROUP_PROB:
            atom_idx = random.choice(range(len(atoms)))
            atom = atoms[atom_idx]
            atoms.pop(atom_idx)
            symbol = random.choice(RGROUP_SYMBOLS)
            r = mol.addAtom(symbol)
            r.addBond(atom, 1)
    return mol


def get_rand_symb():
    symb = random.choice(ELEMENTS)
    if random.random() < 0.1:
        symb += random.choice(string.ascii_lowercase)
    if random.random() < 0.1:
        symb += random.choice(string.ascii_uppercase)
    if random.random() < 0.1:
        symb = f'({gen_rand_condensed()})'
    return symb


def get_rand_num():
    if random.random() < 0.9:
        if random.random() < 0.8:
            return ''
        else:
            return str(random.randint(2, 9))
    else:
        return '1' + str(random.randint(2, 9))


def gen_rand_condensed():
    tokens = []
    for i in range(5):
        if i >= 1 and random.random() < 0.8:
            break
        tokens.append(get_rand_symb())
        tokens.append(get_rand_num())
    return ''.join(tokens)


def add_rand_condensed(indigo, mol):
    atoms = []
    for atom in mol.iterateAtoms():
        try:
            hs = atom.countImplicitHydrogens()
            if hs > 0:
                atoms.append(atom)
        except Exception:
            continue
    if len(atoms) > 0 and random.random() < INDIGO_CONDENSED_PROB:
        atom = random.choice(atoms)
        symbol = gen_rand_condensed()
        r = mol.addAtom(symbol)
        r.addBond(atom, 1)
    return mol


def generate_output_smiles(indigo, mol):
    # TODO: if using mol.canonicalSmiles(), explicit H will be removed
    smiles = mol.smiles()
    mol = indigo.loadMolecule(smiles)
    if '*' in smiles:
        part_a, part_b = smiles.split(' ', maxsplit=1)
        part_b = re.search(r'\$.*\$', part_b).group(0)[1:-1]
        symbols = [t for t in part_b.split(';') if len(t) > 0]
        output = ''
        cnt = 0
        for i, c in enumerate(part_a):
            if c != '*':
                output += c
            else:
                output += f'[{symbols[cnt]}]'
                cnt += 1
        return mol, output
    else:
        if ' ' in smiles:
            # special cases with extension
            smiles = smiles.split(' ')[0]
        return mol, smiles


def add_comment(indigo):
    if random.random() < INDIGO_COMMENT_PROB:
        indigo.setOption('render-comment', str(random.randint(1, 20)) + random.choice(string.ascii_letters))
        indigo.setOption('render-comment-font-size', random.randint(40, 60))
        indigo.setOption('render-comment-alignment', random.choice([0, 0.5, 1]))
        indigo.setOption('render-comment-position', random.choice(['top', 'bottom']))
        indigo.setOption('render-comment-offset', random.randint(2, 30))


def add_color(indigo, mol):
    if random.random() < INDIGO_COLOR_PROB:
        indigo.setOption('render-coloring', True)
    if random.random() < INDIGO_COLOR_PROB:
        indigo.setOption('render-base-color', random.choice(list(COLORS.values())))
    if random.random() < INDIGO_COLOR_PROB:
        if random.random() < 0.5:
            indigo.setOption('render-highlight-color-enabled', True)
            indigo.setOption('render-highlight-color', random.choice(list(COLORS.values())))
        if random.random() < 0.5:
            indigo.setOption('render-highlight-thickness-enabled', True)
        for atom in mol.iterateAtoms():
            if random.random() < 0.1:
                atom.highlight()
    return mol


def get_graph(mol, image, shuffle_nodes=False, pseudo_coords=False):
    mol.layout()
    coords, symbols = [], []
    index_map = {}
    atoms = [atom for atom in mol.iterateAtoms()]
    if shuffle_nodes:
        random.shuffle(atoms)
    for i, atom in enumerate(atoms):
        if pseudo_coords:
            x, y, z = atom.xyz()
        else:
            x, y = atom.coords()
        coords.append([x, y])
        symbols.append(atom.symbol())
        index_map[atom.index()] = i
    if pseudo_coords:
        coords = normalize_nodes(np.array(coords))
        h, w, _ = image.shape
        coords[:, 0] = coords[:, 0] * w
        coords[:, 1] = coords[:, 1] * h
    n = len(symbols)
    edges = np.zeros((n, n), dtype=int)
    for bond in mol.iterateBonds():
        s = index_map[bond.source().index()]
        t = index_map[bond.destination().index()]
        # 1/2/3/4 : single/double/triple/aromatic
        edges[s, t] = bond.bondOrder()
        edges[t, s] = bond.bondOrder()
        if bond.bondStereo() in [5, 6]:
            edges[s, t] = bond.bondStereo()
            edges[t, s] = 11 - bond.bondStereo()
    graph = {
        'coords': coords,
        'symbols': symbols,
        'edges': edges,
        'num_atoms': len(symbols)
    }
    return graph


def generate_indigo_image(smiles, mol_augment=True, default_option=False, shuffle_nodes=False, pseudo_coords=False,
                          include_condensed=True, debug=False):
    indigo = Indigo()
    renderer = IndigoRenderer(indigo)
    indigo.setOption('render-output-format', 'png')
    indigo.setOption('render-background-color', '1,1,1')
    indigo.setOption('render-stereo-style', 'none')
    indigo.setOption('render-label-mode', 'hetero')
    indigo.setOption('render-font-family', 'Arial')
    if not default_option:
        thickness = random.uniform(0.5, 2)  # limit the sum of the following two parameters to be smaller than 4
        indigo.setOption('render-relative-thickness', thickness)
        indigo.setOption('render-bond-line-width', random.uniform(1, 4 - thickness))
        if random.random() < 0.5:
            indigo.setOption('render-font-family', random.choice(['Arial', 'Times', 'Courier', 'Helvetica']))
        indigo.setOption('render-label-mode', random.choice(['hetero', 'terminal-hetero']))
        indigo.setOption('render-implicit-hydrogens-visible', random.choice([True, False]))
        if random.random() < 0.1:
            indigo.setOption('render-stereo-style', 'old')
        if random.random() < 0.2:
            indigo.setOption('render-atom-ids-visible', True)

    try:
        mol = indigo.loadMolecule(smiles)
        if mol_augment:
            if random.random() < INDIGO_DEAROMATIZE_PROB:
                mol.dearomatize()
            else:
                mol.aromatize()
            smiles = mol.canonicalSmiles()
            add_comment(indigo)
            mol = add_explicit_hydrogen(indigo, mol)
            mol = add_rgroup(indigo, mol, smiles)
            if include_condensed:
                mol = add_rand_condensed(indigo, mol)
            mol = add_functional_group(indigo, mol, debug)
            mol = add_color(indigo, mol)
            mol, smiles = generate_output_smiles(indigo, mol)

        buf = renderer.renderToBuffer(mol)
        img = cv2.imdecode(np.asarray(bytearray(buf), dtype=np.uint8), 1)  # decode buffer to image
        # img = np.repeat(np.expand_dims(img, 2), 3, axis=2)  # expand to RGB
        graph = get_graph(mol, img, shuffle_nodes, pseudo_coords)
        success = True
    except Exception:
        if debug:
            raise
        img = np.array([[[255., 255., 255.]] * 10] * 10).astype(np.float32)
        graph = {}
        success = False
    return img, smiles, graph, success


class TrainDataset(Dataset):

    def __init__(self, args, df, tokenizer, split='train', dynamic_indigo=False):
        super().__init__()
        self.df = df
        self.args = args
        self.tokenizer = tokenizer
        if 'file_path' in df.columns:
            self.file_paths = df['file_path'].values
            if not self.file_paths[0].startswith(args.data_path):
                self.file_paths = [os.path.join(args.data_path, path) for path in df['file_path']]
        self.smiles = df['SMILES'].values if 'SMILES' in df.columns else None
        self.formats = args.formats
        self.labelled = (split == 'train')
        if self.labelled:
            self.labels = {}
            for format_ in self.formats:
                if format_ in ['atomtok', 'inchi']:
                    field = FORMAT_INFO[format_]['name']
                    if field in df.columns:
                        self.labels[format_] = df[field].values
        self.transform = get_transforms(args.input_size,
                                        augment=(self.labelled and args.augment))
        # self.fix_transform = A.Compose([A.Transpose(p=1), A.VerticalFlip(p=1)])
        self.dynamic_indigo = (dynamic_indigo and split == 'train')
        if self.labelled and not dynamic_indigo and args.coords_file is not None:
            if args.coords_file == 'aux_file':
                self.coords_df = df
                self.pseudo_coords = True
            else:
                self.coords_df = pd.read_csv(args.coords_file)
                self.pseudo_coords = False
        else:
            self.coords_df = None
            self.pseudo_coords = args.pseudo_coords

    def __len__(self):
        return len(self.df)

    def image_transform(self, image, coords=[], renormalize=False):
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # .astype(np.float32)
        augmented = self.transform(image=image, keypoints=coords)
        image = augmented['image']
        if len(coords) > 0:
            coords = np.array(augmented['keypoints'])
            if renormalize:
                coords = normalize_nodes(coords, flip_y=False)
            else:
                _, height, width = image.shape
                coords[:, 0] = coords[:, 0] / width
                coords[:, 1] = coords[:, 1] / height
            coords = np.array(coords).clip(0, 1)
            return image, coords
        return image

    def __getitem__(self, idx):
        try:
            return self.getitem(idx)
        except Exception as e:
            with open(os.path.join(self.args.save_path, f'error_dataset_{int(time.time())}.log'), 'w') as f:
                f.write(str(e))
            raise e

    def getitem(self, idx):
        ref = {}
        if self.dynamic_indigo:
            begin = time.time()
            image, smiles, graph, success = generate_indigo_image(
                self.smiles[idx], mol_augment=self.args.mol_augment, default_option=self.args.default_option,
                shuffle_nodes=self.args.shuffle_nodes, pseudo_coords=self.pseudo_coords,
                include_condensed=self.args.include_condensed)
            # raw_image = image
            end = time.time()
            if idx < 30 and self.args.save_image:
                path = os.path.join(self.args.save_path, 'images')
                os.makedirs(path, exist_ok=True)
                cv2.imwrite(os.path.join(path, f'{idx}.png'), image)
            if not success:
                return idx, None, {}
            image, coords = self.image_transform(image, graph['coords'], renormalize=self.pseudo_coords)
            graph['coords'] = coords
            ref['time'] = end - begin
            if 'atomtok' in self.formats:
                max_len = FORMAT_INFO['atomtok']['max_len']
                label = self.tokenizer['atomtok'].text_to_sequence(smiles, tokenized=False)
                ref['atomtok'] = torch.LongTensor(label[:max_len])
            if 'edges' in self.formats and 'atomtok_coords' not in self.formats and 'chartok_coords' not in self.formats:
                ref['edges'] = torch.tensor(graph['edges'])
            if 'atomtok_coords' in self.formats:
                self._process_atomtok_coords(idx, ref, smiles, graph['coords'], graph['edges'],
                                             mask_ratio=self.args.mask_ratio)
            if 'chartok_coords' in self.formats:
                self._process_chartok_coords(idx, ref, smiles, graph['coords'], graph['edges'],
                                             mask_ratio=self.args.mask_ratio)
            return idx, image, ref
        else:
            file_path = self.file_paths[idx]
            image = cv2.imread(file_path)
            if image is None:
                image = np.array([[[255., 255., 255.]] * 10] * 10).astype(np.float32)
                print(file_path, 'not found!')
            if self.coords_df is not None:
                h, w, _ = image.shape
                coords = np.array(eval(self.coords_df.loc[idx, 'node_coords']))
                if self.pseudo_coords:
                    coords = normalize_nodes(coords)
                coords[:, 0] = coords[:, 0] * w
                coords[:, 1] = coords[:, 1] * h
                image, coords = self.image_transform(image, coords, renormalize=self.pseudo_coords)
            else:
                image = self.image_transform(image)
                coords = None
            if self.labelled:
                smiles = self.smiles[idx]
                if 'atomtok' in self.formats:
                    max_len = FORMAT_INFO['atomtok']['max_len']
                    label = self.tokenizer['atomtok'].text_to_sequence(smiles, False)
                    ref['atomtok'] = torch.LongTensor(label[:max_len])
                if 'atomtok_coords' in self.formats:
                    if coords is not None:
                        self._process_atomtok_coords(idx, ref, smiles, coords, mask_ratio=0)
                    else:
                        self._process_atomtok_coords(idx, ref, smiles, mask_ratio=1)
                if 'chartok_coords' in self.formats:
                    if coords is not None:
                        self._process_chartok_coords(idx, ref, smiles, coords, mask_ratio=0)
                    else:
                        self._process_chartok_coords(idx, ref, smiles, mask_ratio=1)
            if self.args.predict_coords and ('atomtok_coords' in self.formats or 'chartok_coords' in self.formats):
                smiles = self.smiles[idx]
                if 'atomtok_coords' in self.formats:
                    self._process_atomtok_coords(idx, ref, smiles, mask_ratio=1)
                if 'chartok_coords' in self.formats:
                    self._process_chartok_coords(idx, ref, smiles, mask_ratio=1)
            return idx, image, ref

    def _process_atomtok_coords(self, idx, ref, smiles, coords=None, edges=None, mask_ratio=0):
        max_len = FORMAT_INFO['atomtok_coords']['max_len']
        tokenizer = self.tokenizer['atomtok_coords']
        if smiles is None or type(smiles) is not str:
            smiles = ""
        label, indices = tokenizer.smiles_to_sequence(smiles, coords, mask_ratio=mask_ratio)
        ref['atomtok_coords'] = torch.LongTensor(label[:max_len])
        indices = [i for i in indices if i < max_len]
        ref['atom_indices'] = torch.LongTensor(indices)
        if tokenizer.continuous_coords:
            if coords is not None:
                ref['coords'] = torch.tensor(coords)
            else:
                ref['coords'] = torch.ones(len(indices), 2) * -1.
        if edges is not None:
            ref['edges'] = torch.tensor(edges)[:len(indices), :len(indices)]
        else:
            if 'edges' in self.df.columns:
                edge_list = eval(self.df.loc[idx, 'edges'])
                n = len(indices)
                edges = torch.zeros((n, n), dtype=torch.long)
                for u, v, t in edge_list:
                    if u < n and v < n:
                        if t <= 4:
                            edges[u, v] = t
                            edges[v, u] = t
                        else:
                            edges[u, v] = t
                            edges[v, u] = 11 - t
                ref['edges'] = edges
            else:
                ref['edges'] = torch.ones(len(indices), len(indices), dtype=torch.long) * (-100)

    def _process_chartok_coords(self, idx, ref, smiles, coords=None, edges=None, mask_ratio=0):
        max_len = FORMAT_INFO['chartok_coords']['max_len']
        tokenizer = self.tokenizer['chartok_coords']
        if smiles is None or type(smiles) is not str:
            smiles = ""
        label, indices = tokenizer.smiles_to_sequence(smiles, coords, mask_ratio=mask_ratio)
        ref['chartok_coords'] = torch.LongTensor(label[:max_len])
        indices = [i for i in indices if i < max_len]
        ref['atom_indices'] = torch.LongTensor(indices)
        if tokenizer.continuous_coords:
            if coords is not None:
                ref['coords'] = torch.tensor(coords)
            else:
                ref['coords'] = torch.ones(len(indices), 2) * -1.
        if edges is not None:
            ref['edges'] = torch.tensor(edges)[:len(indices), :len(indices)]
        else:
            if 'edges' in self.df.columns:
                edge_list = eval(self.df.loc[idx, 'edges'])
                n = len(indices)
                edges = torch.zeros((n, n), dtype=torch.long)
                for u, v, t in edge_list:
                    if u < n and v < n:
                        if t <= 4:
                            edges[u, v] = t
                            edges[v, u] = t
                        else:
                            edges[u, v] = t
                            edges[v, u] = 11 - t
                ref['edges'] = edges
            else:
                ref['edges'] = torch.ones(len(indices), len(indices), dtype=torch.long) * (-100)


class AuxTrainDataset(Dataset):

    def __init__(self, args, train_df, aux_df, tokenizer):
        super().__init__()
        self.train_dataset = TrainDataset(args, train_df, tokenizer, dynamic_indigo=args.dynamic_indigo)
        self.aux_dataset = TrainDataset(args, aux_df, tokenizer, dynamic_indigo=False)

    def __len__(self):
        return len(self.train_dataset) + len(self.aux_dataset)

    def __getitem__(self, idx):
        if idx < len(self.train_dataset):
            return self.train_dataset[idx]
        else:
            return self.aux_dataset[idx - len(self.train_dataset)]


def pad_images(imgs):
    # B, C, H, W
    max_shape = [0, 0]
    for img in imgs:
        for i in range(len(max_shape)):
            max_shape[i] = max(max_shape[i], img.shape[-1 - i])
    stack = []
    for img in imgs:
        pad = []
        for i in range(len(max_shape)):
            pad = pad + [0, max_shape[i] - img.shape[-1 - i]]
        stack.append(F.pad(img, pad, value=0))
    return torch.stack(stack)


def bms_collate(batch):
    ids = []
    imgs = []
    batch = [ex for ex in batch if ex[1] is not None]
    formats = list(batch[0][2].keys())
    seq_formats = [k for k in formats if
                   k in ['atomtok', 'inchi', 'nodes', 'atomtok_coords', 'chartok_coords', 'atom_indices']]
    refs = {key: [[], []] for key in seq_formats}
    for ex in batch:
        ids.append(ex[0])
        imgs.append(ex[1])
        ref = ex[2]
        for key in seq_formats:
            refs[key][0].append(ref[key])
            refs[key][1].append(torch.LongTensor([len(ref[key])]))
    # Sequence
    for key in seq_formats:
        # this padding should work for atomtok_with_coords too, each of which has shape (length, 4)
        refs[key][0] = pad_sequence(refs[key][0], batch_first=True, padding_value=PAD_ID)
        refs[key][1] = torch.stack(refs[key][1]).reshape(-1, 1)
    # Time
    # if 'time' in formats:
    #     refs['time'] = [ex[2]['time'] for ex in batch]
    # Coords
    if 'coords' in formats:
        refs['coords'] = pad_sequence([ex[2]['coords'] for ex in batch], batch_first=True, padding_value=-1.)
    # Edges
    if 'edges' in formats:
        edges_list = [ex[2]['edges'] for ex in batch]
        max_len = max([len(edges) for edges in edges_list])
        refs['edges'] = torch.stack(
            [F.pad(edges, (0, max_len - len(edges), 0, max_len - len(edges)), value=-100) for edges in edges_list],
            dim=0)
    return ids, pad_images(imgs), refs
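A hypothetical training-loader wiring for TrainDataset with bms_collate; `args`, `df` and `tokenizer` stand in for the real training config, data frame and tokenizer dict:

    loader = DataLoader(
        TrainDataset(args, df, tokenizer, split='train', dynamic_indigo=True),
        batch_size=32, shuffle=True, num_workers=4, collate_fn=bms_collate)
    for ids, images, refs in loader:
        # images: (B, C, H, W) after pad_images; refs holds padded token sequences,
        # coords padded with -1, and edge matrices padded with -100
        break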
molscribe/evaluate.py ADDED
@@ -0,0 +1,79 @@
import numpy as np
import multiprocessing

import rdkit
import rdkit.Chem as Chem
rdkit.RDLogger.DisableLog('rdApp.*')
from SmilesPE.pretokenizer import atomwise_tokenizer


def canonicalize_smiles(smiles, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True):
    if type(smiles) is not str or smiles == '':
        return '', False
    if ignore_cistrans:
        smiles = smiles.replace('/', '').replace('\\', '')
    if replace_rgroup:
        tokens = atomwise_tokenizer(smiles)
        for j, token in enumerate(tokens):
            if token[0] == '[' and token[-1] == ']':
                symbol = token[1:-1]
                if symbol[0] == 'R' and symbol[1:].isdigit():
                    tokens[j] = f'[{symbol[1:]}*]'
                elif Chem.AtomFromSmiles(token) is None:
                    tokens[j] = '*'
        smiles = ''.join(tokens)
    try:
        canon_smiles = Chem.CanonSmiles(smiles, useChiral=(not ignore_chiral))
        success = True
    except Exception:
        canon_smiles = smiles
        success = False
    return canon_smiles, success


def convert_smiles_to_canonsmiles(
        smiles_list, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True, num_workers=16):
    with multiprocessing.Pool(num_workers) as p:
        results = p.starmap(canonicalize_smiles,
                            [(smiles, ignore_chiral, ignore_cistrans, replace_rgroup) for smiles in smiles_list],
                            chunksize=128)
    canon_smiles, success = zip(*results)
    return list(canon_smiles), np.mean(success)


class SmilesEvaluator(object):

    def __init__(self, gold_smiles, num_workers=16):
        self.gold_smiles = gold_smiles
        self.gold_canon_smiles, self.gold_valid = convert_smiles_to_canonsmiles(gold_smiles, num_workers=num_workers)
        self.gold_smiles_chiral, _ = convert_smiles_to_canonsmiles(gold_smiles,
                                                                   ignore_chiral=True, num_workers=num_workers)
        self.gold_smiles_cistrans, _ = convert_smiles_to_canonsmiles(gold_smiles,
                                                                     ignore_cistrans=True, num_workers=num_workers)
        self.gold_canon_smiles = self._replace_empty(self.gold_canon_smiles)
        self.gold_smiles_chiral = self._replace_empty(self.gold_smiles_chiral)
        self.gold_smiles_cistrans = self._replace_empty(self.gold_smiles_cistrans)

    def _replace_empty(self, smiles_list):
        """Replace empty SMILES in the gold; otherwise a prediction would be counted
        as correct whenever both the prediction and the gold are empty."""
        return [smiles if smiles is not None and type(smiles) is str and smiles != "" else "<empty>"
                for smiles in smiles_list]

    def evaluate(self, pred_smiles):
        results = {}
        results['gold_valid'] = self.gold_valid
        # Canonical SMILES
        pred_canon_smiles, pred_valid = convert_smiles_to_canonsmiles(pred_smiles)
        results['canon_smiles_em'] = (np.array(self.gold_canon_smiles) == np.array(pred_canon_smiles)).mean()
        results['pred_valid'] = pred_valid
        # Ignore chirality (graph exact match)
        pred_smiles_chiral, _ = convert_smiles_to_canonsmiles(pred_smiles, ignore_chiral=True)
        results['graph'] = (np.array(self.gold_smiles_chiral) == np.array(pred_smiles_chiral)).mean()
        # Ignore double-bond cis/trans
        pred_smiles_cistrans, _ = convert_smiles_to_canonsmiles(pred_smiles, ignore_cistrans=True)
        results['canon_smiles'] = (np.array(self.gold_smiles_cistrans) == np.array(pred_smiles_cistrans)).mean()
        # Evaluate on molecules with chiral centers
        chiral = np.array([[g, p] for g, p in zip(self.gold_smiles_cistrans, pred_smiles_cistrans) if '@' in g])
        results['chiral_ratio'] = len(chiral) / len(self.gold_smiles)
        results['chiral'] = (chiral[:, 0] == chiral[:, 1]).mean() if len(chiral) > 0 else -1
        return results
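A small usage sketch of SmilesEvaluator with toy SMILES:

    evaluator = SmilesEvaluator(['CCO', 'C[C@H](N)C(=O)O'], num_workers=2)
    scores = evaluator.evaluate(['CCO', 'CC(N)C(=O)O'])
    # scores['canon_smiles'] ignores cis/trans, scores['graph'] also ignores chirality,
    # and scores['chiral'] is accuracy on the chiral subset (-1 if the gold has none)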
molscribe/indigo/__init__.py ADDED
The diff for this file is too large to render. See raw diff
 
molscribe/indigo/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (96.8 kB).
 
molscribe/indigo/__pycache__/bingo.cpython-310.pyc ADDED
Binary file (11.4 kB).
 
molscribe/indigo/__pycache__/inchi.cpython-310.pyc ADDED
Binary file (2.73 kB).
 
molscribe/indigo/__pycache__/renderer.cpython-310.pyc ADDED
Binary file (2.8 kB).
 
molscribe/indigo/bingo.py ADDED
@@ -0,0 +1,334 @@
#
# Copyright (C) from 2009 to Present EPAM Systems.
#
# This file is part of Indigo toolkit.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from . import *


class BingoException(Exception):

    def __init__(self, value):
        self.value = value

    def __str__(self):
        if sys.version_info > (3, 0):
            return repr(self.value.decode('ascii'))
        else:
            return repr(self.value)


class Bingo(object):
    def __init__(self, bingoId, indigo, lib):
        self._id = bingoId
        self._indigo = indigo
        self._lib = lib
        self._lib.bingoVersion.restype = c_char_p
        self._lib.bingoVersion.argtypes = None
        self._lib.bingoCreateDatabaseFile.restype = c_int
        self._lib.bingoCreateDatabaseFile.argtypes = [c_char_p, c_char_p, c_char_p]
        self._lib.bingoLoadDatabaseFile.restype = c_int
        self._lib.bingoLoadDatabaseFile.argtypes = [c_char_p, c_char_p]
        self._lib.bingoCloseDatabase.restype = c_int
        self._lib.bingoCloseDatabase.argtypes = [c_int]
        self._lib.bingoInsertRecordObj.restype = c_int
        self._lib.bingoInsertRecordObj.argtypes = [c_int, c_int]
        self._lib.bingoInsertRecordObjWithExtFP.restype = c_int
        self._lib.bingoInsertRecordObjWithExtFP.argtypes = [c_int, c_int, c_int]
        self._lib.bingoGetRecordObj.restype = c_int
        self._lib.bingoGetRecordObj.argtypes = [c_int, c_int]
        self._lib.bingoInsertRecordObjWithId.restype = c_int
        self._lib.bingoInsertRecordObjWithId.argtypes = [c_int, c_int, c_int]
        self._lib.bingoInsertRecordObjWithIdAndExtFP.restype = c_int
        self._lib.bingoInsertRecordObjWithIdAndExtFP.argtypes = [c_int, c_int, c_int, c_int]
        self._lib.bingoDeleteRecord.restype = c_int
        self._lib.bingoDeleteRecord.argtypes = [c_int, c_int]
        self._lib.bingoSearchSub.restype = c_int
        self._lib.bingoSearchSub.argtypes = [c_int, c_int, c_char_p]
        self._lib.bingoSearchExact.restype = c_int
        self._lib.bingoSearchExact.argtypes = [c_int, c_int, c_char_p]
        self._lib.bingoSearchMolFormula.restype = c_int
        self._lib.bingoSearchMolFormula.argtypes = [c_int, c_char_p, c_char_p]
        self._lib.bingoSearchSim.restype = c_int
        self._lib.bingoSearchSim.argtypes = [c_int, c_int, c_float, c_float, c_char_p]
        self._lib.bingoSearchSimWithExtFP.restype = c_int
        self._lib.bingoSearchSimWithExtFP.argtypes = [c_int, c_int, c_float, c_float, c_int, c_char_p]
        self._lib.bingoSearchSimTopN.restype = c_int
        self._lib.bingoSearchSimTopN.argtypes = [c_int, c_int, c_int, c_float, c_char_p]
        self._lib.bingoSearchSimTopNWithExtFP.restype = c_int
        self._lib.bingoSearchSimTopNWithExtFP.argtypes = [c_int, c_int, c_int, c_float, c_int, c_char_p]
        self._lib.bingoEnumerateId.restype = c_int
        self._lib.bingoEnumerateId.argtypes = [c_int]
        self._lib.bingoNext.restype = c_int
        self._lib.bingoNext.argtypes = [c_int]
        self._lib.bingoGetCurrentId.restype = c_int
        self._lib.bingoGetCurrentId.argtypes = [c_int]
        self._lib.bingoGetObject.restype = c_int
        self._lib.bingoGetObject.argtypes = [c_int]
        self._lib.bingoEndSearch.restype = c_int
        self._lib.bingoEndSearch.argtypes = [c_int]
        self._lib.bingoGetCurrentSimilarityValue.restype = c_float
        self._lib.bingoGetCurrentSimilarityValue.argtypes = [c_int]
        self._lib.bingoOptimize.restype = c_int
        self._lib.bingoOptimize.argtypes = [c_int]
        self._lib.bingoEstimateRemainingResultsCount.restype = c_int
        self._lib.bingoEstimateRemainingResultsCount.argtypes = [c_int]
        self._lib.bingoEstimateRemainingResultsCountError.restype = c_int
        self._lib.bingoEstimateRemainingResultsCountError.argtypes = [c_int]
        self._lib.bingoEstimateRemainingTime.restype = c_int
        self._lib.bingoEstimateRemainingTime.argtypes = [c_int, POINTER(c_float)]
        self._lib.bingoContainersCount.restype = c_int
        self._lib.bingoContainersCount.argtypes = [c_int]
        self._lib.bingoCellsCount.restype = c_int
        self._lib.bingoCellsCount.argtypes = [c_int]
        self._lib.bingoCurrentCell.restype = c_int
        self._lib.bingoCurrentCell.argtypes = [c_int]
        self._lib.bingoMinCell.restype = c_int
        self._lib.bingoMinCell.argtypes = [c_int]
        self._lib.bingoMaxCell.restype = c_int
        self._lib.bingoMaxCell.argtypes = [c_int]

    def __del__(self):
        self.close()

    def close(self):
        self._indigo._setSessionId()
        if self._id >= 0:
            Bingo._checkResult(self._indigo, self._lib.bingoCloseDatabase(self._id))
            self._id = -1

    @staticmethod
    def _checkResult(indigo, result):
        if result < 0:
            raise BingoException(indigo._lib.indigoGetLastError())
        return result

    @staticmethod
    def _checkResultPtr(indigo, result):
        if result is None:
            raise BingoException(indigo._lib.indigoGetLastError())
        return result

    @staticmethod
    def _checkResultString(indigo, result):
        res = Bingo._checkResultPtr(indigo, result)
        if sys.version_info >= (3, 0):
            return res.decode('ascii')
        else:
            return res.encode('ascii')

    @staticmethod
    def _getLib(indigo):
        if os.name == 'posix' and not platform.mac_ver()[0] and not platform.system().startswith("CYGWIN"):
            _lib = CDLL(indigo.dllpath + "/libbingo.so")
        elif os.name == 'nt' or platform.system().startswith("CYGWIN"):
            _lib = CDLL(indigo.dllpath + "/bingo.dll")
        elif platform.mac_ver()[0]:
            _lib = CDLL(indigo.dllpath + "/libbingo.dylib")
        else:
            raise BingoException("unsupported OS: " + os.name)
        return _lib

    @staticmethod
    def createDatabaseFile(indigo, path, databaseType, options=''):
        indigo._setSessionId()
        if not options:
            options = ''
        lib = Bingo._getLib(indigo)
        lib.bingoCreateDatabaseFile.restype = c_int
        lib.bingoCreateDatabaseFile.argtypes = [c_char_p, c_char_p, c_char_p]
        return Bingo(Bingo._checkResult(indigo, lib.bingoCreateDatabaseFile(
            path.encode('ascii'), databaseType.encode('ascii'), options.encode('ascii'))), indigo, lib)

    @staticmethod
    def loadDatabaseFile(indigo, path, options=''):
        indigo._setSessionId()
        if not options:
            options = ''
        lib = Bingo._getLib(indigo)
        lib.bingoLoadDatabaseFile.restype = c_int
        lib.bingoLoadDatabaseFile.argtypes = [c_char_p, c_char_p]
        return Bingo(Bingo._checkResult(indigo, lib.bingoLoadDatabaseFile(
            path.encode('ascii'), options.encode('ascii'))), indigo, lib)

    def version(self):
        self._indigo._setSessionId()
        return Bingo._checkResultString(self._indigo, self._lib.bingoVersion())

    def insert(self, indigoObject, index=None):
        self._indigo._setSessionId()
        if not index:
            return Bingo._checkResult(self._indigo, self._lib.bingoInsertRecordObj(self._id, indigoObject.id))
        else:
            return Bingo._checkResult(self._indigo,
                                      self._lib.bingoInsertRecordObjWithId(self._id, indigoObject.id, index))

    def insertWithExtFP(self, indigoObject, ext_fp, index=None):
        self._indigo._setSessionId()
        if not index:
            return Bingo._checkResult(self._indigo,
                                      self._lib.bingoInsertRecordObjWithExtFP(self._id, indigoObject.id, ext_fp.id))
        else:
            return Bingo._checkResult(self._indigo,
                                      self._lib.bingoInsertRecordObjWithIdAndExtFP(
                                          self._id, indigoObject.id, index, ext_fp.id))

    def delete(self, index):
        self._indigo._setSessionId()
        Bingo._checkResult(self._indigo, self._lib.bingoDeleteRecord(self._id, index))

    def searchSub(self, query, options=''):
        self._indigo._setSessionId()
        if not options:
            options = ''
        return BingoObject(Bingo._checkResult(self._indigo,
                                              self._lib.bingoSearchSub(self._id, query.id, options.encode('ascii'))),
                           self._indigo, self)

    def searchExact(self, query, options=''):
        self._indigo._setSessionId()
        if not options:
            options = ''
        return BingoObject(Bingo._checkResult(self._indigo,
                                              self._lib.bingoSearchExact(self._id, query.id, options.encode('ascii'))),
                           self._indigo, self)

    def searchSim(self, query, minSim, maxSim, metric='tanimoto'):
        self._indigo._setSessionId()
        if not metric:
            metric = 'tanimoto'
        return BingoObject(
            Bingo._checkResult(self._indigo,
                               self._lib.bingoSearchSim(self._id, query.id, minSim, maxSim, metric.encode('ascii'))),
            self._indigo, self)

    def searchSimWithExtFP(self, query, minSim, maxSim, ext_fp, metric='tanimoto'):
        self._indigo._setSessionId()
        if not metric:
            metric = 'tanimoto'
        return BingoObject(
            Bingo._checkResult(self._indigo,
                               self._lib.bingoSearchSimWithExtFP(self._id, query.id, minSim, maxSim, ext_fp.id,
                                                                 metric.encode('ascii'))),
            self._indigo, self)

    def searchSimTopN(self, query, limit, minSim, metric='tanimoto'):
        self._indigo._setSessionId()
        if not metric:
            metric = 'tanimoto'
        return BingoObject(
            Bingo._checkResult(self._indigo,
                               self._lib.bingoSearchSimTopN(self._id, query.id, limit, minSim,
                                                            metric.encode('ascii'))),
            self._indigo, self)

    def searchSimTopNWithExtFP(self, query, limit, minSim, ext_fp, metric='tanimoto'):
        self._indigo._setSessionId()
        if not metric:
            metric = 'tanimoto'
        return BingoObject(
            Bingo._checkResult(self._indigo,
                               self._lib.bingoSearchSimTopNWithExtFP(self._id, query.id, limit, minSim, ext_fp.id,
                                                                     metric.encode('ascii'))),
            self._indigo, self)

    def enumerateId(self):
        self._indigo._setSessionId()
        e = self._lib.bingoEnumerateId(self._id)
        result = Bingo._checkResult(self._indigo, e)
        return BingoObject(result, self._indigo, self)

    def searchMolFormula(self, query, options=''):
        self._indigo._setSessionId()
        if not options:
            options = ''
        return BingoObject(Bingo._checkResult(self._indigo,
                                              self._lib.bingoSearchMolFormula(self._id, query.encode('ascii'),
                                                                              options.encode('ascii'))),
                           self._indigo, self)

    def optimize(self):
        self._indigo._setSessionId()
        Bingo._checkResult(self._indigo, self._lib.bingoOptimize(self._id))

    def getRecordById(self, id):
        self._indigo._setSessionId()
        return IndigoObject(self._indigo, Bingo._checkResult(self._indigo, self._lib.bingoGetRecordObj(self._id, id)))


class BingoObject(object):
    def __init__(self, objId, indigo, bingo):
        self._id = objId
        self._indigo = indigo
        self._bingo = bingo

    def __del__(self):
        self.close()

    def close(self):
        self._indigo._setSessionId()
        if self._id >= 0:
            Bingo._checkResult(self._indigo, self._bingo._lib.bingoEndSearch(self._id))
            self._id = -1

    def next(self):
        self._indigo._setSessionId()
        return (Bingo._checkResult(self._indigo, self._bingo._lib.bingoNext(self._id)) == 1)

    def getCurrentId(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoGetCurrentId(self._id))

    def getIndigoObject(self):
        self._indigo._setSessionId()
        return IndigoObject(self._indigo, Bingo._checkResult(self._indigo, self._bingo._lib.bingoGetObject(self._id)))

    def getCurrentSimilarityValue(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoGetCurrentSimilarityValue(self._id))

    def estimateRemainingResultsCount(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoEstimateRemainingResultsCount(self._id))

    def estimateRemainingResultsCountError(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoEstimateRemainingResultsCountError(self._id))

    def estimateRemainingTime(self):
        self._indigo._setSessionId()
        value = c_float()
        Bingo._checkResult(self._indigo, self._bingo._lib.bingoEstimateRemainingTime(self._id, pointer(value)))
        return value.value

    def containersCount(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoContainersCount(self._id))

    def cellsCount(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoCellsCount(self._id))

    def currentCell(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoCurrentCell(self._id))

    def minCell(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoMinCell(self._id))

    def maxCell(self):
        self._indigo._setSessionId()
        return Bingo._checkResult(self._indigo, self._bingo._lib.bingoMaxCell(self._id))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        next_item = self.next()
        if next_item:
            return self
        raise StopIteration
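A minimal end-to-end sketch of the Bingo wrapper, assuming an Indigo session and the bundled native libraries are available ('molecule' is assumed as the database type):

    indigo = Indigo()
    bingo = Bingo.createDatabaseFile(indigo, 'mols.db', 'molecule')
    bingo.insert(indigo.loadMolecule('c1ccccc1O'))
    with bingo.searchSub(indigo.loadQueryMolecule('c1ccccc1'), '') as result:
        while result.next():
            print(result.getCurrentId())
    bingo.close()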
molscribe/indigo/inchi.py ADDED
@@ -0,0 +1,84 @@
#
# Copyright (C) from 2009 to Present EPAM Systems.
#
# This file is part of Indigo toolkit.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from . import *


class IndigoInchi(object):

    def __init__(self, indigo):
        self.indigo = indigo

        if os.name == 'posix' and not platform.mac_ver()[0] and not platform.system().startswith("CYGWIN"):
            self._lib = CDLL(indigo.dllpath + "/libindigo-inchi.so")
        elif os.name == 'nt' or platform.system().startswith("CYGWIN"):
            self._lib = CDLL(indigo.dllpath + "\\indigo-inchi.dll")
        elif platform.mac_ver()[0]:
            self._lib = CDLL(indigo.dllpath + "/libindigo-inchi.dylib")
        else:
            raise IndigoException("unsupported OS: " + os.name)

        self._lib.indigoInchiVersion.restype = c_char_p
        self._lib.indigoInchiVersion.argtypes = []
        self._lib.indigoInchiResetOptions.restype = c_int
        self._lib.indigoInchiResetOptions.argtypes = []
        self._lib.indigoInchiLoadMolecule.restype = c_int
        self._lib.indigoInchiLoadMolecule.argtypes = [c_char_p]
        self._lib.indigoInchiGetInchi.restype = c_char_p
        self._lib.indigoInchiGetInchi.argtypes = [c_int]
        self._lib.indigoInchiGetInchiKey.restype = c_char_p
        self._lib.indigoInchiGetInchiKey.argtypes = [c_char_p]
        self._lib.indigoInchiGetWarning.restype = c_char_p
        self._lib.indigoInchiGetWarning.argtypes = []
        self._lib.indigoInchiGetLog.restype = c_char_p
        self._lib.indigoInchiGetLog.argtypes = []
        self._lib.indigoInchiGetAuxInfo.restype = c_char_p
        self._lib.indigoInchiGetAuxInfo.argtypes = []

    def resetOptions(self):
        self.indigo._setSessionId()
        self.indigo._checkResult(self._lib.indigoInchiResetOptions())

    def loadMolecule(self, inchi):
        self.indigo._setSessionId()
        res = self.indigo._checkResult(self._lib.indigoInchiLoadMolecule(inchi.encode('ascii')))
        if res == 0:
            return None
        return self.indigo.IndigoObject(self.indigo, res)

    def version(self):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiVersion())

    def getInchi(self, molecule):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiGetInchi(molecule.id))

    def getInchiKey(self, inchi):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiGetInchiKey(inchi.encode('ascii')))

    def getWarning(self):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiGetWarning())

    def getLog(self):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiGetLog())

    def getAuxInfo(self):
        self.indigo._setSessionId()
        return self.indigo._checkResultString(self._lib.indigoInchiGetAuxInfo())
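A short round-trip sketch for IndigoInchi, using the standard InChI string for ethanol:

    indigo = Indigo()
    inchi = IndigoInchi(indigo)
    mol = inchi.loadMolecule('InChI=1S/C2H6O/c1-2-3/h3H,1-2H3')
    print(inchi.getInchi(mol))                                   # regenerate the InChI
    print(inchi.getInchiKey('InChI=1S/C2H6O/c1-2-3/h3H,1-2H3'))  # hash key for lookups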
molscribe/indigo/renderer.py ADDED
@@ -0,0 +1,113 @@
#
# Copyright (C) from 2009 to Present EPAM Systems.
#
# This file is part of Indigo toolkit.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import platform
from ctypes import CDLL, POINTER, c_char_p, c_int

from . import IndigoException


class IndigoRenderer(object):
    def __init__(self, indigo):
        self.indigo = indigo

        if (
            os.name == "posix"
            and not platform.mac_ver()[0]
            and not platform.system().startswith("CYGWIN")
        ):
            self._lib = CDLL(indigo.dllpath + "/libindigo-renderer.so")
        elif os.name == "nt" or platform.system().startswith("CYGWIN"):
            self._lib = CDLL(indigo.dllpath + "\\indigo-renderer.dll")
        elif platform.mac_ver()[0]:
            self._lib = CDLL(indigo.dllpath + "/libindigo-renderer.dylib")
        else:
            raise IndigoException("unsupported OS: " + os.name)

        self._lib.indigoRender.restype = c_int
        self._lib.indigoRender.argtypes = [c_int, c_int]
        self._lib.indigoRenderToFile.restype = c_int
        self._lib.indigoRenderToFile.argtypes = [c_int, c_char_p]
        self._lib.indigoRenderGrid.restype = c_int
        self._lib.indigoRenderGrid.argtypes = [
            c_int,
            POINTER(c_int),
            c_int,
            c_int,
        ]
        self._lib.indigoRenderGridToFile.restype = c_int
        self._lib.indigoRenderGridToFile.argtypes = [
            c_int,
            POINTER(c_int),
            c_int,
            c_char_p,
        ]
        self._lib.indigoRenderReset.restype = c_int
        self._lib.indigoRenderReset.argtypes = [c_int]

    def renderToBuffer(self, obj):
        self.indigo._setSessionId()
        wb = self.indigo.writeBuffer()
        try:
            self.indigo._checkResult(self._lib.indigoRender(obj.id, wb.id))
            return wb.toBuffer()
        finally:
            wb.dispose()

    def renderToFile(self, obj, filename):
        self.indigo._setSessionId()
        self.indigo._checkResult(
            self._lib.indigoRenderToFile(obj.id, filename.encode("ascii"))
        )

    def renderGridToFile(self, objects, refatoms, ncolumns, filename):
        self.indigo._setSessionId()
        arr = None
        if refatoms:
            if len(refatoms) != objects.count():
                raise IndigoException(
                    "renderGridToFile(): refatoms[] size must be equal to the number of objects"
                )
            arr = (c_int * len(refatoms))()
            for i in range(len(refatoms)):
                arr[i] = refatoms[i]
        self.indigo._checkResult(
            self._lib.indigoRenderGridToFile(
                objects.id, arr, ncolumns, filename.encode("ascii")
            )
        )

    def renderGridToBuffer(self, objects, refatoms, ncolumns):
        self.indigo._setSessionId()
        arr = None
        if refatoms:
            if len(refatoms) != objects.count():
                raise IndigoException(
                    "renderGridToBuffer(): refatoms[] size must be equal to the number of objects"
                )
            arr = (c_int * len(refatoms))()
            for i in range(len(refatoms)):
                arr[i] = refatoms[i]
        wb = self.indigo.writeBuffer()
        try:
            self.indigo._checkResult(
                self._lib.indigoRenderGrid(objects.id, arr, ncolumns, wb.id)
            )
            return wb.toBuffer()
        finally:
            wb.dispose()
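A sketch mirroring how dataset.py drives the renderer to rasterize one molecule into PNG bytes:

    indigo = Indigo()
    renderer = IndigoRenderer(indigo)
    indigo.setOption('render-output-format', 'png')
    buf = renderer.renderToBuffer(indigo.loadMolecule('CCO'))
    with open('ethanol.png', 'wb') as f:
        f.write(bytearray(buf))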
molscribe/inference/__init__.py ADDED
@@ -0,0 +1,4 @@
from .greedy_search import GreedySearch
from .beam_search import BeamSearch

__all__ = ["GreedySearch", "BeamSearch"]
molscribe/inference/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (282 Bytes).
 
molscribe/inference/__pycache__/beam_search.cpython-310.pyc ADDED
Binary file (5.44 kB).
 
molscribe/inference/__pycache__/decode_strategy.cpython-310.pyc ADDED
Binary file (2.68 kB).
 
molscribe/inference/__pycache__/greedy_search.cpython-310.pyc ADDED
Binary file (4.11 kB).
 
molscribe/inference/beam_search.py ADDED
@@ -0,0 +1,190 @@
import torch
from .decode_strategy import DecodeStrategy


class BeamSearch(DecodeStrategy):
    """Generation with beam search."""

    def __init__(self, pad, bos, eos, batch_size, beam_size, n_best, min_length,
                 return_attention, max_length):
        # pass min_length/max_length/return_attention in the order DecodeStrategy expects
        super(BeamSearch, self).__init__(
            pad, bos, eos, batch_size, beam_size, min_length, max_length, return_attention)
        self.beam_size = beam_size
        self.n_best = n_best

        # result caching
        self.hypotheses = [[] for _ in range(batch_size)]

        # beam state
        self.top_beam_finished = torch.zeros([batch_size], dtype=torch.bool)

        self._batch_offset = torch.arange(batch_size, dtype=torch.long)

        self.select_indices = None
        self.done = False

    def initialize(self, memory_bank, device=None):
        """Repeat src objects `beam_size` times."""

        def fn_map_state(state, dim):
            return torch.repeat_interleave(state, self.beam_size, dim=dim)

        memory_bank = torch.repeat_interleave(memory_bank, self.beam_size, dim=0)
        if device is None:
            device = memory_bank.device

        self.memory_length = memory_bank.size(1)
        super().initialize(memory_bank, device)

        self.best_scores = torch.full([self.batch_size], -1e10, dtype=torch.float, device=device)
        self._beam_offset = torch.arange(
            0, self.batch_size * self.beam_size, step=self.beam_size, dtype=torch.long, device=device)
        self.topk_log_probs = torch.tensor(
            [0.0] + [float("-inf")] * (self.beam_size - 1), device=device
        ).repeat(self.batch_size)
        # buffers for the topk scores and 'backpointer'
        self.topk_scores = torch.empty((self.batch_size, self.beam_size), dtype=torch.float, device=device)
        self.topk_ids = torch.empty((self.batch_size, self.beam_size), dtype=torch.long, device=device)
        self._batch_index = torch.empty([self.batch_size, self.beam_size], dtype=torch.long, device=device)

        return fn_map_state, memory_bank

    @property
    def current_predictions(self):
        return self.alive_seq[:, -1]

    @property
    def current_backptr(self):
        # for testing
        return self.select_indices.view(self.batch_size, self.beam_size)

    @property
    def batch_offset(self):
        return self._batch_offset

    def _pick(self, log_probs):
        """Return token decision for a step.

        Args:
            log_probs (FloatTensor): (B, vocab_size)

        Returns:
            topk_scores (FloatTensor): (B, beam_size)
            topk_ids (LongTensor): (B, beam_size)
        """
        vocab_size = log_probs.size(-1)

        # Flatten probs into a list of probabilities.
        curr_scores = log_probs.reshape(-1, self.beam_size * vocab_size)
        topk_scores, topk_ids = torch.topk(curr_scores, self.beam_size, dim=-1)
        return topk_scores, topk_ids

    def advance(self, log_probs, attn):
        """
        Args:
            log_probs: (B * beam_size, vocab_size)
        """
        vocab_size = log_probs.size(-1)

        # (non-finished) batch_size
        _B = log_probs.shape[0] // self.beam_size

        step = len(self)  # alive_seq
        self.ensure_min_length(log_probs)

        # Multiply probs by the beam probability
        log_probs += self.topk_log_probs.view(_B * self.beam_size, 1)

        curr_length = step + 1
        curr_scores = log_probs / curr_length  # avg log_prob
        self.topk_scores, self.topk_ids = self._pick(curr_scores)
        # topk_scores/topk_ids: (batch_size, beam_size)

        # Recover log probs
        torch.mul(self.topk_scores, curr_length, out=self.topk_log_probs)

        # Resolve beam origin and map to batch index flat representation.
        self._batch_index = self.topk_ids // vocab_size
        self._batch_index += self._beam_offset[:_B].unsqueeze(1)
        self.select_indices = self._batch_index.view(_B * self.beam_size)
        self.topk_ids.fmod_(vocab_size)  # resolve true word ids

        # Append last prediction.
        self.alive_seq = torch.cat(
            [self.alive_seq.index_select(0, self.select_indices),
             self.topk_ids.view(_B * self.beam_size, 1)], -1)

        if self.return_attention:
            current_attn = attn.index_select(1, self.select_indices)
            if step == 1:
                self.alive_attn = current_attn
            else:
                self.alive_attn = self.alive_attn.index_select(
                    1, self.select_indices)
                self.alive_attn = torch.cat([self.alive_attn, current_attn], 0)

        self.is_finished = self.topk_ids.eq(self.eos)
        self.ensure_max_length()

    def update_finished(self):
        _B_old = self.topk_log_probs.shape[0]
        step = self.alive_seq.shape[-1]  # len(self)
        self.topk_log_probs.masked_fill_(self.is_finished, -1e10)

        self.is_finished = self.is_finished.to('cpu')
        self.top_beam_finished |= self.is_finished[:, 0].eq(1)
        predictions = self.alive_seq.view(_B_old, self.beam_size, step)
        attention = (
            self.alive_attn.view(
                step - 1, _B_old, self.beam_size, self.alive_attn.size(-1))
            if self.alive_attn is not None else None)
        non_finished_batch = []
        for i in range(self.is_finished.size(0)):
            b = self._batch_offset[i]
            finished_hyp = self.is_finished[i].nonzero(as_tuple=False).view(-1)
            # Store finished hypotheses for this batch.
            for j in finished_hyp:  # Beam level: finished beam j in batch i
                self.hypotheses[b].append((
                    self.topk_scores[i, j],
                    predictions[i, j, 1:],  # Ignore start token
                    attention[:, i, j, :self.memory_length]
                    if attention is not None else None))
            # End condition is when the top beam is finished and we can return
            # n_best hypotheses.
            finish_flag = self.top_beam_finished[i] != 0
            if finish_flag and len(self.hypotheses[b]) >= self.n_best:
                best_hyp = sorted(
                    self.hypotheses[b], key=lambda x: x[0], reverse=True)
                for n, (score, pred, attn) in enumerate(best_hyp):
                    if n >= self.n_best:
                        break
                    self.scores[b].append(score.item())
                    self.predictions[b].append(pred)
                    self.attention[b].append(
                        attn if attn is not None else [])
            else:
                non_finished_batch.append(i)
        non_finished = torch.tensor(non_finished_batch)

        if len(non_finished) == 0:
            self.done = True
            return

        _B_new = non_finished.shape[0]
        # Remove finished batches for the next step
        self.top_beam_finished = self.top_beam_finished.index_select(0, non_finished)
        self._batch_offset = self._batch_offset.index_select(0, non_finished)
        non_finished = non_finished.to(self.topk_ids.device)
        self.topk_log_probs = self.topk_log_probs.index_select(0, non_finished)
        self._batch_index = self._batch_index.index_select(0, non_finished)
        self.select_indices = self._batch_index.view(_B_new * self.beam_size)
        self.alive_seq = predictions.index_select(0, non_finished).view(-1, self.alive_seq.size(-1))
        self.topk_scores = self.topk_scores.index_select(0, non_finished)
        self.topk_ids = self.topk_ids.index_select(0, non_finished)

        if self.alive_attn is not None:
            inp_seq_len = self.alive_attn.size(-1)
            self.alive_attn = attention.index_select(1, non_finished) \
                .view(step - 1, _B_new * self.beam_size, inp_seq_len)
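A schematic decode loop for BeamSearch; `memory_bank` (encoder output) and `decoder_step` (the model's per-step decoder, returning log-probs of shape (B * beam_size, vocab_size)) are stand-ins for the real model:

    bs = BeamSearch(pad=0, bos=1, eos=2, batch_size=4, beam_size=5, n_best=1,
                    min_length=1, return_attention=False, max_length=256)
    fn_map_state, memory_bank = bs.initialize(memory_bank)
    while not bs.done:
        log_probs, attn = decoder_step(bs.alive_seq, memory_bank)
        bs.advance(log_probs, attn)
        if bs.is_finished.any():
            bs.update_finished()
            if bs.done:
                break
            memory_bank = memory_bank.index_select(0, bs.select_indices)
    best = [preds[0] for preds in bs.predictions]  # top hypothesis per input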
molscribe/inference/decode_strategy.py ADDED
@@ -0,0 +1,63 @@
1
+ import torch
2
+
3
+
4
+ class DecodeStrategy(object):
5
+ def __init__(self, pad, bos, eos, batch_size, parallel_paths, min_length, max_length,
6
+ return_attention=False, return_hidden=False):
7
+ self.pad = pad
8
+ self.bos = bos
9
+ self.eos = eos
10
+
11
+ self.batch_size = batch_size
12
+ self.parallel_paths = parallel_paths
13
+ # result catching
14
+ self.predictions = [[] for _ in range(batch_size)]
15
+ self.scores = [[] for _ in range(batch_size)]
16
+ self.token_scores = [[] for _ in range(batch_size)]
17
+ self.attention = [[] for _ in range(batch_size)]
18
+ self.hidden = [[] for _ in range(batch_size)]
19
+
20
+ self.alive_attn = None
21
+ self.alive_hidden = None
22
+
23
+ self.min_length = min_length
24
+ self.max_length = max_length
25
+
26
+ n_paths = batch_size * parallel_paths
27
+ self.return_attention = return_attention
28
+ self.return_hidden = return_hidden
29
+
30
+ self.done = False
31
+
32
+ def initialize(self, memory_bank, device=None):
33
+ if device is None:
34
+ device = torch.device('cpu')
35
+ self.alive_seq = torch.full(
36
+ [self.batch_size * self.parallel_paths, 1], self.bos,
37
+ dtype=torch.long, device=device)
38
+ self.is_finished = torch.zeros(
39
+ [self.batch_size, self.parallel_paths],
40
+ dtype=torch.uint8, device=device)
41
+ self.alive_log_token_scores = torch.zeros(
42
+ [self.batch_size * self.parallel_paths, 0],
43
+ dtype=torch.float, device=device)
44
+
45
+ return None, memory_bank
46
+
47
+ def __len__(self):
48
+ return self.alive_seq.shape[1]
49
+
50
+ def ensure_min_length(self, log_probs):
51
+ if len(self) <= self.min_length:
52
+ log_probs[:, self.eos] = -1e20 # forced non-end
53
+
54
+ def ensure_max_length(self):
55
+ if len(self) == self.max_length + 1:
56
+ self.is_finished.fill_(1)
57
+
58
+ def advance(self, log_probs, attn):
59
+ raise NotImplementedError()
60
+
61
+ def update_finished(self):
62
+ raise NotImplementedError
63
+
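A small sketch of the bookkeeping contract (assuming the `molscribe` package is importable; the shapes are made up): `initialize` seeds one `<bos>` token per decoding path, so `len(self)` is the current sequence length including `<bos>`, and `ensure_max_length` trips once the sequence reaches `max_length + 1` tokens.

import torch
from molscribe.inference.decode_strategy import DecodeStrategy

ds = DecodeStrategy(pad=0, bos=1, eos=2, batch_size=3, parallel_paths=1,
                    min_length=1, max_length=5)
ds.initialize(memory_bank=torch.zeros(3, 10, 8))
print(ds.alive_seq.shape)  # torch.Size([3, 1]) -- one <bos> per path
print(len(ds))             # 1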
molscribe/inference/greedy_search.py ADDED
@@ -0,0 +1,128 @@
+ import torch
+ from .decode_strategy import DecodeStrategy
+
+
+ def sample_with_temperature(logits, sampling_temp, keep_topk):
+     """Select next tokens randomly from the top k possible next tokens.
+
+     Samples from a categorical distribution over the ``keep_topk`` words using
+     the category probabilities ``logits / sampling_temp``.
+     """
+     if sampling_temp == 0.0 or keep_topk == 1:
+         # Greedy (argmax) decoding.
+         topk_scores, topk_ids = logits.topk(1, dim=-1)
+         if sampling_temp > 0:
+             topk_scores /= sampling_temp
+     else:
+         logits = torch.div(logits, sampling_temp)
+         if keep_topk > 0:
+             top_values, top_indices = torch.topk(logits, keep_topk, dim=1)
+             kth_best = top_values[:, -1].view([-1, 1])
+             kth_best = kth_best.repeat([1, logits.shape[1]]).float()
+             # Mask out everything below the k-th best logit.
+             ignore = torch.lt(logits, kth_best)
+             logits = logits.masked_fill(ignore, -10000)
+
+         dist = torch.distributions.Multinomial(logits=logits, total_count=1)
+         topk_ids = torch.argmax(dist.sample(), dim=1, keepdim=True)
+         topk_scores = logits.gather(dim=1, index=topk_ids)
+
+     return topk_ids, topk_scores
+
+
+ class GreedySearch(DecodeStrategy):
+     """Greedy decoding strategy (one path per batch item), with optional
+     top-k temperature sampling."""
+
+     def __init__(self, pad, bos, eos, batch_size, min_length, max_length,
+                  return_attention=False, return_hidden=False, sampling_temp=1, keep_topk=1):
+         super().__init__(
+             pad, bos, eos, batch_size, 1, min_length, max_length, return_attention, return_hidden)
+         self.sampling_temp = sampling_temp
+         self.keep_topk = keep_topk
+         self.topk_scores = None
+
+     def initialize(self, memory_bank, device=None):
+         fn_map_state = None
+
+         if device is None:
+             device = memory_bank.device
+
+         self.memory_length = memory_bank.size(1)
+         super().initialize(memory_bank, device)
+
+         self.select_indices = torch.arange(
+             self.batch_size, dtype=torch.long, device=device)
+         self.original_batch_idx = torch.arange(
+             self.batch_size, dtype=torch.long, device=device)
+
+         return fn_map_state, memory_bank
+
+     @property
+     def current_predictions(self):
+         return self.alive_seq[:, -1]
+
+     @property
+     def batch_offset(self):
+         return self.select_indices
+
+     def _pick(self, log_probs):
+         """Pick the next tokens."""
+         topk_ids, topk_scores = sample_with_temperature(
+             log_probs, self.sampling_temp, self.keep_topk)
+         return topk_ids, topk_scores
+
+     def advance(self, log_probs, attn=None, hidden=None, label=None):
+         """Select the next token for each alive sequence."""
+         self.ensure_min_length(log_probs)
+         # log_probs: b x v; topk_ids and self.topk_scores: b x 1
+         topk_ids, self.topk_scores = self._pick(log_probs)
+         self.is_finished = topk_ids.eq(self.eos)
+         if label is not None:
+             label = label.view_as(self.is_finished)
+             self.is_finished = label.eq(self.eos)
+         # alive_seq: b x (l+1); the first element is <bos>, so l = len(self) - 1
+         self.alive_seq = torch.cat([self.alive_seq, topk_ids], -1)
+         self.alive_log_token_scores = torch.cat([self.alive_log_token_scores, self.topk_scores], -1)
+
+         if self.return_attention:
+             if self.alive_attn is None:
+                 self.alive_attn = attn
+             else:
+                 self.alive_attn = torch.cat([self.alive_attn, attn], 1)
+         if self.return_hidden:
+             if self.alive_hidden is None:
+                 self.alive_hidden = hidden
+             else:
+                 self.alive_hidden = torch.cat([self.alive_hidden, hidden], 1)  # b x l x h
+         self.ensure_max_length()
+
+     def update_finished(self):
+         """Finalize scores and predictions."""
+         # is_finished marks sequences whose decoding is complete. Remove them
+         # from the batch and record the results.
+         finished_batches = self.is_finished.view(-1).nonzero()
+         for b in finished_batches.view(-1):
+             b_orig = self.original_batch_idx[b]
+             # scores/predictions/attention are lists
+             # (to stay compatible with beam search).
+             self.scores[b_orig].append(torch.exp(torch.mean(self.alive_log_token_scores[b])).item())
+             self.token_scores[b_orig].append(torch.exp(self.alive_log_token_scores[b]).tolist())
+             self.predictions[b_orig].append(self.alive_seq[b, 1:])  # skip <bos>
+             self.attention[b_orig].append(
+                 self.alive_attn[b, :, :self.memory_length] if self.alive_attn is not None else [])
+             self.hidden[b_orig].append(
+                 self.alive_hidden[b, :] if self.alive_hidden is not None else [])
+         self.done = self.is_finished.all()
+         if self.done:
+             return
+         is_alive = ~self.is_finished.view(-1)
+         self.alive_seq = self.alive_seq[is_alive]
+         self.alive_log_token_scores = self.alive_log_token_scores[is_alive]
+         if self.alive_attn is not None:
+             self.alive_attn = self.alive_attn[is_alive]
+         if self.alive_hidden is not None:
+             self.alive_hidden = self.alive_hidden[is_alive]
+         self.select_indices = is_alive.nonzero().view(-1)
+         self.original_batch_idx = self.original_batch_idx[is_alive]
+         # select_indices is equal to original_batch_idx for greedy search?
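With `sampling_temp=0.0` or `keep_topk=1`, `sample_with_temperature` degenerates to plain argmax decoding, which is the configuration MolScribe uses at inference. A toy check (values invented):

import torch
from molscribe.inference.greedy_search import sample_with_temperature

log_probs = torch.log_softmax(torch.tensor([[2.0, 0.5, 0.1],
                                            [0.1, 3.0, 0.2]]), dim=-1)
ids, scores = sample_with_temperature(log_probs, sampling_temp=0.0, keep_topk=1)
print(ids.view(-1).tolist())  # [0, 1] -- the argmax of each row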
molscribe/interface.py ADDED
@@ -0,0 +1,223 @@
+ import argparse
+ from typing import List
+
+ import cv2
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
+
+ from .dataset import get_transforms
+ from .model import Encoder, Decoder
+ from .chemistry import convert_graph_to_smiles
+ from .tokenizer import get_tokenizer
+
+
+ BOND_TYPES = ["", "single", "double", "triple", "aromatic", "solid wedge", "dashed wedge"]
+
+
+ def safe_load(module, module_states):
+     def remove_prefix(state_dict):
+         return {k.replace('module.', ''): v for k, v in state_dict.items()}
+     missing_keys, unexpected_keys = module.load_state_dict(remove_prefix(module_states), strict=False)
+     return
+
+
+ class MolScribe:
+
+     def __init__(self, model_path, device=None):
+         """
+         MolScribe interface.
+         :param model_path: path of the model checkpoint.
+         :param device: torch device, defaults to CPU.
+         """
+         model_states = torch.load(model_path, map_location=torch.device('cpu'))
+         args = self._get_args(model_states['args'])
+         if device is None:
+             device = torch.device('cpu')
+         self.device = device
+         self.tokenizer = get_tokenizer(args)
+         self.encoder, self.decoder = self._get_model(args, self.tokenizer, self.device, model_states)
+         self.transform = get_transforms(args.input_size, augment=False)
+
+     def _get_args(self, args_states=None):
+         parser = argparse.ArgumentParser()
+         # Model
+         parser.add_argument('--encoder', type=str, default='swin_base')
+         parser.add_argument('--decoder', type=str, default='transformer')
+         parser.add_argument('--trunc_encoder', action='store_true')  # use the hidden states before downsample
+         parser.add_argument('--no_pretrained', action='store_true')
+         parser.add_argument('--use_checkpoint', action='store_true', default=True)
+         parser.add_argument('--dropout', type=float, default=0.5)
+         parser.add_argument('--embed_dim', type=int, default=256)
+         parser.add_argument('--enc_pos_emb', action='store_true')
+         group = parser.add_argument_group("transformer_options")
+         group.add_argument("--dec_num_layers", help="No. of layers in transformer decoder", type=int, default=6)
+         group.add_argument("--dec_hidden_size", help="Decoder hidden size", type=int, default=256)
+         group.add_argument("--dec_attn_heads", help="Decoder no. of attention heads", type=int, default=8)
+         group.add_argument("--dec_num_queries", type=int, default=128)
+         group.add_argument("--hidden_dropout", help="Hidden dropout", type=float, default=0.1)
+         group.add_argument("--attn_dropout", help="Attention dropout", type=float, default=0.1)
+         group.add_argument("--max_relative_positions", help="Max relative positions", type=int, default=0)
+         parser.add_argument('--continuous_coords', action='store_true')
+         parser.add_argument('--compute_confidence', action='store_true')
+         # Data
+         parser.add_argument('--input_size', type=int, default=384)
+         parser.add_argument('--vocab_file', type=str, default=None)
+         parser.add_argument('--coord_bins', type=int, default=64)
+         parser.add_argument('--sep_xy', action='store_true', default=True)
+
+         args = parser.parse_args([])
+         if args_states:
+             for key, value in args_states.items():
+                 args.__dict__[key] = value
+         return args
+
+     def _get_model(self, args, tokenizer, device, states):
+         encoder = Encoder(args, pretrained=False)
+         args.encoder_dim = encoder.n_features
+         decoder = Decoder(args, tokenizer)
+
+         safe_load(encoder, states['encoder'])
+         safe_load(decoder, states['decoder'])
+         # print(f"Model loaded from {load_path}")
+
+         encoder.to(device)
+         decoder.to(device)
+         encoder.eval()
+         decoder.eval()
+         return encoder, decoder
+
+     def predict_images(self, input_images: List, return_atoms_bonds=False, return_confidence=False, batch_size=16):
+         device = self.device
+         predictions = []
+         self.decoder.compute_confidence = return_confidence
+
+         for idx in range(0, len(input_images), batch_size):
+             batch_images = input_images[idx:idx + batch_size]
+             images = [self.transform(image=image, keypoints=[])['image'] for image in batch_images]
+             images = torch.stack(images, dim=0).to(device)
+             with torch.no_grad():
+                 features, hiddens = self.encoder(images)
+                 batch_predictions = self.decoder.decode(features, hiddens)
+             predictions += batch_predictions
+
+         return self.convert_graph_to_output(predictions, input_images, return_confidence, return_atoms_bonds)
+
+     def convert_graph_to_output(self, predictions, input_images, return_confidence=True, return_atoms_bonds=True):
+         node_coords = [pred['chartok_coords']['coords'] for pred in predictions]
+         node_symbols = [pred['chartok_coords']['symbols'] for pred in predictions]
+         edges = [pred['edges'] for pred in predictions]
+         # node_symbols = [r_groups[symbol] if symbol in r_groups else symbol for symbol in node_symbols]
+         smiles_list, molblock_list, r_success = convert_graph_to_smiles(
+             node_coords, node_symbols, edges, images=input_images)
+
+         outputs = []
+         for smiles, molblock, pred in zip(smiles_list, molblock_list, predictions):
+             pred_dict = {
+                 "smiles": smiles,
+                 "molfile": molblock,
+                 "original_coords": pred['chartok_coords']['coords'],
+                 "original_symbols": pred['chartok_coords']['symbols'],
+                 "original_edges": pred['edges'],
+             }
+             if return_confidence:
+                 pred_dict["confidence"] = pred["overall_score"]
+             if return_atoms_bonds:
+                 coords = pred['chartok_coords']['coords']
+                 symbols = pred['chartok_coords']['symbols']
+                 # Collect per-atom info.
+                 atom_list = []
+                 for i, (symbol, coord) in enumerate(zip(symbols, coords)):
+                     atom_dict = {"atom_symbol": symbol, "x": round(coord[0], 3), "y": round(coord[1], 3)}
+                     if return_confidence:
+                         atom_dict["confidence"] = pred['chartok_coords']['atom_scores'][i]
+                     atom_list.append(atom_dict)
+                 pred_dict["atoms"] = atom_list
+                 # Collect per-bond info.
+                 bond_list = []
+                 num_atoms = len(symbols)
+                 for i in range(num_atoms - 1):
+                     for j in range(i + 1, num_atoms):
+                         bond_type_int = pred['edges'][i][j]
+                         if bond_type_int != 0:
+                             bond_type_str = BOND_TYPES[bond_type_int]
+                             bond_dict = {"bond_type": bond_type_str, "endpoint_atoms": (i, j)}
+                             if return_confidence:
+                                 bond_dict["confidence"] = pred["edge_scores"][i][j]
+                             bond_list.append(bond_dict)
+                 pred_dict["bonds"] = bond_list
+             outputs.append(pred_dict)
+         return outputs
+
+     def predict_image(self, image, return_atoms_bonds=False, return_confidence=False):
+         return self.predict_images(
+             [image], return_atoms_bonds=return_atoms_bonds, return_confidence=return_confidence)[0]
+
+     def predict_image_files(self, image_files: List, return_atoms_bonds=False, return_confidence=False):
+         input_images = []
+         for path in image_files:
+             image = cv2.imread(path)
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             input_images.append(image)
+         return self.predict_images(
+             input_images, return_atoms_bonds=return_atoms_bonds, return_confidence=return_confidence)
+
+     def predict_image_file(self, image_file: str, return_atoms_bonds=False, return_confidence=False):
+         return self.predict_image_files(
+             [image_file], return_atoms_bonds=return_atoms_bonds, return_confidence=return_confidence)[0]
+
+     def draw_prediction(self, prediction, image, notebook=False):
+         if "atoms" not in prediction or "bonds" not in prediction:
+             raise ValueError("atom and bond information is not provided.")
+         h, w, _ = image.shape
+         h, w = np.array([h, w]) * 400 / max(h, w)
+         image = cv2.resize(image, (int(w), int(h)))
+         fig, ax = plt.subplots(1, 1)
+         ax.axis('off')
+         ax.set_xlim(-0.05 * w, w * 1.05)
+         ax.set_ylim(1.05 * h, -0.05 * h)
+         plt.imshow(image, alpha=0.)
+         x = [a['x'] * w for a in prediction['atoms']]
+         y = [a['y'] * h for a in prediction['atoms']]
+         markersize = min(w, h) / 3
+         plt.scatter(x, y, marker='o', s=markersize, color='lightskyblue', zorder=10)
+         for i, atom in enumerate(prediction['atoms']):
+             symbol = atom['atom_symbol'].lstrip('[').rstrip(']')
+             plt.annotate(symbol, xy=(x[i], y[i]), ha='center', va='center', color='black', zorder=100)
+         for bond in prediction['bonds']:
+             u, v = bond['endpoint_atoms']
+             x1, y1, x2, y2 = x[u], y[u], x[v], y[v]
+             bond_type = bond['bond_type']
+             if bond_type == 'single':
+                 color = 'tab:green'
+                 ax.plot([x1, x2], [y1, y2], color, linewidth=4)
+             elif bond_type == 'aromatic':
+                 color = 'tab:purple'
+                 ax.plot([x1, x2], [y1, y2], color, linewidth=4)
+             elif bond_type == 'double':
+                 color = 'tab:green'
+                 ax.plot([x1, x2], [y1, y2], color=color, linewidth=7)
+                 ax.plot([x1, x2], [y1, y2], color='w', linewidth=1.5, zorder=2.1)
+             elif bond_type == 'triple':
+                 color = 'tab:green'
+                 x1s, x2s = 0.8 * x1 + 0.2 * x2, 0.2 * x1 + 0.8 * x2
+                 y1s, y2s = 0.8 * y1 + 0.2 * y2, 0.2 * y1 + 0.8 * y2
+                 ax.plot([x1s, x2s], [y1s, y2s], color=color, linewidth=9)
+                 ax.plot([x1, x2], [y1, y2], color='w', linewidth=5, zorder=2.05)
+                 ax.plot([x1, x2], [y1, y2], color=color, linewidth=2, zorder=2.1)
+             else:
+                 length = 10
+                 width = 10
+                 color = 'tab:green'
+                 if bond_type == 'solid wedge':
+                     ax.annotate('', xy=(x1, y1), xytext=(x2, y2),
+                                 arrowprops=dict(color=color, width=3, headwidth=width, headlength=length), zorder=2)
+                 else:
+                     ax.annotate('', xy=(x2, y2), xytext=(x1, y1),
+                                 arrowprops=dict(color=color, width=3, headwidth=width, headlength=length), zorder=2)
+         fig.tight_layout()
+         if not notebook:
+             canvas = FigureCanvasAgg(fig)
+             canvas.draw()
+             buf = canvas.buffer_rgba()
+             result_image = np.asarray(buf)
+             plt.close(fig)
+             return result_image
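A minimal usage sketch of this interface (the checkpoint path is a placeholder for a downloaded MolScribe checkpoint; `predict_image_file` returns the dictionary built in `convert_graph_to_output` above):

from molscribe.interface import MolScribe

# 'molscribe.pth' is a hypothetical path to a model checkpoint.
model = MolScribe('molscribe.pth')
result = model.predict_image_file('examples/exp.png',
                                  return_atoms_bonds=True, return_confidence=True)
print(result['smiles'])
print(result['atoms'][0])  # e.g. {'atom_symbol': 'C', 'x': ..., 'y': ..., 'confidence': ...}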
molscribe/loss.py ADDED
@@ -0,0 +1,125 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from scipy.optimize import linear_sum_assignment
+ from .tokenizer import PAD_ID, MASK, MASK_ID
+
+
+ class LabelSmoothingLoss(nn.Module):
+     """
+     With label smoothing, the KL divergence between
+     q_{smoothed ground-truth prob.}(w) and p_{prob. computed by model}(w)
+     is minimized.
+     """
+     def __init__(self, label_smoothing, tgt_vocab_size, ignore_index=-100):
+         assert 0.0 < label_smoothing <= 1.0
+         self.ignore_index = ignore_index
+         super(LabelSmoothingLoss, self).__init__()
+
+         smoothing_value = label_smoothing / (tgt_vocab_size - 2)
+         one_hot = torch.full((tgt_vocab_size,), smoothing_value)
+         one_hot[self.ignore_index] = 0
+         self.register_buffer('one_hot', one_hot.unsqueeze(0))
+
+         self.confidence = 1.0 - label_smoothing
+
+     def forward(self, output, target):
+         """
+         output (FloatTensor): batch_size x n_classes
+         target (LongTensor): batch_size
+         """
+         # output is assumed to be raw logits; convert to log-probs.
+         log_probs = F.log_softmax(output, dim=-1)
+
+         model_prob = self.one_hot.repeat(target.size(0), 1)
+         model_prob.scatter_(1, target.unsqueeze(1), self.confidence)
+         model_prob.masked_fill_((target == self.ignore_index).unsqueeze(1), 0)
+
+         # reduction: mean or sum?
+         return F.kl_div(log_probs, model_prob, reduction='batchmean')
+
+
+ class SequenceLoss(nn.Module):
+
+     def __init__(self, label_smoothing, vocab_size, ignore_index=-100, ignore_indices=[]):
+         super(SequenceLoss, self).__init__()
+         if ignore_indices:
+             ignore_index = ignore_indices[0]
+         self.ignore_index = ignore_index
+         self.ignore_indices = ignore_indices
+         if label_smoothing == 0:
+             self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='mean')
+         else:
+             self.criterion = LabelSmoothingLoss(label_smoothing, vocab_size, ignore_index)
+
+     def forward(self, output, target):
+         """
+         :param output: [batch, len, vocab]
+         :param target: [batch, len]
+         """
+         batch_size, max_len, vocab_size = output.size()
+         output = output.reshape(-1, vocab_size)
+         target = target.reshape(-1)
+         # Map all other ignored ids onto the primary ignore_index.
+         for idx in self.ignore_indices:
+             if idx != self.ignore_index:
+                 target.masked_fill_((target == idx), self.ignore_index)
+         loss = self.criterion(output, target)
+         return loss
+
+
+ class GraphLoss(nn.Module):
+
+     def __init__(self):
+         super(GraphLoss, self).__init__()
+         # Down-weight the "no bond" class (index 0).
+         weight = torch.ones(7) * 10
+         weight[0] = 1
+         self.criterion = nn.CrossEntropyLoss(weight, ignore_index=-100)
+
+     def forward(self, outputs, targets):
+         results = {}
+         if 'coords' in outputs:
+             pred = outputs['coords']
+             max_len = pred.size(1)
+             target = targets['coords'][:, :max_len]
+             mask = target.ge(0)
+             loss = F.l1_loss(pred, target, reduction='none')
+             results['coords'] = (loss * mask).sum() / mask.sum()
+         if 'edges' in outputs:
+             pred = outputs['edges']
+             max_len = pred.size(-1)
+             target = targets['edges'][:, :max_len, :max_len]
+             results['edges'] = self.criterion(pred, target)
+         return results
+
+
+ class Criterion(nn.Module):
+
+     def __init__(self, args, tokenizer):
+         super(Criterion, self).__init__()
+         criterion = {}
+         for format_ in args.formats:
+             if format_ == 'edges':
+                 criterion['edges'] = GraphLoss()
+             else:
+                 if MASK in tokenizer[format_].stoi:
+                     ignore_indices = [PAD_ID, MASK_ID]
+                 else:
+                     ignore_indices = []
+                 criterion[format_] = SequenceLoss(args.label_smoothing, len(tokenizer[format_]),
+                                                   ignore_index=PAD_ID, ignore_indices=ignore_indices)
+         self.criterion = nn.ModuleDict(criterion)
+
+     def forward(self, results, refs):
+         losses = {}
+         for format_ in results:
+             predictions, targets, *_ = results[format_]
+             loss_ = self.criterion[format_](predictions, targets)
+             if type(loss_) is dict:
+                 losses.update(loss_)
+             else:
+                 if loss_.numel() > 1:
+                     loss_ = loss_.mean()
+                 losses[format_] = loss_
+         return losses
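A toy invocation of `SequenceLoss` showing the expected shapes (the numbers are invented; assumes the `molscribe` package and its dependencies are installed):

import torch
from molscribe.loss import SequenceLoss

loss_fn = SequenceLoss(label_smoothing=0.1, vocab_size=10, ignore_index=0)
logits = torch.randn(2, 5, 10)           # [batch, len, vocab]
target = torch.randint(1, 10, (2, 5))    # [batch, len]
print(loss_fn(logits, target).item())    # a non-negative scalar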
molscribe/model.py ADDED
@@ -0,0 +1,397 @@
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ import timm
+
+ from .utils import FORMAT_INFO, to_device
+ from .tokenizer import SOS_ID, EOS_ID, PAD_ID, MASK_ID
+ from .inference import GreedySearch, BeamSearch
+ from .transformer import TransformerDecoder, Embeddings
+
+
+ class Encoder(nn.Module):
+     def __init__(self, args, pretrained=False):
+         super().__init__()
+         model_name = args.encoder
+         self.model_name = model_name
+         if model_name.startswith('resnet'):
+             self.model_type = 'resnet'
+             self.cnn = timm.create_model(model_name, pretrained=pretrained)
+             self.n_features = self.cnn.num_features  # encoder_dim
+             self.cnn.global_pool = nn.Identity()
+             self.cnn.fc = nn.Identity()
+         elif model_name.startswith('swin'):
+             self.model_type = 'swin'
+             self.transformer = timm.create_model(model_name, pretrained=pretrained, pretrained_strict=False,
+                                                  use_checkpoint=args.use_checkpoint)
+             self.n_features = self.transformer.num_features
+             self.transformer.head = nn.Identity()
+         elif 'efficientnet' in model_name:
+             self.model_type = 'efficientnet'
+             self.cnn = timm.create_model(model_name, pretrained=pretrained)
+             self.n_features = self.cnn.num_features
+             self.cnn.global_pool = nn.Identity()
+             self.cnn.classifier = nn.Identity()
+         else:
+             raise NotImplementedError
+
+     def swin_forward(self, transformer, x):
+         x = transformer.patch_embed(x)
+         if transformer.absolute_pos_embed is not None:
+             x = x + transformer.absolute_pos_embed
+         x = transformer.pos_drop(x)
+
+         def layer_forward(layer, x, hiddens):
+             for blk in layer.blocks:
+                 if not torch.jit.is_scripting() and layer.use_checkpoint:
+                     x = torch.utils.checkpoint.checkpoint(blk, x)
+                 else:
+                     x = blk(x)
+             H, W = layer.input_resolution
+             B, L, C = x.shape
+             hiddens.append(x.view(B, H, W, C))
+             if layer.downsample is not None:
+                 x = layer.downsample(x)
+             return x, hiddens
+
+         hiddens = []
+         for layer in transformer.layers:
+             x, hiddens = layer_forward(layer, x, hiddens)
+         x = transformer.norm(x)  # B L C
+         hiddens[-1] = x.view_as(hiddens[-1])
+         return x, hiddens
+
+     def forward(self, x, refs=None):
+         if self.model_type in ['resnet', 'efficientnet']:
+             features = self.cnn(x)
+             features = features.permute(0, 2, 3, 1)
+             hiddens = []
+         elif self.model_type == 'swin':
+             if 'patch' in self.model_name:
+                 features, hiddens = self.swin_forward(self.transformer, x)
+             else:
+                 features, hiddens = self.transformer(x)
+         else:
+             raise NotImplementedError
+         return features, hiddens
+
+
+ class TransformerDecoderBase(nn.Module):
+
+     def __init__(self, args):
+         super().__init__()
+         self.args = args
+
+         self.enc_trans_layer = nn.Sequential(
+             nn.Linear(args.encoder_dim, args.dec_hidden_size)
+             # nn.LayerNorm(args.dec_hidden_size, eps=1e-6)
+         )
+         self.enc_pos_emb = nn.Embedding(144, args.encoder_dim) if args.enc_pos_emb else None
+
+         self.decoder = TransformerDecoder(
+             num_layers=args.dec_num_layers,
+             d_model=args.dec_hidden_size,
+             heads=args.dec_attn_heads,
+             d_ff=args.dec_hidden_size * 4,
+             copy_attn=False,
+             self_attn_type="scaled-dot",
+             dropout=args.hidden_dropout,
+             attention_dropout=args.attn_dropout,
+             max_relative_positions=args.max_relative_positions,
+             aan_useffn=False,
+             full_context_alignment=False,
+             alignment_layer=0,
+             alignment_heads=0,
+             pos_ffn_activation_fn='gelu'
+         )
+
+     def enc_transform(self, encoder_out):
+         batch_size = encoder_out.size(0)
+         encoder_dim = encoder_out.size(-1)
+         encoder_out = encoder_out.view(batch_size, -1, encoder_dim)  # (batch_size, num_pixels, encoder_dim)
+         max_len = encoder_out.size(1)
+         device = encoder_out.device
+         if self.enc_pos_emb:
+             pos_emb = self.enc_pos_emb(torch.arange(max_len, device=device)).unsqueeze(0)
+             encoder_out = encoder_out + pos_emb
+         encoder_out = self.enc_trans_layer(encoder_out)
+         return encoder_out
+
+
+ class TransformerDecoderAR(TransformerDecoderBase):
+     """Autoregressive Transformer decoder."""
+
+     def __init__(self, args, tokenizer):
+         super().__init__(args)
+         self.tokenizer = tokenizer
+         self.vocab_size = len(self.tokenizer)
+         self.output_layer = nn.Linear(args.dec_hidden_size, self.vocab_size, bias=True)
+         self.embeddings = Embeddings(
+             word_vec_size=args.dec_hidden_size,
+             word_vocab_size=self.vocab_size,
+             word_padding_idx=PAD_ID,
+             position_encoding=True,
+             dropout=args.hidden_dropout)
+
+     def dec_embedding(self, tgt, step=None):
+         pad_idx = self.embeddings.word_padding_idx
+         tgt_pad_mask = tgt.data.eq(pad_idx).transpose(1, 2)  # [B, 1, T_tgt]
+         emb = self.embeddings(tgt, step=step)
+         assert emb.dim() == 3  # batch x len x embedding_dim
+         return emb, tgt_pad_mask
+
+     def forward(self, encoder_out, labels, label_lengths):
+         """Training mode."""
+         batch_size, max_len, _ = encoder_out.size()
+         memory_bank = self.enc_transform(encoder_out)
+
+         tgt = labels.unsqueeze(-1)  # (b, t, 1)
+         tgt_emb, tgt_pad_mask = self.dec_embedding(tgt)
+         dec_out, *_ = self.decoder(tgt_emb=tgt_emb, memory_bank=memory_bank, tgt_pad_mask=tgt_pad_mask)
+
+         logits = self.output_layer(dec_out)  # (b, t, h) -> (b, t, v)
+         return logits[:, :-1], labels[:, 1:], dec_out
+
+     def decode(self, encoder_out, beam_size: int, n_best: int, min_length: int = 1, max_length: int = 256,
+                labels=None):
+         """Inference mode. Autoregressively decode the sequence. Only greedy search is well supported;
+         beam search is outdated. `labels` is used for partial prediction, i.e. when part of the sequence
+         is given. In standard decoding, labels=None."""
+         batch_size, max_len, _ = encoder_out.size()
+         memory_bank = self.enc_transform(encoder_out)
+         orig_labels = labels
+
+         if beam_size == 1:
+             decode_strategy = GreedySearch(
+                 sampling_temp=0.0, keep_topk=1, batch_size=batch_size, min_length=min_length, max_length=max_length,
+                 pad=PAD_ID, bos=SOS_ID, eos=EOS_ID,
+                 return_attention=False, return_hidden=True)
+         else:
+             decode_strategy = BeamSearch(
+                 beam_size=beam_size, n_best=n_best, batch_size=batch_size, min_length=min_length, max_length=max_length,
+                 pad=PAD_ID, bos=SOS_ID, eos=EOS_ID,
+                 return_attention=False)
+
+         # adapted from onmt.translate.translator
+         results = {
+             "predictions": None,
+             "scores": None,
+             "attention": None
+         }
+
+         # (2) Prep decode_strategy. Possibly repeat src objects.
+         _, memory_bank = decode_strategy.initialize(memory_bank=memory_bank)
+
+         # (3) Begin decoding step by step:
+         for step in range(decode_strategy.max_length):
+             tgt = decode_strategy.current_predictions.view(-1, 1, 1)
+             if labels is not None:
+                 label = labels[:, step].view(-1, 1, 1)
+                 mask = label.eq(MASK_ID).long()
+                 tgt = tgt * mask + label * (1 - mask)
+             tgt_emb, tgt_pad_mask = self.dec_embedding(tgt)
+             dec_out, dec_attn, *_ = self.decoder(tgt_emb=tgt_emb, memory_bank=memory_bank,
+                                                  tgt_pad_mask=tgt_pad_mask, step=step)
+
+             attn = dec_attn.get("std", None)
+
+             dec_logits = self.output_layer(dec_out)  # [b, t, h] => [b, t, v]
+             dec_logits = dec_logits.squeeze(1)
+             log_probs = F.log_softmax(dec_logits, dim=-1)
+
+             if self.tokenizer.output_constraint:
+                 output_mask = [self.tokenizer.get_output_mask(id) for id in tgt.view(-1).tolist()]
+                 output_mask = torch.tensor(output_mask, device=log_probs.device)
+                 log_probs.masked_fill_(output_mask, -10000)
+
+             label = labels[:, step + 1] if labels is not None and step + 1 < labels.size(1) else None
+             decode_strategy.advance(log_probs, attn, dec_out, label)
+             any_finished = decode_strategy.is_finished.any()
+             if any_finished:
+                 decode_strategy.update_finished()
+                 if decode_strategy.done:
+                     break
+
+             select_indices = decode_strategy.select_indices
+             if any_finished:
+                 # Reorder states.
+                 memory_bank = memory_bank.index_select(0, select_indices)
+                 if labels is not None:
+                     labels = labels.index_select(0, select_indices)
+                 self.map_state(lambda state, dim: state.index_select(dim, select_indices))
+
+         results["scores"] = decode_strategy.scores  # average of token scores
+         results["token_scores"] = decode_strategy.token_scores
+         results["predictions"] = decode_strategy.predictions
+         results["attention"] = decode_strategy.attention
+         results["hidden"] = decode_strategy.hidden
+         if orig_labels is not None:
+             for i in range(batch_size):
+                 pred = results["predictions"][i][0]
+                 label = orig_labels[i][1:len(pred) + 1]
+                 mask = label.eq(MASK_ID).long()
+                 pred = pred[:len(label)]
+                 results["predictions"][i][0] = pred * mask + label * (1 - mask)
+
+         return results["predictions"], results['scores'], results["token_scores"], results["hidden"]
+
+     # adapted from onmt.decoders.transformer
+     def map_state(self, fn):
+         def _recursive_map(struct, batch_dim=0):
+             for k, v in struct.items():
+                 if v is not None:
+                     if isinstance(v, dict):
+                         _recursive_map(v)
+                     else:
+                         struct[k] = fn(v, batch_dim)
+
+         if self.decoder.state["cache"] is not None:
+             _recursive_map(self.decoder.state["cache"])
+
+
+ class GraphPredictor(nn.Module):
+
+     def __init__(self, decoder_dim, coords=False):
+         super(GraphPredictor, self).__init__()
+         self.coords = coords
+         self.mlp = nn.Sequential(
+             nn.Linear(decoder_dim * 2, decoder_dim), nn.GELU(),
+             nn.Linear(decoder_dim, 7)
+         )
+         if coords:
+             self.coords_mlp = nn.Sequential(
+                 nn.Linear(decoder_dim, decoder_dim), nn.GELU(),
+                 nn.Linear(decoder_dim, 2)
+             )
+
+     def forward(self, hidden, indices=None):
+         b, l, dim = hidden.size()
+         if indices is None:
+             index = [i for i in range(3, l, 3)]
+             hidden = hidden[:, index]
+         else:
+             batch_id = torch.arange(b).unsqueeze(1).expand_as(indices).reshape(-1)
+             indices = indices.view(-1)
+             hidden = hidden[batch_id, indices].view(b, -1, dim)
+         b, l, dim = hidden.size()
+         results = {}
+         # Pairwise concatenation of atom hidden states: (b, l, l, 2 * dim).
+         hh = torch.cat([hidden.unsqueeze(2).expand(b, l, l, dim), hidden.unsqueeze(1).expand(b, l, l, dim)], dim=3)
+         results['edges'] = self.mlp(hh).permute(0, 3, 1, 2)
+         if self.coords:
+             results['coords'] = self.coords_mlp(hidden)
+         return results
+
+
+ def get_edge_prediction(edge_prob):
+     if not edge_prob:
+         return [], []
+     n = len(edge_prob)
+     # Symmetrize the pairwise bond probabilities. Bond classes 0-4 are
+     # direction-independent; classes 5 and 6 (solid/dashed wedge) swap
+     # when the bond direction is reversed.
+     for i in range(n):
+         for j in range(i + 1, n):
+             for k in range(5):
+                 edge_prob[i][j][k] = (edge_prob[i][j][k] + edge_prob[j][i][k]) / 2
+                 edge_prob[j][i][k] = edge_prob[i][j][k]
+             edge_prob[i][j][5] = (edge_prob[i][j][5] + edge_prob[j][i][6]) / 2
+             edge_prob[i][j][6] = (edge_prob[i][j][6] + edge_prob[j][i][5]) / 2
+             edge_prob[j][i][5] = edge_prob[i][j][6]
+             edge_prob[j][i][6] = edge_prob[i][j][5]
+     prediction = np.argmax(edge_prob, axis=2).tolist()
+     score = np.max(edge_prob, axis=2).tolist()
+     return prediction, score
+
+
+ class Decoder(nn.Module):
+     """Wrapper around the different decoder architectures; supports multiple decoders."""
+
+     def __init__(self, args, tokenizer):
+         super(Decoder, self).__init__()
+         self.args = args
+         self.formats = args.formats
+         self.tokenizer = tokenizer
+         decoder = {}
+         for format_ in args.formats:
+             if format_ == 'edges':
+                 decoder['edges'] = GraphPredictor(args.dec_hidden_size, coords=args.continuous_coords)
+             else:
+                 decoder[format_] = TransformerDecoderAR(args, tokenizer[format_])
+         self.decoder = nn.ModuleDict(decoder)
+         self.compute_confidence = args.compute_confidence
+
+     def forward(self, encoder_out, hiddens, refs):
+         """Training mode. Compute the logits with teacher forcing."""
+         results = {}
+         refs = to_device(refs, encoder_out.device)
+         for format_ in self.formats:
+             if format_ == 'edges':
+                 if 'atomtok_coords' in results:
+                     dec_out = results['atomtok_coords'][2]
+                     predictions = self.decoder['edges'](dec_out, indices=refs['atom_indices'][0])
+                 elif 'chartok_coords' in results:
+                     dec_out = results['chartok_coords'][2]
+                     predictions = self.decoder['edges'](dec_out, indices=refs['atom_indices'][0])
+                 else:
+                     raise NotImplementedError
+                 targets = {'edges': refs['edges']}
+                 if 'coords' in predictions:
+                     targets['coords'] = refs['coords']
+                 results['edges'] = (predictions, targets)
+             else:
+                 labels, label_lengths = refs[format_]
+                 results[format_] = self.decoder[format_](encoder_out, labels, label_lengths)
+         return results
+
+     def decode(self, encoder_out, hiddens=None, refs=None, beam_size=1, n_best=1):
+         """Inference mode. Call each decoder's decode method (if required) and convert the output
+         format (e.g. tokens to a sequence). Beam search is not supported yet."""
+         results = {}
+         predictions = []
+         for format_ in self.formats:
+             if format_ in ['atomtok', 'atomtok_coords', 'chartok_coords']:
+                 max_len = FORMAT_INFO[format_]['max_len']
+                 results[format_] = self.decoder[format_].decode(encoder_out, beam_size, n_best, max_length=max_len)
+                 outputs, scores, token_scores, *_ = results[format_]
+                 beam_preds = [[self.tokenizer[format_].sequence_to_smiles(x.tolist()) for x in pred]
+                               for pred in outputs]
+                 predictions = [{format_: pred[0]} for pred in beam_preds]
+                 if self.compute_confidence:
+                     for i in range(len(predictions)):
+                         # indices point past each atom's coordinate tokens:
+                         # -1: y score, -2: x score, -3: symbol score
+                         indices = np.array(predictions[i][format_]['indices']) - 3
+                         if format_ == 'chartok_coords':
+                             atom_scores = []
+                             for symbol, index in zip(predictions[i][format_]['symbols'], indices):
+                                 atom_score = (np.prod(token_scores[i][0][index - len(symbol) + 1:index + 1])
+                                               ** (1 / len(symbol))).item()
+                                 atom_scores.append(atom_score)
+                         else:
+                             atom_scores = np.array(token_scores[i][0])[indices].tolist()
+                         predictions[i][format_]['atom_scores'] = atom_scores
+                         predictions[i][format_]['average_token_score'] = scores[i][0]
+             if format_ == 'edges':
+                 if 'atomtok_coords' in results:
+                     atom_format = 'atomtok_coords'
+                 elif 'chartok_coords' in results:
+                     atom_format = 'chartok_coords'
+                 else:
+                     raise NotImplementedError
+                 dec_out = results[atom_format][3]  # batch x n_best x len x dim
+                 for i in range(len(dec_out)):
+                     hidden = dec_out[i][0].unsqueeze(0)  # 1 x len x dim
+                     indices = torch.LongTensor(predictions[i][atom_format]['indices']).unsqueeze(0)  # 1 x k
+                     pred = self.decoder['edges'](hidden, indices)  # k x k
+                     prob = F.softmax(pred['edges'].squeeze(0).permute(1, 2, 0), dim=2).tolist()  # k x k x 7
+                     edge_pred, edge_score = get_edge_prediction(prob)
+                     predictions[i]['edges'] = edge_pred
+                     if self.compute_confidence:
+                         predictions[i]['edge_scores'] = edge_score
+                         predictions[i]['edge_score_product'] = np.sqrt(np.prod(edge_score)).item()
+                         predictions[i]['overall_score'] = predictions[i][atom_format]['average_token_score'] * \
+                             predictions[i]['edge_score_product']
+                         predictions[i][atom_format].pop('average_token_score')
+                         predictions[i].pop('edge_score_product')
+         return predictions
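The wedge handling in `get_edge_prediction` is easiest to see on a two-atom toy example: a solid wedge from atom 0 to atom 1 (class 5) is the same bond as a dashed wedge from atom 1 to atom 0 (class 6), so the two directions are averaged into one symmetric score (probabilities invented):

import numpy as np
from molscribe.model import get_edge_prediction

# 2 x 2 x 7 pairwise bond probabilities for two atoms.
prob = np.zeros((2, 2, 7)).tolist()
prob[0][1][5] = 0.8  # solid wedge 0 -> 1
prob[1][0][6] = 0.6  # dashed wedge 1 -> 0 (the same bond, reversed)
pred, score = get_edge_prediction(prob)
print(pred[0][1], score[0][1])  # 5 0.7 -- averaged over both directions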
molscribe/tokenizer.py ADDED
@@ -0,0 +1,524 @@
+ import os
+ import json
+ import random
+ import numpy as np
+ from SmilesPE.pretokenizer import atomwise_tokenizer
+
+ PAD = '<pad>'
+ SOS = '<sos>'
+ EOS = '<eos>'
+ UNK = '<unk>'
+ MASK = '<mask>'
+ PAD_ID = 0
+ SOS_ID = 1
+ EOS_ID = 2
+ UNK_ID = 3
+ MASK_ID = 4
+
+
+ class Tokenizer(object):
+
+     def __init__(self, path=None):
+         self.stoi = {}
+         self.itos = {}
+         if path:
+             self.load(path)
+
+     def __len__(self):
+         return len(self.stoi)
+
+     @property
+     def output_constraint(self):
+         return False
+
+     def save(self, path):
+         with open(path, 'w') as f:
+             json.dump(self.stoi, f)
+
+     def load(self, path):
+         with open(path) as f:
+             self.stoi = json.load(f)
+         self.itos = {item[1]: item[0] for item in self.stoi.items()}
+
+     def fit_on_texts(self, texts):
+         vocab = set()
+         for text in texts:
+             vocab.update(text.split(' '))
+         vocab = [PAD, SOS, EOS, UNK] + list(vocab)
+         for i, s in enumerate(vocab):
+             self.stoi[s] = i
+         self.itos = {item[1]: item[0] for item in self.stoi.items()}
+         assert self.stoi[PAD] == PAD_ID
+         assert self.stoi[SOS] == SOS_ID
+         assert self.stoi[EOS] == EOS_ID
+         assert self.stoi[UNK] == UNK_ID
+
+     def text_to_sequence(self, text, tokenized=True):
+         sequence = []
+         sequence.append(self.stoi['<sos>'])
+         if tokenized:
+             tokens = text.split(' ')
+         else:
+             tokens = atomwise_tokenizer(text)
+         for s in tokens:
+             if s not in self.stoi:
+                 s = '<unk>'
+             sequence.append(self.stoi[s])
+         sequence.append(self.stoi['<eos>'])
+         return sequence
+
+     def texts_to_sequences(self, texts):
+         sequences = []
+         for text in texts:
+             sequence = self.text_to_sequence(text)
+             sequences.append(sequence)
+         return sequences
+
+     def sequence_to_text(self, sequence):
+         return ''.join(list(map(lambda i: self.itos[i], sequence)))
+
+     def sequences_to_texts(self, sequences):
+         texts = []
+         for sequence in sequences:
+             text = self.sequence_to_text(sequence)
+             texts.append(text)
+         return texts
+
+     def predict_caption(self, sequence):
+         caption = ''
+         for i in sequence:
+             if i == self.stoi['<eos>'] or i == self.stoi['<pad>']:
+                 break
+             caption += self.itos[i]
+         return caption
+
+     def predict_captions(self, sequences):
+         captions = []
+         for sequence in sequences:
+             caption = self.predict_caption(sequence)
+             captions.append(caption)
+         return captions
+
+     def sequence_to_smiles(self, sequence):
+         return {'smiles': self.predict_caption(sequence)}
+
+
+ class NodeTokenizer(Tokenizer):
+
+     def __init__(self, input_size=100, path=None, sep_xy=False, continuous_coords=False, debug=False):
+         super().__init__(path)
+         self.maxx = input_size  # height
+         self.maxy = input_size  # width
+         self.sep_xy = sep_xy
+         self.special_tokens = [PAD, SOS, EOS, UNK, MASK]
+         self.continuous_coords = continuous_coords
+         self.debug = debug
+
+     def __len__(self):
+         if self.sep_xy:
+             return self.offset + self.maxx + self.maxy
+         else:
+             return self.offset + max(self.maxx, self.maxy)
+
+     @property
+     def offset(self):
+         return len(self.stoi)
+
+     @property
+     def output_constraint(self):
+         return not self.continuous_coords
+
+     def len_symbols(self):
+         return len(self.stoi)
+
+     def fit_atom_symbols(self, atoms):
+         vocab = self.special_tokens + list(set(atoms))
+         for i, s in enumerate(vocab):
+             self.stoi[s] = i
+         assert self.stoi[PAD] == PAD_ID
+         assert self.stoi[SOS] == SOS_ID
+         assert self.stoi[EOS] == EOS_ID
+         assert self.stoi[UNK] == UNK_ID
+         assert self.stoi[MASK] == MASK_ID
+         self.itos = {item[1]: item[0] for item in self.stoi.items()}
+
+     def is_x(self, x):
+         return self.offset <= x < self.offset + self.maxx
+
+     def is_y(self, y):
+         if self.sep_xy:
+             return self.offset + self.maxx <= y
+         return self.offset <= y
+
+     def is_symbol(self, s):
+         return len(self.special_tokens) <= s < self.offset or s == UNK_ID
+
+     def is_atom(self, id):
+         if self.is_symbol(id):
+             return self.is_atom_token(self.itos[id])
+         return False
+
+     def is_atom_token(self, token):
+         return token.isalpha() or token.startswith("[") or token == '*' or token == UNK
+
+     def x_to_id(self, x):
+         return self.offset + round(x * (self.maxx - 1))
+
+     def y_to_id(self, y):
+         if self.sep_xy:
+             return self.offset + self.maxx + round(y * (self.maxy - 1))
+         return self.offset + round(y * (self.maxy - 1))
+
+     def id_to_x(self, id):
+         return (id - self.offset) / (self.maxx - 1)
+
+     def id_to_y(self, id):
+         if self.sep_xy:
+             return (id - self.offset - self.maxx) / (self.maxy - 1)
+         return (id - self.offset) / (self.maxy - 1)
+
+     def get_output_mask(self, id):
+         mask = [False] * len(self)
+         if self.continuous_coords:
+             return mask
+         if self.is_atom(id):
+             return [True] * self.offset + [False] * self.maxx + [True] * self.maxy
+         if self.is_x(id):
+             return [True] * (self.offset + self.maxx) + [False] * self.maxy
+         if self.is_y(id):
+             return [False] * self.offset + [True] * (self.maxx + self.maxy)
+         return mask
+
+     def symbol_to_id(self, symbol):
+         if symbol not in self.stoi:
+             return UNK_ID
+         return self.stoi[symbol]
+
+     def symbols_to_labels(self, symbols):
+         labels = []
+         for symbol in symbols:
+             labels.append(self.symbol_to_id(symbol))
+         return labels
+
+     def labels_to_symbols(self, labels):
+         symbols = []
+         for label in labels:
+             symbols.append(self.itos[label])
+         return symbols
+
+     def nodes_to_grid(self, nodes):
+         coords, symbols = nodes['coords'], nodes['symbols']
+         grid = np.zeros((self.maxx, self.maxy), dtype=int)
+         for [x, y], symbol in zip(coords, symbols):
+             x = round(x * (self.maxx - 1))
+             y = round(y * (self.maxy - 1))
+             grid[x][y] = self.symbol_to_id(symbol)
+         return grid
+
+     def grid_to_nodes(self, grid):
+         coords, symbols, indices = [], [], []
+         for i in range(self.maxx):
+             for j in range(self.maxy):
+                 if grid[i][j] != 0:
+                     x = i / (self.maxx - 1)
+                     y = j / (self.maxy - 1)
+                     coords.append([x, y])
+                     symbols.append(self.itos[grid[i][j]])
+                     indices.append([i, j])
+         return {'coords': coords, 'symbols': symbols, 'indices': indices}
+
+     def nodes_to_sequence(self, nodes):
+         coords, symbols = nodes['coords'], nodes['symbols']
+         labels = [SOS_ID]
+         for (x, y), symbol in zip(coords, symbols):
+             assert 0 <= x <= 1
+             assert 0 <= y <= 1
+             labels.append(self.x_to_id(x))
+             labels.append(self.y_to_id(y))
+             labels.append(self.symbol_to_id(symbol))
+         labels.append(EOS_ID)
+         return labels
+
+     def sequence_to_nodes(self, sequence):
+         coords, symbols = [], []
+         i = 0
+         if sequence[0] == SOS_ID:
+             i += 1
+         while i + 2 < len(sequence):
+             if sequence[i] == EOS_ID:
+                 break
+             if self.is_x(sequence[i]) and self.is_y(sequence[i+1]) and self.is_symbol(sequence[i+2]):
+                 x = self.id_to_x(sequence[i])
+                 y = self.id_to_y(sequence[i+1])
+                 symbol = self.itos[sequence[i+2]]
+                 coords.append([x, y])
+                 symbols.append(symbol)
+             i += 3
+         return {'coords': coords, 'symbols': symbols}
+
+     def smiles_to_sequence(self, smiles, coords=None, mask_ratio=0, atom_only=False):
+         tokens = atomwise_tokenizer(smiles)
+         labels = [SOS_ID]
+         indices = []
+         atom_idx = -1
+         for token in tokens:
+             if atom_only and not self.is_atom_token(token):
+                 continue
+             if token in self.stoi:
+                 labels.append(self.stoi[token])
+             else:
+                 if self.debug:
+                     print(f'{token} not in vocab')
+                 labels.append(UNK_ID)
+             if self.is_atom_token(token):
+                 atom_idx += 1
+                 if not self.continuous_coords:
+                     if mask_ratio > 0 and random.random() < mask_ratio:
+                         labels.append(MASK_ID)
+                         labels.append(MASK_ID)
+                     elif coords is not None:
+                         if atom_idx < len(coords):
+                             x, y = coords[atom_idx]
+                             assert 0 <= x <= 1
+                             assert 0 <= y <= 1
+                         else:
+                             x = random.random()
+                             y = random.random()
+                         labels.append(self.x_to_id(x))
+                         labels.append(self.y_to_id(y))
+                 indices.append(len(labels) - 1)
+         labels.append(EOS_ID)
+         return labels, indices
+
+     def sequence_to_smiles(self, sequence):
+         has_coords = not self.continuous_coords
+         smiles = ''
+         coords, symbols, indices = [], [], []
+         for i, label in enumerate(sequence):
+             if label == EOS_ID or label == PAD_ID:
+                 break
+             if self.is_x(label) or self.is_y(label):
+                 continue
+             token = self.itos[label]
+             smiles += token
+             if self.is_atom_token(token):
+                 if has_coords:
+                     if i+3 < len(sequence) and self.is_x(sequence[i+1]) and self.is_y(sequence[i+2]):
+                         x = self.id_to_x(sequence[i+1])
+                         y = self.id_to_y(sequence[i+2])
+                         coords.append([x, y])
+                         symbols.append(token)
+                         indices.append(i+3)
+                 else:
+                     if i+1 < len(sequence):
+                         symbols.append(token)
+                         indices.append(i+1)
+         results = {'smiles': smiles, 'symbols': symbols, 'indices': indices}
+         if has_coords:
+             results['coords'] = coords
+         return results
+
+
+ class CharTokenizer(NodeTokenizer):
+
+     def __init__(self, input_size=100, path=None, sep_xy=False, continuous_coords=False, debug=False):
+         super().__init__(input_size, path, sep_xy, continuous_coords, debug)
+
+     def fit_on_texts(self, texts):
+         vocab = set()
+         for text in texts:
+             vocab.update(list(text))
+         if ' ' in vocab:
+             vocab.remove(' ')
+         vocab = [PAD, SOS, EOS, UNK] + list(vocab)
+         for i, s in enumerate(vocab):
+             self.stoi[s] = i
+         self.itos = {item[1]: item[0] for item in self.stoi.items()}
+         assert self.stoi[PAD] == PAD_ID
+         assert self.stoi[SOS] == SOS_ID
+         assert self.stoi[EOS] == EOS_ID
+         assert self.stoi[UNK] == UNK_ID
+
+     def text_to_sequence(self, text, tokenized=True):
+         sequence = []
+         sequence.append(self.stoi['<sos>'])
+         if tokenized:
+             tokens = text.split(' ')
+             assert all(len(s) == 1 for s in tokens)
+         else:
+             tokens = list(text)
+         for s in tokens:
+             if s not in self.stoi:
+                 s = '<unk>'
+             sequence.append(self.stoi[s])
+         sequence.append(self.stoi['<eos>'])
+         return sequence
+
+     def fit_atom_symbols(self, atoms):
+         atoms = list(set(atoms))
+         chars = []
+         for atom in atoms:
+             chars.extend(list(atom))
+         vocab = self.special_tokens + chars
+         for i, s in enumerate(vocab):
+             self.stoi[s] = i
+         assert self.stoi[PAD] == PAD_ID
+         assert self.stoi[SOS] == SOS_ID
+         assert self.stoi[EOS] == EOS_ID
+         assert self.stoi[UNK] == UNK_ID
+         assert self.stoi[MASK] == MASK_ID
+         self.itos = {item[1]: item[0] for item in self.stoi.items()}
+
+     def get_output_mask(self, id):
+         ''' TO FIX '''
+         mask = [False] * len(self)
+         if self.continuous_coords:
+             return mask
+         if self.is_x(id):
+             return [True] * (self.offset + self.maxx) + [False] * self.maxy
+         if self.is_y(id):
+             return [False] * self.offset + [True] * (self.maxx + self.maxy)
+         return mask
+
+     def nodes_to_sequence(self, nodes):
+         coords, symbols = nodes['coords'], nodes['symbols']
+         labels = [SOS_ID]
+         for (x, y), symbol in zip(coords, symbols):
+             assert 0 <= x <= 1
+             assert 0 <= y <= 1
+             labels.append(self.x_to_id(x))
+             labels.append(self.y_to_id(y))
+             for char in symbol:
+                 labels.append(self.symbol_to_id(char))
+         labels.append(EOS_ID)
+         return labels
+
+     def sequence_to_nodes(self, sequence):
+         coords, symbols = [], []
+         i = 0
+         if sequence[0] == SOS_ID:
+             i += 1
+         while i < len(sequence):
+             if sequence[i] == EOS_ID:
+                 break
+             if i+2 < len(sequence) and self.is_x(sequence[i]) and self.is_y(sequence[i+1]) and self.is_symbol(sequence[i+2]):
+                 x = self.id_to_x(sequence[i])
+                 y = self.id_to_y(sequence[i+1])
+                 for j in range(i+2, len(sequence)):
+                     if not self.is_symbol(sequence[j]):
+                         break
+                 symbol = ''.join(self.itos[sequence[k]] for k in range(i+2, j))
+                 coords.append([x, y])
+                 symbols.append(symbol)
+                 i = j
+             else:
+                 i += 1
+         return {'coords': coords, 'symbols': symbols}
+
+     def smiles_to_sequence(self, smiles, coords=None, mask_ratio=0, atom_only=False):
+         tokens = atomwise_tokenizer(smiles)
+         labels = [SOS_ID]
+         indices = []
+         atom_idx = -1
+         for token in tokens:
+             if atom_only and not self.is_atom_token(token):
+                 continue
+             for c in token:
+                 if c in self.stoi:
+                     labels.append(self.stoi[c])
+                 else:
+                     if self.debug:
+                         print(f'{c} not in vocab')
+                     labels.append(UNK_ID)
+             if self.is_atom_token(token):
+                 atom_idx += 1
+                 if not self.continuous_coords:
+                     if mask_ratio > 0 and random.random() < mask_ratio:
+                         labels.append(MASK_ID)
+                         labels.append(MASK_ID)
+                     elif coords is not None:
+                         if atom_idx < len(coords):
+                             x, y = coords[atom_idx]
+                             assert 0 <= x <= 1
+                             assert 0 <= y <= 1
+                         else:
+                             x = random.random()
+                             y = random.random()
+                         labels.append(self.x_to_id(x))
+                         labels.append(self.y_to_id(y))
+                 indices.append(len(labels) - 1)
+         labels.append(EOS_ID)
+         return labels, indices
+
+     def sequence_to_smiles(self, sequence):
+         has_coords = not self.continuous_coords
+         smiles = ''
+         coords, symbols, indices = [], [], []
+         i = 0
+         while i < len(sequence):
+             label = sequence[i]
+             if label == EOS_ID or label == PAD_ID:
+                 break
+             if self.is_x(label) or self.is_y(label):
+                 i += 1
+                 continue
+             if not self.is_atom(label):
+                 smiles += self.itos[label]
+                 i += 1
+                 continue
+             if self.itos[label] == '[':
+                 # Bracket atom: consume symbol tokens up to and including ']'.
+                 j = i + 1
+                 while j < len(sequence):
+                     if not self.is_symbol(sequence[j]):
+                         break
+                     if self.itos[sequence[j]] == ']':
+                         j += 1
+                         break
+                     j += 1
+             else:
+                 # Two-character elements (Cl, Br) are spelled with two tokens.
+                 if i+1 < len(sequence) and (self.itos[label] == 'C' and self.is_symbol(sequence[i+1]) and self.itos[sequence[i+1]] == 'l'
+                                             or self.itos[label] == 'B' and self.is_symbol(sequence[i+1]) and self.itos[sequence[i+1]] == 'r'):
+                     j = i+2
+                 else:
+                     j = i+1
+             token = ''.join(self.itos[sequence[k]] for k in range(i, j))
+             smiles += token
+             if has_coords:
+                 if j+2 < len(sequence) and self.is_x(sequence[j]) and self.is_y(sequence[j+1]):
+                     x = self.id_to_x(sequence[j])
+                     y = self.id_to_y(sequence[j+1])
+                     coords.append([x, y])
+                     symbols.append(token)
+                     indices.append(j+2)
+                     i = j+2
+                 else:
+                     i = j
+             else:
+                 if j < len(sequence):
+                     symbols.append(token)
+                     indices.append(j)
+                 i = j
+         results = {'smiles': smiles, 'symbols': symbols, 'indices': indices}
+         if has_coords:
+             results['coords'] = coords
+         return results
+
+
+ def get_tokenizer(args):
+     tokenizer = {}
+     for format_ in args.formats:
+         if format_ == 'atomtok':
+             if args.vocab_file is None:
+                 args.vocab_file = os.path.join(os.path.dirname(__file__), 'vocab/vocab_uspto.json')
+             tokenizer['atomtok'] = Tokenizer(args.vocab_file)
+         elif format_ == "atomtok_coords":
+             if args.vocab_file is None:
+                 args.vocab_file = os.path.join(os.path.dirname(__file__), 'vocab/vocab_uspto.json')
+             tokenizer["atomtok_coords"] = NodeTokenizer(args.coord_bins, args.vocab_file, args.sep_xy,
+                                                         continuous_coords=args.continuous_coords)
+         elif format_ == "chartok_coords":
+             if args.vocab_file is None:
+                 args.vocab_file = os.path.join(os.path.dirname(__file__), 'vocab/vocab_chars.json')
+             tokenizer["chartok_coords"] = CharTokenizer(args.coord_bins, args.vocab_file, args.sep_xy,
+                                                         continuous_coords=args.continuous_coords)
+     return tokenizer
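The coordinate tokens are just quantized positions: `x_to_id` maps a normalized coordinate in [0, 1] to one of `maxx` discrete ids placed after the symbol vocabulary, and `id_to_x` inverts it up to about half a bin of quantization error. A quick round trip (a sketch; no vocab file is loaded, so `offset == 0`):

from molscribe.tokenizer import NodeTokenizer

tok = NodeTokenizer(input_size=64)      # 64 coordinate bins, empty symbol vocab
token_id = tok.x_to_id(0.5)             # round(0.5 * 63) = 32
print(token_id, tok.id_to_x(token_id))  # 32 0.5079... (error < one bin width)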
molscribe/transformer/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .decoder import TransformerDecoder
+ from .embedding import Embeddings
+ from .swin_transformer import swin_base, swin_large
molscribe/transformer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (324 Bytes).
molscribe/transformer/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (13.5 kB).
molscribe/transformer/__pycache__/embedding.cpython-310.pyc ADDED
Binary file (7.91 kB).
molscribe/transformer/__pycache__/swin_transformer.cpython-310.pyc ADDED
Binary file (21.2 kB).
molscribe/transformer/decoder.py ADDED
@@ -0,0 +1,487 @@
"""
Implementation of "Attention is All You Need" and of
subsequent transformer based architectures
"""

import torch
import torch.nn as nn

from onmt.decoders.decoder import DecoderBase
from onmt.modules import MultiHeadedAttention, AverageAttention
from onmt.modules.position_ffn import PositionwiseFeedForward
from onmt.modules.position_ffn import ActivationFunction
from onmt.utils.misc import sequence_mask


class TransformerDecoderLayerBase(nn.Module):
    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            d_model (int): the dimension of keys/values/queries in
                :class:`MultiHeadedAttention`, also the input size of
                the first layer of the :class:`PositionwiseFeedForward`.
            heads (int): the number of heads for MultiHeadedAttention.
            d_ff (int): the hidden size of the second layer of the
                :class:`PositionwiseFeedForward`.
            dropout (float): dropout in residual, self-attn(dot) and
                feed-forward
            attention_dropout (float): dropout in context_attn (and
                self-attn(avg))
            self_attn_type (string): type of self-attention, scaled-dot or
                average
            max_relative_positions (int):
                Max distance between inputs in relative positions
                representations
            aan_useffn (bool): Turn on the FFN layer in the AAN decoder
            full_context_alignment (bool):
                whether to enable an extra full context decoder forward for
                alignment
            alignment_heads (int):
                number of cross-attention heads to use for alignment guiding
            pos_ffn_activation_fn (ActivationFunction):
                activation function choice for PositionwiseFeedForward layer
        """
        super(TransformerDecoderLayerBase, self).__init__()

        if self_attn_type == "scaled-dot":
            self.self_attn = MultiHeadedAttention(
                heads,
                d_model,
                dropout=attention_dropout,
                max_relative_positions=max_relative_positions,
            )
        elif self_attn_type == "average":
            self.self_attn = AverageAttention(
                d_model, dropout=attention_dropout, aan_useffn=aan_useffn
            )

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout,
                                                    pos_ffn_activation_fn)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        self.full_context_alignment = full_context_alignment
        self.alignment_heads = alignment_heads

    def forward(self, *args, **kwargs):
        """Extend `_forward` for (possibly) multiple decoder passes:
        always a default (future masked) decoder forward pass, and
        possibly a second future-aware decoder pass to jointly learn
        full context alignment, :cite:`garg2019jointly`.

        Args:
            * All arguments of _forward.
            with_align (bool): whether to return alignment attention.

        Returns:
            (FloatTensor, FloatTensor, FloatTensor or None):

            * output ``(batch_size, T, model_dim)``
            * top_attn ``(batch_size, T, src_len)``
            * attn_align ``(batch_size, T, src_len)`` or None
        """
        with_align = kwargs.pop("with_align", False)
        output, attns = self._forward(*args, **kwargs)
        top_attn = attns[:, 0, :, :].contiguous()
        attn_align = None
        if with_align:
            if self.full_context_alignment:
                # return _, (B, Q_len, K_len)
                _, attns = self._forward(*args, **kwargs, future=True)

            if self.alignment_heads > 0:
                attns = attns[:, : self.alignment_heads, :, :].contiguous()
            # layer average attention across heads, get ``(B, Q, K)``
            # Case 1: no full_context, no align heads -> layer avg baseline
            # Case 2: no full_context, 1 align heads -> guided align
            # Case 3: full_context, 1 align heads -> full ctx guided align
            attn_align = attns.mean(dim=1)
        return output, top_attn, attn_align

    def update_dropout(self, dropout, attention_dropout):
        self.self_attn.update_dropout(attention_dropout)
        self.feed_forward.update_dropout(dropout)
        self.drop.p = dropout

    def _forward(self, *args, **kwargs):
        raise NotImplementedError

    def _compute_dec_mask(self, tgt_pad_mask, future):
        tgt_len = tgt_pad_mask.size(-1)
        if not future:  # apply future_mask, result mask in (B, T, T)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8,
            )
            future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
        else:  # only mask padding, result mask in (B, 1, T)
            dec_mask = tgt_pad_mask
        return dec_mask
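
    # Worked example: for tgt_len = 3 with the last target position padded,
    # tgt_pad_mask (B, 1, T) broadcasts against future_mask (1, T, T), and
    # torch.gt(tgt_pad_mask + future_mask, 0) acts as an element-wise OR:
    #     future_mask          tgt_pad_mask         dec_mask
    #     [[0, 1, 1],          [[0, 0, 1]]          [[0, 1, 1],
    #      [0, 0, 1],    OR                   ->     [0, 0, 1],
    #      [0, 0, 0]]                                [0, 0, 1]]
    # True entries mark key positions a query may not attend to.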

    def _forward_self_attn(self, inputs_norm, dec_mask, layer_cache, step):
        if isinstance(self.self_attn, MultiHeadedAttention):
            return self.self_attn(
                inputs_norm,
                inputs_norm,
                inputs_norm,
                mask=dec_mask,
                layer_cache=layer_cache,
                attn_type="self",
            )
        elif isinstance(self.self_attn, AverageAttention):
            return self.self_attn(
                inputs_norm, mask=dec_mask, layer_cache=layer_cache, step=step
            )
        else:
            raise ValueError(
                f"self attention {type(self.self_attn)} not supported"
            )


class TransformerDecoderLayer(TransformerDecoderLayerBase):
    """Transformer decoder layer block in Pre-Norm style.
    Pre-Norm style is an improvement over the original paper's Post-Norm
    style, providing better convergence speed and performance. This is also
    the actual implementation in tensor2tensor and is also available in
    fairseq. See https://tunz.kr/post/4 and :cite:`DeeperTransformer`.

    .. mermaid::

        graph LR
        %% "*SubLayer" can be self-attn, src-attn or feed forward block
        A(input) --> B[Norm]
        B --> C["*SubLayer"]
        C --> D[Drop]
        D --> E((+))
        A --> E
        E --> F(out)

    """

    def __init__(
        self,
        d_model,
        heads,
        d_ff,
        dropout,
        attention_dropout,
        self_attn_type="scaled-dot",
        max_relative_positions=0,
        aan_useffn=False,
        full_context_alignment=False,
        alignment_heads=0,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        """
        Args:
            See TransformerDecoderLayerBase
        """
        super(TransformerDecoderLayer, self).__init__(
            d_model,
            heads,
            d_ff,
            dropout,
            attention_dropout,
            self_attn_type,
            max_relative_positions,
            aan_useffn,
            full_context_alignment,
            alignment_heads,
            pos_ffn_activation_fn=pos_ffn_activation_fn,
        )
        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=attention_dropout
        )
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)

    def update_dropout(self, dropout, attention_dropout):
        super(TransformerDecoderLayer, self).update_dropout(
            dropout, attention_dropout
        )
        self.context_attn.update_dropout(attention_dropout)

    def _forward(
        self,
        inputs,
        memory_bank,
        src_pad_mask,
        tgt_pad_mask,
        layer_cache=None,
        step=None,
        future=False,
    ):
        """A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding, or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (bool): ``(batch_size, 1, src_len)``
            tgt_pad_mask (bool): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``

        """
        dec_mask = None

        if inputs.size(1) > 1:
            # masking is necessary when sequence length is greater than one
            dec_mask = self._compute_dec_mask(tgt_pad_mask, future)

        inputs_norm = self.layer_norm_1(inputs)

        query, _ = self._forward_self_attn(
            inputs_norm, dec_mask, layer_cache, step
        )

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(
            memory_bank,
            memory_bank,
            query_norm,
            mask=src_pad_mask,
            layer_cache=layer_cache,
            attn_type="context",
        )
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns
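
    # In short, _forward is two pre-norm residual blocks; note that in
    # OpenNMT's PositionwiseFeedForward the final residual and layer norm
    # live inside the module itself:
    #   query  = inputs + Drop(SelfAttn(Norm1(inputs)))
    #   output = FFN(query + Drop(ContextAttn(Norm2(query), memory_bank)))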


class TransformerDecoderBase(DecoderBase):
    def __init__(self, d_model, copy_attn, alignment_layer):
        super(TransformerDecoderBase, self).__init__()

        # Decoder State
        self.state = {}

        # previously, there was a GlobalAttention module here for copy
        # attention. But it was never actually used -- the "copy" attention
        # just reuses the context attention.
        self._copy = copy_attn
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.alignment_layer = alignment_layer

    @classmethod
    def from_opt(cls, opt, embeddings):
        """Alternate constructor."""
        return cls(
            opt.dec_layers,
            opt.dec_rnn_size,
            opt.heads,
            opt.transformer_ff,
            opt.copy_attn,
            opt.self_attn_type,
            opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
            opt.attention_dropout[0] if type(opt.attention_dropout) is list
            else opt.attention_dropout,
            embeddings,
            opt.max_relative_positions,
            opt.aan_useffn,
            opt.full_context_alignment,
            opt.alignment_layer,
            alignment_heads=opt.alignment_heads,
            pos_ffn_activation_fn=opt.pos_ffn_activation_fn,
        )

    def init_state(self, src, memory_bank, enc_hidden):
        """Initialize decoder state."""
        self.state["src"] = src
        self.state["cache"] = None

    def map_state(self, fn):
        def _recursive_map(struct, batch_dim=0):
            for k, v in struct.items():
                if v is not None:
                    if isinstance(v, dict):
                        _recursive_map(v)
                    else:
                        struct[k] = fn(v, batch_dim)

        if self.state["src"] is not None:
            self.state["src"] = fn(self.state["src"], 1)
        if self.state["cache"] is not None:
            _recursive_map(self.state["cache"])

    def detach_state(self):
        raise NotImplementedError

    def forward(self, *args, **kwargs):
        raise NotImplementedError

    def update_dropout(self, dropout, attention_dropout):
        self.embeddings.update_dropout(dropout)
        for layer in self.transformer_layers:
            layer.update_dropout(dropout, attention_dropout)


class TransformerDecoder(TransformerDecoderBase):
    """The Transformer decoder from "Attention is All You Need".
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`

    .. mermaid::

        graph BT
        A[input]
        B[multi-head self-attn]
        BB[multi-head src-attn]
        C[feed forward]
        O[output]
        A --> B
        B --> BB
        BB --> C
        C --> O

    Args:
        num_layers (int): number of decoder layers.
        d_model (int): size of the model
        heads (int): number of heads
        d_ff (int): size of the inner FF layer
        copy_attn (bool): if using a separate copy attention
        self_attn_type (str): type of self-attention, scaled-dot or average
        dropout (float): dropout in residual, self-attn(dot) and feed-forward
        attention_dropout (float): dropout in context_attn (and self-attn(avg))
        embeddings (onmt.modules.Embeddings):
            embeddings to use, should have positional encodings
        max_relative_positions (int):
            Max distance between inputs in relative positions representations
        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
        full_context_alignment (bool):
            whether to enable an extra full context decoder forward for
            alignment
        alignment_layer (int): index of the layer to supervise for alignment
            guiding
        alignment_heads (int):
            number of cross-attention heads to use for alignment guiding
    """

    def __init__(
        self,
        num_layers,
        d_model,
        heads,
        d_ff,
        copy_attn,
        self_attn_type,
        dropout,
        attention_dropout,
        max_relative_positions,
        aan_useffn,
        full_context_alignment,
        alignment_layer,
        alignment_heads,
        pos_ffn_activation_fn=ActivationFunction.relu,
    ):
        super(TransformerDecoder, self).__init__(
            d_model, copy_attn, alignment_layer
        )

        self.transformer_layers = nn.ModuleList(
            [
                TransformerDecoderLayer(
                    d_model,
                    heads,
                    d_ff,
                    dropout,
                    attention_dropout,
                    self_attn_type=self_attn_type,
                    max_relative_positions=max_relative_positions,
                    aan_useffn=aan_useffn,
                    full_context_alignment=full_context_alignment,
                    alignment_heads=alignment_heads,
                    pos_ffn_activation_fn=pos_ffn_activation_fn,
                )
                for i in range(num_layers)
            ]
        )

    def detach_state(self):
        self.state["src"] = self.state["src"].detach()

    def forward(self, tgt_emb, memory_bank, src_pad_mask=None, tgt_pad_mask=None, step=None, **kwargs):
        """Decode, possibly stepwise."""
        if step == 0:
            self._init_cache(memory_bank)

        batch_size, src_len, src_dim = memory_bank.size()
        device = memory_bank.device
        if src_pad_mask is None:
            src_pad_mask = torch.zeros((batch_size, 1, src_len), dtype=torch.bool, device=device)
        output = tgt_emb
        batch_size, tgt_len, tgt_dim = tgt_emb.size()
        if tgt_pad_mask is None:
            tgt_pad_mask = torch.zeros((batch_size, 1, tgt_len), dtype=torch.bool, device=device)

        future = kwargs.pop("future", False)
        with_align = kwargs.pop("with_align", False)
        attn_aligns = []
        hiddens = []

        for i, layer in enumerate(self.transformer_layers):
            layer_cache = (
                self.state["cache"]["layer_{}".format(i)]
                if step is not None
                else None
            )
            output, attn, attn_align = layer(
                output,
                memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                layer_cache=layer_cache,
                step=step,
                with_align=with_align,
                future=future,
            )
            hiddens.append(output)
            if attn_align is not None:
                attn_aligns.append(attn_align)

        output = self.layer_norm(output)  # (B, L, D)

        attns = {"std": attn}
        if self._copy:
            attns["copy"] = attn
        if with_align:
            attns["align"] = attn_aligns[self.alignment_layer]  # ``(B, Q, K)``
            # attns["align"] = torch.stack(attn_aligns, 0).mean(0)  # All avg

        # TODO change the way attns is returned dict => list or tuple (onnx)
        return output, attns, hiddens

    def _init_cache(self, memory_bank):
        self.state["cache"] = {}
        for i, layer in enumerate(self.transformer_layers):
            layer_cache = {"memory_keys": None, "memory_values": None,
                           "self_keys": None, "self_values": None}
            self.state["cache"]["layer_{}".format(i)] = layer_cache
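
To make the stepwise decoding API concrete, here is a minimal usage sketch (hyper-parameters and shapes are illustrative; it assumes the onmt packages imported at the top of decoder.py are installed):

import torch
from molscribe.transformer import TransformerDecoder

decoder = TransformerDecoder(
    num_layers=6, d_model=256, heads=8, d_ff=2048,
    copy_attn=False, self_attn_type="scaled-dot",
    dropout=0.1, attention_dropout=0.1,
    max_relative_positions=0, aan_useffn=False,
    full_context_alignment=False, alignment_layer=-3, alignment_heads=0,
)
decoder.init_state(None, None, None)

memory_bank = torch.randn(2, 100, 256)  # (B, src_len, d_model) encoder output
tgt_emb = torch.randn(2, 1, 256)        # one embedded target token per step

# step=0 builds the per-layer key/value cache via _init_cache; later steps
# reuse it, so each call only pays for the newly generated token.
for step in range(3):
    output, attns, hiddens = decoder(tgt_emb, memory_bank, step=step)

# output: (2, 1, 256); attns['std']: cross-attention over the source;
# hiddens: the per-layer decoder outputs collected in forward. With beam
# search, map_state() reorders the cached keys/values between steps.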