From 60e71d3379d624ed1ccbb3afc2dea75df33b668a Mon Sep 17 00:00:00 2001
From: Philippe Schwaller <phischwaller@gmail.com>
Date: Sun, 7 May 2023 13:48:32 +0200
Subject: [PATCH] Adding Suzuki Miyaura yield prediction dataset

---
 data/rxn_suzuki_miyaura/meta.yaml    |  38 +++++
 data/rxn_suzuki_miyaura/transform.py | 201 +++++++++++++++++++++++++++
 2 files changed, 239 insertions(+)
 create mode 100644 data/rxn_suzuki_miyaura/meta.yaml
 create mode 100644 data/rxn_suzuki_miyaura/transform.py

diff --git a/data/rxn_suzuki_miyaura/meta.yaml b/data/rxn_suzuki_miyaura/meta.yaml
new file mode 100644
index 000000000..f744718f3
--- /dev/null
+++ b/data/rxn_suzuki_miyaura/meta.yaml
@@ -0,0 +1,38 @@
+name: suzuki_miyaura_sach
+description: "High-throughput experimentation palladium-catalyzed Suzuki-Miyaura C-C\
+  \ cross-coupling data set with yields measured by liquid chromatography\u2013mass\
+  \ spectrometry."
+targets:
+- id: Product_Yield_PCT_Area_UV
+  description: Reaction yields analyzed by LCMS
+  units: '%'
+  type: continuous
+  names:
+  - Reaction yield
+  - yield
+identifiers:
+- id: reaction_SMILES
+  type: RXNSMILES
+  description: RXNSMILES
+license: MIT
+links:
+- url: https://doi.org/10.1126/science.aap9112
+  description: corresponding publication
+- url: https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py
+  description: preprocessing
+- url: https://github.com/reymond-group/drfp/tree/main/data
+  description: dataset
+num_points: 5760
+url: https://doi.org/10.1126/science.aap9112
+bibtex:
+- |-
+  @article{perera2018platform,
+  title={A platform for automated nanomole-scale reaction screening and micromole-scale synthesis in flow},
+  author={Perera, Damith and Tucker, Joseph W and Brahmbhatt, Shalini and Helal, Christopher J and Chong, Ashley and Farrell, William and Richardson, Paul and Sach, Neal W},
+  journal={Science},
+  volume={359},
+  number={6374},
+  pages={429--434},
+  year={2018},
+  publisher={American Association for the Advancement of Science},
+  }
diff --git a/data/rxn_suzuki_miyaura/transform.py b/data/rxn_suzuki_miyaura/transform.py
new file mode 100644
index 000000000..807744a93
--- /dev/null
+++ b/data/rxn_suzuki_miyaura/transform.py
@@ -0,0 +1,201 @@
+import pandas as pd
+import yaml
+from rdkit import Chem # 2022.9.5
+from rdkit.Chem import rdChemReactions
+
+reactant_1_smiles_dict = {
+    '6-chloroquinoline': 'C1=C(Cl)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
+    '6-Bromoquinoline': 'C1=C(Br)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
+    '6-triflatequinoline': 'C1C2C(=NC=CC=2)C=CC=1OS(C(F)(F)F)(=O)=O.CCC1=CC(=CC=C1)CC',
+    '6-Iodoquinoline': 'C1=C(I)C=CC2=NC=CC=C12.CCC1=CC(=CC=C1)CC', 
+    '6-quinoline-boronic acid hydrochloride': 'C1C(B(O)O)=CC=C2N=CC=CC=12.Cl.O',
+    'Potassium quinoline-6-trifluoroborate': '[B-](C1=CC2=C(C=C1)N=CC=C2)(F)(F)F.[K+].O',
+    '6-Quinolineboronic acid pinacol ester': 'B1(OC(C(O1)(C)C)(C)C)C2=CC3=C(C=C2)N=CC=C3.O'
+}
+
+reactant_2_smiles_dict = {
+    '2a, Boronic Acid': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B(O)O', 
+    '2b, Boronic Ester': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1B4OC(C)(C)C(C)(C)O4', 
+    '2c, Trifluoroborate': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1[B-](F)(F)F.[K+]',
+    '2d, Bromide': 'CC1=CC=C2C(C=NN2C3OCCCC3)=C1Br' 
+}
+
+catalyst_smiles_dict = {
+    'Pd(OAc)2': 'CC(=O)O~CC(=O)O~[Pd]'
+}
+
+ligand_smiles_dict = {
+    'P(tBu)3': 'CC(C)(C)P(C(C)(C)C)C(C)(C)C', 
+    'P(Ph)3 ': 'c3c(P(c1ccccc1)c2ccccc2)cccc3', 
+    'AmPhos': 'CC(C)(C)P(C1=CC=C(C=C1)N(C)C)C(C)(C)C', 
+    'P(Cy)3': 'C1(CCCCC1)P(C2CCCCC2)C3CCCCC3', 
+    'P(o-Tol)3': 'CC1=CC=CC=C1P(C2=CC=CC=C2C)C3=CC=CC=C3C',
+    'CataCXium A': 'CCCCP(C12CC3CC(C1)CC(C3)C2)C45CC6CC(C4)CC(C6)C5', 
+    'SPhos': 'COc1cccc(c1c2ccccc2P(C3CCCCC3)C4CCCCC4)OC', 
+    'dtbpf': 'CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.CC(C)(C)P(C1=CC=C[CH]1)C(C)(C)C.[Fe]', 
+    'XPhos': 'P(c2ccccc2c1c(cc(cc1C(C)C)C(C)C)C(C)C)(C3CCCCC3)C4CCCCC4', 
+    'dppf': 'C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.C1=CC=C(C=C1)P([C-]2C=CC=C2)C3=CC=CC=C3.[Fe+2]', 
+    'Xantphos': 'O6c1c(cccc1P(c2ccccc2)c3ccccc3)C(c7cccc(P(c4ccccc4)c5ccccc5)c67)(C)C',
+    'None': ''
+}
+
+reagent_1_smiles_dict = {
+    'NaOH': '[OH-].[Na+]', 
+    'NaHCO3': '[Na+].OC([O-])=O', 
+    'CsF': '[F-].[Cs+]', 
+    'K3PO4': '[K+].[K+].[K+].[O-]P([O-])([O-])=O', 
+    'KOH': '[K+].[OH-]', 
+    'LiOtBu': '[Li+].[O-]C(C)(C)C', 
+    'Et3N': 'CCN(CC)CC', 
+    'None': ''
+}
+
+solvent_1_smiles_dict = {
+    'MeCN': 'CC#N.O', 
+    'THF': 'C1CCOC1.O', 
+    'DMF': 'CN(C)C=O.O', 
+    'MeOH': 'CO.O', 
+    'MeOH/H2O_V2 9:1': 'CO.O', 
+    'THF_V2': 'C1CCOC1.O'
+}
+
+def canonicalize_smiles(smi):
+    mol = Chem.MolFromSmiles(smi)
+    if mol is not None:
+        return Chem.MolToSmiles(mol)
+    return ''
+
+def make_reaction_smiles(row):
+    precursors = f" {row['reactant1_SMILES']}.{row['reactant2_SMILES']}.{row['catalyst_SMILES']}.{row['ligand_SMILES']}.{row['reagent_SMILES']}.{row['solvent_SMILES']} "
+    product = 'C1=C(C2=C(C)C=CC3N(C4OCCCC4)N=CC2=3)C=CC2=NC=CC=C12'
+#     print(precursors, product)
+    can_precursors = Chem.MolToSmiles(Chem.MolFromSmiles(precursors.replace('...', '.').replace('..', '.').replace(' .', '').replace('. ', '').replace(' ', '')))
+    can_product = Chem.MolToSmiles(Chem.MolFromSmiles(product))
+    
+    return f"{can_precursors}>>{can_product}"
+
+def add_molecules_and_rxn_smiles_to_df(df):
+
+    df['reactant1_SMILES'] = df.Reactant_1_Name.apply(lambda molecule: canonicalize_smiles(reactant_1_smiles_dict[molecule]))
+    df['reactant2_SMILES'] = df.Reactant_2_Name.apply(lambda molecule: canonicalize_smiles(reactant_2_smiles_dict[molecule]))
+    df['catalyst_SMILES'] = df.Catalyst_1_Short_Hand.apply(lambda molecule: canonicalize_smiles(catalyst_smiles_dict[molecule]))
+    df['ligand_SMILES'] = df.Ligand_Short_Hand.apply(lambda molecule: canonicalize_smiles(ligand_smiles_dict[molecule]))
+    df['reagent_SMILES'] = df.Reagent_1_Short_Hand.apply(lambda molecule: canonicalize_smiles(reagent_1_smiles_dict[molecule]))
+    df['solvent_SMILES'] = df.Solvent_1_Short_Hand.apply(lambda molecule: canonicalize_smiles(solvent_1_smiles_dict[molecule]))
+
+    df['reaction_SMILES'] = df.apply(lambda row: make_reaction_smiles(row), axis=1)
+
+    return df
+
+
+def get_and_transform_data():
+    # get raw data
+    fn_data_original = "Richardson_and_Sach_input_data.csv"
+    data = pd.read_excel('https://github.com/reymond-group/drfp/raw/main/data/Suzuki-Miyaura/aap9112_Data_File_S1.xlsx')
+    data.to_csv(fn_data_original, index=False)
+    df = pd.read_csv(fn_data_original, delimiter=",")
+
+    # check if fields are the same
+    fields_orig = df.columns.tolist()
+    assert fields_orig == [
+        'Reaction_No',
+        'Reactant_1_Name',
+        'Reactant_1_Short_Hand',
+        'Reactant_1_eq',
+        'Reactant_1_mmol',
+        'Reactant_2_Name',
+        'Reactant_2_eq',
+        'Catalyst_1_Short_Hand',
+        'Catalyst_1_eq',
+        'Ligand_Short_Hand',
+        'Ligand_eq',
+        'Reagent_1_Short_Hand',
+        'Reagent_1_eq',
+        'Solvent_1_Short_Hand',
+        'Product_Yield_PCT_Area_UV',
+        'Product_Yield_Mass_Ion_Count'
+    ]
+
+    # data cleaning
+    df = add_molecules_and_rxn_smiles_to_df(df)
+    
+    assert not df.duplicated().sum()
+
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    # create meta yaml
+    meta = {
+    "name": "suzuki_miyaura_sach",  # unique identifier, we will also use this for directory names
+    "description": """High-throughput experimentation palladium-catalyzed Suzuki-Miyaura C-C cross-coupling data set with yields measured by liquid chromatography–mass spectrometry.""",
+    "targets": [
+        {
+            "id": "Product_Yield_PCT_Area_UV",  # name of the column in a tabular dataset
+            "description": "Reaction yields analyzed by LCMS",  # description of what this column means
+            "units": "%",  # units of the values in this column (leave empty if unitless)
+            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
+            "names": [  # names for the property (to sample from for building the prompts)
+                "Reaction yield",
+                "yield",
+            ],
+        },
+    ],
+    "identifiers": [
+        {
+            "id": "reaction_SMILES",  # column name
+            "type": "RXNSMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+            "description": "RXNSMILES",  # description (optional, except for "Other")
+        },
+    ],
+    "license": "MIT",  # license under which the original dataset was published
+    "links": [  # list of relevant links (original dataset, other uses, etc.)
+        {
+            "url": "https://doi.org/10.1126/science.aap9112",
+            "description": "corresponding publication",
+        },
+        {
+            "url": "https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py",
+            "description": "preprocessing",
+        },
+        {
+            "url": "https://github.com/reymond-group/drfp/tree/main/data",
+            "description": "dataset",
+        }
+    ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "url": "https://doi.org/10.1126/science.aap9112",
+        "bibtex": [
+            """@article{perera2018platform,
+title={A platform for automated nanomole-scale reaction screening and micromole-scale synthesis in flow},
+author={Perera, Damith and Tucker, Joseph W and Brahmbhatt, Shalini and Helal, Christopher J and Chong, Ashley and Farrell, William and Richardson, Paul and Sach, Neal W},
+journal={Science},
+volume={359},
+number={6374},
+pages={429--434},
+year={2018},
+publisher={American Association for the Advancement of Science},
+}""",
+    ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dum
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+if __name__ == "__main__":
+    get_and_transform_data()