Skip to content

Commit

Permalink
Use featurizers that us 3d conformer-based features and pretrained fe…
Browse files Browse the repository at this point in the history
…aturizers from hugging face and DGL
  • Loading branch information
Tom authored and Tom committed Aug 8, 2024
1 parent 407eeda commit 6bebadd
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
47 changes: 44 additions & 3 deletions comptox_ai/chemical_featurizer/generate_vectors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from comptox_ai.db.graph_db import GraphDB
from molfeat.trans.fp import FPVecTransformer
from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer
from molfeat.trans.pretrained import PretrainedDGLTransformer
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors, AllChem
import numpy as np
import pandas as pd
from collections import defaultdict
Expand Down Expand Up @@ -160,6 +162,38 @@ def retrieve_smiles(

return smiles_list, found_chemical_id_list

def generate_3d_conformers(smiles_list):
"""
Generates a list of mol objects with 3D conformer data from input list of SMILES strings.
Parameters
----------
smiles_list : List[str]
List of SMILES strings.
Returns
-------
List[rdkit.Chem.rdchem.Mol]
List of mol objects with 3D conformer information.
Examples
--------
>>> from comptox_ai.chemical_featurizer.generate_vectors import generate_3d_conformers
>>> from rdkit.Chem import Descriptors
>>> mol = generate_3d_conformers(["CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12"])
>>> Descriptors.MolWt(mol[0])
335.8789999999995
"""
mols_with_3d = []
for smiles in smiles_list:
mol = Chem.MolFromSmiles(smiles)
if mol is not None:
mol = Chem.AddHs(mol)
AllChem.EmbedMolecule(mol, randomSeed=0)
AllChem.UFFOptimizeMolecule(mol)
mols_with_3d.append(mol)
return mols_with_3d


def create_vector_table(
chemicals_to_find,
Expand Down Expand Up @@ -225,8 +259,15 @@ def create_vector_table(

for feature in molfeat_descriptors:
print(f"Calculating {feature} descriptors")
featurizer = FPVecTransformer(kind=feature, dtype=np.float32, verbose=True)
vectors.append(featurizer(smiles_list).tolist())

if feature in {"Roberta-Zinc480M-102M", "GPT2-Zinc480M-87M", "ChemGPT-1.2B", "ChemGPT-19M", "ChemGPT-4.7M", "MolT5", "ChemBERTa-77M-MTR", "ChemBERTa-77M-MLM"}:
featurizer = PretrainedHFTransformer(kind=feature, notation='smiles', dtype=float)
elif feature in {"gin_supervised_masking", "gin_supervised_infomax", "gin_supervised_edgepred", "jtvae_zinc_no_kl", "gin_supervised_contextpred"}:
featurizer = PretrainedDGLTransformer(kind=feature, dtype=float)
else:
mol_list = generate_3d_conformers(smiles_list)
featurizer = FPVecTransformer(kind=feature, dtype=np.float32, verbose=True)
vectors.append(featurizer(mol_list).tolist())
df_column_names.append(feature)

if rdkit_descriptors:
Expand Down
5 changes: 4 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,7 @@ dependencies:
- tqdm=4.62.2
- conda-forge::molfeat
- rdkit=="2024.03.3"
- conda-forge::ipdb
- conda-forge::ipdb
- transformers
- dgllife
- dglteam=2.0.0
7 changes: 7 additions & 0 deletions tests/test_chemical_featurizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from comptox_ai.chemical_featurizer.generate_vectors import *
from rdkit.Chem import Descriptors
import pytest
import os

Expand Down Expand Up @@ -127,6 +128,12 @@ def test_retrieve_dtxsid(self):
["DTXSID8023135", "DTXSID5023742"],
)

class TestGenerate3DConformers:
def test_generate_3d_conformers(self):
mol_list = generate_3d_conformers(["CCN(CCO)CCCC(C)Nc1ccnc2cc(Cl)ccc12", "CC(=O)CC(c1ccccc1)c1c(O)c2ccccc2oc1=O"])

assert Descriptors.MolWt(mol_list[0]) == pytest.approx(335.8789999999995, 0.001)
assert Descriptors.MolWt(mol_list[1]) == pytest.approx(308.3329999999997, 0.001)

class TestCreateVectorTable:

Expand Down

0 comments on commit 6bebadd

Please sign in to comment.