From 5f6f55dc288338e4358cf6e665930867ff5ca8cb Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 27 May 2020 18:06:35 -0400 Subject: [PATCH 01/17] ignore pycharm, .DS_Store --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 2267b020..91fbb112 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,9 @@ dmypy.json # Parm@Frosst download parm_at_Frosst.tgz + +# PyCharm +.idea + + +.DS_Store From 9dc8cd45337e21480283b7a041cecfcbfa4955e3 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 27 May 2020 18:15:11 -0400 Subject: [PATCH 02/17] alkethoh rings interaction-typing dataset --- .gitignore | 4 ++ hgfp/data/alkethoh/__init__.py | 0 hgfp/data/alkethoh/label_molecules.py | 98 +++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 hgfp/data/alkethoh/__init__.py create mode 100644 hgfp/data/alkethoh/label_molecules.py diff --git a/.gitignore b/.gitignore index 91fbb112..8ff73327 100644 --- a/.gitignore +++ b/.gitignore @@ -140,3 +140,7 @@ parm_at_Frosst.tgz .DS_Store + +# AlkEthOH dataset +hgfp/data/alkethoh/AlkEthOH_rings.npz +hgfp/data/alkethoh/AlkEthOH_rings.smi diff --git a/hgfp/data/alkethoh/__init__.py b/hgfp/data/alkethoh/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hgfp/data/alkethoh/label_molecules.py b/hgfp/data/alkethoh/label_molecules.py new file mode 100644 index 00000000..90af34c8 --- /dev/null +++ b/hgfp/data/alkethoh/label_molecules.py @@ -0,0 +1,98 @@ +"""label every molecule in AlkEthOH rings set""" + +import os +import urllib + +import numpy as np +from openforcefield.topology import Molecule +from openforcefield.typing.engines.smirnoff import ForceField +from tqdm import tqdm + +alkethoh_url = 'https://raw.githubusercontent.com/openforcefield/open-forcefield-data/e07bde16c34a3fa1d73ab72e2b8aeab7cd6524df/Model-Systems/AlkEthOH_distrib/AlkEthOH_rings.smi' +alkethoh_local_path = 'AlkEthOH_rings.smi' + + +def download_alkethoh(): + if not os.path.exists(alkethoh_local_path): + with urllib.request.urlopen(alkethoh_url) as response: + smi = response.read() + with open(alkethoh_local_path, 'wb') as f: + f.write(smi) + + +# Load the OpenFF "Parsley" force field. -- pick unconstrained so that Hbond stretches are sampled... +forcefield = ForceField('openff_unconstrained-1.0.0.offxml') + + +## loading molecules +# TODO: replace mol_from_smiles with something that reads the mol2 files directly... +def mol_from_smiles(smiles): + mol = Molecule.from_smiles(smiles, hydrogens_are_explicit=False, allow_undefined_stereo=True) + return mol + + +## labeling molecules +def label_mol(mol): + return forcefield.label_molecules(mol.to_topology())[0] + + +def get_inds_and_labels(labeled_mol, type_name='vdW'): + terms = labeled_mol[type_name] + inds = np.array(list(terms.keys())) + labels = np.array([int(term.id[1:]) for term in terms.values()]) + + assert (len(inds) == len(labels)) + + return inds, labels + + +from functools import partial + +get_labeled_atoms = partial(get_inds_and_labels, type_name='vdW') +get_labeled_bonds = partial(get_inds_and_labels, type_name='Bonds') +get_labeled_angles = partial(get_inds_and_labels, type_name='Angles') +get_labeled_torsions = partial(get_inds_and_labels, type_name='Torsions') + +if __name__ == '__main__': + # download, if it isn't already present + download_alkethoh() + + # load molecules + with open(alkethoh_local_path, 'r') as f: + ring_smiles = [l.split()[0] for l in f.readlines()] + with open(alkethoh_local_path, 'r') as f: + ring_names = [l.split()[1] for l in f.readlines()] + mols = [Molecule.from_smiles(s, allow_undefined_stereo=True) for s in ring_smiles] + + # Label molecules using forcefield + # Takes about ~200ms per molecule -- can do ~1000 molecules in ~5-6 minutes, sequentially + labeled_mols = [] + for mol in tqdm(mols): + labeled_mols.append(label_mol(mol)) + + label_dict = dict() + for (name, labeled_mol) in zip(ring_names, labeled_mols): + label_dict[name + '_atom_inds'], label_dict[name + '_atom_labels'] = get_labeled_atoms(labeled_mol) + label_dict[name + '_bond_inds'], label_dict[name + '_bond_labels'] = get_labeled_bonds(labeled_mol) + label_dict[name + '_angle_inds'], label_dict[name + '_angle_labels'] = get_labeled_angles(labeled_mol) + label_dict[name + '_torsion_inds'], label_dict[name + '_torsion_labels'] = get_labeled_torsions(labeled_mol) + + # save to compressed array + description = f""" + Each of the molecules in AlkEthOH_rings.smi + {alkethoh_url} + + is labeled according to the forcefield `openff_unconstrained-1.0.0.offxml`: + https://github.com/openforcefield/openforcefields/blob/master/openforcefields/offxml/openff_unconstrained-1.0.0.offxml + + Keys are of the form + __ + + such as 'AlkEthOH_r0_atom_inds' or 'AlkEthOH_r0_torsion_labels' + + and values are integer arrays + """ + + np.savez_compressed('AlkEthOH_rings.npz', + description=description, + **label_dict) From bdeccba4c3d2459aa12728a62010e08808bc95c1 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 27 May 2020 18:22:02 -0400 Subject: [PATCH 03/17] fix key error: 'Torsions' --> 'ProperTorsions' --- hgfp/data/alkethoh/label_molecules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hgfp/data/alkethoh/label_molecules.py b/hgfp/data/alkethoh/label_molecules.py index 90af34c8..b24f036e 100644 --- a/hgfp/data/alkethoh/label_molecules.py +++ b/hgfp/data/alkethoh/label_molecules.py @@ -51,7 +51,7 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): get_labeled_atoms = partial(get_inds_and_labels, type_name='vdW') get_labeled_bonds = partial(get_inds_and_labels, type_name='Bonds') get_labeled_angles = partial(get_inds_and_labels, type_name='Angles') -get_labeled_torsions = partial(get_inds_and_labels, type_name='Torsions') +get_labeled_torsions = partial(get_inds_and_labels, type_name='ProperTorsions') if __name__ == '__main__': # download, if it isn't already present From c91dce01103b47c5a9e752c182da3d58f08b2c2d Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 27 May 2020 19:32:45 -0400 Subject: [PATCH 04/17] save also openff mols --- .gitignore | 1 + hgfp/data/alkethoh/label_molecules.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 8ff73327..fdc4d476 100644 --- a/.gitignore +++ b/.gitignore @@ -144,3 +144,4 @@ parm_at_Frosst.tgz # AlkEthOH dataset hgfp/data/alkethoh/AlkEthOH_rings.npz hgfp/data/alkethoh/AlkEthOH_rings.smi +hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl diff --git a/hgfp/data/alkethoh/label_molecules.py b/hgfp/data/alkethoh/label_molecules.py index b24f036e..35225349 100644 --- a/hgfp/data/alkethoh/label_molecules.py +++ b/hgfp/data/alkethoh/label_molecules.py @@ -2,6 +2,7 @@ import os import urllib +from pickle import dump import numpy as np from openforcefield.topology import Molecule @@ -64,6 +65,9 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): ring_names = [l.split()[1] for l in f.readlines()] mols = [Molecule.from_smiles(s, allow_undefined_stereo=True) for s in ring_smiles] + with open('AlkEthOH_rings_offmols.pkl', 'wb') as f: + dump(mols, f) + # Label molecules using forcefield # Takes about ~200ms per molecule -- can do ~1000 molecules in ~5-6 minutes, sequentially labeled_mols = [] From 69538698447fa7cb2c44526796b3d6f73fa3aed3 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 12:27:38 -0400 Subject: [PATCH 05/17] dicts instead of lists for mols and labeled_mols --- hgfp/data/alkethoh/label_molecules.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hgfp/data/alkethoh/label_molecules.py b/hgfp/data/alkethoh/label_molecules.py index 35225349..0072220d 100644 --- a/hgfp/data/alkethoh/label_molecules.py +++ b/hgfp/data/alkethoh/label_molecules.py @@ -63,16 +63,19 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): ring_smiles = [l.split()[0] for l in f.readlines()] with open(alkethoh_local_path, 'r') as f: ring_names = [l.split()[1] for l in f.readlines()] - mols = [Molecule.from_smiles(s, allow_undefined_stereo=True) for s in ring_smiles] + + mols = dict() + for i in range(len(ring_names)): + mols[ring_names[i]] = Molecule.from_smiles(ring_smiles[i], allow_undefined_stereo=True) with open('AlkEthOH_rings_offmols.pkl', 'wb') as f: dump(mols, f) # Label molecules using forcefield # Takes about ~200ms per molecule -- can do ~1000 molecules in ~5-6 minutes, sequentially - labeled_mols = [] - for mol in tqdm(mols): - labeled_mols.append(label_mol(mol)) + labeled_mols = dict() + for name in tqdm(ring_names): + labeled_mols[name] = label_mol(mols[name]) label_dict = dict() for (name, labeled_mol) in zip(ring_names, labeled_mols): From 78cd003da042e9c037d67a3a883aaf72daf1bfe6 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 12:27:54 -0400 Subject: [PATCH 06/17] initial rule-based baseline typers --- hgfp/data/alkethoh/baseline_typers.py | 201 ++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 hgfp/data/alkethoh/baseline_typers.py diff --git a/hgfp/data/alkethoh/baseline_typers.py b/hgfp/data/alkethoh/baseline_typers.py new file mode 100644 index 00000000..411ddbcf --- /dev/null +++ b/hgfp/data/alkethoh/baseline_typers.py @@ -0,0 +1,201 @@ +import numpy as np +from hgfp.data.alkethoh.label_molecules import get_labeled_atoms + +# baseline rule-based classifiers + +def get_elements(mol, atom_indices): + elements = np.array([a.element.atomic_number for a in mol.atoms]) + return elements[atom_indices] + + +def get_element_tuples(mol, atom_indices): + """return the same tuple regardless of whether atom_indices are running forward or backward + + for torsions, get_element_tuples(i,j,k,l) + + """ + return [min(tuple(t), tuple(t[::-1])) for t in get_elements(mol, atom_indices)] + +def is_carbon(atom): + return atom.atomic_number == 6 + + +def is_hydrogen(atom): + return atom.atomic_number == 1 + + +def is_oxygen(atom): + return atom.atomic_number == 8 + + +def neighboring_carbons(atom): + return list(filter(is_carbon, atom.bonded_atoms)) + + +def neighboring_hydrogens(atom): + return list(filter(is_hydrogen, atom.bonded_atoms)) + + +def neighboring_oxygens(atom): + return list(filter(is_oxygen, atom.bonded_atoms)) + + +def classify_atom(atom): + if is_hydrogen(atom): + carbon_neighborhood = neighboring_carbons(atom) + if len(carbon_neighborhood) == 1: + N = len(neighboring_oxygens(carbon_neighborhood[0])) + if N >= 3: + return 5 + elif N == 2: + return 4 + elif N == 1: + return 3 + else: + return 2 + else: + return 12 + elif is_carbon(atom): + return 16 + elif is_oxygen(atom): + if len(neighboring_hydrogens(atom)) == 1: + return 19 + else: + return 18 + + +def classify_atoms(mol): + return np.array([classify_atom(a) for a in mol.atoms]) + + +def classify_bond(atom1, atom2): + # both carbon + if is_carbon(atom1) and is_carbon(atom2): + return 1 + + # one is carbon and the other oxygen, regardless of order + elif (is_carbon(atom1) and is_oxygen(atom2)) or (is_carbon(atom2) and is_oxygen(atom1)): + + # need to catch *which* one was oxygen + if is_oxygen(atom1): + oxygen = atom1 + else: + oxygen = atom2 + + H = len(neighboring_hydrogens(oxygen)) + X = len(list(oxygen.bonded_atoms)) + + if (X == 2) and (H == 0): + return 15 + else: + return 14 + + # one is carbon and the other hydrogen, regardless of order + elif (is_carbon(atom1) and is_hydrogen(atom2)) or (is_carbon(atom2) and is_hydrogen(atom1)): + return 83 + + # both oxygen + elif is_oxygen(atom1) and is_oxygen(atom2): + return 40 + + # oxygen-hydrogen + else: + return 87 + +def classify_bonds(mol, bond_inds): + atoms = list(mol.atoms) + return np.array([classify_bond(atoms[i], atoms[j]) for (i,j) in bond_inds]) + + +def classify_angle(atom1, atom2, atom3): + if is_hydrogen(atom1) and is_hydrogen(atom3): + return 2 + elif is_oxygen(atom2): + return 27 + else: + return 1 + + +def classify_angles(mol, angle_inds): + return np.array([ + classify_angle(mol.atoms[i], mol.atoms[j], mol.atoms[k]) for (i, j, k) in angle_inds]) + + +# simple torsion classifier: look at element identities of atom1, atom2, atom3, and atom4 +from collections import defaultdict + +def learn_torsion_lookup(): + # TODO: collect rest of missing logic... + torsion_tuple_counts = defaultdict(lambda: defaultdict(lambda: 0)) + for i in range(len(mols)): + torsion_inds, torsion_labels = get_labeled_torsions(labeled_mols[i]) + element_tuples = get_element_tuples(mols[i], torsion_inds) + for j in range(len(torsion_labels)): + torsion_tuple_counts[element_tuples[j]][torsion_labels[j]] += 1 + + + +# the learned classifier +torsion_prediction_dict = { + (6, 6, 6, 8): 1, (1, 6, 6, 6): 4, (1, 8, 6, 6): 85, (6, 6, 8, 6): 87, + (6, 8, 6, 8): 89, (1, 6, 8, 6): 86, (8, 6, 6, 8): 5, (1, 6, 6, 8): 9, + (1, 8, 6, 8): 84, (1, 6, 6, 1): 3, (1, 6, 8, 1): 84, (6, 6, 6, 6): 2, + (6, 6, 8, 8): 86, (6, 8, 8, 6): 116, (1, 6, 8, 8): 86, (8, 6, 8, 8): 86, + (6, 8, 8, 8): 116 +} + + +def classify_torsions(mol, torsion_inds): + return np.array([torsion_prediction_dict[t] for t in get_element_tuples(mol, torsion_inds)]) + + +if __name__ == '__main__': + from pickle import load + with open('AlkEthOH_rings_offmols.pkl', 'rb') as f: + mols = load(f) + + label_dict = np.load('AlkEthOH_rings.npz') + + # atoms + n_correct, n_total = 0, 0 + for name in mols: + mol = mols[name] + inds, labels = label_dict[f'{name}_atom_inds'], label_dict[f'{name}_atom_labels'] + + ## TODO: track this down + #n_correct += sum(classify_atoms(mols[name])[inds] == labels) # problem: atom_inds and atom_labels aren't the same shape! + + # TODO: update classify_atoms signature to accept indices, just like bonds/angles/torsions + n_correct += sum(classify_atoms(mols[name]) == labels) + n_total += mol.n_atoms + print('atoms: ', n_correct, n_total) + + # bonds + n_correct, n_total = 0, 0 + for name in mols: + mol = mols[name] + inds, labels = label_dict[f'{name}_bond_inds'], label_dict[f'{name}_bond_labels'] + + n_correct += sum(classify_bonds(mols[name], inds) == labels) + n_total += len(labels) + print('bonds: ', n_correct, n_total) + + # angles + n_correct, n_total = 0, 0 + for name in mols: + mol = mols[name] + inds, labels = label_dict[f'{name}_angle_inds'], label_dict[f'{name}_angle_labels'] + + n_correct += sum(classify_angles(mols[name], inds) == labels) + n_total += len(labels) + print('angles: ', n_correct, n_total) + + # torsions + n_correct, n_total = 0, 0 + for name in mols: + mol = mols[name] + inds, labels = label_dict[f'{name}_torsion_inds'], label_dict[f'{name}_torsion_labels'] + + n_correct += sum(classify_torsions(mols[name], inds) == labels) + n_total += len(labels) + print('torsions: ', n_correct, n_total) From 5a045286f8f1c3381562936f08038ae91c970d3b Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 14:51:45 -0400 Subject: [PATCH 07/17] spruce up molecule-labeling script --- hgfp/data/alkethoh/label_molecules.py | 33 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/hgfp/data/alkethoh/label_molecules.py b/hgfp/data/alkethoh/label_molecules.py index 0072220d..911d05d3 100644 --- a/hgfp/data/alkethoh/label_molecules.py +++ b/hgfp/data/alkethoh/label_molecules.py @@ -60,13 +60,13 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): # load molecules with open(alkethoh_local_path, 'r') as f: - ring_smiles = [l.split()[0] for l in f.readlines()] + smiles = [l.split()[0] for l in f.readlines()] with open(alkethoh_local_path, 'r') as f: - ring_names = [l.split()[1] for l in f.readlines()] + names = [l.split()[1] for l in f.readlines()] mols = dict() - for i in range(len(ring_names)): - mols[ring_names[i]] = Molecule.from_smiles(ring_smiles[i], allow_undefined_stereo=True) + for i in range(len(names)): + mols[names[i]] = Molecule.from_smiles(smiles[i], allow_undefined_stereo=True) with open('AlkEthOH_rings_offmols.pkl', 'wb') as f: dump(mols, f) @@ -74,15 +74,24 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): # Label molecules using forcefield # Takes about ~200ms per molecule -- can do ~1000 molecules in ~5-6 minutes, sequentially labeled_mols = dict() - for name in tqdm(ring_names): + for name in tqdm(names): labeled_mols[name] = label_mol(mols[name]) label_dict = dict() - for (name, labeled_mol) in zip(ring_names, labeled_mols): - label_dict[name + '_atom_inds'], label_dict[name + '_atom_labels'] = get_labeled_atoms(labeled_mol) - label_dict[name + '_bond_inds'], label_dict[name + '_bond_labels'] = get_labeled_bonds(labeled_mol) - label_dict[name + '_angle_inds'], label_dict[name + '_angle_labels'] = get_labeled_angles(labeled_mol) - label_dict[name + '_torsion_inds'], label_dict[name + '_torsion_labels'] = get_labeled_torsions(labeled_mol) + n_atoms, n_bonds, n_angles, n_torsions = 0, 0, 0, 0 + + for name in names: + labeled_mol = labeled_mols[name] + label_dict[f'{name}_atom_inds'], label_dict[f'{name}_atom_labels'] = get_labeled_atoms(labeled_mol) + n_atoms += len(label_dict[f'{name}_atom_inds']) + label_dict[f'{name}_bond_inds'], label_dict[f'{name}_bond_labels'] = get_labeled_bonds(labeled_mol) + n_bonds += len(label_dict[f'{name}_bond_inds']) + label_dict[f'{name}_angle_inds'], label_dict[f'{name}_angle_labels'] = get_labeled_angles(labeled_mol) + n_angles += len(label_dict[f'{name}_angle_inds']) + label_dict[f'{name}_torsion_inds'], label_dict[f'{name}_torsion_labels'] = get_labeled_torsions(labeled_mol) + n_torsions += len(label_dict[f'{name}_torsion_inds']) + summary = f'# atoms: {n_atoms}, # bonds: {n_bonds}, # angles: {n_angles}, # torsions: {n_torsions}' + print(summary) # save to compressed array description = f""" @@ -97,7 +106,9 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): such as 'AlkEthOH_r0_atom_inds' or 'AlkEthOH_r0_torsion_labels' - and values are integer arrays + and values are integer arrays. + + {summary} """ np.savez_compressed('AlkEthOH_rings.npz', From 88595618162b88028bf046a80f22d005fe2e14f9 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 14:52:36 -0400 Subject: [PATCH 08/17] add type hints and reduce duplicated code alkethoh/baseline_typers --- hgfp/data/alkethoh/baseline_typers.py | 141 +++++++++----------------- 1 file changed, 46 insertions(+), 95 deletions(-) diff --git a/hgfp/data/alkethoh/baseline_typers.py b/hgfp/data/alkethoh/baseline_typers.py index 411ddbcf..3fbcbf6d 100644 --- a/hgfp/data/alkethoh/baseline_typers.py +++ b/hgfp/data/alkethoh/baseline_typers.py @@ -1,46 +1,32 @@ +from typing import Iterable + import numpy as np -from hgfp.data.alkethoh.label_molecules import get_labeled_atoms +from openforcefield.topology import Molecule, Atom -# baseline rule-based classifiers -def get_elements(mol, atom_indices): +# baseline rule-based classifiers +def get_elements(mol: Molecule, atom_indices: Iterable[Iterable[int]]) -> np.ndarray: + """array of atomic numbers within a molecule""" elements = np.array([a.element.atomic_number for a in mol.atoms]) return elements[atom_indices] -def get_element_tuples(mol, atom_indices): - """return the same tuple regardless of whether atom_indices are running forward or backward - - for torsions, get_element_tuples(i,j,k,l) - - """ +def get_element_tuples(mol: Molecule, atom_indices: Iterable[Iterable[int]]) -> list: + """the same tuple regardless of whether atom_indices are running forward or backward""" return [min(tuple(t), tuple(t[::-1])) for t in get_elements(mol, atom_indices)] -def is_carbon(atom): - return atom.atomic_number == 6 - - -def is_hydrogen(atom): - return atom.atomic_number == 1 - - -def is_oxygen(atom): - return atom.atomic_number == 8 - -def neighboring_carbons(atom): - return list(filter(is_carbon, atom.bonded_atoms)) +is_carbon = lambda atom: atom.atomic_number == 6 +is_hydrogen = lambda atom: atom.atomic_number == 1 +is_oxygen = lambda atom: atom.atomic_number == 8 +neighboring_carbons = lambda atom: list(filter(is_carbon, atom.bonded_atoms)) +neighboring_hydrogens = lambda atom: list(filter(is_hydrogen, atom.bonded_atoms)) +neighboring_oxygens = lambda atom: list(filter(is_oxygen, atom.bonded_atoms)) -def neighboring_hydrogens(atom): - return list(filter(is_hydrogen, atom.bonded_atoms)) - -def neighboring_oxygens(atom): - return list(filter(is_oxygen, atom.bonded_atoms)) - - -def classify_atom(atom): +## atoms +def classify_atom(atom: Atom) -> int: if is_hydrogen(atom): carbon_neighborhood = neighboring_carbons(atom) if len(carbon_neighborhood) == 1: @@ -64,11 +50,14 @@ def classify_atom(atom): return 18 -def classify_atoms(mol): - return np.array([classify_atom(a) for a in mol.atoms]) +def classify_atoms(mol: Molecule, atom_inds: np.ndarray) -> np.ndarray: + assert (atom_inds.shape)[1] == 1 + atoms = mol.atoms + return np.array([classify_atom(atoms[i]) for i in atom_inds[:, 0]]) -def classify_bond(atom1, atom2): +## bonds +def classify_bond(atom1: Atom, atom2: Atom) -> int: # both carbon if is_carbon(atom1) and is_carbon(atom2): return 1 @@ -102,12 +91,14 @@ def classify_bond(atom1, atom2): else: return 87 -def classify_bonds(mol, bond_inds): + +def classify_bonds(mol: Molecule, bond_inds: Iterable) -> np.ndarray: atoms = list(mol.atoms) - return np.array([classify_bond(atoms[i], atoms[j]) for (i,j) in bond_inds]) + return np.array([classify_bond(atoms[i], atoms[j]) for (i, j) in bond_inds]) -def classify_angle(atom1, atom2, atom3): +## angles +def classify_angle(atom1: Atom, atom2: Atom, atom3: Atom) -> int: if is_hydrogen(atom1) and is_hydrogen(atom3): return 2 elif is_oxygen(atom2): @@ -116,26 +107,13 @@ def classify_angle(atom1, atom2, atom3): return 1 -def classify_angles(mol, angle_inds): +def classify_angles(mol: Molecule, angle_inds: Iterable[Iterable[int]]) -> np.ndarray: return np.array([ classify_angle(mol.atoms[i], mol.atoms[j], mol.atoms[k]) for (i, j, k) in angle_inds]) +## torsions # simple torsion classifier: look at element identities of atom1, atom2, atom3, and atom4 -from collections import defaultdict - -def learn_torsion_lookup(): - # TODO: collect rest of missing logic... - torsion_tuple_counts = defaultdict(lambda: defaultdict(lambda: 0)) - for i in range(len(mols)): - torsion_inds, torsion_labels = get_labeled_torsions(labeled_mols[i]) - element_tuples = get_element_tuples(mols[i], torsion_inds) - for j in range(len(torsion_labels)): - torsion_tuple_counts[element_tuples[j]][torsion_labels[j]] += 1 - - - -# the learned classifier torsion_prediction_dict = { (6, 6, 6, 8): 1, (1, 6, 6, 6): 4, (1, 8, 6, 6): 85, (6, 6, 8, 6): 87, (6, 8, 6, 8): 89, (1, 6, 8, 6): 86, (8, 6, 6, 8): 5, (1, 6, 6, 8): 9, @@ -145,57 +123,30 @@ def learn_torsion_lookup(): } -def classify_torsions(mol, torsion_inds): +def classify_torsions(mol: Molecule, torsion_inds: Iterable[Iterable[int]]): return np.array([torsion_prediction_dict[t] for t in get_element_tuples(mol, torsion_inds)]) if __name__ == '__main__': from pickle import load + with open('AlkEthOH_rings_offmols.pkl', 'rb') as f: mols = load(f) label_dict = np.load('AlkEthOH_rings.npz') - # atoms - n_correct, n_total = 0, 0 - for name in mols: - mol = mols[name] - inds, labels = label_dict[f'{name}_atom_inds'], label_dict[f'{name}_atom_labels'] - - ## TODO: track this down - #n_correct += sum(classify_atoms(mols[name])[inds] == labels) # problem: atom_inds and atom_labels aren't the same shape! - - # TODO: update classify_atoms signature to accept indices, just like bonds/angles/torsions - n_correct += sum(classify_atoms(mols[name]) == labels) - n_total += mol.n_atoms - print('atoms: ', n_correct, n_total) - - # bonds - n_correct, n_total = 0, 0 - for name in mols: - mol = mols[name] - inds, labels = label_dict[f'{name}_bond_inds'], label_dict[f'{name}_bond_labels'] - - n_correct += sum(classify_bonds(mols[name], inds) == labels) - n_total += len(labels) - print('bonds: ', n_correct, n_total) - - # angles - n_correct, n_total = 0, 0 - for name in mols: - mol = mols[name] - inds, labels = label_dict[f'{name}_angle_inds'], label_dict[f'{name}_angle_labels'] - - n_correct += sum(classify_angles(mols[name], inds) == labels) - n_total += len(labels) - print('angles: ', n_correct, n_total) - - # torsions - n_correct, n_total = 0, 0 - for name in mols: - mol = mols[name] - inds, labels = label_dict[f'{name}_torsion_inds'], label_dict[f'{name}_torsion_labels'] - - n_correct += sum(classify_torsions(mols[name], inds) == labels) - n_total += len(labels) - print('torsions: ', n_correct, n_total) + for type_name, classifier in [ + ('atom', classify_atoms), + ('bond', classify_bonds), + ('angle', classify_angles), + ('torsion', classify_torsions) + ]: + n_correct, n_total = 0, 0 + for name in mols: + mol = mols[name] + inds, labels = label_dict[f'{name}_{type_name}_inds'], label_dict[f'{name}_{type_name}_labels'] + + n_correct += sum(classifier(mols[name], inds) == labels) + n_total += len(labels) + print(f'{type_name}s:\n\t# correct: {n_correct}\n\t# total: {n_total}\n\taccuracy: ' + '{:.4f}%'.format( + 100 * n_correct / n_total)) From 305f56ac735fd463c0f34cc102fb585655de324e Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 14:57:01 -0400 Subject: [PATCH 09/17] add alkethoh dataset using lfs --- .gitignore | 6 +++--- hgfp/data/alkethoh/.gitattributes | 3 +++ hgfp/data/alkethoh/AlkEthOH_rings.npz | 3 +++ hgfp/data/alkethoh/AlkEthOH_rings.smi | 3 +++ hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl | 3 +++ 5 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 hgfp/data/alkethoh/.gitattributes create mode 100644 hgfp/data/alkethoh/AlkEthOH_rings.npz create mode 100644 hgfp/data/alkethoh/AlkEthOH_rings.smi create mode 100644 hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl diff --git a/.gitignore b/.gitignore index fdc4d476..4ec96559 100644 --- a/.gitignore +++ b/.gitignore @@ -142,6 +142,6 @@ parm_at_Frosst.tgz .DS_Store # AlkEthOH dataset -hgfp/data/alkethoh/AlkEthOH_rings.npz -hgfp/data/alkethoh/AlkEthOH_rings.smi -hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl +#hgfp/data/alkethoh/AlkEthOH_rings.npz +#hgfp/data/alkethoh/AlkEthOH_rings.smi +#hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl diff --git a/hgfp/data/alkethoh/.gitattributes b/hgfp/data/alkethoh/.gitattributes new file mode 100644 index 00000000..6a8aed9d --- /dev/null +++ b/hgfp/data/alkethoh/.gitattributes @@ -0,0 +1,3 @@ +AlkEthOH_rings.npz filter=lfs diff=lfs merge=lfs -text +AlkEthOH_rings.smi filter=lfs diff=lfs merge=lfs -text +AlkEthOH_rings_offmols.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/hgfp/data/alkethoh/AlkEthOH_rings.npz b/hgfp/data/alkethoh/AlkEthOH_rings.npz new file mode 100644 index 00000000..6bf013ba --- /dev/null +++ b/hgfp/data/alkethoh/AlkEthOH_rings.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f6920763db4746288aac84c523e47dca9a94bd28f4572877a54e721bc3a3a5e +size 2815911 diff --git a/hgfp/data/alkethoh/AlkEthOH_rings.smi b/hgfp/data/alkethoh/AlkEthOH_rings.smi new file mode 100644 index 00000000..31c2fd18 --- /dev/null +++ b/hgfp/data/alkethoh/AlkEthOH_rings.smi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a715c40e4cd31cdf6325de72e67c0f4e030b375918bfde49e04124c1ecb4b7d +size 34105 diff --git a/hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl b/hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl new file mode 100644 index 00000000..d9656738 --- /dev/null +++ b/hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfbac702a9a420ae954983c8664cfcd652eec16d89db133ff06300814189ed0 +size 2032592 From 3c96239ab2ef763867ef5aa65f4fe370af226f18 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 17:43:54 -0400 Subject: [PATCH 10/17] Create pytorch_datasets.py --- hgfp/data/alkethoh/pytorch_datasets.py | 69 ++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 hgfp/data/alkethoh/pytorch_datasets.py diff --git a/hgfp/data/alkethoh/pytorch_datasets.py b/hgfp/data/alkethoh/pytorch_datasets.py new file mode 100644 index 00000000..201f726a --- /dev/null +++ b/hgfp/data/alkethoh/pytorch_datasets.py @@ -0,0 +1,69 @@ +"""provide a pytorch dataset views of the interaction typing dataset""" + +from pickle import load +from typing import Tuple + +import numpy as np +from openforcefield.topology import Molecule +from torch import tensor, Tensor +from torch.utils.data.dataset import Dataset + + +class AlkEthOHDataset(Dataset): + def __init__(self): + with open('AlkEthOH_rings_offmols.pkl', 'rb') as f: + self._mols = load(f) + + self._label_dict = np.load('AlkEthOH_rings.npz') + + def _get_inds(self, index: str, type_name: str) -> Tensor: + return tensor(self._label_dict[f'{index}_{type_name}_inds']) + + def _get_labels(self, index: str, type_name:str) -> Tensor: + return tensor(self._label_dict[f'{index}_{type_name}_labels']) + + def _get_mol_inds_labels(self, index: str, type_name: str) -> Tuple[Molecule, Tensor, Tensor]: + assert (index in self._mols) + mol = self._mols[index] + inds = self._get_inds(index, type_name) + labels = self._get_labels(index, type_name) + return mol, inds, labels + + +class AlkEthOHAtomTypesDataset(AlkEthOHDataset): + + def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: + return self._get_mol_inds_labels(index, 'atom') + + +class AlkEthOHBondTypesDataset(AlkEthOHDataset): + + def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: + return self._get_mol_inds_labels(index, 'bond') + + +class AlkEthOHAngleTypesDataset(AlkEthOHDataset): + + def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: + return self._get_mol_inds_labels(index, 'angle') + + +class AlkEthOHTorsionTypesDataset(AlkEthOHDataset): + + def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: + return self._get_mol_inds_labels(index, 'torsion') + + +if __name__ == '__main__': + + # TODO: move this from __main__ into doctests + + index = 'AlkEthOH_r0' + # atoms + print(AlkEthOHAtomTypesDataset()[index]) + # bonds + print(AlkEthOHBondTypesDataset()[index]) + # angles + print(AlkEthOHAngleTypesDataset()[index]) + # torsions + print(AlkEthOHTorsionTypesDataset()[index]) From f836827d4a341e7ec5226db951e03e9162dfd83e Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 17:54:49 -0400 Subject: [PATCH 11/17] string indices --> integer indices --- hgfp/data/alkethoh/pytorch_datasets.py | 29 +++++++++++++------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/hgfp/data/alkethoh/pytorch_datasets.py b/hgfp/data/alkethoh/pytorch_datasets.py index 201f726a..cbc5836a 100644 --- a/hgfp/data/alkethoh/pytorch_datasets.py +++ b/hgfp/data/alkethoh/pytorch_datasets.py @@ -15,18 +15,19 @@ def __init__(self): self._mols = load(f) self._label_dict = np.load('AlkEthOH_rings.npz') + self._mol_names = sorted(list(self._mols.keys())) - def _get_inds(self, index: str, type_name: str) -> Tensor: - return tensor(self._label_dict[f'{index}_{type_name}_inds']) + def _get_inds(self, mol_name: str, type_name: str) -> Tensor: + return tensor(self._label_dict[f'{mol_name}_{type_name}_inds']) - def _get_labels(self, index: str, type_name:str) -> Tensor: - return tensor(self._label_dict[f'{index}_{type_name}_labels']) + def _get_labels(self, mol_name: str, type_name: str) -> Tensor: + return tensor(self._label_dict[f'{mol_name}_{type_name}_labels']) - def _get_mol_inds_labels(self, index: str, type_name: str) -> Tuple[Molecule, Tensor, Tensor]: - assert (index in self._mols) - mol = self._mols[index] - inds = self._get_inds(index, type_name) - labels = self._get_labels(index, type_name) + def _get_mol_inds_labels(self, index: int, type_name: str) -> Tuple[Molecule, Tensor, Tensor]: + mol_name = self._mol_names[index] + mol = self._mols[mol_name] + inds = self._get_inds(mol_name, type_name) + labels = self._get_labels(mol_name, type_name) return mol, inds, labels @@ -55,15 +56,13 @@ def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: if __name__ == '__main__': - # TODO: move this from __main__ into doctests - index = 'AlkEthOH_r0' # atoms - print(AlkEthOHAtomTypesDataset()[index]) + print(AlkEthOHAtomTypesDataset()[0]) # bonds - print(AlkEthOHBondTypesDataset()[index]) + print(AlkEthOHBondTypesDataset()[0]) # angles - print(AlkEthOHAngleTypesDataset()[index]) + print(AlkEthOHAngleTypesDataset()[0]) # torsions - print(AlkEthOHTorsionTypesDataset()[index]) + print(AlkEthOHTorsionTypesDataset()[0]) From cf91010819f8d3edcd2471daab5db0594f4be020 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 17:56:25 -0400 Subject: [PATCH 12/17] remove AlkEthOH dataset files from .gitignore now being tracked by git-lfs --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 4ec96559..91fbb112 100644 --- a/.gitignore +++ b/.gitignore @@ -140,8 +140,3 @@ parm_at_Frosst.tgz .DS_Store - -# AlkEthOH dataset -#hgfp/data/alkethoh/AlkEthOH_rings.npz -#hgfp/data/alkethoh/AlkEthOH_rings.smi -#hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl From 8e0a3d32a8f95455ddec2e166d137c5ec95d4ef3 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 28 May 2020 18:33:35 -0400 Subject: [PATCH 13/17] hgfp/data --> espaloma/data --- {hgfp => espaloma}/data/alkethoh/.gitattributes | 0 {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings.npz | 0 {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings.smi | 0 {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings_offmols.pkl | 0 {hgfp => espaloma}/data/alkethoh/__init__.py | 0 {hgfp => espaloma}/data/alkethoh/baseline_typers.py | 0 {hgfp => espaloma}/data/alkethoh/label_molecules.py | 0 {hgfp => espaloma}/data/alkethoh/pytorch_datasets.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename {hgfp => espaloma}/data/alkethoh/.gitattributes (100%) rename {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings.npz (100%) rename {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings.smi (100%) rename {hgfp => espaloma}/data/alkethoh/AlkEthOH_rings_offmols.pkl (100%) rename {hgfp => espaloma}/data/alkethoh/__init__.py (100%) rename {hgfp => espaloma}/data/alkethoh/baseline_typers.py (100%) rename {hgfp => espaloma}/data/alkethoh/label_molecules.py (100%) rename {hgfp => espaloma}/data/alkethoh/pytorch_datasets.py (100%) diff --git a/hgfp/data/alkethoh/.gitattributes b/espaloma/data/alkethoh/.gitattributes similarity index 100% rename from hgfp/data/alkethoh/.gitattributes rename to espaloma/data/alkethoh/.gitattributes diff --git a/hgfp/data/alkethoh/AlkEthOH_rings.npz b/espaloma/data/alkethoh/AlkEthOH_rings.npz similarity index 100% rename from hgfp/data/alkethoh/AlkEthOH_rings.npz rename to espaloma/data/alkethoh/AlkEthOH_rings.npz diff --git a/hgfp/data/alkethoh/AlkEthOH_rings.smi b/espaloma/data/alkethoh/AlkEthOH_rings.smi similarity index 100% rename from hgfp/data/alkethoh/AlkEthOH_rings.smi rename to espaloma/data/alkethoh/AlkEthOH_rings.smi diff --git a/hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl b/espaloma/data/alkethoh/AlkEthOH_rings_offmols.pkl similarity index 100% rename from hgfp/data/alkethoh/AlkEthOH_rings_offmols.pkl rename to espaloma/data/alkethoh/AlkEthOH_rings_offmols.pkl diff --git a/hgfp/data/alkethoh/__init__.py b/espaloma/data/alkethoh/__init__.py similarity index 100% rename from hgfp/data/alkethoh/__init__.py rename to espaloma/data/alkethoh/__init__.py diff --git a/hgfp/data/alkethoh/baseline_typers.py b/espaloma/data/alkethoh/baseline_typers.py similarity index 100% rename from hgfp/data/alkethoh/baseline_typers.py rename to espaloma/data/alkethoh/baseline_typers.py diff --git a/hgfp/data/alkethoh/label_molecules.py b/espaloma/data/alkethoh/label_molecules.py similarity index 100% rename from hgfp/data/alkethoh/label_molecules.py rename to espaloma/data/alkethoh/label_molecules.py diff --git a/hgfp/data/alkethoh/pytorch_datasets.py b/espaloma/data/alkethoh/pytorch_datasets.py similarity index 100% rename from hgfp/data/alkethoh/pytorch_datasets.py rename to espaloma/data/alkethoh/pytorch_datasets.py From 973d5e1de00b60390b93a054c4277db632569b04 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Mon, 1 Jun 2020 12:07:21 -0400 Subject: [PATCH 14/17] local paths --> pkg_resources.resource_filename's --- espaloma/data/alkethoh/baseline_typers.py | 8 ++++++-- espaloma/data/alkethoh/label_molecules.py | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/espaloma/data/alkethoh/baseline_typers.py b/espaloma/data/alkethoh/baseline_typers.py index 3fbcbf6d..f62cbbc9 100644 --- a/espaloma/data/alkethoh/baseline_typers.py +++ b/espaloma/data/alkethoh/baseline_typers.py @@ -129,11 +129,15 @@ def classify_torsions(mol: Molecule, torsion_inds: Iterable[Iterable[int]]): if __name__ == '__main__': from pickle import load + from pkg_resources import resource_filename - with open('AlkEthOH_rings_offmols.pkl', 'rb') as f: + path_to_offmols = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings_offmols.pkl') + path_to_npz = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings.npz') + + with open(path_to_offmols, 'rb') as f: mols = load(f) - label_dict = np.load('AlkEthOH_rings.npz') + label_dict = np.load(path_to_npz) for type_name, classifier in [ ('atom', classify_atoms), diff --git a/espaloma/data/alkethoh/label_molecules.py b/espaloma/data/alkethoh/label_molecules.py index 911d05d3..bc5ba96a 100644 --- a/espaloma/data/alkethoh/label_molecules.py +++ b/espaloma/data/alkethoh/label_molecules.py @@ -7,17 +7,21 @@ import numpy as np from openforcefield.topology import Molecule from openforcefield.typing.engines.smirnoff import ForceField +from pkg_resources import resource_filename from tqdm import tqdm alkethoh_url = 'https://raw.githubusercontent.com/openforcefield/open-forcefield-data/e07bde16c34a3fa1d73ab72e2b8aeab7cd6524df/Model-Systems/AlkEthOH_distrib/AlkEthOH_rings.smi' -alkethoh_local_path = 'AlkEthOH_rings.smi' + +path_to_smiles = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings.smi') +path_to_offmols = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings_offmols.pkl') +path_to_npz = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings.npz') def download_alkethoh(): - if not os.path.exists(alkethoh_local_path): + if not os.path.exists(path_to_smiles): with urllib.request.urlopen(alkethoh_url) as response: smi = response.read() - with open(alkethoh_local_path, 'wb') as f: + with open(path_to_smiles, 'wb') as f: f.write(smi) @@ -59,16 +63,16 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): download_alkethoh() # load molecules - with open(alkethoh_local_path, 'r') as f: + with open(path_to_smiles, 'r') as f: smiles = [l.split()[0] for l in f.readlines()] - with open(alkethoh_local_path, 'r') as f: + with open(path_to_smiles, 'r') as f: names = [l.split()[1] for l in f.readlines()] mols = dict() for i in range(len(names)): mols[names[i]] = Molecule.from_smiles(smiles[i], allow_undefined_stereo=True) - with open('AlkEthOH_rings_offmols.pkl', 'wb') as f: + with open(path_to_offmols, 'wb') as f: dump(mols, f) # Label molecules using forcefield @@ -110,7 +114,6 @@ def get_inds_and_labels(labeled_mol, type_name='vdW'): {summary} """ - - np.savez_compressed('AlkEthOH_rings.npz', + np.savez_compressed(path_to_npz, description=description, **label_dict) From 90e917f500edb1f9e696cb1074065fb5b2f6459d Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 3 Jun 2020 21:59:05 -0400 Subject: [PATCH 15/17] remove rule-based type-classifiers --- espaloma/data/alkethoh/baseline_typers.py | 156 ---------------------- 1 file changed, 156 deletions(-) delete mode 100644 espaloma/data/alkethoh/baseline_typers.py diff --git a/espaloma/data/alkethoh/baseline_typers.py b/espaloma/data/alkethoh/baseline_typers.py deleted file mode 100644 index f62cbbc9..00000000 --- a/espaloma/data/alkethoh/baseline_typers.py +++ /dev/null @@ -1,156 +0,0 @@ -from typing import Iterable - -import numpy as np -from openforcefield.topology import Molecule, Atom - - -# baseline rule-based classifiers -def get_elements(mol: Molecule, atom_indices: Iterable[Iterable[int]]) -> np.ndarray: - """array of atomic numbers within a molecule""" - elements = np.array([a.element.atomic_number for a in mol.atoms]) - return elements[atom_indices] - - -def get_element_tuples(mol: Molecule, atom_indices: Iterable[Iterable[int]]) -> list: - """the same tuple regardless of whether atom_indices are running forward or backward""" - return [min(tuple(t), tuple(t[::-1])) for t in get_elements(mol, atom_indices)] - - -is_carbon = lambda atom: atom.atomic_number == 6 -is_hydrogen = lambda atom: atom.atomic_number == 1 -is_oxygen = lambda atom: atom.atomic_number == 8 - -neighboring_carbons = lambda atom: list(filter(is_carbon, atom.bonded_atoms)) -neighboring_hydrogens = lambda atom: list(filter(is_hydrogen, atom.bonded_atoms)) -neighboring_oxygens = lambda atom: list(filter(is_oxygen, atom.bonded_atoms)) - - -## atoms -def classify_atom(atom: Atom) -> int: - if is_hydrogen(atom): - carbon_neighborhood = neighboring_carbons(atom) - if len(carbon_neighborhood) == 1: - N = len(neighboring_oxygens(carbon_neighborhood[0])) - if N >= 3: - return 5 - elif N == 2: - return 4 - elif N == 1: - return 3 - else: - return 2 - else: - return 12 - elif is_carbon(atom): - return 16 - elif is_oxygen(atom): - if len(neighboring_hydrogens(atom)) == 1: - return 19 - else: - return 18 - - -def classify_atoms(mol: Molecule, atom_inds: np.ndarray) -> np.ndarray: - assert (atom_inds.shape)[1] == 1 - atoms = mol.atoms - return np.array([classify_atom(atoms[i]) for i in atom_inds[:, 0]]) - - -## bonds -def classify_bond(atom1: Atom, atom2: Atom) -> int: - # both carbon - if is_carbon(atom1) and is_carbon(atom2): - return 1 - - # one is carbon and the other oxygen, regardless of order - elif (is_carbon(atom1) and is_oxygen(atom2)) or (is_carbon(atom2) and is_oxygen(atom1)): - - # need to catch *which* one was oxygen - if is_oxygen(atom1): - oxygen = atom1 - else: - oxygen = atom2 - - H = len(neighboring_hydrogens(oxygen)) - X = len(list(oxygen.bonded_atoms)) - - if (X == 2) and (H == 0): - return 15 - else: - return 14 - - # one is carbon and the other hydrogen, regardless of order - elif (is_carbon(atom1) and is_hydrogen(atom2)) or (is_carbon(atom2) and is_hydrogen(atom1)): - return 83 - - # both oxygen - elif is_oxygen(atom1) and is_oxygen(atom2): - return 40 - - # oxygen-hydrogen - else: - return 87 - - -def classify_bonds(mol: Molecule, bond_inds: Iterable) -> np.ndarray: - atoms = list(mol.atoms) - return np.array([classify_bond(atoms[i], atoms[j]) for (i, j) in bond_inds]) - - -## angles -def classify_angle(atom1: Atom, atom2: Atom, atom3: Atom) -> int: - if is_hydrogen(atom1) and is_hydrogen(atom3): - return 2 - elif is_oxygen(atom2): - return 27 - else: - return 1 - - -def classify_angles(mol: Molecule, angle_inds: Iterable[Iterable[int]]) -> np.ndarray: - return np.array([ - classify_angle(mol.atoms[i], mol.atoms[j], mol.atoms[k]) for (i, j, k) in angle_inds]) - - -## torsions -# simple torsion classifier: look at element identities of atom1, atom2, atom3, and atom4 -torsion_prediction_dict = { - (6, 6, 6, 8): 1, (1, 6, 6, 6): 4, (1, 8, 6, 6): 85, (6, 6, 8, 6): 87, - (6, 8, 6, 8): 89, (1, 6, 8, 6): 86, (8, 6, 6, 8): 5, (1, 6, 6, 8): 9, - (1, 8, 6, 8): 84, (1, 6, 6, 1): 3, (1, 6, 8, 1): 84, (6, 6, 6, 6): 2, - (6, 6, 8, 8): 86, (6, 8, 8, 6): 116, (1, 6, 8, 8): 86, (8, 6, 8, 8): 86, - (6, 8, 8, 8): 116 -} - - -def classify_torsions(mol: Molecule, torsion_inds: Iterable[Iterable[int]]): - return np.array([torsion_prediction_dict[t] for t in get_element_tuples(mol, torsion_inds)]) - - -if __name__ == '__main__': - from pickle import load - from pkg_resources import resource_filename - - path_to_offmols = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings_offmols.pkl') - path_to_npz = resource_filename('espaloma.data.alkethoh', 'AlkEthOH_rings.npz') - - with open(path_to_offmols, 'rb') as f: - mols = load(f) - - label_dict = np.load(path_to_npz) - - for type_name, classifier in [ - ('atom', classify_atoms), - ('bond', classify_bonds), - ('angle', classify_angles), - ('torsion', classify_torsions) - ]: - n_correct, n_total = 0, 0 - for name in mols: - mol = mols[name] - inds, labels = label_dict[f'{name}_{type_name}_inds'], label_dict[f'{name}_{type_name}_labels'] - - n_correct += sum(classifier(mols[name], inds) == labels) - n_total += len(labels) - print(f'{type_name}s:\n\t# correct: {n_correct}\n\t# total: {n_total}\n\taccuracy: ' + '{:.4f}%'.format( - 100 * n_correct / n_total)) From 6619b2bd8e119f935c4990772ec62589efe11866 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Wed, 3 Jun 2020 22:03:59 -0400 Subject: [PATCH 16/17] add cross-entropy loss as instance method of classification dataset per discussion with JDC and YW --- espaloma/data/alkethoh/pytorch_datasets.py | 43 +++++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/espaloma/data/alkethoh/pytorch_datasets.py b/espaloma/data/alkethoh/pytorch_datasets.py index cbc5836a..d139122c 100644 --- a/espaloma/data/alkethoh/pytorch_datasets.py +++ b/espaloma/data/alkethoh/pytorch_datasets.py @@ -4,10 +4,14 @@ from typing import Tuple import numpy as np +import torch from openforcefield.topology import Molecule from torch import tensor, Tensor +from torch.nn import CrossEntropyLoss from torch.utils.data.dataset import Dataset +categorical_loss = CrossEntropyLoss() + class AlkEthOHDataset(Dataset): def __init__(self): @@ -23,6 +27,13 @@ def _get_inds(self, mol_name: str, type_name: str) -> Tensor: def _get_labels(self, mol_name: str, type_name: str) -> Tensor: return tensor(self._label_dict[f'{mol_name}_{type_name}_labels']) + def _get_all_unique_labels(self, type_name: str): + all_labels = set() + for mol_name in self._mol_names: + new_labels = set(self._label_dict[f'{mol_name}_{type_name}_labels']) + all_labels.update(new_labels) + return sorted(list(all_labels)) + def _get_mol_inds_labels(self, index: int, type_name: str) -> Tuple[Molecule, Tensor, Tensor]: mol_name = self._mol_names[index] mol = self._mols[mol_name] @@ -30,11 +41,30 @@ def _get_mol_inds_labels(self, index: int, type_name: str) -> Tuple[Molecule, Te labels = self._get_labels(mol_name, type_name) return mol, inds, labels + def loss(self, index: int, predictions: Tensor) -> Tensor: + raise (NotImplementedError) + class AlkEthOHAtomTypesDataset(AlkEthOHDataset): + def __init__(self): + super(AlkEthOHAtomTypesDataset, self).__init__() + self.type_name = 'atom' + all_labels = self._get_all_unique_labels(self.type_name) + + self._label_mapping = dict(zip(all_labels, range(len(all_labels)))) + self.n_classes = len(self._label_mapping) def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: - return self._get_mol_inds_labels(index, 'atom') + mol, inds, _labels = self._get_mol_inds_labels(index, self.type_name) + labels = tensor([self._label_mapping[int(i)] for i in _labels]) + return mol, inds, labels + + def loss(self, index: int, predictions: Tensor) -> Tensor: + """cross entropy loss""" + _, _, target = self[index] + assert (predictions.shape == (len(target), self.n_classes)) + + return categorical_loss(predictions, target) class AlkEthOHBondTypesDataset(AlkEthOHDataset): @@ -59,7 +89,16 @@ def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: # TODO: move this from __main__ into doctests # atoms - print(AlkEthOHAtomTypesDataset()[0]) + atom_type_dataset = AlkEthOHAtomTypesDataset() + n_classes = atom_type_dataset.n_classes + + index = 0 + n_atoms = atom_type_dataset[index][0].n_atoms + predictions = torch.randn(n_atoms, n_classes, requires_grad=True) + loss = atom_type_dataset.loss(index, predictions) + loss.backward() + print(predictions.grad) + # bonds print(AlkEthOHBondTypesDataset()[0]) # angles From 7488ce2cf0eced51ae3ff75a30e128929ac31037 Mon Sep 17 00:00:00 2001 From: Josh Fass Date: Thu, 4 Jun 2020 08:05:34 -0400 Subject: [PATCH 17/17] refactor type-datasets * AlkEthOHDataset knows how to load things from disk * AlkEthOHTypesDataset(AlkEthOHDataset) knows how to compute categorical loss * AlkEthOH{Atom|Bond|Angle|Torsion}TypesDataset's index into the appropriate types --- espaloma/data/alkethoh/pytorch_datasets.py | 71 +++++++++++----------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/espaloma/data/alkethoh/pytorch_datasets.py b/espaloma/data/alkethoh/pytorch_datasets.py index d139122c..68a77092 100644 --- a/espaloma/data/alkethoh/pytorch_datasets.py +++ b/espaloma/data/alkethoh/pytorch_datasets.py @@ -1,4 +1,4 @@ -"""provide a pytorch dataset views of the interaction typing dataset""" +"""provide pytorch dataset views of the interaction typing dataset""" from pickle import load from typing import Tuple @@ -10,8 +10,6 @@ from torch.nn import CrossEntropyLoss from torch.utils.data.dataset import Dataset -categorical_loss = CrossEntropyLoss() - class AlkEthOHDataset(Dataset): def __init__(self): @@ -44,11 +42,14 @@ def _get_mol_inds_labels(self, index: int, type_name: str) -> Tuple[Molecule, Te def loss(self, index: int, predictions: Tensor) -> Tensor: raise (NotImplementedError) + def __len__(self): + return len(self._mol_names) -class AlkEthOHAtomTypesDataset(AlkEthOHDataset): - def __init__(self): - super(AlkEthOHAtomTypesDataset, self).__init__() - self.type_name = 'atom' + +class AlkEthOHTypesDataset(AlkEthOHDataset): + def __init__(self, type_name='atom'): + super().__init__() + self.type_name = type_name all_labels = self._get_all_unique_labels(self.type_name) self._label_mapping = dict(zip(all_labels, range(len(all_labels)))) @@ -64,44 +65,44 @@ def loss(self, index: int, predictions: Tensor) -> Tensor: _, _, target = self[index] assert (predictions.shape == (len(target), self.n_classes)) - return categorical_loss(predictions, target) + return CrossEntropyLoss()(predictions, target) -class AlkEthOHBondTypesDataset(AlkEthOHDataset): - - def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: - return self._get_mol_inds_labels(index, 'bond') +class AlkEthOHAtomTypesDataset(AlkEthOHTypesDataset): + def __init__(self): + super().__init__(type_name='atom') -class AlkEthOHAngleTypesDataset(AlkEthOHDataset): +class AlkEthOHBondTypesDataset(AlkEthOHTypesDataset): + def __init__(self): + super().__init__(type_name='bond') - def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: - return self._get_mol_inds_labels(index, 'angle') +class AlkEthOHAngleTypesDataset(AlkEthOHTypesDataset): + def __init__(self): + super().__init__(type_name='angle') -class AlkEthOHTorsionTypesDataset(AlkEthOHDataset): - def __getitem__(self, index) -> Tuple[Molecule, Tensor, Tensor]: - return self._get_mol_inds_labels(index, 'torsion') +class AlkEthOHTorsionTypesDataset(AlkEthOHTypesDataset): + def __init__(self): + super().__init__(type_name='torsion') if __name__ == '__main__': # TODO: move this from __main__ into doctests - # atoms - atom_type_dataset = AlkEthOHAtomTypesDataset() - n_classes = atom_type_dataset.n_classes - - index = 0 - n_atoms = atom_type_dataset[index][0].n_atoms - predictions = torch.randn(n_atoms, n_classes, requires_grad=True) - loss = atom_type_dataset.loss(index, predictions) - loss.backward() - print(predictions.grad) - - # bonds - print(AlkEthOHBondTypesDataset()[0]) - # angles - print(AlkEthOHAngleTypesDataset()[0]) - # torsions - print(AlkEthOHTorsionTypesDataset()[0]) + datasets = [AlkEthOHAtomTypesDataset(), AlkEthOHBondTypesDataset(), + AlkEthOHAngleTypesDataset(), AlkEthOHTorsionTypesDataset()] + for dataset in datasets: + # check that you can differentiate w.r.t. predictions + print(dataset.__class__.__name__) + + n_classes = dataset.n_classes + + index = np.random.randint(0, len(dataset)) + mol, inds, labels = dataset[index] + n_labeled_entitites = len(labels) + predictions = torch.randn(n_labeled_entitites, n_classes, requires_grad=True) + loss = dataset.loss(index, predictions) + loss.backward() + print(predictions.grad)