Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Qcarchive update #187

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
88c66cb
updated qcharcive code to fetch OpenFF Full Optimization Benchmark 1 …
chrisiacovella Sep 20, 2023
6f8198e
Updated torsiondrive parsing. I'm not sure this has sufficient testing.
chrisiacovella Sep 21, 2023
3e5a58f
Adding in some testing of the torsion function
chrisiacovella Sep 21, 2023
94add0c
Adding in some testing of the torsion function.
chrisiacovella Sep 21, 2023
908c35f
Updated collection type name.
chrisiacovella Sep 21, 2023
2afcd1d
fixed import issue in test
chrisiacovella Sep 21, 2023
aa41891
fixed parsing of the schema.
chrisiacovella Sep 21, 2023
dc7b10c
fixing a typo that was causing failure of torsion test.
chrisiacovella Sep 21, 2023
bc3e983
Merge branch 'main' into qcarchive_update
mikemhenry Sep 21, 2023
9cb3189
Slight change to code to add in a function that uses the iterate_reco…
chrisiacovella Sep 21, 2023
59dd1a8
Merge remote-tracking branch 'origin/qcarchive_update' into qcarchive…
chrisiacovella Sep 21, 2023
cb28a48
merged with updated dgl update; removing qcportal pinning to the old …
chrisiacovella Sep 21, 2023
f48ae7c
Made spec_name be a variable
chrisiacovella Sep 21, 2023
5f638fd
Adding in some docstrings
chrisiacovella Sep 21, 2023
86042f3
Addressed Mike's comment.s
chrisiacovella Sep 21, 2023
3861bd7
Added additional basic docstring.
chrisiacovella Sep 21, 2023
4821aad
Added additional basic docstring for torsion parsing
chrisiacovella Sep 21, 2023
a6a4cdf
Fixed bug in iterate function; added test to catch that bug .
chrisiacovella Sep 22, 2023
54ce464
Added support for singlepoint datasets
chrisiacovella Sep 22, 2023
a05bff7
fixing error in test.
chrisiacovella Sep 22, 2023
87e9d5a
Changing the dataset for singlepoint testing as we need to ensure the…
chrisiacovella Sep 22, 2023
384bcc1
Changing the dataset for singlepoint testing as we need to ensure the…
chrisiacovella Sep 22, 2023
cac1776
Move the schema conversion to after checking if a dataset is supporte…
chrisiacovella Sep 22, 2023
01c9de5
Removed support for singlepoint dataset, as openff.molecule cannot pa…
chrisiacovella Sep 22, 2023
3cf37d0
Removed support for singlepoint dataset, as openff.molecule cannot pa…
chrisiacovella Sep 23, 2023
3f32536
Merge branch 'main' into qcarchive_update
mikemhenry Oct 8, 2024
3a3700c
Merge branch 'main' into qcarchive_update
mikemhenry Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions devtools/conda-envs/espaloma.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- tqdm
- pydantic <2 # We need our deps to fix this
- dgl =1.1.2
- qcportal =0.15.8
- qcportal>=0.5
# Testing
- pytest
- pytest-cov
Expand All @@ -30,6 +30,5 @@ dependencies:
- nose
- nose-timer
- coverage
- qcportal>=0.15.0
- sphinx
- sphinx_rtd_theme
177 changes: 127 additions & 50 deletions espaloma/data/qcarchive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Tuple

import numpy as np
import qcportal as ptl
import qcportal
import torch
from openmm import unit
from openmm.unit import Quantity
Expand All @@ -21,35 +21,69 @@
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def get_client():
return ptl.FractalClient()
def get_client(url: str = "api.qcarchive.molssi.org") -> qcportal.client.PortalClient:
"""
Returns a instance of the qcportal client.

Parameters
----------
url: str, default="api.qcarchive.molssi.org"
qcportal instance to connect

Returns
-------
qcportal.client.PortalClient
qcportal client instance.
"""
# Note, this may need to be modified to include username/password for non-public servers
return qcportal.PortalClient(url)


def get_collection(
client,
collection_type="OptimizationDataset",
name="OpenFF Full Optimization Benchmark 1",
client,
collection_type="optimization",
name="OpenFF Full Optimization Benchmark 1",
):
collection = client.get_collection(
collection_type,
name,
"""
Connects to a specific dataset on qcportal

Parameters
----------
client: qcportal.client, required
The qcportal client instance
collection_type: str, default="optimization"
The type of qcarchive collection, options are
"torsiondrive", "optimization", "gridoptimization", "reaction", "singlepoint" "manybody"
name: str, default="OpenFF Full Optimization Benchmark 1"
Name of the dataset

Returns
-------
(qcportal dataset, list(str))
Tuple with an instance of qcportal dataset and list of record names

"""
collection = client.get_dataset(
dataset_type=collection_type,
dataset_name=name,
)

record_names = list(collection.data.records)
record_names = collection.entry_names

return collection, record_names


def get_graph(collection, record_name):
# get record and trajectory
record = collection.get_record(record_name, specification="default")
entry = collection.get_entry(record_name)
def process_record(record, entry):
"""
Processes a given record/entry pair from a dataset and returns the graph
"""

from openff.toolkit.topology import Molecule

mol = Molecule.from_qcschema(entry)
mol = Molecule.from_qcschema(entry.dict())

try:
trajectory = record.get_trajectory()
trajectory = record.trajectory
except:
chrisiacovella marked this conversation as resolved.
Show resolved Hide resolved
return None

Expand All @@ -62,7 +96,7 @@ def get_graph(collection, record_name):
g.nodes["g"].data["u_ref"] = torch.tensor(
[
Quantity(
snapshot.properties.scf_total_energy,
snapshot.properties["scf_total_energy"],
esp.units.HARTREE_PER_PARTICLE,
).value_in_unit(esp.units.ENERGY_UNIT)
for snapshot in trajectory
Expand All @@ -74,7 +108,7 @@ def get_graph(collection, record_name):
np.stack(
[
Quantity(
snapshot.get_molecule().geometry,
snapshot.molecule.geometry,
unit.bohr,
).value_in_unit(esp.units.DISTANCE_UNIT)
for snapshot in trajectory
Expand All @@ -89,7 +123,7 @@ def get_graph(collection, record_name):
[
torch.tensor(
Quantity(
snapshot.dict()["return_result"],
np.array(snapshot.properties["return_result"]).reshape((-1, 3)),
esp.units.HARTREE_PER_PARTICLE / unit.bohr,
).value_in_unit(esp.units.FORCE_UNIT),
dtype=torch.get_default_dtype(),
Expand All @@ -102,21 +136,83 @@ def get_graph(collection, record_name):
return g


def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
final_molecules = record.get_final_molecules()
final_results = record.get_final_results()
def get_graph(collection, record_name, spec_name="default"):
"""
Processes the qcportal data for a given record name.

Parameters
----------
collection, qcportal dataset, required
The instance of the qcportal dataset
record_name, str, required
The name of a give record
spec_name, str, default="default"
Retrieve data for a given qcportal specification.
Returns
-------
Graph
"""
# get record and trajectory
record = collection.get_record(record_name, specification_name=spec_name)
entry = collection.get_entry(record_name)

g = process_record(record, entry)

return g


angle_keys = list(final_molecules.keys())
def get_graphs(collection, record_names, spec_name="default"):
"""
Processes the qcportal data for a given set of record names.
This uses the qcportal iteration functions which are faster than processing
records one at a time

Parameters
----------
collection, qcportal dataset, required
The instance of the qcportal dataset
record_name, str, required
The name of a give record
spec_name, str, default="default"
Retrieve data for a given qcportal specification.
Returns
-------
list(graph)
Returns a list of the corresponding graph for each record name
"""
g_list = []
for record, entry in zip(
collection.iterate_records(record_names, specification_names=[spec_name]),
collection.iterate_entries(record_names),
):
g = process_record(record, entry)
g_list.append(g)

return g_list


def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveRecord):
molecule_optimization = record.optimizations

angle_keys = list(molecule_optimization.keys())

xyzs = []
energies = []
gradients = []

for angle in angle_keys:
result = final_results[angle]
mol = final_molecules[angle]
# NOTE: this is calling the first index of the optimization array
# this gives the same value as the prior implementation, but I wonder if it
# should be molecule_optimization[angle][-1] in both cases
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kntkb or @yuanqing-wang thoughts?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I've been trying to figure out the structure of the torsion drive datasets (since I have not looked at them really yet prior to this). Considering the example dataset I used in test (I'll put the code below), each angle has n-number of unique initial conformations that are then optimized. In this case, there are 4 configurations (each that has their own trajectory). So I suppose choosing the first vs the last is somewhat irrelevant (I was initially thinking this was a set of chained optimizations, hence my comment...don't ask why I was thinking that).

Should each of these conformations be considered and added to the datasets rather than just arbitrarily picking one?

from espaloma.data import qcarchive_utils
import numpy as np

record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]"
name = "OpenFF Amide Torsion Set v1.0"
collection_type = "torsiondrive"
collection, record_names = qcarchive_utils.get_collection(qcarchive_utils.get_client(), collection_type, name)
record_info = collection.get_record(record_name, specification_name="default")

molecule_optimization = record_info.optimizations
angle_keys = list(molecule_optimization.keys())

angle = angle_keys[0]
mol = molecule_optimization[angle][0].final_molecule
result = molecule_optimization[angle][0].trajectory[-1].properties

looking at the actual configurations:

for i in range(len(molecule_optimization[angle])):
    init =  molecule_optimization[angle][i].initial_molecule.geometry
    final = molecule_optimization[angle][i].final_molecule.geometry
    print(init,"\n-\n", final, "\n--\n")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kntkb or @yuanqing-wang thoughts?

I don't know off the top of my head, but I've played around with different QCArchive workflows in the past. I may have some notes left somewhere, so I'll catch up shortly (tomorrow?).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the api and the way you access the data changed using qcportal v0.5...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in the older version of qcportal, "get_final_molecule()" just picked the first one in the array. The full array was still part of the data record, just you had to dig through the qcvars or something to access. From conversations with Ben, there was a lot of trying to force records into a very rigid schema in the old version; he opted to break the schema in a lot of cases to just make it easier to access the relevant information (and make it clearer what information is available).

As I mentioned in an early comment, it seems that for each angle, multiple (in this case 4) independent starting configurations were used. It seems like it would be better to have the code return data for each replicate, but I'm not sure how this would impact any workflows that use this function.

mol = molecule_optimization[angle][0].final_molecule
result = molecule_optimization[angle][0].trajectory[-1].properties

"""Note: force = - gradient"""

e, g = get_energy_and_gradient(result)
# TODO: attach units here? or later?

e = result["current energy"]
g = np.array(result["current gradient"]).reshape(-1, 3)

xyzs.append(mol.geometry)
energies.append(e)
Expand Down Expand Up @@ -146,22 +242,6 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
return flat_angles, xyz_in_order, energies_in_order, gradients_in_order


def get_energy_and_gradient(
snapshot: ptl.models.records.ResultRecord,
) -> Tuple[float, np.ndarray]:
"""Note: force = - gradient"""

# TODO: attach units here? or later?

d = snapshot.dict()
qcvars = d["extras"]["qcvars"]
energy = qcvars["CURRENT ENERGY"]
flat_gradient = np.array(qcvars["CURRENT GRADIENT"])
num_atoms = len(flat_gradient) // 3
gradient = flat_gradient.reshape((num_atoms, 3))
return energy, gradient


MolWithTargets = namedtuple(
"MolWithTargets", ["offmol", "xyz", "energies", "gradients"]
)
Expand All @@ -184,9 +264,9 @@ def get_smiles(x):
g = esp.Graph(mol_ref)

u_ref = np.concatenate(group["energies"].values)
u_ref_prime = np.concatenate(
group["gradients"].values, axis=0
).transpose(1, 0, 2)
u_ref_prime = np.concatenate(group["gradients"].values, axis=0).transpose(
1, 0, 2
)
xyz = np.concatenate(group["xyz"].values, axis=0).transpose(1, 0, 2)

assert u_ref_prime.shape[0] == xyz.shape[0] == mol_ref.n_atoms
Expand Down Expand Up @@ -229,7 +309,7 @@ def breakdown_along_time_axis(g, batch_size=32):

shuffle(idxs)
chunks = [
idxs[_idx * batch_size : (_idx + 1) * batch_size]
idxs[_idx * batch_size: (_idx + 1) * batch_size]
for _idx in range(n_snapshots // batch_size)
]

Expand Down Expand Up @@ -259,10 +339,7 @@ def make_batch_size_consistent(ds, batch_size=32):
return esp.data.dataset.GraphDataset(
list(
itertools.chain.from_iterable(
[
breakdown_along_time_axis(g, batch_size=batch_size)
for g in ds
]
[breakdown_along_time_axis(g, batch_size=batch_size) for g in ds]
)
)
)
Expand Down
64 changes: 64 additions & 0 deletions espaloma/data/tests/test_qcarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,67 @@ def test_get_graph():
collection, record_names = qcarchive_utils.get_collection(client)
record_name = record_names[0]
graph = qcarchive_utils.get_graph(collection, record_name)


def test_get_torsiondrive():
chrisiacovella marked this conversation as resolved.
Show resolved Hide resolved
from espaloma.data import qcarchive_utils
import numpy as np

record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]"

# example dataset
name = "OpenFF Amide Torsion Set v1.0"
collection_type = "torsiondrive"

collection, record_names = qcarchive_utils.get_collection(
qcarchive_utils.get_client(), collection_type, name
)
record_info = collection.get_record(record_name, specification_name="default")

(
flat_angles,
xyz_in_order,
energies_in_order,
gradients_in_order,
) = qcarchive_utils.fetch_td_record(record_info)

assert flat_angles.shape == (24,)
assert energies_in_order.shape == (24,)
assert gradients_in_order.shape == (24, 25, 3)
assert xyz_in_order.shape == (24, 25, 3)

assert np.isclose(energies_in_order[0], -722.2850260791969)
assert np.all(
flat_angles
== np.array(
[
-165,
-150,
-135,
-120,
-105,
-90,
-75,
-60,
-45,
-30,
-15,
0,
15,
30,
45,
60,
75,
90,
105,
120,
135,
150,
165,
180,
]
)
)
assert np.all(
xyz_in_order[0][0] == np.array([-0.66407807, -8.59922225, -0.02685972])
)