Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add distance selector #78

Merged
merged 30 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
de6acd8
Update endo_ace_ex.py
Hanwen1018 Dec 6, 2023
59a6267
Update pes.py
Hanwen1018 Dec 6, 2023
0db6b53
Update uphill.py
Hanwen1018 Dec 6, 2023
4312d2f
Update endo_ace_ex.py
Hanwen1018 Dec 6, 2023
22701d5
Update selection.py
Hanwen1018 Dec 7, 2023
a8a7e3e
Update active.py
Hanwen1018 Dec 7, 2023
d7a089a
Update selection.py
Hanwen1018 Dec 7, 2023
1bbec21
Update selection.py
Hanwen1018 Dec 7, 2023
3d77885
Update selection.py
Hanwen1018 Dec 7, 2023
db5e355
Update active.py
Hanwen1018 Dec 7, 2023
d1d260c
Update active.py
Hanwen1018 Dec 7, 2023
4fea57f
Merge branch 'main' into main
juraskov Dec 7, 2023
0e90802
Apply suggestions from code review
Hanwen1018 Dec 11, 2023
19e39b5
Update active.py
Hanwen1018 Dec 15, 2023
08c66ec
Update active.py
Hanwen1018 Dec 15, 2023
c28f026
Update active.py
Hanwen1018 Dec 15, 2023
ee4a319
Update selection.py
Hanwen1018 Dec 15, 2023
9d4b367
Update selection.py
Hanwen1018 Dec 15, 2023
7452626
Update selection.py
Hanwen1018 Dec 15, 2023
b944f17
Update test_selection.py
Hanwen1018 Dec 15, 2023
0d91971
Update endo_ace_ex.py
Hanwen1018 Dec 15, 2023
9a62a68
Update endo_ace_im.py
Hanwen1018 Dec 15, 2023
f0cff3c
Update endo_ace_ex.py
Hanwen1018 Dec 15, 2023
a471a8b
Update test_selection.py
Hanwen1018 Dec 15, 2023
bbb14d5
Update environment_ace.yml
Hanwen1018 Dec 15, 2023
befb5a3
Update environment.yml
Hanwen1018 Dec 15, 2023
ed830dc
Update environment_mace.yml
Hanwen1018 Dec 15, 2023
3c01923
Merge branch 'main' into main
juraskov Dec 15, 2023
2923576
Merge branch 'main' into main
juraskov Feb 14, 2024
49d590a
Apply suggestions from code review
juraskov Feb 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies:
- py-plumed
- scipy
- xtb
- scikit-learn
- pip:
- quippy-ase # GAP
- ase@git+https://gitlab.com/ase/ase.git@f2615a6e9a # For PLUMED
1 change: 1 addition & 0 deletions environment_ace.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies:
- py-plumed
- scipy
- xtb
- scikit-learn
- pip:
- julia # Python-Julia integration (this will not install Julia itself!)
- pyjulip@git+https://github.com/casv2/pyjulip.git@72280a6ac3 # Integration with ACE
Expand Down
1 change: 1 addition & 0 deletions environment_mace.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies:
- py-plumed
- scipy
- xtb
- scikit-learn
# MACE dependencies
- pytorch=2.0
- openmm-torch=1.1
Expand Down
10 changes: 5 additions & 5 deletions examples/DA_paper/training/explicit/endo_ace_ex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from autode.atoms import Atom
from mlptrain.log import logger
from mlptrain.box import Box
from mlptrain.training.selection import MaxAtomicEnvDistance
from mlptrain.training.selection import AtomicEnvSimilarity

mlt.Config.n_cores = 10
mlt.Config.orca_keywords = ['wB97M-D3BJ', 'def2-TZVP','def2/J', 'RIJCOSX','EnGrad']
Expand Down Expand Up @@ -247,7 +247,7 @@ def remove_randomly_from_configset(configurationset, remainder):
bulk_water=True,
TS=False)
Water_mlp.al_train(method_name='orca',
selection_method=MaxAtomicEnvDistance(),
selection_method=AtomicEnvSimilarity(),
fix_init_config=True,
init_configs=water_init,
max_active_time=5000)
Expand All @@ -260,7 +260,7 @@ def remove_randomly_from_configset(configurationset, remainder):
bulk_water=True,
TS=True)
ts_in_water_mlp.al_train(method_name='orca',
selection_method=MaxAtomicEnvDistance(),
selection_method=AtomicEnvSimilarity(),
fix_init_config=True,
init_configs=ts_in_water_init,
max_active_time=5000)
Expand All @@ -273,7 +273,7 @@ def remove_randomly_from_configset(configurationset, remainder):
bulk_water=False,
TS=True)
ts_2water_mlp.al_train(method_name='orca',
selection_method=MaxAtomicEnvDistance(),
selection_method=AtomicEnvSimilarity(),
fix_init_config=True,
init_configs=ts_2water_init,
max_active_time=5000)
Expand All @@ -282,7 +282,7 @@ def remove_randomly_from_configset(configurationset, remainder):
ts_gasphase = mlt.System(ts_mol, box=Box([100, 100, 100]))
ts_gasphase_mlp = mlt.potentials.ACE('TS_gasphase', ts_gasphase)
ts_gasphase_mlp.al_train(method_name='orca',
selection_method=MaxAtomicEnvDistance(),
selection_method=AtomicEnvSimilarity(),
fix_init_config=True,
max_active_time=5000)

Expand Down
4 changes: 2 additions & 2 deletions examples/DA_paper/training/implicit/endo_ace_im.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import mlptrain as mlt
from mlptrain.training.selection import MaxAtomicEnvDistance
from mlptrain.training.selection import AtomicEnvSimilarity

mlt.Config.n_cores = 10
mlt.Config.orca_keywords = ['wB97M-D3BJ', 'def2-TZVP','def2/J', 'RIJCOSX','EnGrad', 'CPCM(water)']
Expand All @@ -16,7 +16,7 @@
ace = mlt.potentials.ACE('endo_ace_wB97M_imwater',
system=system)

selector = MaxAtomicEnvDistance()
selector = AtomicEnvSimilarity()
ace.al_train(method_name='orca',
selection_method=selector,
max_active_time=5000,
Expand Down
31 changes: 28 additions & 3 deletions mlptrain/training/active.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,10 +397,35 @@ def _gen_active_config(config: 'mlptrain.Configuration',
selector(traj.final_frame, mlp, method_name=method_name, n_cores=n_cores)

if selector.select:
if traj.final_frame.energy.true is None:
traj.final_frame.single_point(method_name, n_cores=n_cores)
if selector.check:
logger.info('currently applying distance selector,'
'to avoid un-physical structures,'
'do backtracking in the trajectory to'
'find the first configuration in '
'{selector.n_backtrack} steps recognised as outlier')

stride = max(1, len(traj)//selector.n_backtrack)

back_traj = ConfigurationSet()
for i in reversed(traj[::stride]):
back_traj.append(i)

for i, frame in enumerate(back_traj):
logger.info(f'Starting to check {i} th configuration'
'to determine whether it is the first'
'configurations selected by the distance selector')
selector(frame, mlp, method_name=method_name, n_cores=n_cores)
if selector.select is False:
logger.info(f'Selecting {i-1} th configuration.')
frame = back_traj[i-1]
break
else:
frame = traj.final_frame

if frame.energy.true is None:
frame.single_point(method_name, n_cores=n_cores)

return traj.final_frame
return frame

if selector.too_large:

Expand Down
114 changes: 111 additions & 3 deletions mlptrain/training/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from typing import Optional
from mlptrain.descriptors import soap_kernel_vector
from mlptrain.log import logger

from mlptrain.descriptors import soap_matrix
from sklearn.neighbors import LocalOutlierFactor
juraskov marked this conversation as resolved.
Show resolved Hide resolved
from sklearn.decomposition import PCA

class SelectionMethod(ABC):
"""Active learning selection method
Expand Down Expand Up @@ -48,7 +50,15 @@ def n_backtrack(self) -> int:
Returns:
(int):
"""


@property
def check(self) -> bool:
juraskov marked this conversation as resolved.
Show resolved Hide resolved
"""
Whether to keep checking configurations in the MLP-MD trajectory
until the first configuration that will be selected by the selector is found
"""
juraskov marked this conversation as resolved.
Show resolved Hide resolved
return False

def copy(self) -> 'SelectionMethod':
return deepcopy(self)

Expand Down Expand Up @@ -117,7 +127,7 @@ def n_backtrack(self) -> int:
return 10


class MaxAtomicEnvDistance(SelectionMethod):
class AtomicEnvSimilarity(SelectionMethod):

def __init__(self,
threshold: float = 0.999):
Expand Down Expand Up @@ -187,3 +197,101 @@ def n_backtrack(self) -> int:
def _n_training_envs(self) -> int:
"""Number of training environments available"""
return len(self._k_vec)

def outlier_identifier(configuration: 'mlptrain.Configuration',
                       configurations: 'mlptrain.ConfigurationSet',
                       dim_reduction: bool = False,
                       distance_metric: str = 'euclidean',
                       n_neighbors: int = 15,
                       contamination: float = 0.2) -> int:
    """
    Identify whether a new configuration is an outlier with respect to a set
    of existing configurations, using the Local Outlier Factor (LOF) computed
    on row-normalised SOAP descriptors. For more details about LOF see:
    Breunig, M. M., Kriegel, H.-P., Ng, R. T. & Sander, J. LOF: Identifying
    density-based local outliers. SIGMOD Rec. 29, 93-104 (2000).

    -----------------------------------------------------------------------
    Arguments:

        configuration: New configuration to classify

        configurations: Existing configurations the LOF model is fitted on

        dim_reduction: If True, dimensionality reduction (PCA with three
                       components) is performed before the LOF calculation

        distance_metric: Distance metric used in LOF, which could be one of
                         'euclidean', 'cosine' and 'manhattan'

        n_neighbors: How many neighbours are considered when computing LOF

        contamination: Expected proportion of outliers in the existing
                       data; passed to LocalOutlierFactor, where it sets
                       the score threshold used by predict

    -----------------------------------------------------------------------
    Returns:

        (int): -1 for anomalies/outliers and +1 for inliers
    """
    # Normalise every descriptor row to unit length. keepdims=True keeps the
    # norms as a column so they broadcast row-wise for any number of rows.
    m1 = soap_matrix(configurations)
    m1 /= np.linalg.norm(m1, axis=1, keepdims=True)

    v1 = soap_matrix(configuration)
    v1 /= np.linalg.norm(v1, axis=1, keepdims=True)

    if dim_reduction:
        # Fit the projection on the existing data only and apply the same
        # projection to the new configuration
        pca = PCA(n_components=3)
        m1 = pca.fit_transform(m1)
        v1 = pca.transform(v1)

    # novelty=True is needed so that predict() can be called on unseen data
    clf = LocalOutlierFactor(n_neighbors=n_neighbors,
                             metric=distance_metric,
                             novelty=True,
                             contamination=contamination)
    clf.fit(m1)

    # predict() returns one label per row; a single configuration was
    # passed, so return its label as a plain int (as annotated)
    labels = clf.predict(v1)

    return int(labels[0])

class AtomicEnvDistance(SelectionMethod):
    """Selection method that picks configurations flagged as outliers"""

    def __init__(self,
                 pca: bool = False,
                 distance_metric: str = "euclidean",
                 n_neighbors: int = 15):
        """
        Selection criterion based on assessing whether the configuration is
        an outlier, as judged by the outlier_identifier function
        -----------------------------------------------------------------------
        Arguments:
            pca: Whether to do dimensionality reduction by PCA.
                 Because the selected distance_metric may potentially suffer
                 from the curse of dimensionality, the dimensionality
                 reduction step (using PCA) could be applied before
                 calculating the LOF. This would ensure good performance in
                 a high-dimensional data space.

            distance_metric: Passed to outlier_identifier, see details there

            n_neighbors: Passed to outlier_identifier, see details there
        """
        super().__init__()
        self.pca = pca
        self.metric = distance_metric
        self.n_neighbors = n_neighbors

    def __call__(self, configuration, mlp, **kwargs) -> None:
        """Store the configuration and the MLP; evaluation happens lazily
        when the select property is accessed"""
        self.mlp = mlp
        self._configuration = configuration

    @property
    def select(self) -> bool:
        """Whether the stored configuration is an outlier relative to the
        MLP training data"""
        # The metric is stored as self.metric in __init__; there is no
        # self.distance attribute
        label = outlier_identifier(self._configuration,
                                   self.mlp.training_data,
                                   self.pca,
                                   self.metric,
                                   self.n_neighbors)

        # Return a real bool (not a NumPy array) so that identity checks
        # such as `selector.select is False` behave as intended
        return bool(np.all(label == -1))

    @property
    def too_large(self) -> bool:
        return False

    @property
    def n_backtrack(self) -> int:
        return 10

    @property
    def check(self) -> bool:
        """Whether to backtrack through the trajectory; enabled only once
        self.mlp.n_train exceeds 30"""
        return self.mlp.n_train > 30
4 changes: 2 additions & 2 deletions mlptrain/training/tests/test_selection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import mlptrain as mlt
from autode.atoms import Atom
from mlptrain.training.selection import MaxAtomicEnvDistance
from mlptrain.training.selection import AtomicEnvSimilarity
here = os.path.abspath(os.path.dirname(__file__))


Expand Down Expand Up @@ -41,7 +41,7 @@ def test_selection_on_structures():

assert len(configs) == 3

selector = MaxAtomicEnvDistance(threshold=0.9)
selector = AtomicEnvSimilarity (threshold=0.9)
mlp = mlt.potentials.GAP('blank')
mlp.training_data = configs

Expand Down
Loading