duartegroup · juraskov · Feb 14, 2024 · Dec 6, 2023 · Dec 6, 2023 · Dec 6, 2023
diff --git a/environment.yml b/environment.yml
@@ -18,6 +18,7 @@ dependencies:
   - py-plumed
   - scipy
   - xtb
+  - scikit-learn
   - pip:
     - quippy-ase # GAP
     - ase@git+https://gitlab.com/ase/ase.git@f2615a6e9a # For PLUMED
diff --git a/environment_ace.yml b/environment_ace.yml
@@ -18,6 +18,7 @@ dependencies:
   - py-plumed
   - scipy
   - xtb
+  - scikit-learn
   - pip:
     - julia   # Python-Julia integration (this will not install Julia itself!)
     - pyjulip@git+https://github.com/casv2/pyjulip.git@72280a6ac3 # Integration with ACE

diff --git a/environment_mace.yml b/environment_mace.yml
@@ -19,6 +19,7 @@ dependencies:
   - py-plumed
   - scipy
   - xtb
+  - scikit-learn
   # MACE dependencies
   - pytorch=2.0
   - openmm-torch=1.1

diff --git a/examples/DA_paper/training/explicit/endo_ace_ex.py b/examples/DA_paper/training/explicit/endo_ace_ex.py
@@ -3,7 +3,7 @@
 from autode.atoms import Atom
 from mlptrain.log import logger
 from mlptrain.box import Box
-from mlptrain.training.selection import MaxAtomicEnvDistance
+from mlptrain.training.selection import AtomicEnvSimilarity
 
 mlt.Config.n_cores = 10
 mlt.Config.orca_keywords = ['wB97M-D3BJ', 'def2-TZVP','def2/J', 'RIJCOSX','EnGrad']
@@ -247,7 +247,7 @@ def remove_randomly_from_configset(configurationset, remainder):
                                        bulk_water=True, 
                                        TS=False)
     Water_mlp.al_train(method_name='orca',
-                      selection_method=MaxAtomicEnvDistance(),
+                      selection_method=AtomicEnvSimilarity(),
                       fix_init_config=True,
                       init_configs=water_init,
                       max_active_time=5000)
@@ -260,7 +260,7 @@ def remove_randomly_from_configset(configurationset, remainder):
                                              bulk_water=True, 
                                              TS=True)
     ts_in_water_mlp.al_train(method_name='orca',
-                      selection_method=MaxAtomicEnvDistance(),
+                      selection_method=AtomicEnvSimilarity(),
                       fix_init_config=True,
                       init_configs=ts_in_water_init,
                       max_active_time=5000)
@@ -273,7 +273,7 @@ def remove_randomly_from_configset(configurationset, remainder):
                                       bulk_water=False, 
                                       TS=True)
     ts_2water_mlp.al_train(method_name='orca',
-                      selection_method=MaxAtomicEnvDistance(),
+                      selection_method=AtomicEnvSimilarity(),
                       fix_init_config=True,
                       init_configs=ts_2water_init,
                       max_active_time=5000)
@@ -282,7 +282,7 @@ def remove_randomly_from_configset(configurationset, remainder):
     ts_gasphase = mlt.System(ts_mol, box=Box([100, 100, 100]))
     ts_gasphase_mlp = mlt.potentials.ACE('TS_gasphase', ts_gasphase)    
     ts_gasphase_mlp.al_train(method_name='orca',
-                             selection_method=MaxAtomicEnvDistance(),
+                             selection_method=AtomicEnvSimilarity(),
                              fix_init_config=True,
                              max_active_time=5000)
 

diff --git a/examples/DA_paper/training/implicit/endo_ace_im.py b/examples/DA_paper/training/implicit/endo_ace_im.py
@@ -1,5 +1,5 @@
 import mlptrain as mlt
-from mlptrain.training.selection import MaxAtomicEnvDistance
+from mlptrain.training.selection import AtomicEnvSimilarity
 
 mlt.Config.n_cores = 10
 mlt.Config.orca_keywords = ['wB97M-D3BJ', 'def2-TZVP','def2/J', 'RIJCOSX','EnGrad', 'CPCM(water)']
@@ -16,7 +16,7 @@
     ace = mlt.potentials.ACE('endo_ace_wB97M_imwater',
                              system=system)
 
-    selector = MaxAtomicEnvDistance()
+    selector = AtomicEnvSimilarity()
     ace.al_train(method_name='orca',
                  selection_method=selector,
                  max_active_time=5000,

diff --git a/mlptrain/training/active.py b/mlptrain/training/active.py
@@ -397,10 +397,35 @@ def _gen_active_config(config:      'mlptrain.Configuration',
     selector(traj.final_frame, mlp, method_name=method_name, n_cores=n_cores)
 
     if selector.select:
-        if traj.final_frame.energy.true is None:
-            traj.final_frame.single_point(method_name, n_cores=n_cores)
+        if selector.check:
+            logger.info('currently applying distance selector,'
+                        'to avoid un-physical structures,'
+                        'do backtracking in the trajectory to'
+                        'find the first configuration in '
+                        '{selector.n_backtrack} steps recognised as outlier')
+
+            stride = max(1, len(traj)//selector.n_backtrack)
+
+            back_traj = ConfigurationSet()
+            for i in reversed(traj[::stride]):
+                back_traj.append(i)
+
+            for i, frame in enumerate(back_traj):
+                logger.info(f'Starting to check {i} th configuration'
+                              'to determine whether it is the first'
+                              'configurations selected by the distance selector')
+                selector(frame, mlp, method_name=method_name, n_cores=n_cores)
+                if selector.select is False:
+                    logger.info(f'Selecting {i-1} th configuration.')
+                    frame = back_traj[i-1]
+                    break
+        else:
+            frame = traj.final_frame
+
+        if frame.energy.true is None:
+            frame.single_point(method_name, n_cores=n_cores)
 
-        return traj.final_frame
+        return frame
 
     if selector.too_large:
 

diff --git a/mlptrain/training/selection.py b/mlptrain/training/selection.py
@@ -5,7 +5,9 @@
 from typing import Optional
 from mlptrain.descriptors import soap_kernel_vector
 from mlptrain.log import logger
-
+from mlptrain.descriptors import soap_matrix
+from sklearn.neighbors import LocalOutlierFactor
+from sklearn.decomposition import PCA
 
 class SelectionMethod(ABC):
     """Active learning selection method
@@ -48,7 +50,15 @@ def n_backtrack(self) -> int:
         Returns:
             (int):
         """
-
+
+    @property
+    def check(self) -> bool:
+        """
+        Should the MLP-MD trajectory be searched for the first configuration
+        that this selector would select? Disabled (False) by default
+        """
+        return False
+
     def copy(self) -> 'SelectionMethod':
         return deepcopy(self)
 
@@ -117,7 +127,7 @@ def n_backtrack(self) -> int:
         return 10
 
 
-class MaxAtomicEnvDistance(SelectionMethod):
+class AtomicEnvSimilarity(SelectionMethod):
 
     def __init__(self,
                  threshold: float = 0.999):
@@ -187,3 +197,101 @@ def n_backtrack(self) -> int:
     def _n_training_envs(self) -> int:
         """Number of training environments available"""
         return len(self._k_vec)
+
+def outlier_identifier(configuration: 'mlptrain.Configuration',
+                       configurations: 'mlptrain.ConfigurationSet',
+                       dim_reduction: bool = False,
+                       distance_metric: str = 'euclidean',
+                       n_neighbors: int = 15) -> 'np.ndarray':
+    """
+    Identify whether a new configuration is an outlier with respect to a
+    set of existing configurations, using the Local Outlier Factor (LOF)
+    of their row-normalised SOAP matrices. For details of LOF see:
+    Breunig, M. M., Kriegel, H.-P., Ng, R. T. & Sander, J. LOF: Identifying
+    density-based local outliers. SIGMOD Rec. 29, 93–104 (2000).
+
+    -----------------------------------------------------------------------
+    Arguments:
+        configuration: New configuration to classify
+        configurations: Existing (reference) configurations on which the
+                        LOF model is fitted
+        dim_reduction: If True, project with a PCA fitted on the reference
+                       set before the LOF calculation
+        distance_metric: Distance metric used in LOF, e.g. 'euclidean',
+                         'cosine' or 'manhattan'
+        n_neighbors: Number of neighbours considered when computing LOF
+
+    Returns:
+        (np.ndarray): Labels from LocalOutlierFactor.predict, -1 for an
+                      outlier and +1 for an inlier
+    """
+
+    m1 = soap_matrix(configurations)
+    m1 /= np.linalg.norm(m1, axis=1, keepdims=True)
+
+    v1 = soap_matrix(configuration)
+    v1 /= np.linalg.norm(v1, axis=1, keepdims=True)
+
+    if dim_reduction:
+        # PCA cannot keep more components than min(n_samples, n_features)
+        pca = PCA(n_components=min(3, *m1.shape))
+        m1 = pca.fit_transform(m1)
+        v1 = pca.transform(v1)
+
+    # contamination: assumed fraction of outliers in the reference data; it
+    # sets the score threshold predict() uses to label a point as an outlier
+    clf = LocalOutlierFactor(n_neighbors=n_neighbors, metric=distance_metric,
+                             novelty=True, contamination=0.2)
+    clf.fit(m1)
+
+    return clf.predict(v1)
+
+class AtomicEnvDistance(SelectionMethod):
+    def __init__(self,
+                 pca: bool = False,
+                 distance_metric: str = "euclidean",
+                 n_neighbors: int = 15):
+        """
+        Selection criterion based on whether a configuration is classified
+        as an outlier, relative to the current training data of the MLP, by
+        the outlier_identifier function (Local Outlier Factor)
+        -----------------------------------------------------------------------
+        Arguments:
+            pca: Whether to reduce the dimensionality with PCA before the
+                 LOF calculation. The chosen distance_metric may suffer
+                 from the curse of dimensionality in a high-dimensional
+                 descriptor space
+            distance_metric: Distance metric passed to outlier_identifier
+            n_neighbors: Number of neighbours passed to outlier_identifier
+        """
+        super().__init__()
+        self.pca = pca
+        self.metric = distance_metric
+        self.n_neighbors = n_neighbors
+
+    def __call__(self, configuration, mlp, **kwargs) -> None:
+        """Store the configuration and MLP; evaluation is done in select"""
+        self.mlp = mlp
+        self._configuration = configuration
+
+    @property
+    def select(self) -> bool:
+        """Is the stored configuration an outlier of the training data?"""
+        label = outlier_identifier(self._configuration,
+                                   self.mlp.training_data,
+                                   self.pca, self.metric, self.n_neighbors)
+        # Collapse the returned label(s) to a plain bool so that identity
+        # checks such as `selector.select is False` behave as expected
+        return bool(np.all(np.asarray(label) == -1))
+
+    @property
+    def too_large(self) -> bool:
+        return False
+
+    @property
+    def n_backtrack(self) -> int:
+        return 10
+
+    @property
+    def check(self) -> bool:
+        """Backtrack along the trajectory only once mlp.n_train > 30"""
+        return self.mlp.n_train > 30
diff --git a/mlptrain/training/tests/test_selection.py b/mlptrain/training/tests/test_selection.py
@@ -1,7 +1,7 @@
 import os
 import mlptrain as mlt
 from autode.atoms import Atom
-from mlptrain.training.selection import MaxAtomicEnvDistance
+from mlptrain.training.selection import AtomicEnvSimilarity
 here = os.path.abspath(os.path.dirname(__file__))
 
 
@@ -41,7 +41,7 @@ def test_selection_on_structures():
 
     assert len(configs) == 3
 
-    selector = MaxAtomicEnvDistance(threshold=0.9)
+    selector = AtomicEnvSimilarity (threshold=0.9)
     mlp = mlt.potentials.GAP('blank')
     mlp.training_data = configs