feat: Add math features #101

Open
wants to merge 3 commits into base: devel
72 changes: 72 additions & 0 deletions alpha_automl/builtin_primitives/math_features.py
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from alpha_automl.base_primitive import BasePrimitive
from feature_engine.creation import MathFeatures

class MathFeaturesSum(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='sum')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesMean(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='mean')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesStd(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='std')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesProd(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='prod')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)
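
For reviewers, a minimal usage sketch of one of the new primitives, assuming a toy DataFrame (the data and column names below are invented for illustration; the exact name of the generated column follows feature_engine's MathFeatures naming convention and may vary by version):

import pandas as pd
from alpha_automl.builtin_primitives.math_features import MathFeaturesSum

# Two numeric columns; the primitive appends their row-wise sum as a new feature.
X = pd.DataFrame({'age': [25, 32, 47], 'fare': [7.25, 71.28, 8.05]})
primitive = MathFeaturesSum(numeric_columns=['age', 'fare'], column_names=['age', 'fare'])
primitive.fit(X)
X_out = primitive.transform(X)  # original columns plus a generated column, e.g. 'sum_age_fare'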
7 changes: 6 additions & 1 deletion alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@


def profile_data(X):
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': [], 'column_names': []}
mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}

@@ -43,6 +43,11 @@ def profile_data(X):
if 'missing_values_ratio' in profiled_column:
metadata['missing_values'] = True

metadata['numeric_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['int64', 'float64']]
metadata['categorical_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['object', 'category']]

metadata['column_names'] = list(X.columns)

logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
f'useless columns = {str(metadata["useless_columns"])}, '
f'missing values = {str(metadata["missing_values"])}')
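A quick sketch of what the new metadata fields would contain for a toy frame, given the dtype checks added above (the frame and column names are invented for illustration):

import pandas as pd
from alpha_automl.data_profiler import profile_data

X = pd.DataFrame({'age': [25, 32, 47], 'city': ['NY', 'LA', 'SF']})
metadata = profile_data(X)
# With the additions above, we would expect:
# metadata['numeric_columns']     -> [(0, 'age')]   # int64 dtype
# metadata['categorical_columns'] -> [(1, 'city')]  # object dtype
# metadata['column_names']        -> ['age', 'city']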
9 changes: 5 additions & 4 deletions alpha_automl/pipeline_search/agent_lab.py
@@ -21,11 +21,11 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
ray.init(local_mode=True, logging_level=logging.CRITICAL)
num_cpus = int(ray.available_resources()["CPU"])

# load checkpoint or create a new one
# Load checkpoint or create a new one
algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
logger.debug("Create Algo object done")

# train model
# Train model
train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder)
logger.debug("Training done")
ray.shutdown()
@@ -83,20 +83,21 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
if (
time.time() > timeout
or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
# or result["episode_reward_mean"] >= 70
):
logger.debug(f"Training timeout reached")
break

if contain_checkpoints(checkpoint_save_folder):
# Load the most recent weights
weights = load_rllib_policy_weights(checkpoint_save_folder)
algo.set_weights(weights)
elif contain_checkpoints(checkpoint_load_folder):
weights = load_rllib_policy_weights(checkpoint_load_folder)
algo.set_weights(weights)
result = algo.train()
logger.debug(pretty_print(result))
# stop training of the target train steps or reward are reached

# Stop training if the target train steps or reward are reached
if result["episode_reward_mean"] > last_best:
last_best = result["episode_reward_mean"]
best_unchanged_iter = 1
11 changes: 11 additions & 0 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -8,6 +8,7 @@
from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
ADDED_PRIMITIVE
from alpha_automl.primitive_loader import PRIMITIVE_TYPES
from feature_engine.creation import MathFeatures

logger = logging.getLogger(__name__)

@@ -87,6 +88,8 @@ def make_primitive_objects(self, primitives):
transformers = []
nonnumeric_columns = self.metadata['nonnumeric_columns']
useless_columns = self.metadata['useless_columns']
numeric_columns = self.metadata['numeric_columns']
column_names = self.metadata['column_names']

if len(useless_columns) > 0 and len(nonnumeric_columns) == 0: # Add the transformer to the first step
selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
@@ -105,6 +108,8 @@
elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
estimators = extract_estimators(pipeline_primitives, self.all_primitives)
primitive_object = create_object(primitive_name, {'estimators': estimators})
elif "alpha_automl.builtin_primitives.math_features" in primitive_name:
primitive_object = create_object(primitive_name, {'numeric_columns': [column_name for _, column_name in numeric_columns], 'column_names': column_names})
elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE: # It's an installed primitive
primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
else:
@@ -114,6 +119,8 @@

if primitive_type in nonnumeric_columns: # Create a new transformer and add it to the list
transformers += self.create_transformers(primitive_object, primitive_name, primitive_type)
elif primitive_type == 'FEATURE_GENERATOR':
transformers += self.create_transformers(primitive_object, primitive_name, primitive_type)
else:
if len(transformers) > 0: # Add previous transformers to the pipeline
if len(useless_columns) > 0:
@@ -129,13 +136,17 @@
def create_transformers(self, primitive_object, primitive_name, primitive_type):
column_transformers = []
nonnumeric_columns = self.metadata['nonnumeric_columns']
numeric_columns = self.metadata['numeric_columns']

if primitive_type == 'TEXT_ENCODER':
column_transformers = [(f'{primitive_name}-{col_name}', primitive_object, col_index) for
col_index, col_name in nonnumeric_columns[primitive_type]]
elif primitive_type == 'CATEGORICAL_ENCODER' or primitive_type == 'DATETIME_ENCODER' or primitive_type == 'IMAGE_ENCODER':
column_transformers = [(primitive_name, primitive_object, [col_index for col_index, _
in nonnumeric_columns[primitive_type]])]
elif primitive_type == 'FEATURE_GENERATOR':
column_transformers = [(primitive_name, primitive_object, [col_index for col_index, _
in numeric_columns])]

return column_transformers
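
For context, the new FEATURE_GENERATOR branch produces a standard scikit-learn ColumnTransformer entry that applies the math-features primitive only to the numeric column indices. A rough standalone equivalent, with invented column data; the remainder setting is an assumption, not taken from this PR:

from sklearn.compose import ColumnTransformer
from alpha_automl.builtin_primitives.math_features import MathFeaturesMean

# (index, name) pairs in the shape produced by profile_data; values are illustrative.
numeric_columns = [(0, 'age'), (1, 'fare')]
column_names = ['age', 'fare']

primitive = MathFeaturesMean(numeric_columns=[name for _, name in numeric_columns],
                             column_names=column_names)
transformer = ColumnTransformer([('MathFeaturesMean', primitive,
                                  [idx for idx, _ in numeric_columns])],
                                remainder='passthrough')  # assumption: keep non-numeric columns untouched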

2 changes: 1 addition & 1 deletion alpha_automl/pipeline_synthesis/setup_search.py
@@ -92,7 +92,7 @@ def evaluate_pipeline(primitives):
checkpoint_save_folder = (
checkpoints_folder
if checkpoints_folder is not None
else DEFAULT_CHECKPOINT_PATH
else output_folder
)
game = PipelineGame(config_updated, evaluate_pipeline)
pipeline_search_rllib(
10 changes: 8 additions & 2 deletions alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,11 @@
"alpha_automl.builtin_primitives.image_encoder.HogTransformer"
],
"FEATURE_GENERATOR": [
"sklearn.preprocessing.PolynomialFeatures"
"sklearn.preprocessing.PolynomialFeatures",
"alpha_automl.builtin_primitives.math_features.MathFeaturesSum",
"alpha_automl.builtin_primitives.math_features.MathFeaturesMean",
"alpha_automl.builtin_primitives.math_features.MathFeaturesProd",
"alpha_automl.builtin_primitives.math_features.MathFeaturesStd"
],
"FEATURE_SCALER": [
"sklearn.preprocessing.MaxAbsScaler",
@@ -30,7 +34,9 @@
"FEATURE_SELECTOR": [
"sklearn.feature_selection.GenericUnivariateSelect",
"sklearn.feature_selection.SelectPercentile",
"sklearn.feature_selection.SelectKBest"
"sklearn.feature_selection.SelectKBest",
"feature_engine.selection.SmartCorrelatedSelection",
"feature_engine.selection.DropHighPSIFeatures"
],
"COLUMN_TRANSFORMER": [
"sklearn.compose.ColumnTransformer"
9 changes: 8 additions & 1 deletion tests/test_data_profiler.py
@@ -14,6 +14,13 @@ def test_profile_data():
(5, 'country'), (8, 'duration'), (9, 'listed_in'),
(10, 'description')],
'DATETIME_ENCODER': [(6, 'date_added')]},
'useless_columns': [], 'missing_values': True}
'useless_columns': [], 'missing_values': True,
'numeric_columns': [(0, 'show_id'), (7, 'release_year')],
'categorical_columns': [(1, 'type'), (2, 'title'), (3, 'director'), (4, 'cast'),
(5, 'country'), (6, 'date_added'), (8, 'duration'), (9, 'listed_in'),
(10, 'description')],
'column_names': ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
'release_year', 'duration', 'listed_in', 'description'],
}

assert actual_metadata == expected_metadata