Add GPT based feature generator #102

Open · wants to merge 4 commits into base: devel

Changes from all commits
5 changes: 4 additions & 1 deletion alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@


 def profile_data(X):
-    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
+    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
     mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
                         TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}
@@ -43,6 +43,9 @@ def profile_data(X):
         if 'missing_values_ratio' in profiled_column:
             metadata['missing_values'] = True

+    metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
+    metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)
+
     logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
                  f'useless columns = {str(metadata["useless_columns"])}, '
                  f'missing values = {str(metadata["missing_values"])}')
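For reference, a minimal sketch of what the two new metadata fields contain (toy frame and expected values are illustrative; the exact output depends on the rest of the profiler and its dependencies):

```python
import pandas as pd
from alpha_automl.data_profiler import profile_data

df = pd.DataFrame({'age': [25, 32, 47], 'city': ['NY', 'LA', 'SF']})

metadata = profile_data(df)
print(metadata['numeric_columns'])      # expected: ['age']
print(metadata['categorical_columns'])  # expected: ['city']
```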
9 changes: 5 additions & 4 deletions alpha_automl/pipeline_search/agent_lab.py
@@ -21,11 +21,11 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
     ray.init(local_mode=True, logging_level=logging.CRITICAL)
     num_cpus = int(ray.available_resources()["CPU"])

-    # load checkpoint or create a new one
+    # Load checkpoint or create a new one
     algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
     logger.debug("Create Algo object done")

-    # train model
+    # Train model
     train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder)
     logger.debug("Training done")
     ray.shutdown()
@@ -83,20 +83,21 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
         if (
             time.time() > timeout
             or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
             # or result["episode_reward_mean"] >= 70
         ):
             logger.debug(f"Training timeout reached")
             break

         if contain_checkpoints(checkpoint_save_folder):
+            # Load the most recent weights
             weights = load_rllib_policy_weights(checkpoint_save_folder)
             algo.set_weights(weights)
         elif contain_checkpoints(checkpoint_load_folder):
             weights = load_rllib_policy_weights(checkpoint_load_folder)
             algo.set_weights(weights)
         result = algo.train()
         logger.debug(pretty_print(result))
-        # stop training of the target train steps or reward are reached
+
+        # Stop training if the target train steps or reward are reached
         if result["episode_reward_mean"] > last_best:
             last_best = result["episode_reward_mean"]
             best_unchanged_iter = 1
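For context, a hedged sketch of how this training loop is driven end to end (the `PipelineGame` setup is elided, the folder paths are invented, and the save-folder parameter name is inferred from the calls above, since the signature is truncated in the diff):

```python
# Hypothetical driver; see setup_search.py below for the real call site.
game = PipelineGame(config_updated, evaluate_pipeline)
pipeline_search_rllib(
    game,
    time_bound=900,                                   # stop searching after ~15 minutes
    checkpoint_load_folder='/tmp/alpha_checkpoints',  # resume from here if checkpoints exist
    checkpoint_save_folder='/tmp/alpha_checkpoints',  # newest weights are reloaded from here
)
```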
16 changes: 16 additions & 0 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -8,6 +8,7 @@
 from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
     ADDED_PRIMITIVE
 from alpha_automl.primitive_loader import PRIMITIVE_TYPES
+from feature_engine.creation import MathFeatures

 logger = logging.getLogger(__name__)
@@ -37,6 +38,17 @@ def change_default_hyperparams(primitive_object):
         primitive_object.set_params(algorithm='SAMME')


+def create_math_features(primitive_type, columns):
+    if primitive_type == "sum":
+        return MathFeatures(variables=columns, func='sum')
+    elif primitive_type == "mean":
+        return MathFeatures(variables=columns, func='mean')
+    elif primitive_type == "std":
+        return MathFeatures(variables=columns, func='std')
+    elif primitive_type == "prod":
+        return MathFeatures(variables=columns, func='prod')
+
+
 def extract_estimators(pipeline_primitives, all_primitives):
     estimators = []
     estimator_name, estimator_obj = pipeline_primitives.pop()
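As a quick illustration of what these wrappers produce, here is a hedged sketch using feature_engine's `MathFeatures` directly (toy column names; the generated column name follows feature_engine's default `func_var1_var2` naming, as I understand it):

```python
import pandas as pd
from feature_engine.creation import MathFeatures

df = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [4.0, 5.0, 6.0]})

# Equivalent to create_math_features('sum', ['x1', 'x2'])
mf = MathFeatures(variables=['x1', 'x2'], func='sum')
out = mf.fit_transform(df)
print(out.columns.tolist())  # expected: ['x1', 'x2', 'sum_x1_x2']
```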
@@ -87,6 +99,7 @@ def make_primitive_objects(self, primitives):
         transformers = []
         nonnumeric_columns = self.metadata['nonnumeric_columns']
         useless_columns = self.metadata['useless_columns']
+        numeric_columns = self.metadata['numeric_columns']

         if len(useless_columns) > 0 and len(nonnumeric_columns) == 0:  # Add the transformer to the first step
             selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
@@ -105,6 +118,9 @@ def make_primitive_objects(self, primitives):
             elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
                 estimators = extract_estimators(pipeline_primitives, self.all_primitives)
                 primitive_object = create_object(primitive_name, {'estimators': estimators})
+            elif "feature_engine.creation" in primitive_name:
+                primitive_name_type = primitive_name.split('-')[1]
+                primitive_object = create_math_features(primitive_name_type, numeric_columns)
             elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE:  # It's an installed primitive
                 primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
             else:
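The `-sum`/`-mean`/`-prod`/`-std` suffixes come from the names registered in primitives_hierarchy.json (next file); a minimal sketch of how this branch resolves one of them, with an illustrative column list:

```python
# Name as registered in primitives_hierarchy.json
primitive_name = 'feature_engine.creation.math_features.MathFeatures-sum'

primitive_name_type = primitive_name.split('-')[1]   # -> 'sum'
primitive_object = create_math_features(primitive_name_type, ['x1', 'x2'])
```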
2 changes: 1 addition & 1 deletion alpha_automl/pipeline_synthesis/setup_search.py
@@ -92,7 +92,7 @@ def evaluate_pipeline(primitives):
     checkpoint_save_folder = (
         checkpoints_folder
         if checkpoints_folder is not None
-        else DEFAULT_CHECKPOINT_PATH
+        else output_folder
     )
     game = PipelineGame(config_updated, evaluate_pipeline)
     pipeline_search_rllib(
6 changes: 5 additions & 1 deletion alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,11 @@
         "alpha_automl.builtin_primitives.image_encoder.HogTransformer"
     ],
     "FEATURE_GENERATOR": [
-        "sklearn.preprocessing.PolynomialFeatures"
+        "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator",
+        "feature_engine.creation.math_features.MathFeatures-sum",
+        "feature_engine.creation.math_features.MathFeatures-mean",
+        "feature_engine.creation.math_features.MathFeatures-prod",
+        "feature_engine.creation.math_features.MathFeatures-std"
     ],
     "FEATURE_SCALER": [
         "sklearn.preprocessing.MaxAbsScaler",
136 changes: 136 additions & 0 deletions alpha_automl/wrapper_primitives/llm_feature_engine.py
@@ -0,0 +1,136 @@
import os
import ast
import copy
import logging
import numpy as np
import pandas as pd
import openai

from alpha_automl.base_primitive import BasePrimitive

logger = logging.getLogger(__name__)

class LLMFeatureGenerator(BasePrimitive):
    def __init__(self):
        self.prompt = None
        self.code = None

    def fit(self, X, y=None):
        # Ask the LLM for feature-engineering code based on a profile of X
        self.prompt = build_prompt_from_df(description="", df=X)
        self.code = generate_code(self.prompt)
        return self

    def transform(self, X, y=None):
        X_cp = copy.deepcopy(X)
        loc = {}
        access_scope = {"df": X_cp, "pd": pd, "np": np}
        # Run the generated snippet; it adds/drops columns on `df` (X_cp) in place
        parsed = ast.parse(self.code)
        exec(compile(parsed, filename="<ast>", mode="exec"), access_scope, loc)
        return np.array(X_cp)

def get_prompt(
    df, description, iterative=1, data_description_unparsed=None, samples=None, **kwargs
):
    how_many = (
        "up to 10 useful columns. Generate as many features as are useful for the downstream classifier, but as few as necessary to reach good performance."
        if iterative == 1
        else "exactly one useful column"
    )
    return f"""
The dataframe `df` is loaded and in memory. Columns are also named attributes.
Description of the dataset in `df` (column dtypes might be inaccurate):
"{data_description_unparsed}"

Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
{samples}

This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
Number of samples (rows) in training dataset: {int(len(df))}

This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{description}\".
Additional columns add new semantic information, that is, they use real-world knowledge about the dataset. They can e.g. be feature combinations, transformations, or aggregations where the new column is a function of the existing columns.
The scale and offset of columns do not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
This code also drops columns if they may be redundant and hurt the predictive performance of the downstream classifier (feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
Added columns can be used in other code blocks; dropped columns are not available anymore.

Code formatting for each added column:
```python
# (Feature name and description)
# Usefulness: (Description why this adds useful real world knowledge to classify \"{description}\" according to dataset description and attributes.)
# Input samples: (Three samples of the columns used in the following code, e.g. '{df.columns[0]}': {list(df.iloc[:3, 0].values)}, '{df.columns[1]}': {list(df.iloc[:3, 1].values)}, ...)
(Some pandas code using '{df.columns[0]}', '{df.columns[1]}', ... to add a new column for each row in df)
```end

Code formatting for dropping columns:
```python
# Explanation why the column XX is dropped
df.drop(columns=['XX'], inplace=True)
```end

Each codeblock generates {how_many} and can drop unused columns (Feature selection).
Each codeblock ends with ```end and starts with "```python"
Codeblock:
"""

def build_prompt_from_df(description, df, iterative=1):
    data_description_unparsed = description
    feature_importance = {}  # xgb_eval(_obj)

    samples = ""
    df_ = df.head(3).iloc[:, :1000]
    for i in list(df_):
        # show the list of values
        nan_freq = "%s" % float("%.2g" % (df[i].isna().mean() * 100))
        s = df_[i].tolist()
        if str(df[i].dtype) == "float64":
            s = [round(sample, 2) for sample in s]
        samples += (
            f"{df_[i].name} ({df[i].dtype}): NaN-freq [{nan_freq}%], Samples {s}\n"
        )

    kwargs = {
        "data_description_unparsed": data_description_unparsed,
        "samples": samples,
        "feature_importance": {
            k: "%s" % float("%.2g" % feature_importance[k]) for k in feature_importance
        },
    }

    prompt = get_prompt(
        df,
        description,
        data_description_unparsed=data_description_unparsed,
        iterative=iterative,
        samples=samples,
    )

    return prompt

def generate_code(prompt, model="gpt-4o"):
    # Check the sentinel before creating the client, so "skip" works without an API key
    if model == "skip":
        return ""

    openai_api_key = os.environ.get('OPENAI_API_KEY')
    client = openai.OpenAI(api_key=openai_api_key)
    messages = [
        {
            "role": "system",
            "content": "You are an expert data scientist assistant solving Kaggle problems. You answer only by generating code. Answer as concisely as possible.",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        stop=["```end"],
        temperature=0.5,
        max_tokens=4096,
    )
    code = completion.choices[0].message.content
    # Strip the markdown fences the model wraps the snippet in
    code = code.replace("```python", "").replace("```", "").replace("<end>", "")
    return code
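A hedged end-to-end sketch of the new primitive in isolation (the toy dataframe and key handling are illustrative; note that `transform` `exec`s whatever the model returns, so it is prudent to review `gen.code` before running it on anything sensitive):

```python
import os
import pandas as pd
from alpha_automl.wrapper_primitives.llm_feature_engine import LLMFeatureGenerator

os.environ['OPENAI_API_KEY'] = '<your-key>'  # placeholder, not a real key

df = pd.DataFrame({'age': [25, 32, 47], 'fare': [7.25, 71.28, 8.05]})

gen = LLMFeatureGenerator()
gen.fit(df)                # builds the prompt and requests code from gpt-4o
print(gen.code)            # inspect the generated snippet before trusting it
X_new = gen.transform(df)  # executes the snippet and returns a numpy array
```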
4 changes: 3 additions & 1 deletion tests/test_data_profiler.py
@@ -14,6 +14,8 @@ def test_profile_data():
                                  (5, 'country'), (8, 'duration'), (9, 'listed_in'),
                                  (10, 'description')],
             'DATETIME_ENCODER': [(6, 'date_added')]},
-        'useless_columns': [], 'missing_values': True}
+        'useless_columns': [], 'missing_values': True,
+        'numeric_columns': ['show_id', 'release_year'],
+        'categorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}

     assert actual_metadata == expected_metadata