From 338d30645335f9221e94d1a3315b7d47f66297e6 Mon Sep 17 00:00:00 2001
From: Eden Wu
Date: Sun, 30 Jun 2024 16:52:26 -0400
Subject: [PATCH 1/4] add LLM-based feature generator

---
 .../resource/primitives_hierarchy.json       |   2 +-
 .../wrapper_primitives/llm_feature_engine.py | 136 ++++++++++++++++++
 2 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 alpha_automl/wrapper_primitives/llm_feature_engine.py

diff --git a/alpha_automl/resource/primitives_hierarchy.json b/alpha_automl/resource/primitives_hierarchy.json
index 0addea0c..03ad60d0 100644
--- a/alpha_automl/resource/primitives_hierarchy.json
+++ b/alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,7 @@
         "alpha_automl.builtin_primitives.image_encoder.HogTransformer"
     ],
     "FEATURE_GENERATOR": [
-        "sklearn.preprocessing.PolynomialFeatures"
+        "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator"
     ],
     "FEATURE_SCALER": [
         "sklearn.preprocessing.MaxAbsScaler",
diff --git a/alpha_automl/wrapper_primitives/llm_feature_engine.py b/alpha_automl/wrapper_primitives/llm_feature_engine.py
new file mode 100644
index 00000000..99131950
--- /dev/null
+++ b/alpha_automl/wrapper_primitives/llm_feature_engine.py
@@ -0,0 +1,136 @@
+import os
+import ast
+import copy
+import logging
+import numpy as np
+import pandas as pd
+import openai
+
+from alpha_automl.base_primitive import BasePrimitive
+
+logger = logging.getLogger(__name__)
+
+
+class LLMFeatureGenerator(BasePrimitive):
+    def __init__(self):
+        self.prompt = None
+        self.code = None
+
+    def fit(self, X, y=None):
+        # Build the prompt from the training data and ask the LLM for feature code
+        self.prompt = build_prompt_from_df(description="", df=X)
+        self.code = generate_code(self.prompt)
+        return self
+
+    def transform(self, X, y=None):
+        # Execute the generated pandas code against a copy of the input data
+        X_cp = copy.deepcopy(X)
+        loc = {}
+        access_scope = {"df": X_cp, "pd": pd, "np": np}
+        parsed = ast.parse(self.code)
+        exec(compile(parsed, filename="<string>", mode="exec"), access_scope, loc)
+        return np.array(X_cp)
+
+
+def get_prompt(
+    df, description, iterative=1, data_description_unparsed=None, samples=None, **kwargs
+):
+    how_many = (
+        "up to 10 useful columns. Generate as many features as useful for the downstream classifier, but as few as necessary to reach good performance."
+        if iterative == 1
+        else "exactly one useful column"
+    )
+    return f"""
+The dataframe `df` is loaded and in memory. Columns are also named attributes.
+Description of the dataset in `df` (column dtypes might be inaccurate):
+"{data_description_unparsed}"
+
+Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
+{samples}
+
+This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
+Number of samples (rows) in training dataset: {int(len(df))}
+
+This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting "{description}".
+Additional columns add new semantic information; that is, they use real-world knowledge about the dataset. They can, e.g., be feature combinations, transformations, or aggregations where the new column is a function of the existing columns.
+The scale and offset of columns do not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
+This code also drops columns if they may be redundant and hurt the predictive performance of the downstream classifier (feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
+The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
+Added columns can be used in other codeblocks; dropped columns are not available anymore.
+
+Code formatting for each added column:
+```python
+# (Feature name and description)
+# Usefulness: (Description why this adds useful real world knowledge to classify "{description}" according to dataset description and attributes.)
+# Input samples: (Three samples of the columns used in the following code, e.g. '{df.columns[0]}': {list(df.iloc[:3, 0].values)}, '{df.columns[1]}': {list(df.iloc[:3, 1].values)}, ...)
+(Some pandas code using '{df.columns[0]}', '{df.columns[1]}', ... to add a new column for each row in df)
+```end
+
+Code formatting for dropping columns:
+```python
+# Explanation why the column XX is dropped
+df.drop(columns=['XX'], inplace=True)
+```end
+
+Each codeblock generates {how_many} and can drop unused columns (feature selection).
+Each codeblock ends with ```end and starts with "```python"
+Codeblock:
+"""
+
+
+def build_prompt_from_df(description, df, iterative=1):
+    data_description_unparsed = description
+    feature_importance = {}  # xgb_eval(_obj)
+
+    samples = ""
+    df_ = df.head(3).iloc[:, :1000]
+    for i in list(df_):
+        # Show the list of values
+        nan_freq = "%s" % float("%.2g" % (df[i].isna().mean() * 100))
+        s = df_[i].tolist()
+        if str(df[i].dtype) == "float64":
+            s = [round(sample, 2) for sample in s]
+        samples += (
+            f"{df_[i].name} ({df[i].dtype}): NaN-freq [{nan_freq}%], Samples {s}\n"
+        )
+
+    # Note: built but currently unused; get_prompt is called with explicit keyword args below
+    kwargs = {
+        "data_description_unparsed": data_description_unparsed,
+        "samples": samples,
+        "feature_importance": {
+            k: "%s" % float("%.2g" % feature_importance[k]) for k in feature_importance
+        },
+    }
+
+    prompt = get_prompt(
+        df,
+        description,
+        data_description_unparsed=data_description_unparsed,
+        iterative=iterative,
+        samples=samples,
+    )
+
+    return prompt
+
+
+def generate_code(prompt, model="gpt-4o"):
+    openai_api_key = os.environ.get('OPENAI_API_KEY')
+    client = openai.OpenAI(api_key=openai_api_key)
+    messages = [
+        {
+            "role": "system",
+            "content": "You are an expert data scientist assistant solving Kaggle problems. You answer only by generating code. Answer as concisely as possible.",
+        },
+        {
+            "role": "user",
+            "content": prompt,
+        },
+    ]
+    if model == "skip":
+        return ""
+
+    completion = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        stop=["```end"],
+        temperature=0.5,
+        max_tokens=4096,
+    )
+    code = completion.choices[0].message.content
+    code = code.replace("```python", "").replace("```", "")
+    return code
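Reviewer note: a minimal usage sketch of the new primitive, not part of the patch. It assumes `OPENAI_API_KEY` is set in the environment; the DataFrame, column names, and values below are illustrative only.

```python
# Minimal usage sketch (illustrative data; OPENAI_API_KEY must be set).
import pandas as pd
from alpha_automl.wrapper_primitives.llm_feature_engine import LLMFeatureGenerator

df = pd.DataFrame({
    "age": [34, 51, 27],
    "income": [52000.0, 87000.0, 31000.0],
})

gen = LLMFeatureGenerator()
gen.fit(df)                   # builds a prompt from df and asks the model for feature code
features = gen.transform(df)  # runs the generated pandas code on a copy of df
print(features.shape)
```

Since `transform` executes model-generated code via `exec`, callers may want to run it in a restricted or sandboxed environment.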
From e71523f070a3d534d6ebaecf8de13d23c586c1c0 Mon Sep 17 00:00:00 2001
From: EdenWuyifan
Date: Mon, 3 Jun 2024 11:09:14 -0400
Subject: [PATCH 2/4] add math_features and other feature_engine primitives

---
 alpha_automl/data_profiler.py                   |  5 ++++-
 .../pipeline_synthesis/pipeline_builder.py      | 16 ++++++++++++++++
 alpha_automl/resource/primitives_hierarchy.json |  6 +++++-
 tests/test_data_profiler.py                     |  4 +++-
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/alpha_automl/data_profiler.py b/alpha_automl/data_profiler.py
index 7c854cd3..61552461 100644
--- a/alpha_automl/data_profiler.py
+++ b/alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@
 
 def profile_data(X):
-    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
+    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'catagorical_columns': []}
     mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER',
                         DATETIME_COLUMN: 'DATETIME_ENCODER',
                         TEXT_COLUMN: 'TEXT_ENCODER',
                         IMAGE_COLUMN: 'IMAGE_ENCODER'}
@@ -43,6 +43,9 @@
         if 'missing_values_ratio' in profiled_column:
             metadata['missing_values'] = True
 
+    metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
+    metadata['catagorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)
+
     logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
                  f'useless columns = {str(metadata["useless_columns"])}, '
                  f'missing values = {str(metadata["missing_values"])}')
diff --git a/alpha_automl/pipeline_synthesis/pipeline_builder.py b/alpha_automl/pipeline_synthesis/pipeline_builder.py
index 342afad6..0fd94617 100644
--- a/alpha_automl/pipeline_synthesis/pipeline_builder.py
+++ b/alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -8,6 +8,7 @@
 from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
     ADDED_PRIMITIVE
 from alpha_automl.primitive_loader import PRIMITIVE_TYPES
+from feature_engine.creation import MathFeatures
 
 logger = logging.getLogger(__name__)
 
@@ -37,6 +38,17 @@ def change_default_hyperparams(primitive_object):
         primitive_object.set_params(algorithm='SAMME')
 
 
+def create_math_features(primitive_type, columns):
+    if primitive_type == "sum":
+        return MathFeatures(variables=columns, func='sum')
+    elif primitive_type == "mean":
+        return MathFeatures(variables=columns, func='mean')
+    elif primitive_type == "std":
+        return MathFeatures(variables=columns, func='std')
+    elif primitive_type == "prod":
+        return MathFeatures(variables=columns, func='prod')
+
+
 def extract_estimators(pipeline_primitives, all_primitives):
     estimators = []
     estimator_name, estimator_obj = pipeline_primitives.pop()
@@ -87,6 +99,7 @@ def make_primitive_objects(self, primitives):
         transformers = []
         nonnumeric_columns = self.metadata['nonnumeric_columns']
         useless_columns = self.metadata['useless_columns']
+        numeric_columns = self.metadata['numeric_columns']
 
         if len(useless_columns) > 0 and len(nonnumeric_columns) == 0:  # Add the transformer to the first step
             selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
@@ -105,6 +118,9 @@
         elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
             estimators = extract_estimators(pipeline_primitives, self.all_primitives)
             primitive_object = create_object(primitive_name, {'estimators': estimators})
+        elif "feature_engine.creation" in primitive_name:
+            primitive_name_type = primitive_name.split('-')[1]
+            primitive_object = create_math_features(primitive_name_type, numeric_columns)
         elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE:  # It's an installed primitive
             primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
         else:
diff --git a/alpha_automl/resource/primitives_hierarchy.json b/alpha_automl/resource/primitives_hierarchy.json
index 03ad60d0..e0c93c6a 100644
--- a/alpha_automl/resource/primitives_hierarchy.json
+++ b/alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,11 @@
         "alpha_automl.builtin_primitives.image_encoder.HogTransformer"
     ],
     "FEATURE_GENERATOR": [
-        "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator"
+        "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator",
+        "feature_engine.creation.math_features.MathFeatures-sum",
+        "feature_engine.creation.math_features.MathFeatures-mean",
+        "feature_engine.creation.math_features.MathFeatures-prod",
+        "feature_engine.creation.math_features.MathFeatures-std"
     ],
     "FEATURE_SCALER": [
         "sklearn.preprocessing.MaxAbsScaler",
diff --git a/tests/test_data_profiler.py b/tests/test_data_profiler.py
index 97c0d325..ef9bd709 100644
--- a/tests/test_data_profiler.py
+++ b/tests/test_data_profiler.py
@@ -14,6 +14,8 @@ def test_profile_data():
                                                 (5, 'country'), (8, 'duration'), (9, 'listed_in'), (10, 'description')],
                          'DATETIME_ENCODER': [(6, 'date_added')]},
-                         'useless_columns': [], 'missing_values': True}
+                         'useless_columns': [], 'missing_values': True,
+                         'numeric_columns': ['show_id', 'release_year'],
+                         'catagorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}
 
     assert actual_metadata == expected_metadata
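Reviewer note: to make the suffixed primitive names concrete, `pipeline_builder` splits the name on `-` and maps the suffix to a `MathFeatures` aggregation over the profiled numeric columns. A small sketch of the equivalent direct call, with illustrative data:

```python
# Sketch of what create_math_features("sum", ["a", "b"]) builds (illustrative data).
import pandas as pd
from feature_engine.creation import MathFeatures

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

transformer = MathFeatures(variables=["a", "b"], func="sum")  # 'mean', 'std', 'prod' work the same way
df_new = transformer.fit_transform(df)  # appends a combined column, e.g. 'sum_a_b'
print(df_new.columns.tolist())
```

One design note: `MathFeatures` expects a list of at least two numeric variables, and `create_math_features` returns `None` for an unrecognized suffix, so datasets with fewer numeric columns may need a guard upstream.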
From cdcc920ed612461e65807acba9b635ab8ee4b25f Mon Sep 17 00:00:00 2001
From: EdenWuyifan
Date: Thu, 6 Jun 2024 11:34:54 -0400
Subject: [PATCH 3/4] fix typo

---
 alpha_automl/data_profiler.py | 4 ++--
 tests/test_data_profiler.py   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/alpha_automl/data_profiler.py b/alpha_automl/data_profiler.py
index 61552461..5cf50ac2 100644
--- a/alpha_automl/data_profiler.py
+++ b/alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@
 
 def profile_data(X):
-    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'catagorical_columns': []}
+    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
     mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER',
                         DATETIME_COLUMN: 'DATETIME_ENCODER',
                         TEXT_COLUMN: 'TEXT_ENCODER',
                         IMAGE_COLUMN: 'IMAGE_ENCODER'}
@@ -44,7 +44,7 @@
         metadata['missing_values'] = True
 
     metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
-    metadata['catagorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)
+    metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)
 
     logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
                  f'useless columns = {str(metadata["useless_columns"])}, '
diff --git a/tests/test_data_profiler.py b/tests/test_data_profiler.py
index ef9bd709..67966cea 100644
--- a/tests/test_data_profiler.py
+++ b/tests/test_data_profiler.py
@@ -16,6 +16,6 @@ def test_profile_data():
                          'DATETIME_ENCODER': [(6, 'date_added')]},
                          'useless_columns': [], 'missing_values': True,
                          'numeric_columns': ['show_id', 'release_year'],
-                         'catagorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}
+                         'categorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}
 
     assert actual_metadata == expected_metadata
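Reviewer note: a quick sketch of what the corrected metadata fields contain, mirroring the updated test expectations (columns below are illustrative, not from the test fixture):

```python
# Illustrative check of the profiler's new metadata fields (not part of the patch).
import pandas as pd

X = pd.DataFrame({
    "show_id": [1, 2],
    "release_year": [2019, 2021],
    "type": ["Movie", "TV Show"],
})

numeric_columns = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(X.select_dtypes(include=['object', 'category']).columns)
print(numeric_columns)      # ['show_id', 'release_year']
print(categorical_columns)  # ['type']
```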
From 8e13f021821b67d25eddb1431fa97495fdc8b5e6 Mon Sep 17 00:00:00 2001
From: Roque Lopez
Date: Thu, 20 Jun 2024 14:39:38 -0400
Subject: [PATCH 4/4] Fix path to save checkpoints

---
 alpha_automl/pipeline_search/agent_lab.py       | 9 +++++----
 alpha_automl/pipeline_synthesis/setup_search.py | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/alpha_automl/pipeline_search/agent_lab.py b/alpha_automl/pipeline_search/agent_lab.py
index 1efd9297..a0661f2d 100644
--- a/alpha_automl/pipeline_search/agent_lab.py
+++ b/alpha_automl/pipeline_search/agent_lab.py
@@ -21,11 +21,11 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
     ray.init(local_mode=True, logging_level=logging.CRITICAL)
     num_cpus = int(ray.available_resources()["CPU"])
 
-    # load checkpoint or create a new one
+    # Load checkpoint or create a new one
     algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
     logger.debug("Create Algo object done")
 
-    # train model
+    # Train model
    train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder)
     logger.debug("Training done")
     ray.shutdown()
@@ -83,12 +83,12 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
         if (
             time.time() > timeout
             or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
-            # or result["episode_reward_mean"] >= 70
         ):
             logger.debug(f"Training timeout reached")
             break
 
         if contain_checkpoints(checkpoint_save_folder):
+            # Load the most recent weights
             weights = load_rllib_policy_weights(checkpoint_save_folder)
             algo.set_weights(weights)
         elif contain_checkpoints(checkpoint_load_folder):
             weights = load_rllib_policy_weights(checkpoint_load_folder)
             algo.set_weights(weights)
         result = algo.train()
         logger.debug(pretty_print(result))
-        # stop training of the target train steps or reward are reached
+
+        # Stop training when the target train steps or reward are reached
         if result["episode_reward_mean"] > last_best:
             last_best = result["episode_reward_mean"]
             best_unchanged_iter = 1
diff --git a/alpha_automl/pipeline_synthesis/setup_search.py b/alpha_automl/pipeline_synthesis/setup_search.py
index d4911105..de19a054 100644
--- a/alpha_automl/pipeline_synthesis/setup_search.py
+++ b/alpha_automl/pipeline_synthesis/setup_search.py
@@ -92,7 +92,7 @@ def evaluate_pipeline(primitives):
     checkpoint_save_folder = (
         checkpoints_folder
         if checkpoints_folder is not None
-        else DEFAULT_CHECKPOINT_PATH
+        else output_folder
     )
     game = PipelineGame(config_updated, evaluate_pipeline)
     pipeline_search_rllib(
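Reviewer note: the behavioral change in the last hunk, shown with hypothetical values. When the user does not request a specific checkpoints folder, checkpoints now land in the run's output folder instead of the package-wide default path:

```python
# Illustrative values (hypothetical paths), showing the new fallback.
checkpoints_folder = None                # user did not pass a checkpoints folder
output_folder = "/tmp/alpha_automl_run"  # hypothetical per-run output folder

checkpoint_save_folder = (
    checkpoints_folder
    if checkpoints_folder is not None
    else output_folder
)
assert checkpoint_save_folder == "/tmp/alpha_automl_run"
```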