Add GPT based feature generator #102

Open · wants to merge 4 commits into base: devel

Changes from all commits
5 changes: 4 additions & 1 deletion alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@


 def profile_data(X):
-    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
+    metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': []}
     mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
                         TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}
@@ -43,6 +43,9 @@ def profile_data(X):
         if 'missing_values_ratio' in profiled_column:
             metadata['missing_values'] = True

+    metadata['numeric_columns'] = list(X.select_dtypes(include=['int64', 'float64']).columns)
+    metadata['categorical_columns'] = list(X.select_dtypes(include=['object', 'category']).columns)
+
     logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
                  f'useless columns = {str(metadata["useless_columns"])}, '
                  f'missing values = {str(metadata["missing_values"])}')
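For reference, a minimal sketch of what the two new metadata fields contain (toy frame and expected values are illustrative; the exact output depends on the rest of the profiler and its dependencies):

```python
import pandas as pd
from alpha_automl.data_profiler import profile_data

df = pd.DataFrame({'age': [25, 32, 47], 'city': ['NY', 'LA', 'SF']})

metadata = profile_data(df)
print(metadata['numeric_columns'])      # expected: ['age']
print(metadata['categorical_columns'])  # expected: ['city']
```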
9 changes: 5 additions & 4 deletions alpha_automl/pipeline_search/agent_lab.py
@@ -21,11 +21,11 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
     ray.init(local_mode=True, logging_level=logging.CRITICAL)
     num_cpus = int(ray.available_resources()["CPU"])

-    # load checkpoint or create a new one
+    # Load checkpoint or create a new one
     algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
     logger.debug("Create Algo object done")

-    # train model
+    # Train model
     train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder)
     logger.debug("Training done")
     ray.shutdown()
@@ -83,20 +83,21 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
         if (
             time.time() > timeout
             or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
             # or result["episode_reward_mean"] >= 70
         ):
             logger.debug(f"Training timeout reached")
             break

         if contain_checkpoints(checkpoint_save_folder):
+            # Load the most recent weights
             weights = load_rllib_policy_weights(checkpoint_save_folder)
             algo.set_weights(weights)
         elif contain_checkpoints(checkpoint_load_folder):
             weights = load_rllib_policy_weights(checkpoint_load_folder)
             algo.set_weights(weights)
         result = algo.train()
         logger.debug(pretty_print(result))
-        # stop training of the target train steps or reward are reached
+
+        # Stop training if the target train steps or reward are reached
         if result["episode_reward_mean"] > last_best:
             last_best = result["episode_reward_mean"]
             best_unchanged_iter = 1
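For context, a hedged sketch of how this training loop is driven end to end (the `PipelineGame` setup is elided, the folder paths are invented, and the save-folder parameter name is inferred from the calls above, since the signature is truncated in the diff):

```python
# Hypothetical driver; see setup_search.py below for the real call site.
game = PipelineGame(config_updated, evaluate_pipeline)
pipeline_search_rllib(
    game,
    time_bound=900,                                   # stop searching after ~15 minutes
    checkpoint_load_folder='/tmp/alpha_checkpoints',  # resume from here if checkpoints exist
    checkpoint_save_folder='/tmp/alpha_checkpoints',  # newest weights are reloaded from here
)
```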
16 changes: 16 additions & 0 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -8,6 +8,7 @@
 from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
     ADDED_PRIMITIVE
 from alpha_automl.primitive_loader import PRIMITIVE_TYPES
+from feature_engine.creation import MathFeatures

 logger = logging.getLogger(__name__)
@@ -37,6 +38,17 @@ def change_default_hyperparams(primitive_object):
         primitive_object.set_params(algorithm='SAMME')


+def create_math_features(primitive_type, columns):
+    if primitive_type == "sum":
+        return MathFeatures(variables=columns, func='sum')
+    elif primitive_type == "mean":
+        return MathFeatures(variables=columns, func='mean')
+    elif primitive_type == "std":
+        return MathFeatures(variables=columns, func='std')
+    elif primitive_type == "prod":
+        return MathFeatures(variables=columns, func='prod')
+
+
 def extract_estimators(pipeline_primitives, all_primitives):
     estimators = []
     estimator_name, estimator_obj = pipeline_primitives.pop()
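As a quick illustration of what these wrappers produce, here is a hedged sketch using feature_engine's `MathFeatures` directly (toy column names; the generated column name follows feature_engine's default `func_var1_var2` naming, as I understand it):

```python
import pandas as pd
from feature_engine.creation import MathFeatures

df = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [4.0, 5.0, 6.0]})

# Equivalent to create_math_features('sum', ['x1', 'x2'])
mf = MathFeatures(variables=['x1', 'x2'], func='sum')
out = mf.fit_transform(df)
print(out.columns.tolist())  # expected: ['x1', 'x2', 'sum_x1_x2']
```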
@@ -87,6 +99,7 @@ def make_primitive_objects(self, primitives):
         transformers = []
         nonnumeric_columns = self.metadata['nonnumeric_columns']
         useless_columns = self.metadata['useless_columns']
+        numeric_columns = self.metadata['numeric_columns']

         if len(useless_columns) > 0 and len(nonnumeric_columns) == 0:  # Add the transformer to the first step
             selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
@@ -105,6 +118,9 @@ def make_primitive_objects(self, primitives):
             elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
                 estimators = extract_estimators(pipeline_primitives, self.all_primitives)
                 primitive_object = create_object(primitive_name, {'estimators': estimators})
+            elif "feature_engine.creation" in primitive_name:
+                primitive_name_type = primitive_name.split('-')[1]
+                primitive_object = create_math_features(primitive_name_type, numeric_columns)
             elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE:  # It's an installed primitive
                 primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
             else:
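The `-sum`/`-mean`/`-prod`/`-std` suffixes come from the names registered in primitives_hierarchy.json (next file); a minimal sketch of how this branch resolves one of them, with an illustrative column list:

```python
# Name as registered in primitives_hierarchy.json
primitive_name = 'feature_engine.creation.math_features.MathFeatures-sum'

primitive_name_type = primitive_name.split('-')[1]   # -> 'sum'
primitive_object = create_math_features(primitive_name_type, ['x1', 'x2'])
```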
2 changes: 1 addition & 1 deletion alpha_automl/pipeline_synthesis/setup_search.py
@@ -92,7 +92,7 @@ def evaluate_pipeline(primitives):
     checkpoint_save_folder = (
         checkpoints_folder
         if checkpoints_folder is not None
-        else DEFAULT_CHECKPOINT_PATH
+        else output_folder
     )
     game = PipelineGame(config_updated, evaluate_pipeline)
     pipeline_search_rllib(
6 changes: 5 additions & 1 deletion alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,11 @@
         "alpha_automl.builtin_primitives.image_encoder.HogTransformer"
     ],
     "FEATURE_GENERATOR": [
-        "sklearn.preprocessing.PolynomialFeatures"
+        "alpha_automl.wrapper_primitives.llm_feature_engine.LLMFeatureGenerator",
+        "feature_engine.creation.math_features.MathFeatures-sum",
+        "feature_engine.creation.math_features.MathFeatures-mean",
+        "feature_engine.creation.math_features.MathFeatures-prod",
+        "feature_engine.creation.math_features.MathFeatures-std"
     ],
     "FEATURE_SCALER": [
         "sklearn.preprocessing.MaxAbsScaler",
136 changes: 136 additions & 0 deletions alpha_automl/wrapper_primitives/llm_feature_engine.py
@@ -0,0 +1,136 @@
import os
import ast
import copy
import logging
import numpy as np
import pandas as pd
import openai

from alpha_automl.base_primitive import BasePrimitive

logger = logging.getLogger(__name__)

class LLMFeatureGenerator(BasePrimitive):
    def __init__(self):
        self.prompt = None
        self.code = None

    def fit(self, X, y=None):
        # Ask the LLM for feature-engineering code based on a profile of X
        self.prompt = build_prompt_from_df(description="", df=X)
        self.code = generate_code(self.prompt)
        return self

    def transform(self, X, y=None):
        X_cp = copy.deepcopy(X)
        loc = {}
        access_scope = {"df": X_cp, "pd": pd, "np": np}
        # Run the generated snippet; it adds/drops columns on `df` (X_cp) in place
        parsed = ast.parse(self.code)
        exec(compile(parsed, filename="<ast>", mode="exec"), access_scope, loc)
        return np.array(X_cp)

def get_prompt(
    df, description, iterative=1, data_description_unparsed=None, samples=None, **kwargs
):
    how_many = (
        "up to 10 useful columns. Generate as many features as are useful for the downstream classifier, but as few as necessary to reach good performance."
        if iterative == 1
        else "exactly one useful column"
    )
    return f"""
The dataframe `df` is loaded and in memory. Columns are also named attributes.
Description of the dataset in `df` (column dtypes might be inaccurate):
"{data_description_unparsed}"

Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
{samples}

This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
Number of samples (rows) in training dataset: {int(len(df))}

This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{description}\".
Additional columns add new semantic information, that is, they use real-world knowledge about the dataset. They can e.g. be feature combinations, transformations, or aggregations where the new column is a function of the existing columns.
The scale and offset of columns do not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
This code also drops columns if they may be redundant and hurt the predictive performance of the downstream classifier (feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
Added columns can be used in other code blocks; dropped columns are not available anymore.

Code formatting for each added column:
```python
# (Feature name and description)
# Usefulness: (Description why this adds useful real world knowledge to classify \"{description}\" according to dataset description and attributes.)
# Input samples: (Three samples of the columns used in the following code, e.g. '{df.columns[0]}': {list(df.iloc[:3, 0].values)}, '{df.columns[1]}': {list(df.iloc[:3, 1].values)}, ...)
(Some pandas code using '{df.columns[0]}', '{df.columns[1]}', ... to add a new column for each row in df)
```end

Code formatting for dropping columns:
```python
# Explanation why the column XX is dropped
df.drop(columns=['XX'], inplace=True)
```end

Each codeblock generates {how_many} and can drop unused columns (Feature selection).
Each codeblock ends with ```end and starts with "```python"
Codeblock:
"""

def build_prompt_from_df(description, df, iterative=1):
    data_description_unparsed = description
    feature_importance = {}  # xgb_eval(_obj)

    samples = ""
    df_ = df.head(3).iloc[:, :1000]
    for i in list(df_):
        # show the list of values
        nan_freq = "%s" % float("%.2g" % (df[i].isna().mean() * 100))
        s = df_[i].tolist()
        if str(df[i].dtype) == "float64":
            s = [round(sample, 2) for sample in s]
        samples += (
            f"{df_[i].name} ({df[i].dtype}): NaN-freq [{nan_freq}%], Samples {s}\n"
        )

    kwargs = {
        "data_description_unparsed": data_description_unparsed,
        "samples": samples,
        "feature_importance": {
            k: "%s" % float("%.2g" % feature_importance[k]) for k in feature_importance
        },
    }

    prompt = get_prompt(
        df,
        description,
        data_description_unparsed=data_description_unparsed,
        iterative=iterative,
        samples=samples,
    )

    return prompt

def generate_code(prompt, model="gpt-4o"):
    # Check the sentinel before creating the client, so "skip" works without an API key
    if model == "skip":
        return ""

    openai_api_key = os.environ.get('OPENAI_API_KEY')
    client = openai.OpenAI(api_key=openai_api_key)
    messages = [
        {
            "role": "system",
            "content": "You are an expert data scientist assistant solving Kaggle problems. You answer only by generating code. Answer as concisely as possible.",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        stop=["```end"],
        temperature=0.5,
        max_tokens=4096,
    )
    code = completion.choices[0].message.content
    # Strip the markdown fences the model wraps the snippet in
    code = code.replace("```python", "").replace("```", "").replace("<end>", "")
    return code
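A hedged end-to-end sketch of the new primitive in isolation (the toy dataframe and key handling are illustrative; note that `transform` `exec`s whatever the model returns, so it is prudent to review `gen.code` before running it on anything sensitive):

```python
import os
import pandas as pd
from alpha_automl.wrapper_primitives.llm_feature_engine import LLMFeatureGenerator

os.environ['OPENAI_API_KEY'] = '<your-key>'  # placeholder, not a real key

df = pd.DataFrame({'age': [25, 32, 47], 'fare': [7.25, 71.28, 8.05]})

gen = LLMFeatureGenerator()
gen.fit(df)                # builds the prompt and requests code from gpt-4o
print(gen.code)            # inspect the generated snippet before trusting it
X_new = gen.transform(df)  # executes the snippet and returns a numpy array
```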
4 changes: 3 additions & 1 deletion tests/test_data_profiler.py
@@ -14,6 +14,8 @@ def test_profile_data():
                                  (5, 'country'), (8, 'duration'), (9, 'listed_in'),
                                  (10, 'description')],
             'DATETIME_ENCODER': [(6, 'date_added')]},
-        'useless_columns': [], 'missing_values': True}
+        'useless_columns': [], 'missing_values': True,
+        'numeric_columns': ['show_id', 'release_year'],
+        'categorical_columns': ['type', 'title', 'director', 'cast', 'country', 'date_added', 'duration', 'listed_in', 'description']}

     assert actual_metadata == expected_metadata