Skip to content

Commit

Permalink
Add XGBoost pipeline and add LabelEncoder for classification tasks
Browse files: browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Nov 21, 2023
1 parent 1bdda7e commit fef5534
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 30 deletions.
3 changes: 2 additions & 1 deletion alpha_automl/automl_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric')
check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy')
self.label_encoder = None
self.task_type = task

def fit(self, X, y):
"""
Expand Down Expand Up @@ -275,7 +276,7 @@ def export_pipeline_code(self, pipeline_id):
:param pipeline_id: Id of a pipeline
"""
pipeline_obj = self.pipelines[pipeline_id].get_pipeline()
write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj)
write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj, self.task_type)

def _fit(self, X, y, pipeline_id):
self.pipelines[pipeline_id].get_pipeline().fit(X, y)
Expand Down
Empty file.
88 changes: 59 additions & 29 deletions alpha_automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit, train_test_split
from xgboost import XGBClassifier, XGBRegressor

from alpha_automl.primitive_loader import PRIMITIVE_TYPES as INSTALLED_PRIMITIVES

Expand Down Expand Up @@ -334,19 +335,26 @@ def inverse_transform(self, df):
return df.to_numpy()


def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj):
def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj, task_type):
index = pipeline_id.split("#")[1]
f = open(f"pipeline_{index}_code.py", "w")

pipeline_str = """Pipeline(steps=[\n"""
# Import
f.write("""import pandas as pd
from os.path import join, dirname
from sklearn.pipeline import Pipeline""")
from sklearn.pipeline import Pipeline
""")
print("""import pandas as pd
from os.path import join, dirname
from sklearn.pipeline import Pipeline""")

if task_type == "CLASSIFICATION":
f.write("from sklearn.preprocessing import LabelEncoder\n")
print("from sklearn.preprocessing import LabelEncoder")

for step_name, step_obj in pipeline_obj.steps:
# Print step Import
path_list = step_name.split('.')
f.write(f"""from {".".join(path_list[:-1])} import {path_list[-1]}\n""")
print(f"""from {".".join(path_list[:-1])} import {path_list[-1]}""")
Expand All @@ -356,36 +364,58 @@ def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj):
transformer_list = transformer_name.split('.')
f.write(f"""from {".".join(transformer_list[:-1])} import {transformer_list[-1].split("-")[0]}\n""")
print(f"""from {".".join(transformer_list[:-1])} import {transformer_list[-1].split("-")[0]}""")

# Append to pipeline
if isinstance(step_obj, XGBClassifier) or isinstance(step_obj, XGBRegressor):
xgb_params = step_obj.get_xgb_params()
pipeline_str += (f"""\t\t('{step_name}', {step_obj.__class__.__name__}(**{xgb_params})),\n""")
else:
pipeline_str += (f"""\t\t('{step_name}', {step_obj}),\n""")

pipeline_str += ("\t])\n")

# Pipeline fit/predict
f.write(f"""if __name__ == '__main__':
\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE'
\tX_train = train_dataset.drop(columns=[target_column])
\ty_train = train_dataset[[target_column]]
\tX_pred = pred_dataset.drop(columns=[target_column])
\tpipeline = {pipeline_obj}
\tpipeline.fit(X_train, y_train)
\tprint(pipeline.predict(X_pred))
""")

\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE'
\tX_train = train_dataset.drop(columns=[target_column])
\ty_train = train_dataset[[target_column]]
\tX_pred = pred_dataset.drop(columns=[target_column])
\tpipeline = {pipeline_str}""")
print(f"""if __name__ == '__main__':
\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE'
\tX_train = train_dataset.drop(columns=[target_column])
\ty_train = train_dataset[[target_column]]
\tX_pred = pred_dataset.drop(columns=[target_column])
\tpipeline = {pipeline_obj}
\tpipeline.fit(X_train, y_train)
\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE'))
\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE'
\tX_train = train_dataset.drop(columns=[target_column])
\ty_train = train_dataset[[target_column]]
\tX_pred = pred_dataset.drop(columns=[target_column])
\tpipeline = {pipeline_str}
""")

if task_type == "CLASSIFICATION":
f.write("""
\tlabel_encoder = LabelEncoder()
\tpipeline.fit(X_train, label_encoder.fit_transform(y_train))
\tprint(label_encoder.inverse_transform(pipeline.predict(X_pred)))
""")
print("""
\tlabel_encoder = LabelEncoder()
\tpipeline.fit(X_train, label_encoder.fit_transform(y_train))
\tprint(label_encoder.inverse_transform(pipeline.predict(X_pred)))
""")
else:
f.write("""
\tpipeline.fit(X_train, y_train)
\tprint(pipeline.predict(X_pred))
\tprint(pipeline.predict(X_pred))
""")
""")
print("""
\tpipeline.fit(X_train, y_train)
\tprint(pipeline.predict(X_pred))
""")

0 comments on commit fef5534

Please sign in to comment.