diff --git a/alpha_automl/automl_api.py b/alpha_automl/automl_api.py index decb95c2..ca735d41 100644 --- a/alpha_automl/automl_api.py +++ b/alpha_automl/automl_api.py @@ -64,6 +64,7 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric') check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy') self.label_encoder = None + self.task_type = task def fit(self, X, y): """ @@ -275,7 +276,7 @@ def export_pipeline_code(self, pipeline_id): :param pipeline_id: Id of a pipeline """ pipeline_obj = self.pipelines[pipeline_id].get_pipeline() - write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj) + write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj, self.task_type) def _fit(self, X, y, pipeline_id): self.pipelines[pipeline_id].get_pipeline().fit(X, y) diff --git a/alpha_automl/hyperparameter_tuning/optuna.py b/alpha_automl/hyperparameter_tuning/optuna.py new file mode 100644 index 00000000..e69de29b diff --git a/alpha_automl/utils.py b/alpha_automl/utils.py index 0e6ba8c0..14d3dfe0 100644 --- a/alpha_automl/utils.py +++ b/alpha_automl/utils.py @@ -11,6 +11,7 @@ from sklearn.compose import ColumnTransformer from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import ShuffleSplit, train_test_split +from xgboost import XGBClassifier, XGBRegressor from alpha_automl.primitive_loader import PRIMITIVE_TYPES as INSTALLED_PRIMITIVES @@ -334,19 +335,26 @@ def inverse_transform(self, df): return df.to_numpy() -def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj): +def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj, task_type): index = pipeline_id.split("#")[1] f = open(f"pipeline_{index}_code.py", "w") + pipeline_str = """Pipeline(steps=[\n""" # Import f.write("""import pandas as pd from os.path import join, dirname -from sklearn.pipeline import Pipeline""") +from sklearn.pipeline import Pipeline +""") print("""import pandas as pd from os.path import join, dirname from sklearn.pipeline import Pipeline""") + + if task_type == "CLASSIFICATION": + f.write("from sklearn.preprocessing import LabelEncoder\n") + print("from sklearn.preprocessing import LabelEncoder") for step_name, step_obj in pipeline_obj.steps: + # Print step Import path_list = step_name.split('.') f.write(f"""from {".".join(path_list[:-1])} import {path_list[-1]}\n""") print(f"""from {".".join(path_list[:-1])} import {path_list[-1]}""") @@ -356,36 +364,58 @@ def write_pipeline_code_as_pyfile(pipeline_id, pipeline_obj): transformer_list = transformer_name.split('.') f.write(f"""from {".".join(transformer_list[:-1])} import {transformer_list[-1].split("-")[0]}\n""") print(f"""from {".".join(transformer_list[:-1])} import {transformer_list[-1].split("-")[0]}""") + + # Append to pipeline + if isinstance(step_obj, XGBClassifier) or isinstance(step_obj, XGBRegressor): + xgb_params = step_obj.get_xgb_params() + pipeline_str += (f"""\t\t('{step_name}', {step_obj.__class__.__name__}(**{xgb_params})),\n""") + else: + pipeline_str += (f"""\t\t('{step_name}', {step_obj}),\n""") + + pipeline_str += ("\t])\n") # Pipeline fit/predict f.write(f"""if __name__ == '__main__': - \ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) - \tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) - \ttarget_column = 'FILLIN_TARGET_COLUMN_HERE' - \tX_train = train_dataset.drop(columns=[target_column]) - \ty_train = train_dataset[[target_column]] - \tX_pred = pred_dataset.drop(columns=[target_column]) - - \tpipeline = {pipeline_obj} - - \tpipeline.fit(X_train, y_train) - - \tprint(pipeline.predict(X_pred)) - - """) - +\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) +\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) +\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE' +\tX_train = train_dataset.drop(columns=[target_column]) +\ty_train = train_dataset[[target_column]] +\tX_pred = pred_dataset.drop(columns=[target_column]) + +\tpipeline = {pipeline_str}""") print(f"""if __name__ == '__main__': - \ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) - \tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) - \ttarget_column = 'FILLIN_TARGET_COLUMN_HERE' - \tX_train = train_dataset.drop(columns=[target_column]) - \ty_train = train_dataset[[target_column]] - \tX_pred = pred_dataset.drop(columns=[target_column]) - - \tpipeline = {pipeline_obj} - - \tpipeline.fit(X_train, y_train) +\ttrain_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) +\tpred_dataset = pd.read_csv(join(dirname(__file__), 'FILLIN_DATASET_PATH_HERE')) +\ttarget_column = 'FILLIN_TARGET_COLUMN_HERE' +\tX_train = train_dataset.drop(columns=[target_column]) +\ty_train = train_dataset[[target_column]] +\tX_pred = pred_dataset.drop(columns=[target_column]) +\tpipeline = {pipeline_str} +""") + + if task_type == "CLASSIFICATION": + f.write(""" +\tlabel_encoder = LabelEncoder() + +\tpipeline.fit(X_train, label_encoder.fit_transform(y_train)) + +\tprint(label_encoder.inverse_transform(pipeline.predict(X_pred))) + +""") + print(""" +\tlabel_encoder = LabelEncoder() +\tpipeline.fit(X_train, label_encoder.fit_transform(y_train)) +\tprint(label_encoder.inverse_transform(pipeline.predict(X_pred))) +""") + else: + f.write(""" +\tpipeline.fit(X_train, y_train) - \tprint(pipeline.predict(X_pred)) +\tprint(pipeline.predict(X_pred)) - """) \ No newline at end of file +""") + print(""" +\tpipeline.fit(X_train, y_train) +\tprint(pipeline.predict(X_pred)) +""") \ No newline at end of file