Skip to content

Commit

Permalink
Merge pull request #96 from VIDA-NYU/ensemble
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez authored Apr 8, 2024
2 parents da7a2b6 + bafeefd commit d77a9fe
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 90 deletions.
16 changes: 1 addition & 15 deletions alpha_automl/automl_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,21 +105,7 @@ def fit(self, X, y):
for index, pipeline in enumerate(sorted_pipelines, start=1):
pipeline_id = PIPELINE_PREFIX + str(index)
self.pipelines[pipeline_id] = pipeline
if (
pipeline.get_pipeline().steps[-1][0]
== 'sklearn.semi_supervised.SelfTrainingClassifier'
or pipeline.get_pipeline().steps[-1][0]
== 'alpha_automl.builtin_primitives.semisupervised_classifier.AutonBox'
):
leaderboard_data.append(
[
index,
f'{pipeline.get_summary()}, {pipeline.get_pipeline().steps[-1][1].base_estimator.__class__.__name__}',
pipeline.get_score(),
]
)
else:
leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])
leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])

self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])

Expand Down
16 changes: 1 addition & 15 deletions alpha_automl/builtin_primitives/semisupervised_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import (
LabelPropagation,
LabelSpreading,
SelfTrainingClassifier,
LabelSpreading
)

from alpha_automl.base_primitive import BasePrimitive
Expand All @@ -19,19 +18,6 @@
logger = logging.getLogger(__name__)


class SkSelfTrainingClassifier(BasePrimitive):
    """Semi-supervised self-training classifier built on an SGD base estimator."""

    # Hyperparameters handed to the wrapped SGDClassifier.
    sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss")
    # NOTE(review): class-level attribute — the fitted model is shared by
    # every instance of this primitive.
    model = SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)

    def fit(self, X, y=None):
        """Fit the self-training model on X with (partially labelled) targets y."""
        self.model.fit(X, y)

    def predict(self, X):
        """Predict labels for X, returned as a numpy array."""
        return np.array(self.model.predict(X))


def make_label_pipeline(method, X):
step = None
if method == "LabelSpreading":
Expand Down
5 changes: 5 additions & 0 deletions alpha_automl/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
from sklearn.compose import ColumnTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from alpha_automl.builtin_primitives.semisupervised_classifier import AutonBox
from alpha_automl.utils import COLUMN_SELECTOR_ID


Expand Down Expand Up @@ -56,5 +58,8 @@ def _make_summary(self):
step_name = transformer_name.split('-')[0].split('.')[-1]
if step_name not in step_names:
step_names.append(step_name)
elif isinstance(step_object, SelfTrainingClassifier) or isinstance(step_object, AutonBox):
estimator_name = step_object.base_estimator.__class__.__name__
step_names.append(estimator_name)

self.summary = ', '.join(step_names)
9 changes: 6 additions & 3 deletions alpha_automl/pipeline_search/pipeline/PipelineLogic.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,20 @@ def has_legal_moves(self):

def next_state(self, action):
s = self.valid_moves[action]
nt = self.non_terminals[s[:s.index('-')].strip()]
nt = self.non_terminals[s[:s.index('->')].strip()]
r = [self.non_terminals[p] if p in self.non_terminals.keys() else
self.terminals[p] for p in s[s.index('-')+2:].strip().split(' ')]
self.terminals[p] for p in s[s.index('->')+2:].strip().split(' ')]
r = [x for x in r if x != 0]
s = []
not_used = True

for p in self.pieces_p:
if p == 0:
continue

if p == nt:
if p == nt and not_used: # Chose one primitive at the time
s += r
not_used = False
else:
s.append(p)

Expand Down
64 changes: 29 additions & 35 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,14 @@
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
ADDED_PRIMITIVE
from alpha_automl.primitive_loader import PRIMITIVE_TYPES

logger = logging.getLogger(__name__)

SEMI_CLASSIFIER_PARAMS = {
"sklearn.discriminant_analysis.LinearDiscriminantAnalysis": {},
"sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis": {},
"sklearn.ensemble.BaggingClassifier": {},
"sklearn.ensemble.ExtraTreesClassifier": {},
"sklearn.ensemble.GradientBoostingClassifier": {},
"sklearn.ensemble.RandomForestClassifier": {},
"sklearn.naive_bayes.BernoulliNB": {},
"sklearn.naive_bayes.GaussianNB": {},
"sklearn.naive_bayes.MultinomialNB": {},
"sklearn.neighbors.KNeighborsClassifier": {},
"sklearn.linear_model.LogisticRegression": {},
"sklearn.linear_model.PassiveAggressiveClassifier": {},
"sklearn.linear_model.SGDClassifier": dict(alpha=1e-5, penalty="l2", loss="log_loss"),
"sklearn.svm.LinearSVC": {},
"sklearn.svm.SVC": {},
"sklearn.tree.DecisionTreeClassifier": {},
"xgboost.XGBClassifier": {},
"lightgbm.LGBMClassifier": dict(verbose=-1),
}


EXTRA_PARAMS = {
"lightgbm.LGBMClassifier": dict(verbose=-1),
Expand All @@ -45,6 +25,23 @@ def change_default_hyperparams(primitive_object):
primitive_object.set_params(handle_unknown='use_encoded_value', unknown_value=-1)
elif isinstance(primitive_object, SimpleImputer):
primitive_object.set_params(strategy='most_frequent', keep_empty_features=True)
elif isinstance(primitive_object, AdaBoostClassifier):
primitive_object.set_params(algorithm='SAMME')


def extract_estimators(pipeline_primitives, all_primitives):
    """Pop the trailing run of CLASSIFIER primitives off ``pipeline_primitives``.

    Used when building a MULTI_ENSEMBLER primitive: the classifiers placed
    immediately before it in the pipeline become the ensemble's base
    estimators.

    Parameters
    ----------
    pipeline_primitives : list of (name, object) tuples; modified in place.
    all_primitives : dict mapping primitive name to metadata with a 'type' key.

    Returns
    -------
    list of (unique_name, object) tuples in reverse pipeline order, where
    unique_name is '<primitive_name>-<counter>' so repeated primitives get
    distinct step names.
    """
    estimators = []
    counter = 0

    # Peek before popping: the original popped first and discarded the step
    # that terminated the loop, silently dropping the preceding non-classifier
    # primitive (e.g. a transformer) from the pipeline, and raised IndexError
    # when every remaining step was a classifier.
    while pipeline_primitives and all_primitives[pipeline_primitives[-1][0]]['type'] == 'CLASSIFIER':
        classifier_name, classifier_obj = pipeline_primitives.pop()
        estimators.append((f'{classifier_name}-{counter}', classifier_obj))
        counter += 1

    return estimators


class BaseBuilder:
Expand Down Expand Up @@ -88,27 +85,26 @@ def make_primitive_objects(self, primitives):
transformer_obj = ColumnTransformer([selector], remainder='passthrough')
pipeline_primitives.append((COLUMN_TRANSFORMER_ID, transformer_obj))

for primitive in primitives:
primitive_name = primitive
for primitive_name in primitives:
primitive_type = self.all_primitives[primitive_name]['type']

# Make sure that SEMISUPERVISED_CLASSIFIER primitive has a classifier primitive behind
if primitive_type == 'SEMISUPERVISED_CLASSIFIER':
if self.all_primitives[primitives[-1]]['type'] != 'CLASSIFIER':
return
classifier_obj = create_object(primitives[-1], SEMI_CLASSIFIER_PARAMS[primitives[-1]])
if primitive_type == 'SEMISUPERVISED_SELFTRAINER':
classifier_obj = pipeline_primitives.pop()[1]
primitive_object = create_object(primitive_name, {'base_estimator': classifier_obj})
elif primitive_type == 'SINGLE_ENSEMBLER':
classifier_obj = pipeline_primitives.pop()[1]
primitive_object = create_object(primitive_name, {'estimator': classifier_obj})
elif primitive_type == 'MULTI_ENSEMBLER':
estimators = extract_estimators(pipeline_primitives, self.all_primitives)
primitive_object = create_object(primitive_name, {'estimators': estimators})
elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE: # It's an installed primitive
if primitive in EXTRA_PARAMS:
primitive_object = create_object(primitive, EXTRA_PARAMS[primitive])
else:
primitive_object = create_object(primitive)
primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
else:
primitive_object = self.automl_hyperparams['new_primitives'][primitive_name]['primitive_object']

change_default_hyperparams(primitive_object)

if primitive_type in nonnumeric_columns: # Create a new transformer and add it to the list
if primitive_type in nonnumeric_columns: # Create a new transformer and add it to the list
transformers += self.create_transformers(primitive_object, primitive_name, primitive_type)
else:
if len(transformers) > 0: # Add previous transformers to the pipeline
Expand All @@ -119,8 +115,6 @@ def make_primitive_objects(self, primitives):
pipeline_primitives.append((COLUMN_TRANSFORMER_ID, transformer_obj))
transformers = []
pipeline_primitives.append((primitive_name, primitive_object))
if primitive_type == 'SEMISUPERVISED_CLASSIFIER':
break

return pipeline_primitives

Expand Down
43 changes: 43 additions & 0 deletions alpha_automl/pipeline_synthesis/setup_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,57 @@ def signal_handler(queue, signum):
sys.exit(0)


def check_repeated_classifiers(pipeline_primitives, all_primitives, ensemble_pipelines_hash):
    """Return True when an ensemble pipeline should be discarded.

    A pipeline is rejected when it contains a MULTI_ENSEMBLER primitive and
    either (a) a classifier appears more than once before the ensembler, or
    (b) the same classifier combination (order-insensitive) was already seen.
    Non-ensemble pipelines are never rejected. ``ensemble_pipelines_hash`` is
    updated in place with the signature of newly accepted ensembles.
    """
    classifiers = []
    signature_parts = []
    found_ensembler = False
    duplicated = False

    for name in pipeline_primitives:
        kind = all_primitives[name]['type']
        if kind == 'CLASSIFIER':
            classifiers.append(name)
            continue
        signature_parts.append(name)
        if kind == 'MULTI_ENSEMBLER':
            found_ensembler = True
            # All base classifiers of an ensemble must be distinct.
            if len(set(classifiers)) != len(classifiers):
                duplicated = True

    if not found_ensembler:
        return False
    if duplicated:
        return True

    # Order-insensitive signature: sorted classifier names appended to the
    # remaining primitives in pipeline order.
    signature = ''.join(signature_parts) + ''.join(sorted(classifiers))
    if signature in ensemble_pipelines_hash:
        return True

    ensemble_pipelines_hash.add(signature)
    return False


def search_pipelines(X, y, scoring, splitting_strategy, task_name, automl_hyperparams, metadata, output_folder, verbose,
queue):
signal.signal(signal.SIGTERM, lambda signum, frame: signal_handler(queue, signum))
hide_logs(verbose) # Hide logs here too, since multiprocessing has some issues with loggers

builder = BaseBuilder(metadata, automl_hyperparams)
all_primitives = builder.all_primitives
ensemble_pipelines_hash = set()

def evaluate_pipeline(primitives, origin):
has_repeated_classifiers = check_repeated_classifiers(primitives, all_primitives, ensemble_pipelines_hash)

if has_repeated_classifiers:
logger.debug('Repeated classifiers detected in ensembles, ignoring pipeline')
return None

pipeline = builder.make_pipeline(primitives)
score = None

Expand Down
16 changes: 10 additions & 6 deletions alpha_automl/resource/base_grammar.bnf
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
S -> CLASSIFICATION_TASK | REGRESSION_TASK | CLUSTERING_TASK | TIME_SERIES_FORECAST_TASK | SEMISUPERVISED_TASK
CLASSIFICATION_TASK -> IMPUTER ENCODERS FEATURE_SCALER FEATURE_SELECTOR CLASSIFIER
CLASSIFICATION_TASK -> IMPUTER ENCODERS FEATURE_SCALER FEATURE_SELECTOR CLASSIFIER ENSEMBLER
REGRESSION_TASK -> IMPUTER ENCODERS FEATURE_SCALER FEATURE_SELECTOR REGRESSOR
CLUSTERING_TASK -> IMPUTER ENCODERS FEATURE_SCALER FEATURE_SELECTOR CLUSTERER
TIME_SERIES_FORECAST_TASK -> REGRESSION_TASK | IMPUTER TIME_SERIES_FORECAST
SEMISUPERVISED_TASK -> IMPUTER ENCODERS FEATURE_SCALER SEMISUPERVISED_CLASSIFIER CLASSIFIER | IMPUTER ENCODERS FEATURE_SCALER LABELPROPAGATION_CLASSIFIER
TIME_SERIES_FORECAST_TASK -> IMPUTER TIME_SERIES_FORECASTER | REGRESSION_TASK
SEMISUPERVISED_TASK -> IMPUTER ENCODERS FEATURE_SCALER SEMISUPERVISED_CLASSIFIER
NA_TASK -> CLASSIFICATION_TASK | REGRESSION_TASK | SEMISUPERVISED_TASK
ENCODERS -> TEXT_ENCODER DATETIME_ENCODER CATEGORICAL_ENCODER IMAGE_ENCODER
ENSEMBLER -> SINGLE_ENSEMBLER | CLASSIFIER CLASSIFIER MULTI_ENSEMBLER | E
SEMISUPERVISED_CLASSIFIER -> CLASSIFIER SEMISUPERVISED_SELFTRAINER | SEMISUPERVISED_LABELPROPAGATOR
IMPUTER -> 'primitive_terminal'
FEATURE_SCALER -> 'primitive_terminal' | 'E'
FEATURE_SELECTOR -> 'primitive_terminal' | 'E'
TEXT_ENCODER -> 'primitive_terminal'
CATEGORICAL_ENCODER -> 'primitive_terminal'
DATETIME_ENCODER -> 'primitive_terminal'
IMAGE_ENCODER -> 'primitive_terminal'
SINGLE_ENSEMBLER -> 'primitive_terminal'
MULTI_ENSEMBLER -> 'primitive_terminal'
CLASSIFIER -> 'primitive_terminal'
REGRESSOR -> 'primitive_terminal'
CLUSTERER -> 'primitive_terminal'
TIME_SERIES_FORECAST -> 'primitive_terminal'
SEMISUPERVISED_CLASSIFIER -> 'primitive_terminal'
LABELPROPAGATION_CLASSIFIER -> 'primitive_terminal'
TIME_SERIES_FORECASTER -> 'primitive_terminal'
SEMISUPERVISED_SELFTRAINER -> 'primitive_terminal'
SEMISUPERVISED_LABELPROPAGATOR -> 'primitive_terminal'
15 changes: 11 additions & 4 deletions alpha_automl/resource/primitives_hierarchy.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"CLASSIFIER": [
"sklearn.discriminant_analysis.LinearDiscriminantAnalysis",
"sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis",
"sklearn.ensemble.BaggingClassifier",
"sklearn.ensemble.ExtraTreesClassifier",
"sklearn.ensemble.GradientBoostingClassifier",
"sklearn.ensemble.RandomForestClassifier",
Expand Down Expand Up @@ -82,18 +81,26 @@
"COLUMN_TRANSFORMER": [
"sklearn.compose.ColumnTransformer"
],
"TIME_SERIES_FORECAST": [
"TIME_SERIES_FORECASTER": [
"alpha_automl.builtin_primitives.time_series_forecasting.ArimaEstimator",
"alpha_automl.builtin_primitives.time_series_forecasting.DeeparEstimator",
"alpha_automl.builtin_primitives.time_series_forecasting.NBEATSEstimator",
"alpha_automl.builtin_primitives.time_series_forecasting.NHITSEstimator"
],
"SEMISUPERVISED_CLASSIFIER": [
"SEMISUPERVISED_SELFTRAINER": [
"alpha_automl.builtin_primitives.semisupervised_classifier.AutonBox",
"sklearn.semi_supervised.SelfTrainingClassifier"
],
"LABELPROPAGATION_CLASSIFIER": [
"SEMISUPERVISED_LABELPROPAGATOR": [
"alpha_automl.builtin_primitives.semisupervised_classifier.SkLabelSpreading",
"alpha_automl.builtin_primitives.semisupervised_classifier.SkLabelPropagation"
],
"SINGLE_ENSEMBLER": [
"sklearn.ensemble.AdaBoostClassifier",
"sklearn.ensemble.BaggingClassifier"
],
"MULTI_ENSEMBLER": [
"sklearn.ensemble.StackingClassifier",
"sklearn.ensemble.VotingClassifier"
]
}
22 changes: 21 additions & 1 deletion alpha_automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
import torch
from enum import Enum
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit, train_test_split
Expand All @@ -24,6 +25,25 @@
RANDOM_SEED = 0


class PrimitiveType(Enum):
    """Categories of pipeline primitives.

    Each member's value equals its name, so members can be looked up from the
    type strings used elsewhere (e.g. ``PrimitiveType('CLASSIFIER')``).
    The names mirror the keys of the primitives hierarchy — presumably kept in
    sync with primitives_hierarchy.json; verify when adding members.
    """
    IMPUTER = 'IMPUTER'
    CATEGORICAL_ENCODER = 'CATEGORICAL_ENCODER'
    DATETIME_ENCODER = 'DATETIME_ENCODER'
    TEXT_ENCODER = 'TEXT_ENCODER'
    IMAGE_ENCODER = 'IMAGE_ENCODER'
    FEATURE_SCALER = 'FEATURE_SCALER'
    FEATURE_SELECTOR = 'FEATURE_SELECTOR'
    COLUMN_TRANSFORMER = 'COLUMN_TRANSFORMER'
    TIME_SERIES_FORECASTER = 'TIME_SERIES_FORECASTER'
    CLASSIFIER = 'CLASSIFIER'
    REGRESSOR = 'REGRESSOR'
    CLUSTERER = 'CLUSTERER'
    SINGLE_ENSEMBLER = 'SINGLE_ENSEMBLER'
    MULTI_ENSEMBLER = 'MULTI_ENSEMBLER'
    SEMISUPERVISED_SELFTRAINER = 'SEMISUPERVISED_SELFTRAINER'
    SEMISUPERVISED_LABELPROPAGATOR = 'SEMISUPERVISED_LABELPROPAGATOR'


def create_object(import_path, class_params=None):
if class_params is None:
class_params = {}
Expand Down Expand Up @@ -130,7 +150,7 @@ def make_d3m_pipelines(pipelines, new_primitives, metric, ordering_sign, source_
cur_step_idx = add_d3m_step(steps_in_type, cur_step_idx, prev_list, new_prev_list, new_pipeline)
prev_list = new_prev_list

if all_primitive_types[step_id] == 'SEMISUPERVISED_CLASSIFIER':
if all_primitive_types[step_id] == 'SEMISUPERVISED_SELFTRAINER':
classifier_object = step_object.base_estimator
classifier_path = f'classifier.{classifier_object.__class__.__name__}'
for primitive_name, primitive_type in all_primitive_types.items():
Expand Down
2 changes: 1 addition & 1 deletion scripts/amlb/automl_job.SBATCH
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
#SBATCH --output logs/automl_job_%J.out
#SBATCH [email protected]

singularity exec --bind /scratch/rl3725/alphaautoml_experiments/experiments:/scratch/rl3725/alphaautoml_experiments/experiments --overlay overlay-15GB-500K.ext3:ro /scratch/work/public/singularity/ubuntu-20.04.4.sif /bin/bash -c "source /ext3/env.sh; python automlbenchmark/runbenchmark.py ${1} ${2} 1h4c -f 0 -u user_config/ -i openml_datasets/ -o results/"
singularity exec --overlay overlay-50G-10M.ext3:ro /scratch/work/public/singularity/ubuntu-22.04.sif /bin/bash -c "source /ext3/env.sh; python automlbenchmark/runbenchmark.py ${1} ${2} 1h4c -f 0 -u user_config/ -i openml_datasets/ -o results/"
Loading

0 comments on commit d77a9fe

Please sign in to comment.