feat: Add math features #101

Open
wants to merge 3 commits into base: devel
72 changes: 72 additions & 0 deletions alpha_automl/builtin_primitives/math_features.py
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from alpha_automl.base_primitive import BasePrimitive
from feature_engine.creation import MathFeatures

class MathFeaturesSum(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='sum')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesMean(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='mean')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesStd(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='std')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)

class MathFeaturesProd(BasePrimitive):
    def __init__(self, numeric_columns, column_names):
        self.column_names = column_names
        self.numeric_columns = numeric_columns
        self.math_features = MathFeatures(variables=self.numeric_columns, func='prod')

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        self.math_features.fit(X)
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.column_names)
        return self.math_features.transform(X)
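
For reviewers, a minimal usage sketch of one of the new primitives, assuming a toy DataFrame (the data and column names below are invented for illustration; the exact name of the generated column follows feature_engine's MathFeatures naming convention and may vary by version):

import pandas as pd
from alpha_automl.builtin_primitives.math_features import MathFeaturesSum

# Two numeric columns; the primitive appends their row-wise sum as a new feature.
X = pd.DataFrame({'age': [25, 32, 47], 'fare': [7.25, 71.28, 8.05]})
primitive = MathFeaturesSum(numeric_columns=['age', 'fare'], column_names=['age', 'fare'])
primitive.fit(X)
X_out = primitive.transform(X)  # original columns plus a generated column, e.g. 'sum_age_fare'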
7 changes: 6 additions & 1 deletion alpha_automl/data_profiler.py
@@ -12,7 +12,7 @@


def profile_data(X):
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False}
metadata = {'nonnumeric_columns': {}, 'useless_columns': [], 'missing_values': False, 'numeric_columns': [], 'categorical_columns': [], 'column_names': []}
mapping_encoders = {CATEGORICAL_COLUMN: 'CATEGORICAL_ENCODER', DATETIME_COLUMN: 'DATETIME_ENCODER',
TEXT_COLUMN: 'TEXT_ENCODER', IMAGE_COLUMN: 'IMAGE_ENCODER'}

@@ -43,6 +43,11 @@ def profile_data(X):
if 'missing_values_ratio' in profiled_column:
metadata['missing_values'] = True

metadata['numeric_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['int64', 'float64']]
metadata['categorical_columns'] = [(index_column, column_name) for index_column, column_name in enumerate(X.columns) if X[column_name].dtype in ['object', 'category']]

metadata['column_names'] = list(X.columns)

logger.debug(f'Results of profiling data: non-numeric features = {str(metadata["nonnumeric_columns"].keys())}, '
f'useless columns = {str(metadata["useless_columns"])}, '
f'missing values = {str(metadata["missing_values"])}')
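A quick sketch of what the new metadata fields would contain for a toy frame, given the dtype checks added above (the frame and column names are invented for illustration):

import pandas as pd
from alpha_automl.data_profiler import profile_data

X = pd.DataFrame({'age': [25, 32, 47], 'city': ['NY', 'LA', 'SF']})
metadata = profile_data(X)
# With the additions above, we would expect:
# metadata['numeric_columns']     -> [(0, 'age')]   # int64 dtype
# metadata['categorical_columns'] -> [(1, 'city')]  # object dtype
# metadata['column_names']        -> ['age', 'city']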
9 changes: 5 additions & 4 deletions alpha_automl/pipeline_search/agent_lab.py
@@ -21,11 +21,11 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
ray.init(local_mode=True, logging_level=logging.CRITICAL)
num_cpus = int(ray.available_resources()["CPU"])

# load checkpoint or create a new one
# Load checkpoint or create a new one
algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
logger.debug("Create Algo object done")

# train model
# Train model
train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder)
logger.debug("Training done")
ray.shutdown()
@@ -83,20 +83,21 @@ def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_
if (
time.time() > timeout
or (best_unchanged_iter >= 10 and result["episode_reward_mean"] >= 0)
# or result["episode_reward_mean"] >= 70
):
logger.debug(f"Training timeout reached")
break

if contain_checkpoints(checkpoint_save_folder):
# Load the most recent weights
weights = load_rllib_policy_weights(checkpoint_save_folder)
algo.set_weights(weights)
elif contain_checkpoints(checkpoint_load_folder):
weights = load_rllib_policy_weights(checkpoint_load_folder)
algo.set_weights(weights)
result = algo.train()
logger.debug(pretty_print(result))
# stop training of the target train steps or reward are reached

# Stop training if the target train steps or reward are reached
if result["episode_reward_mean"] > last_best:
last_best = result["episode_reward_mean"]
best_unchanged_iter = 1
11 changes: 11 additions & 0 deletions alpha_automl/pipeline_synthesis/pipeline_builder.py
@@ -8,6 +8,7 @@
from alpha_automl.utils import create_object, COLUMN_TRANSFORMER_ID, COLUMN_SELECTOR_ID, NATIVE_PRIMITIVE, \
ADDED_PRIMITIVE
from alpha_automl.primitive_loader import PRIMITIVE_TYPES
from feature_engine.creation import MathFeatures

logger = logging.getLogger(__name__)

@@ -87,6 +88,8 @@ def make_primitive_objects(self, primitives):
transformers = []
nonnumeric_columns = self.metadata['nonnumeric_columns']
useless_columns = self.metadata['useless_columns']
numeric_columns = self.metadata['numeric_columns']
column_names = self.metadata['column_names']

if len(useless_columns) > 0 and len(nonnumeric_columns) == 0: # Add the transformer to the first step
selector = (COLUMN_SELECTOR_ID, 'drop', [col_index for col_index, _ in useless_columns])
@@ -105,6 +108,8 @@
elif primitive_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or primitive_type == 'REGRESSION_MULTI_ENSEMBLER':
estimators = extract_estimators(pipeline_primitives, self.all_primitives)
primitive_object = create_object(primitive_name, {'estimators': estimators})
elif "alpha_automl.builtin_primitives.math_features" in primitive_name:
primitive_object = create_object(primitive_name, {'numeric_columns': [column_name for _, column_name in numeric_columns], 'column_names': column_names})
elif self.all_primitives[primitive_name]['origin'] == NATIVE_PRIMITIVE: # It's an installed primitive
primitive_object = create_object(primitive_name, EXTRA_PARAMS.get(primitive_name, None))
else:
@@ -114,6 +119,8 @@

if primitive_type in nonnumeric_columns: # Create a new transformer and add it to the list
transformers += self.create_transformers(primitive_object, primitive_name, primitive_type)
elif primitive_type == 'FEATURE_GENERATOR':
transformers += self.create_transformers(primitive_object, primitive_name, primitive_type)
else:
if len(transformers) > 0: # Add previous transformers to the pipeline
if len(useless_columns) > 0:
@@ -129,13 +136,17 @@
def create_transformers(self, primitive_object, primitive_name, primitive_type):
column_transformers = []
nonnumeric_columns = self.metadata['nonnumeric_columns']
numeric_columns = self.metadata['numeric_columns']

if primitive_type == 'TEXT_ENCODER':
column_transformers = [(f'{primitive_name}-{col_name}', primitive_object, col_index) for
col_index, col_name in nonnumeric_columns[primitive_type]]
elif primitive_type == 'CATEGORICAL_ENCODER' or primitive_type == 'DATETIME_ENCODER' or primitive_type == 'IMAGE_ENCODER':
column_transformers = [(primitive_name, primitive_object, [col_index for col_index, _
in nonnumeric_columns[primitive_type]])]
elif primitive_type == 'FEATURE_GENERATOR':
column_transformers = [(primitive_name, primitive_object, [col_index for col_index, _
in numeric_columns])]

return column_transformers
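
For context, the new FEATURE_GENERATOR branch produces a standard scikit-learn ColumnTransformer entry that applies the math-features primitive only to the numeric column indices. A rough standalone equivalent, with invented column data; the remainder setting is an assumption, not taken from this PR:

from sklearn.compose import ColumnTransformer
from alpha_automl.builtin_primitives.math_features import MathFeaturesMean

# (index, name) pairs in the shape produced by profile_data; values are illustrative.
numeric_columns = [(0, 'age'), (1, 'fare')]
column_names = ['age', 'fare']

primitive = MathFeaturesMean(numeric_columns=[name for _, name in numeric_columns],
                             column_names=column_names)
transformer = ColumnTransformer([('MathFeaturesMean', primitive,
                                  [idx for idx, _ in numeric_columns])],
                                remainder='passthrough')  # assumption: keep non-numeric columns untouched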

2 changes: 1 addition & 1 deletion alpha_automl/pipeline_synthesis/setup_search.py
@@ -92,7 +92,7 @@ def evaluate_pipeline(primitives):
checkpoint_save_folder = (
checkpoints_folder
if checkpoints_folder is not None
else DEFAULT_CHECKPOINT_PATH
else output_folder
)
game = PipelineGame(config_updated, evaluate_pipeline)
pipeline_search_rllib(
10 changes: 8 additions & 2 deletions alpha_automl/resource/primitives_hierarchy.json
@@ -20,7 +20,11 @@
"alpha_automl.builtin_primitives.image_encoder.HogTransformer"
],
"FEATURE_GENERATOR": [
"sklearn.preprocessing.PolynomialFeatures"
"sklearn.preprocessing.PolynomialFeatures",
"alpha_automl.builtin_primitives.math_features.MathFeaturesSum",
"alpha_automl.builtin_primitives.math_features.MathFeaturesMean",
"alpha_automl.builtin_primitives.math_features.MathFeaturesProd",
"alpha_automl.builtin_primitives.math_features.MathFeaturesStd"
],
"FEATURE_SCALER": [
"sklearn.preprocessing.MaxAbsScaler",
@@ -30,7 +34,9 @@
"FEATURE_SELECTOR": [
"sklearn.feature_selection.GenericUnivariateSelect",
"sklearn.feature_selection.SelectPercentile",
"sklearn.feature_selection.SelectKBest"
"sklearn.feature_selection.SelectKBest",
"feature_engine.selection.SmartCorrelatedSelection",
"feature_engine.selection.DropHighPSIFeatures"
],
"COLUMN_TRANSFORMER": [
"sklearn.compose.ColumnTransformer"
9 changes: 8 additions & 1 deletion tests/test_data_profiler.py
@@ -14,6 +14,13 @@ def test_profile_data():
(5, 'country'), (8, 'duration'), (9, 'listed_in'),
(10, 'description')],
'DATETIME_ENCODER': [(6, 'date_added')]},
'useless_columns': [], 'missing_values': True}
'useless_columns': [], 'missing_values': True,
'numeric_columns': [(0, 'show_id'), (7, 'release_year')],
'categorical_columns': [(1, 'type'), (2, 'title'), (3, 'director'), (4, 'cast'),
(5, 'country'), (6, 'date_added'), (8, 'duration'), (9, 'listed_in'),
(10, 'description')],
'column_names': ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
'release_year', 'duration', 'listed_in', 'description'],
}

assert actual_metadata == expected_metadata