Skip to content

Commit

Permalink
add metafeatures and fix grammar
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Feb 19, 2024
1 parent 5c490dd commit 8bbbb05
Showing 1 changed file with 115 additions and 62 deletions.
177 changes: 115 additions & 62 deletions alpha_automl/pipeline_synthesis/setup_search.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,74 @@
import logging
import os
import sys
import signal
import logging
import sys
from os.path import join
from alpha_automl.grammar_loader import load_automatic_grammar, load_manual_grammar

from alpha_automl.grammar_loader import (load_automatic_grammar,
load_manual_grammar)
from alpha_automl.pipeline import Pipeline
from alpha_automl.pipeline_search.Coach import Coach
from alpha_automl.pipeline_search.RlLib import pipeline_search_rllib
from alpha_automl.pipeline_search.pipeline.NNet import NNetWrapper
from alpha_automl.pipeline_search.pipeline.PipelineGame import PipelineGame
from alpha_automl.pipeline_search.RlLib import pipeline_search_rllib
from alpha_automl.pipeline_synthesis.pipeline_builder import BaseBuilder
from alpha_automl.scorer import score_pipeline
from alpha_automl.utils import hide_logs


logger = logging.getLogger(__name__)


# Base configuration for the pipeline search. `update_config` fills in the
# per-run entries (problem, metric, dataset, grammar, output paths) on top of
# these defaults.
config = {
    # Integer ids for the supported problem types.
    "PROBLEM_TYPES": {
        "CLASSIFICATION": 1,
        "REGRESSION": 2,
        "CLUSTERING": 3,
        "NA": 4,
        "TIME_SERIES_FORECAST": 5,
        "SEMISUPERVISED": 6,
    },
    # Integer ids for the supported data types.
    "DATA_TYPES": {"TABULAR": 1, "GRAPH": 2, "IMAGE": 3},
    "PIPELINE_SIZE": 8,
    # Search/training arguments; NOTE(review): presumably consumed by the
    # Coach / MCTS / NNet components -- confirm against their readers.
    "ARGS": {
        "numIters": 25,
        "numEps": 5,
        "tempThreshold": 15,
        "updateThreshold": 0.6,
        "maxlenOfQueue": 200000,
        "numMCTSSims": 5,
        "arenaCompare": 40,
        "cpuct": 1,
        "load_model": False,
        "metafeatures_path": "/d3m/data/metafeatures",
        "verbose": True,
    },
}


def signal_handler(queue, signum):
    """Announce completion on *queue* and terminate the current process.

    Called from the SIGTERM handler installed in ``search_pipelines``.

    Args:
        queue: Object exposing ``append``; receives the ``"DONE"`` marker.
        signum: Number of the received signal (only used for logging).

    Raises:
        SystemExit: Always, with exit code 0.
    """
    logger.debug(f"Receiving signal {signum}, terminating process")
    # Publish the end-of-search marker exactly once before exiting.
    queue.append("DONE")
    # TODO: Should it save the last status of the NN model?
    sys.exit(0)


def search_pipelines(X, y, scoring, splitting_strategy, task_name, time_bound, automl_hyperparams, metadata,
output_folder, verbose, queue):
def search_pipelines(
X,
y,
scoring,
splitting_strategy,
task_name,
time_bound,
automl_hyperparams,
metadata,
output_folder,
verbose,
queue,
):
signal.signal(signal.SIGTERM, lambda signum, frame: signal_handler(queue, signum))
hide_logs(verbose) # Hide logs here too, since multiprocessing has some issues with loggers
hide_logs(
verbose
) # Hide logs here too, since multiprocessing has some issues with loggers

builder = BaseBuilder(metadata, automl_hyperparams)

Expand All @@ -73,20 +87,20 @@ def evaluate_pipeline(primitives, origin):
return score

if task_name is None:
task_name = 'NA'
task_name = "NA"

task_name_id = task_name + '_TASK'
use_automatic_grammar = automl_hyperparams['use_automatic_grammar']
include_primitives = automl_hyperparams['include_primitives']
exclude_primitives = automl_hyperparams['exclude_primitives']
new_primitives = automl_hyperparams['new_primitives']
task_name_id = task_name + "_TASK"
use_automatic_grammar = automl_hyperparams["use_automatic_grammar"]
include_primitives = automl_hyperparams["include_primitives"]
exclude_primitives = automl_hyperparams["exclude_primitives"]
new_primitives = automl_hyperparams["new_primitives"]
grammar = None

if use_automatic_grammar:
logger.debug('Creating an automatic grammar')
prioritize_primitives = automl_hyperparams['prioritize_primitives']
target_column = ''
dataset_path = ''
logger.debug("Creating an automatic grammar")
prioritize_primitives = automl_hyperparams["prioritize_primitives"]
target_column = ""
dataset_path = ""
grammar = load_automatic_grammar(
task_name_id,
dataset_path,
Expand All @@ -97,9 +111,9 @@ def evaluate_pipeline(primitives, origin):
)

if grammar is None:
logger.debug('Creating a manual grammar')
use_imputer = metadata['missing_values']
nonnumeric_columns = metadata['nonnumeric_columns']
logger.debug("Creating a manual grammar")
use_imputer = metadata["missing_values"]
nonnumeric_columns = metadata["nonnumeric_columns"]
grammar = load_manual_grammar(
task_name_id,
nonnumeric_columns,
Expand All @@ -110,30 +124,69 @@ def evaluate_pipeline(primitives, origin):
)

metric = scoring._score_func.__name__
config_updated = update_config(task_name, metric, output_folder, grammar)
config_updated = update_config(task_name, metric, output_folder, grammar, metadata)
game = PipelineGame(config_updated, evaluate_pipeline)
pipeline_search_rllib(game, time_bound)

logger.debug('Search completed')
queue.append('DONE')
logger.debug("Search completed")
queue.append("DONE")


def update_config(task_name, metric, output_folder, grammar, metadata=None):
    """Fill the module-level ``config`` with the settings of the current run.

    The shared ``config`` dict is mutated in place and also returned.

    Args:
        task_name: Name of the task; stored as the problem and used to build
            the dataset name and the steps-file name.
        metric: Name of the scoring function.
        output_folder: Folder under which the steps file and the ``nn_models``
            checkpoint folder are located.
        grammar: Grammar object stored under ``config["GRAMMAR"]``.
        metadata: Dataset metadata passed to ``compute_metafeatures``. When
            omitted, the metafeature vector is all zeros.

    Returns:
        The updated ``config`` dict.
    """
    n_metafeatures = 50  # fixed length of the DATASET_METAFEATURES vector
    dataset_name = f"DATASET_{task_name}"

    config["PROBLEM"] = task_name
    config["DATA_TYPE"] = "TABULAR"
    config["METRIC"] = metric
    config["DATASET"] = dataset_name
    config["ARGS"]["stepsfile"] = join(
        output_folder, f"DATASET_{task_name}_pipeline_steps.txt"
    )
    config["ARGS"]["checkpoint"] = join(output_folder, "nn_models")
    config["ARGS"]["load_folder_file"] = join(
        output_folder, "nn_models", "best.pth.tar"
    )
    config["GRAMMAR"] = grammar

    metafeatures = [] if metadata is None else compute_metafeatures(metadata)
    # Zero-pad (and clip) so the vector always has exactly n_metafeatures
    # entries, whatever number of metafeatures was computed.
    padded = list(metafeatures) + [0] * n_metafeatures
    config["DATASET_METAFEATURES"] = padded[:n_metafeatures]
    return config


def compute_metafeatures(metadata):
    """Build a small list of integer metafeatures from the dataset metadata.

    Layout of the returned list:
        [0] 1 if ``metadata["missing_values"]`` is truthy, else 0 (IMPUTE)
        [1] 1 if there are non-numeric columns, else 0 (ENCODE)
        [2:6] only when [1] is 1: number of entries registered under
            ``TEXT_ENCODER``, ``CATEGORICAL_ENCODER``, ``DATETIME_ENCODER``
            and ``IMAGE_ENCODER`` (0 for an absent key)

    Args:
        metadata: Mapping with the keys ``"missing_values"`` and
            ``"nonnumeric_columns"``. The latter maps encoder names to sized
            collections (presumably column lists -- only ``len`` is used).

    Returns:
        A list of 2 ints when there are no non-numeric columns, otherwise a
        list of 6 ints.
    """
    encoder_keys = (
        "TEXT_ENCODER",
        "CATEGORICAL_ENCODER",
        "DATETIME_ENCODER",
        "IMAGE_ENCODER",
    )

    # IMPUTE
    metafeatures = [1 if metadata["missing_values"] else 0]

    # ENCODE
    nonnumeric_columns = metadata["nonnumeric_columns"]
    if not nonnumeric_columns:  # empty mapping (or None): nothing to encode
        metafeatures.append(0)
        return metafeatures

    metafeatures.append(1)
    # Each count is looked up under its own key, so a missing encoder type
    # yields 0 regardless of which other encoder types are present.
    metafeatures.extend(len(nonnumeric_columns.get(key, ())) for key in encoder_keys)
    return metafeatures

0 comments on commit 8bbbb05

Please sign in to comment.