Skip to content

Commit

Permalink
sample by memory usage
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed Jun 24, 2024
1 parent d7e00ee commit 77a2f8d
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion alpha_automl/automl_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
INCLUDE_PRIMITIVES = []
NEW_PRIMITIVES = {}
SPLITTING_STRATEGY = 'holdout'
SAMPLE_SIZE = 100000
SAMPLE_SIZE = 1000000  # 1 MB (in bytes)
MAX_RUNNING_PROCESSES = multiprocessing.cpu_count()

logger = logging.getLogger(__name__)
Expand Down
15 changes: 8 additions & 7 deletions alpha_automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,22 @@ def create_object(import_path, class_params=None):


def sample_dataset(X, y, sample_size, task):
original_rows = len(X)
original_cols = len(X.columns)
original_size = original_rows * original_cols
# original_rows = len(X)
# original_cols = len(X.columns)
original_mem_usage = X.memory_usage(index=True).sum()
# original_size = original_rows * original_cols
shuffle = True
if task == 'TIME_SERIES_FORECAST':
shuffle = False

if original_size > sample_size:
ratio = sample_size / original_size
if original_mem_usage > sample_size:
ratio = sample_size / original_mem_usage
try:
_, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, stratify=y, shuffle=shuffle)
except Exception:
# Fall back to non-stratified sampling when the minority class has too few instances to be stratified across the split
_, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, shuffle=shuffle)
logger.debug(f'Sampling down data from {original_size} to {len(X_test)}')
logger.debug(f'Sampling down data from {len(X)} to {len(X_test)}')
if isinstance(X_test, pd.DataFrame):
X_test = X_test.reset_index(drop=True)

Expand All @@ -83,7 +84,7 @@ def sample_dataset(X, y, sample_size, task):
return X_test, y_test, True

else:
logger.debug('Not doing sampling for small dataset (size = %d)', original_size)
logger.debug('Not doing sampling for small dataset (size = %d)', len(X))
return X, y, False


Expand Down

0 comments on commit 77a2f8d

Please sign in to comment.