sample by memory usage

VIDA-NYU · Jun 24, 2024 · 77a2f8d · 77a2f8d
1 parent d7e00ee
commit 77a2f8d
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 8 deletions.
diff --git a/alpha_automl/automl_manager.py b/alpha_automl/automl_manager.py
@@ -14,7 +14,7 @@
 INCLUDE_PRIMITIVES = []
 NEW_PRIMITIVES = {}
 SPLITTING_STRATEGY = 'holdout'
-SAMPLE_SIZE = 100000
+SAMPLE_SIZE = 1000000  # 1 MB (compared against DataFrame memory usage in bytes)
 MAX_RUNNING_PROCESSES = multiprocessing.cpu_count()
 
 logger = logging.getLogger(__name__)

diff --git a/alpha_automl/utils.py b/alpha_automl/utils.py
@@ -59,21 +59,22 @@ def create_object(import_path, class_params=None):
 
 
 def sample_dataset(X, y, sample_size, task):
-    original_rows = len(X)
-    original_cols = len(X.columns)
-    original_size = original_rows * original_cols
+    # original_rows = len(X)
+    # original_cols = len(X.columns)
+    original_mem_usage = X.memory_usage(index=True).sum()
+    # original_size = original_rows * original_cols
     shuffle = True
     if task == 'TIME_SERIES_FORECAST':
         shuffle = False
 
-    if original_size > sample_size:
-        ratio = sample_size / original_size
+    if original_mem_usage > sample_size:
+        ratio = sample_size / original_mem_usage
         try:
             _, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, stratify=y, shuffle=shuffle)
         except Exception:
             # Not using stratified sampling when the minority class has few instances, not enough for all the folds
             _, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, shuffle=shuffle)
-        logger.debug(f'Sampling down data from {original_size} to {len(X_test)}')
+        logger.debug(f'Sampling down data from {len(X)} to {len(X_test)}')
         if isinstance(X_test, pd.DataFrame):
             X_test = X_test.reset_index(drop=True)
 
@@ -83,7 +84,7 @@ def sample_dataset(X, y, sample_size, task):
         return X_test, y_test, True
 
     else:
-        logger.debug('Not doing sampling for small dataset (size = %d)', original_size)
+        logger.debug('Not doing sampling for small dataset (size = %d)', len(X))
         return X, y, False