feat: Shuffle between epochs (#456)

This PR introduces a `shuffle` option for training: if `True`, we shuffle the order of the partitions and the keys within each partition between epochs. Note that, as described in #460, we might need to make this more fine-grained for datasets like Criteo to optimize performance.
eth-easl · Jun 4, 2024 · db5d38f · db5d38f
1 parent 5982bad
commit db5d38f
Show file tree

Hide file tree

Showing 38 changed files with 320 additions and 201 deletions.
diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 128
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 96
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml
@@ -17,6 +17,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml
@@ -16,6 +16,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -17,6 +17,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: True
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/integrationtests/online_dataset/test_online_dataset.py b/integrationtests/online_dataset/test_online_dataset.py
@@ -6,7 +6,7 @@
 import random
 import shutil
 import time
-from typing import Iterable, Tuple
+from typing import Iterable, Optional, Tuple
 
 import grpc
 import modyn.storage.internal.grpc.generated.storage_pb2 as storage_pb2
@@ -275,6 +275,8 @@ def test_dataset_impl(
     pipeline_id: int,
     trigger_id: int,
     items: list[int],
+    shuffle: bool,
+    consistency_check: bool,
 ) -> None:
     dataloader, _ = prepare_dataloaders(
         pipeline_id,
@@ -289,6 +291,7 @@ def test_dataset_impl(
         42,
         prefetched_partitions,
         parallel_prefetch_requests,
+        shuffle,
         None,
         None,
     )
@@ -326,7 +329,7 @@ def test_dataset_impl(
         + f"expected_min = {expected_min_batches}, expected_max = {expected_max_batches}"
     )
 
-    assert set(all_samples) == set(items)
+    assert set(all_samples) == set(items), f"all_samples = {all_samples} \n\n items = {items}"
     assert set(all_labels) == set(range(len(items)))
 
     trans = transforms.Compose([transforms.ToPILImage()])
@@ -339,6 +342,47 @@ def test_dataset_impl(
         if image_bytes not in FIRST_ADDED_IMAGES:
             raise ValueError(f"Could not find image {idx} in created images, all_samples = {all_samples}")
 
+    if not consistency_check:
+        return
+
+    print("Iterating again to check across epochs.")
+
+    second_samples = []
+    second_data = []
+    second_labels = []
+
+    for batch_number, batch in enumerate(dataloader):
+        sample_ids = batch[0]
+        if isinstance(sample_ids, torch.Tensor):
+            sample_ids = sample_ids.tolist()
+        elif isinstance(sample_ids, tuple):
+            sample_ids = list(sample_ids)
+
+        assert isinstance(sample_ids, list), "Cannot parse result from DataLoader"
+        assert isinstance(batch[1], torch.Tensor) and isinstance(batch[2], torch.Tensor)
+
+        second_samples.extend(sample_ids)
+        for sample in batch[1]:
+            second_data.append(sample)  # iterate over batch dimension to extract samples
+        second_labels.extend(batch[2].tolist())
+
+    # Same content, but not same order
+    # (even without shuffle, the storage may return samples in a slightly different order)
+
+    assert set(second_samples) == set(
+        all_samples
+    ), f"second_samples = {second_samples} \n\n all_samples = {all_samples}"
+    assert set(second_labels) == set(all_labels), f"second_labels = {second_labels} \n\n all_labels = {all_labels}"
+    for data1 in second_data:
+        assert any(torch.allclose(data1, data2) for data2 in all_data)
+
+    # when shuffling, we expect a different order
+
+    if shuffle:
+        assert second_samples != all_samples, f"second_samples = {second_samples} \n\n all_samples = {all_samples}"
+        assert not all(torch.allclose(data1, data2) for data1, data2 in zip(second_data, all_data))
+        assert second_labels != all_labels, f"second_labels = {second_labels} \n\n all_labels = {all_labels}"
+
 
 def test_dataset() -> None:
     NUM_IMAGES = 10
@@ -359,22 +403,35 @@ def test_dataset() -> None:
             if prefetched_partitions == 5:
                 ppr_list = [1, 2, 5, 999]
 
+            # By default, we test neither shuffling nor cross-epoch consistency.
+            # We only test them in selected cases to avoid blowing up the test further.
+            shuffles = [False]
+            consistency_checks = [False]
+            if num_dataworkers in [0, 4] and prefetched_partitions in [0, 4]:
+                shuffles = [False, True]
+                consistency_checks = [True]
+
             for parallel_prefetch_requests in ppr_list:
                 for batch_size in [1, 2, 10]:
-                    print(
-                        f"Testing num_workers = {num_dataworkers}, partitions = {prefetched_partitions},"
-                        + f"batch_size = {batch_size}, parallel_prefetch_requests={parallel_prefetch_requests}"
-                    )
-                    test_dataset_impl(
-                        num_dataworkers,
-                        batch_size,
-                        prefetched_partitions,
-                        parallel_prefetch_requests,
-                        pipeline_id,
-                        trigger_id,
-                        keys,
-                    )
-                    gc.collect()
+                    for consistency_check in consistency_checks:
+                        for shuffle in shuffles:
+                            print(
+                                f"Testing num_workers = {num_dataworkers}, partitions = {prefetched_partitions},"
+                                + f"batch_size = {batch_size}, parallel_prefetch_requests={parallel_prefetch_requests}"
+                                + f" consistency_check = {consistency_check} shuffle = {shuffle}"
+                            )
+                            test_dataset_impl(
+                                num_dataworkers,
+                                batch_size,
+                                prefetched_partitions,
+                                parallel_prefetch_requests,
+                                pipeline_id,
+                                trigger_id,
+                                keys,
+                                shuffle,
+                                consistency_check,
+                            )
+                            gc.collect()
 
 
 def main() -> None:

diff --git a/modyn/config/examples/example-pipeline.yaml b/modyn/config/examples/example-pipeline.yaml
@@ -24,6 +24,7 @@ training:
   use_previous_model: True
   initial_model: random
   batch_size: 64
+  shuffle: False
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/modyn/config/schema/pipeline/pipeline.py b/modyn/config/schema/pipeline/pipeline.py
@@ -282,6 +282,12 @@ class TrainingConfig(ModynBaseModel):
         description="The number of data loader workers on the trainer node that fetch data from storage.", ge=1
     )
     batch_size: int = Field(description="The batch size to be used during training.", ge=1)
+    shuffle: bool = Field(
+        description=(
+            "If True, we shuffle the order of partitions and the data within each partition at each worker."
+            "Otherwise, the output order is deterministic."
+        )
+    )
     use_previous_model: bool = Field(
         description=(
             "If True, on trigger, we continue training on the model outputted by the previous trigger. If False, "

diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto
@@ -55,6 +55,7 @@ message StartTrainingRequest {
   optional int32 seed = 21;
   optional PythonString tokenizer = 22;
   int64 num_samples_to_pass = 23;
+  bool shuffle = 24;
 }
 
 message StartTrainingResponse {

diff --git a/modyn/selector/internal/selector_strategies/abstract_selection_strategy.py b/modyn/selector/internal/selector_strategies/abstract_selection_strategy.py
@@ -201,7 +201,6 @@ def store_training_set(
 
             swt.start("store_triggersamples", overwrite=True)
             if insertion_threads == 1:
-
                 AbstractSelectionStrategy._store_triggersamples_impl(
                     partition,
                     target_trigger_id,

diff --git a/modyn/supervisor/internal/triggers/datadrifttrigger.py b/modyn/supervisor/internal/triggers/datadrifttrigger.py
@@ -220,6 +220,7 @@ def _init_dataloader_info(self) -> None:
             selector_address=f"{self.context.modyn_config.selector.address}",
             num_prefetched_partitions=training_config.num_prefetched_partitions,
             parallel_prefetch_requests=training_config.parallel_prefetch_requests,
+            shuffle=training_config.shuffle,
             tokenizer=data_config.tokenizer,
         )
 

diff --git a/modyn/supervisor/internal/triggers/trigger.py b/modyn/supervisor/internal/triggers/trigger.py
@@ -16,7 +16,6 @@ class TriggerContext:
 
 
 class Trigger(ABC):
-
     # pylint: disable=unnecessary-pass
     def init_trigger(self, context: TriggerContext) -> None:
         """The supervisor initializes the concrete Trigger with Trigger-type-specific configurations

diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py b/modyn/supervisor/internal/triggers/trigger_datasets/dataloader_info.py
@@ -15,6 +15,7 @@ def __init__(
         selector_address: str,
         num_prefetched_partitions: int,
         parallel_prefetch_requests: int,
+        shuffle: bool,
         tokenizer: Optional[str],
     ):
         self.pipeline_id = pipeline_id
@@ -29,3 +30,4 @@ def __init__(
         self.parallel_prefetch_requests = parallel_prefetch_requests
         self.tokenizer = tokenizer
         self.training_id = -1
+        self.shuffle = shuffle
diff --git a/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py b/modyn/supervisor/internal/triggers/trigger_datasets/online_trigger_dataset.py
@@ -31,6 +31,7 @@ def __init__(
         training_id: int,
         num_prefetched_partitions: int,
         parallel_prefetch_requests: int,
+        shuffle: bool,
         tokenizer: Optional[str] = None,
         sample_prob: Optional[float] = None,
     ):
@@ -46,6 +47,7 @@ def __init__(
             training_id,
             num_prefetched_partitions,
             parallel_prefetch_requests,
+            shuffle,
             tokenizer,
             None,
         )

diff --git a/modyn/supervisor/internal/triggers/utils.py b/modyn/supervisor/internal/triggers/utils.py
@@ -32,6 +32,7 @@ def prepare_trigger_dataloader_by_trigger(
         dataloader_info.training_id,
         dataloader_info.num_prefetched_partitions,
         dataloader_info.parallel_prefetch_requests,
+        dataloader_info.shuffle,
         dataloader_info.tokenizer,
         sample_prob,
     )

diff --git a/modyn/tests/conftest.py b/modyn/tests/conftest.py
@@ -170,6 +170,7 @@ def pipeline_training_config() -> TrainingConfig:
         ],
         optimization_criterion=OptimizationCriterion(name="CrossEntropyLoss"),
         checkpointing=CheckpointingConfig(activated=False),
+        shuffle=False,
     )
 
 

diff --git a/modyn/tests/selector/internal/storage_backend/utils.py b/modyn/tests/selector/internal/storage_backend/utils.py
@@ -4,7 +4,6 @@
 
 
 class MockStorageBackend(AbstractStorageBackend):
-
     # pylint: disable=super-init-not-called
     def __init__(self, pipeline_id: int, modyn_config: dict, maximum_keys_in_memory: int):
         self.insertion_threads = 1

diff --git a/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py b/modyn/tests/supervisor/internal/pipeline_executor/test_pipeline_executor.py
@@ -144,12 +144,10 @@ def test_initialization(non_connecting_pipeline_executor: PipelineExecutor) -> N
 
 
 def test_pipeline_stage_decorator(dummy_pipeline_args: PipelineExecutionParams) -> None:
-
     class TestStageLogInfo(StageInfo):
         name: str
 
     class TestPipelineExecutor(PipelineExecutor):
-
         @pipeline_stage(PipelineStage.INIT, log=True, track=True)
         def _stage_func(self, s: ExecutionState, log: StageLog) -> int:
             time.sleep(0.1)
@@ -170,7 +168,6 @@ def _stage_func(self, s: ExecutionState, log: StageLog) -> int:
 
 
 def test_pipeline_stage_decorator_generator(dummy_pipeline_args: PipelineExecutionParams) -> None:
-
     class TestStageLogInfo(StageInfo):
         elements: list[int]
         finalized: bool = False
@@ -182,7 +179,6 @@ def create_generator(x: int = 3) -> Generator[int, None, None]:
             yield i
 
     class TestPipelineExecutor(PipelineExecutor):
-
         @pipeline_stage(PipelineStage.INIT, log=True, track=True)
         def _stage_func(self, s: ExecutionState, log: StageLog) -> Generator[int, None, None]:
             try:

diff --git a/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py b/modyn/tests/supervisor/internal/triggers/test_datadrifttrigger.py
@@ -42,6 +42,7 @@ def noop_dataloader_info_constructor_mock(
     selector_address: str,
     num_prefetched_partitions: int,
     parallel_prefetch_requests: int,
+    shuffle: bool,
     tokenizer: Optional[None],
 ) -> None:
     pass

diff --git a/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py b/modyn/tests/supervisor/internal/triggers/trigger_datasets/test_online_trigger_dataset.py
@@ -53,6 +53,7 @@ def test_init():
         num_prefetched_partitions=1,
         parallel_prefetch_requests=1,
         sample_prob=0.5,
+        shuffle=False,
     )
     assert online_trigger_dataset._pipeline_id == 1
     assert online_trigger_dataset._trigger_id == 1
@@ -78,6 +79,7 @@ def test_dataset_iter():
         num_prefetched_partitions=1,
         parallel_prefetch_requests=1,
         sample_prob=0.5,
+        shuffle=False,
     )
 
     all_trigger_data = list(online_trigger_dataset)

diff --git a/modyn/tests/trainer_server/internal/data/test_data_utils.py b/modyn/tests/trainer_server/internal/data/test_data_utils.py
@@ -30,7 +30,7 @@ def test_prepare_dataloaders(
     test_weights, test_insecure_channel, test_grpc_connection_established, test_grpc_connection_established_selector
 ):
     train_dataloader, _ = prepare_dataloaders(
-        1, 1, "MNIST", 4, 128, get_mock_bytes_parser(), [], "", "", 42, 5, 5, None, None
+        1, 1, "MNIST", 4, 128, get_mock_bytes_parser(), [], "", "", 42, 5, 5, False, None, None
     )
 
     assert train_dataloader.num_workers == 4
-Original file line number
+Diff line change
@@ Expand Up / @@ -170,6 +170,7 @@ def pipeline_training_config() -> TrainingConfig: @@
             ],
             optimization_criterion=OptimizationCriterion(name="CrossEntropyLoss"),
             checkpointing=CheckpointingConfig(activated=False),
+            shuffle=False,
         )
@@ Expand Down @@