diff --git a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst index 6c5f25bc8f9..083e7cc8dce 100644 --- a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst +++ b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/deepspeed.rst @@ -365,6 +365,237 @@ profiling batches 3 and 4. rendering times for TensorBoard and memory issues. For long-running experiments, it is recommended to configure a profiling schedule. +******************* + DeepSpeed Trainer +******************* + +With the DeepSpeed Trainer API, you can implement and iterate on model training code locally before +running on cluster. When you are satisfied with your model code, you configure and submit the code +on cluster. + +The DeepSpeed Trainer API lets you do the following: + +- Work locally, iterating on your model code. +- Debug models in your favorite debug environment (e.g., directly on your machine, IDE, or Jupyter + notebook). +- Run training scripts without needing to use an experiment configuration file. +- Load previously saved checkpoints directly into your model. + +Initializing the Trainer +======================== + +After defining the PyTorch Trial, initialize the trial and the trainer. +:meth:`~determined.pytorch.deepspeed.init` returns a +:class:`~determined.pytorch.deepspeed.DeepSpeedTrialContext` for instantiating +:class:`~determined.pytorch.deepspeed.DeepSpeedTrial`. Initialize +:class:`~determined.pytorch.deepspeed.Trainer` with the trial and context. + +.. 
code:: python + + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +Training is configured with a call to :meth:`~determined.pytorch.deepspeed.Trainer.fit` with +training loop arguments, such as checkpointing periods, validation periods, and checkpointing +policy. + +.. code:: diff + + from determined import pytorch + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + + trainer.fit( + + max_length=pytorch.Epoch(10), + + checkpoint_period=pytorch.Batch(100), + + validation_period=pytorch.Batch(100), + + checkpoint_policy="all" + + ) + + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +Run Your Training Script Locally +================================ + +Run training scripts locally without submitting to a cluster or defining an experiment configuration +file. + +.. code:: python + + from determined import pytorch + from determined.pytorch import deepspeed as det_ds + + def main(): + with det_ds.init() as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + checkpoint_policy="all", + ) + + + if __name__ == "__main__": + # Configure logging + logging.basicConfig(level=logging.INFO, format=det.LOG_FORMAT) + main() + +You can run this Python script directly (``python3 train.py``), or in a Jupyter notebook. This code +will train for ten epochs, checkpointing and validating every 100 batches. 
+ +Local Distributed Training +========================== + +Local training can utilize multiple GPUs on a single node with a few modifications to the above +code. + +.. code:: diff + + import deepspeed + + def main(): + + # Initialize distributed backend before det_ds.init() + + deepspeed.init_distributed() + + # Initialize DistributedContext + with det_ds.init( + + distributed=core.DistributedContext.from_deepspeed() + ) as train_context: + trial = MyTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + checkpoint_policy="all" + ) + +This code can be directly invoked with your distributed backend's launcher: ``deepspeed --num_gpus=4 +trainer.py --deepspeed --deepspeed_config ds_config.json`` + +Test Mode +========= + +``Trainer.fit()`` accepts a ``test_mode`` parameter which, if ``True``, trains and validates your +training code for only one batch, checkpoints, then exits. This is helpful for debugging code or +writing automated tests around your model code. + +.. code:: diff + + trainer.fit( + max_length=pytorch.Epoch(10), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + + test_mode=True + ) + +Prepare Your Training Code for Deploying to a Determined Cluster +================================================================ + +Once you are satisfied with the results of training the model locally, you submit the code to a +cluster. This example allows for distributed training locally and on cluster without having to make +code changes. + +Example workflow of frequent iterations between local debugging and cluster deployment: + +.. code:: diff + + def main(): + + info = det.get_cluster_info() + + if info is None: + + # Local: configure local distributed training. 
+ + deepspeed.init_distributed() + + distributed_context = core.DistributedContext.from_deepspeed() + + latest_checkpoint = None + + else: + + # On-cluster: Determined will automatically detect distributed context. + + distributed_context = None + + # On-cluster: configure the latest checkpoint for pause/resume training functionality. + + latest_checkpoint = info.latest_checkpoint + + + with det_ds.init( + + distributed=distributed_context + ) as train_context: + trial = DCGANTrial(train_context) + trainer = det_ds.Trainer(trial, train_context) + trainer.fit( + max_length=pytorch.Epoch(11), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + + latest_checkpoint=latest_checkpoint, + ) + +To run the Trainer API solely on-cluster, the code is simpler: + +.. code:: python + + def main(): + with det_ds.init() as train_context: + trial_inst = gan_model.DCGANTrial(train_context) + trainer = det_ds.Trainer(trial_inst, train_context) + trainer.fit( + max_length=pytorch.Epoch(11), + checkpoint_period=pytorch.Batch(100), + validation_period=pytorch.Batch(100), + latest_checkpoint=det.get_cluster_info().latest_checkpoint, + ) + +Submit Your Trial for Training on Cluster +========================================= + +To run your experiment on cluster, you'll need to create an experiment configuration (YAML) file. +Your experiment configuration file must contain searcher configuration and entrypoint. + +.. code:: yaml + + name: dcgan_deepspeed_mnist + searcher: + name: single + metric: validation_loss + resources: + slots_per_trial: 2 + entrypoint: python3 -m determined.launch.deepspeed python3 train.py + +Submit the trial to the cluster: + +.. code:: bash + + det e create det.yaml . + +If your training code needs to read some values from the experiment configuration, you can set the +``data`` field and read from ``det.get_cluster_info().trial.user_data`` or set ``hyperparameters`` +and read from ``det.get_cluster_info().trial.hparams``. 
+ +Profiling +========= + +When training on cluster, you can enable the system metrics profiler by adding a parameter to your +``fit()`` call: + +.. code:: diff + + trainer.fit( + ..., + + profiling_enabled=True + ) + ***************************** Known DeepSpeed Constraints ***************************** diff --git a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst index 5596616d896..79ade0f1f82 100644 --- a/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst +++ b/docs/model-dev-guide/api-guides/apis-howto/deepspeed/pytorch2deepspeed.rst @@ -18,8 +18,14 @@ Reference conversion example: .. code:: diff - -class MyTrial(PyTorchTrial): - +class MyTrial(DeepSpeedTrial): + +import deepspeed + + -from determined import pytorch + +from determined.pytorch import deepspeed as det_ds + + + -class MyTrial(pytorch.PyTorchTrial): + +class MyTrial(det_ds.DeepSpeedTrial): def __init__(self, context): self.context = context self.args = AttrDict(self.context.get_hparams()) diff --git a/docs/reference/deploy/helm-config-reference.rst b/docs/reference/deploy/helm-config-reference.rst index bede49d2f1b..5548c9ce93f 100644 --- a/docs/reference/deploy/helm-config-reference.rst +++ b/docs/reference/deploy/helm-config-reference.rst @@ -248,7 +248,7 @@ to: ``determinedai/pytorch-ngc-dev:0736b6d``. - ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies - `. + `. .. code:: yaml diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 65896f57cb4..6b3222b2581 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -182,19 +182,20 @@ Example: .. _scheduling-unit: -``scheduling_unit`` -=================== +``scheduling_unit`` (deprecated) +================================ Optional. 
Instructs how frequent to perform system operations, such as periodic checkpointing and -preemption, in the unit of batches. The number of records in a batch is controlled by the -:ref:`global_batch_size ` hyperparameter. Defaults to ``100``. +preemption, in the unit of batches. This field has been deprecated and the behavior should be +configured in training code directly. Please see :ref:`apis-howto-overview` for details specific to +your training framework. + +.. _config-records-per-epoch: + +``records_per_epoch`` (deprecated) +================================== -- Setting this value too small can increase the overhead of system operations and decrease training - throughput. -- Setting this value too large might prevent the system from reallocating resources from this - workload to another, potentially more important, workload. -- As a rule of thumb, it should be set to the number of batches that can be trained in roughly - 60--180 seconds. +Optional. The number of records in the training data set. This field has been deprecated. .. _max-restarts: @@ -321,22 +322,12 @@ While debugging, the logger will display lines highlighted in blue for easy iden .. _experiment-config-min-validation-period: -``min_validation_period`` -========================= - -Optional. Specifies the minimum frequency at which validation should be run for each trial. +``min_validation_period`` (deprecated) +====================================== -- The frequency should be defined using a nested dictionary indicating the unit as records, - batches, or epochs. For example: - -.. code:: yaml - - min_validation_period: - epochs: 2 - -- :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and - :class:`~determined.keras.TFKerasTrial`: If this is in the unit of epochs, ``records_per_epoch`` - must be specified. +Optional. Specifies the minimum frequency at which validation should be run for each trial. This +field has been deprecated and should be specified directly in training code. 
Please see +:ref:`apis-howto-overview` for details specific to your training framework. .. _experiment-config-perform-initial-validation: @@ -362,22 +353,12 @@ Determined checkpoints in the following situations: .. _experiment-config-min-checkpoint-period: -``min_checkpoint_period`` -========================= - -Optional. Specifies the minimum frequency for running checkpointing for each trial. - -- This value should be set using a nested dictionary in the form of records, batches, or epochs. - For example: +``min_checkpoint_period`` (deprecated) +====================================== - .. code:: yaml - - min_checkpoint_period: - epochs: 2 - -- :class:`~determined.pytorch.deepspeed.DeepSpeedTrial` and - :class:`~determined.keras.TFKerasTrial`: If the unit is in epochs, you must also specify - ``records_per_epoch``. +Optional. Specifies the minimum frequency for running checkpointing for each trial. This field has +been deprecated and should be specified directly in training code. Please see +:ref:`apis-howto-overview` for details specific to your training framework. ``checkpoint_policy`` ===================== @@ -394,8 +375,7 @@ Should be set to one of the following values: - ``none``: A checkpoint will never be taken *due* to a validation. However, even with this policy selected, checkpoints are still expected to be taken after the trial is finished training, due to - cluster scheduling decisions, before search method decisions, or due to - :ref:`min_checkpoint_period `. + cluster scheduling decisions, or when specified in training code. .. _checkpoint-storage: @@ -835,9 +815,6 @@ Single The ``single`` search method does not perform a hyperparameter search at all; rather, it trains a single trial for a fixed length. When using this search method, all of the hyperparameters specified in the :ref:`hyperparameters ` section must be constants. 
-By default, validation metrics are only computed once, after the specified length of training has -been completed; :ref:`min_validation_period ` can be used -to specify that validation metrics should be computed more frequently. ``metric`` ---------- @@ -847,6 +824,12 @@ configuration. .. _experiment-configuration_single-searcher-max-length: +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -873,10 +856,7 @@ Random The ``random`` search method implements a simple random search. The user specifies how many hyperparameter configurations should be trained and how long each configuration should be trained -for; the configurations are sampled randomly from the hyperparameter space. Each trial is trained -for the specified length and then validation metrics are computed. :ref:`min_validation_period -` can be used to specify that validation metrics should be -computed more frequently. +for; the configurations are sampled randomly from the hyperparameter space. ``metric`` ---------- @@ -889,6 +869,12 @@ configuration. Required. The number of trials, i.e., hyperparameter configurations, to evaluate. +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -929,6 +915,12 @@ specified via the ``hyperparameters`` field. For more details see the Required. The name of the validation metric used to evaluate the performance of a hyperparameter configuration. +``max_length`` (deprecated) +--------------------------- + +Previously, ``max_length`` was required to determine the length of each trial. 
This field has been +deprecated and all training lengths should be specified directly in training code. + **Optional Fields** ``smaller_is_better`` @@ -971,6 +963,12 @@ to terminate training. Once trials are stopped, they will not be resumed. Required. The name of the validation metric used to evaluate the performance of a hyperparameter configuration. +``max_length`` (deprecated) +--------------------------- + +The length of the trial. This field has been deprecated and should be replaced with ``time_metric`` +and ``max_time`` below. + ``time_metric`` --------------- @@ -1401,12 +1399,13 @@ details. .. _exp-config-optimizations: -*************** - Optimizations -*************** +**************************** + Optimizations (deprecated) +**************************** The ``optimizations`` section contains configuration options that influence the performance of the -experiment. +experiment. This section has been deprecated and should be configured in training code. Please see +:ref:`apis-howto-overview` for details specific to your training framework. .. _config-aggregation-frequency: diff --git a/docs/reference/training/api-deepspeed-reference.rst b/docs/reference/training/api-deepspeed-reference.rst index 0fa7fbe8f87..6d00a8253ec 100644 --- a/docs/reference/training/api-deepspeed-reference.rst +++ b/docs/reference/training/api-deepspeed-reference.rst @@ -48,3 +48,16 @@ documentation): - :ref:`determined.pytorch.samplers ` - :ref:`determined.pytorch.MetricReducer ` - :ref:`determined.pytorch.PyTorchCallback ` + +****************************************** + ``determined.pytorch.deepspeed.Trainer`` +****************************************** + +.. autoclass:: determined.pytorch.deepspeed.Trainer + :members: + +***************************************** + ``determined.pytorch.deepspeed.init()`` +***************************************** + +.. 
autofunction:: determined.pytorch.deepspeed.init diff --git a/docs/reference/training/api-pytorch-reference.rst b/docs/reference/training/api-pytorch-reference.rst index 5b33714ae03..31a710048a8 100644 --- a/docs/reference/training/api-pytorch-reference.rst +++ b/docs/reference/training/api-pytorch-reference.rst @@ -120,3 +120,17 @@ platform which includes: ******************************* .. autofunction:: determined.pytorch.init + +****************************** + ``determined.pytorch.Batch`` +****************************** + +.. autoclass:: determined.pytorch.Batch + :members: + +****************************** + ``determined.pytorch.Epoch`` +****************************** + +.. autoclass:: determined.pytorch.Epoch + :members: diff --git a/docs/release-notes/searcher-context-removal.rst b/docs/release-notes/searcher-context-removal.rst new file mode 100644 index 00000000000..74c81a746b2 --- /dev/null +++ b/docs/release-notes/searcher-context-removal.rst @@ -0,0 +1,72 @@ +:orphan: + +**Breaking Changes** + +- ASHA: All experiments using ASHA hyperparameter search must now configure ``max_time`` and + ``time_metric`` in the experiment config, instead of ``max_length``. Additionally, training code + must report the configured ``time_metric`` in validation metrics. As a convenience, Determined + training loops now automatically report ``batches`` and ``epochs`` with metrics, which you can + use as your ``time_metric``. ASHA experiments without this modification will no longer run. + +- Custom Searchers: all custom searchers including DeepSpeed Autotune were deprecated in ``0.36.0`` + and are now being removed. Users are encouraged to use a preset searcher, which can be easily + :ref:`configured ` for any experiment. + +**New Features** + +- API: introduce ``keras.DeterminedCallback``, a new high-level training API for TF Keras that + integrates Keras training code with Determined through a single :ref:`Keras Callback + `. 
+ +- API: introduce ``deepspeed.Trainer``, a new high-level training API for DeepSpeedTrial that + allows for Python-side training loop configurations and includes support for local training. + +**Deprecations** + +- Experiment Config: the ``max_length`` field of the searcher configuration section has been + deprecated for all experiments and searchers. Users are expected to configure the desired + training length directly in training code. + +- Experiment Config: the ``optimizations`` config has been deprecated. Please see :ref:`Training + APIs ` to configure supported optimizations through training code directly. + +- Experiment Config: the ``scheduling_unit``, ``min_checkpoint_period``, and + ``min_validation_period`` config fields have been deprecated. Instead, these configuration + options should be specified in training code. + +- Experiment Config: the ``entrypoint`` field no longer accepts ``model_def:TrialClass`` as trial + definitions. Please invoke your training script directly (``python3 train.py``). + +- Core API: the ``SearcherContext`` (``core.searcher``) has been deprecated. Training code no + longer requires ``core.searcher.operations`` to run, and progress should be reported through + ``core.train.report_progress``. + +- DeepSpeed: the ``num_micro_batches_per_slot`` and ``train_micro_batch_size_per_gpu`` attributes + on ``DeepSpeedTrialContext`` have been deprecated. Use ``get_num_micro_batches_per_slot()`` and + ``get_train_micro_batch_size_per_gpu()`` instead. + +- Horovod: the horovod distributed training backend has been deprecated. Users are encouraged to + migrate to the native distributed backend of their training framework (``torch.distributed`` or + ``tf.distribute``). + +- Trial APIs: ``TFKerasTrial`` has been deprecated. Users are encouraged to migrate to the new + :ref:`Keras Callback `. + +- Launchers: the ``--trial`` argument in Determined launchers has been deprecated. Please invoke + your training script directly. 
+ +- ASHA: the ``stop_once`` field of the ``searcher`` config for ASHA searchers has been deprecated. + All ASHA searches are now early-stopping based (``stop_once: true``) instead of promotion based. + +- CLI: The ``--test`` and ``--local`` flags for ``det experiment create`` have been deprecated. All + training APIs now support local execution (``python3 train.py``). Please see ``training apis`` + for details specific to your framework. + +- Web UI: previously, trials that reported an ``epoch`` metric enabled an epoch X-axis in the Web + UI metrics tab. This metric name has been changed to ``epochs``, with ``epoch`` as a fallback + option. + +**Removed Features** + +- WebUI: "Continue Training" no longer supports configurable number of batches in the Web UI and + will simply resume the trial from the last checkpoint. diff --git a/examples/deepspeed/dcgan/README.md b/examples/deepspeed/dcgan/README.md index f0b9811b9c9..31481d432c3 100644 --- a/examples/deepspeed/dcgan/README.md +++ b/examples/deepspeed/dcgan/README.md @@ -25,10 +25,16 @@ After installing docker and pulling an image, users can launch a container via Install necessary dependencies via `pip install determined mpi4py` -Then, run the following command: +Then, run the following command to train on a single node with a single GPU: ``` python trainer.py ``` +For multiple nodes or multiple GPUs, use the following: +``` +deepspeed --num_nodes=<num_nodes> --num_gpus=<num_gpus> trainer.py --deepspeed --deepspeed_config ds_config.json +``` +Where `num_nodes` corresponds to the number of nodes on your local cluster and `num_gpus` corresponds to +the number of GPUs per node. Any additional configs can be specified in `mnist.yaml` and `ds_config.json` accordingly. 
diff --git a/examples/deepspeed/dcgan/model.py b/examples/deepspeed/dcgan/model.py index 99322dd5a9c..8b3e08d7ebb 100644 --- a/examples/deepspeed/dcgan/model.py +++ b/examples/deepspeed/dcgan/model.py @@ -47,7 +47,7 @@ def __init__( self.discriminator = self.context.wrap_model_engine(discriminator) self.fixed_noise = self.context.to_device( torch.randn( - self.context.train_micro_batch_size_per_gpu, self.hparams["noise_length"], 1, 1 + self.context.get_train_micro_batch_size_per_gpu(), self.hparams["noise_length"], 1, 1 ) ) self.criterion = nn.BCELoss() @@ -63,7 +63,7 @@ def _get_noise(self, dtype: torch.dtype) -> torch.Tensor: torch.Tensor, self.context.to_device( torch.randn( - self.context.train_micro_batch_size_per_gpu, + self.context.get_train_micro_batch_size_per_gpu(), self.hparams["noise_length"], 1, 1, @@ -94,7 +94,7 @@ def train_batch( else: dtype = torch.float32 real_label, fake_label = self._get_label_constants( - self.context.train_micro_batch_size_per_gpu, dtype + self.context.get_train_micro_batch_size_per_gpu(), dtype ) ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) @@ -107,7 +107,7 @@ def train_batch( D_x = 0.0 D_G_z1 = 0.0 fake_sample_count = ( - self.context.train_micro_batch_size_per_gpu * self.gradient_accumulation_steps + self.context.get_train_micro_batch_size_per_gpu() * self.gradient_accumulation_steps ) for i in range(self.gradient_accumulation_steps): @@ -133,7 +133,7 @@ def train_batch( output = self.discriminator(fake.detach()) errD_fake = self.criterion(output, fake_label) self.discriminator.backward(errD_fake) - errD_fake_sum += errD_fake * self.context.train_micro_batch_size_per_gpu + errD_fake_sum += errD_fake * self.context.get_train_micro_batch_size_per_gpu() D_G_z1 += output.sum().item() # update self.discriminator.step() @@ -154,7 +154,7 @@ def train_batch( output = self.discriminator(fake) errG = self.criterion(output, real_label) # fake labels are real for generator cost 
self.generator.backward(errG) - errG_sum += errG * self.context._train_micro_batch_size_per_gpu + errG_sum += errG * self.context.get_train_micro_batch_size_per_gpu() D_G_z2_sum += output.sum().item() self.generator.step() @@ -189,7 +189,7 @@ def build_training_data_loader(self) -> Any: dataset = data.get_dataset(self.data_config) return DataLoader( dataset, - batch_size=self.context.train_micro_batch_size_per_gpu, + batch_size=self.context.get_train_micro_batch_size_per_gpu(), shuffle=True, num_workers=int(self.hparams["data_workers"]), ) @@ -201,9 +201,9 @@ def build_validation_data_loader(self) -> Any: dataset, list( range( - self.context.train_micro_batch_size_per_gpu + self.context.get_train_micro_batch_size_per_gpu() * self.context.distributed.get_size() ) ), ) - return DataLoader(dataset, batch_size=self.context.train_micro_batch_size_per_gpu) + return DataLoader(dataset, batch_size=self.context.get_train_micro_batch_size_per_gpu()) diff --git a/harness/determined/pytorch/_trainer_utils.py b/harness/determined/pytorch/_trainer_utils.py index 254fad6e150..9a27642b9cb 100644 --- a/harness/determined/pytorch/_trainer_utils.py +++ b/harness/determined/pytorch/_trainer_utils.py @@ -12,10 +12,6 @@ class TrainUnit: the value of unit, where the value can be an int or an implementable collections.abc.Container. TrainUnits are used to define periodic training behavior such as checkpointing and validating. - - int values are treated as periods, e.g. Batch(100) will checkpoint/validate every 100 batches. - collections.abc.Container values are treated as schedules, e.g. Batch(1,5,10) will - checkpoint/validate on batches 1, 5, and 10. """ def __init__(self, value: Union[int, abc.Container]): @@ -87,7 +83,12 @@ def _divides(self, steps: int) -> bool: class Epoch(TrainUnit): """ - Epoch step type (e.g. Epoch(1) defines 1 epoch) + Defines an Epoch unit for specifying length to PyTorch trainers. + + Epoch(int) values are treated as periods, e.g. 
Epoch(100) will checkpoint/validate every 100 + epochs. + Epoch(collections.abc.Container) values are treated as schedules, e.g. Epoch([1,5,10]) will + checkpoint/validate on epochs 1, 5, and 10. """ pass @@ -95,7 +96,12 @@ class Epoch(TrainUnit): class Batch(TrainUnit): """ - Batch step type (e.g. Batch(1) defines 1 batch) + Defines a Batch unit for specifying length to PyTorch trainers. + + Batch(int) values are treated as periods, e.g. Batch(100) will checkpoint/validate every 100 + batches. + Batch(collections.abc.Container) values are treated as schedules, e.g. Batch([1,5,10]) will + checkpoint/validate on batches 1, 5, and 10. """ @staticmethod diff --git a/harness/determined/pytorch/deepspeed/_deepspeed_context.py b/harness/determined/pytorch/deepspeed/_deepspeed_context.py index b71f44e31da..6e15ea278e8 100644 --- a/harness/determined/pytorch/deepspeed/_deepspeed_context.py +++ b/harness/determined/pytorch/deepspeed/_deepspeed_context.py @@ -2,6 +2,7 @@ import logging import pathlib import time +import warnings from importlib import util as importutil from typing import Any, Dict, List, Optional, Set, Type, Union, cast @@ -239,6 +240,17 @@ def get_train_micro_batch_size_per_gpu(self) -> int: ) return self._train_micro_batch_size_per_gpu + @property + def train_micro_batch_size_per_gpu(self) -> int: + warnings.warn( + "DeepSpeedTrialContext.train_micro_batch_size_per_gpu has been deprecated in " + "Determined 0.38.0; please use the context.get_train_micro_batch_size_per_gpu() getter " + "instead.", + FutureWarning, + stacklevel=2, + ) + return self.get_train_micro_batch_size_per_gpu() + def get_num_micro_batches_per_slot(self) -> int: if self._num_micro_batches_per_slot is None: raise det.errors.InvalidExperimentException( @@ -246,6 +258,17 @@ def get_num_micro_batches_per_slot(self) -> int: ) return self._num_micro_batches_per_slot + @property + def num_micro_batches_per_slot(self) -> int: + warnings.warn( + "DeepSpeedTrialContext.num_micro_batches_per_slot 
has been deprecated in " + "Determined 0.38.0; please use the context.get_num_micro_batches_per_slot() getter " + "instead.", + FutureWarning, + stacklevel=2, + ) + return self.get_num_micro_batches_per_slot() + def _init_device(self) -> None: if not self._num_gpus: raise det.errors.InvalidExperimentException("GPUs required for DeepSpeedTrial.") diff --git a/harness/determined/pytorch/deepspeed/_trainer.py b/harness/determined/pytorch/deepspeed/_trainer.py index 8e36f345235..587a1b41999 100644 --- a/harness/determined/pytorch/deepspeed/_trainer.py +++ b/harness/determined/pytorch/deepspeed/_trainer.py @@ -65,7 +65,9 @@ def fit( max_length: The maximum number of steps to train for. This is a ``TrainUnit`` type (``Batch`` or ``Epoch``) which takes an ``int``. For example, ``Epoch(1)`` would train for a maximum length of one epoch. + .. note:: + If using an ASHA searcher, this value should match the searcher config values in the experiment config (i.e. ``Epoch(1)`` = `max_time: 1` and `time_metric: "epochs"`). 
diff --git a/master/pkg/searcher/searcher.go b/master/pkg/searcher/searcher.go index 8b282b369f7..95479e2b9a6 100644 --- a/master/pkg/searcher/searcher.go +++ b/master/pkg/searcher/searcher.go @@ -86,7 +86,7 @@ func (s *Searcher) TrialCreated(requestID model.RequestID) ([]Action, error) { operations, err := s.method.trialCreated(s.context(), requestID) if err != nil { return nil, errors.Wrapf(err, - "error while handling a trial created event: %d", requestID) + "error while handling a trial created event: %s", requestID) } s.record(operations) return operations, nil @@ -156,7 +156,7 @@ func (s *Searcher) ValidationCompleted( operations, err := s.method.validationCompleted(s.context(), requestID, metrics) if err != nil { - return nil, errors.Wrapf(err, "error while handling a validation completed event: %d", requestID) + return nil, errors.Wrapf(err, "error while handling a validation completed event: %s", requestID) } s.record(operations) return operations, nil @@ -170,7 +170,7 @@ func (s *Searcher) TrialExited(requestID model.RequestID) ([]Action, error) { s.state.TrialsClosed[requestID] = true actions, err := s.method.trialExited(s.context(), requestID) if err != nil { - return nil, errors.Wrapf(err, "error while handling a trial closed event: %d", requestID) + return nil, errors.Wrapf(err, "error while handling a trial closed event: %s", requestID) } s.record(actions)