Skip to content

Commit

Permalink
nits and improve clock and trainer testing
Browse files: browse the repository at this point in the history
  • Loading branch information
limiteinductive committed Jan 14, 2024
1 parent 9e36fdb commit 920e533
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tests/training_utils/mock_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ save_interval = "10:epoch"

[wandb]
mode = "disabled"
project="mock_project"
project = "mock_project"
23 changes: 18 additions & 5 deletions tests/training_utils/test_trainer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass
from functools import cached_property
from pathlib import Path
from warnings import warn

import pytest
import torch
Expand Down Expand Up @@ -52,6 +53,8 @@ def __init__(self):


class MockTrainer(Trainer[MockConfig, MockBatch]):
step_counter: int = 0

@cached_property
def mock_model(self) -> MockModel:
    """Return the trainer's mock model.

    The instance is built on first access; `cached_property` then stores
    it so later accesses reuse the same object.
    """
    model = MockModel()
    return model
Expand All @@ -63,6 +66,7 @@ def load_models(self) -> dict[str, fl.Module]:
return {"mock_model": self.mock_model}

def compute_loss(self, batch: MockBatch) -> Tensor:
    """Compute the loss for one batch and record that a step was taken.

    Args:
        batch: The batch whose `inputs` and `targets` are moved to the
            trainer's device before the forward pass.

    Returns:
        The norm of the difference between the model outputs and the targets.
    """
    # Count every call so tests can compare against the clock's step count.
    self.step_counter += 1

    device = self.device
    inputs = batch.inputs.to(device)
    targets = batch.targets.to(device)
    return norm(self.mock_model(inputs) - targets)
Expand All @@ -71,6 +75,7 @@ def compute_loss(self, batch: MockBatch) -> Tensor:
@pytest.fixture
def mock_config(test_device: torch.device) -> MockConfig:
if not test_device.type == "cuda":
warn("only running on CUDA, skipping")
pytest.skip("Skipping test because test_device is not CUDA")
config = MockConfig.load_from_toml(Path(__file__).parent / "mock_config.toml")
config.training.gpu_index = test_device.index
Expand Down Expand Up @@ -163,10 +168,18 @@ def test_mock_trainer_initialization(mock_config: MockConfig, mock_trainer: Mock


def test_training_cycle(mock_trainer: MockTrainer) -> None:
    """Run a full training cycle and check the clock against the config.

    Before training, the clock's per-iteration and per-epoch sizes must
    match the configuration and nothing may have been counted yet. After
    training, the clock's epoch and step totals must match the configured
    duration, and the trainer's own step counter must agree with the clock.
    """
    clock = mock_trainer.clock
    config = mock_trainer.config
    training = config.training

    # Snapshot the starting point for the "has progressed" checks below.
    initial_epoch = clock.epoch
    initial_step = clock.step

    assert clock.num_step_per_iteration == training.gradient_accumulation["number"]
    expected_batches = mock_trainer.dataset_length // training.batch_size
    assert clock.num_batches_per_epoch == expected_batches

    assert mock_trainer.step_counter == 0
    assert clock.epoch == 0

    mock_trainer.train()

    # Verify that epochs and steps are incremented
    assert clock.epoch > initial_epoch
    assert clock.step > initial_step

    expected_epochs = training.duration["number"]
    assert clock.epoch == expected_epochs
    assert clock.step == expected_epochs * clock.num_batches_per_epoch

    assert mock_trainer.step_counter == clock.step

0 comments on commit 920e533

Please sign in to comment.