You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I followed the instructions in your README to set up the storage of the training data. I am training on a server with 8 NVIDIA 4090 GPUs. While the training phase runs smoothly and iterations proceed normally, I encounter an error during the validation phase when the training step reaches 2000. This issue has troubled me for several days. Below are screenshots of part of my dataset and configuration file. I would greatly appreciate it if you could take a look and help identify the cause of this issue. Thank you so much!
Traceback (most recent call last):
File "/data1/yt/TokenHMR-main/tokenhmr/lib/utils/misc.py", line 49, in wrap
ret = task_func(cfg=cfg)
File "/data1/yt/TokenHMR-main/tokenhmr/train.py", line 110, in train
trainer.fit(model, datamodule=datamodule, ckpt_path=checkpoint_path)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 46, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 141, in run
self.on_advance_end(data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 295, in on_advance_end
self.val_loop.run()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 178, in _decorator
return loop_run(self, *args, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 128, in run
batch, batch_idx, dataloader_idx = next(data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 127, in __next__
self.batches.append(super().__next__())
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 60, in __next__
batch = next(self.iterator)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 341, in __next__
out = next(self._iterator)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 142, in __next__
out = next(self.iterators[0])
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 701, in __next__
data = self._next_data()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1445, in _next_data
return self._process_data(data)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1491, in _process_data
data.reraise()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/_utils.py", line 715, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
data = fetcher.fetch(index) # type: ignore[possibly-undefined]
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 33, in fetch
data.append(next(self.dataset_iter))
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/pipeline.py", line 71, in iterator
for sample in self.iterator1():
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 217, in _shuffle
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/mix.py", line 64, in random_samples
yield next(sources[i])
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/pipeline.py", line 71, in iterator
for sample in self.iterator1():
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
[Previous line repeated 1 more time]
File "/data1/yt/TokenHMR-main/tokenhmr/lib/datasets/image_dataset.py", line 296, in split_data
for item in source:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 325, in _rename
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/compat.py", line 105, in check_empty
raise ValueError("No samples found in dataset; perhaps you have fewer shards than workers.\n" +
ValueError: No samples found in dataset; perhaps you have fewer shards than workers.
The text was updated successfully, but these errors were encountered:
I followed the instructions in your README to set up the storage of the training data. I am training on a server with 8 NVIDIA 4090 GPUs. While the training phase runs smoothly and iterations proceed normally, I encounter an error during the validation phase when the training step reaches 2000. This issue has troubled me for several days. Below are screenshots of part of my dataset and configuration file. I would greatly appreciate it if you could take a look and help identify the cause of this issue. Thank you so much!
Traceback (most recent call last):
File "/data1/yt/TokenHMR-main/tokenhmr/lib/utils/misc.py", line 49, in wrap
ret = task_func(cfg=cfg)
File "/data1/yt/TokenHMR-main/tokenhmr/train.py", line 110, in train
trainer.fit(model, datamodule=datamodule, ckpt_path=checkpoint_path)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 46, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 981, in _run
results = self._run_stage()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 141, in run
self.on_advance_end(data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 295, in on_advance_end
self.val_loop.run()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py", line 178, in _decorator
return loop_run(self, *args, **kwargs)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/evaluation_loop.py", line 128, in run
batch, batch_idx, dataloader_idx = next(data_fetcher)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 127, in __next__
self.batches.append(super().__next__())
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/loops/fetchers.py", line 60, in __next__
batch = next(self.iterator)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 341, in __next__
out = next(self._iterator)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/pytorch_lightning/utilities/combined_loader.py", line 142, in __next__
out = next(self.iterators[0])
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 701, in __next__
data = self._next_data()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1445, in _next_data
return self._process_data(data)
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1491, in _process_data
data.reraise()
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/_utils.py", line 715, in reraise
raise exception
ValueError: Caught ValueError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
data = fetcher.fetch(index) # type: ignore[possibly-undefined]
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 33, in fetch
data.append(next(self.dataset_iter))
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/pipeline.py", line 71, in iterator
for sample in self.iterator1():
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 217, in _shuffle
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/mix.py", line 64, in random_samples
yield next(sources[i])
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/pipeline.py", line 71, in iterator
for sample in self.iterator1():
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
[Previous line repeated 1 more time]
File "/data1/yt/TokenHMR-main/tokenhmr/lib/datasets/image_dataset.py", line 296, in split_data
for item in source:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 325, in _rename
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 305, in _map
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/filters.py", line 253, in _select
for sample in data:
File "/data1/yt/anaconda3/envs/wilor/lib/python3.10/site-packages/webdataset/compat.py", line 105, in check_empty
raise ValueError("No samples found in dataset; perhaps you have fewer shards than workers.\n" +
ValueError: No samples found in dataset; perhaps you have fewer shards than workers.
The text was updated successfully, but these errors were encountered: