Hi all,
I'm using ClearML together with PyTorch Lightning. Training works successfully as long as I use a single GPU, but when I train my models on two GPUs, I get the following error:
AttributeError: 'ClearmlLogger' object has no attribute '_prev_step'
Any help is appreciated.
Here is the full stack trace:
` File "scripts/model_training/ML754_train_ViT_lightning.py", line 228, in main
trainer.fit(module, train_dataloader=train_dl, val_dataloaders=valid_dl)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 741, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1199, in _run
self._dispatch()
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 173, in start_training
self.spawn(self.new_process, trainer, self.mp_queue, return_result=False)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 201, in spawn
mp.spawn(self._wrapped_function, args=(function, args, kwargs, return_queue), nprocs=self.num_processes)
File "/home/developer/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/developer/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/home/developer/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/developer/.local/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 208, in _wrapped_function
result = function(*args, **kwargs)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 236, in new_process
results = trainer.run_stage()
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 228, in advance
self.trainer.logger_connector.update_train_step_metrics()
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py", line 227, in update_train_step_metrics
self.log_metrics(self.metrics["log"])
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py", line 120, in log_metrics
self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loggers/base.py", line 403, in agg_and_log_metrics
logger.agg_and_log_metrics(metrics, step)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loggers/base.py", line 162, in agg_and_log_metrics
agg_step, metrics_to_log = self._aggregate_metrics(metrics=metrics, step=step)
File "/home/developer/.local/lib/python3.6/site-packages/pytorch_lightning/loggers/base.py", line 123, in _aggregate_metrics
if step == self._prev_step:
AttributeError: 'ClearmlLogger' object has no attribute '_prev_step' `