Code to enqueue
from clearml import Task

task = Task.create(
    script="script.py",
    docker="ultralytics/ultralytics:latest",
    # Docker's flag is --shm-size (hyphen), not --shm_size; note that with
    # --ipc=host the container shares the host's /dev/shm anyway.
    docker_args=["--network=host", "--ipc=host", "--shm-size=55G"],
)
Task.enqueue(task, queue_name="default")
Hi @RattyBluewhale45, from the error it looks like there is no space left on the pod. Are you able to run this code manually?
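For quick triage, here is a minimal check (a sketch using only the standard library; it assumes the container mounts /dev/shm) that can be run inside the pod to see whether the root filesystem or the shared-memory mount is the one that's full:

import shutil

# Compare free space on the root filesystem vs. the shared-memory mount;
# PyTorch DataLoader workers stage tensor buffers under /dev/shm.
for mount in ("/", "/dev/shm"):
    usage = shutil.disk_usage(mount)
    print(f"{mount}: {usage.free / 1e9:.2f} GB free of {usage.total / 1e9:.2f} GB")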
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 244, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 568, in reduce_storage
    fd, size = storage._share_fd_cpu_()
  File "/opt/conda/lib/python3.10/site-packages/torch/storage.py", line 304, in wrapper
    return fn(self, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/storage.py", line 374, in _share_fd_cpu_
    return super()._share_fd_cpu_(*args, **kwargs)
RuntimeError: unable to write to file </torch_670_1874874997_0>: No space left on device (28)
Traceback (most recent call last):
  File "/root/.clearml/venvs-builds/3.10/task_repository/script.py", line 36, in <module>
    results = model.train(
  File "/ultralytics/ultralytics/engine/model.py", line 815, in train
    self.trainer.train()
  File "/ultralytics/ultralytics/engine/trainer.py", line 208, in train
    self._do_train(world_size)
  File "/ultralytics/ultralytics/engine/trainer.py", line 328, in _do_train
    self._setup_train(world_size)
  File "/ultralytics/ultralytics/engine/trainer.py", line 295, in _setup_train
    self.test_loader = self.get_dataloader(
  File "/ultralytics/ultralytics/models/yolo/detect/train.py", line 55, in get_dataloader
    return build_dataloader(dataset, batch_size, workers, shuffle, rank)  # return dataloader
  File "/ultralytics/ultralytics/data/build.py", line 135, in build_dataloader
    return InfiniteDataLoader(
  File "/ultralytics/ultralytics/data/build.py", line 39, in __init__
    self.iterator = super().__iter__()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 439, in __iter__
    return self._get_iterator()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 387, in _get_iterator
    return _MultiProcessingDataLoaderIter(self)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1022, in __init__
    index_queue = multiprocessing_context.Queue()  # type: ignore[var-annotated]
  File "/opt/conda/lib/python3.10/multiprocessing/context.py", line 103, in Queue
    return Queue(maxsize, ctx=self.get_context())
  File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 43, in __init__
    self._rlock = ctx.Lock()
  File "/opt/conda/lib/python3.10/multiprocessing/context.py", line 68, in Lock
    return Lock(ctx=self.get_context())
  File "/opt/conda/lib/python3.10/multiprocessing/synchronize.py", line 162, in __init__
    SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
  File "/opt/conda/lib/python3.10/multiprocessing/synchronize.py", line 57, in __init__
    sl = self._semlock = _multiprocessing.SemLock(
OSError: [Errno 28] No space left on device
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/multiprocessing/queues.py", line 244, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/opt/conda/lib/python3.10/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 568, in reduce_storage
    fd, size = storage._share_fd_cpu_()
  File "/opt/conda/lib/python3.10/site-packages/torch/storage.py", line 304, in wrapper
    return fn(self, *args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/storage.py", line 374, in _share_fd_cpu_
    return super()._share_fd_cpu_(*args, **kwargs)
RuntimeError: unable to write to file </torch_630_2165375255_1>: No space left on device (28)
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
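For context on what's failing: PyTorch DataLoader workers pass CPU tensors between processes through file descriptors backed by /dev/shm, and POSIX semaphores also live there, so a small /dev/shm (containers default to 64 MiB) fills up immediately. One possible mitigation that keeps workers > 0 is switching PyTorch's sharing strategy to file_system, which stages tensors in regular temp files on disk instead of shm. A minimal sketch, assuming it runs before any dataloaders are created:

import torch.multiprocessing as mp

# "file_descriptor" (the Linux default) backs shared tensors with /dev/shm;
# "file_system" uses ordinary temp files instead, trading shm pressure for
# disk I/O and slower cleanup.
print("current strategy:", mp.get_sharing_strategy())
mp.set_sharing_strategy("file_system")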
Setting Ultralytics workers=0 seems to work, as per the thread above!
Although that's not ideal, as it turns off multi-process data loading (CPU parallelisation).
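For reference, a minimal sketch of that workaround (the weights file and dataset config here are placeholders, not from the thread):

from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # placeholder weights
# workers=0 makes the DataLoader load batches in the main process, so no
# worker processes touch /dev/shm, at the cost of parallel data loading.
results = model.train(data="data.yaml", epochs=100, workers=0)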
@CostlyOstrich36 I don't think it's related to disk; I think it's related to shm.
I think you're right. But it looks like an infrastructure issue related to YOLO.
It did work on ClearML on-prem with docker_args=["--network=host", "--ipc=host"]
Is on-prem also K8s? The question is: if you run the code on EKS without ClearML, do you still get the same issue?
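A minimal standalone repro for that test, runnable in a plain pod on EKS with the same ultralytics/ultralytics:latest image (weights and dataset config are placeholders):

from ultralytics import YOLO

# workers > 0 forces multi-process data loading, which is what exercises
# /dev/shm and should reproduce the bus error / ENOSPC if shm is the cause.
model = YOLO("yolov8n.pt")  # placeholder weights
model.train(data="data.yaml", epochs=1, workers=8)

If this fails the same way outside ClearML, the pod's /dev/shm is the likely culprit: docker_args like --ipc=host and --shm-size only apply when the agent launches containers via docker run, and the usual Kubernetes equivalent is a memory-backed emptyDir volume mounted at /dev/shm in the pod template.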