#
from typing import List, Optional, Tuple
import pyrootutils
import lightning
import hydra
from clearml import Task
from omegaconf import DictConfig, OmegaConf
from lightning import LightningDataModule, LightningModule, Trainer, Callback
from lightning.pytorch.loggers import Logger
# Locate the project root (marked by "pyproject.toml") and put it on PYTHONPATH.
# This has to run before the `src.*` imports further down in this file.
pyrootutils.setup_root(__file__, indicator="pyproject.toml", pythonpath=True)
# ------------------------------------------------------------------------------------ #
# the setup_root above is equivalent to:
# - adding project root dir to PYTHONPATH
# (so you don't need to force user to install project as a package)
# (necessary before importing any local modules e.g. `from src import utils`)
# - setting up PROJECT_ROOT environment variable
# (which is used as a base for paths in "configs/paths/default.yaml")
# (this way all filepaths are the same no matter where you run the code)
# - loading environment variables from ".env" in root dir
#
# you can remove it if you:
# 1. either install project as a package or move entry files to project root dir
# 2. set `root_dir` to "." in "configs/paths/default.yaml"
#
# more info: https://github.com/ashleve/pyrootutils
# ------------------------------------------------------------------------------------ #
from src.utils.pylogger import get_pylogger
from src.utils.instantiators import instantiate_callbacks, instantiate_loggers
# Module-level logger, created through the project's `get_pylogger` helper.
log = get_pylogger(__name__)
def train(cfg: DictConfig):
    """Build the experiment from a Hydra config and run training and/or testing.

    The datamodule, model, callbacks, loggers and trainer are instantiated from
    ``cfg``. ``trainer.fit`` runs when ``cfg.train`` is truthy and
    ``trainer.test`` runs when ``cfg.test`` is truthy.

    Args:
        cfg: Composed Hydra configuration. ``cfg.data``, ``cfg.model`` and
            ``cfg.trainer`` must exist and carry a ``_target_``. The keys
            ``seed``, ``callbacks``, ``logger``, ``train``, ``test`` and
            ``ckpt_path`` are read with ``cfg.get`` and may be missing.
    """
    # set seed for random number generators in pytorch, numpy and python.random
    if cfg.get("seed"):
        lightning.seed_everything(cfg.seed, workers=True)

    log.info(f"Instantiating datamodule <{cfg.data._target_}>")
    datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)

    log.info(f"Instantiating model <{cfg.model._target_}>")
    model: LightningModule = hydra.utils.instantiate(cfg.model)

    log.info("Instantiating callbacks...")
    callbacks: List[Callback] = instantiate_callbacks(cfg.get("callbacks"))

    log.info("Instantiating loggers...")
    logger: List[Logger] = instantiate_loggers(cfg.get("logger"))

    log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
    trainer: Trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=logger)

    if cfg.get("train"):
        log.info("Starting training!")
        trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))

    if cfg.get("test"):
        log.info("Starting testing!")
        # `trainer.checkpoint_callback` is None when no checkpoint callback is
        # configured (e.g. checkpointing disabled), and a custom checkpoint
        # callback may not expose `best_model_path`. Read it defensively so the
        # test phase falls back to the current weights instead of raising
        # AttributeError.
        ckpt_path = getattr(trainer.checkpoint_callback, "best_model_path", "")
        if not ckpt_path:
            log.warning("Best ckpt not found! Using current weights for testing...")
            ckpt_path = None
        trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
        log.info(f"Best ckpt path: {ckpt_path}")
@hydra.main(version_base="1.3", config_path="../../configs", config_name="train.yaml")
def main(cfg: DictConfig):
    """Hydra entry point: register a ClearML task, print the config, then train.

    Args:
        cfg: Configuration composed by Hydra from ``train.yaml``.
    """
    # Turn struct mode off so the config can be modified afterwards.
    OmegaConf.set_struct(cfg, False)

    clearml_task = Task.init(
        project_name="fluoro-motion-detection",
        task_name="uniformer-test",
    )
    clearml_logger = clearml_task.get_logger()
    clearml_logger.report_text(
        "You can view your full hydra configuration under Configuration tab in the UI"
    )

    # Echo the composed configuration as YAML on stdout before starting the run.
    cfg_yaml = OmegaConf.to_yaml(cfg)
    print(cfg_yaml)

    train(cfg)
# Run the Hydra-decorated entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
None
See: Add an experiment hyperparameter:
and add `gpu: True`
I see, it seems like the --args for the script weren't passed to the docker container:
--script fluoro_motion_detection/src/run/main.py \
--args experiment=example.yaml \
It was pending the whole day yesterday, but today it is able to run the task.
@<1523701205467926528:profile|AgitatedDove14> I'm trying to run ClearML GPU compute (RTX 3080) with pytorch-lightning but keep getting a CUDA error. Is there any specific CUDA/Ubuntu/torch/python version required? I tried several different versions but can't make it work.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as telos_algorithms
File "/code/.venv/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 1013, in _run_stage
with isolate_rng():
File "/.pyenv/versions/3.10.9/lib/python3.10/contextlib.py", line 135, in __enter__
return next(self.gen)
File "/code/.venv/lib/python3.10/site-packages/lightning/pytorch/utilities/seed.py", line 42, in isolate_rng
states = _collect_rng_states(include_cuda)
File "/code/.venv/lib/python3.10/site-packages/lightning/fabric/utilities/seed.py", line 115, in _collect_rng_states
states["torch.cuda"] = torch.cuda.get_rng_state_all()
File "/code/.venv/lib/python3.10/site-packages/torch/cuda/random.py", line 39, in get_rng_state_all
results.append(get_rng_state(i))
File "/code/.venv/lib/python3.10/site-packages/torch/cuda/random.py", line 22, in get_rng_state
_lazy_init()
File "/code/.venv/lib/python3.10/site-packages/torch/cuda/__init__.py", line 247, in _lazy_init
torch._C._cuda_init()
RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 804: forward compatibility was attempted on non supported HW
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
And how did you connect your example.yaml?
Yes, because when a container is executed, the agent creates a new venv and inherits from the system wide installed packages, but it cannot inherit or "understand" there is an existing venv, and where it is.
but it is still not able to run any task after I abort and rerun another task
When you "run" a task you are pushing it to a queue, so how come a queue is empty? what happens after you push your newly cloned task to the queue ?
There is nothing on the queue and worker
Actually never mind, it's working now!
Okay, when I run main.py on my local machine, I can use python main.py experiment=example.yaml
to override the accelerator to the GPU option. But it seems like the --args experiment=example.yaml
in clearml-task didn't work, so do I have to manually modify it in the UI?
clearml-task \
--project fluoro-motion-detection \
--name uniformer-test \
--repo git@github.com:imperative-care-campbell/algorithms-python.git \
--branch SW-956-Fluoro-Motion-Detection \
--script fluoro_motion_detection/src/run/main.py \
--args experiment=example.yaml \
--docker mzhengtelos/algorithm-ml:pyenv \
--docker_args "--env CLEARML_AGENT_SKIP_PIP_VENV_INSTALL=$PYTHON_ENV_DIR --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
--queue test-gpu
I've added gpu:True to my hydra config file but the GPU is still not used
but it is still not able to run any task after I abort and rerun another task
@<1597762318140182528:profile|EnchantingPenguin77> can you provide the full log?
you should have a gpu argument there, set it to true
Thanks @<1523701205467926528:profile|AgitatedDove14> . I just got an issue running clearml-task remotely. It had been working fine before today, but now every time I run clearml-task, it shows pending, and after waiting for 3 hours the status is still pending. The autoscaler was charging the hourly rate even though the task had been pending for 3 hours. From the console log of the ClearML GPU instance, I saw it is listening to the queue, but there is no log even after 3 hours. There is nothing else I am running besides this one task, and it seems like the worker never spun up again.
2023-08-03 04:41:00,624 - clearml.Auto-Scaler - INFO - Spinning new instance resource='default', prefix='38ae71a80baf4a58893631d23c0c6e72_3090_1', queue='test-gpu'
2023-08-03 04:41:00,625 - clearml.Auto-Scaler - INFO - Creating instance for resource default
2023-08-03 04:41:01,027 - clearml.Auto-Scaler - INFO - New instance b97e702d-e2b3-4f28-adab-be59648601ea listening to test-gpu queue
That's the right place but
like you would use hydra --override, which in your case I think it should be "accelerator.gpu" ,
You can also change `allow_omegaconf_edit`
in the UI to True, and then you could just edit the OmegaConf in the UI (if you do not change
`allow_omegaconf_edit`, then the edit in the UI is ignored)
I got the same cuda issue after being able to use GPU
Thanks for the details @<1597762318140182528:profile|EnchantingPenguin77>
clearml.Auto-Scaler - INFO - New instance b97e702d-e2b3-4f28-adab-be59648601ea listening to test-gpu queue
This looks like a new agent was spun up on your EC2 account. Can you see it in the "Workers" page?
I did use --args to clearml-task command for this run, but it looks like the docker didn't take it
Well, I do not think you set your PyTorch Lightning to use CUDA:
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/code/.venv/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:176: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=1)`.
Click on the Task it is running and abort it, it seems to be stuck, I guess this is why the others are not pulled
Here it is @<1523701205467926528:profile|AgitatedDove14>
It seems like CPU is working on something, I saw the usage is spiking periodically but I didn't run any task this morning
is it displaying that it is running anything?