Unanswered
Hi, I have a ClearML experiment that failed to load its scalar plots after a few hours of training. When I look at the log locally with TensorBoard, it seems to work fine. Any idea what's going on?
This is how the task gets created:
def create_clearml_task(
        project_name,
        task_name,
        script,
        args,
        docker_args="",
        docker_image_name="<docker image name>",
        add_task_init_call=True,
        requirements_file=None,
        **kwargs):
    """Create a ClearML task for ``script`` via ``Task.create`` and return it.

    The task is only created here; enqueueing is left to the caller.

    Args:
        project_name: Passed to ``Task.create`` as ``project_name``.
        task_name: Passed to ``Task.create`` as ``task_name``.
        script: Passed to ``Task.create`` as ``script``.
        args: Parsed arguments; converted with ``args_to_tuples`` before
            being passed as ``argparse_args``.
        docker_args: Extra docker command-line flags appended after the
            fixed volume mounts and ``--shm-size`` flag. May be empty or
            ``None``.
        docker_image_name: Passed to ``Task.create`` as ``docker``.
        add_task_init_call: Passed through to ``Task.create``.
        requirements_file: Passed through to ``Task.create``. When it is
            ``None``, ``packages`` is filled from ``find_current_packages()``
            instead; otherwise ``packages`` is ``None``.
        **kwargs: Forwarded unchanged to ``Task.create``.

    Returns:
        Whatever ``Task.create`` returns.
    """
    print(
        "Creating task: project_name: {project_name}, task_name: {task_name}, script:{script} and args: \n {args}"
        .format(
            project_name=project_name,
            task_name=task_name,
            script=script,
            args=args,
        ))

    arg_tuples = args_to_tuples(args)
    # Remove the argument to execute on clearML before queueing up otherwise
    # we will just keep calling remote execution recursively without ever
    # doing the work.
    # NOTE(review): the return value is discarded, so this presumably mutates
    # arg_tuples in place - confirm against unset_clearml_execute.
    unset_clearml_execute(arg_tuples)

    # Join the fixed flags and the caller's flags with exactly one space.
    # Plain concatenation fused them together ("--shm-size 50G--gpus all")
    # whenever the caller's string did not start with whitespace.
    base_docker_args = (
        "-v /home:/home -v /data:/data -v /mnt:/mnt -v /etc/aws:/etc/aws "
        "--shm-size 50G"
    )
    extra_docker_args = docker_args.strip() if docker_args else ""
    if extra_docker_args:
        full_docker_args = base_docker_args + " " + extra_docker_args
    else:
        full_docker_args = base_docker_args

    return Task.create(
        argparse_args=arg_tuples,
        project_name=project_name,
        task_name=task_name,
        script=script,
        add_task_init_call=add_task_init_call,
        repo='git@<repo>.git',
        packages=find_current_packages() if requirements_file is None else None,
        requirements_file=requirements_file,
        docker=docker_image_name,
        commit=get_current_commit(),
        # bash_setup_string is not defined in this snippet; it is expected to
        # exist at module level.
        docker_bash_setup_script=bash_setup_string,
        docker_args=full_docker_args,
        **kwargs)
===============================================
# Remote-execution path: taken only when both a task name and a target queue
# were supplied. The task is created, enqueued, and the process then exits
# with status 0 without running the rest of the script.
if args.clearml_taskname is not None and args.clearml_execute is not None:
    # Forward every parsed argument except the queue selector itself.
    args_except_execute = dict(vars(args))
    args_except_execute.pop("clearml_execute", None)

    task = create_clearml_task(
        project_name=project_name,
        task_name=args.clearml_taskname,
        script="train.py",
        args=args_except_execute,
        docker_image_name=docker_img,
        requirements_file=requirements_file,
        add_task_init_call=False,
    )
    task.connect(config_dict)
    Task.enqueue(task, queue_name=args.clearml_execute)
    sys.exit(0)
# inside main:
# The training script calls Task.init itself with the project and task names.
# NOTE(review): the remote task is created with add_task_init_call=False,
# presumably because of this explicit call - confirm against the ClearML docs.
task = Task.init(project_name, clearml_taskname)
# Attach the configuration dictionary to the task, mirroring the
# task.connect(config_dict) call made before enqueueing.
task.connect(config_dict)
I import Task from clearml, and I also use PyTorch Lightning's TensorBoardLogger.
152 Views
0
Answers
one year ago
one year ago