Unanswered
Hi all, I've successfully run a task locally, and now I'm trying to clone it and send it to a queue. It looks like the environment is built successfully, but it hangs here:
Here's what the agent was logging:
anjum.sayed@M209886 clearml-agent --debug daemon --queue default
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.clearml.dev.mrl:443
DEBUG:urllib3.connectionpool:
"PUT /auth.login HTTP/1.1" 200 603
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.clearml.dev.mrl:443
DEBUG:urllib3.connectionpool:
"PUT /v2.5/queues.get_all HTTP/1.1" 200 344
DEBUG:urllib3.connectionpool:
"PUT /v2.5/queues.get_all HTTP/1.1" 200 332
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): updates.clear.ml:443
DEBUG:clearml_agent.session:Run by interpreter: C:\Users\anjum.sayed\AppData\Local\Programs\Python\Python39\python.exe
Current configuration (clearml_agent v1.8.1, location: C:\Users\anjum.sayed/clearml.conf):
----------------------
agent.worker_id =
agent.worker_name = M209886
agent.force_git_ssh_protocol = true
agent.python_binary =
agent.package_manager.type = pip
agent.package_manager.pip_version.0 = <20.2 ; python_version < '3.10'
agent.package_manager.pip_version.1 = <22.3 ; python_version >= '3.10'
agent.package_manager.system_site_packages = false
agent.package_manager.force_upgrade = false
agent.package_manager.conda_channels.0 = pytorch
agent.package_manager.conda_channels.1 = conda-forge
agent.package_manager.conda_channels.2 = nvidia
agent.package_manager.conda_channels.3 = defaults
agent.package_manager.priority_optional_packages.0 = pygobject
agent.package_manager.torch_nightly = false
agent.package_manager.poetry_files_from_repo_working_dir = false
agent.venvs_dir = C:/Users/anjum.sayed/.clearml/venvs-builds
agent.venvs_cache.max_entries = 10
agent.venvs_cache.free_space_threshold_gb = 2.0
agent.venvs_cache.path = ~/.clearml/venvs-cache
agent.vcs_cache.enabled = true
agent.vcs_cache.path = C:/Users/anjum.sayed/.clearml/vcs-cache
agent.venv_update.enabled = false
agent.pip_download_cache.enabled = true
agent.pip_download_cache.path = C:/Users/anjum.sayed/.clearml/pip-download-cache
agent.translate_ssh = true
agent.reload_config = false
agent.docker_pip_cache = C:/Users/anjum.sayed/.clearml/pip-cache
agent.docker_apt_cache = C:/Users/anjum.sayed/.clearml/apt-cache
agent.docker_force_pull = false
agent.default_docker.image = nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04
agent.enable_task_env = false
agent.sanitize_config_printout = ****
agent.hide_docker_command_env_vars.enabled = true
agent.hide_docker_command_env_vars.parse_embedded_urls = true
agent.abort_callback_max_timeout = 1800
agent.docker_internal_mounts.sdk_cache = /clearml_agent_cache
agent.docker_internal_mounts.apt_cache = /var/cache/apt/archives
agent.docker_internal_mounts.ssh_folder = ~/.ssh
agent.docker_internal_mounts.ssh_ro_folder = /.ssh
agent.docker_internal_mounts.pip_cache = /root/.cache/pip
agent.docker_internal_mounts.poetry_cache = /root/.cache/pypoetry
agent.docker_internal_mounts.vcs_cache = /root/.clearml/vcs-cache
agent.docker_internal_mounts.venv_build = ~/.clearml/venvs-builds
agent.docker_internal_mounts.pip_download = /root/.clearml/pip-download-cache
agent.apply_environment = true
agent.apply_files = true
agent.custom_build_script =
agent.disable_task_docker_override = false
agent.git_user =
agent.git_pass = ****
agent.git_host =
agent.debug = true
agent.default_python = 3.9
agent.cuda_version = 123
agent.cudnn_version = 0
api.version = 1.5
api.verify_certificate = true
api.default_version = 1.5
api.http.max_req_size = 15728640
api.http.retries.total = 240
api.http.retries.connect = 240
api.http.retries.read = 240
api.http.retries.redirect = 240
api.http.retries.status = 240
api.http.retries.backoff_factor = 1.0
api.http.retries.backoff_max = 120.0
api.http.wait_on_maintenance_forever = true
api.http.pool_maxsize = 512
api.http.pool_connections = 512
api.http.default_method = put
api.auth.token_expiration_threshold_sec = ****
api.api_server =
api.web_server =
api.files_server =
api.credentials.access_key = 1N33K4IXUYO64HVT4S3PXVDIX4K2CS
api.credentials.secret_key = ****
api.host =
sdk.storage.cache.default_base_dir = ~/.clearml/cache
sdk.storage.cache.size.min_free_bytes = 10GB
sdk.storage.direct_access.0.url = file://*
sdk.metrics.file_history_size = 100
sdk.metrics.matplotlib_untitled_history_size = 100
sdk.metrics.images.format = JPEG
sdk.metrics.images.quality = 87
sdk.metrics.images.subsampling = 0
sdk.metrics.tensorboard_single_series_per_graph = false
sdk.network.metrics.file_upload_threads = 4
sdk.network.metrics.file_upload_starvation_warning_sec = 120
sdk.network.iteration.max_retries_on_server_error = 5
sdk.network.iteration.retry_backoff_factor_sec = 10
sdk.network.file_upload_retries = 3
sdk.aws.s3.key =
sdk.aws.s3.secret = ****
sdk.aws.s3.region =
sdk.aws.s3.use_credentials_chain = false
sdk.aws.boto3.pool_connections = 512
sdk.aws.boto3.max_multipart_concurrency = 16
sdk.aws.boto3.multipart_threshold = 8388608
sdk.aws.boto3.multipart_chunksize = 8388608
sdk.log.null_log_propagate = false
sdk.log.task_log_buffer_capacity = 66
sdk.log.disable_urllib3_info = true
sdk.development.task_reuse_time_window_in_hours = 72.0
sdk.development.vcs_repo_detect_async = true
sdk.development.store_uncommitted_code_diff = true
sdk.development.support_stopping = true
sdk.development.default_output_uri =
sdk.development.force_analyze_entire_repo = false
sdk.development.suppress_update_message = false
sdk.development.detect_with_pip_freeze = false
sdk.development.worker.report_period_sec = 2
sdk.development.worker.ping_period_sec = 30
sdk.development.worker.log_stdout = true
sdk.development.worker.report_global_mem_used = false
sdk.development.worker.report_event_flush_threshold = 100
sdk.development.worker.console_cr_flush_period = 10
sdk.apply_environment = false
sdk.apply_files = false
DEBUG:clearml_agent.commands.worker:starting resource monitor thread
Worker "M209886:0" - Listening to queues:
+----------------------------------+---------+-------+
| id | name | tags |
+----------------------------------+---------+-------+
| 3e9973e15a6048c5ae5419ea7d097f9c | default | |
+----------------------------------+---------+-------+
DEBUG:urllib3.connectionpool:
"PUT /workers.register HTTP/1.1" 200 278
Running CLEARML-AGENT daemon in background mode, writing stdout/stderr to C:\Users\ANJUM~1.SAY\AppData\Local\Temp\.clearml_agent_daemon_outg5aq488v.txt
DEBUG:urllib3.connectionpool:
"PUT /v2.5/queues.get_all HTTP/1.1" 200 337
DEBUG:urllib3.connectionpool:
"PUT /workers.get_runtime_properties HTTP/1.1" 404 371
DEBUG:urllib3.connectionpool:
"PUT /v2.14/queues.get_next_task HTTP/1.1" 200 282
.................. truncating due to Slack char limit.........
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.5/tasks.ping HTTP/1.1" 200 271
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"POST /events.add_batch HTTP/1.1" 200 315
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /workers.status_report HTTP/1.1" 200 283
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /workers.status_report HTTP/1.1" 200 283
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.5/tasks.ping HTTP/1.1" 200 271
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.14/tasks.get_all HTTP/1.1" 200 363
DEBUG:urllib3.connectionpool:
"PUT /v2.5/tasks.get_by_id HTTP/1.1" 200 3490
DEBUG:urllib3.connectionpool:
"PUT /v2.5/tasks.stopped HTTP/1.1" 200 304
INFO:clearml_agent.commands.worker:Task process terminated
INFO:clearml_agent.commands.worker:Task interrupted: stopping
DEBUG:urllib3.connectionpool:
"POST /events.add_batch HTTP/1.1" 200 315
DEBUG:urllib3.connectionpool:
"PUT /v2.5/tasks.stopped HTTP/1.1" 200 333
DEBUG:urllib3.connectionpool:
"PUT /workers.status_report HTTP/1.1" 200 283
DEBUG:urllib3.connectionpool:
"PUT /v2.5/queues.get_all HTTP/1.1" 200 337
DEBUG:urllib3.connectionpool:
"PUT /v2.14/queues.get_next_task HTTP/1.1" 200 282
DEBUG:urllib3.connectionpool:
"PUT /workers.unregister HTTP/1.1" 200 280
DEBUG:urllib3.connectionpool:
"PUT /workers.unregister HTTP/1.1" 200 280
37 Views
0
Answers
3 months ago
3 months ago