@<1523701087100473344:profile|SuccessfulKoala55> Okay..but how can i specify agent’s verison in helm chart?
I run clearml-agent manually in gpu available pod using command clearml-agent daemon --queue shelley
and this doesn’t show gpu usage same with when i run task remotely
and here is the log
agent.worker_id =
agent.worker_name = shelley-gpu-pod
agent.force_git_ssh_protocol = false
agent.python_binary =
agent.package_manager.type = pip
agent.package_manager.pip_version.0 = <20.2 ; python_version < ‘3.10’
agent.package_manager.pip_version.1 = <22.3 ; python_version >= ‘3.10’
agent.package_manager.system_site_packages = false
agent.package_manager.force_upgrade = false
agent.package_manager.conda_channels.0 = pytorch
agent.package_manager.conda_channels.1 = conda-forge
agent.package_manager.conda_channels.2 = defaults
agent.package_manager.priority_optional_packages.0 = pygobject
agent.package_manager.torch_nightly = false
agent.package_manager.poetry_files_from_repo_working_dir = false
agent.venvs_dir = /root/.clearml/venvs-builds
agent.venvs_cache.max_entries = 10
agent.venvs_cache.free_space_threshold_gb = 2.0
agent.venvs_cache.path = ~/.clearml/venvs-cache
agent.vcs_cache.enabled = true
agent.vcs_cache.path = /root/.clearml/vcs-cache
agent.venv_update.enabled = false
agent.pip_download_cache.enabled = true
agent.pip_download_cache.path = /root/.clearml/pip-download-cache
agent.translate_ssh = true
agent.reload_config = false
agent.docker_pip_cache = /root/.clearml/pip-cache
agent.docker_apt_cache = /root/.clearml/apt-cache
agent.docker_force_pull = false
agent.default_docker.image = nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
agent.enable_task_env = false
agent.hide_docker_command_env_vars.enabled = true
agent.hide_docker_command_env_vars.parse_embedded_urls = true
agent.abort_callback_max_timeout = 1800
agent.docker_internal_mounts.sdk_cache = /clearml_agent_cache
agent.docker_internal_mounts.apt_cache = /var/cache/apt/archives
agent.docker_internal_mounts.ssh_folder = ~/.ssh
agent.docker_internal_mounts.ssh_ro_folder = /.ssh
agent.docker_internal_mounts.pip_cache = /root/.cache/pip
agent.docker_internal_mounts.poetry_cache = /root/.cache/pypoetry
agent.docker_internal_mounts.vcs_cache = /root/.clearml/vcs-cache
agent.docker_internal_mounts.venv_build = ~/.clearml/venvs-builds
agent.docker_internal_mounts.pip_download = /root/.clearml/pip-download-cache
agent.apply_environment = true
agent.apply_files = true
agent.custom_build_script =
agent.disable_task_docker_override = false
agent.default_python = 3.8
agent.cuda_version = 110
agent.cudnn_version = 0
api.version = 1.5
api.verify_certificate = true
api.default_version = 1.5
api.http.max_req_size = 15728640
api.http.retries.total = 240
api.http.retries.connect = 240
api.http.retries.read = 240
api.http.retries.redirect = 240
api.http.retries.status = 240
api.http.retries.backoff_factor = 1.0
api.http.retries.backoff_max = 120.0
api.http.wait_on_maintenance_forever = true
api.http.pool_maxsize = 512
api.http.pool_connections = 512
api.http.default_method = post
api.api_server = “”
api.files_server = “”
api.web_server = “”
api.credentials.access_key = “”
sdk.storage.cache.default_base_dir = ~/.clearml/cache
sdk.storage.cache.size.min_free_bytes = 10GB
sdk.storage.direct_access.0.url = file://*
sdk.metrics.file_history_size = 100
sdk.metrics.matplotlib_untitled_history_size = 100
sdk.metrics.images.format = JPEG
sdk.metrics.images.quality = 87
sdk.metrics.images.subsampling = 0
sdk.metrics.tensorboard_single_series_per_graph = false
sdk.network.metrics.file_upload_threads = 4
sdk.network.metrics.file_upload_starvation_warning_sec = 120
sdk.network.iteration.max_retries_on_server_error = 5
sdk.network.iteration.retry_backoff_factor_sec = 10
sdk.aws.s3.key =
sdk.aws.s3.region =
sdk.aws.boto3.pool_connections = 512
sdk.aws.boto3.max_multipart_concurrency = 16
sdk.log.null_log_propagate = false
sdk.log.task_log_buffer_capacity = 66
sdk.log.disable_urllib3_info = true
sdk.development.task_reuse_time_window_in_hours = 72.0
sdk.development.vcs_repo_detect_async = true
sdk.development.store_uncommitted_code_diff = true
sdk.development.support_stopping = true
sdk.development.default_output_uri =
sdk.development.force_analyze_entire_repo = false
sdk.development.suppress_update_message = false
sdk.development.detect_with_pip_freeze = false
sdk.development.worker.report_period_sec = 2
sdk.development.worker.ping_period_sec = 30
sdk.development.worker.log_stdout = true
sdk.development.worker.report_global_mem_used = false
I tried using K8S_GLUE_POD_AGENT_INSTALL_ARGS=1.5.3rc2
instead of CLEARML_AGENT_UPDATE_VERSION=1.5.3rc2
, but it’s same. doesn’t read gpu usage.. 🥲
Oh, It’s not the issue with eks.. We had the same issue on an on-premise cluster too(clearml-agent is installed). Could it be because of clearml-agent installed?
for more info, I set CLEARML_AGENT_UPDATE_VERSION=1.5.3rc2
` in agentk8sglue.basePodTemplate.env
I set CLEARML_AGENT_UPDATE_VERSION=1.5.3rc2
` in agentk8sglue.basePodTemplate.env as i mentioned
root@shelley-gpu-pod:/# clearml-agent daemon --queue shelley2 --foreground
/usr/local/lib/python3.8/dist-packages/requests/init.py:109: RequestsDependencyWarning: urllib3 (2.0.2) or chardet (None)/charset_normalizer (3.1.0) doesn’t match a supported version!
warnings.warn(
Using environment access key CLEARML_API_ACCESS_KEY=“”
Using environment secret key CLEARML_API_SECRET_KEY=********
Current configuration (clearml_agent v1.5.2, location: None):
agent.worker_id =
agent.worker_name = shelley-gpu-pod
agent.force_git_ssh_protocol = false
agent.python_binary =
agent.package_manager.type = pip
agent.package_manager.pip_version.0 = <20.2 ; python_version < ‘3.10’
agent.package_manager.pip_version.1 = <22.3 ; python_version >= ‘3.10’
agent.package_manager.system_site_packages = false
agent.package_manager.force_upgrade = false
agent.package_manager.conda_channels.0 = pytorch
agent.package_manager.conda_channels.1 = conda-forge
agent.package_manager.conda_channels.2 = defaults
agent.package_manager.priority_optional_packages.0 = pygobject
agent.package_manager.torch_nightly = false
agent.package_manager.poetry_files_from_repo_working_dir = false
agent.venvs_dir = /root/.clearml/venvs-builds
agent.venvs_cache.max_entries = 10
agent.venvs_cache.free_space_threshold_gb = 2.0
agent.venvs_cache.path = ~/.clearml/venvs-cache
agent.vcs_cache.enabled = true
agent.vcs_cache.path = /root/.clearml/vcs-cache
agent.venv_update.enabled = false
agent.pip_download_cache.enabled = true
agent.pip_download_cache.path = /root/.clearml/pip-download-cache
agent.translate_ssh = true
agent.reload_config = false
agent.docker_pip_cache = /root/.clearml/pip-cache
agent.docker_apt_cache = /root/.clearml/apt-cache
agent.docker_force_pull = false
agent.default_docker.image = nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
agent.enable_task_env = false
agent.hide_docker_command_env_vars.enabled = true
agent.hide_docker_command_env_vars.parse_embedded_urls = true
agent.abort_callback_max_timeout = 1800
agent.docker_internal_mounts.sdk_cache = /clearml_agent_cache
agent.docker_internal_mounts.apt_cache = /var/cache/apt/archives
agent.docker_internal_mounts.ssh_folder = ~/.ssh
agent.docker_internal_mounts.ssh_ro_folder = /.ssh
agent.docker_internal_mounts.pip_cache = /root/.cache/pip
agent.docker_internal_mounts.poetry_cache = /root/.cache/pypoetry
agent.docker_internal_mounts.vcs_cache = /root/.clearml/vcs-cache
agent.docker_internal_mounts.venv_build = ~/.clearml/venvs-builds
agent.docker_internal_mounts.pip_download = /root/.clearml/pip-download-cache
agent.apply_environment = true
agent.apply_files = true
agent.custom_build_script =
agent.disable_task_docker_override = false
agent.default_python = 3.8
agent.cuda_version = 110
agent.cudnn_version = 0
api.version = 1.5
api.verify_certificate = true
api.default_version = 1.5
api.http.max_req_size = 15728640
api.http.retries.total = 240
api.http.retries.connect = 240
api.http.retries.read = 240
api.http.retries.redirect = 240
api.http.retries.status = 240
api.http.retries.backoff_factor = 1.0
api.http.retries.backoff_max = 120.0
api.http.wait_on_maintenance_forever = true
api.http.pool_maxsize = 512
api.http.pool_connections = 512
api.http.default_method = post
api.api_server = “”
api.files_server = “”
api.web_server = “”
api.credentials.access_key = “”
sdk.storage.cache.default_base_dir = ~/.clearml/cache
sdk.storage.cache.size.min_free_bytes = 10GB
sdk.storage.direct_access.0.url = file://*
sdk.metrics.file_history_size = 100
sdk.metrics.matplotlib_untitled_history_size = 100
sdk.metrics.images.format = JPEG
sdk.metrics.images.quality = 87
sdk.metrics.images.subsampling = 0
sdk.metrics.tensorboard_single_series_per_graph = false
sdk.network.metrics.file_upload_threads = 4
sdk.network.metrics.file_upload_starvation_warning_sec = 120
sdk.network.iteration.max_retries_on_server_error = 5
sdk.network.iteration.retry_backoff_factor_sec = 10
sdk.aws.s3.key =
sdk.aws.s3.region =
sdk.aws.boto3.pool_connections = 512
sdk.aws.boto3.max_multipart_concurrency = 16
sdk.log.null_log_propagate = false
sdk.log.task_log_buffer_capacity = 66
sdk.log.disable_urllib3_info = true
sdk.development.task_reuse_time_window_in_hours = 72.0
sdk.development.vcs_repo_detect_async = true
sdk.development.store_uncommitted_code_diff = true
sdk.development.support_stopping = true
sdk.development.default_output_uri =
sdk.development.force_analyze_entire_repo = false
sdk.development.suppress_update_message = false
sdk.development.detect_with_pip_freeze = false
sdk.development.worker.report_period_sec = 2
sdk.development.worker.ping_period_sec = 30
sdk.development.worker.log_stdout = true
sdk.development.worker.report_global_mem_used = false
Worker “shelley-gpu-pod:gpuGPU-207fb7da-a426-da69-7b06-3bc1ec482b7e” - Listening to queues:
+----------------------------------+----------+-------+
| id | name | tags |
+----------------------------------+----------+-------+
| 2b8bfe0dc9ae4a9f9d7bd0dd4f00ae16 | shelley2 | |
+----------------------------------+----------+-------+
No tasks in queue 2b8bfe0dc9ae4a9f9d7bd0dd4f00ae16
No tasks in Queues, sleeping for 5.0 seconds
Hi @<1524922424720625664:profile|TartLeopard58> , how is the agent running? is it running in k8s as well?
@<1524922424720625664:profile|TartLeopard58> did you include the task log after setting the agent to v1.5.3rc2?
It looks like the log was truncated - can you please try to get all the log from running the agent in the cloud machine?
Hi again 😊 @<1523701087100473344:profile|SuccessfulKoala55> sure!
heres is the log when executing with --foreground. but is there any difference?
This is clearml-agent helm chart values.yaml file i used to install
@<1524922424720625664:profile|TartLeopard58> this might be related to the specific AMI/docker image in which the agent is running... can you use agent version v 1.5.3rc2 ?
nope. just running “clearml-agent daemon --queue shelley”
@<1524922424720625664:profile|TartLeopard58> the agent running the task is v1.5.2 (as shown in the log), so the whole point is lost - we need to see the v1.5.3rc2 or v1.5.3rc3 running there... how did you set up the helm chart for the new agent?
Can you share the k8s logs of the pod running the agent? (not the task)
Are there other people experiencing the same issue as me?
it is working on on-premise machine(i can see gpu usage on WORKERS & QUEUES Dashboard). but it is not working on cloud pod
I tried the suggestion you mentioned, but it’s the same. And it doesn’t seem to be an AMI issue. The same problem is occurring even in an on-premise environment.
@<1523701087100473344:profile|SuccessfulKoala55> I realized that this is not an issue with the cloud or on-premise environment. it’s working well on gke but not working on eks. here is the log when i run “clearml-agent daemon --queue ~” command on eks
root@shelley-gpu-pod:/# clearml-agent daemon --queue shelley3
/usr/local/lib/python3.8/dist-packages/requests/init.py:109: RequestsDependencyWarning: urllib3 (2.0.1) or chardet (None)/charset_normalizer (3.1.0) doesn’t match a supported version!
warnings.warn(
Using environment access key CLEARML_API_ACCESS_KEY=“”
Using environment secret key CLEARML_API_SECRET_KEY=********
Current configuration (clearml_agent v1.5.2, location: None):
agent.worker_id =
agent.worker_name = shelley-gpu-pod
agent.force_git_ssh_protocol = false
agent.python_binary =
agent.package_manager.type = pip
agent.package_manager.pip_version.0 = <20.2 ; python_version < ‘3.10’
agent.package_manager.pip_version.1 = <22.3 ; python_version >= ‘3.10’
agent.package_manager.system_site_packages = false
agent.package_manager.force_upgrade = false
agent.package_manager.conda_channels.0 = pytorch
agent.package_manager.conda_channels.1 = conda-forge
agent.package_manager.conda_channels.2 = defaults
agent.package_manager.priority_optional_packages.0 = pygobject
agent.package_manager.torch_nightly = false
agent.package_manager.poetry_files_from_repo_working_dir = false
agent.venvs_dir = /root/.clearml/venvs-builds
agent.venvs_cache.max_entries = 10
agent.venvs_cache.free_space_threshold_gb = 2.0
agent.venvs_cache.path = ~/.clearml/venvs-cache
agent.vcs_cache.enabled = true
agent.vcs_cache.path = /root/.clearml/vcs-cache
agent.venv_update.enabled = false
agent.pip_download_cache.enabled = true
agent.pip_download_cache.path = /root/.clearml/pip-download-cache
agent.translate_ssh = true
agent.reload_config = false
agent.docker_pip_cache = /root/.clearml/pip-cache
agent.docker_apt_cache = /root/.clearml/apt-cache
agent.docker_force_pull = false
agent.default_docker.image = nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04
agent.enable_task_env = false
agent.hide_docker_command_env_vars.enabled = true
agent.hide_docker_command_env_vars.parse_embedded_urls = true
agent.abort_callback_max_timeout = 1800
agent.docker_internal_mounts.sdk_cache = /clearml_agent_cache
agent.docker_internal_mounts.apt_cache = /var/cache/apt/archives
agent.docker_internal_mounts.ssh_folder = ~/.ssh
agent.docker_internal_mounts.ssh_ro_folder = /.ssh
agent.docker_internal_mounts.pip_cache = /root/.cache/pip
agent.docker_internal_mounts.poetry_cache = /root/.cache/pypoetry
agent.docker_internal_mounts.vcs_cache = /root/.clearml/vcs-cache
agent.docker_internal_mounts.venv_build = ~/.clearml/venvs-builds
agent.docker_internal_mounts.pip_download = /root/.clearml/pip-download-cache
agent.apply_environment = true
agent.apply_files = true
agent.custom_build_script =
agent.disable_task_docker_override = false
agent.default_python = 3.8
agent.cuda_version = 110
agent.cudnn_version = 0
api.version = 1.5
api.verify_certificate = true
api.default_version = 1.5
api.http.max_req_size = 15728640
api.http.retries.total = 240
api.http.retries.connect = 240
api.http.retries.read = 240
api.http.retries.redirect = 240
api.http.retries.status = 240
api.http.retries.backoff_factor = 1.0
api.http.retries.backoff_max = 120.0
api.http.wait_on_maintenance_forever = true
api.http.pool_maxsize = 512
api.http.pool_connections = 512
api.http.default_method = post
api.api_server = “”
api.files_server = “”
api.web_server = “”
api.credentials.access_key = “”
sdk.storage.cache.default_base_dir = ~/.clearml/cache
sdk.storage.cache.size.min_free_bytes = 10GB
sdk.storage.direct_access.0.url = file://*
sdk.metrics.file_history_size = 100
sdk.metrics.matplotlib_untitled_history_size = 100
sdk.metrics.images.format = JPEG
sdk.metrics.images.quality = 87
sdk.metrics.images.subsampling = 0
sdk.metrics.tensorboard_single_series_per_graph = false
sdk.network.metrics.file_upload_threads = 4
sdk.network.metrics.file_upload_starvation_warning_sec = 120
sdk.network.iteration.max_retries_on_server_error = 5
sdk.network.iteration.retry_backoff_factor_sec = 10
sdk.aws.s3.key =
sdk.aws.s3.region =
sdk.aws.boto3.pool_connections = 512
sdk.aws.boto3.max_multipart_concurrency = 16
sdk.log.null_log_propagate = false
sdk.log.task_log_buffer_capacity = 66
sdk.log.disable_urllib3_info = true
sdk.development.task_reuse_time_window_in_hours = 72.0
sdk.development.vcs_repo_detect_async = true
sdk.development.store_uncommitted_code_diff = true
sdk.development.support_stopping = true
sdk.development.default_output_uri =
sdk.development.force_analyze_entire_repo = false
sdk.development.suppress_update_message = false
sdk.development.detect_with_pip_freeze = false
sdk.development.worker.report_period_sec = 2
sdk.development.worker.ping_period_sec = 30
sdk.development.worker.log_stdout = true
sdk.development.worker.report_global_mem_used = false
Worker “shelley-gpu-pod:gpuGPU-a8a68c42-d19b-c677-5fd3-889bdce415fb” - Listening to queues:
+----------------------------------+----------+-------+
| id | name | tags |
+----------------------------------+----------+-------+
| 1a63b1506e1d4ba4b6ca290a63eceb6b | shelley3 | |
+----------------------------------+----------+-------+
Running CLEARML-AGENT daemon in background mode, writing stdout/stderr to /tmp/.clearml_agent_daemon_outj1su2mo5.txt
You can do that by passing the CLEARML_AGENT_UPDATE_VERSION=1.5.3rc2
env var