If I now reset the experiment, and enqueue the experiment to the gpu queue (but in the experimet, the user-properties configuration for k8s-glue is still set to cpu) the experiment is left in a Pending state... and in the K8sGlue Agent for the gpu queue, I can see a similar error as the one in the cpu agent....
` No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
FATAL ERROR:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 710, in urlopen
chunked=chunked,
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 398, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/local/lib/python3.6/dist-packages/urllib3/connection.py", line 239, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/usr/lib/python3.6/http/client.py", line 1285, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1331, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1280, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1085, in _send_output
self.send(chunk)
File "/usr/lib/python3.6/http/client.py", line 1006, in send
self.sock.sendall(data)
File "/usr/lib/python3.6/ssl.py", line 975, in sendall
v = self.send(byte_view[count:])
File "/usr/lib/python3.6/ssl.py", line 944, in send
return self._sslobj.write(data)
File "/usr/lib/python3.6/ssl.py", line 642, in write
return self._sslobj.write(data)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 786, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/usr/local/lib/python3.6/dist-packages/urllib3/util/retry.py", line 550, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.6/dist-packages/urllib3/packages/six.py", line 769, in reraise
raise value.with_traceback(tb)
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 710, in urlopen
chunked=chunked,
File "/usr/local/lib/python3.6/dist-packages/urllib3/connectionpool.py", line 398, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/usr/local/lib/python3.6/dist-packages/urllib3/connection.py", line 239, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/usr/lib/python3.6/http/client.py", line 1285, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1331, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1280, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/usr/lib/python3.6/http/client.py", line 1085, in _send_output
self.send(chunk)
File "/usr/lib/python3.6/http/client.py", line 1006, in send
self.sock.sendall(data)
File "/usr/lib/python3.6/ssl.py", line 975, in sendall
v = self.send(byte_view[count:])
File "/usr/lib/python3.6/ssl.py", line 944, in send
return self._sslobj.write(data)
File "/usr/lib/python3.6/ssl.py", line 642, in write
return self._sslobj.write(data)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/commands/worker.py", line 1449, in daemon
gpu_queues=dynamic_gpus,
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/glue/k8s.py", line 774, in run_tasks_loop
level="INFO",
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/commands/events.py", line 99, in send_log_events
return self.send_events(list_events=log_events, session=session)
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/commands/events.py", line 58, in send_events
sent_events += send_packet(lines)
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/commands/events.py", line 32, in send_packet
'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/commands/base.py", line 127, in post
return session.post(service=self.service, action=endpoint, *args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/session.py", line 300, in post
json=json or kwargs)
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/session.py", line 308, in _manual_request
json=json or kwargs)
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/backend_api/session/session.py", line 363, in send_request
json=json,
File "/usr/local/lib/python3.6/dist-packages/clearml_agent/backend_api/session/session.py", line 291, in _send_request
method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout)
File "/usr/local/lib/python3.6/dist-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.6/dist-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7
No tasks in Queues, sleeping for 5.0 seconds
No tasks in queue 75174e0e7ac047f195ab4dce6e9f03f7 `