The problem is that clearml installs cudatoolkit=11.0
but cudatoolkit=11.1
is needed. By setting agent.cuda_version=11.1
in clearml.conf
it uses the correct version and installs fine. With version 11.0
conda will resolve conflicts by installing pytorch cpu-version.
channels:
- pytorch
- conda-forge
- defaults
dependencies:
- cudatoolkit~=11.1.1
- pytorch~=1.8.0
Works fine
(This is why we recommend using pip, because it is stable and clearml-agent takes care of pytorch/cuda verions)
==> 2021-03-11 13:54:59 <==
# cmd: /home/tim/miniconda3/condabin/conda create --yes --mkdir --prefix /home/tim/.clearml/venvs-builds/3.8 python=3.8
# conda version: 4.9.2
+defaults/linux-64::_libgcc_mutex-0.1-main
+defaults/linux-64::ca-certificates-2021.1.19-h06a4308_1
+defaults/linux-64::certifi-2020.12.5-py38h06a4308_0
+defaults/linux-64::ld_impl_linux-64-2.33.1-h53a641e_7
+defaults/linux-64::libedit-3.1.20191231-h14c3975_1
+defaults/linux-64::libffi-3.3-he6710b0_2
+defaults/linux-64::libgcc-ng-9.1.0-hdf63c60_0
+defaults/linux-64::libstdcxx-ng-9.1.0-hdf63c60_0
+defaults/linux-64::ncurses-6.2-he6710b0_1
+defaults/linux-64::openssl-1.1.1j-h27cfd23_0
+defaults/linux-64::pip-21.0.1-py38h06a4308_0
+defaults/linux-64::python-3.8.8-hdb3f193_4
+defaults/linux-64::readline-8.1-h27cfd23_0
+defaults/linux-64::setuptools-52.0.0-py38h06a4308_0
+defaults/linux-64::sqlite-3.33.0-h62c20be_0
+defaults/linux-64::tk-8.6.10-hbc83047_0
+defaults/linux-64::xz-5.2.5-h7b6447c_0
+defaults/linux-64::zlib-1.2.11-h7b6447c_3
+defaults/noarch::wheel-0.36.2-pyhd3eb1b0_0
# update specs: ['python=3.8']
==> 2021-03-11 13:55:01 <==
# cmd: /home/tim/miniconda3/condabin/conda install -p /home/tim/.clearml/venvs-builds/3.8 -c defaults -c conda-forge -c pytorch cudatoolkit=11.1 --quiet --json
# conda version: 4.9.2
-defaults/linux-64::_libgcc_mutex-0.1-main
-defaults/linux-64::libgcc-ng-9.1.0-hdf63c60_0
-defaults/linux-64::libstdcxx-ng-9.1.0-hdf63c60_0
+conda-forge/linux-64::_libgcc_mutex-0.1-conda_forge
+conda-forge/linux-64::_openmp_mutex-4.5-1_gnu
+conda-forge/linux-64::cudatoolkit-11.1.1-h6406543_8
+conda-forge/linux-64::libgcc-ng-9.3.0-h2828fa1_18
+conda-forge/linux-64::libgomp-9.3.0-h2828fa1_18
+conda-forge/linux-64::libstdcxx-ng-9.3.0-h6de172a_18
# update specs: ['cudatoolkit=11.1']
==> 2021-03-11 13:55:06 <==
# cmd: /home/tim/miniconda3/condabin/conda install -p /home/tim/.clearml/venvs-builds/3.8 -c defaults -c conda-forge -c pytorch pip<20.2 --quiet --json
# conda version: 4.9.2
-defaults/linux-64::pip-21.0.1-py38h06a4308_0
+conda-forge/linux-64::pip-20.0.2-py38_1
# update specs: ["pip[version='<20.2']"]
==> 2021-03-11 13:55:21 <==
# cmd: /home/tim/miniconda3/bin/conda-env update -p /home/tim/.clearml/venvs-builds/3.8 --file /tmp/conda_envu_x8060h.yml --quiet --json
# conda version: 4.9.2
-conda-forge/linux-64::_openmp_mutex-4.5-1_gnu
-defaults/linux-64::ca-certificates-2021.1.19-h06a4308_1
+conda-forge/linux-64::_openmp_mutex-4.5-1_llvm
+conda-forge/linux-64::ffmpeg-4.3.1-h03821db_2
+conda-forge/linux-64::gnutls-3.6.13-h85f3911_1
+conda-forge/linux-64::libblas-3.9.0-8_mkl
+conda-forge/linux-64::libiconv-1.16-h516909a_0
+conda-forge/linux-64::libprotobuf-3.15.5-h780b84a_0
+conda-forge/linux-64::libuv-1.41.0-h7f98852_0
+conda-forge/linux-64::llvm-openmp-11.0.1-h4bd325d_0
+conda-forge/linux-64::mkl-2020.4-h726a3e6_304
+conda-forge/linux-64::mkl_random-1.2.0-py38hc5bc63f_1
+conda-forge/linux-64::nettle-3.6-he412f7d_0
+conda-forge/linux-64::openh264-2.1.1-h780b84a_0
+conda-forge/linux-64::python_abi-3.8-1_cp38
+conda-forge/linux-64::pytorch-1.8.0-cpu_py38he614459_0
+conda-forge/linux-64::sleef-3.5.1-h7f98852_1
+conda-forge/linux-64::zstd-1.4.9-ha95c52a_0
+defaults/linux-64::blas-1.0-mkl
+defaults/linux-64::bzip2-1.0.8-h7b6447c_0
+defaults/linux-64::ca-certificates-2020.12.8-h06a4308_1
+defaults/linux-64::cffi-1.14.5-py38h261ae71_0
+defaults/linux-64::freetype-2.10.4-h5ab3b9f_0
+defaults/linux-64::future-0.18.2-py38_1
+defaults/linux-64::gmp-6.2.1-h2531618_2
+defaults/linux-64::jpeg-9b-h024ee3a_2
+defaults/linux-64::lame-3.100-h7b6447c_0
+defaults/linux-64::lcms2-2.11-h396b838_0
+defaults/linux-64::libpng-1.6.37-hbc83047_0
+defaults/linux-64::libtiff-4.1.0-h2733197_1
+defaults/linux-64::lz4-c-1.9.3-h2531618_0
+defaults/linux-64::mkl-service-2.3.0-py38he904b0f_0
+defaults/linux-64::mkl_fft-1.3.0-py38h54f3939_0
+defaults/linux-64::ninja-1.10.2-py38hff7bd54_0
+defaults/linux-64::numpy-1.19.2-py38h54aff64_0
+defaults/linux-64::numpy-base-1.19.2-py38hfa32c7d_0
+defaults/linux-64::pillow-8.1.2-py38he98fc37_0
+defaults/linux-64::six-1.15.0-py38h06a4308_0
+defaults/linux-64::x264-1!152.20180806-h7b6447c_0
+defaults/noarch::olefile-0.46-py_0
+defaults/noarch::pycparser-2.20-py_2
+defaults/noarch::typing_extensions-3.7.4.3-pyha847dfd_0
+pytorch/linux-64::torchaudio-0.8.0-py38
+pytorch/linux-64::torchvision-0.9.0-py38_cu111
# update specs: ['ld_impl_linux-64~=2.33.1', 'libpng~=1.6.37', 'ninja~=1.10.2', 'pytorch~=1.8.0', 'zstd~=1.4.9', 'gmp~=6.2.1', 'python~=3.8.8', 'torchaudio~=0.8.0', 'libtiff~=4.1.0', 'mkl-service~=2.3.0', 'typing_extensions~=3.7.4.3', 'llvm-openmp~=11.0.1', 'python_abi~=3.8', 'readline~=8.1', 'jpeg~=9b.0', 'libedit~=3.1.20191231', 'mkl_random~=1.2.0', 'numpy~=1.19.2', 'openssl~=1.1.1j', 'pillow~=8.1.2', 'blas~=1.0', 'setuptools~=52.0.0', 'tk~=8.6.10', 'ffmpeg~=4.3', 'lz4-c~=1.9.3', 'xz~=5.2.5', 'ncurses~=6.2', 'lame~=3.100', 'libgcc-ng~=9.3.0', 'libffi~=3.3', 'six~=1.15.0', 'certifi~=2020.12.5', 'libuv~=1.41.0', 'gnutls~=3.6.13', 'torchvision~=0.9.0', 'sqlite~=3.33.0', 'libstdcxx-ng~=9.3.0', 'olefile~=0.46', 'openh264~=2.1.1', 'libiconv~=1.16', 'ca-certificates~=2020.12.5', 'cudatoolkit~=11.1.1', 'mkl_fft~=1.3.0', 'freetype~=2.10.4', 'numpy-base~=1.19.2', 'wheel~=0.36.2', 'mkl~=2020.4', 'nettle~=3.6', 'lcms2~=2.11', 'bzip2~=1.0.8', 'zlib~=1.2.11']
One more thing: The cuda_version that clearml finds automatically is wrong.
Perfect! I have to thank you for helping me, not the other way around!
Hurray conda.
Notice it does include cudatoolkit , but conda ignores it
cudatoolkit~=11.1.1
Can you test the same one only serach and replace ~= with == ?
By host you mean the machine on which the agent is running? How does clearml-agent find the cuda_version?
name: core
channels:
- pytorch
- anaconda
- conda-forge
- defaults
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- blas=1.0
- bzip2=1.0.8
- ca-certificates=2020.10.14
- certifi=2020.6.20
- cloudpickle=1.6.0
- cudatoolkit=11.1.1
- cycler=0.10.0
- cytoolz=0.11.0
- dask-core=2021.2.0
- decorator=4.4.2
- ffmpeg=4.3
- freetype=2.10.4
- gmp=6.2.1
- gnutls=3.6.13
- imageio=2.9.0
- jpeg=9b
- kiwisolver=1.3.1
- lame=3.100
- lcms2=2.11
- ld_impl_linux-64=2.33.1
- libedit=3.1.20191231
- libffi=3.3
- libgcc-ng=9.3.0
- libgfortran-ng=7.3.0
- libiconv=1.16
- libpng=1.6.37
- libstdcxx-ng=9.3.0
- libtiff=4.1.0
- libuv=1.41.0
- llvm-openmp=11.0.1
- lz4-c=1.9.3
- matplotlib-base=3.3.4
- mkl=2020.4
- mkl-service=2.3.0
- mkl_fft=1.3.0
- mkl_random=1.2.0
- ncurses=6.2
- nettle=3.6
- networkx=2.5
- ninja=1.10.2
- numpy=1.19.2
- numpy-base=1.19.2
- olefile=0.46
- openh264=2.1.1
- openssl=1.1.1j
- pip=21.0.1
- pyparsing=2.4.7
- python=3.7.10
- python-dateutil=2.8.1
- python_abi=3.7
- pytorch=1.8.0
- pywavelets=1.1.1
- readline=8.1
- scikit-image=0.17.2
- scipy=1.6.1
- setuptools=52.0.0
- six=1.15.0
- sqlite=3.33.0
- tifffile=2020.10.1
- tk=8.6.10
- toolz=0.11.1
- torchaudio=0.8.0
- torchvision=0.9.0
- tornado=6.1
- typing_extensions=3.7.4.3
- wheel=0.36.2
- xz=5.2.5
- yaml=0.2.5
- zlib=1.2.11
- zstd=1.4.9
- pip:
- aiostream==0.4.2
- attrs==20.3.0
- clearml==0.17.4
- dm-control==0.0.355168290
- dm-env==1.4
- furl==2.1.0
- future==0.18.2
- glfw==2.1.0
- gym==0.18.0
- humanfriendly==9.1
- imageio-ffmpeg==0.4.3
- jsonschema==3.2.0
- labmaze==1.0.3
- lxml==4.6.2
- moviepy==1.0.3
- orderedmultidict==1.0.1
- pathlib2==2.3.5
- pillow==7.2.0
- proglog==0.1.9
- psutil==5.8.0
- pybullet==3.0.9
- pygame==2.0.1
- pyglet==1.5.0
- pyjwt==2.0.1
- pyrsistent==0.17.3
- requests-file==1.5.1
- tensorboard==2.4.1
- tensorboardx==2.1
Perfect, will try it. fyi: The conda_channels that I used are from clearml-agent init
I mean the version which it bases the PyTorch installation on.
My driver says "CUDA Version: 11.2" (I am not even sure this is correct, since I do not remember installing code in this machine, but idk) and there is no pytorch for 11.2, so maybe it fallbacks to cpu?
Mhhm, now conda env creation takes forever since it probably resolves conflicts. At least that is what is happening when I tried to manually install my environment
I guess that has nothing to do with the diff version, right ?
conda env update -p .clearml/venvs-builds/3.8 ./environment.yml
with environment.yml
name: clearml
channels:
- pytorch
- anaconda
- conda-forge
- defaults
dependencies:
- pytorch==1.8.0
Uninstall the current clearml-agent and reinstall this wheel, I hacked it to have ==, let's see if that works
Nvm, I took a look at conda history and there I see it
How does clearml-agent create the conda environment?
For now I can tell you that with conda_freeze: true
it fails, but with conda_freeze: false
it works!
I will try again tomorrow. It s getting late! Thank you for helping so far!
==> 2021-03-11 12:50:38 <==
# cmd: /home/tim/miniconda3/condabin/conda create --yes --mkdir --prefix /home/tim/.clearml/venvs-builds/3.8 python=3.8
--
==> 2021-03-11 12:50:40 <==
# cmd: /home/tim/miniconda3/condabin/conda install -p /home/tim/.clearml/venvs-builds/3.8 -c defaults -c conda-forge -c pytorch cudatoolkit=11.0 --quiet --json
--
==> 2021-03-11 12:50:43 <==
# cmd: /home/tim/miniconda3/condabin/conda install -p /home/tim/.clearml/venvs-builds/3.8 -c defaults -c conda-forge -c pytorch pip<20.2 --quiet --json
--
==> 2021-03-11 12:51:17 <==
# cmd: /home/tim/miniconda3/bin/conda-env update -p /home/tim/.clearml/venvs-builds/3.8 --file /tmp/conda_envaz1ne897.yml --quiet --json
Okay. And
110
means 11.1 and not 11.0? (edited)
110 means 11.0, the odd thing is, it actually installed 11.1, and from the pytorch website this is exactly how they suggest to install with conda...
Let me know if forcing the CUDA version changes anything
# Python 3.7.10 (default, Feb 26 2021, 18:47:35) [GCC 7.3.0]
aiostream==0.4.2
attrs==20.3.0
clearml==0.17.4
dm-control==0.0.355168290
dm-env==1.4
furl==2.1.0
future==0.18.2
glfw==2.1.0
gym==0.18.0
humanfriendly==9.1
imageio-ffmpeg==0.4.3
jsonschema==3.2.0
labmaze==1.0.3
lxml==4.6.2
moviepy==1.0.3
orderedmultidict==1.0.1
pathlib2==2.3.5
pillow==7.2.0
proglog==0.1.9
psutil==5.8.0
pybullet==3.0.9
pygame==2.0.1
pyglet==1.5.0
pyjwt==2.0.1
pyrsistent==0.17.3
requests-file==1.5.1
tensorboard==2.4.1
tensorboardx==2.1
# Conda Packages
blas==1.0
bzip2==1.0.8
ca-certificates==2020.10.14
certifi==2020.6.20
cloudpickle==1.6.0
cudatoolkit==11.1.1
cycler==0.10.0
cytoolz==0.11.0
dask-core==2021.2.0
decorator==4.4.2
ffmpeg==4.3
freetype==2.10.4
gmp==6.2.1
gnutls==3.6.13
imageio==2.9.0
jpeg==9b
kiwisolver==1.3.1
lame==3.100
lcms2==2.11
ld_impl_linux-64==2.33.1
libedit==3.1.20191231
libffi==3.3
libgcc-ng==9.3.0
libgfortran-ng==7.3.0
libiconv==1.16
libpng==1.6.37
libstdcxx-ng==9.3.0
libtiff==4.1.0
libuv==1.41.0
llvm-openmp==11.0.1
lz4-c==1.9.3
matplotlib-base==3.3.4
mkl==2020.4
mkl-service==2.3.0
mkl_fft==1.3.0
mkl_random==1.2.0
ncurses==6.2
nettle==3.6
networkx==2.5
ninja==1.10.2
numpy==1.19.2
numpy-base==1.19.2
olefile==0.46
openh264==2.1.1
openssl==1.1.1j
pip==21.0.1
pyparsing==2.4.7
python==3.7.10
python-dateutil==2.8.1
python_abi==3.7
torch==1.8.0
pywavelets==1.1.1
pyyaml==5.3.1
readline==8.1
scikit-image==0.17.2
scipy==1.6.1
setuptools==52.0.0
six==1.15.0
sqlite==3.33.0
tifffile==2020.10.1
tk==8.6.10
toolz==0.11.1
torchaudio==0.8.0
torchvision==0.9.0
tornado==6.1
typing_extensions==3.7.4.3
wheel==0.36.2
xz==5.2.5
yaml==0.2.5
zlib==1.2.11
zstd==1.4.9
Installed miniconda finally, now trying to run the task