It took 37 s to access a dataset that should already be in the local cache, since I use the same dataset multiple times. The dataset is only a few MB and is stored in Azure data storage; is this a normal duration?
Here are the kernprof results from putting @profile on Dataset.get (I had to cut the report because of the message size limit):
Total time: 37.5027 s
File: /home/francisco/miniconda3/envs/py310/lib/python3.10/site-packages/clearml/datasets/dataset.py
Function: get at line 1559
Line # Hits Time Per Hit % Time Line Contents
1684
1685 4 11.4 2.9 0.0 if not dataset_id:
1686 4 2315410.9 578852.7 6.2 dataset_id, _ = cls.get_dataset_id(
1687 4 3.1 0.8 0.0 dataset_project=dataset_project,
1688 4 3.4 0.8 0.0 dataset_name=dataset_name,
1689 4 3.4 0.8 0.0 dataset_version=dataset_version,
1690 4 61.0 15.2 0.0 dataset_filter=dict(
1691 4 3.6 0.9 0.0 tags=dataset_tags,
1692 4 2.1 0.5 0.0 system_tags=system_tags,
1693 4 149.4 37.4 0.0 type=[str(Task.TaskTypes.data_processing)],
1694 4 4.2 1.1 0.0 status=["published"]
1695 4 3.3 0.8 0.0 if only_published
1696 4 3.7 0.9 0.0 else ["published", "completed", "closed"]
1697 4 2.2 0.6 0.0 if only_completed
1698 4 2.2 0.6 0.0 else None,
1699 ),
1700 4 2.6 0.7 0.0 shallow_search=shallow_search
1701 )
1702 4 6.8 1.7 0.0 if not dataset_id and not auto_create:
1703 raise ValueError(
1704 "Could not find Dataset {} {}".format(
1705 "id" if dataset_id else "project/name/version",
1706 dataset_id if dataset_id else (dataset_project, dataset_name, dataset_version),
1707 )
1708 )
1709 4 2.0 0.5 0.0 orig_dataset_id = dataset_id
1710
1711 4 21.6 5.4 0.0 if alias and overridable and running_remotely():
1712 remote_task = Task.get_task(task_id=get_remote_task_id())
1713 dataset_id = remote_task.get_parameter("{}/{}".format(cls.__hyperparams_section, alias))
1714
1715 4 3.4 0.8 0.0 if not dataset_id:
1716 if not auto_create:
1717 raise ValueError(
1718 "Could not find Dataset {} {}".format(
1719 "id" if dataset_id else "project/name/version",
1720 dataset_id if dataset_id else (dataset_project, dataset_name, dataset_version),
1721 )
1722 )
1723 instance = Dataset.create(
1724 dataset_name=dataset_name, dataset_project=dataset_project, dataset_tags=dataset_tags
1725 )
1726 return finish_dataset_get(instance, instance._id)
1727 4 12669482.4 3167370.6 33.8 instance = get_instance(dataset_id)
1728 # Now we have the requested dataset, but if we want a mutable copy instead, we create a new dataset with the
1729 # current one as its parent. So one can add files to it and finalize as a new version.
1730 4 3.5 0.9 0.0 if writable_copy:
1731 writeable_instance = Dataset.create(
1732 dataset_name=instance.name,
1733 dataset_project=instance.project,
1734 dataset_tags=instance.tags,
1735 parent_datasets=[instance.id],
1736 )
1737 return finish_dataset_get(writeable_instance, writeable_instance.id)
1738
1739 4 22510379.1 5627594.8 60.0 return finish_dataset_get(instance, orig_dataset_id)
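From the cut report it looks like almost all of the 37 s is spent in get_instance and the final finish_dataset_get call, not in any local cache lookup. For reference, this is roughly how I'm fetching the dataset each time (project and dataset names are placeholders; the @profile decorator is the one I added to Dataset.get in site-packages, and the script is run with kernprof -l -v):

from clearml import Dataset

# fetch the same dataset on every run; expected to be served from the local cache
ds = Dataset.get(
    dataset_project="my_project",   # placeholder
    dataset_name="my_dataset",      # placeholder
)
local_path = ds.get_local_copy()    # path to the locally cached copy
print(local_path)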