ClearML FAQ | Hi. I Have A Job That Processes Images And Creates ~5 Gb Of Processed Image Files (Lots Of Small Ones). At The End

Unanswered

Hi. I Have A Job That Processes Images And Creates ~5 Gb Of Processed Image Files (Lots Of Small Ones). At The End - It Creates A

I tried playing with those parameters on my laptop to no great effect.

Here is code you can use to reproduce the issue:

` import os
from pathlib import Path
from tqdm import tqdm
from clearml import Dataset, Task

def dataset_upload_test(project_id:str, bucket_name:str
):
def _random_file(fpath, sizekb):
fileSizeInBytes = 1024 * sizekb
with open(fpath, "wb") as fout:
fout.write(os.urandom(fileSizeInBytes))

def random_dataset(dataset_path, num_files, file_size_kb, num_per_part):
    dataset_path = Path(dataset_path)
    for i_file in tqdm(range(num_files)):
        fpath = (
            dataset_path / f"{int(i_file/num_per_part):05}" / f"f_{i_file:03}.bin"
        )
        fpath.parent.mkdir(exist_ok=True, parents=True)
        _random_file(fpath, file_size_kb)

project_name = "lavi_upload_test"
task_name = "test_upload_01"
task = Task.init(project_name=project_name, task_name=task_name)

dataset_path = Path("random_dataset")
# the next line will generate (2 million non-compressible files with total size ~7.7GB)
random_dataset(dataset_path, 2_000_000, 3, num_per_part=1000)
dataset = Dataset.create(
    dataset_name=task_name,
    dataset_project=project_name,
    dataset_version="0.2",
    output_uri="gs://" + bucket_name,
    description="test dataset upload",
    use_current_task=True,
)
dataset.add_files(dataset_path)
dataset.upload()
dataset.finalize()
task.close()

dataset_upload_test("<your-gcp-project>", "<your-gcs-bucket-name>") `

  				
Posted 
	2 years ago

					More
				  		
  Report
		
					PanickyMoth78
				
					0
					 × 1

297 Views

0 Answers

2 years ago