Hi. I have a job that processes images and creates ~5 GB of processed image files (lots of small ones). At the end, it creates a ClearML dataset from those files and uploads it.
I tried playing with those parameters on my laptop to no great effect.
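For context, the kind of tuning I mean is along these lines. This is only a sketch: I'm assuming the relevant knobs are the `chunk_size` (MB per uploaded zip artifact) and `max_workers` arguments of `Dataset.upload()`, and the values shown are illustrative, not a recommendation:

```python
# Illustrative tuning attempt: larger chunks, more parallel upload workers.
# chunk_size is in MB; max_workers controls the zip/upload thread count.
dataset.upload(
    chunk_size=512,  # illustrative value
    max_workers=8,   # illustrative value
)
```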
Here is code you can use to reproduce the issue:
```python
import os
from pathlib import Path

from clearml import Dataset, Task
from tqdm import tqdm


def dataset_upload_test(project_id: str, bucket_name: str):
    # project_id is not used directly below; the bucket name is enough for the upload
    def _random_file(fpath, size_kb):
        # write `size_kb` KB of random (non-compressible) bytes to fpath
        file_size_in_bytes = 1024 * size_kb
        with open(fpath, "wb") as fout:
            fout.write(os.urandom(file_size_in_bytes))

    def random_dataset(dataset_path, num_files, file_size_kb, num_per_part):
        # create num_files small random files, num_per_part files per sub-directory
        dataset_path = Path(dataset_path)
        for i_file in tqdm(range(num_files)):
            fpath = (
                dataset_path / f"{int(i_file / num_per_part):05}" / f"f_{i_file:03}.bin"
            )
            fpath.parent.mkdir(exist_ok=True, parents=True)
            _random_file(fpath, file_size_kb)

    project_name = "lavi_upload_test"
    task_name = "test_upload_01"
    task = Task.init(project_name=project_name, task_name=task_name)

    dataset_path = Path("random_dataset")
    # the next line generates 2 million non-compressible files, ~6 GB in total
    random_dataset(dataset_path, 2_000_000, 3, num_per_part=1000)

    dataset = Dataset.create(
        dataset_name=task_name,
        dataset_project=project_name,
        dataset_version="0.2",
        output_uri="gs://" + bucket_name,
        description="test dataset upload",
        use_current_task=True,
    )
    dataset.add_files(dataset_path)
    dataset.upload()
    dataset.finalize()
    task.close()


dataset_upload_test("<your-gcp-project>", "<your-gcs-bucket-name>")
```