Unanswered
Hey guys.
I've uploaded a dataset (let's call it Total), and then I want to make a train/test split of it that refers back to the Total dataset.
My files are stored in S3, so ClearML just points to the external files.
But if I link the train split back to the Total dataset, it inherits all of the parent's items (see below).
Sure,
# Create a new ClearML dataset whose file contents live in S3; ClearML
# stores only links to the external objects, not copies of them.
dataset = Dataset.create(
    dataset_project=cfg.PROJECT_NAME,
    dataset_name=folder_name,
    output_uri=cfg.S3_BUCKET_URI,
)

# Dictionary to store S3 URLs mapped by metadata row index (`id`)
s3_urls = {}

# Iterate through rows: upload each audio file and map `id` -> S3 URL
for idx, row in metadata_df.iterrows():
    audio_path = row.get("stored_filename")  # column may be missing/NaN on some rows
    if not isinstance(audio_path, str) or not audio_path.strip():
        print(f"Skipping row {idx} due to missing 'stored_filename'")
        continue

    # Build the destination with URL-style "/" separators.
    # NOTE: os.path.join must NOT be used for S3 URIs -- on Windows it
    # inserts backslashes, producing an invalid URI.
    remote_path = f"{cfg.S3_BUCKET_URI.rstrip('/')}/{folder_name}/{audio_path}"

    # Upload file to S3, then register the external link with the dataset
    StorageManager.upload_file(audio_path, remote_path)
    print(f"Uploaded {audio_path} to {remote_path}")
    dataset.add_external_files(remote_path)

    # Map S3 URL to the corresponding row index
    s3_urls[idx] = remote_path

# Attach metadata with S3 URLs (rows skipped above map to None)
metadata_df["s3_url"] = metadata_df.index.map(s3_urls.get)
print(metadata_df["s3_url"])
print("-------------")
print(s3_urls)
dataset.set_metadata(metadata_df)
print("Added metadata with S3 URLs")

# Upload the dataset state (link registrations) and freeze it
dataset.upload()
dataset.finalize()
print(f"Uploaded dataset: {dataset.id} to {cfg.S3_BUCKET_URI}")
This creates my Total dataset.
Then, for example, say I want only the files with a bitrate less than 60000:
def filter_dataset(dataset_id, filter_fn, new_dataset_name):
    """
    Create a filtered ClearML dataset based on a user-defined filter function.

    The new dataset only *references* the S3 objects that pass the filter
    (via their stored ``s3_url``); no file data is copied.

    :param dataset_id: The ClearML dataset ID to filter from.
    :param filter_fn: A function applied to each metadata row
        (e.g. ``lambda x: x.bitrate < 8000``); rows for which it returns
        True are kept.
    :param new_dataset_name: The name for the new filtered dataset.
    :return: The finalized filtered ``Dataset``, or ``None`` when no rows
        matched the filter.
    """
    print(f"Loading dataset {dataset_id}...")
    dataset = Dataset.get(dataset_id=dataset_id)

    # Load stored metadata; wrap in a DataFrame in case it round-trips
    # as a plain structure rather than a DataFrame.
    metadata_df = pd.DataFrame(dataset.get_metadata())
    print(f"Metadata loaded: {metadata_df.shape[0]} records found.")

    # Apply the filter function row-wise
    filtered_metadata = metadata_df[metadata_df.apply(filter_fn, axis=1)]
    print(f"Filtered down to {filtered_metadata.shape[0]} records.")
    if filtered_metadata.empty:
        print("No matching records found. Exiting.")
        return None

    # Create a new ClearML dataset referencing the existing S3 files.
    # NOTE: parent_datasets is deliberately NOT set -- a child dataset
    # inherits *all* of its parent's files, which would defeat the
    # filtering. Lineage back to the source is kept via the metadata
    # instead: the `s3_url` column points at the same S3 objects.
    filtered_dataset = Dataset.create(
        dataset_project=cfg.PROJECT_NAME,
        dataset_name=new_dataset_name,
        output_uri=cfg.S3_BUCKET_URI,
    )

    # Attach the filtered metadata slice to the new dataset
    filtered_dataset.set_metadata(filtered_metadata)

    # Register only the links that match the filter criteria
    for idx, call_row in filtered_metadata.iterrows():
        print(idx)
        print(call_row["s3_url"])
        filtered_dataset.add_external_files(call_row["s3_url"])

    # Finalize and upload
    filtered_dataset.upload()
    filtered_dataset.finalize()
    print(f"Filtered dataset created: {filtered_dataset.id}")
    return filtered_dataset
Then I only upload the files to this new dataset if they fulfil the condition.
filtered_dataset = Dataset.create(
dataset_project=cfg.PROJECT_NAME,
dataset_name=new_dataset_name,
output_uri=cfg.S3_BUCKET_URI,
# parent_datasets=[dataset],  # NOTE(review): left disabled on purpose -- a child dataset inherits every item from its parent, which would make this filtered subset identical to the full dataset
)
17 Views
0
Answers
21 days ago
21 days ago