Simplified a little and with the private parameters removed, but that's pretty much the code. We did not try with toy examples, since that was already covered by the example pipelines when we implemented this, and the model training itself is quite basic there already (only a few hyperparameters set).
That would make sense, although ClearML, at least in the UI, shows the deeper level of the nested dict as an int, as one would expect.
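A quick way to check what the params actually look like at runtime rather than relying on the UI (a minimal, self-contained sketch; the task name here is just for illustration):
```python
from clearml import Task

task = Task.init(project_name="RecSys", task_name="check_connected_params")
model_params = {"class_weights": {0: 1, 1: 60}}

# connect() returns the connected dict (and updates model_params in place);
# print the runtime key types instead of trusting the UI rendering
connected = task.connect(model_params, 'model_params')
print([(k, type(k).__name__) for k in connected["class_weights"]])
# locally this shows int keys; when the task is executed by the agent,
# the keys may come back as strings
```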
We also disabled auto_connect_frameworks for CatBoost, but still the same thing.
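Roughly like this (a sketch, not the exact call from our private code; assumes this clearml version accepts the dict form of auto_connect_frameworks with a 'catboost' key):
```python
from clearml import Task

# Sketch of how the CatBoost auto-binding was disabled (assumption: this clearml
# version supports a 'catboost' entry in the auto_connect_frameworks dict)
task = Task.init(
    project_name="RecSys",
    task_name="model_training - catboost_remote_v0",
    output_uri=True,
    auto_connect_frameworks={"catboost": False},
)
```
The full (simplified) script we run is below.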
```python
from clearml import Task, Logger, Dataset
import catboost
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

MODELS = {
    'catboost': {
        'model_class': catboost.CatBoostClassifier,
        'file_extension': 'cbm'
    },
    'lgbm': {
        'model_class': lgb.LGBMClassifier,
        'file_extension': 'txt'
    }
}

# Placeholder values for the private parameters removed from the shared snippet
ID_COLS = ['id']
TARGET = 'target'


class ModelTrainer:

    def __init__(self):
        pass

    @staticmethod
    def train(X_train, y_train, X_val, y_val, model, fit_params):
        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            **fit_params
        )
        return model

    def load_dataset(
        self,
        dataset_name,
        features,
        id_cols=None,
        target='target',
        dataset_project=''
    ):
        # fetch a local copy of the ClearML dataset and load the parquet files with dask
        dataset_path = Dataset.get(
            dataset_name=dataset_name, dataset_project=dataset_project
        ).get_local_copy()
        dataset = dd.read_parquet(dataset_path).compute()
        if id_cols:
            return dataset[features], dataset[target], dataset[id_cols]
        else:
            return dataset[features], dataset[target]

    def evaluation_metrics(self, y_true, y_pred, dataset_label, logger, ids_idx=None):
        # placeholder: the private evaluation/reporting code was removed from the snippet
        pass

    def run(
        self, exp_params,
        model_params, fit_params, reference_task_params, dataset_params
    ):
        model_name = exp_params['model_name']
        exp_identifier = exp_params['identifier']
        print(f"starting session - {exp_identifier}")

        # initialize ClearML task
        task = Task.init(
            project_name="RecSys",
            task_name=f"model_training - {exp_identifier}",
            output_uri=True
        )
        task.add_tags(['template'])
        task.connect(fit_params, 'fit_params')
        task.connect(model_params, 'model_params')
        task.connect(exp_params, 'exp_params')
        task.connect(reference_task_params, 'reference_task_params')
        task.connect(dataset_params, 'dataset_params')

        self.model = MODELS[model_name]['model_class'](**model_params)

        # fetch the feature list from the reference task's artifact
        reference_task = Task.get_task(
            project_name='RecSys',
            task_name=reference_task_params['name']
        )
        columns = reference_task.artifacts[
            reference_task_params['features_artifact_name']
        ].get()
        features = [c for c in columns if c not in ID_COLS + [TARGET]]
        task.upload_artifact('features', features)
        task.upload_artifact('ID_COLS', ID_COLS)
        task.upload_artifact('target', TARGET)
        print('number of features to load: ', len(features), '\n features: ')
        print(features)

        print("loading train data")
        # load train data
        X_train, y_train, id_cols_train = self.load_dataset(
            dataset_name=dataset_params['train_name'],
            features=features,
            id_cols=ID_COLS,
            dataset_project=dataset_params['project_name'],
        )

        print("loading validation data")
        # load validation data
        X_val, y_val, id_cols_val = self.load_dataset(
            dataset_name=dataset_params['validation_name'],
            features=features,
            id_cols=ID_COLS,
            dataset_project=dataset_params['project_name'],
        )

        logger = task.get_logger()

        print('training model')
        # train model
        model = self.model
        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            **fit_params
        )

        print('evaluating model')
        # evaluate model with train data
        self.evaluation_metrics(
            y_train,
            np.array([p[1] for p in model.predict_proba(X_train)]),
            "train", logger, ids_idx=id_cols_train
        )
        task.close()


if __name__ == '__main__':
    rfs = ModelTrainer()
    model_params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "class_weights": {0: 1, 1: 60},
        "learning_rate": 0.1
    }
    fit_params = {
        "early_stopping_rounds": 20,
        "plot": True
    }
    reference_task_params = {
        'name': 'upload_features',
        'features_artifact_name': 'features_list'
    }
    dataset_params = {
        'train_name': 'classifier train',
        'validation_name': '_classifier validation',
        'test_name': 'classifier test',
        'project_name': '',
    }
    experiment_params = {
        'model_name': 'catboost',
        'identifier': 'catboost_remote_v0'
    }
    rfs.run(
        experiment_params,
        model_params, fit_params,
        reference_task_params, dataset_params
    )
```
When we enqueue the task using the web UI we get the above error.
ShallowGoldfish8 I think I understand the issue,
basically I think the issue is: `task.connect(model_params, 'model_params')`
Since this is a nested dict:
```python
model_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "class_weights": {0: 1, 1: 60},
    "learning_rate": 0.1
}
```
The class_weights keys are stored as strings, but CatBoost expects int keys, hence it fails.
One option would be to remove the `task.connect(model_params, 'model_params')` call.
Another hack (until we fix it) would be to do:
```python
task.connect(model_params, 'model_params')
model_params["class_weights"] = {
    0: model_params["class_weights"].get("0", model_params["class_weights"].get(0)),
    1: model_params["class_weights"].get("1", model_params["class_weights"].get(1))
}
```
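Or, a slightly more general version of the same workaround that just casts whatever keys come back to int (a sketch; assumes the class_weights keys are always numeric):
```python
# Generalized workaround (sketch): after task.connect() the nested dict's keys
# may come back as strings, so cast them back to int before building the model.
# Assumes every class_weights key is an int or a numeric string.
task.connect(model_params, 'model_params')
model_params["class_weights"] = {
    int(k): v for k, v in model_params["class_weights"].items()
}
```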
wdyt?
Martin, if you want, feel free to add your answer in the stackoverflow so that I can mark it as a solution.
Will do 🙂 give me 5
oooohhh.. you mean the key of the nested dict, that would make a lot of sense
Hi ShallowGoldfish8, what versions of ClearML & ClearML-Agent are you using?
When we use clearml-session to create a debug session and run the code from JupyterLab (inside the container), the training script runs just fine.
When we enqueue the task using the web UI we get the above error.
UnsightlyHorse88 & ShallowGoldfish8, can you please provide a code snippet to play with?
clearml-agent 1.3.0, ClearML Server 1.6.0, clearml==1.6.2