from importlib.machinery import EXTENSION_SUFFIXES
import catboost
from clearml import Task, Logger, Dataset
import lightgbm as lgb
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
# Registry of supported model back-ends, keyed by the name passed in
# exp_params['model_name'].  Each entry holds the estimator class to
# instantiate and a file extension associated with that model type.
MODELS = dict(
    catboost=dict(
        model_class=catboost.CatBoostClassifier,
        file_extension='cbm',
    ),
    lgbm=dict(
        model_class=lgb.LGBMClassifier,
        file_extension='txt',
    ),
)
class ModelTrainer():
    """Train a classifier from ``MODELS`` and track the run as a ClearML task.

    NOTE(review): ``run`` relies on module-level ``ID_COLS`` / ``TARGET`` and
    on ``self.evaluation_metrics``; none of them is defined in the visible
    part of this file -- confirm they are provided elsewhere.
    """

    def __init__(self):
        # Set by run() once the estimator has been instantiated.
        self.model = None

    @staticmethod
    def train(X_train, y_train, X_val, y_val, model, fit_params):
        """Fit ``model`` on the training split and return it.

        The validation split is handed to ``model.fit`` as a single
        ``(X_val, y_val)`` tuple through ``eval_set``; every entry of
        ``fit_params`` is forwarded as a keyword argument.
        """
        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            **fit_params
        )
        return model

    @staticmethod
    def _select_features(columns, id_cols, target):
        """Return the entries of ``columns`` that are neither ids nor target.

        Input order is preserved.  The exclusion set is built once instead
        of concatenating lists for every column.
        """
        excluded = set(id_cols) | {target}
        return [c for c in columns if c not in excluded]

    def load_dataset(
        self,
        dataset_name,
        features,
        id_cols=None,
        target='target',
        dataset_project=''
    ):
        """Fetch a ClearML dataset and split it into features / target / ids.

        The dataset's local copy is read as parquet through dask and
        materialized with ``.compute()``.

        Returns:
            ``(X, y, ids)`` when ``id_cols`` is truthy, otherwise ``(X, y)``.
            Callers that unpack three values must therefore pass a
            non-empty ``id_cols``.
        """
        dataset_path = Dataset.get(
            dataset_name=dataset_name, dataset_project=dataset_project
        ).get_local_copy()
        dataset = dd.read_parquet(dataset_path).compute()
        if id_cols:
            return dataset[features], dataset[target], dataset[id_cols]
        return dataset[features], dataset[target]

    def run(
        self, exp_params,
        model_params, fit_params, reference_task_params, dataset_params
    ):
        """Run one tracked training session.

        Args:
            exp_params: needs ``'model_name'`` (a key of ``MODELS``) and
                ``'identifier'`` (used in the task name and log line).
            model_params: keyword arguments for the estimator constructor.
            fit_params: keyword arguments forwarded to ``model.fit``.
            reference_task_params: ``'name'`` of a task in project
                ``'RecSys'`` and ``'features_articafact_name'`` (sic -- the
                key is spelled this way by callers), the artifact holding
                the column list.
            dataset_params: needs ``'train_name'``, ``'validation_name'``
                and ``'project_name'``.

        The fitted estimator is left on ``self.model``; nothing is returned.
        """
        model_name = exp_params['model_name']
        exp_identifier = exp_params['identifier']
        print(f"starting session - {exp_identifier}")

        # initialize ClearML task
        task = Task.init(
            project_name="RecSys",
            task_name=f"model_training - {exp_identifier}",
            output_uri=True
        )
        # Close the task even if loading/training/evaluation raises, so a
        # failed run does not leave the task open.
        try:
            task.add_tags(['template'])
            task.connect(fit_params, 'fit_params')
            task.connect(model_params, 'model_params')
            task.connect(exp_params, 'exp_params')
            task.connect(reference_task_params, 'reference_task_params')
            task.connect(dataset_params, 'dataset_params')

            self.model = MODELS[model_name]['model_class'](**model_params)

            reference_task = Task.get_task(
                project_name='RecSys',
                task_name=reference_task_params['name']
            )
            columns = reference_task.artifacts[
                reference_task_params['features_articafact_name']
            ].get()

            # NOTE(review): ID_COLS and TARGET are not defined in the
            # visible source -- presumably module-level constants; confirm.
            features = self._select_features(columns, ID_COLS, TARGET)
            task.upload_artifact('features', features)
            task.upload_artifact('ID_COLS', ID_COLS)
            task.upload_artifact('target', TARGET)
            print('number of features to load: ', len(features), '\n features: ')
            print(features)

            print("loading train data")
            X_train, y_train, id_cols_train = self.load_dataset(
                dataset_name=dataset_params['train_name'],
                features=features,
                id_cols=ID_COLS,
                dataset_project=dataset_params['project_name'],
            )

            print("loading validation data")
            # The validation id columns are not used below.
            X_val, y_val, _ = self.load_dataset(
                dataset_name=dataset_params['validation_name'],
                features=features,
                id_cols=ID_COLS,
                dataset_project=dataset_params['project_name'],
            )

            logger = task.get_logger()

            print('training model')
            model = self.train(
                X_train, y_train, X_val, y_val, self.model, fit_params
            )

            print('evaluating model')
            # Column 1 of predict_proba is taken as the score, exactly as
            # the per-row ``p[1]`` lookup did.
            train_scores = np.asarray(model.predict_proba(X_train))[:, 1]
            # NOTE(review): evaluation_metrics is not defined in the visible
            # part of this class -- confirm it exists before running.
            self.evaluation_metrics(
                y_train,
                train_scores,
                "train", logger, ids_idx=id_cols_train
            )
        finally:
            task.close()
# Script entry point: run one CatBoost training session with the
# hard-coded configuration below.  The guard uses the dunder names
# (__name__ / '__main__'); the bare `name == 'main'` form raised NameError.
if __name__ == '__main__':
    rfs = ModelTrainer()

    # Keyword arguments for the estimator constructor.
    model_params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "class_weights": {0: 1, 1: 60},
        "learning_rate": 0.1
    }
    # Keyword arguments forwarded to model.fit().
    fit_params = {
        "early_stopping_rounds": 20,
        "plot": True
    }
    # ClearML task (and artifact on it) that holds the list of columns.
    reference_task_params = {
        'name': 'upload_features',
        'features_articafact_name': 'features_list'
    }
    # ClearML dataset names; 'test_name' is not read by ModelTrainer.run
    # in the visible source.
    dataset_params = {
        'train_name': 'classifier train',
        'validation_name': '_classifier validation',
        'test_name': 'classifier test',
        'project_name': '',
    }
    experiment_params = {
        'model_name': 'catboost',
        'identifier': 'catboost_remote_v0'
    }

    rfs.run(
        experiment_params,
        model_params, fit_params,
        reference_task_params, dataset_params
    )