Hello Everyone! I Am New To Clearml And Currently Trying Out Its Capabilities And I Am Having One Issue With Pipelines. Pipeline Runs Properly When I Run It Locally Or If I Clone The Project And Then Run It Remotely By En-Queuing It In A Queue But If I Di

Answered

Hello everyone! I am new to ClearML and currently trying out its capabilities, and I am having an issue with pipelines. The pipeline runs properly when I run it locally, or when I clone the project and then run it remotely by enqueuing it in a queue, but if I try to run the pipeline directly from the terminal it gets stuck. I am following the ClearML tutorial on YouTube. (I am running a self-hosted ClearML server and an agent in Docker mode.) Any help would be highly appreciated!

In the console I can see that it gets stuck right after this output:

Downloading filelock-3.16.1-py3-none-any.whl (16 kB)
Installing collected packages: distlib, urllib3, six, rpds-py, PyYAML, pyparsing, pyjwt, psutil, platformdirs, idna, filelock, charset-normalizer, certifi, attrs, virtualenv, requests, referencing, python-dateutil, pathlib2, orderedmultidict, jsonschema-specifications, furl, jsonschema, clearml-agent
Successfully installed PyYAML-6.0.2 attrs-23.2.0 certifi-2024.8.30 charset-normalizer-3.4.0 clearml-agent-1.9.2 distlib-0.3.9 filelock-3.16.1 furl-2.1.3 idna-3.10 jsonschema-4.23.0 jsonschema-specifications-2024.10.1 orderedmultidict-1.0.1 pathlib2-2.3.7.post1 platformdirs-4.3.6 psutil-5.9.8 pyjwt-2.8.0 pyparsing-3.1.4 python-dateutil-2.8.2 referencing-0.35.1 requests-2.31.0 rpds-py-0.21.0 six-1.16.0 urllib3-1.26.20 virtualenv-20.27.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: None

The pipeline file I used:

from clearml import TaskTypes
from clearml.automation.controller import PipelineDecorator


@PipelineDecorator.component(
    return_values=['X_train', 'y_train', 'X_test', 'y_test'],
    task_type=TaskTypes.data_processing,
)
def prepare_data(dataset_name):
    """Fetch the named ClearML dataset and split it into features and labels.

    Reads ``fashion-mnist_test.csv`` and ``fashion-mnist_train.csv`` from the
    local copy of the dataset. In each CSV the first column is used as the
    label vector and all remaining columns as the feature matrix.

    Args:
        dataset_name: Name of the ClearML dataset; also passed as its alias.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.
    """
    # Imports are deliberately local to the function body (presumably so the
    # component stays self-contained when it is run as its own task).
    from clearml import Dataset
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np

    # Resolve the dataset and make sure its files are available locally.
    dataset = Dataset.get(dataset_name=dataset_name, alias=dataset_name)
    local_dir = dataset.get_local_copy()

    test_frame = pd.read_csv(f"{local_dir}/fashion-mnist_test.csv")
    train_frame = pd.read_csv(f"{local_dir}/fashion-mnist_train.csv")

    def to_features_and_labels(frame):
        # Column 0 holds the label, every other column is a feature.
        features = np.array(frame.iloc[:, 1:])
        labels = np.array(frame.iloc[:, 0])
        return features, labels

    X_train, y_train = to_features_and_labels(train_frame)
    X_test, y_test = to_features_and_labels(test_frame)

    # Sanity check: render the second training row as a 28x28 image.
    sample_image = X_train[1].reshape((28, 28))
    plt.imshow(sample_image)
    plt.title("Sample Image")
    plt.show()

    return X_train, y_train, X_test, y_test


@PipelineDecorator.component(
    return_values=['model'],
    task_type=TaskTypes.training,
)
def train_model(X_train, y_train):
    """Train an XGBoost booster on the given training data.

    The hyperparameter dictionary is connected to the current ClearML task
    before training, the booster is trained for 25 boosting rounds, and the
    result is saved to the file ``best_model`` in the working directory.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The trained booster returned by ``xgb.train``.
    """
    # Imports are deliberately local to the function body.
    import xgboost as xgb
    from clearml import Task

    # Wrap the training data in XGBoost's native container.
    train_matrix = xgb.DMatrix(X_train, label=y_train)

    hyperparams = dict(
        objective="reg:squarederror",
        eval_metric="rmse",
        max_depth=4,  # the maximum depth of each tree
        eta=0.3,  # the training step for each iteration
        gamma=0,
        max_delta_step=1,
        subsample=1,
        sampling_method="uniform",
        seed=42,
    )
    # Register the hyperparameters with the task that is currently running.
    current_task = Task.current_task()
    current_task.connect(hyperparams)

    # The training set doubles as the only evaluation set; per-round
    # evaluation output is silenced with verbose_eval=0.
    watchlist = [(train_matrix, "train")]
    model = xgb.train(
        hyperparams,
        train_matrix,
        num_boost_round=25,
        evals=watchlist,
        verbose_eval=0,
    )

    # Persist the booster to disk.
    model.save_model("best_model")

    return model


@PipelineDecorator.component(
    return_values=['accuracy'],
    cache=True,
    task_type=TaskTypes.qc,
)
def evaluate_model(model, X_test, y_test):
    """Score a trained booster on the test set and return its accuracy.

    Raw predictions are rounded to the nearest integer before being compared
    with the true labels.

    Args:
        model: Trained booster exposing ``predict``.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        The accuracy as computed by ``sklearn.metrics.accuracy_score``.
    """
    # Imports are deliberately local to the function body.
    # NOTE(review): `plt` and `plot_tree` are imported but never used here;
    # they are kept so the set of imported packages stays unchanged.
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score
    import xgboost as xgb
    from xgboost import plot_tree

    # Wrap the test data in XGBoost's native container.
    test_matrix = xgb.DMatrix(X_test, label=y_test)

    # Continuous model outputs are rounded to obtain discrete class labels.
    raw_scores = model.predict(test_matrix)
    predicted_labels = list(map(round, raw_scores))

    true_labels = test_matrix.get_label()
    accuracy = accuracy_score(true_labels, predicted_labels)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy


@PipelineDecorator.pipeline(
    name='Simple Pipeline',
    project='Full Overview',
    version='0.0.5',
    default_queue='default',
    docker='nvidia/cuda:12.4.0-runtime-ubuntu22.04',
)
def run_pipeline(dataset_name):
    """Pipeline controller: prepare data, train a model, then evaluate it.

    Args:
        dataset_name: Name of the ClearML dataset handed to ``prepare_data``.

    Returns:
        The accuracy produced by ``evaluate_model``.
    """
    # Imports are deliberately local to the function body.
    from clearml import Task

    # Step 1: load the dataset and split it into train/test arrays.
    X_train, y_train, X_test, y_test = prepare_data(dataset_name=dataset_name)

    # Step 2: fit an XGBoost model on the training split.
    model = train_model(X_train, y_train)

    # Step 3: measure accuracy on the test split and report it as a single
    # scalar on the controller task.
    accuracy = evaluate_model(model, X_test, y_test)
    controller_logger = Task.current_task().get_logger()
    controller_logger.report_single_value(name="Accuracy", value=accuracy)

    # Using the value here blocks until the final step has completed
    # successfully.
    print(accuracy)

    return accuracy



if __name__ == "__main__":
    # Script entry point: launch the pipeline on the "Fashion MNIST" dataset.
    # Local-execution switch, currently disabled. NOTE(review): presumably
    # enabling it runs the controller and its steps on this machine instead
    # of enqueuing them - confirm against the ClearML documentation.
    # PipelineDecorator.run_locally()
    run_pipeline(dataset_name="Fashion MNIST")

  				
Posted 
	5 months ago

					More  		
  Report
		
					RipeSeaanemone60
				
					0
					 × 1

Votes Newest

Answers 3

here is the log file

  				
Posted 
	5 months ago

					More  		
  Report
		
					RipeSeaanemone60
				
					0
					 × 1

Hi RipeSeaanemone60 , can you please provide the full log? Is it the pipeline controller that is getting stuck or some step?

  				
Posted 
	5 months ago

					More  		
  Report
		
					CostlyOstrich36
				
					0

Pipeline controller is getting stuck just after installing python packages

  				
Posted 
	5 months ago

					More  		
  Report
		
					RipeSeaanemone60
				
					0
					 × 1

Write your answer

541 Views

3 Answers

5 months ago