Hi Guys, I'M In The Process Of Setting Up A Clearml Server For Experiment Tracking. I Have The Server Hosted In A Virtual Linux Machine On Azure And Run Experiments From Some Local Compute. Our Training Environment Is Pytorch Lightning And I Have Written

Unanswered

Sure. Really, I'm just using the default client:
# ClearML SDK configuration file
api {
web_server: http://server.azure.com:8080
api_server: http://server.azure.com:8008
files_server: http://server.azure.com:8081
credentials {
"access_key" = "..."
"secret_key" = "..."
}

}
sdk {
# ClearML - default SDK configuration

storage {
    cache {
        # Defaults to system temp folder / cache
        default_base_dir: "~/.clearml/cache"
    }

    direct_access: [
        # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
        # or cached, and any download request will return a direct reference.
        # Objects are specified in glob format, available for url and content_type.
        { url: "file://*" }  # file-urls are always directly referenced
    ]
}

metrics {
    # History size for debug files per metric/variant. For each metric/variant combination with an attached file
    # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
    # X files are stored in the upload destination for each metric/variant combination.
    file_history_size: 100

    # Max history size for matplotlib imshow files per plot title.
    # File names for the uploaded images will be recycled in such a way that no more than
    # X images are stored in the upload destination for each matplotlib plot title.
    matplotlib_untitled_history_size: 100

    # Limit the number of digits after the dot in plot reporting (reducing plot report size)
    # plot_max_num_digits: 5

    # Settings for generated debug images
    images {
        format: JPEG
        quality: 87
        subsampling: 0
    }

    # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
    tensorboard_single_series_per_graph: false
}

network {
    metrics {
        # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
        # a specific iteration
        file_upload_threads: 4

        # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
        # being sent for upload
        file_upload_starvation_warning_sec: 120
    }

    iteration {
        # Max number of retries when getting frames if the server returned an error (http code 500)
        max_retries_on_server_error: 5
        # Backoff factory for consecutive retry attempts.
        # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
        retry_backoff_factor_sec: 10
    }
}
aws {
    s3 {
        # S3 credentials, used for read/write access by various SDK elements

        # Default, used for any bucket not specified below
        region: ""
        # Specify explicit keys
        key: ""
        secret: ""
        # Or enable credentials chain to let Boto3 pick the right credentials.
        # This includes picking credentials from environment variables,
        # credential file and IAM role using metadata service.
        # Refer to the latest Boto3 docs
        use_credentials_chain: false

        credentials: [
            # specifies key/secret credentials to use when handling s3 urls (read or write)
            # {
            #     bucket: "my-bucket-name"
            #     key: "my-access-key"
            #     secret: "my-secret-key"
            # },
            # {
            #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
            #     host: "my-minio-host:9000"
            #     key: "12345678"
            #     secret: "12345678"
            #     multipart: false
            #     secure: false
            # }
        ]
    }
    boto3 {
        pool_connections: 512
        max_multipart_concurrency: 16
    }
}
google.storage {
    # # Default project and credentials file
    # # Will be used when no bucket configuration is found
    # project: "clearml"
    # credentials_json: "/path/to/credentials.json"
    # pool_connections: 512
    # pool_maxsize: 1024

    # # Specific credentials per bucket and sub directory
    # credentials = [
    #     {
    #         bucket: "my-bucket"
    #         subdir: "path/in/bucket" # Not required
    #         project: "clearml"
    #         credentials_json: "/path/to/credentials.json"
    #     },
    # ]
}
azure.storage {
    # containers: [
    #     {
    #         account_name: "clearml"
    #         account_key: "secret"
    #         # container_name:
    #     }
    # ]
}

log {
    # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
    null_log_propagate: false
    task_log_buffer_capacity: 66

    # disable urllib info and lower levels
    disable_urllib3_info: true
}

development {
    # Development-mode options

    # dev task reuse window
    task_reuse_time_window_in_hours: 72.0

    # Run VCS repository detection asynchronously
    vcs_repo_detect_async: true

    # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
    # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
    store_uncommitted_code_diff: true

    # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
    support_stopping: true

    # Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
    default_output_uri: ""

    # Default auto generated requirements optimize for smaller requirements
    # If True, analyze the entire repository regardless of the entry point.
    # If False, first analyze the entry point script, if it does not contain other to local files,
    # do not analyze the entire repository.
    force_analyze_entire_repo: false

    # If set to true, *clearml* update message will not be printed to the console
    # this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
    suppress_update_message: false

    # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
    detect_with_pip_freeze: false

    # Log specific environment variables. OS environments are listed in the "Environment" section
    # of the Hyper-Parameters.
    # multiple selected variables are supported including the suffix '*'.
    # For example: "AWS_*" will log any OS environment variable starting with 'AWS_'.
    # This value can be overwritten with os environment variable CLEARML_LOG_ENVIRONMENT="[AWS_*, CUDA_VERSION]"
    # Example: log_os_environments: ["AWS_*", "CUDA_VERSION"]
    log_os_environments: []

    # Development mode worker
    worker {
        # Status report period in seconds
        report_period_sec: 2

        # ping to the server - check connectivity
        ping_period_sec: 30

        # Log all stdout & stderr
        log_stdout: true

        # Carriage return (\r) support. If zero (0) \r treated as \n and flushed to backend
        # Carriage return flush support in seconds, flush consecutive line feeds (\r) every X (default: 10) seconds
        console_cr_flush_period: 10

        # compatibility feature, report memory usage for the entire machine
        # default (false), report only on the running process and its sub-processes
        report_global_mem_used: false
    }
}

# Apply top-level environment section from configuration into os.environ
apply_environment: false
# Top-level environment section is in the form of:
#   environment {
#     key: value
#     ...
#   }
# and is applied to the OS environment as `key=value` for each key/value pair

# Apply top-level files section from configuration into local file system
apply_files: false
# Top-level files section allows auto-generating files at designated paths with a predefined contents
# and target format. Options include:
#  contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
#  format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
#          base64-encoded contents string, otherwise ignored
#  path: the target file's path, may include ~ and inplace env vars
#  target_format: format used to encode contents before writing into the target file. Supported values are json,
#                 yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
#  overwrite: overwrite the target file in case it exists. Default is true.
#
# Example:
#   files {
#     myfile1 {
#       contents: "The quick brown fox jumped over the lazy dog"
#       path: "/tmp/fox.txt"
#     }
#     myjsonfile {
#       contents: {
#         some {
#           nested {
#             value: [1, 2, 3, 4]
#           }
#         }
#       }
#       path: "/tmp/test.json"
#       target_format: json
#     }
#   }

}

  				
Posted 
	3 years ago

					More
				  		
  Report
		
					GiganticMole91
				
					0
					 × 1

344 Views

0 Answers

3 years ago

2 years ago