` sdk {
# TRAINS - default SDK configuration
storage {
cache {
# Defaults to system temp folder / cache
default_base_dir: "~/.trains/cache"
}
direct_access: [
# Objects matching are considered to be available for direct access, i.e. they will not be downloaded
# or cached, and any download request will return a direct reference.
# Objects are specified in glob format, available for url and content_type.
{ url: "file://*" } # file-urls are always directly referenced
]
}
metrics {
# History size for debug files per metric/variant. For each metric/variant combination with an attached file
# (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
# X files are stored in the upload destination for each metric/variant combination.
file_history_size: 100
# Max history size for matplotlib imshow files per plot title.
# File names for the uploaded images will be recycled in such a way that no more than
# X images are stored in the upload destination for each matplotlib plot title.
matplotlib_untitled_history_size: 100
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
# plot_max_num_digits: 5
# Settings for generated debug images
images {
format: JPEG
quality: 87
subsampling: 0
}
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
tensorboard_single_series_per_graph: false
}
network {
metrics {
# Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
# a specific iteration
file_upload_threads: 4
# Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
# being sent for upload
file_upload_starvation_warning_sec: 120
}
iteration {
# Max number of retries when getting frames if the server returned an error (http code 500)
max_retries_on_server_error: 5
# Backoff factor for consecutive retry attempts.
# SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
retry_backoff_factor_sec: 10
}
}
aws {
s3 {
# S3 credentials, used for read/write access by various SDK elements
# default, used for any bucket not specified below
key: ""
secret: ""
region: ""
credentials: [
# specifies key/secret credentials to use when handling s3 urls (read or write)
# {
# bucket: "my-bucket-name"
# key: "my-access-key"
# secret: "my-secret-key"
# },
# {
# # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
# host: "my-minio-host:9000"
# key: "12345678"
# secret: "12345678"
# multipart: false
# secure: false
# }
]
}
boto3 {
pool_connections: 512
max_multipart_concurrency: 16
}
}
google.storage {
# # Default project and credentials file
# # Will be used when no bucket configuration is found
# project: "trains"
# credentials_json: "/path/to/credentials.json"
# # Specific credentials per bucket and sub directory
# credentials = [
# {
# bucket: "my-bucket"
# subdir: "path/in/bucket" # Not required
# project: "trains"
# credentials_json: "/path/to/credentials.json"
# },
# ]
}
azure.storage {
# containers: [
# {
# account_name: "trains"
# account_key: "secret"
# # container_name:
# }
# ]
}
log {
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
null_log_propagate: false
task_log_buffer_capacity: 66
# disable urllib info and lower levels
disable_urllib3_info: true
}
development {
# Development-mode options
# dev task reuse window
task_reuse_time_window_in_hours: 72.0
# Run VCS repository detection asynchronously
vcs_repo_detect_async: true
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
store_uncommitted_code_diff: true
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
support_stopping: true
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
default_output_uri: ""
# Default auto generated requirements optimize for smaller requirements
# If True, analyze the entire repository regardless of the entry point.
# If False, first analyze the entry point script; if it does not contain references to other local files,
# do not analyze the entire repository.
force_analyze_entire_repo: false
# If set to true, *trains* update message will not be printed to the console
# this value can be overwritten with os environment variable TRAINS_SUPPRESS_UPDATE_MESSAGE=1
suppress_update_message: false
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
detect_with_pip_freeze: false
# Log specific environment variables. OS environment variables are listed in the "Environment" section
# of the Hyper-Parameters.
# Multiple variables can be selected, including patterns ending with the wildcard suffix '*'.
# For example: "AWS_*" will log any OS environment variable starting with 'AWS_'.
# This value can be overwritten with os environment variable TRAINS_LOG_ENVIRONMENT="[AWS_*, CUDA_VERSION]"
# Example: log_os_environments: ["AWS_*", "CUDA_VERSION"]
log_os_environments: []
# Development mode worker
worker {
# Status report period in seconds
report_period_sec: 2
# ping to the server - check connectivity
ping_period_sec: 30
# Log all stdout & stderr
log_stdout: true
# compatibility feature, report memory usage for the entire machine
# default (false), report only on the running process and its sub-processes
report_global_mem_used: false
}
}
} `
Hi WickedGoat98
"Failed uploading to //:8081/files_server:"
Seems like the problem. what do you have defined as files_server in the trains.conf
api {
# Notice: 'host' is the api server (default port 8008), not the web server.
api_server: http://vmd63828.contaboserver.net:30008
web_server: http://vmd63828.contaboserver.net:30080
files_server: http://vmd63828.contaboserver.net:30081
..}
The one I sent you the snippet of, i.e. the api {} config?
WickedGoat98 Actually the fileserver replied, so it all looks fine to me.
Try to run the text example again, see if you are still getting the fileserver error .
the server name is correct, I have been able to upload the example ...
I have to leave i'll be back online in a couple of hours.
Meanwhile see if the ports are correct (just curl to all ports see if you get an answer) if everything is okay, try again to run the text example
also the webserver pods log contains entries
So why is it trying to upload to "//:8081/files_server:" ?
What do you have in the trains.conf on the machine running the experiment ?
api_server and web_server look ok: (py38) wgo@NVidia-power:~/dev/Trains/trains$ curl
{"meta":{"id":"bb5cd73435fb4127b9509ce3a771e95b","trx":"bb5cd73435fb4127b9509ce3a771e95b","endpoint":{"name":"","requested_version":1.0,"actual_version":null},"result_code":400,"result_spath /","error_stack":null},"data":{}}(py38) wgo@NVidia-power:~/dev/Trains/trains$ curl
`
<!doctype html>
<html lang="en">
<head> <meta charset="utf-8"> <title>trains</title> <base href="/"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <link rel="icon" type="image/x-icon" href="favicon.ico?v=5"> <script> if (global === undefined) { var global = window; } </script> <meta name="theme-color" content="#1976d2"> </head> <body class="dark-theme"> <sm-root></sm-root> <noscript>Please enable JavaScript to continue using this application.</noscript> <script src="runtime-es2015.d081064e058ab9d4530f.js" crossorigin="use-credentials" type="module"></script><script src="runtime-es5.d081064e058ab9d4530f.js" crossorigin="use-credentials" yfills-es5.dd6fc9cc359ba3460100.js" crossorigin="use-credentials" nomodule defer></script><script src="polyfills-es2015.56361711d9390b84d552.js" crossorigin="use-credentials" type="modula0ebecfcb437fdb1c02.js" crossorigin="use-credentials" type="module"></script><script src="styles-es5.2a0ebecfcb437fdb1c02.js" crossorigin="use-credentials" nomodule defer></script><scrip9.js" crossorigin="use-credentials" type="module"></script><script src="vendor-es5.7dac968ddaeb1621c029.js" crossorigin="use-credentials" nomodule defer></script><script src="main-es2015se-credentials" type="module"></script><script src="main-es5.a828262587d45134a613.js" crossorigin="use-credentials" nomodule defer></script></body> <footer> <script crossorigin="use-credentials" src="app/webapp-common/assets/plotly-1.52.2.min.js"></script> </footer> </html> `
Seems like everything is in order. Can you curl to the API/web/files server?
AgitatedDove14 I don't know why, but now it worksrunfile('/home/wgo/dev/Trains/trains/examples/reporting/text_reporting.py', wdir='/home/wgo/dev/Trains/trains/examples/reporting') TRAINS Task: overwriting (reusing) task id=b31459aa2d414ea7b5aaa8c467ee6ad3 This is standard error test 2020-12-12 11:51:44.841 | INFO | __main__:report_logs:26 - That's it, beautiful and simple logging! (using ANSI colors) TRAINS results page:
reporting text logs This is standard output test hello, this is plain text We are done reporting, have a great day :) TRAINS new version available: upgrade to v0.16.4 is recommended!
thanks for your support
I'm quite new to Kubernetes. What I have found is that the ports I expected, are usedroot@vmd62521:~# kubectl get services -n trains NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mongo-service ClusterIP 10.43.99.44 <none> 27017/TCP 25h webserver-service NodePort 10.43.49.21 <none> 80:30080/TCP 25h redis ClusterIP 10.43.62.222 <none> 6379/TCP 25h elasticsearch-service ClusterIP 10.43.195.218 <none> 9200/TCP 25h apiserver-service NodePort 10.43.195.121 <none> 8008:30008/TCP 25h fileserver-service NodePort 10.43.141.119 <none> 8081:30081/TCP 25h root@vmd62521:~#
redis, mongo and elasticsearch looks also ok
file_server does not: (py38) wgo@NVidia-power:~/dev/Trains/trains$ curl
`
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>405 Method Not Allowed</title> <h1>Method Not Allowed</h1> <p>The method is not allowed for the requested URL.</p> `
or do you mean the machine I ran the experiment locally?
Yes this one
or do you mean the machine I ran the experiment locally?
And the agent section on this machine is:api_server:
web_server:
files_server:
Is that correct?
root@vmd62521:~# kubectl get pods -n trains -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES agentservices-56655788b6-rnbk4 1/1 Running 1 25h 10.42.0.26 vmd63828.contaboserver.net <none> <none> mongo-76c4699cc-jgtzk 1/1 Running 0 25h 10.42.0.31 vmd63828.contaboserver.net <none> <none> webserver-c48b45b66-9tf6s 1/1 Running 0 25h 10.42.0.27 vmd63828.contaboserver.net <none> <none> apiserver-7d9cd59844-dfd5s 1/1 Running 0 25h 10.42.0.30 vmd63828.contaboserver.net <none> <none> elasticsearch-5c4f9c986b-g7pgd 1/1 Running 0 25h 10.42.0.29 vmd63828.contaboserver.net <none> <none> fileserver-6f49b74556-2m4n2 1/1 Running 0 25h 10.42.0.28 vmd63828.contaboserver.net <none> <none> redis-94f568467-bc2m7 1/1 Running 0 25h 10.42.0.32 vmd63828.contaboserver.net <none> <none> root@vmd62521:~#
now I will check for the exposed ports ...
the log of the fileserver pod seems quite empty
` root@vmd62521:~# kubectl logs fileserver-6f49b74556-2m4n2 -n trains --all-containers
- Serving Flask app "fileserver" (lazy loading)
- Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead. - Debug mode: off
root@vmd62521:~#
The same goes for the agentservices pod:
root@vmd62521:~# kubectl logs agentservices-56655788b6-rnbk4 apiserver-7d9cd59844-dfd5s -n trains --all-containers
error: --all-containers=true should not be specified with container name apiserver-7d9cd59844-dfd5s
root@vmd62521:~# kubectl logs agentservices-56655788b6-rnbk4 -n trains --all-containers
WARNING: You are using pip version 20.1.1; however, version 20.3.1 is available.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.
Failed creating temporary copy of ~/.ssh for git credential
root@vmd62521:~# `
to be honest, I don't know if I will find it as it is a Kubernetes cluster (ok only 2 nodes) and might be installed to somewhere ...
I will check whether I can find any trains configs on the systems, but they should be the defaults coming with the Helm installer
# TRAINS SDK configuration file api { # Notice: 'host' is the api server (default port 8008), not the web server. api_server:
web_server:
files_server:
`
# Credentials are generated using the webapp, /profile
# Override with os environment: TRAINS_API_ACCESS_KEY / TRAINS_API_SECRET_KEY
credentials {....}
}
sdk {
# TRAINS - default SDK configuration
`