Can you attach the console log? What GPUs are you using? I assume nvidia-smi runs without issue?
I run clearml-agent with the --gpus 0,1 flag.
Also, I don't use docker mode; I use virtual environment mode.
Just to make sure we're on the same page: are you referring to the machine statistics, or do ALL scalars fail to show up?
I see. Can you provide a simple stand alone code snippet that reproduces this behaviour for you?
- And yes - we see only the two monitoring parameters
Hi @<1774969995759980544:profile|SmoggyGoose12> , I think that selecting GPUs works only in docker mode.
Clearml-agent on worker: 1.9.2
Clearml on my computer: 1.16.4
2 gpus - NVIDIA GeForce RTX 3080
I am referring only to the training statistics in the Scalars tab. I can see the monitoring of the GPUs, CPU, memory...
Also, I will say again that with only one GPU in the training everything works great.
Hi, I still have this problem and would appreciate some help with it.
@<1523701070390366208:profile|CostlyOstrich36> ?
Also, what GPUs are you running on that machine?
What versions of clearml-agent & clearml are you using? Is it a self-hosted server?
To continue Ido's question - we use the following command:
In our training, we build on Facebook's Detectron2. The events file is created fine and we do see everything in the TensorBoard UI, but when using multiple GPUs we don't see anything in the stated UI windows, and ClearML also states that it can't find the iteration number from the code.
What is the command you used to run the agent?
No, I have one queue, with one server in that queue. This server has 2 GPUs. In my training code I can choose whether to use 2 GPUs or just one for the training....
Hi @<1779681046892122112:profile|EnviousHare17> and @<1774969995759980544:profile|SmoggyGoose12> ,
I ran this code example:
# ClearML - Example of pytorch with tensorboard>=v1.14
#
from __future__ import print_function
import argparse
import os
from tempfile import gettempdir
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
from clearml import Task
class Net(nn.Module):
    """Small convolutional classifier producing 10 log-probabilities per sample.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed two
    fully connected layers. The flatten step hard-codes 320 features per
    sample, which matches single-channel 28x28 inputs such as MNIST digits.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Layers are created in a fixed order so that seeded initialisation
        # is reproducible.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)

    def forward(self, x):
        """Return per-class log-probabilities (``log_softmax`` over dim 1)."""
        # First convolution stage: conv -> 2x2 max-pool -> ReLU.
        stage1 = F.max_pool2d(self.conv1(x), 2)
        stage1 = F.relu(stage1)

        # Second convolution stage, with channel dropout before pooling.
        stage2 = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(stage2, 2))

        # Classifier head: flatten to 320 features, then two linear layers.
        flat = stage2.view(-1, 320)
        hidden = F.relu(self.fc1(flat))
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)
def train(model, epoch, train_loader, args, optimizer, writer):
    """Run one training epoch, logging the loss to stdout and to ``writer``.

    Args:
        model: network to optimise; it is switched to training mode.
        epoch: epoch number, used in the console output and to compute the
            global step passed to ``writer``.
        train_loader: iterable of ``(data, target)`` batches that supports
            ``len()`` and exposes a ``dataset`` attribute.
        args: namespace providing ``cuda`` (move batches to the GPU) and
            ``log_interval`` (log every N batches).
        optimizer: optimiser stepping the parameters of ``model``.
        writer: object exposing ``add_scalar(tag, value, step)``.
    """
    model.train()
    num_batches = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        # Tensors are passed directly; the deprecated Variable wrapper is
        # not needed.
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # Extract the Python float once and reuse it for both sinks.
            loss_value = loss.item()
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / num_batches, loss_value))
            # Global step: batches from earlier epochs plus the current index.
            niter = epoch * num_batches + batch_idx
            writer.add_scalar('Train/Loss', loss_value, niter)
def test(model, test_loader, args, optimizer, writer):
model.eval()
test_loss = 0
correct = 0
for niter, (data, target) in enumerate(test_loader):
if args.cuda:
data, target = data.cuda(), target.cuda()
data, target = Variable(data), Variable(target)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').data.item() # sum up batch loss
pred = output.data.max(1)[1] # get the index of the max log-probability
pred = pred.eq(target.data).cpu().sum()
writer.add_scalar('Test/Loss', pred, niter)
correct += pred
if niter % 100 == 0:
writer.add_image('test', data[0, :, :, :], niter)
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def main():
    """Train and evaluate the MNIST example network with ClearML logging.

    Parses the command-line options, registers the run through ``Task.init``,
    trains ``Net`` for ``--epochs`` epochs, and after every epoch saves a
    TorchScript copy of the network to the system temp directory and runs the
    evaluation pass. Scalars are reported through a TensorBoard
    ``SummaryWriter``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()

    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='examples', task_name='PyTorch with TensorBoard')  # noqa: F841

    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Worker processes and pinned memory are only requested for GPU runs.
    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    # Both splits share the same preprocessing pipeline.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transform),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net()
    if args.cuda:
        model = nn.DataParallel(model)  # Use all available GPUs
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    try:
        for epoch in range(1, args.epochs + 1):
            train(model, epoch, train_loader, args, optimizer, writer)
            # Only the DataParallel wrapper has a `.module`; on CPU the model
            # is the bare network, so unwrap conditionally instead of
            # raising AttributeError.
            net = model.module if isinstance(model, nn.DataParallel) else model
            m = torch.jit.script(net)
            m.save(os.path.join(gettempdir(), 'model{}'.format(epoch)))
            test(model, test_loader, args, optimizer, writer)
    finally:
        # Flush pending TensorBoard events even if training aborts.
        writer.close()
if __name__ == "__main__":
    # Script entry point. The original note marks this guard as a requirement
    # for Windows support - presumably because the DataLoader worker processes
    # started by main() re-import this module there (TODO confirm), so main()
    # must only run when the file is executed directly.
    main()
I ran it on a machine with 4 GPUs and got the stats for all of the GPUs (I attached an image with the output).
Do you have some example code that I can run to reproduce the issue? Can you try running this code and check whether you get the stats for all GPUs?
@<1774969995759980544:profile|SmoggyGoose12> how do you report the scalars? with tb SummaryWriter?
In the UI, if you click the eye symbol, you have only the monitoring options?
We will try it out on our end and let you know. Thanks!!! FYI @<1774969995759980544:profile|SmoggyGoose12>
Yes, there is no issue with nvidia-smi: it recognizes the 2 GPUs and successfully uses them for the training. The only problem is with the metrics/scalars in the UI when I use 2 GPUs.
@<1744891825086271488:profile|RoundElephant20> 1. We use Tensorboard SummaryWriter
Even when I don't add the --gpus flag, it doesn't work.
You have two queues, one for 1xGPU and the other for 2xGPU, two workers on the GPU machine are running with each listening to the relevant queue. Is that the setup?