```python
# ClearML - Example of PyTorch MNIST training integration
from __future__ import print_function
import argparse
import os
from tempfile import gettempdir

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from clearml import OutputModel
from clearml import Task, Logger


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, 4 * 4 * 50)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            Logger.current_logger().report_scalar(
                "train", "loss", iteration=(epoch * len(train_loader) + batch_idx), value=loss.item())
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(args, model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    Logger.current_logger().report_scalar(
        "test", "loss", iteration=epoch, value=test_loss)
    Logger.current_logger().report_scalar(
        "test", "accuracy", iteration=epoch, value=(correct / len(test_loader.dataset)))
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='examples', task_name='PyTorch MNIST train')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=3, metavar='N',
                        help='number of epochs to train (default: 3)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='for saving the current model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(os.path.join('..', 'data'), train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(os.path.join('..', 'data'), train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    # Save the same state dict under two different file names and register
    # each file as a separate OutputModel on the task
    state_dict = model.state_dict()
    filename = os.path.join(gettempdir(), "mnist_cnn1.pt")
    torch.save(state_dict, filename)
    mv1 = OutputModel(name='mnist_cnn1', task=task)
    mv1.update_weights(filename)

    filename = os.path.join(gettempdir(), "mnist_cnn2.pt")
    torch.save(state_dict, filename)
    mv2 = OutputModel(name='mnist_cnn2', task=task)
    mv2.update_weights(filename)


if __name__ == '__main__':
    main()
```
Hi, Erez!
Thank you for the example, I checked it out. It really does create two models. But the thing is, these two models have different file names here. In my scenario, however, it's more convenient for me to have the same file name and different directories for the models. In that case, all my models get overwritten by the latest logged one (as in my screenshot above).
Fortunately, if I use `upload_artifact()` instead (which I eventually went with), I manage to achieve what I want (see the screenshots below). In my case, that's perfectly enough, but I'll be glad to open an issue and describe the bug if it helps.
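For reference, here is a minimal sketch of that `upload_artifact()` workaround; the directory layout, artifact names, and the stand-in model are illustrative assumptions, not code from the thread:

```python
# Minimal sketch of the upload_artifact() workaround (illustrative paths/names).
# Artifacts are keyed by the `name` argument rather than by file name, so the
# same file name under different directories does not get overwritten.
import os

import torch
from clearml import Task

task = Task.init(project_name='examples', task_name='artifact workaround')

model = torch.nn.Linear(4, 2)  # stand-in model, just for the sketch

for version in ('v1', 'v2'):
    dirname = os.path.join('models', version)
    os.makedirs(dirname, exist_ok=True)
    filename = os.path.join(dirname, 'model.ckpt')  # same file name every time
    torch.save(model.state_dict(), filename)
    # Each upload keeps its own entry on the task, keyed by `name`
    task.upload_artifact(name='model_{}'.format(version), artifact_object=filename)
```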
AnxiousSeal95 SweetBadger76 Could you please verify that this is what you meant. I'm still confused about whether I'm doing something wrong or whether everything works as intended and ClearML discriminates models only by file name.
Hi SillySealion58, yeah, in that case we only look at the file name and not the full path. Let me see what we can do internally! Thanks, and happy you found a workaround 😄
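Given that behavior, one possible interim workaround (an illustrative sketch, not an official ClearML fix) is to embed the version in the checkpoint file name itself, so the two checkpoints no longer share a name; paths and names below are placeholders:

```python
# Illustrative sketch: since models are keyed by file name, make the file
# name unique per version while keeping the per-version directories.
import os

import torch
from clearml import OutputModel, Task

task = Task.init(project_name='examples', task_name='unique checkpoint names')

model = torch.nn.Linear(4, 2)  # stand-in model, just for the sketch

for version in ('v1', 'v2'):
    dirname = os.path.join('models', version)
    os.makedirs(dirname, exist_ok=True)
    # 'model_v1.ckpt' / 'model_v2.ckpt' instead of a shared 'model.ckpt'
    filename = os.path.join(dirname, 'model_{}.ckpt'.format(version))
    torch.save(model.state_dict(), filename)
    output_model = OutputModel(name='model_{}'.format(version), task=task)
    output_model.update_weights(filename)
```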
Unfortunately, the other parameters like `tags` and `comment` didn't help to separate the models.
Hi, Erez!
Thank you for your answer! I'll see if it solves the problem
Hi SillySealion58,
you can discriminate between your output models when you instantiate them. There are parameters like `name`, `tags`, or `comment` that all belong to the `OutputModel` constructor.
That way you could use the same file name for all the checkpoints and still have them differentiated in the task. Does that make sense?
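In code, that suggestion would look roughly like the sketch below; the paths, labels, and stand-in model are placeholders (and, per the rest of the thread, the user found that identical file names still collapsed into one model entry despite these parameters):

```python
# Sketch of the suggestion: same file name for every checkpoint, with the
# models told apart via the OutputModel constructor arguments name, tags,
# and comment (placeholder values throughout).
import os

import torch
from clearml import OutputModel, Task

task = Task.init(project_name='examples', task_name='labelled output models')

model = torch.nn.Linear(4, 2)  # stand-in model, just for the sketch

os.makedirs('models/v1', exist_ok=True)
torch.save(model.state_dict(), 'models/v1/model.ckpt')
mv1 = OutputModel(task=task, name='model_v1', tags=['v1'],
                  comment='first checkpoint')
mv1.update_weights('models/v1/model.ckpt')  # same file name as v2

os.makedirs('models/v2', exist_ok=True)
torch.save(model.state_dict(), 'models/v2/model.ckpt')
mv2 = OutputModel(task=task, name='model_v2', tags=['v2'],
                  comment='second checkpoint')
mv2.update_weights('models/v2/model.ckpt')
```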
Thank you, but although I'm already using the `name` parameter mentioned in your response in my code, I can see only one model on the task's page:
```python
filename = './models/v1/model.ckpt'
torch.save(state_dict, filename)
mv1 = OutputModel(name='model_v1', task=task)
mv1.update_weights(filename, upload_uri=my_uri)

update_model(mynn.multiplier)
state_dict = mynn.state_dict()

filename = './models/v2/model.ckpt'
torch.save(state_dict, filename)
mv2 = OutputModel(name='model_v2', task=task)
mv2.update_weights(filename, upload_uri=my_uri)
```
Hi SillySealion58,
I'm Erez from ClearML! I'm revisiting the way we manage models on tasks, stumbled upon this unanswered question, and wanted to help!
The code above works and creates two models. Note that we capture the models when you call `torch.save()` and we save the file name. The file name is also the "name" which you can use to modify models later on. If this is still relevant, I'd be happy if you could tell me whether it worked!