Unanswered
Hi! I'm running launch_multi_node with PyTorch Lightning
Hi @<1578555761724755968:profile|GrievingKoala83> ! We have released clearml==1.16.3rc1, which should solve the issue now. Just specify task.launch_multi_node(nodes, devices=gpus). For example:
import sys
import os
from argparse import ArgumentParser
import pytorch_lightning as pl
from pytorch_lightning.strategies.ddp import DDPStrategy
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets.mnist import MNIST
from clearml import Task
class LitClassifier(pl.LightningModule):
    """Two-layer fully connected classifier trained with cross-entropy.

    Inputs are flattened to ``28 * 28`` features, passed through a hidden
    layer of ``hidden_dim`` units with ReLU, and projected to 10 class
    logits.

    Args:
        hidden_dim: Width of the hidden layer.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, hidden_dim=128, learning_rate=1e-3):
        super().__init__()
        # Makes the constructor arguments available through ``self.hparams``,
        # which the layers and the optimizer below read from.
        self.save_hyperparameters()
        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        """Return raw class logits for a batch ``x``."""
        x = x.view(x.size(0), -1)  # flatten everything except the batch dim
        x = torch.relu(self.l1(x))
        # No activation on the output layer: cross_entropy expects
        # unnormalized logits, and a ReLU here would clamp them to >= 0.
        return self.l2(x)

    def _loss(self, batch):
        """Compute the cross-entropy loss for one ``(inputs, targets)`` batch."""
        x, y = batch
        return F.cross_entropy(self(x), y)

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch``."""
        return self._loss(batch)

    def validation_step(self, batch, batch_idx):
        """Log the validation loss as ``valid_loss`` and return it."""
        loss = self._loss(batch)
        self.log('valid_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Return the test loss for ``batch``."""
        return self._loss(batch)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Return a parser extending ``parent_parser`` with the model options.

        Adds ``--hidden_dim`` and ``--learning_rate``.
        """
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--hidden_dim', type=int, default=128)
        # NOTE: this CLI default (0.0001) differs from the constructor
        # default (1e-3).
        parser.add_argument('--learning_rate', type=float, default=0.0001)
        return parser
if __name__ == '__main__':
    # Script entry point: configure the ClearML task, request a multi-node
    # launch, then train LitClassifier on MNIST with PyTorch Lightning.
    Task.force_store_standalone_script()
    Task.add_requirements("./requirements.txt")
    pl.seed_everything(0)

    parser = ArgumentParser()
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--max_epochs', default=3, type=int)
    # Appended last so it overrides any earlier --max_epochs value and the
    # default of 3: this example always trains for a single epoch.
    sys.argv.extend(['--max_epochs', '1'])
    parser = LitClassifier.add_model_specific_args(parser)
    args = parser.parse_args()

    task = Task.init(project_name="examples", task_name="pytorch lightning MNIST")
    # NOTE(review): presumably hands execution over to an agent listening on
    # this queue - confirm the queue name matches your setup.
    task.execute_remotely(queue_name="Eugene2")

    nodes = 2
    gpus = 2
    # The same node and device counts are passed to the Trainer below.
    config = task.launch_multi_node(nodes, devices=gpus, hide_children=True)
    print(os.environ)

    # ------------
    # data
    # ------------
    dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor())
    mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(dataset, [55000, 5000])
    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------
    # model + training
    # ------------
    model = LitClassifier(args.hidden_dim, args.learning_rate)
    ddp = DDPStrategy(process_group_backend="nccl")
    # Pass the configured strategy to the Trainer; it was previously created
    # but never used, so the NCCL backend choice was ignored.
    trainer = pl.Trainer(
        max_epochs=args.max_epochs,
        devices=gpus,
        num_nodes=nodes,
        strategy=ddp,
    )
    trainer.fit(model, train_loader, val_loader)
45 Views
0
Answers
4 months ago
4 months ago