Upgrading to 0.4.9 stuck multi-gpu training #2635
torch.multiprocessing.spawn is used to invoke multi-gpu training (with 2 T40 GPUs; CUDA 11.6 installed).
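For context, here is a minimal sketch of that spawn-based launch pattern (the run_training worker and its body are hypothetical placeholders, not MONAI Label's actual code):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def run_training(local_rank, world_size):
    # each spawned process joins the same process group before training starts
    dist.init_process_group("nccl", init_method="env://", rank=local_rank, world_size=world_size)
    torch.cuda.set_device(local_rank)
    # ... build the model, wrap it in DistributedDataParallel, run the training loop ...
    dist.destroy_process_group()

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    # spawn calls run_training(local_rank, world_size) in each of the 2 child processes
    mp.spawn(run_training, args=(2,), nprocs=2)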
Thanks for reporting @Nic-Ma!
Hi @vfdev-5, I think @SachidanandAlle already provided the simple program in the above comment? Thanks.
@Nic-Ma I do not understand his code snippet; it is incomplete, and it is unclear how to execute it.
Hi @SachidanandAlle, could you please share more information about the simple program to reproduce? Thanks in advance.
Best way is to install MONAILabel and trigger training.. later will share the simple 4 steps to do so.. hope that helps to reproduce.. otherwise don't want to put extra effort to write the snippet..
@SachidanandAlle @Nic-Ma with all respect to your project, please put in the effort to provide an executable code snippet.
No worries.. will share the snippet with you soon
Here is the snippet that should help to reproduce the problem...

REQUIREMENTS

CODE

create.py:

import os
import nibabel as nib
import numpy as np
from monai.data import create_test_image_3d
if __name__ == "__main__":
data_dir = "./testdata"
print(f"generating synthetic data to {data_dir} (this may take a while)")
os.makedirs(data_dir)
# set random seed to generate same random data for every node
np.random.seed(seed=0)
for i in range(20):
im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
n = nib.Nifti1Image(im, np.eye(4))
nib.save(n, os.path.join(data_dir, f"img{i:d}.nii.gz"))
n = nib.Nifti1Image(seg, np.eye(4))
        nib.save(n, os.path.join(data_dir, f"seg{i:d}.nii.gz"))

multi.py:

import logging
import os
import sys
from glob import glob
import monai
import torch
from ignite.metrics import Accuracy
from monai.data import DataLoader, Dataset
from monai.engines import SupervisedTrainer
from monai.handlers import CheckpointSaver, LrScheduleHandler, StatsHandler, from_engine
from monai.inferers import SimpleInferer
from monai.transforms import (
Activationsd,
AsChannelFirstd,
AsDiscreted,
Compose,
KeepLargestConnectedComponentd,
LoadImaged,
RandCropByPosNegLabeld,
RandRotate90d,
ScaleIntensityd,
)
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data.distributed import DistributedSampler
def train(rank, data_dir):
images = sorted(glob(os.path.join(data_dir, "img*.nii.gz")))
segs = sorted(glob(os.path.join(data_dir, "seg*.nii.gz")))
train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]
# define transforms for image and segmentation
train_transforms = Compose(
[
LoadImaged(keys=["image", "label"]),
AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
ScaleIntensityd(keys="image"),
RandCropByPosNegLabeld(
keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
),
RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
]
)
# create a training data loader
train_ds = Dataset(data=train_files, transform=train_transforms)
# create a training data sampler
train_sampler = DistributedSampler(train_ds)
# use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
train_loader = DataLoader(
train_ds,
batch_size=2,
shuffle=False,
num_workers=2,
pin_memory=True,
sampler=train_sampler,
)
# create UNet, DiceLoss and Adam optimizer
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
net = monai.networks.nets.UNet(
spatial_dims=3,
in_channels=1,
out_channels=1,
channels=(16, 32, 64, 128, 256),
strides=(2, 2, 2, 2),
num_res_units=2,
).to(device)
loss = monai.losses.DiceLoss(sigmoid=True)
opt = torch.optim.Adam(net.parameters(), 1e-3)
lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
# wrap the model with DistributedDataParallel module
net = DistributedDataParallel(net, device_ids=[device])
train_post_transforms = Compose(
[
Activationsd(keys="pred", sigmoid=True),
AsDiscreted(keys="pred", threshold=0.5),
KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
]
)
train_handlers = [
LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
]
if rank == 0:
train_handlers.extend(
[
StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
]
)
trainer = SupervisedTrainer(
device=device,
max_epochs=5,
train_data_loader=train_loader,
network=net,
optimizer=opt,
loss_function=loss,
inferer=SimpleInferer(),
# if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation
amp=True,
postprocessing=train_post_transforms,
key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
train_handlers=train_handlers,
)
trainer.run()
torch.distributed.destroy_process_group()
# This way multi-gpu training works (SO NOT A PROBLEM => IGNORE)
# torchrun --standalone --nnodes=1 --nproc_per_node=2 multi.py
def main_cmd():
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    # LOCAL_RANK is set by torchrun as a string; convert it so the rank == 0 check works
    rank = int(os.environ["LOCAL_RANK"])
    train(rank, "./testdata")
# This way of invoking multi-gpu training does NOT WORK anymore with the LATEST version of ignite
# python multi.py
def main_spawn():
    os.putenv("MASTER_ADDR", "127.0.0.1")
    os.putenv("MASTER_PORT", "1234")
    torch.multiprocessing.spawn(main_worker, nprocs=2)
def main_worker(rank):
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
print(f"+++ Worker Rank: {rank}")
torch.distributed.init_process_group(backend="nccl", init_method="env://", world_size=2, rank=rank)
train(rank, "./testdata")
if __name__ == "__main__":
    main_spawn()

DUMMY DATASET

DOES NOT WORK

WORKS
#2635 (comment)
@SachidanandAlle thanks a lot for the repro code snippet! This issue is related to the still-open issue #2035, and more precisely to #2035 (comment). It is by chance that 0.4.8 is working, due to

key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},

In 0.4.9 I removed a warning for the DDP context in the metric (#2549), and thus ignite is fully unaware of the DDP context and tries to set it up on rank zero only when using that metric.

@SachidanandAlle a quick workaround fix of the current code would be:

def train(rank, data_dir):
    import ignite
    ignite.distributed.set_local_rank(rank)
    ignite.distributed.sync()
    # ... everything else is the same as in the original `multi.py` ...

By the way, we can simplify the code a bit more using ignite.distributed (idist) helpers:

multi_updated.py:

import logging
import os
import sys
from glob import glob
import ignite.distributed as idist
import monai
import torch
from ignite.metrics import Accuracy
from monai.data import DataLoader, Dataset
from monai.engines import SupervisedTrainer
from monai.handlers import CheckpointSaver, LrScheduleHandler, StatsHandler, from_engine
from monai.inferers import SimpleInferer
from monai.transforms import (
Activationsd,
AsChannelFirstd,
AsDiscreted,
Compose,
KeepLargestConnectedComponentd,
LoadImaged,
RandCropByPosNegLabeld,
RandRotate90d,
ScaleIntensityd,
)
from torch.utils.data.distributed import DistributedSampler
def train(rank, data_dir):
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
images = sorted(glob(os.path.join(data_dir, "img*.nii.gz")))
segs = sorted(glob(os.path.join(data_dir, "seg*.nii.gz")))
train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]
# define transforms for image and segmentation
train_transforms = Compose(
[
LoadImaged(keys=["image", "label"]),
AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
ScaleIntensityd(keys="image"),
RandCropByPosNegLabeld(
keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
),
RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
]
)
# create a training data loader
train_ds = Dataset(data=train_files, transform=train_transforms)
# create a training data sampler
train_sampler = DistributedSampler(train_ds)
# use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
train_loader = DataLoader(
train_ds,
batch_size=2,
shuffle=False,
num_workers=2,
pin_memory=True,
sampler=train_sampler,
)
# create UNet, DiceLoss and Adam optimizer
device = idist.device()
net = monai.networks.nets.UNet(
spatial_dims=3,
in_channels=1,
out_channels=1,
channels=(16, 32, 64, 128, 256),
strides=(2, 2, 2, 2),
num_res_units=2,
)
net = idist.auto_model(net)
loss = monai.losses.DiceLoss(sigmoid=True)
opt = torch.optim.Adam(net.parameters(), 1e-3)
lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
train_post_transforms = Compose(
[
Activationsd(keys="pred", sigmoid=True),
AsDiscreted(keys="pred", threshold=0.5),
KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
]
)
train_handlers = [
LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
]
if rank == 0:
train_handlers.extend(
[
StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
]
)
trainer = SupervisedTrainer(
device=device,
max_epochs=5,
train_data_loader=train_loader,
network=net,
optimizer=opt,
loss_function=loss,
inferer=SimpleInferer(),
# if no FP16 support in GPU or PyTorch version < 1.6, will not enable AMP evaluation
amp=True,
postprocessing=train_post_transforms,
key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
train_handlers=train_handlers,
)
trainer.run()
# python multi_updated.py
if __name__ == "__main__":
idist.spawn("nccl", fn=train, args=("./testdata", ), nproc_per_node=2, master_port="1234") cc @sadra-barikbin and your PR #2633 |
🐛 Bug description
Hi @vfdev-5,
We recently upgraded ignite from 0.4.8 to 0.4.9 in MONAI 0.9.1: Project-MONAI/MONAI#4605.
Then we got this issue report from a user:
something changed related to multi-gpu training between 0.9.1 and 0.9.0... monailabel multi-gpu training is not working.. SupervisedTrainer is getting stuck running the inference step to compute the loss.. after debugging a bit.. I see this is the problem... pytorch-ignite==0.4.8 vs pytorch-ignite==0.4.9. When I downgrade it, all is ok..
CC @wyli @SachidanandAlle
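For illustration only, here is a minimal self-contained sketch (plain torch.distributed, not MONAI or ignite code) of the general failure mode described above: a collective call issued on only one rank blocks forever, which is what a "stuck" trainer looks like from the outside:

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, world_size):
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    value = torch.tensor([float(rank)])
    # correct: every rank takes part in the collective, so it completes
    dist.all_reduce(value)
    print(f"rank {rank}: sum of ranks = {value.item()}")
    # failure mode (left commented out on purpose): if a component that thinks it is
    # running single-process issues the collective on rank 0 only, rank 0 waits
    # forever for the other ranks and training appears stuck
    # if rank == 0:
    #     dist.all_reduce(value)
    dist.destroy_process_group()

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    mp.spawn(worker, args=(2,), nprocs=2)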
Environment

Installation (conda, pip, source): pip