add support for multi-GPU training and evaluation

Browse files

Files changed (5) hide show

README.md +13 -0
configs/metadata.json +2 -1
configs/multi_gpu_evaluate.json +28 -0
configs/multi_gpu_train.json +40 -0
docs/README.md +13 -0

README.md CHANGED Viewed

@@ -58,12 +58,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
 python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
 ```
 #### Override the `train` config to execute evaluation with the trained model:
 ```
 python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
 ```
 #### Execute inference:
 ```

 python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
 ```
+#### Override the `train` config to execute multi-GPU training:
+```
+torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
+```
+Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
 #### Override the `train` config to execute evaluation with the trained model:
 ```
 python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
 ```
+#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
+```
+torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
+```
 #### Execute inference:
 ```

configs/metadata.json CHANGED Viewed

@@ -1,7 +1,8 @@
 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
-    "version": "0.3.2",
     "changelog": {
         "0.3.2": "restructure readme to match updated template",
         "0.3.1": "add figures of workflow and metrics, add invert transform",
         "0.3.0": "update dataset processing",

 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
+    "version": "0.4.0",
     "changelog": {
+        "0.4.0": "add support for multi-GPU training and evaluation",
         "0.3.2": "restructure readme to match updated template",
         "0.3.1": "add figures of workflow and metrics, add invert transform",
         "0.3.0": "update dataset processing",

configs/multi_gpu_evaluate.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "device": "$torch.device(f'cuda:{dist.get_rank()}')",
+    "network": {
+        "_target_": "torch.nn.parallel.DistributedDataParallel",
+        "module": "$@network_def.to(@device)",
+        "device_ids": [
+            "@device"
+        ]
+    },
+    "validate#sampler": {
+        "_target_": "DistributedSampler",
+        "dataset": "@validate#dataset",
+        "even_divisible": false,
+        "shuffle": false
+    },
+    "validate#dataloader#sampler": "@validate#sampler",
+    "validate#handlers#1#_disabled_": "$dist.get_rank() > 0",
+    "evaluating": [
+        "$import torch.distributed as dist",
+        "$dist.init_process_group(backend='nccl')",
+        "$torch.cuda.set_device(@device)",
+        "$setattr(torch.backends.cudnn, 'benchmark', True)",
+        "$import logging",
+        "$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
+        "$@validate#evaluator.run()",
+        "$dist.destroy_process_group()"
+    ]
+}

configs/multi_gpu_train.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+    "device": "$torch.device(f'cuda:{dist.get_rank()}')",
+    "network": {
+        "_target_": "torch.nn.parallel.DistributedDataParallel",
+        "module": "$@network_def.to(@device)",
+        "device_ids": [
+            "@device"
+        ],
+        "find_unused_parameters": true
+    },
+    "train#sampler": {
+        "_target_": "DistributedSampler",
+        "dataset": "@train#dataset",
+        "even_divisible": true,
+        "shuffle": true
+    },
+    "train#dataloader#sampler": "@train#sampler",
+    "train#dataloader#shuffle": false,
+    "train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
+    "validate#sampler": {
+        "_target_": "DistributedSampler",
+        "dataset": "@validate#dataset",
+        "even_divisible": false,
+        "shuffle": false
+    },
+    "validate#dataloader#sampler": "@validate#sampler",
+    "validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
+    "training": [
+        "$import torch.distributed as dist",
+        "$dist.init_process_group(backend='nccl')",
+        "$torch.cuda.set_device(@device)",
+        "$monai.utils.set_determinism(seed=123)",
+        "$setattr(torch.backends.cudnn, 'benchmark', True)",
+        "$import logging",
+        "$@train#trainer.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
+        "$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
+        "$@train#trainer.run()",
+        "$dist.destroy_process_group()"
+    ]
+}

docs/README.md CHANGED Viewed

@@ -51,12 +51,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
 python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
 ```
 #### Override the `train` config to execute evaluation with the trained model:
 ```
 python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
 ```
 #### Execute inference:
 ```

 python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
 ```
+#### Override the `train` config to execute multi-GPU training:
+```
+torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
+```
+Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
 #### Override the `train` config to execute evaluation with the trained model:
 ```
 python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
 ```
+#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
+```
+torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
+```
 #### Execute inference:
 ```