add support for multi-GPU training and evaluation
Browse files- README.md +13 -0
- configs/metadata.json +2 -1
- configs/multi_gpu_evaluate.json +28 -0
- configs/multi_gpu_train.json +40 -0
- docs/README.md +13 -0
README.md
CHANGED
|
@@ -58,12 +58,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
|
|
| 58 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
| 59 |
```
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
#### Override the `train` config to execute evaluation with the trained model:
|
| 62 |
|
| 63 |
```
|
| 64 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
| 65 |
```
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
#### Execute inference:
|
| 68 |
|
| 69 |
```
|
|
|
|
| 58 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
| 59 |
```
|
| 60 |
|
| 61 |
+
#### Override the `train` config to execute multi-GPU training:
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
|
| 68 |
+
|
| 69 |
#### Override the `train` config to execute evaluation with the trained model:
|
| 70 |
|
| 71 |
```
|
| 72 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
| 73 |
```
|
| 74 |
|
| 75 |
+
#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
|
| 76 |
+
|
| 77 |
+
```
|
| 78 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
```
|
| 79 |
+
|
| 80 |
#### Execute inference:
|
| 81 |
|
| 82 |
```
|
configs/metadata.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
| 3 |
-
"version": "0.3.2",
|
| 4 |
"changelog": {
|
|
|
|
| 5 |
"0.3.2": "restructure readme to match updated template",
|
| 6 |
"0.3.1": "add figures of workflow and metrics, add invert transform",
|
| 7 |
"0.3.0": "update dataset processing",
|
|
|
|
| 1 |
{
|
| 2 |
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
|
| 3 |
+
"version": "0.4.0",
|
| 4 |
"changelog": {
|
| 5 |
+
"0.4.0": "add support for multi-GPU training and evaluation",
|
| 6 |
"0.3.2": "restructure readme to match updated template",
|
| 7 |
"0.3.1": "add figures of workflow and metrics, add invert transform",
|
| 8 |
"0.3.0": "update dataset processing",
|
configs/multi_gpu_evaluate.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
|
| 3 |
+
"network": {
|
| 4 |
+
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
| 5 |
+
"module": "$@network_def.to(@device)",
|
| 6 |
+
"device_ids": [
|
| 7 |
+
"@device"
|
| 8 |
+
]
|
| 9 |
+
},
|
| 10 |
+
"validate#sampler": {
|
| 11 |
+
"_target_": "DistributedSampler",
|
| 12 |
+
"dataset": "@validate#dataset",
|
| 13 |
+
"even_divisible": false,
|
| 14 |
+
"shuffle": false
|
| 15 |
+
},
|
| 16 |
+
"validate#dataloader#sampler": "@validate#sampler",
|
| 17 |
+
"validate#handlers#1#_disabled_": "$dist.get_rank() > 0",
|
| 18 |
+
"evaluating": [
|
| 19 |
+
"$import torch.distributed as dist",
|
| 20 |
+
"$dist.init_process_group(backend='nccl')",
|
| 21 |
+
"$torch.cuda.set_device(@device)",
|
| 22 |
+
"$setattr(torch.backends.cudnn, 'benchmark', True)",
|
| 23 |
+
"$import logging",
|
| 24 |
+
"$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
| 25 |
+
"$@validate#evaluator.run()",
|
| 26 |
+
"$dist.destroy_process_group()"
|
| 27 |
+
]
|
| 28 |
+
}
|
configs/multi_gpu_train.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
|
| 3 |
+
"network": {
|
| 4 |
+
"_target_": "torch.nn.parallel.DistributedDataParallel",
|
| 5 |
+
"module": "$@network_def.to(@device)",
|
| 6 |
+
"device_ids": [
|
| 7 |
+
"@device"
|
| 8 |
+
],
|
| 9 |
+
"find_unused_parameters": true
|
| 10 |
+
},
|
| 11 |
+
"train#sampler": {
|
| 12 |
+
"_target_": "DistributedSampler",
|
| 13 |
+
"dataset": "@train#dataset",
|
| 14 |
+
"even_divisible": true,
|
| 15 |
+
"shuffle": true
|
| 16 |
+
},
|
| 17 |
+
"train#dataloader#sampler": "@train#sampler",
|
| 18 |
+
"train#dataloader#shuffle": false,
|
| 19 |
+
"train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
|
| 20 |
+
"validate#sampler": {
|
| 21 |
+
"_target_": "DistributedSampler",
|
| 22 |
+
"dataset": "@validate#dataset",
|
| 23 |
+
"even_divisible": false,
|
| 24 |
+
"shuffle": false
|
| 25 |
+
},
|
| 26 |
+
"validate#dataloader#sampler": "@validate#sampler",
|
| 27 |
+
"validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
|
| 28 |
+
"training": [
|
| 29 |
+
"$import torch.distributed as dist",
|
| 30 |
+
"$dist.init_process_group(backend='nccl')",
|
| 31 |
+
"$torch.cuda.set_device(@device)",
|
| 32 |
+
"$monai.utils.set_determinism(seed=123)",
|
| 33 |
+
"$setattr(torch.backends.cudnn, 'benchmark', True)",
|
| 34 |
+
"$import logging",
|
| 35 |
+
"$@train#trainer.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
| 36 |
+
"$@validate#evaluator.logger.setLevel(logging.WARNING if dist.get_rank() > 0 else logging.INFO)",
|
| 37 |
+
"$@train#trainer.run()",
|
| 38 |
+
"$dist.destroy_process_group()"
|
| 39 |
+
]
|
| 40 |
+
}
|
docs/README.md
CHANGED
|
@@ -51,12 +51,25 @@ For more detailed usage instructions, visit the [MONAI Bundle Configuration Page]
|
|
| 51 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
| 52 |
```
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
#### Override the `train` config to execute evaluation with the trained model:
|
| 55 |
|
| 56 |
```
|
| 57 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
| 58 |
```
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
#### Execute inference:
|
| 61 |
|
| 62 |
```
|
|
|
|
| 51 |
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
|
| 52 |
```
|
| 53 |
|
| 54 |
+
#### Override the `train` config to execute multi-GPU training:
|
| 55 |
+
|
| 56 |
+
```
|
| 57 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file "['configs/train.json','configs/multi_gpu_train.json']" --logging_file configs/logging.conf
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or make other necessary changes according to the machine used. For more details, please refer to [PyTorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
|
| 61 |
+
|
| 62 |
#### Override the `train` config to execute evaluation with the trained model:
|
| 63 |
|
| 64 |
```
|
| 65 |
python -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json']" --logging_file configs/logging.conf
|
| 66 |
```
|
| 67 |
|
| 68 |
+
#### Override the `train` config and `evaluate` config to execute multi-GPU evaluation:
|
| 69 |
+
|
| 70 |
+
```
|
| 71 |
+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run evaluating --meta_file configs/metadata.json --config_file "['configs/train.json','configs/evaluate.json','configs/multi_gpu_evaluate.json']" --logging_file configs/logging.conf
```
|
| 72 |
+
|
| 73 |
#### Execute inference:
|
| 74 |
|
| 75 |
```
|