
Commit 4a23de8

vfdev-5 and ydcjeff authored
Nits and fixes: more options for distributed configuration and readme/code filtering (#71)
* More options for distributed configuration and readme/code filtering
  - removed node_rank as unused
  - other nits
* Added persistent_workers option
* refactor: rm node_rank, update READMEs
* fix: corrects for distributed trainings
* fix: uncomment for tmpdir
* fix: test
* fix: broadcast output_dir on rank 0
* Apply suggestions from code review

Co-authored-by: ydcjeff <ydcjeff@outlook.com>
Co-authored-by: ydcjeff <ydcjeff@gmail.com>
1 parent 3146ffd commit 4a23de8

15 files changed (+234 −166 lines)


.github/run_test.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -xu
+set -xeuo pipefail
 
 if [ $1 == "generate" ]; then
     python ./tests/generate.py
```

app/utils.py

Lines changed: 4 additions & 7 deletions

```diff
@@ -1,20 +1,17 @@
 """Utilities module.
 """
-import importlib.util
+from importlib.machinery import SourceFileLoader
 
 
-# copied from
-# https://github.com/jrieke/traingenerator/blob/76e637975989d11c549c17694c5603a409e184dd/app/utils.py#L14-L29
 def import_from_file(module_name: str, filepath: str):
     """Imports a module from file.
+
     Args:
         module_name (str): Assigned to the module's __name__ parameter (does not
             influence how the module is named outside of this function)
         filepath (str): Path to the .py file
+
     Returns:
         The module
     """
-    spec = importlib.util.spec_from_file_location(module_name, filepath)
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
+    return SourceFileLoader(module_name, filepath).load_module()
```
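
For context, a minimal usage sketch of the simplified helper. This is illustrative only: the import path assumes `app` is importable as a package, and the file path and `main` attribute are hypothetical placeholders, not part of the repo.

```python
# Illustrative usage of import_from_file (path and attribute are placeholders).
from app.utils import import_from_file

module = import_from_file("generated_main", "/tmp/generated/main.py")
print(module.__name__)  # -> "generated_main"
module.main()           # hypothetical entry point in the loaded script
```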

templates/_base/_argparse.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -36,16 +36,16 @@
         this is recommended to be set to the number of GPUs in your system
         so that each process can be bound to a single GPU ({{ nproc_per_node }})""",
     },
-    "nnodes": {
-        "default": {{nnodes}},
-        "type": int,
-        "help": "number of nodes to use for distributed training ({{ nnodes }})",
-    },
     "node_rank": {
         "default": {{node_rank}},
         "type": int,
         "help": "rank of the node for multi-node distributed training ({{ node_rank }})",
     },
+    "nnodes": {
+        "default": {{nnodes}},
+        "type": int,
+        "help": "number of nodes to use for distributed training ({{ nnodes }})",
+    },
     "master_addr": {
         "default": {{master_addr}},
         "type": str,
```

templates/_base/_sidebar.py

Lines changed: 11 additions & 8 deletions

```diff
@@ -25,9 +25,18 @@ def default_none_options(config):
 
 def distributed_options(config):
     st.markdown("## Distributed Training Options")
-    if st.checkbox("Use distributed training"):
+    config["use_distributed_training"] = st.checkbox("Use distributed training", value=False)
+    if config["use_distributed_training"]:
+
+        executor = st.selectbox(
+            "Executor", options=("Use torch.distributed.launch", "main.py spawns children processes")
+        )
+        config["use_distributed_launcher"] = executor == "Use torch.distributed.launch"
+
         config["nproc_per_node"] = st.number_input(
-            "Number of processes to launch on each node (nproc_per_node)", min_value=1
+            "Number of processes to launch on each node (nproc_per_node)",
+            min_value=1,
+            value=2,
         )
         config["nnodes"] = st.number_input("Number of nodes to use for distributed training (nnodes)", min_value=1)
         if config["nnodes"] > 1:
@@ -36,12 +45,6 @@ def distributed_options(config):
                 " namely 'gloo' and 'nccl' backends. For other backends,"
                 " please specify spawn_kwargs in main.py"
             )
-            config["node_rank"] = st.number_input(
-                "Rank of the node for multi-node distributed training (node_rank)",
-                min_value=0,
-            )
-            if config["node_rank"] > (config["nnodes"] - 1):
-                st.error(f"node_rank should be between 0 and {config['nnodes'] - 1}")
         config["master_addr"] = st.text_input(
             "Master node TCP/IP address for torch native backends (master_addr)",
             value="'127.0.0.1'",
```

templates/gan/README.md

Lines changed: 30 additions & 30 deletions

````diff
@@ -11,7 +11,6 @@ Table of Contents
 
 - [Getting Started](#getting-started)
 - [Training](#training)
-- [PyTorch Hub](#pytorch-hub)
 - [Configurations](#configurations)
 
 </details>
@@ -29,15 +28,12 @@ gan
 ├── config.py
 ├── datasets.py
 ├── handlers.py
-├── hubconf.py
 ├── main.py
 ├── models.py
 ├── requirements.txt
 ├── test_all.py
 ├── trainers.py
 └── utils.py
-
-0 directories, 11 files
 ```
 
 </details>
@@ -57,75 +53,79 @@ gan
 
 ## Training
 
+{% if not use_distributed_training %}
+
 ### Single Node, Single GPU
 
 ```sh
 python main.py --verbose
 ```
 
+{% else %}
+{% if nnodes < 2 %}
+
 ### Single Node, Multiple GPUs
 
+{% if use_distributed_launcher %}
+
 - Using `torch.distributed.launch` (recommended)
 
 ```sh
 python -m torch.distributed.launch \
-  --nproc_per_node=2 \
+  --nproc_per_node={{nproc_per_node}} \
   --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
 ```
 
+{% else %}
+
 - Using function spawn inside the code
 
 ```sh
 python main.py \
   --backend="nccl" \
-  --nproc_per_node=2 \
-  --verbose \
+  --nproc_per_node={{nproc_per_node}} \
+  --verbose
 ```
 
+{% endif %}
+{% else %}
+
 ### Multiple Nodes, Multiple GPUs
 
-Let's start training on two nodes with 2 gpus each. We assuming that master node can be connected as master, e.g. ping master.
+Let's start training on {{nnodes}} nodes with {{nproc_per_node}} gpus each:
 
 - Execute on master node
 
 ```sh
 python -m torch.distributed.launch \
-  --nnodes=2 \
-  --nproc_per_node=2 \
+  --nnodes={{nnodes}} \
+  --nproc_per_node={{nproc_per_node}} \
   --node_rank=0 \
-  --master_addr=master \
-  --master_port=2222 \
+  --master_addr={{master_addr}} \
+  --master_port={{master_port}} \
  --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
```
 
 - Execute on worker node
 
 ```sh
 python -m torch.distributed.launch \
-  --nnodes=2 \
-  --nproc_per_node=2 \
-  --node_rank=1 \
-  --master_addr=master \
-  --master_port=2222 \
+  --nnodes={{nnodes}} \
+  --nproc_per_node={{nproc_per_node}} \
+  --node_rank=<node_rank> \
+  --master_addr={{master_addr}} \
+  --master_port={{master_port}} \
   --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
 ```
 
-### Colab 8 TPUs
-
-```sh
-python main.py --verbose --backend='xla-tpu' --nproc_per_node=8
-```
-
-## PyTorch Hub
-
-- Edit `hubconf.py` to use the custom model easily via `torch.hub.load()`.
-- Add additional requirements inside `dependencies` list in `hubconf.py`.
+{% endif %}
+{% endif %}
 
 ## Configurations
 
````

templates/gan/main.py

Lines changed: 27 additions & 15 deletions

```diff
@@ -16,8 +16,7 @@
 
 from datasets import get_datasets
 from trainers import create_trainers
-from handlers import get_handlers, get_logger
-from utils import setup_logging, log_metrics, log_basic_info, initialize, resume_from
+from utils import setup_logging, log_metrics, log_basic_info, initialize, resume_from, get_handlers, get_logger
 from config import get_default_parser
 
 
@@ -36,6 +35,19 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     rank = idist.get_rank()
     manual_seed(config.seed + rank)
 
+    # -----------------------
+    # create output folder
+    # -----------------------
+
+    if rank == 0:
+        now = datetime.now().strftime("%Y%m%d-%H%M%S")
+        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
+        path = Path(config.output_dir, name)
+        path.mkdir(parents=True, exist_ok=True)
+        config.output_dir = path.as_posix()
+
+    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))
+
     # -----------------------------
     # datasets and dataloaders
     # -----------------------------
@@ -45,7 +57,10 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     train_dataloader = idist.auto_dataloader(
         train_dataset,
         batch_size=config.batch_size,
-        num_workers=config.num_workers
+        num_workers=config.num_workers,
+        {% if use_distributed_training and not use_distributed_launcher %}
+        persistent_workers=True,
+        {% endif %}
     )
 
     # ------------------------------------------
@@ -58,9 +73,10 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     # -----------------------------
     # train_engine and eval_engine
     # -----------------------------
-    real_labels = torch.ones(config.batch_size, device=device)
-    fake_labels = torch.zeros(config.batch_size, device=device)
-    fixed_noise = torch.randn(config.batch_size, config.z_dim, 1, 1, device=device)
+    ws = idist.get_world_size()
+    real_labels = torch.ones(config.batch_size // ws, device=device)
+    fake_labels = torch.zeros(config.batch_size // ws, device=device)
+    fixed_noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)
 
     train_engine = create_trainers(
         config=config,
@@ -75,7 +91,6 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     )
 
     # -------------------------------------------
-    # update config with optimizer parameters
     # setup engines logger with python logging
     # print training configurations
     # -------------------------------------------
@@ -203,20 +218,17 @@ def main():
     parser = ArgumentParser(parents=[get_default_parser()])
     config = parser.parse_args()
 
-    if config.output_dir:
-        now = datetime.now().strftime("%Y%m%d-%H%M%S")
-        name = f'{config.dataset}-backend-{idist.backend()}-{now}'
-        path = Path(config.output_dir, name)
-        path.mkdir(parents=True, exist_ok=True)
-        config.output_dir = path
-
     with idist.Parallel(
         backend=config.backend,
+        {% if use_distributed_training and not use_distributed_launcher %}
        nproc_per_node=config.nproc_per_node,
-        nnodes=config.nnodes,
+        {% if nnodes > 1 and not use_distributed_launcher%}
         node_rank=config.node_rank,
+        nnodes=config.nnodes,
         master_addr=config.master_addr,
         master_port=config.master_port,
+        {% endif %}
+        {% endif %}
     ) as parallel:
         parallel.run(run, config=config)
```
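
Distilled from the change above, the rank-0/broadcast pattern in isolation: only the process with rank 0 creates the run directory, and `idist.broadcast` then hands the resulting path to every other process so all ranks write to the same place. This is a sketch with the folder name simplified, not the generated file itself.

```python
# Sketch of the output-dir pattern introduced above (folder name simplified).
# idist.broadcast returns rank 0's value on every participating process.
from pathlib import Path

import ignite.distributed as idist


def setup_output_dir(config):
    if idist.get_rank() == 0:
        path = Path(config.output_dir, "run")  # real code uses dataset/backend/timestamp
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()
    # every rank receives the directory created by rank 0
    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))
```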

templates/gan/test_all.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -31,7 +31,7 @@ def test_get_datasets(tmp_path):
     assert isinstance(batch, Iterable)
     assert isinstance(batch[0], Tensor)
     assert isinstance(batch[1], Number)
-    assert batch[0].dim == 3
+    assert batch[0].ndim == 3
 
 
 def test_models():
```
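
The reason behind this one-character-looking fix: `Tensor.dim` without parentheses is a bound method, so comparing it to an integer is always False and the old assertion could never pass; `ndim` is the integer attribute (and `dim()` the equivalent call). A small self-contained check:

```python
# Why the assertion changed: Tensor.dim is a method object, not an int.
import torch

t = torch.zeros(3, 32, 32)
print(t.dim == 3)    # False: a bound method never equals an int
print(t.dim() == 3)  # True
print(t.ndim == 3)   # True
```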

templates/gan/trainers.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -4,6 +4,7 @@
 from typing import Any
 
 import torch
+import ignite.distributed as idist
 from ignite.engine import Engine
 from torch.cuda.amp import autocast
 from torch.optim.optimizer import Optimizer
@@ -86,7 +87,8 @@ def train_function(
     errD_real.backward()
 
     # get fake image from generator
-    noise = torch.randn(config.batch_size, config.z_dim, 1, 1, device=device)
+    ws = idist.get_world_size()
+    noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)
     fake = netG(noise)
 
     # train with fake
@@ -125,7 +127,7 @@ def create_trainers(**kwargs) -> Engine:
 
     Parameters
     ----------
-    kwargs: keyword arguments passed to both train_function and evaluate_function
+    kwargs: keyword arguments passed to both train_function
 
     Returns
     -------
```