Skip to content

Commit 47c3aae

Browse files
authored
test: run torch.distributed.launch and spawn tests (#74)
test: run torch.distributed.launch and spawn tests (#74)
* test: run torch.distributed.launch and spawn tests * fix: exclude launch and spawn dir in test script * fix: exclude launch and spawn dir in test script * fix: config.model instead of config.dataset * fix: correct exclude in test script * fix: forgot to call --use_env in test script * fix: add backend gloo * fix: increase num_workers for persistent_workers * fix: save logs only on rank zero
1 parent 7a29e98 commit 47c3aae

File tree

10 files changed

+112
-18
lines changed

10 files changed

+112
-18
lines changed

.github/run_test.sh

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,44 @@ set -xeuo pipefail
55
if [ $1 == "generate" ]; then
66
python ./tests/generate.py
77
elif [ $1 == "unittest" ]; then
8-
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1)
8+
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1 -not -path "./tests/dist/launch" -not -path "./tests/dist/spawn")
99
do
1010
cd $dir
1111
pip install -r requirements.txt --progress-bar off -q
1212
cd ../../../
1313
done
14-
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1)
14+
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1 -not -path "./tests/dist/launch" -not -path "./tests/dist/spawn")
1515
do
1616
cd $dir
1717
pytest
1818
cd ../../../
1919
done
20-
elif [ $1 == "integration" ]; then
21-
for file in $(find ./tests/integration -iname "*.sh")
20+
elif [ $1 == "default" ]; then
21+
for file in $(find ./tests/dist -iname "main.py" -not -path "./tests/dist/launch/*" -not -path "./tests/dist/spawn/*" -not -path "./tests/dist/single/*")
2222
do
23-
bash $file
23+
python $file --verbose --log_every_iters 2 --num_workers 1 --epoch_length 10
24+
done
25+
elif [ $1 == "launch" ]; then
26+
for file in $(find ./tests/dist/launch -iname "main.py" -not -path "./tests/dist/launch/single/*")
27+
do
28+
python -m torch.distributed.launch \
29+
--nproc_per_node 2 \
30+
--use_env $file \
31+
--verbose \
32+
--backend gloo \
33+
--num_workers 1 \
34+
--epoch_length 10 \
35+
--log_every_iters 2
36+
done
37+
elif [ $1 == "spawn" ]; then
38+
for file in $(find ./tests/dist/spawn -iname "main.py" -not -path "./tests/dist/spawn/single/*")
39+
do
40+
python $file \
41+
--verbose \
42+
--backend gloo \
43+
--num_workers 1 \
44+
--epoch_length 10 \
45+
--nproc_per_node 2 \
46+
--log_every_iters 2
2447
done
2548
fi

.github/workflows/ci.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ jobs:
4343
- run: python -m torch.utils.collect_env
4444
- run: bash .github/run_test.sh generate
4545
- run: bash .github/run_test.sh unittest
46-
# - run: bash .github/run_test.sh integration
46+
- run: bash .github/run_test.sh default
47+
- run: bash .github/run_test.sh spawn
48+
- run: bash .github/run_test.sh launch
49+
env:
50+
OMP_NUM_THREADS: 1
4751

4852
lint:
4953
runs-on: ubuntu-latest

templates/_base/_argparse.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,11 @@
5656
"type": int,
5757
"help": "master node port for torch native backends ({{ master_port }})",
5858
},
59-
59+
"epoch_length": {
60+
"default": None,
61+
"type": int,
62+
"help": "epoch_length of Engine.run()"
63+
},
6064
# ignite handlers options
6165
"save_every_iters": {
6266
"default": {{save_every_iters}},

templates/_base/_sidebar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def _setup_common_training_handlers_options(config):
119119
)
120120
config["with_pbars"] = st.checkbox(
121121
"Show two progress bars (with_pbars)",
122-
value=True,
122+
value=False,
123123
help=(
124124
"This option will enable two progress bars - one for epoch,"
125125
" one for iteration if `with_pbar_on_iters` is `False`,"

templates/gan/_sidebar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def dataset_options(config):
2424

2525
def dataloader_options(config):
2626
st.markdown("## DataLoader Options")
27-
config["batch_size"] = st.number_input("Train batch size (batch_size)", min_value=1, value=4)
27+
config["batch_size"] = st.number_input("Train batch size (batch_size)", min_value=1, value=16)
2828
config["num_workers"] = st.number_input("Number of workers (num_workers)", min_value=0, value=2)
2929
st.markdown("---")
3030

templates/gan/main.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,21 @@ def print_times(engine):
156156
logger.info(f"Epoch {engine.state.epoch} done. Time per batch: {timer_handler.value():.3f}[s]")
157157
timer_handler.reset()
158158

159+
@train_engine.on(Events.ITERATION_COMPLETED(every=config.log_every_iters))
160+
@idist.one_rank_only()
161+
def print_logs(engine):
162+
fname = config.output_dir / LOGS_FNAME
163+
columns = ["iteration", ] + list(engine.state.metrics.keys())
164+
values = [str(engine.state.iteration), ] + [str(round(value, 5)) for value in engine.state.metrics.values()]
165+
166+
with open(fname, "a") as f:
167+
if f.tell() == 0:
168+
print("\t".join(columns), file=f)
169+
print("\t".join(values), file=f)
170+
message = f"[{engine.state.epoch}/{config.max_epochs}][{engine.state.iteration % len(train_dataloader)}/{len(train_dataloader)}]"
171+
for name, value in zip(columns, values):
172+
message += f" | {name}: {value}"
173+
159174
# -------------------------------------------------------------
160175
# adding handlers using `trainer.on` decorator API
161176
# -------------------------------------------------------------
@@ -193,7 +208,7 @@ def create_plots(engine):
193208
# setup if done. let's run the training
194209
# ------------------------------------------
195210

196-
train_engine.run(train_dataloader, max_epochs=config.max_epochs)
211+
train_engine.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.epoch_length)
197212

198213
# ------------------------------------------------------------
199214
# close the logger after the training completed / terminated
@@ -211,7 +226,8 @@ def create_plots(engine):
211226
# where is my best and last checkpoint ?
212227
# -----------------------------------------
213228

214-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
229+
if best_model_handler is not None:
230+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
215231

216232

217233
def main():

templates/image_classification/_sidebar.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def dataset_options(config):
2020

2121
def dataloader_options(config):
2222
st.markdown("## DataLoader Options")
23-
config["train_batch_size"] = st.number_input("Train batch size (train_batch_size)", min_value=1, value=4)
24-
config["eval_batch_size"] = st.number_input("Eval batch size (eval_batch_size)", min_value=1, value=8)
23+
config["train_batch_size"] = st.number_input("Train batch size (train_batch_size)", min_value=1, value=16)
24+
config["eval_batch_size"] = st.number_input("Eval batch size (eval_batch_size)", min_value=1, value=16)
2525
config["num_workers"] = st.number_input("Number of workers (num_workers)", min_value=0, value=2)
2626
st.markdown("---")
2727

templates/image_classification/main.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
3333

3434
if rank == 0:
3535
now = datetime.now().strftime("%Y%m%d-%H%M%S")
36-
name = f"{config.dataset}-backend-{idist.backend()}-{now}"
36+
name = f"{config.model}-backend-{idist.backend()}-{now}"
3737
path = Path(config.output_dir, name)
3838
path.mkdir(parents=True, exist_ok=True)
3939
config.output_dir = path.as_posix()
@@ -199,7 +199,7 @@ def _():
199199
# setup if done. let's run the training
200200
# ------------------------------------------
201201

202-
train_engine.run(train_dataloader, max_epochs=config.max_epochs)
202+
train_engine.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.epoch_length)
203203

204204
# ------------------------------------------------------------
205205
# close the logger after the training completed / terminated
@@ -217,7 +217,8 @@ def _():
217217
# where is my best and last checkpoint ?
218218
# -----------------------------------------
219219

220-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
220+
if best_model_handler is not None:
221+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
221222

222223

223224
def main():

templates/single/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def _():
178178
# ------------------------------------------
179179
# TODO : PLEASE provide `max_epochs` parameters
180180

181-
train_engine.run(train_dataloader)
181+
train_engine.run(train_dataloader, epoch_length=config.epoch_length)
182182

183183
# ------------------------------------------------------------
184184
# close the logger after the training completed / terminated
@@ -196,7 +196,8 @@ def _():
196196
# where is my best and last checkpoint ?
197197
# -----------------------------------------
198198

199-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
199+
if best_model_handler is not None:
200+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
200201

201202

202203
def main():

tests/generate.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,49 @@ def generate():
1515
sys.path.append(f"./templates/{p.stem}")
1616
dist_dir = "./tests/dist"
1717
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
18+
configs["setup_timer"] = True
19+
code_gen = CodeGenerator(dist_dir=dist_dir)
20+
[*code_gen.render_templates(p.stem, configs)]
21+
code_gen.make_and_write(p.stem, Path(dist_dir))
22+
shutil.copy(p / "_test_internal.py", f"{dist_dir}/{p.stem}")
23+
24+
25+
def generate_for_dist_launch():
26+
sys.path.append("./app")
27+
28+
from codegen import CodeGenerator
29+
from utils import import_from_file
30+
31+
for p in Path("./templates").iterdir():
32+
if p.is_dir() and not p.stem.startswith("_"):
33+
sys.path.append(f"./templates/{p.stem}")
34+
dist_dir = "./tests/dist/launch"
35+
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
36+
configs["use_distributed_training"] = True
37+
configs["use_distributed_launcher"] = True
38+
configs["setup_timer"] = True
39+
configs["nnodes"] = 1
40+
code_gen = CodeGenerator(dist_dir=dist_dir)
41+
[*code_gen.render_templates(p.stem, configs)]
42+
code_gen.make_and_write(p.stem, Path(dist_dir))
43+
shutil.copy(p / "_test_internal.py", f"{dist_dir}/{p.stem}")
44+
45+
46+
def generate_for_dist_spawn():
47+
sys.path.append("./app")
48+
49+
from codegen import CodeGenerator
50+
from utils import import_from_file
51+
52+
for p in Path("./templates").iterdir():
53+
if p.is_dir() and not p.stem.startswith("_"):
54+
sys.path.append(f"./templates/{p.stem}")
55+
dist_dir = "./tests/dist/spawn"
56+
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
57+
configs["use_distributed_training"] = True
58+
configs["use_distributed_launcher"] = False
59+
configs["setup_timer"] = True
60+
configs["nnodes"] = 1
1861
code_gen = CodeGenerator(dist_dir=dist_dir)
1962
[*code_gen.render_templates(p.stem, configs)]
2063
code_gen.make_and_write(p.stem, Path(dist_dir))
@@ -23,3 +66,5 @@ def generate():
2366

2467
if __name__ == "__main__":
2568
generate()
69+
generate_for_dist_launch()
70+
generate_for_dist_spawn()

0 commit comments

Comments (0)