Skip to content

Commit 47c3aae

Browse files
authored
test: run torch.distributed.launch and spawn tests (#74)
test: run torch.distributed.launch and spawn tests (#74)
* test: run torch.distributed.launch and spawn tests * fix: exclude launch and spawn dir in test script * fix: exclude launch and spawn dir in test script * fix: config.model instead of config.dataset * fix: correct exclude in test script * fix: forgot to call --use_env in test script * fix: add backend gloo * fix: increase num_workers for persistent_workers * fix: save logs only on rank zero
1 parent 7a29e98 commit 47c3aae

File tree

10 files changed

+112
-18
lines changed

10 files changed

+112
-18
lines changed

.github/run_test.sh

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,44 @@ set -xeuo pipefail
55
if [ $1 == "generate" ]; then
66
python ./tests/generate.py
77
elif [ $1 == "unittest" ]; then
8-
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1)
8+
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1 -not -path "./tests/dist/launch" -not -path "./tests/dist/spawn")
99
do
1010
cd $dir
1111
pip install -r requirements.txt --progress-bar off -q
1212
cd ../../../
1313
done
14-
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1)
14+
for dir in $(find ./tests/dist -type d -mindepth 1 -maxdepth 1 -not -path "./tests/dist/launch" -not -path "./tests/dist/spawn")
1515
do
1616
cd $dir
1717
pytest
1818
cd ../../../
1919
done
20-
elif [ $1 == "integration" ]; then
21-
for file in $(find ./tests/integration -iname "*.sh")
20+
elif [ $1 == "default" ]; then
21+
for file in $(find ./tests/dist -iname "main.py" -not -path "./tests/dist/launch/*" -not -path "./tests/dist/spawn/*" -not -path "./tests/dist/single/*")
2222
do
23-
bash $file
23+
python $file --verbose --log_every_iters 2 --num_workers 1 --epoch_length 10
24+
done
25+
elif [ $1 == "launch" ]; then
26+
for file in $(find ./tests/dist/launch -iname "main.py" -not -path "./tests/dist/launch/single/*")
27+
do
28+
python -m torch.distributed.launch \
29+
--nproc_per_node 2 \
30+
--use_env $file \
31+
--verbose \
32+
--backend gloo \
33+
--num_workers 1 \
34+
--epoch_length 10 \
35+
--log_every_iters 2
36+
done
37+
elif [ $1 == "spawn" ]; then
38+
for file in $(find ./tests/dist/spawn -iname "main.py" -not -path "./tests/dist/spawn/single/*")
39+
do
40+
python $file \
41+
--verbose \
42+
--backend gloo \
43+
--num_workers 1 \
44+
--epoch_length 10 \
45+
--nproc_per_node 2 \
46+
--log_every_iters 2
2447
done
2548
fi

.github/workflows/ci.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ jobs:
4343
- run: python -m torch.utils.collect_env
4444
- run: bash .github/run_test.sh generate
4545
- run: bash .github/run_test.sh unittest
46-
# - run: bash .github/run_test.sh integration
46+
- run: bash .github/run_test.sh default
47+
- run: bash .github/run_test.sh spawn
48+
- run: bash .github/run_test.sh launch
49+
env:
50+
OMP_NUM_THREADS: 1
4751

4852
lint:
4953
runs-on: ubuntu-latest

templates/_base/_argparse.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,11 @@
5656
"type": int,
5757
"help": "master node port for torch native backends ({{ master_port }})",
5858
},
59-
59+
"epoch_length": {
60+
"default": None,
61+
"type": int,
62+
"help": "epoch_length of Engine.run()"
63+
},
6064
# ignite handlers options
6165
"save_every_iters": {
6266
"default": {{save_every_iters}},

templates/_base/_sidebar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def _setup_common_training_handlers_options(config):
119119
)
120120
config["with_pbars"] = st.checkbox(
121121
"Show two progress bars (with_pbars)",
122-
value=True,
122+
value=False,
123123
help=(
124124
"This option will enable two progress bars - one for epoch,"
125125
" one for iteration if `with_pbar_on_iters` is `False`,"

templates/gan/_sidebar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def dataset_options(config):
2424

2525
def dataloader_options(config):
2626
st.markdown("## DataLoader Options")
27-
config["batch_size"] = st.number_input("Train batch size (batch_size)", min_value=1, value=4)
27+
config["batch_size"] = st.number_input("Train batch size (batch_size)", min_value=1, value=16)
2828
config["num_workers"] = st.number_input("Number of workers (num_workers)", min_value=0, value=2)
2929
st.markdown("---")
3030

templates/gan/main.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,21 @@ def print_times(engine):
156156
logger.info(f"Epoch {engine.state.epoch} done. Time per batch: {timer_handler.value():.3f}[s]")
157157
timer_handler.reset()
158158

159+
@train_engine.on(Events.ITERATION_COMPLETED(every=config.log_every_iters))
160+
@idist.one_rank_only()
161+
def print_logs(engine):
162+
fname = config.output_dir / LOGS_FNAME
163+
columns = ["iteration", ] + list(engine.state.metrics.keys())
164+
values = [str(engine.state.iteration), ] + [str(round(value, 5)) for value in engine.state.metrics.values()]
165+
166+
with open(fname, "a") as f:
167+
if f.tell() == 0:
168+
print("\t".join(columns), file=f)
169+
print("\t".join(values), file=f)
170+
message = f"[{engine.state.epoch}/{config.max_epochs}][{engine.state.iteration % len(train_dataloader)}/{len(train_dataloader)}]"
171+
for name, value in zip(columns, values):
172+
message += f" | {name}: {value}"
173+
159174
# -------------------------------------------------------------
160175
# adding handlers using `trainer.on` decorator API
161176
# -------------------------------------------------------------
@@ -193,7 +208,7 @@ def create_plots(engine):
193208
# setup if done. let's run the training
194209
# ------------------------------------------
195210

196-
train_engine.run(train_dataloader, max_epochs=config.max_epochs)
211+
train_engine.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.epoch_length)
197212

198213
# ------------------------------------------------------------
199214
# close the logger after the training completed / terminated
@@ -211,7 +226,8 @@ def create_plots(engine):
211226
# where is my best and last checkpoint ?
212227
# -----------------------------------------
213228

214-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
229+
if best_model_handler is not None:
230+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
215231

216232

217233
def main():

templates/image_classification/_sidebar.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ def dataset_options(config):
2020

2121
def dataloader_options(config):
2222
st.markdown("## DataLoader Options")
23-
config["train_batch_size"] = st.number_input("Train batch size (train_batch_size)", min_value=1, value=4)
24-
config["eval_batch_size"] = st.number_input("Eval batch size (eval_batch_size)", min_value=1, value=8)
23+
config["train_batch_size"] = st.number_input("Train batch size (train_batch_size)", min_value=1, value=16)
24+
config["eval_batch_size"] = st.number_input("Eval batch size (eval_batch_size)", min_value=1, value=16)
2525
config["num_workers"] = st.number_input("Number of workers (num_workers)", min_value=0, value=2)
2626
st.markdown("---")
2727

templates/image_classification/main.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
3333

3434
if rank == 0:
3535
now = datetime.now().strftime("%Y%m%d-%H%M%S")
36-
name = f"{config.dataset}-backend-{idist.backend()}-{now}"
36+
name = f"{config.model}-backend-{idist.backend()}-{now}"
3737
path = Path(config.output_dir, name)
3838
path.mkdir(parents=True, exist_ok=True)
3939
config.output_dir = path.as_posix()
@@ -199,7 +199,7 @@ def _():
199199
# setup if done. let's run the training
200200
# ------------------------------------------
201201

202-
train_engine.run(train_dataloader, max_epochs=config.max_epochs)
202+
train_engine.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.epoch_length)
203203

204204
# ------------------------------------------------------------
205205
# close the logger after the training completed / terminated
@@ -217,7 +217,8 @@ def _():
217217
# where is my best and last checkpoint ?
218218
# -----------------------------------------
219219

220-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
220+
if best_model_handler is not None:
221+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
221222

222223

223224
def main():

templates/single/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def _():
178178
# ------------------------------------------
179179
# TODO : PLEASE provide `max_epochs` parameters
180180

181-
train_engine.run(train_dataloader)
181+
train_engine.run(train_dataloader, epoch_length=config.epoch_length)
182182

183183
# ------------------------------------------------------------
184184
# close the logger after the training completed / terminated
@@ -196,7 +196,8 @@ def _():
196196
# where is my best and last checkpoint ?
197197
# -----------------------------------------
198198

199-
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
199+
if best_model_handler is not None:
200+
logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
200201

201202

202203
def main():

tests/generate.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,49 @@ def generate():
1515
sys.path.append(f"./templates/{p.stem}")
1616
dist_dir = "./tests/dist"
1717
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
18+
configs["setup_timer"] = True
19+
code_gen = CodeGenerator(dist_dir=dist_dir)
20+
[*code_gen.render_templates(p.stem, configs)]
21+
code_gen.make_and_write(p.stem, Path(dist_dir))
22+
shutil.copy(p / "_test_internal.py", f"{dist_dir}/{p.stem}")
23+
24+
25+
def generate_for_dist_launch():
26+
sys.path.append("./app")
27+
28+
from codegen import CodeGenerator
29+
from utils import import_from_file
30+
31+
for p in Path("./templates").iterdir():
32+
if p.is_dir() and not p.stem.startswith("_"):
33+
sys.path.append(f"./templates/{p.stem}")
34+
dist_dir = "./tests/dist/launch"
35+
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
36+
configs["use_distributed_training"] = True
37+
configs["use_distributed_launcher"] = True
38+
configs["setup_timer"] = True
39+
configs["nnodes"] = 1
40+
code_gen = CodeGenerator(dist_dir=dist_dir)
41+
[*code_gen.render_templates(p.stem, configs)]
42+
code_gen.make_and_write(p.stem, Path(dist_dir))
43+
shutil.copy(p / "_test_internal.py", f"{dist_dir}/{p.stem}")
44+
45+
46+
def generate_for_dist_spawn():
47+
sys.path.append("./app")
48+
49+
from codegen import CodeGenerator
50+
from utils import import_from_file
51+
52+
for p in Path("./templates").iterdir():
53+
if p.is_dir() and not p.stem.startswith("_"):
54+
sys.path.append(f"./templates/{p.stem}")
55+
dist_dir = "./tests/dist/spawn"
56+
configs = import_from_file("template_config", f"./templates/{p.stem}/_sidebar.py").get_configs()
57+
configs["use_distributed_training"] = True
58+
configs["use_distributed_launcher"] = False
59+
configs["setup_timer"] = True
60+
configs["nnodes"] = 1
1861
code_gen = CodeGenerator(dist_dir=dist_dir)
1962
[*code_gen.render_templates(p.stem, configs)]
2063
code_gen.make_and_write(p.stem, Path(dist_dir))
@@ -23,3 +66,5 @@ def generate():
2366

2467
if __name__ == "__main__":
2568
generate()
69+
generate_for_dist_launch()
70+
generate_for_dist_spawn()

0 commit comments

Comments (0)