
Commit 4a23de8

vfdev-5 and ydcjeff authored
Nits and fixes: more options for distributed configuration and readme/code filtering (#71)
* More options for distributed configuration and readme/code filtering
  - removed node_rank as unused
  - other nits
* Added persistent_workers option
* refactor: rm node_rank, update READMEs
* fix: corrects for distributed trainings
* fix: uncomment for tmpdir
* fix: test
* fix: broadcast output_dir on rank 0
* Apply suggestions from code review

Co-authored-by: ydcjeff <ydcjeff@outlook.com>
Co-authored-by: ydcjeff <ydcjeff@gmail.com>
1 parent 3146ffd commit 4a23de8

15 files changed (+234 −166 lines)


.github/run_test.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -xu
+set -xeuo pipefail
 
 if [ $1 == "generate" ]; then
     python ./tests/generate.py
```

app/utils.py

Lines changed: 4 additions & 7 deletions

```diff
@@ -1,20 +1,17 @@
 """Utilities module.
 """
-import importlib.util
+from importlib.machinery import SourceFileLoader
 
 
-# copied from
-# https://github.com/jrieke/traingenerator/blob/76e637975989d11c549c17694c5603a409e184dd/app/utils.py#L14-L29
 def import_from_file(module_name: str, filepath: str):
     """Imports a module from file.
+
     Args:
         module_name (str): Assigned to the module's __name__ parameter (does not
             influence how the module is named outside of this function)
         filepath (str): Path to the .py file
+
     Returns:
         The module
     """
-    spec = importlib.util.spec_from_file_location(module_name, filepath)
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
+    return SourceFileLoader(module_name, filepath).load_module()
```
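
For context, a minimal usage sketch of the simplified helper. This is illustrative only: the import path assumes `app` is importable as a package, and the file path and `main` attribute are hypothetical placeholders, not part of the repo.

```python
# Illustrative usage of import_from_file (path and attribute are placeholders).
from app.utils import import_from_file

module = import_from_file("generated_main", "/tmp/generated/main.py")
print(module.__name__)  # -> "generated_main"
module.main()           # hypothetical entry point in the loaded script
```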

templates/_base/_argparse.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -36,16 +36,16 @@
         this is recommended to be set to the number of GPUs in your system
         so that each process can be bound to a single GPU ({{ nproc_per_node }})""",
     },
-    "nnodes": {
-        "default": {{nnodes}},
-        "type": int,
-        "help": "number of nodes to use for distributed training ({{ nnodes }})",
-    },
     "node_rank": {
         "default": {{node_rank}},
         "type": int,
         "help": "rank of the node for multi-node distributed training ({{ node_rank }})",
     },
+    "nnodes": {
+        "default": {{nnodes}},
+        "type": int,
+        "help": "number of nodes to use for distributed training ({{ nnodes }})",
+    },
     "master_addr": {
         "default": {{master_addr}},
         "type": str,
```

templates/_base/_sidebar.py

Lines changed: 11 additions & 8 deletions

```diff
@@ -25,9 +25,18 @@ def default_none_options(config):
 
 def distributed_options(config):
     st.markdown("## Distributed Training Options")
-    if st.checkbox("Use distributed training"):
+    config["use_distributed_training"] = st.checkbox("Use distributed training", value=False)
+    if config["use_distributed_training"]:
+
+        executor = st.selectbox(
+            "Executor", options=("Use torch.distributed.launch", "main.py spawns children processes")
+        )
+        config["use_distributed_launcher"] = executor == "Use torch.distributed.launch"
+
         config["nproc_per_node"] = st.number_input(
-            "Number of processes to launch on each node (nproc_per_node)", min_value=1
+            "Number of processes to launch on each node (nproc_per_node)",
+            min_value=1,
+            value=2,
         )
         config["nnodes"] = st.number_input("Number of nodes to use for distributed training (nnodes)", min_value=1)
         if config["nnodes"] > 1:
@@ -36,12 +45,6 @@ def distributed_options(config):
                 " namely 'gloo' and 'nccl' backends. For other backends,"
                 " please specify spawn_kwargs in main.py"
             )
-            config["node_rank"] = st.number_input(
-                "Rank of the node for multi-node distributed training (node_rank)",
-                min_value=0,
-            )
-            if config["node_rank"] > (config["nnodes"] - 1):
-                st.error(f"node_rank should be between 0 and {config['nnodes'] - 1}")
         config["master_addr"] = st.text_input(
             "Master node TCP/IP address for torch native backends (master_addr)",
             value="'127.0.0.1'",
```

templates/gan/README.md

Lines changed: 30 additions & 30 deletions

````diff
@@ -11,7 +11,6 @@ Table of Contents
 
 - [Getting Started](#getting-started)
 - [Training](#training)
-- [PyTorch Hub](#pytorch-hub)
 - [Configurations](#configurations)
 
 </details>
@@ -29,15 +28,12 @@ gan
 ├── config.py
 ├── datasets.py
 ├── handlers.py
-├── hubconf.py
 ├── main.py
 ├── models.py
 ├── requirements.txt
 ├── test_all.py
 ├── trainers.py
 └── utils.py
-
-0 directories, 11 files
 ```
 
 </details>
@@ -57,75 +53,79 @@ gan
 
 ## Training
 
+{% if not use_distributed_training %}
+
 ### Single Node, Single GPU
 
 ```sh
 python main.py --verbose
 ```
 
+{% else %}
+{% if nnodes < 2 %}
+
 ### Single Node, Multiple GPUs
 
+{% if use_distributed_launcher %}
+
 - Using `torch.distributed.launch` (recommended)
 
 ```sh
 python -m torch.distributed.launch \
-  --nproc_per_node=2 \
+  --nproc_per_node={{nproc_per_node}} \
   --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
 ```
 
+{% else %}
+
 - Using function spawn inside the code
 
 ```sh
 python main.py \
   --backend="nccl" \
-  --nproc_per_node=2 \
-  --verbose \
+  --nproc_per_node={{nproc_per_node}} \
+  --verbose
 ```
 
+{% endif %}
+{% else %}
+
 ### Multiple Nodes, Multiple GPUs
 
-Let's start training on two nodes with 2 gpus each. We assuming that master node can be connected as master, e.g. ping master.
+Let's start training on {{nnodes}} nodes with {{nproc_per_node}} gpus each:
 
 - Execute on master node
 
 ```sh
 python -m torch.distributed.launch \
-  --nnodes=2 \
-  --nproc_per_node=2 \
+  --nnodes={{nnodes}} \
+  --nproc_per_node={{nproc_per_node}} \
   --node_rank=0 \
-  --master_addr=master \
-  --master_port=2222 \
+  --master_addr={{master_addr}} \
+  --master_port={{master_port}} \
  --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
```
 
 - Execute on worker node
 
 ```sh
 python -m torch.distributed.launch \
-  --nnodes=2 \
-  --nproc_per_node=2 \
-  --node_rank=1 \
-  --master_addr=master \
-  --master_port=2222 \
+  --nnodes={{nnodes}} \
+  --nproc_per_node={{nproc_per_node}} \
+  --node_rank=<node_rank> \
+  --master_addr={{master_addr}} \
+  --master_port={{master_port}} \
   --use_env main.py \
   --backend="nccl" \
-  --verbose \
+  --verbose
 ```
 
-### Colab 8 TPUs
-
-```sh
-python main.py --verbose --backend='xla-tpu' --nproc_per_node=8
-```
-
-## PyTorch Hub
-
-- Edit `hubconf.py` to use the custom model easily via `torch.hub.load()`.
-- Add additional requirements inside `dependencies` list in `hubconf.py`.
+{% endif %}
+{% endif %}
 
 ## Configurations
 
````

templates/gan/main.py

Lines changed: 27 additions & 15 deletions

```diff
@@ -16,8 +16,7 @@
 
 from datasets import get_datasets
 from trainers import create_trainers
-from handlers import get_handlers, get_logger
-from utils import setup_logging, log_metrics, log_basic_info, initialize, resume_from
+from utils import setup_logging, log_metrics, log_basic_info, initialize, resume_from, get_handlers, get_logger
 from config import get_default_parser
 
 
@@ -36,6 +35,19 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     rank = idist.get_rank()
     manual_seed(config.seed + rank)
 
+    # -----------------------
+    # create output folder
+    # -----------------------
+
+    if rank == 0:
+        now = datetime.now().strftime("%Y%m%d-%H%M%S")
+        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
+        path = Path(config.output_dir, name)
+        path.mkdir(parents=True, exist_ok=True)
+        config.output_dir = path.as_posix()
+
+    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))
+
     # -----------------------------
     # datasets and dataloaders
     # -----------------------------
@@ -45,7 +57,10 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     train_dataloader = idist.auto_dataloader(
         train_dataset,
         batch_size=config.batch_size,
-        num_workers=config.num_workers
+        num_workers=config.num_workers,
+        {% if use_distributed_training and not use_distributed_launcher %}
+        persistent_workers=True,
+        {% endif %}
     )
 
     # ------------------------------------------
@@ -58,9 +73,10 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     # -----------------------------
     # train_engine and eval_engine
     # -----------------------------
-    real_labels = torch.ones(config.batch_size, device=device)
-    fake_labels = torch.zeros(config.batch_size, device=device)
-    fixed_noise = torch.randn(config.batch_size, config.z_dim, 1, 1, device=device)
+    ws = idist.get_world_size()
+    real_labels = torch.ones(config.batch_size // ws, device=device)
+    fake_labels = torch.zeros(config.batch_size // ws, device=device)
+    fixed_noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)
 
     train_engine = create_trainers(
         config=config,
@@ -75,7 +91,6 @@ def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
     )
 
     # -------------------------------------------
-    # update config with optimizer parameters
     # setup engines logger with python logging
     # print training configurations
     # -------------------------------------------
@@ -203,20 +218,17 @@ def main():
     parser = ArgumentParser(parents=[get_default_parser()])
     config = parser.parse_args()
 
-    if config.output_dir:
-        now = datetime.now().strftime("%Y%m%d-%H%M%S")
-        name = f'{config.dataset}-backend-{idist.backend()}-{now}'
-        path = Path(config.output_dir, name)
-        path.mkdir(parents=True, exist_ok=True)
-        config.output_dir = path
-
     with idist.Parallel(
         backend=config.backend,
+        {% if use_distributed_training and not use_distributed_launcher %}
        nproc_per_node=config.nproc_per_node,
-        nnodes=config.nnodes,
+        {% if nnodes > 1 and not use_distributed_launcher%}
         node_rank=config.node_rank,
+        nnodes=config.nnodes,
         master_addr=config.master_addr,
         master_port=config.master_port,
+        {% endif %}
+        {% endif %}
     ) as parallel:
         parallel.run(run, config=config)
```
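
Distilled from the change above, the rank-0/broadcast pattern in isolation: only the process with rank 0 creates the run directory, and `idist.broadcast` then hands the resulting path to every other process so all ranks write to the same place. This is a sketch with the folder name simplified, not the generated file itself.

```python
# Sketch of the output-dir pattern introduced above (folder name simplified).
# idist.broadcast returns rank 0's value on every participating process.
from pathlib import Path

import ignite.distributed as idist


def setup_output_dir(config):
    if idist.get_rank() == 0:
        path = Path(config.output_dir, "run")  # real code uses dataset/backend/timestamp
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()
    # every rank receives the directory created by rank 0
    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))
```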

templates/gan/test_all.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -31,7 +31,7 @@ def test_get_datasets(tmp_path):
     assert isinstance(batch, Iterable)
     assert isinstance(batch[0], Tensor)
     assert isinstance(batch[1], Number)
-    assert batch[0].dim == 3
+    assert batch[0].ndim == 3
 
 
 def test_models():
```
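
The reason behind this one-character-looking fix: `Tensor.dim` without parentheses is a bound method, so comparing it to an integer is always False and the old assertion could never pass; `ndim` is the integer attribute (and `dim()` the equivalent call). A small self-contained check:

```python
# Why the assertion changed: Tensor.dim is a method object, not an int.
import torch

t = torch.zeros(3, 32, 32)
print(t.dim == 3)    # False: a bound method never equals an int
print(t.dim() == 3)  # True
print(t.ndim == 3)   # True
```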

templates/gan/trainers.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -4,6 +4,7 @@
 from typing import Any
 
 import torch
+import ignite.distributed as idist
 from ignite.engine import Engine
 from torch.cuda.amp import autocast
 from torch.optim.optimizer import Optimizer
@@ -86,7 +87,8 @@ def train_function(
     errD_real.backward()
 
     # get fake image from generator
-    noise = torch.randn(config.batch_size, config.z_dim, 1, 1, device=device)
+    ws = idist.get_world_size()
+    noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)
     fake = netG(noise)
 
     # train with fake
@@ -125,7 +127,7 @@ def create_trainers(**kwargs) -> Engine:
 
     Parameters
     ----------
-    kwargs: keyword arguments passed to both train_function and evaluate_function
+    kwargs: keyword arguments passed to both train_function
 
     Returns
     -------
```