
Commit 27bbc70

Add back old ModelEma and rename new one to ModelEmaV2 to avoid compat breaks in dependent code. Shuffle train script, add a few comments, remove DataParallel support, support experimental torchscript training.
1 parent 9214ca0 commit 27bbc70


4 files changed: +153 -76 lines changed


timm/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,5 +6,5 @@
 from .metrics import AverageMeter, accuracy
 from .misc import natural_key, add_bool_arg
 from .model import unwrap_model, get_state_dict
-from .model_ema import ModelEma
+from .model_ema import ModelEma, ModelEmaV2
 from .summary import update_summary, get_outdir

timm/utils/model.py

Lines changed: 4 additions & 1 deletion
@@ -6,7 +6,10 @@
 
 
 def unwrap_model(model):
-    return model.module if hasattr(model, 'module') else model
+    if isinstance(model, ModelEma):
+        return unwrap_model(model.ema)
+    else:
+        return model.module if hasattr(model, 'module') else model
 
 
 def get_state_dict(model, unwrap_fn=unwrap_model):

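For context, a minimal sketch (not part of this commit) of how the updated unwrap_model and get_state_dict behave with the two EMA wrappers; the toy nn.Linear model and checkpoint path are illustrative only.

```python
import torch
import torch.nn as nn

from timm.utils import ModelEma, ModelEmaV2, get_state_dict, unwrap_model

model = nn.Linear(8, 2)
ema_v1 = ModelEma(model)     # old wrapper: EMA copy lives in .ema
ema_v2 = ModelEmaV2(model)   # new wrapper: EMA copy lives in .module

# unwrap_model now recurses into ModelEma.ema; ModelEmaV2 is still covered by
# the existing hasattr(model, 'module') branch, so both yield a plain nn.Module.
assert unwrap_model(ema_v1) is ema_v1.ema
assert unwrap_model(ema_v2) is ema_v2.module

# get_state_dict applies unwrap_model first, so saved keys carry no wrapper prefix.
torch.save({'state_dict_ema': get_state_dict(ema_v2)}, 'checkpoint_ema.pth')
```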
timm/utils/model_ema.py

Lines changed: 79 additions & 7 deletions
@@ -2,15 +2,89 @@
 
 Hacked together by / Copyright 2020 Ross Wightman
 """
+import logging
+from collections import OrderedDict
 from copy import deepcopy
 
 import torch
 import torch.nn as nn
 
+_logger = logging.getLogger(__name__)
+
+
+class ModelEma:
+    """ Model Exponential Moving Average (DEPRECATED)
+
+    Keep a moving average of everything in the model state_dict (parameters and buffers).
+    This version is deprecated, it does not work with scripted models. Will be removed eventually.
+
+    This is intended to allow functionality like
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+
+    A smoothed version of the weights is necessary for some training schemes to perform well.
+    E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
+    RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
+    smoothing of weights to match results. Pay attention to the decay constant you are using
+    relative to your update count per epoch.
+
+    To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
+    disable validation of the EMA weights. Validation will have to be done manually in a separate
+    process, or after the training stops converging.
+
+    This class is sensitive where it is initialized in the sequence of model init,
+    GPU assignment and distributed training wrappers.
+    """
+    def __init__(self, model, decay=0.9999, device='', resume=''):
+        # make a copy of the model for accumulating moving average of weights
+        self.ema = deepcopy(model)
+        self.ema.eval()
+        self.decay = decay
+        self.device = device  # perform ema on different device from model if set
+        if device:
+            self.ema.to(device=device)
+        self.ema_has_module = hasattr(self.ema, 'module')
+        if resume:
+            self._load_checkpoint(resume)
+        for p in self.ema.parameters():
+            p.requires_grad_(False)
+
+    def _load_checkpoint(self, checkpoint_path):
+        checkpoint = torch.load(checkpoint_path, map_location='cpu')
+        assert isinstance(checkpoint, dict)
+        if 'state_dict_ema' in checkpoint:
+            new_state_dict = OrderedDict()
+            for k, v in checkpoint['state_dict_ema'].items():
+                # ema model may have been wrapped by DataParallel, and need module prefix
+                if self.ema_has_module:
+                    name = 'module.' + k if not k.startswith('module') else k
+                else:
+                    name = k
+                new_state_dict[name] = v
+            self.ema.load_state_dict(new_state_dict)
+            _logger.info("Loaded state_dict_ema")
+        else:
+            _logger.warning("Failed to find state_dict_ema, starting from loaded model weights")
+
+    def update(self, model):
+        # correct a mismatch in state dict keys
+        needs_module = hasattr(model, 'module') and not self.ema_has_module
+        with torch.no_grad():
+            msd = model.state_dict()
+            for k, ema_v in self.ema.state_dict().items():
+                if needs_module:
+                    k = 'module.' + k
+                model_v = msd[k].detach()
+                if self.device:
+                    model_v = model_v.to(device=self.device)
+                ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v)
+
+
+class ModelEmaV2(nn.Module):
+    """ Model Exponential Moving Average V2
 
-class ModelEma(nn.Module):
-    """ Model Exponential Moving Average
     Keep a moving average of everything in the model state_dict (parameters and buffers).
+    V2 of this module is simpler, it does not match params/buffers based on name but simply
+    iterates in order. It works with torchscript (JIT of full model).
 
     This is intended to allow functionality like
     https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
@@ -27,22 +101,20 @@ class ModelEma(nn.Module):
 
     This class is sensitive where it is initialized in the sequence of model init,
     GPU assignment and distributed training wrappers.
-    I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
     """
     def __init__(self, model, decay=0.9999, device=None):
-        super(ModelEma, self).__init__()
+        super(ModelEmaV2, self).__init__()
         # make a copy of the model for accumulating moving average of weights
         self.module = deepcopy(model)
         self.module.eval()
         self.decay = decay
         self.device = device  # perform ema on different device from model if set
-        if device is not None:
+        if self.device is not None:
             self.module.to(device=device)
 
     def update(self, model):
         with torch.no_grad():
             for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()):
-                assert ema_v.shape == model_v.shape
-                if self.device:
+                if self.device is not None:
                     model_v = model_v.to(device=self.device)
                 ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v)

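For reference, a minimal sketch (not from this commit) of how the new ModelEmaV2 wrapper is typically wired into a training loop; the toy model, random data, and hyper-parameters are illustrative only, and the repo's train.py handles the real setup.

```python
import torch
import torch.nn as nn

from timm.utils import ModelEmaV2

model = nn.Linear(8, 2)
# Optionally keep the EMA copy on another device (e.g. device='cpu') to save GPU memory.
model_ema = ModelEmaV2(model, decay=0.9999, device=None)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for step in range(100):
    x = torch.randn(16, 8)
    y = torch.randint(0, 2, (16,))
    loss = nn.functional.cross_entropy(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Update EMA weights after each optimizer step:
    # ema = decay * ema + (1 - decay) * current
    model_ema.update(model)

# Evaluate or checkpoint the smoothed weights via model_ema.module.
```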