Commit 9a25fdf

Merge pull request #297 from rwightman/ema_simplify
Simplified JIT compatible Ema module. Fixes for SiLU export and torchscript training w/ Linear layer.
2 parents c9ebe86 + cd72e66 commit 9a25fdf

15 files changed: +225 additions, -107 deletions

tests/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def test_model_load_pretrained(model_name, batch_size):
     create_model(model_name, pretrained=True, in_chans=in_chans)
 
 @pytest.mark.timeout(120)
-@pytest.mark.parametrize('model_name', list_models(pretrained=True))
+@pytest.mark.parametrize('model_name', list_models(pretrained=True, exclude_filters=['vit_*']))
 @pytest.mark.parametrize('batch_size', [1])
 def test_model_features_pretrained(model_name, batch_size):
     """Create that pretrained weights load when features_only==True."""

timm/models/efficientnet.py

Lines changed: 9 additions & 7 deletions
@@ -34,7 +34,7 @@
 from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
 from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights
 from .features import FeatureInfo, FeatureHooks
-from .helpers import build_model_with_cfg
+from .helpers import build_model_with_cfg, default_cfg_for_features
 from .layers import create_conv2d, create_classifier
 from .registry import register_model
 

@@ -453,18 +453,20 @@ def forward(self, x) -> List[torch.Tensor]:
 
 
 def _create_effnet(model_kwargs, variant, pretrained=False):
+    features_only = False
+    model_cls = EfficientNet
     if model_kwargs.pop('features_only', False):
-        load_strict = False
+        features_only = True
         model_kwargs.pop('num_classes', 0)
         model_kwargs.pop('num_features', 0)
         model_kwargs.pop('head_conv', None)
         model_cls = EfficientNetFeatures
-    else:
-        load_strict = True
-        model_cls = EfficientNet
-    return build_model_with_cfg(
+    model = build_model_with_cfg(
         model_cls, variant, pretrained, default_cfg=default_cfgs[variant],
-        pretrained_strict=load_strict, **model_kwargs)
+        pretrained_strict=not features_only, **model_kwargs)
+    if features_only:
+        model.default_cfg = default_cfg_for_features(model.default_cfg)
+    return model
 
 
 def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
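The user-visible effect of this refactor is sketched below (illustrative only; the model name and input size are arbitrary examples, not taken from the diff): a features_only model is built as EfficientNetFeatures, pretrained weights load non-strictly because the classifier head is absent, and its default_cfg is trimmed by default_cfg_for_features.

    import torch
    import timm

    # build a feature-extraction backbone; classifier keys in the checkpoint
    # have no counterpart here, hence pretrained_strict=not features_only above
    m = timm.create_model('efficientnet_b0', pretrained=True, features_only=True)
    feats = m(torch.randn(1, 3, 224, 224))   # list of feature maps, one per output stage
    print([f.shape for f in feats])
    print(m.default_cfg)                      # num_classes / crop_pct / classifier removed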

timm/models/helpers.py

Lines changed: 12 additions & 2 deletions
@@ -14,7 +14,7 @@
 import torch.utils.model_zoo as model_zoo
 
 from .features import FeatureListNet, FeatureDictNet, FeatureHookNet
-from .layers import Conv2dSame
+from .layers import Conv2dSame, Linear
 
 
 _logger = logging.getLogger(__name__)

@@ -234,7 +234,7 @@ def adapt_model_from_string(parent_module, model_string):
         if isinstance(old_module, nn.Linear):
             # FIXME extra checks to ensure this is actually the FC classifier layer and not a diff Linear layer?
             num_features = state_dict[n + '.weight'][1]
-            new_fc = nn.Linear(
+            new_fc = Linear(
                 in_features=num_features, out_features=old_module.out_features, bias=old_module.bias is not None)
             set_layer(new_module, n, new_fc)
             if hasattr(new_module, 'num_features'):

@@ -251,6 +251,15 @@ def adapt_model_from_file(parent_module, model_variant):
         return adapt_model_from_string(parent_module, f.read().strip())
 
 
+def default_cfg_for_features(default_cfg):
+    default_cfg = deepcopy(default_cfg)
+    # remove default pretrained cfg fields that don't have much relevance for feature backbone
+    to_remove = ('num_classes', 'crop_pct', 'classifier')  # add default final pool size?
+    for tr in to_remove:
+        default_cfg.pop(tr, None)
+    return default_cfg
+
+
 def build_model_with_cfg(
         model_cls: Callable,
         variant: str,

@@ -296,5 +305,6 @@ def build_model_with_cfg(
         else:
             assert False, f'Unknown feature class {feature_cls}'
         model = feature_cls(model, **feature_cfg)
+        model.default_cfg = default_cfg_for_features(default_cfg)  # add back default_cfg
 
     return model
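A rough illustration of what default_cfg_for_features does (the cfg values below are made up for the example, not taken from the repo):

    cfg = dict(
        url='https://example.com/weights.pth',  # placeholder
        num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
        crop_pct=0.875, interpolation='bicubic', classifier='fc')

    feat_cfg = default_cfg_for_features(cfg)
    # num_classes, crop_pct and classifier are dropped; the rest is kept,
    # and the original dict is untouched thanks to the deepcopy
    assert all(k not in feat_cfg for k in ('num_classes', 'crop_pct', 'classifier'))
    assert cfg['classifier'] == 'fc'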

timm/models/hrnet.py

Lines changed: 8 additions & 6 deletions
@@ -17,7 +17,7 @@
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .features import FeatureInfo
-from .helpers import build_model_with_cfg
+from .helpers import build_model_with_cfg, default_cfg_for_features
 from .layers import create_classifier
 from .registry import register_model
 from .resnet import BasicBlock, Bottleneck  # leveraging ResNet blocks w/ additional features like SE

@@ -773,15 +773,17 @@ def forward(self, x) -> List[torch.tensor]:
 
 def _create_hrnet(variant, pretrained, **model_kwargs):
     model_cls = HighResolutionNet
-    strict = True
+    features_only = False
     if model_kwargs.pop('features_only', False):
         model_cls = HighResolutionNetFeatures
         model_kwargs['num_classes'] = 0
-        strict = False
-
-    return build_model_with_cfg(
+        features_only = True
+    model = build_model_with_cfg(
         model_cls, variant, pretrained, default_cfg=default_cfgs[variant],
-        model_cfg=cfg_cls[variant], pretrained_strict=strict, **model_kwargs)
+        model_cfg=cfg_cls[variant], pretrained_strict=not features_only, **model_kwargs)
+    if features_only:
+        model.default_cfg = default_cfg_for_features(model.default_cfg)
+    return model
 
 
 @register_model

timm/models/inception_v3.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
 from timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 from .helpers import build_model_with_cfg
 from .registry import register_model
-from .layers import trunc_normal_, create_classifier
+from .layers import trunc_normal_, create_classifier, Linear
 
 
 def _cfg(url='', **kwargs):

@@ -250,7 +250,7 @@ def __init__(self, in_channels, num_classes, conv_block=None):
         self.conv0 = conv_block(in_channels, 128, kernel_size=1)
         self.conv1 = conv_block(128, 768, kernel_size=5)
         self.conv1.stddev = 0.01
-        self.fc = nn.Linear(768, num_classes)
+        self.fc = Linear(768, num_classes)
         self.fc.stddev = 0.001
 
     def forward(self, x):

timm/models/layers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 from .evo_norm import EvoNormBatch2d, EvoNormSample2d
 from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple
 from .inplace_abn import InplaceAbn
+from .linear import Linear
 from .mixed_conv2d import MixedConv2d
 from .norm_act import BatchNormAct2d
 from .padding import get_padding

timm/models/layers/activations.py

Lines changed: 24 additions & 0 deletions
@@ -119,3 +119,27 @@ def __init__(self, inplace: bool = False):
 
     def forward(self, x):
         return hard_mish(x, self.inplace)
+
+
+class PReLU(nn.PReLU):
+    """Applies PReLU (w/ dummy inplace arg)
+    """
+    def __init__(self, num_parameters: int = 1, init: float = 0.25, inplace: bool = False) -> None:
+        super(PReLU, self).__init__(num_parameters=num_parameters, init=init)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.prelu(input, self.weight)
+
+
+def gelu(x: torch.Tensor, inplace: bool = False) -> torch.Tensor:
+    return F.gelu(x)
+
+
+class GELU(nn.Module):
+    """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
+    """
+    def __init__(self, inplace: bool = False):
+        super(GELU, self).__init__()
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return F.gelu(input)
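These wrappers exist because timm typically constructs activation layers as act_layer(inplace=True), while nn.PReLU and nn.GELU take no inplace argument. A minimal illustration (assuming the classes are imported directly rather than via get_act_layer):

    import torch
    from timm.models.layers.activations import GELU, PReLU

    act = GELU(inplace=True)          # dummy arg accepted and ignored
    y = act(torch.randn(2, 8))

    # torch.nn.GELU() has no inplace parameter, so nn.GELU(inplace=True)
    # would raise a TypeError in the same spot; likewise for nn.PReLU.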

timm/models/layers/classifier.py

Lines changed: 3 additions & 1 deletion
@@ -6,6 +6,7 @@
 from torch.nn import functional as F
 
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .linear import Linear
 
 
 def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False):

@@ -21,7 +22,8 @@ def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False):
     elif use_conv:
         fc = nn.Conv2d(num_pooled_features, num_classes, 1, bias=True)
     else:
-        fc = nn.Linear(num_pooled_features, num_classes, bias=True)
+        # NOTE: using my Linear wrapper that fixes AMP + torchscript casting issue
+        fc = Linear(num_pooled_features, num_classes, bias=True)
     return global_pool, fc
 
 
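A small usage sketch of create_classifier after this change (the feature and class counts are arbitrary examples): the returned fc is now the timm Linear wrapper rather than nn.Linear, so a scripted head keeps working under AMP.

    import torch
    from timm.models.layers import create_classifier, Linear

    global_pool, fc = create_classifier(num_features=2048, num_classes=1000, pool_type='avg')
    assert isinstance(fc, Linear)            # still an nn.Linear subclass
    x = torch.randn(2, 2048, 7, 7)
    logits = fc(global_pool(x))              # pooled + flattened -> (2, 1000)
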
timm/models/layers/create_act.py

Lines changed: 11 additions & 6 deletions
@@ -19,10 +19,9 @@
     relu6=F.relu6,
     leaky_relu=F.leaky_relu,
     elu=F.elu,
-    prelu=F.prelu,
     celu=F.celu,
     selu=F.selu,
-    gelu=F.gelu,
+    gelu=gelu,
     sigmoid=sigmoid,
     tanh=tanh,
     hard_sigmoid=hard_sigmoid,

@@ -56,10 +55,10 @@
     relu6=nn.ReLU6,
     leaky_relu=nn.LeakyReLU,
     elu=nn.ELU,
-    prelu=nn.PReLU,
+    prelu=PReLU,
     celu=nn.CELU,
     selu=nn.SELU,
-    gelu=nn.GELU,
+    gelu=GELU,
     sigmoid=Sigmoid,
     tanh=Tanh,
     hard_sigmoid=HardSigmoid,

@@ -98,7 +97,10 @@ def get_act_fn(name='relu'):
         # custom autograd, then fallback
         if name in _ACT_FN_ME:
             return _ACT_FN_ME[name]
-    if not is_no_jit():
+    if is_exportable() and name in ('silu', 'swish'):
+        # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+        return swish
+    if not (is_no_jit() or is_exportable()):
         if name in _ACT_FN_JIT:
             return _ACT_FN_JIT[name]
     return _ACT_FN_DEFAULT[name]

@@ -114,7 +116,10 @@ def get_act_layer(name='relu'):
     if not (is_no_jit() or is_exportable() or is_scriptable()):
         if name in _ACT_LAYER_ME:
             return _ACT_LAYER_ME[name]
-    if not is_no_jit():
+    if is_exportable() and name in ('silu', 'swish'):
+        # FIXME PyTorch SiLU doesn't ONNX export, this is a temp hack
+        return Swish
+    if not (is_no_jit() or is_exportable()):
         if name in _ACT_LAYER_JIT:
             return _ACT_LAYER_JIT[name]
     return _ACT_LAYER_DEFAULT[name]
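With export mode enabled, 'silu'/'swish' now resolve to the plain Python Swish/swish implementations rather than nn.SiLU/F.silu, which did not go through ONNX export at the time. A hedged sketch, assuming the set_exportable helper from timm.models.layers.config is available:

    from timm.models.layers import get_act_layer, set_exportable

    act_cls = get_act_layer('silu')           # normal path: memory-efficient / jit / default variant
    set_exportable(True)                      # mark layer config for ONNX export
    export_cls = get_act_layer('silu')        # falls back to the plain Swish module
    act = export_cls(inplace=True)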

timm/models/layers/linear.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+""" Linear layer (alternate definition)
+"""
+import torch
+import torch.nn.functional as F
+from torch import nn as nn
+
+
+class Linear(nn.Linear):
+    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`
+
+    Wraps torch.nn.Linear to support AMP + torchscript usage by manually casting
+    weight & bias to input.dtype to work around an issue w/ torch.addmm in this use case.
+    """
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if torch.jit.is_scripting():
+            bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None
+            return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias)
+        else:
+            return F.linear(input, self.weight, self.bias)
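The issue this works around (the "torchscript training w/ Linear layer" part of the commit message): under torch.cuda.amp autocast, a scripted nn.Linear could fail inside torch.addmm with a Half/Float dtype mismatch when float16 activations reach float32 parameters; the wrapper sidesteps this by casting weight and bias to the input dtype, but only in the scripted path. A minimal, hypothetical repro sketch (assumes a CUDA device and the torch.cuda.amp API of that era):

    import torch
    from timm.models.layers import Linear

    fc = torch.jit.script(Linear(512, 10)).cuda()

    with torch.cuda.amp.autocast():
        # mimic an fp16 activation produced by earlier autocast ops
        x = torch.randn(8, 512, device='cuda').half()
        out = fc(x)   # weight/bias are cast to x.dtype inside the scripted forward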
