Commit 14f370c

Merge pull request #1259 from rwightman/swin_v2
Official SwinV2 models
2 parents d07d015 + 347308f commit 14f370c

File tree: 10 files changed (+1029, -110 lines)

README.md

Lines changed: 12 additions & 0 deletions
@@ -23,6 +23,17 @@ I'm fortunate to be able to dedicate significant time and money of my own suppor
 
 ## What's New
 
+### May 13, 2022
+* Official Swin-V2 models and weights added from (https://github.com/microsoft/Swin-Transformer). Cleaned up to support torchscript.
+* Some refactoring for existing `timm` Swin-V2-CR impl, will likely do a bit more to bring parts closer to official and decide whether to merge some aspects.
+* More Vision Transformer relative position / residual post-norm experiments w/ 512 dim
+  * `vit_relpos_small_patch16_224` - 81.5 @ 224, 82.5 @ 320 -- rel pos, layer scale, no class token, avg pool
+  * `vit_relpos_medium_patch16_rpn_224` - 82.3 @ 224, 83.1 @ 320 -- rel pos + res-post-norm, no class token, avg pool
+  * `vit_relpos_medium_patch16_224` - 82.5 @ 224, 83.3 @ 320 -- rel pos, layer scale, no class token, avg pool
+  * `vit_relpos_base_patch16_gapcls_224` - 82.8 @ 224, 83.9 @ 320 -- rel pos, layer scale, class token, avg pool (by mistake)
+* Bring 512 dim, 8-head 'medium' ViT model variant back to life (after using in a pre DeiT 'small' model for first ViT impl back in 2020)
+* Add ViT relative position support for switching btw existing impl and some additions in official Swin-V2 impl for future trials
+* Sequencer2D impl (https://arxiv.org/abs/2205.01972), added via PR from author (https://github.com/okojoalg)
 
 ### May 2, 2022
 * Vision Transformer experiments adding Relative Position (Swin-V2 log-coord) (`vision_transformer_relpos.py`) and Residual Post-Norm branches (from Swin-V2) (`vision_transformer*.py`)
@@ -390,6 +401,7 @@ A full version of the list below with source links can be found in the [document
 * ReXNet - https://arxiv.org/abs/2007.00992
 * SelecSLS - https://arxiv.org/abs/1907.00837
 * Selective Kernel Networks - https://arxiv.org/abs/1903.06586
+* Sequencer2D - https://arxiv.org/abs/2205.01972
 * Swin S3 (AutoFormerV2) - https://arxiv.org/abs/2111.14725
 * Swin Transformer - https://arxiv.org/abs/2103.14030
 * Swin Transformer V2 - https://arxiv.org/abs/2111.09883
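
The May 13 notes above list the new model names but not how they are used. The sketch below is not part of this commit's diff; it assumes a timm install that includes these changes, uses one of the model names from the list above, and passes pretrained=False so no particular published weights are assumed.

import torch
import timm

# Build one of the ViT relative-position variants named in the notes above.
model = timm.create_model('vit_relpos_medium_patch16_224', pretrained=False)
model.eval()

x = torch.randn(1, 3, 224, 224)   # dummy ImageNet-sized input
with torch.no_grad():
    logits = model(x)             # class logits, shape (1, num_classes)
print(logits.shape)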

tests/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@
 NON_STD_FILTERS = [
     'vit_*', 'tnt_*', 'pit_*', 'swin_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
     'convit_*', 'levit*', 'visformer*', 'deit*', 'jx_nest_*', 'nest_*', 'xcit_*', 'crossvit_*', 'beit_*',
-    'poolformer_*', 'volo_*', 'sequencer2d_*']
+    'poolformer_*', 'volo_*', 'sequencer2d_*', 'swinv2_*']
 NUM_NON_STD = len(NON_STD_FILTERS)
 
 # exclude models that cause specific test failures
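
For context, the entries in NON_STD_FILTERS are shell-style glob patterns used to route models with non-standard interfaces onto a separate test path, so adding 'swinv2_*' places the new SwinV2 models on that path. The sketch below only illustrates the matching semantics; it assumes fnmatch-style globbing and an example model name, and is not the test suite's actual code.

from fnmatch import fnmatch

# Subset of the patterns from the diff above.
NON_STD_FILTERS = ['poolformer_*', 'volo_*', 'sequencer2d_*', 'swinv2_*']

def is_non_std(model_name: str) -> bool:
    # A model is treated as non-standard if any pattern matches its name.
    return any(fnmatch(model_name, pat) for pat in NON_STD_FILTERS)

print(is_non_std('swinv2_tiny_window8_256'))  # True (example SwinV2 name)
print(is_non_std('resnet50'))                 # False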

timm/models/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@
 from .sequencer import *
 from .sknet import *
 from .swin_transformer import *
+from .swin_transformer_v2 import *
 from .swin_transformer_v2_cr import *
 from .tnt import *
 from .tresnet import *
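
The wildcard import added above is what exposes the new SwinV2 entrypoints through timm's model registry. A small sketch, assuming a timm build that contains this commit:

import timm

# Registered model names become discoverable once timm.models imports the module.
swinv2_names = timm.list_models('swinv2_*')
print(len(swinv2_names), swinv2_names[:3])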

timm/models/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -477,7 +477,7 @@ def build_model_with_cfg(
         pretrained_cfg: Optional[Dict] = None,
         model_cfg: Optional[Any] = None,
         feature_cfg: Optional[Dict] = None,
-        pretrained_strict: bool = False,
+        pretrained_strict: bool = True,
         pretrained_filter_fn: Optional[Callable] = None,
         pretrained_custom_load: bool = False,
         kwargs_filter: Optional[Tuple[str]] = None,
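
The only change here flips the default of pretrained_strict from False to True, so pretrained checkpoint loading now fails loudly on key mismatches unless a model opts out. How build_model_with_cfg threads the flag through to its loading helpers is not shown in this diff; the sketch below just illustrates strict vs. non-strict loading at the plain PyTorch level.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
state = {'weight': torch.zeros(2, 4)}          # 'bias' key deliberately missing

model.load_state_dict(state, strict=False)     # tolerated: missing keys are ignored
try:
    model.load_state_dict(state, strict=True)  # raises on the missing 'bias' key
except RuntimeError as err:
    print('strict load failed:', err)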

timm/models/layers/mlp.py

Lines changed: 20 additions & 13 deletions
@@ -10,16 +10,17 @@
 class Mlp(nn.Module):
     """ MLP as used in Vision Transformer, MLP-Mixer and related networks
     """
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
         drop_probs = to_2tuple(drop)
 
-        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
         self.act = act_layer()
         self.drop1 = nn.Dropout(drop_probs[0])
-        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
         self.drop2 = nn.Dropout(drop_probs[1])
 
     def forward(self, x):
@@ -35,17 +36,18 @@ class GluMlp(nn.Module):
     """ MLP w/ GLU style gating
     See: https://arxiv.org/abs/1612.08083, https://arxiv.org/abs/2002.05202
     """
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.Sigmoid, drop=0.):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.Sigmoid, bias=True, drop=0.):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
         assert hidden_features % 2 == 0
+        bias = to_2tuple(bias)
         drop_probs = to_2tuple(drop)
 
-        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
         self.act = act_layer()
         self.drop1 = nn.Dropout(drop_probs[0])
-        self.fc2 = nn.Linear(hidden_features // 2, out_features)
+        self.fc2 = nn.Linear(hidden_features // 2, out_features, bias=bias[1])
         self.drop2 = nn.Dropout(drop_probs[1])
 
     def init_weights(self):
@@ -67,14 +69,16 @@ def forward(self, x):
 class GatedMlp(nn.Module):
     """ MLP as used in gMLP
     """
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
-                 gate_layer=None, drop=0.):
+    def __init__(
+            self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
+            gate_layer=None, bias=True, drop=0.):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
         drop_probs = to_2tuple(drop)
 
-        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
         self.act = act_layer()
         self.drop1 = nn.Dropout(drop_probs[0])
         if gate_layer is not None:
@@ -83,7 +87,7 @@ def __init__(self, in_features, hidden_features=None, out_features=None, act_lay
             hidden_features = hidden_features // 2  # FIXME base reduction on gate property?
         else:
             self.gate = nn.Identity()
-        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
         self.drop2 = nn.Dropout(drop_probs[1])
 
     def forward(self, x):
@@ -100,15 +104,18 @@ class ConvMlp(nn.Module):
     """ MLP using 1x1 convs that keeps spatial dims
     """
     def __init__(
-            self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, norm_layer=None, drop=0.):
+            self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU,
+            norm_layer=None, bias=True, drop=0.):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
-        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, bias=True)
+        bias = to_2tuple(bias)
+
+        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, bias=bias[0])
         self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity()
         self.act = act_layer()
-        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=True)
         self.drop = nn.Dropout(drop)
+        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=bias[1])
 
     def forward(self, x):
         x = self.fc1(x)
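
Across all four MLP variants the change is the same: a new bias argument, normalized with to_2tuple, lets callers disable the bias of either projection independently. A small usage sketch, assuming a timm build that includes this change:

import torch
from timm.models.layers import Mlp

# bias may be a single bool applied to both layers, or a (fc1, fc2) pair.
mlp = Mlp(in_features=512, hidden_features=2048, bias=(True, False))
print(mlp.fc1.bias is not None)   # True: fc1 keeps its bias
print(mlp.fc2.bias is None)       # True: fc2 built without a bias term

x = torch.randn(2, 197, 512)      # (batch, tokens, dim), e.g. a ViT block input
print(mlp(x).shape)               # torch.Size([2, 197, 512])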
