@@ -82,6 +82,7 @@ def _cfg(url='', **kwargs):
8282 url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth'
8383 ),
8484 'coatnet_2_rw_224': _cfg(url=''),
85+ 'coatnet_3_rw_224': _cfg(url=''),
8586
8687 # Highly experimental configs
8788 'coatnet_bn_0_rw_224': _cfg(
@@ -94,6 +95,8 @@ def _cfg(url='', **kwargs):
9495 'coatnet_rmlp_0_rw_224': _cfg(url=''),
9596 'coatnet_rmlp_1_rw_224': _cfg(
9697 url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'),
98+ 'coatnet_rmlp_2_rw_224': _cfg(url=''),
99+ 'coatnet_rmlp_3_rw_224': _cfg(url=''),
97100 'coatnet_nano_cc_224': _cfg(url=''),
98101 'coatnext_nano_rw_224': _cfg(url=''),
99102
@@ -122,10 +125,19 @@ def _cfg(url='', **kwargs):
122125 url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth',
123126 input_size=(3, 256, 256), pool_size=(8, 8)),
124127 'maxvit_rmlp_tiny_rw_256': _cfg(
125-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-2da819a5.pth',
128+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth',
126129 input_size=(3, 256, 256), pool_size=(8, 8)),
130+ 'maxvit_rmlp_small_rw_224': _cfg(
131+ url=''),
132+ 'maxvit_rmlp_small_rw_256': _cfg(
133+ url='',
134+ input_size=(3, 256, 256), pool_size=(8, 8)),
135+
127136 'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
137+
128138 'maxxvit_nano_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
139+ 'maxxvit_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
140+ 'maxxvit_small_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
129141
130142 # Trying to be like the MaxViT paper configs
131143 'maxvit_tiny_224': _cfg(url=''),
@@ -182,7 +194,7 @@ class MaxxVitConvCfg:
182194 attn_layer: str = 'se'
183195 attn_act_layer: str = 'silu'
184196 attn_ratio: float = 0.25
185- init_values: Optional[float] = 1e-5 # for ConvNeXt block
197+ init_values: Optional[float] = 1e-6 # for ConvNeXt block, ignored by MBConv
186198 act_layer: str = 'gelu'
187199 norm_layer: str = ''
188200 norm_layer_cl: str = ''
@@ -218,10 +230,12 @@ def _rw_coat_cfg(
218230 pool_type='avg2',
219231 conv_output_bias=False,
220232 conv_attn_early=False,
233+ conv_attn_act_layer='relu',
221234 conv_norm_layer='',
222235 transformer_shortcut_bias=True,
223236 transformer_norm_layer='layernorm2d',
224237 transformer_norm_layer_cl='layernorm',
238+ init_values=None,
225239 rel_pos_type='bias',
226240 rel_pos_dim=512,
227241):
@@ -246,14 +260,15 @@ def _rw_coat_cfg(
246260 expand_output=False,
247261 output_bias=conv_output_bias,
248262 attn_early=conv_attn_early,
249-            attn_act_layer='relu',
263+            attn_act_layer=conv_attn_act_layer,
250264 act_layer='silu',
251265 norm_layer=conv_norm_layer,
252266 ),
253267 transformer_cfg=MaxxVitTransformerCfg(
254268 expand_first=False,
255269 shortcut_bias=transformer_shortcut_bias,
256270 pool_type=pool_type,
271+ init_values=init_values,
257272 norm_layer=transformer_norm_layer,
258273 norm_layer_cl=transformer_norm_layer_cl,
259274 rel_pos_type=rel_pos_type,
@@ -272,6 +287,7 @@ def _rw_max_cfg(
272287 transformer_norm_layer_cl='layernorm',
273288 window_size=None,
274289 dim_head=32,
290+ init_values=None,
275291 rel_pos_type='bias',
276292 rel_pos_dim=512,
277293):
@@ -296,6 +312,7 @@ def _rw_max_cfg(
296312 pool_type=pool_type,
297313 dim_head=dim_head,
298314 window_size=window_size,
315+ init_values=init_values,
299316 norm_layer=transformer_norm_layer,
300317 norm_layer_cl=transformer_norm_layer_cl,
301318 rel_pos_type=rel_pos_type,
@@ -312,7 +329,8 @@ def _next_cfg(
312329 transformer_norm_layer='layernorm2d',
313330 transformer_norm_layer_cl='layernorm',
314331 window_size=None,
315- rel_pos_type='bias',
332+ init_values=1e-6,
333+ rel_pos_type='mlp', # MLP by default for maxxvit
316334 rel_pos_dim=512,
317335):
318336 # For experimental models with convnext instead of mbconv
@@ -322,13 +340,15 @@ def _next_cfg(
322340 stride_mode=stride_mode,
323341 pool_type=pool_type,
324342 expand_output=False,
343+ init_values=init_values,
325344 norm_layer=conv_norm_layer,
326345 norm_layer_cl=conv_norm_layer_cl,
327346 ),
328347 transformer_cfg=MaxxVitTransformerCfg(
329348 expand_first=False,
330349 pool_type=pool_type,
331350 window_size=window_size,
351+ init_values=init_values,
332352 norm_layer=transformer_norm_layer,
333353 norm_layer_cl=transformer_norm_layer_cl,
334354 rel_pos_type=rel_pos_type,
@@ -381,7 +401,21 @@ def _next_cfg(
381401 embed_dim=(128, 256, 512, 1024),
382402 depths=(2, 6, 14, 2),
383403 stem_width=(64, 128),
384- **_rw_coat_cfg(stride_mode='dw'),
404+ **_rw_coat_cfg(
405+ stride_mode='dw',
406+ conv_attn_act_layer='silu',
407+ init_values=1e-6,
408+ ),
409+ ),
410+ coatnet_3_rw_224=MaxxVitCfg(
411+ embed_dim=(192, 384, 768, 1536),
412+ depths=(2, 6, 14, 2),
413+ stem_width=(96, 192),
414+ **_rw_coat_cfg(
415+ stride_mode='dw',
416+ conv_attn_act_layer='silu',
417+ init_values=1e-6,
418+ ),
385419 ),
386420
387421 # Highly experimental configs
@@ -428,6 +462,29 @@ def _next_cfg(
428462 rel_pos_dim=384, # was supposed to be 512, woops
429463 ),
430464 ),
465+ coatnet_rmlp_2_rw_224=MaxxVitCfg(
466+ embed_dim=(128, 256, 512, 1024),
467+ depths=(2, 6, 14, 2),
468+ stem_width=(64, 128),
469+ **_rw_coat_cfg(
470+ stride_mode='dw',
471+ conv_attn_act_layer='silu',
472+ init_values=1e-6,
473+ rel_pos_type='mlp'
474+ ),
475+ ),
476+ coatnet_rmlp_3_rw_224=MaxxVitCfg(
477+ embed_dim=(192, 384, 768, 1536),
478+ depths=(2, 6, 14, 2),
479+ stem_width=(96, 192),
480+ **_rw_coat_cfg(
481+ stride_mode='dw',
482+ conv_attn_act_layer='silu',
483+ init_values=1e-6,
484+ rel_pos_type='mlp'
485+ ),
486+ ),
487+
431488 coatnet_nano_cc_224=MaxxVitCfg(
432489 embed_dim=(64, 128, 256, 512),
433490 depths=(3, 4, 6, 3),
@@ -504,6 +561,7 @@ def _next_cfg(
504561 stem_width=(32, 64),
505562 **_rw_max_cfg(),
506563 ),
564+
507565 maxvit_rmlp_pico_rw_256=MaxxVitCfg(
508566 embed_dim=(32, 64, 128, 256),
509567 depths=(2, 2, 5, 2),
@@ -525,13 +583,35 @@ def _next_cfg(
525583 stem_width=(32, 64),
526584 **_rw_max_cfg(rel_pos_type='mlp'),
527585 ),
586+ maxvit_rmlp_small_rw_224=MaxxVitCfg(
587+ embed_dim=(96, 192, 384, 768),
588+ depths=(2, 2, 5, 2),
589+ block_type=('M',) * 4,
590+ stem_width=(32, 64),
591+ **_rw_max_cfg(
592+ rel_pos_type='mlp',
593+ init_values=1e-6,
594+ ),
595+ ),
596+ maxvit_rmlp_small_rw_256=MaxxVitCfg(
597+ embed_dim=(96, 192, 384, 768),
598+ depths=(2, 2, 5, 2),
599+ block_type=('M',) * 4,
600+ stem_width=(32, 64),
601+ **_rw_max_cfg(
602+ rel_pos_type='mlp',
603+ init_values=1e-6,
604+ ),
605+ ),
606+
528607 maxvit_tiny_pm_256=MaxxVitCfg(
529608 embed_dim=(64, 128, 256, 512),
530609 depths=(2, 2, 5, 2),
531610 block_type=('PM',) * 4,
532611 stem_width=(32, 64),
533612 **_rw_max_cfg(),
534613 ),
614+
535615 maxxvit_nano_rw_256=MaxxVitCfg(
536616 embed_dim=(64, 128, 256, 512),
537617 depths=(1, 2, 3, 1),
@@ -540,6 +620,20 @@ def _next_cfg(
540620 weight_init='normal',
541621 **_next_cfg(),
542622 ),
623+ maxxvit_tiny_rw_256=MaxxVitCfg(
624+ embed_dim=(64, 128, 256, 512),
625+ depths=(2, 2, 5, 2),
626+ block_type=('M',) * 4,
627+ stem_width=(32, 64),
628+ **_next_cfg(),
629+ ),
630+ maxxvit_small_rw_256=MaxxVitCfg(
631+ embed_dim=(96, 192, 384, 768),
632+ depths=(2, 2, 5, 2),
633+ block_type=('M',) * 4,
634+ stem_width=(48, 96),
635+ **_next_cfg(),
636+ ),
543637
544638 # Trying to be like the MaxViT paper configs
545639 maxvit_tiny_224=MaxxVitCfg(
@@ -1641,6 +1735,11 @@ def coatnet_2_rw_224(pretrained=False, **kwargs):
16411735 return _create_maxxvit('coatnet_2_rw_224', pretrained=pretrained, **kwargs)
16421736
16431737
1738+ @register_model
1739+ def coatnet_3_rw_224(pretrained=False, **kwargs):
1740+ return _create_maxxvit('coatnet_3_rw_224', pretrained=pretrained, **kwargs)
1741+
1742+
16441743@register_model
16451744def coatnet_bn_0_rw_224(pretrained=False, **kwargs):
16461745 return _create_maxxvit('coatnet_bn_0_rw_224', pretrained=pretrained, **kwargs)
@@ -1661,6 +1760,16 @@ def coatnet_rmlp_1_rw_224(pretrained=False, **kwargs):
16611760 return _create_maxxvit('coatnet_rmlp_1_rw_224', pretrained=pretrained, **kwargs)
16621761
16631762
1763+ @register_model
1764+ def coatnet_rmlp_2_rw_224(pretrained=False, **kwargs):
1765+ return _create_maxxvit('coatnet_rmlp_2_rw_224', pretrained=pretrained, **kwargs)
1766+
1767+
1768+ @register_model
1769+ def coatnet_rmlp_3_rw_224(pretrained=False, **kwargs):
1770+ return _create_maxxvit('coatnet_rmlp_3_rw_224', pretrained=pretrained, **kwargs)
1771+
1772+
16641773@register_model
16651774def coatnet_nano_cc_224(pretrained=False, **kwargs):
16661775 return _create_maxxvit('coatnet_nano_cc_224', pretrained=pretrained, **kwargs)
@@ -1736,6 +1845,16 @@ def maxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs):
17361845 return _create_maxxvit('maxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs)
17371846
17381847
1848+ @register_model
1849+ def maxvit_rmlp_small_rw_224(pretrained=False, **kwargs):
1850+ return _create_maxxvit('maxvit_rmlp_small_rw_224', pretrained=pretrained, **kwargs)
1851+
1852+
1853+ @register_model
1854+ def maxvit_rmlp_small_rw_256(pretrained=False, **kwargs):
1855+ return _create_maxxvit('maxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs)
1856+
1857+
17391858@register_model
17401859def maxvit_tiny_pm_256(pretrained=False, **kwargs):
17411860 return _create_maxxvit('maxvit_tiny_pm_256', pretrained=pretrained, **kwargs)
@@ -1746,6 +1865,16 @@ def maxxvit_nano_rw_256(pretrained=False, **kwargs):
17461865 return _create_maxxvit('maxxvit_nano_rw_256', pretrained=pretrained, **kwargs)
17471866
17481867
1868+ @register_model
1869+ def maxxvit_tiny_rw_256(pretrained=False, **kwargs):
1870+ return _create_maxxvit('maxxvit_tiny_rw_256', pretrained=pretrained, **kwargs)
1871+
1872+
1873+ @register_model
1874+ def maxxvit_small_rw_256(pretrained=False, **kwargs):
1875+ return _create_maxxvit('maxxvit_small_rw_256', pretrained=pretrained, **kwargs)
1876+
1877+
17491878@register_model
17501879def maxvit_tiny_224(pretrained=False, **kwargs):
17511880 return _create_maxxvit('maxvit_tiny_224', pretrained=pretrained, **kwargs)
0 commit comments