30 | 30 | import torch.nn.functional as F |
31 | 31 | import torch.utils.checkpoint |
32 | 32 |
33 | | -from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD |
| 33 | +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\ |
| 34 | + OPENAI_CLIP_MEAN, OPENAI_CLIP_STD |
34 | 35 | from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply, adapt_input_conv, checkpoint_seq |
35 | 36 | from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ |
36 | 37 | from .registry import register_model |
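For reference, the two constants pulled in here are the standard OpenAI CLIP preprocessing statistics; a quick check, assuming the usual timm.data exports, looks like this:

from timm.data import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

# OpenAI CLIP normalization statistics, reused by the OpenCLIP/laion2b weights.
print(OPENAI_CLIP_MEAN)  # (0.48145466, 0.4578275, 0.40821073)
print(OPENAI_CLIP_STD)   # (0.26862954, 0.26130258, 0.27577711)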
@@ -106,7 +107,7 @@ def _cfg(url='', **kwargs): |
106 | 107 | 'vit_large_patch14_224': _cfg(url=''), |
107 | 108 | 'vit_huge_patch14_224': _cfg(url=''), |
108 | 109 | 'vit_giant_patch14_224': _cfg(url=''), |
109 | | - 'vit_gee_patch14_224': _cfg(url=''), |
| 110 | + 'vit_gigantic_patch14_224': _cfg(url=''), |
110 | 111 |
111 | 112 |
112 | 113 | # patch models, imagenet21k (weights from official Google JAX impl) |
@@ -179,17 +180,21 @@ def _cfg(url='', **kwargs): |
179 | 180 | 'vit_base_patch16_18x2_224': _cfg(url=''), |
180 | 181 |
181 | 182 | 'vit_base_patch32_224_clip_laion2b': _cfg( |
182 | | - hf_hub_id='', |
183 | | - num_classes=512), |
| 183 | + hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K', |
| 184 | + hf_hub_filename='open_clip_pytorch_model.bin', |
| 185 | + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), |
184 | 186 | 'vit_large_patch14_224_clip_laion2b': _cfg( |
185 | | - hf_hub_id='', |
186 | | - num_classes=768), |
| 187 | + hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K', |
| 188 | + hf_hub_filename='open_clip_pytorch_model.bin', |
| 189 | +        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=768), |
187 | 190 | 'vit_huge_patch14_224_clip_laion2b': _cfg( |
188 | | - hf_hub_id='', |
189 | | - num_classes=1024), |
| 191 | + hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K', |
| 192 | + hf_hub_filename='open_clip_pytorch_model.bin', |
| 193 | + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024), |
190 | 194 | 'vit_giant_patch14_224_clip_laion2b': _cfg( |
191 | | - hf_hub_id='', |
192 | | - num_classes=1024), |
| 195 | +        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K', |
| 196 | + hf_hub_filename='open_clip_pytorch_model.bin', |
| 197 | + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024), |
193 | 198 |
194 | 199 | } |
195 | 200 |
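A minimal usage sketch of how the new CLIP/laion2b configs are typically consumed, assuming the matching @register_model entry points exist further down in the file; pretrained loading of the open_clip checkpoint referenced by hf_hub_id/hf_hub_filename is not exercised here:

import timm
from timm.data import resolve_data_config, create_transform

# Build the image tower only; num_classes defaults to the CLIP projection dim from the cfg.
model = timm.create_model('vit_base_patch32_224_clip_laion2b', pretrained=False)
model.eval()

# resolve_data_config picks up the OPENAI_CLIP_MEAN/STD set in the cfg above,
# so the eval transform normalizes images the way the CLIP weights expect.
data_cfg = resolve_data_config({}, model=model)
transform = create_transform(**data_cfg)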
@@ -960,12 +965,11 @@ def vit_giant_patch14_224(pretrained=False, **kwargs): |
960 | 965 |
961 | 966 |
962 | 967 | @register_model |
963 | | -def vit_gee_patch14_224(pretrained=False, **kwargs): |
964 | | - """ ViT-GEE (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560 |
965 | | - As per https://twitter.com/wightmanr/status/1570549064667889666 |
| 968 | +def vit_gigantic_patch14_224(pretrained=False, **kwargs): |
| 969 | + """ ViT-Gigantic (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560 |
966 | 970 | """ |
967 | 971 | model_kwargs = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, **kwargs) |
968 | | - model = _create_vision_transformer('vit_gee_patch14_224', pretrained=pretrained, **model_kwargs) |
| 972 | + model = _create_vision_transformer('vit_gigantic_patch14_224', pretrained=pretrained, **model_kwargs) |
969 | 973 | return model |
970 | 974 |
971 | 975 |
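As a sanity check on the renamed entry point (a sketch, not part of the file itself): the big-G geometry from `Scaling Vision Transformers` pairs width 1664 with an 8192-wide MLP, which is exactly what the fractional mlp_ratio=64/13 encodes; instantiating the full model is left as a comment because it allocates roughly 1.8B parameters.

embed_dim, mlp_ratio = 1664, 64 / 13
assert int(embed_dim * mlp_ratio) == 8192  # MLP hidden width in the ViT-G/14 spec

# Building the full model (memory heavy):
# import timm
# model = timm.create_model('vit_gigantic_patch14_224', pretrained=False)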