11""" BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254)
22
33Model from official source: https://github.com/microsoft/unilm/tree/master/beit
4- and
5- https://github.com/microsoft/unilm/tree/master/beit2
64
75@inproceedings{beit,
86title={{BEiT}: {BERT} Pre-Training of Image Transformers},
1210url={https://openreview.net/forum?id=p-BhZSz59o4}
1311}
1412
13+ BEiT-v2 from https://github.com/microsoft/unilm/tree/master/beit2
14+
1515@article{beitv2,
1616title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers},
1717author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei},
2121primaryClass={cs.CV}
2222}
2323
24+ EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636
25+
26+ @article{EVA,
27+ title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
28+ author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang,
29+ Tiejun and Wang, Xinlong and Cao, Yue},
30+ journal={arXiv preprint arXiv:2211.07636},
31+ year={2022}
32+ }
33+
34+
2435At this point only the 1k fine-tuned classification weights and model configs have been added,
2536see original source above for pre-training models and procedure.
2637
3748# https://github.com/facebookresearch/deit/
3849# https://github.com/facebookresearch/dino
3950# --------------------------------------------------------'
51+
52+ # EVA models Copyright (c) 2022 BAAI-Vision
53+
4054import math
4155from functools import partial
4256from typing import Optional , Tuple
4660import torch .nn .functional as F
4761from torch .utils .checkpoint import checkpoint
4862
49- from timm .data import IMAGENET_DEFAULT_MEAN , IMAGENET_DEFAULT_STD
63+ from timm .data import IMAGENET_DEFAULT_MEAN , IMAGENET_DEFAULT_STD , OPENAI_CLIP_MEAN , OPENAI_CLIP_STD
5064from .helpers import build_model_with_cfg
5165from .layers import PatchEmbed , Mlp , DropPath , trunc_normal_
66+ from .pretrained import generate_default_cfgs
5267from .registry import register_model
5368from .vision_transformer import checkpoint_filter_fn
5469
@@ -64,52 +79,72 @@ def _cfg(url='', **kwargs):
6479 }
6580
6681
# Pretrained weight configs. Keys follow a '<model name>.<tag>' pattern; the tag
# appears to encode the pretraining / fine-tuning data (e.g. in22k_ft_in22k_in1k).
# NOTE(review): how generate_default_cfgs() splits and indexes these keys is not
# visible here -- confirm against its implementation. Keys must not carry
# leading/trailing whitespace, since it would become part of the tag.
default_cfgs = generate_default_cfgs({
    'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_base_patch16_224.in22k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),
    'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'),
    'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth',
        input_size=(3, 384, 384), crop_pct=1.0,
    ),
    'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth',
        input_size=(3, 512, 512), crop_pct=1.0,
    ),
    'beit_large_patch16_224.in22k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth',
        num_classes=21841,
    ),

    # BEiT-v2 weights use the ImageNet default normalization constants.
    'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_base_patch16_224.in1k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth',
        crop_pct=0.95,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),
    'beitv2_large_patch16_224.in1k_ft_in22k': _cfg(
        url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth',
        num_classes=21841,
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD
    ),

    # EVA weights are referenced by hub id + filename instead of a direct URL.
    # The CLIP fine-tuned variants use the OpenAI CLIP normalization constants.
    'eva_giant_patch14_224.clip_ft_in1k': _cfg(
        hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz224_ftcls_89p1.pt',
        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
    ),
    'eva_giant_patch14_336.clip_ft_in1k': _cfg(
        hf_hub_id='BAAI/EVA',
        hf_hub_filename='eva_clip_vis_enc_sz336_ftcls_89p4.pt',
        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
        input_size=(3, 336, 336)),
    'eva_giant_patch14_336.m30m_ft_in22k_in1k': _cfg(
        hf_hub_id='BAAI/EVA',
        hf_hub_filename='eva_21k_1k_336px_psz14_ema_89p6.pt',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
        input_size=(3, 336, 336)),
    'eva_giant_patch14_560.m30m_ft_in22k_in1k': _cfg(
        hf_hub_id='BAAI/EVA',
        hf_hub_filename='eva_21k_1k_560px_psz14_ema_89p7.pt',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
        input_size=(3, 560, 560)),
})
113148
114149
115150def gen_relative_position_index (window_size : Tuple [int , int ]) -> torch .Tensor :
@@ -415,7 +450,7 @@ def beit_base_patch16_224(pretrained=False, **kwargs):
@register_model
def beit_base_patch16_384(pretrained=False, **kwargs):
    """ BEiT base model, patch size 16, image size 384.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_base_patch16_384' variant.
    """
    # mlp_ratio is no longer passed explicitly; the model's own default applies
    # (presumably 4, matching the value that used to be passed -- verify).
    model_kwargs = dict(
        img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
    model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **model_kwargs)
    return model
@@ -424,7 +459,7 @@ def beit_base_patch16_384(pretrained=False, **kwargs):
@register_model
def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
    """ BEiT base model, patch size 16, registered under the `_in22k` name.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_base_patch16_224_in22k' variant.
    """
    # NOTE(review): default_cfgs now keys this weight set as
    # 'beit_base_patch16_224.in22k_ft_in22k'; confirm that the
    # 'beit_base_patch16_224_in22k' variant name still resolves to a pretrained cfg.
    # mlp_ratio is no longer passed explicitly; the model's own default applies
    # (presumably 4 -- verify).
    model_kwargs = dict(
        patch_size=16, embed_dim=768, depth=12, num_heads=12,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs)
    model = _create_beit('beit_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model
@@ -433,7 +468,7 @@ def beit_base_patch16_224_in22k(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_224(pretrained=False, **kwargs):
    """ BEiT large model, patch size 16.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_large_patch16_224' variant.
    """
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True, as previously passed -- verify).
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **model_kwargs)
    return model
@@ -442,7 +477,7 @@ def beit_large_patch16_224(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_384(pretrained=False, **kwargs):
    """ BEiT large model, patch size 16, image size 384.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_large_patch16_384' variant.
    """
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True, as previously passed -- verify).
    model_kwargs = dict(
        img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **model_kwargs)
    return model
@@ -451,7 +486,7 @@ def beit_large_patch16_384(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_512(pretrained=False, **kwargs):
    """ BEiT large model, patch size 16, image size 512.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_large_patch16_512' variant.
    """
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True, as previously passed -- verify).
    model_kwargs = dict(
        img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **model_kwargs)
    return model
@@ -460,7 +495,7 @@ def beit_large_patch16_512(pretrained=False, **kwargs):
@register_model
def beit_large_patch16_224_in22k(pretrained=False, **kwargs):
    """ BEiT large model, patch size 16, registered under the `_in22k` name.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beit_large_patch16_224_in22k' variant.
    """
    # NOTE(review): default_cfgs now keys this weight set as
    # 'beit_large_patch16_224.in22k_ft_in22k'; confirm that the
    # 'beit_large_patch16_224_in22k' variant name still resolves to a pretrained cfg.
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True -- verify).
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beit_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model
@@ -487,7 +522,7 @@ def beitv2_base_patch16_224_in22k(pretrained=False, **kwargs):
@register_model
def beitv2_large_patch16_224(pretrained=False, **kwargs):
    """ BEiT-v2 large model, patch size 16.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beitv2_large_patch16_224' variant.
    """
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True, as previously passed -- verify).
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **model_kwargs)
    return model
@@ -496,7 +531,33 @@ def beitv2_large_patch16_224(pretrained=False, **kwargs):
@register_model
def beitv2_large_patch16_224_in22k(pretrained=False, **kwargs):
    """ BEiT-v2 large model, patch size 16, registered under the `_in22k` name.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'beitv2_large_patch16_224_in22k' variant.
    """
    # NOTE(review): default_cfgs now keys this weight set as
    # 'beitv2_large_patch16_224.in1k_ft_in22k'; confirm that the
    # 'beitv2_large_patch16_224_in22k' variant name still resolves to a pretrained cfg.
    # mlp_ratio and qkv_bias are no longer passed explicitly; the model's own
    # defaults apply (presumably 4 and True -- verify).
    model_kwargs = dict(
        patch_size=16, embed_dim=1024, depth=24, num_heads=16,
        use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs)
    model = _create_beit('beitv2_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs)
    return model
538+
539+
@register_model
def eva_giant_patch14_224(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636

    Registered like the other EVA / BEiT builders in this file so that the
    'eva_giant_patch14_224.*' entries in `default_cfgs` have a matching model.

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'eva_giant_patch14_224' variant.
    """
    # mlp_ratio is expressed as 6144 / 1408 relative to embed_dim=1408
    # (presumably to give a 6144-wide MLP hidden layer -- verify against Beit).
    model_kwargs = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_224', pretrained=pretrained, **model_kwargs)
    return model
546+
547+
@register_model
def eva_giant_patch14_336(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'eva_giant_patch14_336' variant.
    """
    # NOTE(review): img_size is not passed here, unlike the beit_*_384 / _512
    # builders; presumably the 336 input size is taken from the pretrained cfg's
    # input_size -- confirm.
    # mlp_ratio is expressed as 6144 / 1408 relative to embed_dim=1408
    # (presumably to give a 6144-wide MLP hidden layer -- verify against Beit).
    model_kwargs = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_336', pretrained=pretrained, **model_kwargs)
    return model
555+
556+
@register_model
def eva_giant_patch14_560(pretrained=False, **kwargs):
    """ EVA-g model https://arxiv.org/abs/2211.07636

    Args:
        pretrained: forwarded to `_create_beit` as its `pretrained` argument.
        **kwargs: extra model arguments passed through alongside the fixed settings below.

    Returns:
        The result of `_create_beit` for the 'eva_giant_patch14_560' variant.
    """
    # NOTE(review): img_size is not passed here, unlike the beit_*_384 / _512
    # builders; presumably the 560 input size is taken from the pretrained cfg's
    # input_size -- confirm.
    # mlp_ratio is expressed as 6144 / 1408 relative to embed_dim=1408
    # (presumably to give a 6144-wide MLP hidden layer -- verify against Beit).
    model_kwargs = dict(
        patch_size=14, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=6144 / 1408, **kwargs)
    model = _create_beit('eva_giant_patch14_560', pretrained=pretrained, **model_kwargs)
    return model
0 commit comments