@@ -1982,7 +1982,7 @@ def vit_small_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-S/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=384, depth=12, num_heads=6, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=384, depth=12, num_heads=6, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_small_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -1994,7 +1994,7 @@ def vit_base_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-B/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=768, depth=12, num_heads=12, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=768, depth=12, num_heads=12, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_base_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -2006,7 +2006,7 @@ def vit_large_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     """ ViT-L/14 for DINOv2
     """
     model_args = dict(
-        patch_size=14, embed_dim=1024, depth=24, num_heads=16, init_values=1.0, img_size=518,
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, init_values=1e-5, img_size=518,
     )
     model = _create_vision_transformer(
         'vit_large_patch14_dinov2', pretrained=pretrained, **dict(model_args, **kwargs))
@@ -2024,7 +2024,7 @@ def vit_giant_patch14_dinov2(pretrained=False, **kwargs) -> VisionTransformer:
     # With SwiGLUPacked, we need to set hidden_features = 2 * 4096 = 8192
 
     model_args = dict(
-        patch_size=14, embed_dim=1536, depth=40, num_heads=24, init_values=1.0,
+        patch_size=14, embed_dim=1536, depth=40, num_heads=24, init_values=1e-5,
         mlp_ratio=2.66667 * 2, mlp_layer=SwiGLUPacked, img_size=518, act_layer=nn.SiLU
     )
     model = _create_vision_transformer(
0 commit comments