@@ -1919,6 +1919,31 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
 
+    'vit_large_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_gigantic_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1280),
+
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='timm/',
         license='cc-by-nc-4.0',
@@ -3178,6 +3203,20 @@ def vit_gigantic_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_gigantic_patch14_clip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Pretrained weights from CLIP image tower.
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+    )
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224