
Commit 2ddbe5e

Add mobileclip2 encoder weights
1 parent 435ab7a commit 2ddbe5e

File tree

1 file changed: +39 −0


timm/models/vision_transformer.py

Lines changed: 39 additions & 0 deletions
@@ -1919,6 +1919,31 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),

+    'vit_large_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_huge_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_gigantic_patch14_clip_378.metaclip2_worldwide': _cfg(
+        hf_hub_id='timm/',
+        license='cc-by-nc-4.0',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 378, 378), crop_pct=1.0, crop_mode='squash', num_classes=1280),
+
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
         hf_hub_id='timm/',
         license='cc-by-nc-4.0',
@@ -3178,6 +3203,20 @@ def vit_gigantic_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model


+@register_model
+def vit_gigantic_patch14_clip_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
+    Pretrained weights from CLIP image tower.
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+    )
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_378', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_base_patch32_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-B/32 CLIP image tower @ 224x224

0 commit comments
