
Commit 4a67e13

Add 8 new Meta Perception Encoder (PE) weights/variants via EVA. Test NaFlexVit support. Fix #2550

1 parent c603c31

1 file changed: timm/models/eva.py (201 additions, 4 deletions)
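For orientation, the new registrations can be enumerated straight from the model registry once this commit is installed. A minimal sketch (model names taken from the diff below; the exact output depends on the installed timm version):

    import timm

    # Pattern-match the registry for the Perception Encoder models.
    # Expect the new tiny/small core and tiny..large spatial variants
    # alongside the previously existing PE entries.
    for name in timm.list_models('vit_pe_*'):
        print(name)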
@@ -638,12 +638,10 @@ def __init__(
         self.norm = norm_layer(embed_dim) if activate_post_norm else nn.Identity()

         if global_pool == 'map':
-            attn_pool_num_heads = attn_pool_num_heads or num_heads
-            attn_pool_mlp_ratio = attn_pool_mlp_ratio or mlp_ratio
             self.attn_pool = AttentionPoolLatent(
                 self.embed_dim,
-                num_heads=attn_pool_num_heads,
-                mlp_ratio=attn_pool_mlp_ratio,
+                num_heads=attn_pool_num_heads or num_heads,
+                mlp_ratio=attn_pool_mlp_ratio or mlp_ratio,
                 norm_layer=norm_layer,
                 act_layer=nn.GELU,
             )
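This first hunk inlines the attention-pool fallbacks instead of mutating the constructor arguments; behavior is unchanged. A minimal sketch of the `or`-fallback idiom, with a hypothetical helper name:

    def make_pool_config(num_heads, mlp_ratio, attn_pool_num_heads=None, attn_pool_mlp_ratio=None):
        # An explicit attn_pool_* override wins; otherwise the transformer's
        # own num_heads / mlp_ratio are reused, exactly as in the hunk above.
        return dict(
            num_heads=attn_pool_num_heads or num_heads,
            mlp_ratio=attn_pool_mlp_ratio or mlp_ratio,
        )

    assert make_pool_config(12, 4.0) == dict(num_heads=12, mlp_ratio=4.0)
    assert make_pool_config(12, 4.0, attn_pool_num_heads=8) == dict(num_heads=8, mlp_ratio=4.0)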
@@ -1366,6 +1364,20 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     ),

     # Perception Encoder weights
+    'vit_pe_core_tiny_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-T16-384',
+        #hf_hub_filename='PE-Core-T16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
+    'vit_pe_core_small_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-S16-384',
+        #hf_hub_filename='PE-Core-S16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
     'vit_pe_core_base_patch16_224.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Core-B16-224',
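The two new `.fb` tags wire the tiny/small core weights through timm's usual pretrained-config machinery; `num_classes=512` reflects the CLIP-style output projection. A hedged usage sketch (assumes the `timm/` hub uploads referenced by `hf_hub_id` are live):

    import timm

    # pretrained=True pulls weights from the 'timm/' HF hub entry above.
    model = timm.create_model('vit_pe_core_tiny_patch16_384.fb', pretrained=True)
    print(model.pretrained_cfg['input_size'])  # (3, 384, 384) per the config
    print(model.num_classes)                   # 512, the output proj dim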
@@ -1387,20 +1399,64 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 448, 448),
         num_classes=1280,  # output proj dim
     ),
+
     'vit_pe_lang_large_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-L14-448',
         #hf_hub_filename='PE-Lang-L14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_large_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-L14-448-Tiling',
+        #hf_hub_filename='PE-Lang-L14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_lang_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-G14-448',
         #hf_hub_filename='PE-Lang-G14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_gigantic_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-G14-448-Tiling',
+        #hf_hub_filename='PE-Lang-G14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
+
+    'vit_pe_spatial_tiny_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-T16-512',
+        #hf_hub_filename='PE-Spatial-T16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_small_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-S16-512',
+        #hf_hub_filename='PE-Spatial-S16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_base_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-B16-512',
+        #hf_hub_filename='PE-Spatial-B16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_large_patch14_448.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-L14-448',
+        #hf_hub_filename='PE-Spatial-L14-448.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_spatial_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Spatial-G14-448',
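The `.fb_tiling` entries are alternate weight tags on the same architectures (`num_classes=0`, i.e. encoder-only), selected through the standard `model.tag` naming. Sketch, again assuming the hub weights are published:

    import timm

    # Same vit_pe_lang_large_patch14_448 architecture, tiling weight tag.
    model = timm.create_model('vit_pe_lang_large_patch14_448.fb_tiling', pretrained=True)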
@@ -1842,6 +1898,55 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
     return model


+@register_model
+def vit_pe_core_tiny_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_tiny_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+
+@register_model
+def vit_pe_core_small_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_small_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
@@ -1963,6 +2068,98 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     return _create_eva('vit_pe_lang_gigantic_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))


+@register_model
+def vit_pe_spatial_tiny_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_tiny_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_small_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_small_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_base_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_base_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True,
+    )
+    return _create_eva('vit_pe_spatial_large_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_spatial_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
