@@ -638,12 +638,10 @@ def __init__(
         self.norm = norm_layer(embed_dim) if activate_post_norm else nn.Identity()

         if global_pool == 'map':
-            attn_pool_num_heads = attn_pool_num_heads or num_heads
-            attn_pool_mlp_ratio = attn_pool_mlp_ratio or mlp_ratio
             self.attn_pool = AttentionPoolLatent(
                 self.embed_dim,
-                num_heads=attn_pool_num_heads,
-                mlp_ratio=attn_pool_mlp_ratio,
+                num_heads=attn_pool_num_heads or num_heads,
+                mlp_ratio=attn_pool_mlp_ratio or mlp_ratio,
                 norm_layer=norm_layer,
                 act_layer=nn.GELU,
             )
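
This hunk folds the `or`-style defaulting of the attention-pool hyperparameters directly into the `AttentionPoolLatent` call instead of rebinding the arguments first; behavior is unchanged. A minimal standalone sketch of the idiom (illustrative, not part of the diff) — note that `value or default` also falls back for falsy values such as `0`, so it only suits parameters where zero is never a meaningful setting:

def pick(value, default):
    # Returns `default` when `value` is None *or* any other falsy value (0, '', []).
    return value or default

assert pick(None, 12) == 12  # unset -> default
assert pick(8, 12) == 8      # explicit value wins
assert pick(0, 12) == 12     # caveat: falsy 0 is treated as unset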
@@ -1366,6 +1364,20 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     ),

     # Perception Encoder weights
+    'vit_pe_core_tiny_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-T16-384',
+        #hf_hub_filename='PE-Core-T16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
+    'vit_pe_core_small_patch16_384.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Core-S16-384',
+        #hf_hub_filename='PE-Core-S16-384.pt',
+        input_size=(3, 384, 384),
+        num_classes=512,  # output proj dim
+    ),
     'vit_pe_core_base_patch16_224.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Core-B16-224',
@@ -1387,20 +1399,64 @@ def _pe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 448, 448),
         num_classes=1280,  # output proj dim
     ),
+
     'vit_pe_lang_large_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-L14-448',
         #hf_hub_filename='PE-Lang-L14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_large_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-L14-448-Tiling',
+        #hf_hub_filename='PE-Lang-L14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_lang_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Lang-G14-448',
         #hf_hub_filename='PE-Lang-G14-448.pt',
         input_size=(3, 448, 448),
         num_classes=0,
     ),
+    'vit_pe_lang_gigantic_patch14_448.fb_tiling': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Lang-G14-448-Tiling',
+        #hf_hub_filename='PE-Lang-G14-448-Tiling.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
+
+    'vit_pe_spatial_tiny_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-T16-512',
+        #hf_hub_filename='PE-Spatial-T16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_small_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-S16-512',
+        #hf_hub_filename='PE-Spatial-S16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_base_patch16_512.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-B16-512',
+        #hf_hub_filename='PE-Spatial-B16-512.pt',
+        input_size=(3, 512, 512),
+        num_classes=0,
+    ),
+    'vit_pe_spatial_large_patch14_448.fb': _pe_cfg(
+        hf_hub_id='timm/',
+        #hf_hub_id='facebook/PE-Spatial-L14-448',
+        #hf_hub_filename='PE-Spatial-L14-448.pt',
+        input_size=(3, 448, 448),
+        num_classes=0,
+    ),
     'vit_pe_spatial_gigantic_patch14_448.fb': _pe_cfg(
         hf_hub_id='timm/',
         #hf_hub_id='facebook/PE-Spatial-G14-448',
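
Each new entry above registers a pretrained tag of the form `<architecture>.<tag>`, with per-checkpoint overrides (`input_size`, and `num_classes` carrying the projection dim for the core variants) applied at creation time. A hedged usage sketch, assuming the `timm/` hub re-uploads of these checkpoints exist:

import timm
import torch

# Hypothetical usage; assumes the tag registered above has published weights.
model = timm.create_model('vit_pe_core_tiny_patch16_384.fb', pretrained=True)
model.eval()

x = torch.randn(1, 3, 384, 384)  # matches input_size=(3, 384, 384) in the cfg
with torch.no_grad():
    out = model(x)
print(out.shape)  # expected (1, 512): num_classes=512 is the output proj dim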
@@ -1842,6 +1898,55 @@ def vit_base_patch16_rope_reg1_gap_256(pretrained: bool = False, **kwargs) -> Eva:
     return model


+@register_model
+def vit_pe_core_tiny_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_tiny_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+
+@register_model
+def vit_pe_core_small_patch16_384(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        global_pool='map',
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_rot_pos_emb=True,
+        ref_feat_shape=(24, 24),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        attn_pool_num_heads=8,
+        attn_pool_mlp_ratio=4.,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_core_small_patch16_384', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_core_base_patch16_224(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
@@ -1963,6 +2068,98 @@ def vit_pe_lang_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     return _create_eva('vit_pe_lang_gigantic_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))


+@register_model
+def vit_pe_spatial_tiny_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_tiny_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_small_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_small_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_base_patch16_512(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True
+    )
+    return _create_eva('vit_pe_spatial_base_patch16_512', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
+@register_model
+def vit_pe_spatial_large_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
+    """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""
+    model_args = dict(
+        patch_size=14,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4.0,
+        attn_type='rope',
+        use_pre_transformer_norm=True,
+        use_post_transformer_norm=False,
+        use_fc_norm=False,  # explicitly disable
+        use_rot_pos_emb=True,
+        ref_feat_shape=(32, 32),
+        rope_grid_offset=1.,
+        rope_grid_indexing='xy',
+        norm_layer=partial(LayerNorm, eps=1e-5),
+        #dynamic_img_size=True,
+    )
+    return _create_eva('vit_pe_spatial_large_patch14_448', pretrained=pretrained, **dict(model_args, **kwargs))
+
+
 @register_model
 def vit_pe_spatial_gigantic_patch14_448(pretrained: bool = False, **kwargs) -> Eva:
     """Perception Encoder (PE) ViT from Meta (https://arxiv.org/abs/2504.13181)"""