
[Model][CI] Let more pooling models support v1 #21747

Merged
merged 5 commits into from Jul 31, 2025

Changes from 1 commit
8 changes: 0 additions & 8 deletions tests/models/language/pooling/test_classification.py
@@ -6,14 +6,6 @@

from vllm.platforms import current_platform

# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass


@pytest.mark.parametrize(
"model",
10 changes: 0 additions & 10 deletions tests/models/language/pooling/test_gte.py
@@ -56,17 +56,10 @@
enable_test=False),
]

V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
@@ -79,9 +72,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
13 changes: 0 additions & 13 deletions tests/models/language/pooling/test_jina.py
@@ -4,7 +4,6 @@

import pytest

import vllm.envs as envs
from vllm import PoolingParams

from ...utils import EmbedModelInfo, RerankModelInfo
@@ -24,14 +23,6 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
@@ -63,10 +54,6 @@ def hf_model_callback(model):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")

mteb_test_rerank_models(hf_runner, vllm_runner, model_info)


5 changes: 5 additions & 0 deletions vllm/config.py
@@ -4885,6 +4885,11 @@
if self.model_config is None:
return

# Avoid running try_verify_and_update_config multiple times
if getattr(self.model_config, "config_updated", False):
return
self.model_config.config_updated = True

Check failure on line 4891 in vllm/config.py
GitHub Actions / pre-commit
"ModelConfig" has no attribute "config_updated" [attr-defined]

architecture = self.model_config.architecture
if architecture is None:
return
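The pre-commit failure above is mypy complaining that `config_updated` is assigned on `ModelConfig` without ever being declared on the class. Below is a minimal sketch of one way such an error could be resolved; the simplified `ModelConfig` stand-in and the free `try_verify_and_update_config` function are illustrative assumptions, not the actual definitions in vllm/config.py.

```python
# Sketch only: a simplified stand-in for ModelConfig, not the real class in
# vllm/config.py. Declaring `config_updated` as a field gives mypy a known
# attribute, instead of one that is first created by assignment at runtime
# (which is what triggers the [attr-defined] error above).
from dataclasses import dataclass, field


@dataclass
class ModelConfig:
    model: str = "facebook/opt-125m"
    # Tracks whether try_verify_and_update_config() has already run, so that
    # repeated calls become no-ops.
    config_updated: bool = field(default=False, init=False)


def try_verify_and_update_config(model_config: ModelConfig) -> None:
    # Avoid running the (potentially expensive) verification more than once.
    if model_config.config_updated:
        return
    model_config.config_updated = True
    # ... architecture-specific verification and config updates would follow ...


cfg = ModelConfig()
try_verify_and_update_config(cfg)
try_verify_and_update_config(cfg)  # second call returns immediately
```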
6 changes: 2 additions & 4 deletions vllm/model_executor/models/bert_with_rope.py
@@ -8,7 +8,6 @@
from transformers import PretrainedConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
@@ -364,7 +363,6 @@ def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor):
return hidden_states


@support_torch_compile
class BertWithRopeEncoder(nn.Module):

def __init__(self,
@@ -398,7 +396,7 @@ def forward(
return hidden_states


class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
class BertWithRope(nn.Module, SupportsQuant):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -468,7 +466,7 @@ def load_weights(self, weights: Iterable[tuple[str,
return loaded_params


class NomicBertModel(BertWithRope):
class NomicBertModel(BertWithRope, SupportsV0Only):
# for https://huggingface.co/nomic-ai/nomic-bert-2048

hf_to_vllm_mapper = WeightsMapper(
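To make the intent of the `SupportsV0Only` change explicit: dropping the marker from `BertWithRope` lets that family run on V1, while re-adding it on `NomicBertModel` keeps that one model on the V0 engine. The sketch below illustrates the marker-interface pattern in simplified form; the `SupportsV0Only` stand-in and the `model_supports_v1` helper are illustrative assumptions, not vLLM's actual interface definitions.

```python
# Illustrative sketch of a marker-interface check, not vLLM's real code.
# Inheriting from the stand-in `SupportsV0Only` marks a model class as V0-only,
# and the engine can inspect that before deciding whether V1 may be used.


class SupportsV0Only:
    """Marker base class: subclasses are assumed to only work on the V0 engine."""
    supports_v0_only: bool = True


class BertWithRope:
    """After this PR, the generic BertWithRope family is no longer V0-only."""


class NomicBertModel(BertWithRope, SupportsV0Only):
    """nomic-bert-2048 keeps the marker, so it still falls back to V0."""


def model_supports_v1(model_cls: type) -> bool:
    # A model may run on V1 only if it does not carry the V0-only marker.
    return not getattr(model_cls, "supports_v0_only", False)


assert model_supports_v1(BertWithRope)
assert not model_supports_v1(NomicBertModel)
```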
2 changes: 0 additions & 2 deletions vllm/model_executor/models/modernbert.py
@@ -8,7 +8,6 @@
from transformers import ModernBertConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
@@ -200,7 +199,6 @@ def forward(
return hidden_states


@support_torch_compile
class ModernBertModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"layers.": "encoder_layer.layers."})