
[Model][CI] Let more pooling models support v1 #21747

Merged
merged 5 commits into from Jul 31, 2025

Changes from 1 commit
8 changes: 0 additions & 8 deletions tests/models/language/pooling/test_classification.py
@@ -6,14 +6,6 @@

from vllm.platforms import current_platform

# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass


@pytest.mark.parametrize(
"model",
10 changes: 0 additions & 10 deletions tests/models/language/pooling/test_gte.py
@@ -56,17 +56,10 @@
enable_test=False),
]

V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
@@ -79,9 +72,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
13 changes: 0 additions & 13 deletions tests/models/language/pooling/test_jina.py
@@ -4,7 +4,6 @@

import pytest

import vllm.envs as envs
from vllm import PoolingParams

from ...utils import EmbedModelInfo, RerankModelInfo
@@ -24,14 +23,6 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
@@ -63,10 +54,6 @@ def hf_model_callback(model):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")

mteb_test_rerank_models(hf_runner, vllm_runner, model_info)


5 changes: 5 additions & 0 deletions vllm/config.py
@@ -4885,6 +4885,11 @@
if self.model_config is None:
return

# Avoid running try_verify_and_update_config multiple times
if getattr(self.model_config, "config_updated", False):
return
self.model_config.config_updated = True

Check failure on line 4891 in vllm/config.py
GitHub Actions / pre-commit
"ModelConfig" has no attribute "config_updated" [attr-defined]

architecture = self.model_config.architecture
if architecture is None:
return
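The pre-commit failure above is mypy complaining that `config_updated` is assigned on `ModelConfig` without ever being declared on the class. Below is a minimal sketch of one way such an error could be resolved; the simplified `ModelConfig` stand-in and the free `try_verify_and_update_config` function are illustrative assumptions, not the actual definitions in vllm/config.py.

```python
# Sketch only: a simplified stand-in for ModelConfig, not the real class in
# vllm/config.py. Declaring `config_updated` as a field gives mypy a known
# attribute, instead of one that is first created by assignment at runtime
# (which is what triggers the [attr-defined] error above).
from dataclasses import dataclass, field


@dataclass
class ModelConfig:
    model: str = "facebook/opt-125m"
    # Tracks whether try_verify_and_update_config() has already run, so that
    # repeated calls become no-ops.
    config_updated: bool = field(default=False, init=False)


def try_verify_and_update_config(model_config: ModelConfig) -> None:
    # Avoid running the (potentially expensive) verification more than once.
    if model_config.config_updated:
        return
    model_config.config_updated = True
    # ... architecture-specific verification and config updates would follow ...


cfg = ModelConfig()
try_verify_and_update_config(cfg)
try_verify_and_update_config(cfg)  # second call returns immediately
```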
6 changes: 2 additions & 4 deletions vllm/model_executor/models/bert_with_rope.py
@@ -8,7 +8,6 @@
from transformers import PretrainedConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
@@ -364,7 +363,6 @@ def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor):
return hidden_states


@support_torch_compile
class BertWithRopeEncoder(nn.Module):

def __init__(self,
@@ -398,7 +396,7 @@ def forward(
return hidden_states


class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
class BertWithRope(nn.Module, SupportsQuant):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -468,7 +466,7 @@ def load_weights(self, weights: Iterable[tuple[str,
return loaded_params


class NomicBertModel(BertWithRope):
class NomicBertModel(BertWithRope, SupportsV0Only):
# for https://huggingface.co/nomic-ai/nomic-bert-2048

hf_to_vllm_mapper = WeightsMapper(
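To make the intent of the `SupportsV0Only` change explicit: dropping the marker from `BertWithRope` lets that family run on V1, while re-adding it on `NomicBertModel` keeps that one model on the V0 engine. The sketch below illustrates the marker-interface pattern in simplified form; the `SupportsV0Only` stand-in and the `model_supports_v1` helper are illustrative assumptions, not vLLM's actual interface definitions.

```python
# Illustrative sketch of a marker-interface check, not vLLM's real code.
# Inheriting from the stand-in `SupportsV0Only` marks a model class as V0-only,
# and the engine can inspect that before deciding whether V1 may be used.


class SupportsV0Only:
    """Marker base class: subclasses are assumed to only work on the V0 engine."""
    supports_v0_only: bool = True


class BertWithRope:
    """After this PR, the generic BertWithRope family is no longer V0-only."""


class NomicBertModel(BertWithRope, SupportsV0Only):
    """nomic-bert-2048 keeps the marker, so it still falls back to V0."""


def model_supports_v1(model_cls: type) -> bool:
    # A model may run on V1 only if it does not carry the V0-only marker.
    return not getattr(model_cls, "supports_v0_only", False)


assert model_supports_v1(BertWithRope)
assert not model_supports_v1(NomicBertModel)
```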
2 changes: 0 additions & 2 deletions vllm/model_executor/models/modernbert.py
@@ -8,7 +8,6 @@
from transformers import ModernBertConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
@@ -200,7 +199,6 @@ def forward(
return hidden_states


@support_torch_compile
class ModernBertModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"layers.": "encoder_layer.layers."})