[Model][CI] Let more pooling models support v1 #21747

Merged
merged 5 commits into from Jul 31, 2025
8 changes: 0 additions & 8 deletions tests/models/language/pooling/test_classification.py
@@ -6,14 +6,6 @@

from vllm.platforms import current_platform

# TODO: enable when float32 is supported by V1
# @pytest.fixture(autouse=True)
# def v1(run_with_both_engines):
# # Simple autouse wrapper to run both engines for each test
# # This can be promoted up to conftest.py to run for every
# # test in a package
# pass


@pytest.mark.parametrize(
"model",
18 changes: 4 additions & 14 deletions tests/models/language/pooling/test_gte.py
@@ -56,17 +56,10 @@
enable_test=False),
]

V1FlashAttentionImpNotSupported = [
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-modernbert-base"
]


@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
@@ -77,11 +70,8 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo,

@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo, example_prompts,
monkeypatch) -> None:
if model_info.name in V1FlashAttentionImpNotSupported:
monkeypatch.setenv("VLLM_USE_V1", "0")

model_info: EmbedModelInfo,
example_prompts) -> None:
vllm_extra_kwargs: dict[str, Any] = {}
if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
13 changes: 0 additions & 13 deletions tests/models/language/pooling/test_jina.py
@@ -4,7 +4,6 @@

import pytest

import vllm.envs as envs
from vllm import PoolingParams

from ...utils import EmbedModelInfo, RerankModelInfo
@@ -24,14 +23,6 @@
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
@@ -63,10 +54,6 @@ def hf_model_callback(model):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")

mteb_test_rerank_models(hf_runner, vllm_runner, model_info)


6 changes: 0 additions & 6 deletions tests/models/language/pooling/test_qwen3_reranker.py
@@ -83,9 +83,6 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
}
}

if model_info.name == "Qwen/Qwen3-Reranker-4B":
vllm_extra_kwargs["max_num_seqs"] = 1

mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
vllm_extra_kwargs)

@@ -106,9 +103,6 @@ def test_rerank_models_mteb_tp(vllm_runner,
"tensor_parallel_size": 2,
}

if model_info.name == "Qwen/Qwen3-Reranker-4B":
vllm_extra_kwargs["max_num_seqs"] = 1

mteb_test_rerank_models(Qwen3RerankerHfRunner,
vllm_runner,
model_info,
8 changes: 8 additions & 0 deletions vllm/config.py
@@ -771,6 +771,9 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
raise ValueError(
"`override_neuron_config` is only supported on Neuron.")

# Avoid running try_verify_and_update_config multiple times
self.config_updated = False

self._verify_quantization()
self._verify_cuda_graph()
self._verify_bnb_config()
@@ -4885,6 +4888,11 @@ def try_verify_and_update_config(self):
if self.model_config is None:
return

# Avoid running try_verify_and_update_config multiple times
if getattr(self.model_config, "config_updated", False):
return
self.model_config.config_updated = True

architecture = self.model_config.architecture
if architecture is None:
return
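The two hunks above implement a simple run-once guard: a flag is initialized on the model config and checked before the architecture-specific verification runs again. A minimal standalone sketch of that pattern in Python, using simplified stand-in classes rather than vLLM's actual ModelConfig/VllmConfig:

class ModelConfig:
    def __init__(self) -> None:
        # Initialize the flag up front so later checks can rely on it existing.
        self.config_updated = False

class VllmConfig:
    def __init__(self, model_config) -> None:
        self.model_config = model_config

    def try_verify_and_update_config(self) -> None:
        if self.model_config is None:
            return
        # getattr() keeps the check safe even if the flag was never set.
        if getattr(self.model_config, "config_updated", False):
            return  # already ran once; skip the repeated update
        self.model_config.config_updated = True
        print("running architecture-specific config update")

cfg = VllmConfig(ModelConfig())
cfg.try_verify_and_update_config()  # performs the update
cfg.try_verify_and_update_config()  # no-op on the second call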
5 changes: 1 addition & 4 deletions vllm/model_executor/models/bert_with_rope.py
@@ -8,7 +8,6 @@
from transformers import PretrainedConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import (get_act_and_mul_fn,
@@ -23,7 +22,6 @@
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models import SupportsV0Only
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.utils import WeightsMapper
from vllm.sequence import IntermediateTensors
@@ -364,7 +362,6 @@ def forward(self, positions: torch.Tensor, hidden_states: torch.Tensor):
return hidden_states


@support_torch_compile
class BertWithRopeEncoder(nn.Module):

def __init__(self,
@@ -398,7 +395,7 @@ def forward(
return hidden_states


class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant):
class BertWithRope(nn.Module, SupportsQuant):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
2 changes: 1 addition & 1 deletion vllm/model_executor/models/config.py
@@ -93,7 +93,7 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None:
config.num_hidden_layers = config.n_layer

head_dim = config.hidden_size // config.num_attention_heads
rotary_emb_dim = head_dim * config.rotary_emb_fraction
rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
max_trained_positions = getattr(config, "max_trained_positions", 2048)
config.rotary_kwargs = {
"head_size": head_dim,
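The cast above matters because multiplying an integer head size by a float rotary_emb_fraction yields a float, while the rotary embedding setup presumably needs an integer dimension. A tiny illustration with made-up values (0.5 is an assumed fraction, not taken from a real config):

head_dim = 64
rotary_emb_fraction = 0.5                    # assumed value for illustration
print(head_dim * rotary_emb_fraction)        # 32.0 -> float
print(int(head_dim * rotary_emb_fraction))   # 32   -> int, usable as a dimension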
2 changes: 0 additions & 2 deletions vllm/model_executor/models/modernbert.py
@@ -8,7 +8,6 @@
from transformers import ModernBertConfig

from vllm.attention import Attention, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.linear import (QKVParallelLinear,
@@ -200,7 +199,6 @@ def forward(
return hidden_states


@support_torch_compile
class ModernBertModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={"layers.": "encoder_layer.layers."})