
v1: Add Whisper model support (encoder-decoder) #21088

Open · wants to merge 12 commits into main
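This PR enables Whisper-style encoder-decoder models on the V1 engine, exercised in CI through `offline_inference/encoder_decoder_multimodal.py --model-type whisper`. As a rough sketch of the offline transcription flow (the model name, decoder start token, and audio asset below are illustrative, not taken from this diff):

```python
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

# Whisper is encoder-decoder: the audio clip feeds the encoder,
# while the decoder is primed with the transcription start token.
llm = LLM(model="openai/whisper-large-v3", max_model_len=448)

audio, sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
prompt = {
    "prompt": "<|startoftranscript|>",
    "multi_modal_data": {"audio": (audio, sample_rate)},
}

outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=200))
print(outputs[0].outputs[0].text)
```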
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -297,7 +297,6 @@ steps:
- python3 offline_inference/vision_language_pooling.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
@@ -500,7 +499,7 @@ steps:
- vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder
- pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
mirror_hardwares: [amdexperimental]
2 changes: 2 additions & 0 deletions examples/offline_inference/encoder_decoder.py
@@ -3,6 +3,8 @@
"""
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART

NOTE: This example is not yet supported in V1.
"""

from vllm import LLM, SamplingParams
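For context, the BART flow this example demonstrates (and which remains V0-only per the note above) uses vLLM's explicit encoder/decoder prompt format. A minimal sketch, with placeholder input text:

```python
from vllm import LLM, SamplingParams

# BART (text-to-text encoder/decoder) still requires the V0 engine,
# e.g. run with VLLM_USE_V1=0 as in the CI pipeline above.
llm = LLM(model="facebook/bart-large-cnn")

prompt = {
    "encoder_prompt": "An article to summarize goes here ...",
    "decoder_prompt": "",
}
outputs = llm.generate(prompt, SamplingParams(temperature=0, max_tokens=64))
print(outputs[0].outputs[0].text)
```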
1 change: 1 addition & 0 deletions tests/encoder_decoder/test_e2e_correctness.py
@@ -63,6 +63,7 @@ def clear_cache():
current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models"
)
@pytest.mark.skip(reason="bart not supported in V1")
def test_encoder_decoder_e2e(
hf_runner,
vllm_runner,
@@ -40,6 +40,9 @@ async def transcribe_audio(client, tokenizer, y, sr):
model=tokenizer.name_or_path,
language="en",
temperature=0.0,
# 5 minutes
# The default of 5 seconds is too aggressive in some cases.
timeout=300,
)
end_time = time.perf_counter()
# NOTE there's no streaming in transcriptions, can't measure ttft
@@ -49,8 +52,7 @@ async def transcribe_audio(client, tokenizer, y, sr):
return latency, num_output_tokens, transcription.text


async def bound_transcribe(model_name, sem, client, audio, reference):
tokenizer = AutoTokenizer.from_pretrained(model_name)
async def bound_transcribe(sem, client, tokenizer, audio, reference):
# Use semaphore to limit concurrent requests.
async with sem:
result = await transcribe_audio(client, tokenizer, *audio)
@@ -63,15 +65,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference):
async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)

# Load tokenizer once outside the loop
tokenizer = AutoTokenizer.from_pretrained(model)

# Warmup call as the first `librosa.load` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "")
_ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")

tasks: list[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
bound_transcribe(sem, client, tokenizer, (audio, sr),
sample["text"]))
tasks.append(task)
return await asyncio.gather(*tasks)

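For reference, `transcribe_audio` above drives the OpenAI-compatible transcriptions endpoint. A sketch of the underlying call, where the WAV-serialization helper and file naming are assumptions rather than this file's actual code:

```python
import io

import soundfile as sf
from openai import AsyncOpenAI


async def transcribe_once(client: AsyncOpenAI, model: str, y, sr) -> str:
    # Serialize the in-memory waveform to WAV bytes; the endpoint expects a file upload.
    buf = io.BytesIO()
    sf.write(buf, y, sr, format="WAV")
    buf.seek(0)
    resp = await client.audio.transcriptions.create(
        file=("audio.wav", buf),
        model=model,
        language="en",
        temperature=0.0,
        timeout=300,  # generous per-request timeout, mirroring the change above
    )
    return resp.text
```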
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_encoder_decoder.py
@@ -30,6 +30,7 @@ async def client(server):

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="bart is not yet supported in V1")
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
2 changes: 2 additions & 0 deletions tests/models/language/generation/test_bart.py
@@ -178,6 +178,7 @@ def run_test(
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
@pytest.mark.skip(reason="bart not supported in V1")
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
dtype, max_tokens, num_logprobs, decoder_prompt_type) -> None:

@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
@pytest.mark.skip(reason="bart not supported in V1")
def test_models_distributed(hf_runner, vllm_runner,
example_encoder_decoder_prompts,
distributed_executor_backend, model, dtype,
1 change: 0 additions & 1 deletion tests/v1/test_oracle.py
@@ -10,7 +10,6 @@
from vllm.engine.async_llm_engine import AsyncLLMEngine

UNSUPPORTED_MODELS_V1 = [
"openai/whisper-large-v3", # transcription
"facebook/bart-large-cnn", # encoder decoder
"state-spaces/mamba-130m-hf", # mamba1
]