Commit b74d9af

remove hop_length from config
Signed-off-by: NickLucche <[email protected]>
1 parent: 3bc4879

File tree: 4 files changed (+12 lines, -13 lines)

vllm/config.py

Lines changed: 1 addition & 7 deletions
@@ -57,10 +57,10 @@
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers.configuration_utils import PretrainedConfig
 
     import vllm.model_executor.layers.quantization as me_quant
     import vllm.model_executor.models as me_models
-    from transformers.configuration_utils import PretrainedConfig
     from vllm.executor.executor_base import ExecutorBase
     from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.layers.quantization.base_config import (
@@ -4966,12 +4966,6 @@ class SpeechToTextConfig:
     16kHz audio input. The input audio will be automatically resampled to this
     rate before processing."""
 
-    hop_length: Optional[int] = None
-    """Number of samples between successive frames in the log-mel spectrogram.
-    If None, uses the model's default hop length. This affects the temporal
-    resolution of the audio features and is used for calculating prompt token
-    counts."""
-
     max_audio_clip_s: int = 30
     """Maximum duration in seconds for a single audio clip without chunking.
     Audio longer than this will be split into smaller chunks if
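
With this change SpeechToTextConfig keeps only audio-level settings such as the sample rate and the maximum clip length; the frame hop is no longer a config concern. A minimal sketch of constructing the slimmed-down config (field names and defaults taken from the context above; any other fields of the dataclass are omitted):

    from vllm.config import SpeechToTextConfig

    # Sketch only: hop_length is no longer a field here; the model's own
    # feature extractor is consulted instead (see whisper.py below).
    stt_config = SpeechToTextConfig(
        sample_rate=16_000,   # input audio is resampled to this rate
        max_audio_clip_s=30,  # longer clips are split into chunks
    )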

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 2 additions & 1 deletion
@@ -252,7 +252,8 @@ async def _speech_to_text_stream_generator(
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                     if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config):
+                            audio_duration_s, self.asr_config,
+                            self.model_config):
                         num_prompt_tokens += audio_tokens
 
                 # We need to do it here, because if there are exceptions in
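
The streaming generator now threads the ModelConfig through so the model class can consult its own preprocessor; a falsy return (None) simply skips the audio-token estimate. A rough sketch of the updated accounting, with the self.* attributes above passed in explicitly (names mirror the handler code, the helper itself is hypothetical):

    # Sketch of the updated call pattern; model_cls, asr_config and
    # model_config stand in for the corresponding self.* attributes above.
    def estimate_prompt_tokens(text_prompt_len, model_cls, audio_duration_s,
                               asr_config, model_config):
        num_prompt_tokens = text_prompt_len
        if audio_tokens := model_cls.get_num_audio_tokens(
                audio_duration_s, asr_config, model_config):
            num_prompt_tokens += audio_tokens
        return num_prompt_tokens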

vllm/model_executor/models/interfaces.py

Lines changed: 2 additions & 1 deletion
@@ -719,7 +719,8 @@ def get_speech_to_text_config(
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
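
The interface hook now also receives the ModelConfig, so implementations can look up preprocessing details themselves. A hypothetical implementer, as a sketch only (the class name and body are made up; a real model would implement the corresponding interface in interfaces.py, and Whisper's concrete version follows below):

    from typing import Optional

    from vllm.config import ModelConfig, SpeechToTextConfig

    class MyASRModel:  # hypothetical example, not part of vLLM
        @classmethod
        def get_num_audio_tokens(cls, audio_duration_s: float,
                                 stt_config: SpeechToTextConfig,
                                 model_config: ModelConfig) -> Optional[int]:
            # A real model would derive frame-rate details from its own
            # preprocessor (via model_config) rather than from stt_config.
            return None  # None means no cheap estimate is available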

vllm/model_executor/models/whisper.py

Lines changed: 7 additions & 4 deletions
@@ -8,10 +8,10 @@
 import numpy as np
 import torch
 from torch import nn
-
 from transformers import (BatchFeature, WhisperConfig, WhisperFeatureExtractor,
                           WhisperProcessor)
 from transformers.models.whisper.modeling_whisper import sinusoids
+
 from vllm.attention import Attention, AttentionType
 from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
                          VllmConfig)
@@ -816,18 +816,21 @@ def get_speech_to_text_config(cls, model_config: ModelConfig,
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
             sample_rate=processor.feature_extractor.sampling_rate,
-            hop_length=processor.feature_extractor.hop_length,
         )
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.feature_extractor.hop_length
+        assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
         # prompts directly at least not to Whisper.
         # One indicator of the encoder amount of processing
         # is the log-mel spectogram length.
         return math.ceil(audio_duration_s * stt_config.sample_rate /
-                         stt_config.hop_length)
+                         hop_length)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
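
As a quick sanity check on the new lookup: Whisper's feature extractor conventionally uses a 16 kHz sampling rate and a hop length of 160 samples (values assumed here; at runtime they come from the cached processor's feature_extractor), so a full 30-second clip maps to ceil(30 * 16000 / 160) = 3000 log-mel frames, matching what the removed stt_config.hop_length path computed:

    import math

    # Assumed Whisper feature-extractor defaults; the committed code reads
    # these from the cached processor instead of hard-coding them.
    sample_rate, hop_length = 16_000, 160
    audio_duration_s = 30.0
    num_audio_tokens = math.ceil(audio_duration_s * sample_rate / hop_length)
    print(num_audio_tokens)  # 3000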
