Commit 89e2b42

remove hop_length from config
Signed-off-by: NickLucche <[email protected]>
1 parent 703afa5 commit 89e2b42

4 files changed, +10 -11 lines changed

vllm/config.py

Lines changed: 0 additions & 6 deletions
@@ -4945,12 +4945,6 @@ class SpeechToTextConfig:
     16kHz audio input. The input audio will be automatically resampled to this
     rate before processing."""
 
-    hop_length: Optional[int] = None
-    """Number of samples between successive frames in the log-mel spectrogram.
-    If None, uses the model's default hop length. This affects the temporal
-    resolution of the audio features and is used for calculating prompt token
-    counts."""
-
     max_audio_clip_s: int = 30
     """Maximum duration in seconds for a single audio clip without chunking.
     Audio longer than this will be split into smaller chunks if
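
For reference, the trimmed config roughly looks like the sketch below. This is a minimal approximation built only from the fields visible in this diff; the real SpeechToTextConfig in vllm/config.py carries more fields and context.

from dataclasses import dataclass


# Minimal sketch of SpeechToTextConfig after hop_length is removed,
# assuming only the fields visible in this diff.
@dataclass
class SpeechToTextConfig:
    sample_rate: int = 16_000
    """Sample rate (Hz) the model expects; input audio is resampled to this
    rate before processing."""

    max_audio_clip_s: int = 30
    """Maximum duration in seconds for a single audio clip without chunking."""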

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 2 additions & 1 deletion
@@ -252,7 +252,8 @@ async def _speech_to_text_stream_generator(
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                     if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config):
+                            audio_duration_s, self.asr_config,
+                            self.model_config):
                         num_prompt_tokens += audio_tokens
 
                 # We need to do it here, because if there are exceptions in
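
The call site now threads the model config through to the model class. A minimal sketch of that pattern follows; model_cls, asr_config and model_config are hypothetical stand-ins here, not the exact attribute set of the serving class.

def count_prompt_tokens(prompt_token_ids, audio_duration_s,
                        model_cls, asr_config, model_config):
    """Hypothetical helper mirroring the updated call pattern."""
    num_prompt_tokens = 0
    if prompt_token_ids is not None:
        num_prompt_tokens = len(prompt_token_ids)
        # model_config is now passed along so the model can look up
        # feature-extractor details (such as hop_length) itself.
        if audio_tokens := model_cls.get_num_audio_tokens(
                audio_duration_s, asr_config, model_config):
            num_prompt_tokens += audio_tokens
    return num_prompt_tokens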

vllm/model_executor/models/interfaces.py

Lines changed: 2 additions & 1 deletion
@@ -662,7 +662,8 @@ def get_speech_to_text_config(
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
        """
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
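
A toy implementer of the widened classmethod could look like the following. This is illustrative only: the fixed tokens-per-second constant is made up for the sketch and is not how any real vLLM model computes this.

import math
from typing import Optional


class ToyASRModel:
    """Illustrative-only implementer of the updated interface."""

    # Made-up constant for the sketch; real models derive this from their
    # feature extractor (see the Whisper change below).
    _TOKENS_PER_SECOND = 50

    @classmethod
    def get_num_audio_tokens(cls, audio_duration_s, stt_config,
                             model_config) -> Optional[int]:
        # Estimate the audio token count without running a forward pass.
        return math.ceil(audio_duration_s * cls._TOKENS_PER_SECOND)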

vllm/model_executor/models/whisper.py

Lines changed: 6 additions & 3 deletions
@@ -816,18 +816,21 @@ def get_speech_to_text_config(cls, model_config: ModelConfig,
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
             sample_rate=processor.feature_extractor.sampling_rate,
-            hop_length=processor.feature_extractor.hop_length,
         )
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.feature_extractor.hop_length
+        assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
         # prompts directly at least not to Whisper.
         # One indicator of the encoder amount of processing
         # is the log-mel spectogram length.
         return math.ceil(audio_duration_s * stt_config.sample_rate /
-                         stt_config.hop_length)
+                         hop_length)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
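
As a rough sanity check of the formula, assuming the usual Whisper feature-extractor defaults of 16 kHz sampling and a hop length of 160 samples (in the real code both values come from the cached HF processor at runtime, not from hard-coded constants):

import math

sample_rate = 16_000      # Hz, Whisper's expected input rate
hop_length = 160          # samples between successive log-mel frames
audio_duration_s = 30.0   # one full, unchunked clip

# Same arithmetic as the return statement above.
num_audio_tokens = math.ceil(audio_duration_s * sample_rate / hop_length)
print(num_audio_tokens)   # 3000 frames for a 30 s clip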
