Commit b74d9af

remove hop_length from config
Signed-off-by: NickLucche <[email protected]>
1 parent: 3bc4879

File tree: 4 files changed (+12 lines, -13 lines)

vllm/config.py

Lines changed: 1 addition & 7 deletions
@@ -57,10 +57,10 @@
 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
     from ray.util.placement_group import PlacementGroup
+    from transformers.configuration_utils import PretrainedConfig
 
     import vllm.model_executor.layers.quantization as me_quant
     import vllm.model_executor.models as me_models
-    from transformers.configuration_utils import PretrainedConfig
     from vllm.executor.executor_base import ExecutorBase
     from vllm.model_executor.layers.quantization import QuantizationMethods
     from vllm.model_executor.layers.quantization.base_config import (
@@ -4966,12 +4966,6 @@ class SpeechToTextConfig:
     16kHz audio input. The input audio will be automatically resampled to this
     rate before processing."""
 
-    hop_length: Optional[int] = None
-    """Number of samples between successive frames in the log-mel spectrogram.
-    If None, uses the model's default hop length. This affects the temporal
-    resolution of the audio features and is used for calculating prompt token
-    counts."""
-
     max_audio_clip_s: int = 30
     """Maximum duration in seconds for a single audio clip without chunking.
     Audio longer than this will be split into smaller chunks if
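
With this change SpeechToTextConfig keeps only audio-level settings such as the sample rate and the maximum clip length; the frame hop is no longer a config concern. A minimal sketch of constructing the slimmed-down config (field names and defaults taken from the context above; any other fields of the dataclass are omitted):

    from vllm.config import SpeechToTextConfig

    # Sketch only: hop_length is no longer a field here; the model's own
    # feature extractor is consulted instead (see whisper.py below).
    stt_config = SpeechToTextConfig(
        sample_rate=16_000,   # input audio is resampled to this rate
        max_audio_clip_s=30,  # longer clips are split into chunks
    )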

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 2 additions & 1 deletion
@@ -252,7 +252,8 @@ async def _speech_to_text_stream_generator(
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                     if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config):
+                            audio_duration_s, self.asr_config,
+                            self.model_config):
                         num_prompt_tokens += audio_tokens
 
                 # We need to do it here, because if there are exceptions in
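
The streaming generator now threads the ModelConfig through so the model class can consult its own preprocessor; a falsy return (None) simply skips the audio-token estimate. A rough sketch of the updated accounting, with the self.* attributes above passed in explicitly (names mirror the handler code, the helper itself is hypothetical):

    # Sketch of the updated call pattern; model_cls, asr_config and
    # model_config stand in for the corresponding self.* attributes above.
    def estimate_prompt_tokens(text_prompt_len, model_cls, audio_duration_s,
                               asr_config, model_config):
        num_prompt_tokens = text_prompt_len
        if audio_tokens := model_cls.get_num_audio_tokens(
                audio_duration_s, asr_config, model_config):
            num_prompt_tokens += audio_tokens
        return num_prompt_tokens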

vllm/model_executor/models/interfaces.py

Lines changed: 2 additions & 1 deletion
@@ -719,7 +719,8 @@ def get_speech_to_text_config(
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
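
The interface hook now also receives the ModelConfig, so implementations can look up preprocessing details themselves. A hypothetical implementer, as a sketch only (the class name and body are made up; a real model would implement the corresponding interface in interfaces.py, and Whisper's concrete version follows below):

    from typing import Optional

    from vllm.config import ModelConfig, SpeechToTextConfig

    class MyASRModel:  # hypothetical example, not part of vLLM
        @classmethod
        def get_num_audio_tokens(cls, audio_duration_s: float,
                                 stt_config: SpeechToTextConfig,
                                 model_config: ModelConfig) -> Optional[int]:
            # A real model would derive frame-rate details from its own
            # preprocessor (via model_config) rather than from stt_config.
            return None  # None means no cheap estimate is available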

vllm/model_executor/models/whisper.py

Lines changed: 7 additions & 4 deletions
@@ -8,10 +8,10 @@
 import numpy as np
 import torch
 from torch import nn
-
 from transformers import (BatchFeature, WhisperConfig, WhisperFeatureExtractor,
                           WhisperProcessor)
 from transformers.models.whisper.modeling_whisper import sinusoids
+
 from vllm.attention import Attention, AttentionType
 from vllm.config import (CacheConfig, ModelConfig, SpeechToTextConfig,
                          VllmConfig)
@@ -816,18 +816,21 @@ def get_speech_to_text_config(cls, model_config: ModelConfig,
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
             sample_rate=processor.feature_extractor.sampling_rate,
-            hop_length=processor.feature_extractor.hop_length,
         )
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.feature_extractor.hop_length
+        assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
         # prompts directly at least not to Whisper.
         # One indicator of the encoder amount of processing
         # is the log-mel spectogram length.
         return math.ceil(audio_duration_s * stt_config.sample_rate /
-                         stt_config.hop_length)
+                         hop_length)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
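
As a quick sanity check on the new lookup: Whisper's feature extractor conventionally uses a 16 kHz sampling rate and a hop length of 160 samples (values assumed here; at runtime they come from the cached processor's feature_extractor), so a full 30-second clip maps to ceil(30 * 16000 / 160) = 3000 log-mel frames, matching what the removed stt_config.hop_length path computed:

    import math

    # Assumed Whisper feature-extractor defaults; the committed code reads
    # these from the cached processor instead of hard-coding them.
    sample_rate, hop_length = 16_000, 160
    audio_duration_s = 30.0
    num_audio_tokens = math.ceil(audio_duration_s * sample_rate / hop_length)
    print(num_audio_tokens)  # 3000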
