Commit 89e2b42

remove hop_length from config
Signed-off-by: NickLucche <[email protected]>
1 parent 703afa5 commit 89e2b42

4 files changed, +10 -11 lines changed

vllm/config.py

Lines changed: 0 additions & 6 deletions
@@ -4945,12 +4945,6 @@ class SpeechToTextConfig:
     16kHz audio input. The input audio will be automatically resampled to this
     rate before processing."""
 
-    hop_length: Optional[int] = None
-    """Number of samples between successive frames in the log-mel spectrogram.
-    If None, uses the model's default hop length. This affects the temporal
-    resolution of the audio features and is used for calculating prompt token
-    counts."""
-
     max_audio_clip_s: int = 30
     """Maximum duration in seconds for a single audio clip without chunking.
     Audio longer than this will be split into smaller chunks if
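
For reference, the trimmed config roughly looks like the sketch below. This is a minimal approximation built only from the fields visible in this diff; the real SpeechToTextConfig in vllm/config.py carries more fields and context.

from dataclasses import dataclass


# Minimal sketch of SpeechToTextConfig after hop_length is removed,
# assuming only the fields visible in this diff.
@dataclass
class SpeechToTextConfig:
    sample_rate: int = 16_000
    """Sample rate (Hz) the model expects; input audio is resampled to this
    rate before processing."""

    max_audio_clip_s: int = 30
    """Maximum duration in seconds for a single audio clip without chunking."""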

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 2 additions & 1 deletion
@@ -252,7 +252,8 @@ async def _speech_to_text_stream_generator(
                 if res.prompt_token_ids is not None:
                     num_prompt_tokens = len(res.prompt_token_ids)
                     if audio_tokens := self.model_cls.get_num_audio_tokens(
-                            audio_duration_s, self.asr_config):
+                            audio_duration_s, self.asr_config,
+                            self.model_config):
                         num_prompt_tokens += audio_tokens
 
                 # We need to do it here, because if there are exceptions in
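
The call site now threads the model config through to the model class. A minimal sketch of that pattern follows; model_cls, asr_config and model_config are hypothetical stand-ins here, not the exact attribute set of the serving class.

def count_prompt_tokens(prompt_token_ids, audio_duration_s,
                        model_cls, asr_config, model_config):
    """Hypothetical helper mirroring the updated call pattern."""
    num_prompt_tokens = 0
    if prompt_token_ids is not None:
        num_prompt_tokens = len(prompt_token_ids)
        # model_config is now passed along so the model can look up
        # feature-extractor details (such as hop_length) itself.
        if audio_tokens := model_cls.get_num_audio_tokens(
                audio_duration_s, asr_config, model_config):
            num_prompt_tokens += audio_tokens
    return num_prompt_tokens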

vllm/model_executor/models/interfaces.py

Lines changed: 2 additions & 1 deletion
@@ -662,7 +662,8 @@ def get_speech_to_text_config(
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
        """
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
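
A toy implementer of the widened classmethod could look like the following. This is illustrative only: the fixed tokens-per-second constant is made up for the sketch and is not how any real vLLM model computes this.

import math
from typing import Optional


class ToyASRModel:
    """Illustrative-only implementer of the updated interface."""

    # Made-up constant for the sketch; real models derive this from their
    # feature extractor (see the Whisper change below).
    _TOKENS_PER_SECOND = 50

    @classmethod
    def get_num_audio_tokens(cls, audio_duration_s, stt_config,
                             model_config) -> Optional[int]:
        # Estimate the audio token count without running a forward pass.
        return math.ceil(audio_duration_s * cls._TOKENS_PER_SECOND)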

vllm/model_executor/models/whisper.py

Lines changed: 6 additions & 3 deletions
@@ -816,18 +816,21 @@ def get_speech_to_text_config(cls, model_config: ModelConfig,
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
             sample_rate=processor.feature_extractor.sampling_rate,
-            hop_length=processor.feature_extractor.hop_length,
         )
 
     @classmethod
     def get_num_audio_tokens(cls, audio_duration_s: float,
-                             stt_config: SpeechToTextConfig) -> Optional[int]:
+                             stt_config: SpeechToTextConfig,
+                             model_config: ModelConfig) -> Optional[int]:
+        processor = cached_get_processor(model_config.model)
+        hop_length = processor.feature_extractor.hop_length
+        assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder
         # prompts directly at least not to Whisper.
         # One indicator of the encoder amount of processing
         # is the log-mel spectogram length.
         return math.ceil(audio_duration_s * stt_config.sample_rate /
-                         stt_config.hop_length)
+                         hop_length)
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
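
As a rough sanity check of the formula, assuming the usual Whisper feature-extractor defaults of 16 kHz sampling and a hop length of 160 samples (in the real code both values come from the cached HF processor at runtime, not from hard-coded constants):

import math

sample_rate = 16_000      # Hz, Whisper's expected input rate
hop_length = 160          # samples between successive log-mel frames
audio_duration_s = 30.0   # one full, unchunked clip

# Same arithmetic as the return statement above.
num_audio_tokens = math.ceil(audio_duration_s * sample_rate / hop_length)
print(num_audio_tokens)   # 3000 frames for a 30 s clip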
