
Commit 1bb9e35

Manual fixes

Signed-off-by: Harry Mellor <[email protected]>

1 parent: 9e4a464

File tree: 30 files changed (+145, -116 lines)

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ repos:
   rev: v0.11.7
   hooks:
   - id: ruff
-    args: [--output-format, github, --fix]
+    # args: [--output-format, github, --fix]
   - id: ruff-format
     files: |
       (?x)^(

tests/compile/piecewise/test_full_cudagraph.py

Lines changed: 2 additions & 1 deletion

@@ -159,7 +159,8 @@ def test_full_cudagraph_with_invalid_backend():
         temporary_environ(
             {
                 "VLLM_USE_V1": "1",
-                "VLLM_FLASH_ATTN_VERSION": "2",  # FA2 not supported with full_cuda_graph
+                # FA2 not supported with full_cuda_graph
+                "VLLM_FLASH_ATTN_VERSION": "2",
             }
         ),
         pytest.raises(RuntimeError),

tests/compile/piecewise/test_simple.py

Lines changed: 10 additions & 5 deletions

@@ -100,11 +100,16 @@ def test_simple_piecewise_compile(use_inductor):
 
     with (
         compilation_counter.expect(
-            num_graphs_seen=1,  # one graph for the model
-            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_captured=6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+            # one graph for the model
+            num_graphs_seen=1,
+            # 2 * num_layers + 1
+            num_piecewise_graphs_seen=5,
+            # 1 + num_layers
+            num_piecewise_capturable_graphs_seen=3,
+            # num_piecewise_capturable_graphs_seen
+            num_backend_compilations=3,
+            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+            num_cudagraph_captured=6,
         ),
         set_forward_context({}, vllm_config=vllm_config),
     ):
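
A note on the pattern, since it repeats through the rest of the commit: ruff-format will not reflow a trailing comment, so any line it pushes past the length limit keeps failing the line-length check. Moving the comment onto its own line above the value it annotates fixes both. A minimal runnable sketch with a hypothetical stand-in for compilation_counter.expect:

def expect(**counts):
    # stand-in for vLLM's compilation_counter.expect, for illustration only
    return counts

expected = expect(
    # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    num_cudagraph_captured=6,
)
assert expected["num_cudagraph_captured"] == 6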

tests/compile/piecewise/test_toy_llama.py

Lines changed: 17 additions & 18 deletions

@@ -361,11 +361,14 @@ def test_toy_llama(use_inductor: bool):
     kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
 
     with compilation_counter.expect(
-        num_graphs_seen=1,  # one graph for the model
+        # one graph for the model
+        num_graphs_seen=1,
         num_piecewise_graphs_seen=1,
         num_piecewise_capturable_graphs_seen=1,
-        num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=2,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # num_piecewise_capturable_graphs_seen
+        num_backend_compilations=1,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_cudagraph_captured=2,
         **kwargs,
     ):
         outputs.append(
@@ -374,16 +377,16 @@ def test_toy_llama(use_inductor: bool):
         run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
 
     with compilation_counter.expect(
-        num_graphs_seen=1,  # one graph for the model
-        num_piecewise_graphs_seen=2 * llama_config.num_layers + 1,  # 2 * num_layers + 1
-        num_piecewise_capturable_graphs_seen=1
-        + llama_config.num_layers,  # 1 + num_layers
-        num_backend_compilations=1
-        + llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=2
-        * (
-            1 + llama_config.num_layers
-        ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # one graph for the model
+        num_graphs_seen=1,
+        # 2 * num_layers + 1
+        num_piecewise_graphs_seen=2 * llama_config.num_layers + 1,
+        # 1 + num_layers
+        num_piecewise_capturable_graphs_seen=1 + llama_config.num_layers,
+        # num_piecewise_capturable_graphs_seen
+        num_backend_compilations=1 + llama_config.num_layers,
+        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_cudagraph_captured=2 * (1 + llama_config.num_layers),
     ):
         outputs.append(
             run_model(
@@ -470,11 +473,7 @@ def benchmark():
             # and use it later, because it will look up the name `b` in the
             # enclosing scope, and the value of `b` will always be 256.
             # it is fine here, because we only use the lambda function once.
-            runtime = do_bench(
-                lambda: graphs[b][0](  # noqa
-                    input_ids[:b], positions[:b]
-                )
-            )  # noqa
+            runtime = do_bench(lambda: graphs[b][0](input_ids[:b], positions[:b]))  # noqa
             piecewise_cudagraph_time[b] = runtime
         else:
             runtime = do_bench(lambda: graphs[b][0].replay())  # noqa
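
The benchmark hunk keeps the original warning comment about late binding: a lambda captures the name `b`, not its value at definition time. A self-contained sketch of the caveat (not from the test file):

fns = [lambda: b for b in range(3)]
assert [f() for f in fns] == [2, 2, 2]  # every closure sees the final b

fns = [lambda b=b: b for b in range(3)]  # default argument binds at definition
assert [f() for f in fns] == [0, 1, 2]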

tests/core/test_scheduler.py

Lines changed: 19 additions & 18 deletions

@@ -1109,8 +1109,9 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler._add_seq_group_to_swapped(seq_group)
 
     scheduler._schedule_swapped(budget, curr_loras)
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    computed_blocks_tracker = scheduler.block_manager._computed_blocks_tracker
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1139,15 +1140,15 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_prefills(budget, curr_loras)
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
     )
     assert seq_id_to_num_tokens_computed is None
 
     # Priority preemption schedule
     scheduler._schedule_priority_preemption(budget)
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1187,8 +1188,8 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1223,8 +1224,8 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        2
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(2)
    )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1261,8 +1262,8 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1289,8 +1290,8 @@ def test_remove_seq_from_computed_blocks_tracker():
     for _, seq_group in seq_and_seq_groups:
         scheduler.add_seq_group(seq_group)
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        0
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(0)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1323,8 +1324,8 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        0
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(0)
     )
     assert seq_id_to_num_tokens_computed is None
 
@@ -1357,7 +1358,7 @@ def test_remove_seq_from_computed_blocks_tracker():
     scheduler.add_seq_group(seq_group)
 
     scheduler._schedule_default()
-    seq_id_to_num_tokens_computed = scheduler.block_manager._computed_blocks_tracker._seq_id_to_num_tokens_computed.get(
-        1
+    seq_id_to_num_tokens_computed = (
+        computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
    )
    assert seq_id_to_num_tokens_computed is None
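
The refactor here binds the long attribute chain to a local name in the first hunk, and every later assertion reuses it, wrapping the call in parentheses so the expression can break across lines without backslashes. A self-contained sketch with stand-in classes rather than vLLM's scheduler:

class _Tracker:
    def __init__(self):
        self._seq_id_to_num_tokens_computed = {}

class _BlockManager:
    def __init__(self):
        self._computed_blocks_tracker = _Tracker()

block_manager = _BlockManager()

computed_blocks_tracker = block_manager._computed_blocks_tracker
seq_id_to_num_tokens_computed = (
    computed_blocks_tracker._seq_id_to_num_tokens_computed.get(1)
)
assert seq_id_to_num_tokens_computed is None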

tests/entrypoints/conftest.py

Lines changed: 6 additions & 3 deletions

@@ -67,13 +67,15 @@ def sample_complex_json_schema():
         "type": "object",
         "properties": {
             "score": {
+                # Numeric range
                 "type": "integer",
                 "minimum": 0,
-                "maximum": 100,  # Numeric range
+                "maximum": 100,
             },
             "grade": {
+                # Regex pattern
                 "type": "string",
-                "pattern": "^[A-D]$",  # Regex pattern
+                "pattern": "^[A-D]$",
             },
             "email": {
                 "type": "string",
@@ -82,8 +84,9 @@ def sample_complex_json_schema():
             "tags": {
                 "type": "array",
                 "items": {
+                    # Combining length and pattern restrictions
                     "type": "string",
-                    "pattern": "^[a-z]{1,10}$",  # Combining length and pattern restrictions
+                    "pattern": "^[a-z]{1,10}$",
                },
            },
        },

tests/entrypoints/openai/test_audio.py

Lines changed: 1 addition & 1 deletion

@@ -143,7 +143,7 @@ async def test_single_chat_session_audio_base64encoded(
                 {
                     "type": "audio_url",
                     "audio_url": {
-                        "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
+                        "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"  # noqa: E501
                     },
                 },
                 {"type": "text", "text": "What's happening in this audio?"},

tests/entrypoints/openai/test_chat.py

Lines changed: 14 additions & 8 deletions

@@ -38,8 +38,8 @@ def server(
     request,
     monkeypatch_module,
     zephyr_lora_files,  # noqa: F811
-    zephyr_lora_added_tokens_files,
-):  # noqa: F811
+    zephyr_lora_added_tokens_files,  # noqa: F811
+):
     use_v1 = request.param
     monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 
@@ -713,12 +713,14 @@ async def test_required_tool_use(
                     "properties": {
                         "city": {
                             "type": "string",
-                            "description": "The city to find the weather for, e.g. 'Vienna'",
+                            "description": "The city to find the weather for, e.g. "
+                            "'Vienna'",
                             "default": "Vienna",
                         },
                         "country": {
                             "type": "string",
-                            "description": "The country that the city is in, e.g. 'Austria'",
+                            "description": "The country that the city is in, e.g. "
+                            "'Austria'",
                         },
                         "unit": {
                             "type": "string",
@@ -740,16 +742,19 @@ async def test_required_tool_use(
                     "properties": {
                         "city": {
                             "type": "string",
-                            "description": "The city to get the forecast for, e.g. 'Vienna'",
+                            "description": "The city to get the forecast for, e.g. "
+                            "'Vienna'",
                             "default": "Vienna",
                         },
                         "country": {
                             "type": "string",
-                            "description": "The country that the city is in, e.g. 'Austria'",
+                            "description": "The country that the city is in, e.g. "
+                            "'Austria'",
                         },
                         "days": {
                             "type": "integer",
-                            "description": "Number of days to get the forecast for (1-7)",
+                            "description": "Number of days to get the forecast for "
+                            "(1-7)",
                         },
                         "unit": {
                             "type": "string",
@@ -957,7 +962,8 @@ async def test_complex_message_content(client: openai.AsyncOpenAI):
             "content": [
                 {
                     "type": "text",
-                    "text": "what is 1+1? please provide the result without any other text.",
+                    "text": "what is 1+1? please provide the result without any "
+                    "other text.",
                }
            ],
        }
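
The long description and text values above are split with implicit concatenation: adjacent string literals are joined at compile time, so no "+" or backslash is needed, and the surrounding brackets allow the line break. A quick check:

description = (
    "The city to find the weather for, e.g. "
    "'Vienna'"
)
assert description == "The city to find the weather for, e.g. 'Vienna'"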

tests/entrypoints/openai/test_chat_template.py

Lines changed: 2 additions & 3 deletions

@@ -75,9 +75,8 @@ def test_load_chat_template():
     # Hard coded value for template_chatml.jinja
     assert (
         template_content
-        == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
-{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
-    )  # noqa: E501
+        == "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"  # noqa: E501
+    )
 
 
 def test_no_load_chat_template_filelike():
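
Two things happen in this hunk: the triple-quoted literal collapses into a single long string, and the # noqa: E501 moves from the closing parenthesis onto the long line itself. The move is what makes the suppression take effect, since noqa applies only to the line that carries it; on the ")" line it silenced nothing. A minimal sketch with a shortened stand-in template:

template = "{% for m in messages %}{{ m['content'] }}{% endfor %}"  # noqa: E501
assert template.endswith("{% endfor %}")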

tests/entrypoints/openai/test_chat_with_tool_reasoning.py

Lines changed: 6 additions & 3 deletions

@@ -45,11 +45,13 @@ async def client(server):
             "properties": {
                 "city": {
                     "type": "string",
-                    "description": "The city to find the weather for, e.g. 'San Francisco'",
+                    "description": "The city to find the weather for, e.g. "
+                    "'San Francisco'",
                 },
                 "state": {
                     "type": "string",
-                    "description": "the two-letter abbreviation for the state that the city is"
+                    "description": "the two-letter abbreviation for the state that "
+                    "the city is"
                     " in, e.g. 'CA' which would mean 'California'",
                 },
                 "unit": {
@@ -69,7 +71,8 @@ async def client(server):
     {"role": "assistant", "content": "I'm doing well! How can I help you?"},
     {
         "role": "user",
-        "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
+        "content": "Can you tell me what the temperate will be in Dallas, in "
+        "fahrenheit?",
    },
 ]
 