[Doc] Add engine args back in to the docs #20674


Merged · 18 commits · Jul 10, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -146,6 +146,7 @@ venv.bak/

# mkdocs documentation
/site
docs/argparse
docs/examples

# mypy
15 changes: 10 additions & 5 deletions docs/configuration/engine_args.md
@@ -1,15 +1,20 @@
---
toc_depth: 3
---

# Engine Arguments

Engine arguments control the behavior of the vLLM engine.

- For [offline inference](../serving/offline_inference.md), they are part of the arguments to [LLM][vllm.LLM] class.
- For [online serving](../serving/openai_compatible_server.md), they are part of the arguments to `vllm serve`.

You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.
The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.

## `EngineArgs`

However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented.
--8<-- "docs/argparse/engine_args.md"

For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config.
## `AsyncEngineArgs`

!!! note
Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. These can be found by running `vllm serve --help`
--8<-- "docs/argparse/async_engine_args.md"
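
As background for the rewritten page: the same engine arguments back both entry points, so anything listed in the generated `EngineArgs` section can be set either as a keyword argument to [LLM][vllm.LLM] or as a flag to `vllm serve`. A minimal sketch (the model name and values below are illustrative, not taken from this PR):

```python
# Offline inference: engine arguments are keyword arguments to LLM.
# Model name and values are illustrative.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    max_model_len=2048,
    gpu_memory_utilization=0.8,
)

# Online serving: the same arguments become CLI flags, e.g.
#   vllm serve facebook/opt-125m --max-model-len 2048 --gpu-memory-utilization 0.8
```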
105 changes: 105 additions & 0 deletions docs/mkdocs/hooks/generate_argparse.py
@@ -0,0 +1,105 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import sys
from argparse import SUPPRESS, HelpFormatter
from pathlib import Path
from typing import Literal
from unittest.mock import MagicMock, patch

ROOT_DIR = Path(__file__).parent.parent.parent.parent
ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"

sys.path.insert(0, str(ROOT_DIR))
sys.modules["aiohttp"] = MagicMock()
sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock()

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
from vllm.utils import FlexibleArgumentParser # noqa: E402

logger = logging.getLogger("mkdocs")


class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups."""

def __init__(self, prog):
super().__init__(prog,
max_help_position=float('inf'),
width=float('inf'))
self._markdown_output = []

def start_section(self, heading):
if heading not in {"positional arguments", "options"}:
self._markdown_output.append(f"\n### {heading}\n\n")

def end_section(self):
pass

def add_text(self, text):
if text:
self._markdown_output.append(f"{text.strip()}\n\n")

def add_usage(self, usage, actions, groups, prefix=None):
pass

def add_arguments(self, actions):
for action in actions:

option_strings = f'`{"`, `".join(action.option_strings)}`'
self._markdown_output.append(f"#### {option_strings}\n\n")

if choices := action.choices:
choices = f'`{"`, `".join(str(c) for c in choices)}`'
self._markdown_output.append(
f"Possible choices: {choices}\n\n")

self._markdown_output.append(f"{action.help}\n\n")

if (default := action.default) != SUPPRESS:
self._markdown_output.append(f"Default: `{default}`\n\n")

def format_help(self):
"""Return the formatted help as markdown."""
return "".join(self._markdown_output)


def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
"""Create a parser for the given class with markdown formatting.

Args:
cls: The class to create a parser for
**kwargs: Additional keyword arguments to pass to `cls.add_cli_args`.

Returns:
FlexibleArgumentParser: A parser with markdown formatting for the class.
"""
parser = FlexibleArgumentParser()
parser.formatter_class = MarkdownFormatter
with patch("vllm.config.DeviceConfig.__post_init__"):
return cls.add_cli_args(parser, **kwargs)


def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
logger.info("Generating argparse documentation")
logger.debug("Root directory: %s", ROOT_DIR.resolve())
logger.debug("Output directory: %s", ARGPARSE_DOC_DIR.resolve())

# Create the ARGPARSE_DOC_DIR if it doesn't exist
if not ARGPARSE_DOC_DIR.exists():
ARGPARSE_DOC_DIR.mkdir(parents=True)

# Create parsers to document
parsers = {
"engine_args": create_parser(EngineArgs),
"async_engine_args": create_parser(AsyncEngineArgs,
async_args_only=True),
}

# Generate documentation for each parser
for stem, parser in parsers.items():
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
with open(doc_path, "w") as f:
f.write(parser.format_help())
logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
2 changes: 1 addition & 1 deletion docs/mkdocs/hooks/generate_examples.py
@@ -161,8 +161,8 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
for example in sorted(examples, key=lambda e: e.path.stem):
example_name = f"{example.path.stem}.md"
doc_path = EXAMPLE_DOC_DIR / example.category / example_name
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
if not doc_path.parent.exists():
doc_path.parent.mkdir(parents=True)
with open(doc_path, "w+") as f:
f.write(example.generate())
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
21 changes: 21 additions & 0 deletions docs/mkdocs/overrides/partials/toc-item.html
@@ -0,0 +1,21 @@
<!-- Enables the use of toc_depth in document frontmatter https://github.com/squidfunk/mkdocs-material/issues/4827#issuecomment-1869812019 -->
<li class="md-nav__item">
<a href="{{ toc_item.url }}" class="md-nav__link">
<span class="md-ellipsis">
{{ toc_item.title }}
</span>
</a>

<!-- Table of contents list -->
{% if toc_item.children %}
<nav class="md-nav" aria-label="{{ toc_item.title | striptags }}">
<ul class="md-nav__list">
{% for toc_item in toc_item.children %}
{% if not page.meta.toc_depth or toc_item.level <= page.meta.toc_depth %}
{% include "partials/toc-item.html" %}
{% endif %}
{% endfor %}
</ul>
</nav>
{% endif %}
</li>
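
Together with the `toc_depth: 3` frontmatter added to `engine_args.md` above, this override caps the rendered table of contents at the `###` group headings, so the generated `####` heading for each individual argument does not flood the page navigation.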
2 changes: 2 additions & 0 deletions mkdocs.yaml
@@ -3,6 +3,7 @@ site_url: https://docs.vllm.ai
repo_url: https://github.com/vllm-project/vllm
edit_uri: edit/main/docs/
exclude_docs: |
argparse
*.inc.md
*.template.md
theme:
@@ -47,6 +48,7 @@ theme:
hooks:
- docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/url_schemes.py

# Required to stop api-autonav from raising an error
15 changes: 15 additions & 0 deletions requirements/docs.txt
@@ -7,3 +7,18 @@ mkdocs-awesome-nav
python-markdown-math
regex
ruff

# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
cachetools
cloudpickle
fastapi
msgspec
openai
pillow
psutil
pybase64
pydantic
torch
transformers
zmq
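
The `-f https://download.pytorch.org/whl/cpu` line points pip at the PyTorch CPU wheel index, so the `torch` dependency, needed only so the argparse hook can import `vllm.engine.arg_utils`, installs without pulling CUDA wheels into the docs build.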
42 changes: 26 additions & 16 deletions vllm/engine/arg_utils.py
@@ -12,8 +12,9 @@
import warnings
from dataclasses import MISSING, dataclass, fields, is_dataclass
from itertools import permutations
from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
Type, TypeVar, Union, cast, get_args, get_origin)
from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
Literal, Optional, Type, TypeVar, Union, cast, get_args,
get_origin)

import regex as re
import torch
@@ -33,20 +34,26 @@
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerMode, TokenizerPoolConfig,
VllmConfig, get_attr_docs, get_field)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.platforms import CpuArchEnum, current_platform
from vllm.plugins import load_general_plugins
from vllm.reasoning import ReasoningParserManager
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
from vllm.transformers_utils.utils import check_gguf_file
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
GiB_bytes, get_ip, is_in_ray_actor)

# yapf: enable

if TYPE_CHECKING:
from vllm.executor.executor_base import ExecutorBase
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.usage.usage_lib import UsageContext
else:
ExecutorBase = Any
QuantizationMethods = Any
UsageContext = Any

logger = init_logger(__name__)

# object is used to allow for special typing forms
@@ -200,14 +207,17 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
kwargs[name] = {"default": default, "help": help}

# Set other kwargs based on the type hints
json_tip = """\n\nShould either be a valid JSON string or JSON keys
passed individually. For example, the following sets of arguments are
equivalent:\n\n
- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
- `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n
Additionally, list elements can be passed individually using '+':
- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`\n\n"""
json_tip = """Should either be a valid JSON string or JSON keys
passed individually. For example, the following sets of arguments are
equivalent:

- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
- `--json-arg.key1 value1 --json-arg.key2.key3 value2`

Additionally, list elements can be passed individually using `+`:

- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`"""
if dataclass_cls is not None:

def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
@@ -219,7 +229,7 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
raise argparse.ArgumentTypeError(repr(e)) from e

kwargs[name]["type"] = parse_dataclass
kwargs[name]["help"] += json_tip
kwargs[name]["help"] += f"\n\n{json_tip}"
elif contains_type(type_hints, bool):
# Creates --no-<name> and --<name> flags
kwargs[name]["action"] = argparse.BooleanOptionalAction
@@ -255,7 +265,7 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
kwargs[name]["type"] = union_dict_and_str
elif contains_type(type_hints, dict):
kwargs[name]["type"] = parse_type(json.loads)
kwargs[name]["help"] += json_tip
kwargs[name]["help"] += f"\n\n{json_tip}"
elif (contains_type(type_hints, str)
or any(is_not_builtin(th) for th in type_hints)):
kwargs[name]["type"] = str
@@ -1559,7 +1569,6 @@ def _set_default_args_v0(self, model_config: ModelConfig) -> None:
# Enable chunked prefill by default for long context (> 32K)
# models to avoid OOM errors in initial memory profiling phase.
elif use_long_context:
from vllm.platforms import current_platform
is_gpu = current_platform.is_cuda()
use_sliding_window = (model_config.get_sliding_window()
is not None)
@@ -1667,6 +1676,7 @@ def _set_default_args_v1(self, usage_context: UsageContext,
# NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
# throughput, see PR #17885 for more details.
# So here we do an extra device name check to prevent such regression.
from vllm.usage.usage_lib import UsageContext
if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
# For GPUs like H100 and MI300x, use larger default values.
default_max_num_batched_tokens = {
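
The import shuffle above keeps `vllm.engine.arg_utils` importable in the lightweight docs environment: heavy modules are imported only for type checkers and fall back to `Any` at runtime, while the remaining runtime uses are deferred into the functions that need them. A minimal sketch of the `TYPE_CHECKING` half of the pattern, with a hypothetical module name:

```python
# Minimal sketch (not vLLM code) of the TYPE_CHECKING pattern used in
# arg_utils.py. "heavy_package" is a hypothetical stand-in for an
# expensive import such as vllm.executor.executor_base.
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Seen by type checkers only; never executed at runtime.
    from heavy_package import HeavyExecutor
else:
    # At runtime the name degrades to Any, so importing this module
    # never imports heavy_package.
    HeavyExecutor = Any


def choose_executor(executor: HeavyExecutor) -> None:
    ...
```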
2 changes: 1 addition & 1 deletion vllm/entrypoints/chat_utils.py
@@ -38,7 +38,6 @@

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model_cls
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.utils import MediaConnector
@@ -524,6 +523,7 @@ def model_config(self) -> ModelConfig:

@cached_property
def model_cls(self):
from vllm.model_executor.model_loader import get_model_cls
return get_model_cls(self.model_config)

@property
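
The `chat_utils.py` change applies the same idea at runtime: the model-loader import now happens on first access of the cached property, so importing the module no longer drags it in. A sketch of that pattern, again with hypothetical names:

```python
# Minimal sketch (not vLLM code): defer a heavy import into a
# cached_property so it runs on first access only, and only once.
from functools import cached_property


class ModelClassResolver:
    def __init__(self, model_config):
        self.model_config = model_config

    @cached_property
    def model_cls(self):
        # "heavy_registry" is a hypothetical module name.
        from heavy_registry import get_model_cls
        return get_model_cls(self.model_config)
```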
21 changes: 14 additions & 7 deletions vllm/inputs/registry.py
@@ -13,14 +13,21 @@
from vllm.jsontree import JSONTree, json_map_leaves
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import resolve_mm_processor_kwargs

if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
MultiModalRegistry)
from vllm.sequence import SequenceData
from vllm.transformers_utils.tokenizer import AnyTokenizer
else:
ModelConfig = Any
MultiModalDataDict = Any
MultiModalPlaceholderDict = Any
MultiModalRegistry = Any
SequenceData = Any
AnyTokenizer = Any

_T = TypeVar("_T")
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
@@ -36,7 +43,7 @@ class InputContext:
modify the inputs.
"""

model_config: "ModelConfig"
model_config: ModelConfig
"""The configuration of the model."""

def get_hf_config(
@@ -200,9 +207,9 @@ class DummyData(NamedTuple):
Note: This is only used in V0.
"""

seq_data: "SequenceData"
multi_modal_data: Optional["MultiModalDataDict"] = None
multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
seq_data: SequenceData
multi_modal_data: Optional[MultiModalDataDict] = None
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None


class InputRegistry:
@@ -212,9 +219,9 @@ class InputRegistry:

def dummy_data_for_profiling(
self,
model_config: "ModelConfig",
model_config: ModelConfig,
seq_len: int,
mm_registry: "MultiModalRegistry",
mm_registry: MultiModalRegistry,
is_encoder_data: bool = False,
) -> DummyData:
"""
2 changes: 1 addition & 1 deletion vllm/model_executor/models/registry.py
@@ -16,7 +16,6 @@
from functools import lru_cache
from typing import Callable, Optional, TypeVar, Union

import cloudpickle
import torch.nn as nn

from vllm.logger import init_logger
@@ -598,6 +597,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
output_filepath = os.path.join(tempdir, "registry_output.tmp")

# `cloudpickle` allows pickling lambda functions directly
import cloudpickle
input_bytes = cloudpickle.dumps((fn, output_filepath))

# cannot use `sys.executable __file__` here because the script
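
Similarly, `cloudpickle` becomes a function-local import in `registry.py`. The comment in the diff explains why it is used at all: the standard-library `pickle` cannot serialize a lambda, while `cloudpickle` pickles it by value. A quick illustration, not vLLM code:

```python
# Quick illustration (not vLLM code): stdlib pickle rejects lambdas,
# cloudpickle serializes them by value.
import pickle

import cloudpickle

fn = lambda: 42

restored = cloudpickle.loads(cloudpickle.dumps(fn))
assert restored() == 42

try:
    pickle.dumps(fn)
except Exception as exc:  # PicklingError (or AttributeError) for lambdas
    print(type(exc).__name__)
```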