diff --git a/.gitignore b/.gitignore
index 88a42a5c0f64..96b97a552c54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -146,6 +146,7 @@ venv.bak/
 
 # mkdocs documentation
 /site
+docs/argparse
 docs/examples
 
 # mypy
diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md
index a0e3594cd581..c3c1d5a1c362 100644
--- a/docs/configuration/engine_args.md
+++ b/docs/configuration/engine_args.md
@@ -1,3 +1,7 @@
+---
+toc_depth: 3
+---
+
 # Engine Arguments
 
 Engine arguments control the behavior of the vLLM engine.
@@ -5,11 +9,12 @@ Engine arguments control the behavior of the vLLM engine.
 - For [offline inference](../serving/offline_inference.md), they are part of the arguments to [LLM][vllm.LLM] class.
 - For [online serving](../serving/openai_compatible_server.md), they are part of the arguments to `vllm serve`.
 
-You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.
+The engine argument classes, [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs], are a combination of the configuration classes defined in [vllm.config][]. Therefore, if you are interested in developer documentation, we recommend looking at these configuration classes as they are the source of truth for types, defaults and docstrings.
+
+## `EngineArgs`
 
-However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented.
+--8<-- "docs/argparse/engine_args.md"
 
-For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config.
+## `AsyncEngineArgs`
 
-!!! note
-    Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. These can be found by running `vllm serve --help`
+--8<-- "docs/argparse/async_engine_args.md"
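The two `--8<--` lines are pymdownx snippet includes that pull in the Markdown files emitted by the new `generate_argparse.py` hook below. The page's claim that `EngineArgs`/`AsyncEngineArgs` are "a combination of the configuration classes" can also be seen programmatically. The following is a rough sketch, not part of the diff; it assumes a working vLLM installation, and the model name and field are arbitrary examples:

```python
# Sketch only: EngineArgs fields mirror the dataclasses in vllm.config, and
# create_engine_config() expands them into those config objects.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="facebook/opt-125m", max_model_len=2048)
vllm_config = engine_args.create_engine_config()

print(type(vllm_config.model_config).__name__)  # ModelConfig, from vllm.config
print(vllm_config.model_config.max_model_len)   # 2048, taken from the engine arg
```

This is why the rewritten page points developers at the config classes rather than duplicating their docstrings.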
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
new file mode 100644
index 000000000000..64120f2d1513
--- /dev/null
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -0,0 +1,105 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+import sys
+from argparse import SUPPRESS, HelpFormatter
+from pathlib import Path
+from typing import Literal
+from unittest.mock import MagicMock, patch
+
+ROOT_DIR = Path(__file__).parent.parent.parent.parent
+ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse"
+
+sys.path.insert(0, str(ROOT_DIR))
+sys.modules["aiohttp"] = MagicMock()
+sys.modules["blake3"] = MagicMock()
+sys.modules["vllm._C"] = MagicMock()
+
+from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
+from vllm.utils import FlexibleArgumentParser  # noqa: E402
+
+logger = logging.getLogger("mkdocs")
+
+
+class MarkdownFormatter(HelpFormatter):
+    """Custom formatter that generates markdown for argument groups."""
+
+    def __init__(self, prog):
+        super().__init__(prog,
+                         max_help_position=float('inf'),
+                         width=float('inf'))
+        self._markdown_output = []
+
+    def start_section(self, heading):
+        if heading not in {"positional arguments", "options"}:
+            self._markdown_output.append(f"\n### {heading}\n\n")
+
+    def end_section(self):
+        pass
+
+    def add_text(self, text):
+        if text:
+            self._markdown_output.append(f"{text.strip()}\n\n")
+
+    def add_usage(self, usage, actions, groups, prefix=None):
+        pass
+
+    def add_arguments(self, actions):
+        for action in actions:
+
+            option_strings = f'`{"`, `".join(action.option_strings)}`'
+            self._markdown_output.append(f"#### {option_strings}\n\n")
+
+            if choices := action.choices:
+                choices = f'`{"`, `".join(str(c) for c in choices)}`'
+                self._markdown_output.append(
+                    f"Possible choices: {choices}\n\n")
+
+            self._markdown_output.append(f"{action.help}\n\n")
+
+            if (default := action.default) != SUPPRESS:
+                self._markdown_output.append(f"Default: `{default}`\n\n")
+
+    def format_help(self):
+        """Return the formatted help as markdown."""
+        return "".join(self._markdown_output)
+
+
+def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
+    """Create a parser for the given class with markdown formatting.
+
+    Args:
+        cls: The class to create a parser for.
+        **kwargs: Additional keyword arguments to pass to `cls.add_cli_args`.
+
+    Returns:
+        FlexibleArgumentParser: A parser with markdown formatting for the class.
+    """
+    parser = FlexibleArgumentParser()
+    parser.formatter_class = MarkdownFormatter
+    with patch("vllm.config.DeviceConfig.__post_init__"):
+        return cls.add_cli_args(parser, **kwargs)
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+    logger.info("Generating argparse documentation")
+    logger.debug("Root directory: %s", ROOT_DIR.resolve())
+    logger.debug("Output directory: %s", ARGPARSE_DOC_DIR.resolve())
+
+    # Create the ARGPARSE_DOC_DIR if it doesn't exist
+    if not ARGPARSE_DOC_DIR.exists():
+        ARGPARSE_DOC_DIR.mkdir(parents=True)
+
+    # Create parsers to document
+    parsers = {
+        "engine_args": create_parser(EngineArgs),
+        "async_engine_args": create_parser(AsyncEngineArgs,
+                                           async_args_only=True),
+    }
+
+    # Generate documentation for each parser
+    for stem, parser in parsers.items():
+        doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
+        with open(doc_path, "w") as f:
+            f.write(parser.format_help())
+        logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
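The core trick of this hook is that argparse renders help through whatever `HelpFormatter` subclass the parser is given, so overriding the `add_*` methods and `format_help()` turns `parser.format_help()` into a Markdown generator. A stripped-down, standalone sketch of the same idea (illustrative names, no vLLM dependency):

```python
import argparse
from argparse import SUPPRESS, HelpFormatter


class MiniMarkdownFormatter(HelpFormatter):
    """Collects arguments and emits Markdown instead of console help."""

    def __init__(self, prog):
        super().__init__(prog)
        self._md = []

    def add_usage(self, usage, actions, groups, prefix=None):
        pass  # skip the usual "usage:" banner

    def add_text(self, text):
        if text:
            self._md.append(f"{text.strip()}\n\n")

    def add_arguments(self, actions):
        for action in actions:
            names = ", ".join(f"`{s}`" for s in action.option_strings)
            self._md.append(f"#### {names}\n\n{action.help}\n\n")
            if action.default != SUPPRESS:
                self._md.append(f"Default: `{action.default}`\n\n")

    def format_help(self):
        return "".join(self._md)


parser = argparse.ArgumentParser(formatter_class=MiniMarkdownFormatter)
parser.add_argument("--max-model-len", type=int, default=None,
                    help="Maximum context length.")
print(parser.format_help())  # prints a small Markdown fragment
```

In the real hook, `DeviceConfig.__post_init__` is patched out and a few binary dependencies are replaced with `MagicMock` so that `EngineArgs.add_cli_args` can run on a documentation builder without a GPU or a compiled vLLM.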
+ """ + parser = FlexibleArgumentParser() + parser.formatter_class = MarkdownFormatter + with patch("vllm.config.DeviceConfig.__post_init__"): + return cls.add_cli_args(parser, **kwargs) + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + logger.info("Generating argparse documentation") + logger.debug("Root directory: %s", ROOT_DIR.resolve()) + logger.debug("Output directory: %s", ARGPARSE_DOC_DIR.resolve()) + + # Create the ARGPARSE_DOC_DIR if it doesn't exist + if not ARGPARSE_DOC_DIR.exists(): + ARGPARSE_DOC_DIR.mkdir(parents=True) + + # Create parsers to document + parsers = { + "engine_args": create_parser(EngineArgs), + "async_engine_args": create_parser(AsyncEngineArgs, + async_args_only=True), + } + + # Generate documentation for each parser + for stem, parser in parsers.items(): + doc_path = ARGPARSE_DOC_DIR / f"{stem}.md" + with open(doc_path, "w") as f: + f.write(parser.format_help()) + logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR)) diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 14a28f944d98..0ee52bb34603 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -161,8 +161,8 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): for example in sorted(examples, key=lambda e: e.path.stem): example_name = f"{example.path.stem}.md" doc_path = EXAMPLE_DOC_DIR / example.category / example_name - logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) if not doc_path.parent.exists(): doc_path.parent.mkdir(parents=True) with open(doc_path, "w+") as f: f.write(example.generate()) + logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) diff --git a/docs/mkdocs/overrides/partials/toc-item.html b/docs/mkdocs/overrides/partials/toc-item.html new file mode 100644 index 000000000000..284af59cbe2c --- /dev/null +++ b/docs/mkdocs/overrides/partials/toc-item.html @@ -0,0 +1,21 @@ + +
diff --git a/docs/mkdocs/overrides/partials/toc-item.html b/docs/mkdocs/overrides/partials/toc-item.html
new file mode 100644
index 000000000000..284af59cbe2c
--- /dev/null
+++ b/docs/mkdocs/overrides/partials/toc-item.html
@@ -0,0 +1,21 @@
+<!-- Table of contents item -->
+<li class="md-nav__item">
+  <a href="{{ toc_item.url }}" class="md-nav__link">
+    <span class="md-ellipsis">
+      {{ toc_item.title }}
+    </span>
+  </a>
+
+  <!-- Table of contents list -->
+  {% if toc_item.children %}
+    <nav class="md-nav" aria-label="{{ toc_item.title | striptags }}">
+      <ul class="md-nav__list">
+        {% for toc_item in toc_item.children %}
+          {% include "partials/toc-item.html" %}
+        {% endfor %}
+      </ul>
+    </nav>
+  {% endif %}
+</li>
\ No newline at end of file
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 45b6ffadbeb7..f97aff490738 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -3,6 +3,7 @@ site_url: https://docs.vllm.ai
 repo_url: https://github.com/vllm-project/vllm
 edit_uri: edit/main/docs/
 exclude_docs: |
+  argparse
   *.inc.md
   *.template.md
 theme:
@@ -47,6 +48,7 @@ theme:
 hooks:
   - docs/mkdocs/hooks/remove_announcement.py
   - docs/mkdocs/hooks/generate_examples.py
+  - docs/mkdocs/hooks/generate_argparse.py
   - docs/mkdocs/hooks/url_schemes.py
 
 # Required to stop api-autonav from raising an error
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 64c70cb65c55..e20b6f6e34d7 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -7,3 +7,18 @@ mkdocs-awesome-nav
 python-markdown-math
 regex
 ruff
+
+# Required for argparse hook only
+-f https://download.pytorch.org/whl/cpu
+cachetools
+cloudpickle
+fastapi
+msgspec
+openai
+pillow
+psutil
+pybase64
+pydantic
+torch
+transformers
+zmq
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e7655b6c3026..b19046946e65 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -12,8 +12,9 @@
 import warnings
 from dataclasses import MISSING, dataclass, fields, is_dataclass
 from itertools import permutations
-from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
-                    Type, TypeVar, Union, cast, get_args, get_origin)
+from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List,
+                    Literal, Optional, Type, TypeVar, Union, cast, get_args,
+                    get_origin)
 
 import regex as re
 import torch
@@ -33,20 +34,26 @@
                          SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
                          TaskOption, TokenizerMode, TokenizerPoolConfig,
                          VllmConfig, get_attr_docs, get_field)
-from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
                         GiB_bytes, get_ip, is_in_ray_actor)
 
 # yapf: enable
 
+if TYPE_CHECKING:
+    from vllm.executor.executor_base import ExecutorBase
+    from vllm.model_executor.layers.quantization import QuantizationMethods
+    from vllm.usage.usage_lib import UsageContext
+else:
+    ExecutorBase = Any
+    QuantizationMethods = Any
+    UsageContext = Any
+
 logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
@@ -200,14 +207,17 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         kwargs[name] = {"default": default, "help": help}
 
         # Set other kwargs based on the type hints
-        json_tip = """\n\nShould either be a valid JSON string or JSON keys
-        passed individually. For example, the following sets of arguments are
-        equivalent:\n\n
-        - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
-        - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n
-        Additionally, list elements can be passed individually using '+':
-        - `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
-        - `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`\n\n"""
+        json_tip = """Should either be a valid JSON string or JSON keys
+passed individually. For example, the following sets of arguments are
+equivalent:
+
+- `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n
+- `--json-arg.key1 value1 --json-arg.key2.key3 value2`
+
+Additionally, list elements can be passed individually using `+`:
+
+- `--json-arg '{"key4": ["value3", "value4", "value5"]}'`\n
+- `--json-arg.key4+ value3 --json-arg.key4+='value4,value5'`"""
         if dataclass_cls is not None:
 
             def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
@@ -219,7 +229,7 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
                     raise argparse.ArgumentTypeError(repr(e)) from e
 
             kwargs[name]["type"] = parse_dataclass
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif contains_type(type_hints, bool):
             # Creates --no-<name> and --<name> flags
            kwargs[name]["action"] = argparse.BooleanOptionalAction
@@ -255,7 +265,7 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any:
                 kwargs[name]["type"] = union_dict_and_str
         elif contains_type(type_hints, dict):
             kwargs[name]["type"] = parse_type(json.loads)
-            kwargs[name]["help"] += json_tip
+            kwargs[name]["help"] += f"\n\n{json_tip}"
         elif (contains_type(type_hints, str)
               or any(is_not_builtin(th) for th in type_hints)):
             kwargs[name]["type"] = str
@@ -1559,7 +1569,6 @@ def _set_default_args_v0(self, model_config: ModelConfig) -> None:
         # Enable chunked prefill by default for long context (> 32K)
         # models to avoid OOM errors in initial memory profiling phase.
         elif use_long_context:
-            from vllm.platforms import current_platform
             is_gpu = current_platform.is_cuda()
             use_sliding_window = (model_config.get_sliding_window()
                                   is not None)
@@ -1667,6 +1676,7 @@ def _set_default_args_v1(self, usage_context: UsageContext,
         # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
         # throughput, see PR #17885 for more details.
         # So here we do an extra device name check to prevent such regression.
+        from vllm.usage.usage_lib import UsageContext
         if device_memory >= 70 * GiB_bytes and "a100" not in device_name:
             # For GPUs like H100 and MI300x, use larger default values.
             default_max_num_batched_tokens = {
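The reflowed `json_tip` now renders as a proper Markdown list in the generated argparse pages. As a small illustration of the equivalence it describes (a sketch using only the standard library; the dotted-key and `+` handling itself lives in vLLM's `FlexibleArgumentParser`, and `--json-arg` is just the placeholder name from the tip):

```python
import json

# --json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'
as_json = json.loads('{"key1": "value1", "key2": {"key3": "value2"}}')
# --json-arg.key1 value1 --json-arg.key2.key3 value2
as_dotted_keys = {"key1": "value1", "key2": {"key3": "value2"}}
assert as_json == as_dotted_keys

# --json-arg '{"key4": ["value3", "value4", "value5"]}'
list_as_json = json.loads('{"key4": ["value3", "value4", "value5"]}')
# --json-arg.key4+ value3 --json-arg.key4+='value4,value5'
list_via_plus = {"key4": ["value3", "value4", "value5"]}
assert list_as_json == list_via_plus
```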
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 08e94ec0fa1e..f5b7239cb300 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -38,7 +38,6 @@
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model_cls
 from vllm.model_executor.models import SupportsMultiModal
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.utils import MediaConnector
@@ -524,6 +523,7 @@ def model_config(self) -> ModelConfig:
 
     @cached_property
     def model_cls(self):
+        from vllm.model_executor.model_loader import get_model_cls
         return get_model_cls(self.model_config)
 
     @property
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index fc6e190e5480..082e52aff9eb 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -13,7 +13,6 @@
 from vllm.jsontree import JSONTree, json_map_leaves
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
-from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import resolve_mm_processor_kwargs
 
 if TYPE_CHECKING:
@@ -21,6 +20,14 @@
     from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict,
                                  MultiModalRegistry)
     from vllm.sequence import SequenceData
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    ModelConfig = Any
+    MultiModalDataDict = Any
+    MultiModalPlaceholderDict = Any
+    MultiModalRegistry = Any
+    SequenceData = Any
+    AnyTokenizer = Any
 
 _T = TypeVar("_T")
 _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
@@ -36,7 +43,7 @@ class InputContext:
     modify the inputs.
     """
 
-    model_config: "ModelConfig"
+    model_config: ModelConfig
     """The configuration of the model."""
 
     def get_hf_config(
@@ -200,9 +207,9 @@ class DummyData(NamedTuple):
     Note: This is only used in V0.
    """
 
-    seq_data: "SequenceData"
-    multi_modal_data: Optional["MultiModalDataDict"] = None
-    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None
+    seq_data: SequenceData
+    multi_modal_data: Optional[MultiModalDataDict] = None
+    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
 
 
 class InputRegistry:
@@ -212,9 +219,9 @@ class InputRegistry:
 
     def dummy_data_for_profiling(
         self,
-        model_config: "ModelConfig",
+        model_config: ModelConfig,
         seq_len: int,
-        mm_registry: "MultiModalRegistry",
+        mm_registry: MultiModalRegistry,
         is_encoder_data: bool = False,
     ) -> DummyData:
         """
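This file, like `arg_utils.py` above, uses the `TYPE_CHECKING`/`Any` fallback so that annotations keep working for type checkers while the heavy imports disappear at runtime. A minimal sketch of the pattern (the package and class names here are hypothetical):

```python
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from some_heavy_package import ExpensiveConfig  # hypothetical heavy import
else:
    # Placeholder so the annotation below still evaluates at runtime.
    ExpensiveConfig = Any


@dataclass
class Context:
    # mypy sees ExpensiveConfig; at runtime the annotation is just Any, so
    # importing this module never imports some_heavy_package.
    config: ExpensiveConfig
```

The trade-off is that the name is `Any` at runtime, so nothing that inspects the annotation (for example `isinstance` checks or runtime validation) can rely on it.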
""" - seq_data: "SequenceData" - multi_modal_data: Optional["MultiModalDataDict"] = None - multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None + seq_data: SequenceData + multi_modal_data: Optional[MultiModalDataDict] = None + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None class InputRegistry: @@ -212,9 +219,9 @@ class InputRegistry: def dummy_data_for_profiling( self, - model_config: "ModelConfig", + model_config: ModelConfig, seq_len: int, - mm_registry: "MultiModalRegistry", + mm_registry: MultiModalRegistry, is_encoder_data: bool = False, ) -> DummyData: """ diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 27d476929855..03e45bd26d7a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -16,7 +16,6 @@ from functools import lru_cache from typing import Callable, Optional, TypeVar, Union -import cloudpickle import torch.nn as nn from vllm.logger import init_logger @@ -598,6 +597,7 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T: output_filepath = os.path.join(tempdir, "registry_output.tmp") # `cloudpickle` allows pickling lambda functions directly + import cloudpickle input_bytes = cloudpickle.dumps((fn, output_filepath)) # cannot use `sys.executable __file__` here because the script diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 913cb0895bb9..b26311cb7c20 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -7,7 +7,6 @@ from importlib.util import find_spec from typing import TYPE_CHECKING, Optional -import psutil import torch from vllm.logger import init_logger @@ -73,6 +72,7 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: + import psutil return psutil.virtual_memory().total @classmethod diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index c34189013d99..4f4522d726e8 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -7,14 +7,22 @@ from abc import abstractmethod from collections.abc import Sequence from functools import cached_property -from typing import Callable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Optional, Union -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage, ResponsesRequest) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import import_from_path, is_list_of +if TYPE_CHECKING: + from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage, + ResponsesRequest) + from vllm.transformers_utils.tokenizer import AnyTokenizer +else: + ChatCompletionRequest = Any + DeltaMessage = Any + ResponsesRequest = Any + AnyTokenizer = Any + logger = init_logger(__name__) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index ae96ebe4eaa2..01d1769f0e5e 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -16,15 +16,18 @@ from vllm import envs from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer_base import (TokenizerBase, - TokenizerRegistry) from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async if TYPE_CHECKING: from vllm.config import 
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index ae96ebe4eaa2..01d1769f0e5e 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -16,15 +16,18 @@
 
 from vllm import envs
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
-                                                    TokenizerRegistry)
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
+    from vllm.lora.request import LoRARequest
+    from vllm.transformers_utils.tokenizer_base import TokenizerBase
+else:
+    ModelConfig = Any
+    LoRARequest = Any
+    TokenizerBase = Any
 
 logger = init_logger(__name__)
@@ -222,6 +225,7 @@ def get_tokenizer(
         tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                      revision=revision)
     elif tokenizer_mode == "custom":
+        from vllm.transformers_utils.tokenizer_base import TokenizerRegistry
         tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
                                                     *args,
                                                     revision=revision,
@@ -271,7 +275,7 @@ def get_tokenizer(
 
 
 def cached_tokenizer_from_config(
-    model_config: "ModelConfig",
+    model_config: ModelConfig,
     **kwargs: Any,
 ):
     return cached_get_tokenizer(