feat - add a new endpoint get_tokenizer_info to provide tokenizer/chat-template information #20575

Merged

Changes from 6 commits
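
For orientation, a minimal way to exercise the new endpoint once a server is launched with the flag added in this PR (a sketch: the port assumes vLLM's default of 8000, and the served model is up to you):

import requests

resp = requests.get("http://localhost:8000/tokenizer_info")
resp.raise_for_status()
info = resp.json()
print(info["tokenizer_class"])          # name of the tokenizer class in use
print(bool(info.get("chat_template")))  # True if the model ships a chat template

The route path and response shape follow the api_server.py and serving_tokenization.py changes below.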
103 changes: 103 additions & 0 deletions tests/entrypoints/openai/test_tokenization.py
@@ -283,3 +283,106 @@ async def test_detokenize(
response.raise_for_status()

assert response.json() == {"prompt": prompt}


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name,tokenizer_name",
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_get_tokenizer_info_basic(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
"""Test basic tokenizer info endpoint functionality."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
assert "tokenizer_class" in result
assert isinstance(result["tokenizer_class"], str)
assert result["tokenizer_class"]


@pytest.mark.asyncio
async def test_get_tokenizer_info_schema(server: RemoteOpenAIServer):
"""Test that the response matches expected schema types."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
field_types = {
"add_bos_token": bool,
"add_prefix_space": bool,
"clean_up_tokenization_spaces": bool,
"split_special_tokens": bool,
"bos_token": str,
"eos_token": str,
"pad_token": str,
"unk_token": str,
"chat_template": str,
"errors": str,
"model_max_length": int,
"additional_special_tokens": list,
"added_tokens_decoder": dict,
}
for field, expected_type in field_types.items():
if field in result and result[field] is not None:
assert isinstance(
result[field],
expected_type), (f"{field} should be {expected_type.__name__}")


@pytest.mark.asyncio
async def test_get_tokenizer_info_added_tokens_structure(
server: RemoteOpenAIServer, ):
"""Test added_tokens_decoder structure if present."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
added_tokens = result.get("added_tokens_decoder")
if added_tokens:
for token_id, token_info in added_tokens.items():
assert isinstance(token_id, str), "Token IDs should be strings"
assert isinstance(token_info, dict), "Token info should be a dict"
assert "content" in token_info, "Token info should have content"
assert "special" in token_info, (
"Token info should have special flag")
assert isinstance(token_info["special"],
bool), ("Special flag should be boolean")


@pytest.mark.asyncio
async def test_get_tokenizer_info_consistency_with_tokenize(
server: RemoteOpenAIServer, ):
"""Test that tokenizer info is consistent with tokenization endpoint."""
info_response = requests.get(server.url_for("tokenizer_info"))
info_response.raise_for_status()
info = info_response.json()
tokenize_response = requests.post(
server.url_for("tokenize"),
json={
"model": MODEL_NAME,
"prompt": "Hello world!"
},
)
tokenize_response.raise_for_status()
tokenize_result = tokenize_response.json()
info_max_len = info.get("model_max_length")
tokenize_max_len = tokenize_result.get("max_model_len")
if info_max_len and tokenize_max_len:
assert info_max_len >= tokenize_max_len, (
"Info max length should be >= tokenize max length")


@pytest.mark.asyncio
async def test_get_tokenizer_info_chat_template(server: RemoteOpenAIServer):
"""Test chat template is properly included."""
response = requests.get(server.url_for("tokenizer_info"))
response.raise_for_status()
result = response.json()
chat_template = result.get("chat_template")
if chat_template:
assert isinstance(chat_template,
str), ("Chat template should be a string")
assert chat_template.strip(), "Chat template should not be empty"
14 changes: 14 additions & 0 deletions vllm/entrypoints/openai/api_server.py
@@ -523,6 +523,19 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
assert_never(generator)


def maybe_register_tokenizer_info_endpoint(args):
"""Conditionally register the tokenizer info endpoint if enabled."""
if getattr(args, 'enable_tokenizer_info_endpoint', False):

@router.get("/tokenizer_info")
async def get_tokenizer_info(raw_request: Request):
"""Get comprehensive tokenizer information."""
result = await tokenization(raw_request).get_tokenizer_info()
return JSONResponse(content=result.model_dump(),
status_code=result.code if isinstance(
result, ErrorResponse) else 200)


@router.get("/v1/models")
async def show_available_models(raw_request: Request):
handler = models(raw_request)
@@ -1531,6 +1544,7 @@ async def run_server_worker(listen_address,
uvicorn_kwargs['log_config'] = log_config

async with build_async_engine_client(args, client_config) as engine_client:
maybe_register_tokenizer_info_endpoint(args)
app = build_app(args)

vllm_config = await engine_client.get_vllm_config()
7 changes: 6 additions & 1 deletion vllm/entrypoints/openai/cli_args.py
@@ -295,7 +295,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
help=
"If set to True, enable tracking server_load_metrics in the app state."
)

parser.add_argument(
"--enable-tokenizer-info-endpoint",
action='store_true',
default=False,
help="Enable the /tokenizer_info endpoint. May expose chat "
"templates and other tokenizer configuration.")
return parser
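
Since the endpoint is opt-in behind this flag, a quick manual check might look like the following sketch (it assumes two hypothetical local servers, one started with --enable-tokenizer-info-endpoint on port 8000 and one without the flag on port 8001):

import requests

# Registered route responds; unregistered route falls through to FastAPI's 404.
assert requests.get("http://localhost:8000/tokenizer_info").status_code == 200
assert requests.get("http://localhost:8001/tokenizer_info").status_code == 404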


10 changes: 10 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -1849,6 +1849,16 @@ class DetokenizeResponse(OpenAIBaseModel):
prompt: str


class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""

tokenizer_class: str
model_config = ConfigDict(extra="allow")
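
Because extra="allow" is set, any fields from the tokenizer configuration pass through validation alongside the single declared field. An illustrative construction (the extra field names and values below are made up for the example):

TokenizerInfoResponse(
    tokenizer_class="LlamaTokenizerFast",
    bos_token="<s>",
    eos_token="</s>",
    model_max_length=4096,
    chat_template="{% for message in messages %}...{% endfor %}",
)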


class LoadLoRAAdapterRequest(BaseModel):
lora_name: str
lora_path: str
56 changes: 53 additions & 3 deletions vllm/entrypoints/openai/serving_tokenization.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Final, Optional, Union
from dataclasses import dataclass
from typing import Any, Final, Optional, Union

import jinja2
from fastapi import Request
@@ -17,11 +17,13 @@
ErrorResponse,
TokenizeChatRequest,
TokenizeRequest,
TokenizeResponse)
TokenizeResponse,
TokenizerInfoResponse)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)

@@ -155,3 +157,51 @@ async def create_detokenize(
input_text = prompt_input["prompt"]

return DetokenizeResponse(prompt=input_text)

async def get_tokenizer_info(
self, ) -> Union[TokenizerInfoResponse, ErrorResponse]:
"""Get comprehensive tokenizer information."""
try:
tokenizer = await self.engine_client.get_tokenizer()
info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
return TokenizerInfoResponse(**info)
except Exception as e:
return self.create_error_response(
f"Failed to get tokenizer info: {str(e)}")
Contributor comment on lines +168 to +170 (medium):

Catching a broad Exception can hide bugs and swallow important exceptions like KeyboardInterrupt. It's better to log the full traceback to aid in debugging when an unexpected error occurs.

Suggested change:
- except Exception as e:
-     return self.create_error_response(
-         f"Failed to get tokenizer info: {str(e)}")
+ except Exception as e:
+     logger.exception("Failed to get tokenizer info.")
+     return self.create_error_response(
+         f"Failed to get tokenizer info: {e!s}")



@dataclass
class TokenizerInfo:
tokenizer: AnyTokenizer
chat_template: Optional[str]

def to_dict(self) -> dict[str, Any]:
"""Return the tokenizer configuration."""
return self._get_tokenizer_config()

def _get_tokenizer_config(self) -> dict[str, Any]:
"""Get tokenizer configuration directly from the tokenizer object."""
config = (dict(self.tokenizer.init_kwargs)
if hasattr(self.tokenizer, "init_kwargs")
and self.tokenizer.init_kwargs else {})
Member comment:

Suggested change:
- config = (dict(self.tokenizer.init_kwargs)
-           if hasattr(self.tokenizer, "init_kwargs")
-           and self.tokenizer.init_kwargs else {})
+ config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})


# Remove file path fields
config.pop("vocab_file", None)
config.pop("merges_file", None)

config = self._make_json_serializable(config)
config["tokenizer_class"] = type(self.tokenizer).__name__
if self.chat_template:
config["chat_template"] = self.chat_template
return config

def _make_json_serializable(self, obj):
"""Convert any non-JSON-serializable objects to serializable format."""
if hasattr(obj, "content"):
return obj.content
elif isinstance(obj, dict):
return {k: self._make_json_serializable(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [self._make_json_serializable(item) for item in obj]
else:
return obj
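
A tiny illustration of the helper above: any object exposing a .content attribute is reduced to that value, while dicts and lists are walked recursively. The class below is a hypothetical stand-in used only for the example:

class _Token:
    """Hypothetical stand-in for an object that exposes a .content attribute."""
    def __init__(self, content: str):
        self.content = content

# Given _make_json_serializable as defined above:
# _make_json_serializable({"a": _Token("<s>"), "b": [_Token("</s>"), 1]})
# would return {"a": "<s>", "b": ["</s>", 1]}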