Skip to content

Commit 4f6d660

Browse files
committed
feat: add MultiModalContent for customizable multi-modal tool presentation
1 parent 81c734b commit 4f6d660

File tree

3 files changed

+78
-6
lines changed

3 files changed

+78
-6
lines changed

docs/tools.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,34 @@ print(result.output)
291291
```
292292
_(This example is complete, it can be run "as is")_
293293

294+
### Customizing Multi-Modal Content Presentation
295+
296+
You can also customize how multi-modal content is presented to the model using [`MultiModalContent`][pydantic_ai.messages.MultiModalContent]:
297+
298+
```python {title="custom_multimodal_presentation.py"}
299+
from pydantic_ai import Agent
from pydantic_ai.messages import ImageUrl, MultiModalContent

agent = Agent('test')


@agent.tool_plain
def get_custom_image() -> MultiModalContent:
    """Get an image with custom presentation."""
    chart = ImageUrl(url='https://example.com/chart.png')
    # `prompt` is the text placed next to the image in the user prompt part;
    # `tool_return` replaces the default value put into the tool response.
    return MultiModalContent(
        content=chart,
        prompt="Here's the requested chart {identifier}:",
        tool_return="Chart analysis completed successfully",
    )


result = agent.run_sync('Show me the chart')
print(result.output)
#> {"get_custom_image":"Chart analysis completed successfully"}
318+
```
319+
320+
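The `{identifier}` placeholder in `prompt` is automatically replaced with a unique identifier for the content. Because the prompt is rendered with `str.format`, any literal braces in it must be escaped as `{{` and `}}`. If `prompt` or `tool_return` is not provided, the defaults (`This is file {identifier}:` and `See file {identifier}`, respectively) are used, which preserves the previous behavior.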
321+
294322
Some models (e.g. Gemini) natively support semi-structured return values, while some expect text (OpenAI) but seem to be just as good at extracting meaning from the data. If a Python object is returned and the model expects a string, the value will be serialized to JSON.
295323

296324
## Function Tools vs. Structured Outputs

pydantic_ai_slim/pydantic_ai/_agent_graph.py

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -735,19 +735,36 @@ async def process_function_tools( # noqa C901
735735
processed_contents: list[Any] = []
736736
for content in contents:
737737
if isinstance(content, _messages.MultiModalContentTypes):
738-
if isinstance(content, _messages.BinaryContent):
739-
identifier = multi_modal_content_identifier(content.data)
738+
if not isinstance(content, _messages.MultiModalContent):
739+
content = _messages.MultiModalContent(content=content)
740+
# Handle the wrapped content
741+
actual_content = content.content
742+
if isinstance(actual_content, _messages.BinaryContent):
743+
identifier = multi_modal_content_identifier(actual_content.data)
740744
else:
741-
identifier = multi_modal_content_identifier(content.url)
745+
identifier = multi_modal_content_identifier(actual_content.url)
746+
747+
# Use custom prompt or default
748+
if content.prompt is None:
749+
prompt_text = f'This is file {identifier}:'
750+
else:
751+
prompt_text = content.prompt.format(identifier=identifier)
742752

743753
user_parts.append(
744754
_messages.UserPromptPart(
745-
content=[f'This is file {identifier}:', content],
755+
content=[prompt_text, actual_content],
746756
timestamp=result.timestamp,
747757
part_kind='user-prompt',
748758
)
749759
)
750-
processed_contents.append(f'See file {identifier}')
760+
761+
# Use custom tool_return or default
762+
if content.tool_return is None:
763+
tool_return_value = f'See file {identifier}'
764+
else:
765+
tool_return_value = content.tool_return
766+
767+
processed_contents.append(tool_return_value)
751768
else:
752769
processed_contents.append(content)
753770

pydantic_ai_slim/pydantic_ai/messages.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,10 +301,37 @@ def format(self) -> str:
301301
__repr__ = _utils.dataclasses_no_defaults_repr
302302

303303

304+
@dataclass(repr=False)
305+
class MultiModalContent:
306+
"""Wrapper for multi-modal content with custom prompt and tool return value.
307+
308+
This allows customizing how multi-modal content is presented to the model
309+
and what value is returned in the tool response.
310+
"""
311+
312+
content: ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent
313+
"""The actual multi-modal content."""
314+
315+
prompt: str | None = None
316+
"""Custom prompt template with {identifier} placeholder.
317+
If None, uses default "This is file {identifier}:" behavior.
318+
"""
319+
320+
tool_return: Any = None
321+
"""Custom value to return in the tool response.
322+
If None, uses default "See file {identifier}" behavior.
323+
"""
324+
325+
kind: Literal['multi-modal-content'] = 'multi-modal-content'
326+
"""Type identifier for discriminator."""
327+
328+
__repr__ = _utils.dataclasses_no_defaults_repr
329+
330+
304331
UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent'
305332

306333
# Ideally this would be a Union of types, but Python 3.9 requires it to be a string, and strings don't work with `isinstance`.
307-
MultiModalContentTypes = (ImageUrl, AudioUrl, DocumentUrl, VideoUrl, BinaryContent)
334+
MultiModalContentTypes = (ImageUrl, AudioUrl, DocumentUrl, VideoUrl, BinaryContent, MultiModalContent)
308335
_document_format_lookup: dict[str, DocumentFormat] = {
309336
'application/pdf': 'pdf',
310337
'text/plain': 'txt',

0 commit comments

Comments
 (0)