feat: add MultiModalContent for customizable multi-modal tool presentation

Wh1isper · Wh1isper · commit 4f6d6606c381 · 2025-06-24T17:23:09.000+08:00
diff --git a/docs/tools.md b/docs/tools.md
@@ -291,6 +291,34 @@ print(result.output)
 ```
 _(This example is complete, it can be run "as is")_
 
+### Customizing Multi-Modal Content Presentation
+
+You can also customize how multi-modal content is presented to the model using [`MultiModalContent`][pydantic_ai.messages.MultiModalContent]:
+
+```python {title="custom_multimodal_presentation.py"}
+from pydantic_ai import Agent
+from pydantic_ai.messages import MultiModalContent, ImageUrl
+
+agent = Agent('test')
+
+
+@agent.tool_plain
+def get_custom_image() -> MultiModalContent:
+    """Get an image with custom presentation."""
+    return MultiModalContent(
+        content=ImageUrl(url='https://example.com/chart.png'),
+        prompt="Here's the requested chart {identifier}:",
+        tool_return="Chart analysis completed successfully"
+    )
+
+
+result = agent.run_sync('Show me the chart')
+print(result.output)
+#> {"get_custom_image":"Chart analysis completed successfully"}
+```
+
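+The `{identifier}` placeholder in `prompt` is automatically replaced with a unique identifier for the content (the template is expanded with `str.format`, so write any literal braces as `{{` and `}}`); `tool_return` is returned as-is, without substitution. If `prompt` or `tool_return` is not provided, the defaults `This is file {identifier}:` and `See file {identifier}` are used for backward compatibility.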
+
 Some models (e.g. Gemini) natively support semi-structured return values, while some expect text (OpenAI) but seem to be just as good at extracting meaning from the data. If a Python object is returned and the model expects a string, the value will be serialized to JSON.
 
 ## Function Tools vs. Structured Outputs
diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py
@@ -735,19 +735,36 @@ async def process_function_tools(  # noqa C901
                     processed_contents: list[Any] = []
                     for content in contents:
                         if isinstance(content, _messages.MultiModalContentTypes):
-                            if isinstance(content, _messages.BinaryContent):
-                                identifier = multi_modal_content_identifier(content.data)
+                            if not isinstance(content, _messages.MultiModalContent):
+                                content = _messages.MultiModalContent(content=content)
+                            # Handle the wrapped content
+                            actual_content = content.content
+                            if isinstance(actual_content, _messages.BinaryContent):
+                                identifier = multi_modal_content_identifier(actual_content.data)
                             else:
-                                identifier = multi_modal_content_identifier(content.url)
+                                identifier = multi_modal_content_identifier(actual_content.url)
+
+                            # Use custom prompt or default
+                            if content.prompt is None:
+                                prompt_text = f'This is file {identifier}:'
+                            else:
+                                prompt_text = content.prompt.format(identifier=identifier)
 
                             user_parts.append(
                                 _messages.UserPromptPart(
-                                    content=[f'This is file {identifier}:', content],
+                                    content=[prompt_text, actual_content],
                                     timestamp=result.timestamp,
                                     part_kind='user-prompt',
                                 )
                             )
-                            processed_contents.append(f'See file {identifier}')
+
+                            # Use custom tool_return or default
+                            if content.tool_return is None:
+                                tool_return_value = f'See file {identifier}'
+                            else:
+                                tool_return_value = content.tool_return
+
+                            processed_contents.append(tool_return_value)
                         else:
                             processed_contents.append(content)
 
diff --git a/pydantic_ai_slim/pydantic_ai/messages.py b/pydantic_ai_slim/pydantic_ai/messages.py
@@ -301,10 +301,37 @@ def format(self) -> str:
     __repr__ = _utils.dataclasses_no_defaults_repr
 
 
+@dataclass(repr=False)
+class MultiModalContent:
+    """Wrapper for multi-modal content with custom prompt and tool return value.
+
+    This allows customizing how multi-modal content is presented to the model
+    and what value is returned in the tool response.
+    """
+
+    content: ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent
+    """The actual multi-modal content."""
+
+    prompt: str | None = None
+    """Custom prompt template; `{identifier}` is filled in with `str.format`, so double any literal braces.
+    If None, uses default "This is file {identifier}:" behavior.
+    """
+
+    tool_return: Any = None
+    """Custom value to return in the tool response; used as-is (no `{identifier}` substitution).
+    If None, uses default "See file {identifier}" behavior.
+    """
+
+    kind: Literal['multi-modal-content'] = 'multi-modal-content'
+    """Type identifier for discriminator."""
+
+    __repr__ = _utils.dataclasses_no_defaults_repr
+
+
 UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent'
 
 # Ideally this would be a Union of types, but Python 3.9 requires it to be a string, and strings don't work with `isinstance``.
-MultiModalContentTypes = (ImageUrl, AudioUrl, DocumentUrl, VideoUrl, BinaryContent)
+MultiModalContentTypes = (ImageUrl, AudioUrl, DocumentUrl, VideoUrl, BinaryContent, MultiModalContent)
 _document_format_lookup: dict[str, DocumentFormat] = {
     'application/pdf': 'pdf',
     'text/plain': 'txt',