feat: add MultiModalToolResponse for customizable multi-modal tool presentation

Wh1isper · Wh1isper · commit 3b1409154d10 · 2025-06-24T20:02:46.000+08:00
diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py
@@ -734,7 +734,19 @@ async def process_function_tools(  # noqa C901
 
                     processed_contents: list[Any] = []
                     for content in contents:
-                        if isinstance(content, _messages.MultiModalContentTypes):
+                        if isinstance(content, _messages.MultiModalToolResponse):
+                            # Wrapper: send `content` as a user prompt and use `tool_return` as the tool result
+                            user_parts.append(
+                                _messages.UserPromptPart(
+                                    content=list(content.content),
+                                    timestamp=result.timestamp,
+                                    part_kind='user-prompt',
+                                )
+                            )
+                            processed_contents.append(content.tool_return)
+
+                        elif isinstance(content, _messages.MultiModalContentTypes):
+                            # Handle direct multimodal content
                             if isinstance(content, _messages.BinaryContent):
                                 identifier = multi_modal_content_identifier(content.data)
                             else:
@@ -749,6 +761,7 @@ async def process_function_tools(  # noqa C901
                             )
                             processed_contents.append(f'See file {identifier}')
                         else:
+                            # Handle regular content
                             processed_contents.append(content)
 
                     if single_content:
diff --git a/pydantic_ai_slim/pydantic_ai/messages.py b/pydantic_ai_slim/pydantic_ai/messages.py
@@ -303,6 +303,24 @@ def format(self) -> str:
 
 UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent'
 
+
+@dataclass(repr=False)
+class MultiModalToolResponse:
+    """A wrapper for multi-modal content with customizable prompt and tool return value.
+
+    The `content` items are sent to the model as a user prompt and `tool_return` is used as the tool result,
+    providing more flexibility than the default "This is file {identifier}:" / "See file {identifier}" format.
+    """
+
+    content: Sequence[UserContent]
+    """The content sequence to be sent to the model as a UserPromptPart."""
+
+    tool_return: Any
+    """The return value to be used in the tool response."""
+
+    __repr__ = _utils.dataclasses_no_defaults_repr
+
+
 # Ideally this would be a Union of types, but Python 3.9 requires it to be a string, and strings don't work with `isinstance``.
 MultiModalContentTypes = (ImageUrl, AudioUrl, DocumentUrl, VideoUrl, BinaryContent)
 _document_format_lookup: dict[str, DocumentFormat] = {