Merged
Changes from 4 commits
59 changes: 58 additions & 1 deletion dspy/clients/lm.py
@@ -468,7 +468,9 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
            if isinstance(c, str):
                content_blocks.append({"type": "input_text", "text": c})
            elif isinstance(c, list):
-               content_blocks.extend(c)
+               # Convert each content item from Chat API format to Responses API format
+               for item in c:
+                   content_blocks.append(_convert_content_item_to_responses_format(item))
        request["input"] = [{"role": msg.get("role", "user"), "content": content_blocks}]

    # Convert `response_format` to `text.format` for Responses API
@@ -480,6 +482,61 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
    return request
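For reference, the hunk above maps Chat-style messages onto the Responses API "input" payload. A rough sketch of the shapes involved for a plain string content block, illustrative only and not part of the diff:

# A Chat-style message such as this one:
chat_message = {"role": "user", "content": "Hello"}
# ends up carried in the converted request roughly as:
expected_input = [{"role": "user", "content": [{"type": "input_text", "text": "Hello"}]}]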


def _convert_content_item_to_responses_format(item: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a content item from Chat API format to Responses API format.

    For images, converts from:
        {"type": "image_url", "image_url": {"url": "..."}}
    To:
        {"type": "input_image", "source": {"type": "url", "url": "..."}}
Contributor Author:
You're absolutely right - I was using the wrong format. Fixed in 6da81b3 to match the OpenAI documentation. The Responses API expects {"type": "input_image", "image_url": "..."} with image_url as a direct property, not a nested source object.

    or:
        {"type": "input_image", "source": {"type": "base64", "media_type": "...", "data": "..."}}

    For text and other types, passes through as-is (already in correct format).
    """
    if item.get("type") == "image_url":
        image_url = item.get("image_url", {}).get("url", "")

        # Check if it's a base64 data URI
        if image_url.startswith("data:"):
            # Extract media type and base64 data
            # Format: data:image/png;base64,iVBORw0KG...
            parts = image_url.split(",", 1)
            if len(parts) == 2:
                header, data = parts
                # Extract media type from header (e.g., "data:image/png;base64" -> "image/png")
                # Handle both "data:image/png;base64" and "data:image/png" formats
                media_type_parts = header.split(";")[0].replace("data:", "")
                if media_type_parts:
                    media_type = media_type_parts
                else:
                    # Fallback to a default media type if extraction fails
                    media_type = "image/png"

                return {
                    "type": "input_image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": data,
                    }
                }
            # If data URI is malformed (doesn't have comma separator), fall through to URL handling

        # Otherwise treat as URL
        return {
            "type": "input_image",
            "source": {
                "type": "url",
                "url": image_url,
            }
        }

    # For non-image items, return as-is
Collaborator:
@copilot I think we also need to convert text items from type: text to type: input_text. See https://platform.openai.com/docs/api-reference/responses

Contributor Author:
Good catch! Fixed in 0055def - now converting text items from type: "text" to type: "input_text" to match the OpenAI Responses API format.

    return item
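The two replies above describe follow-up fixes (6da81b3 and 0055def): image items should carry image_url as a direct string property, and text items should become input_text. A minimal sketch of where the converter is headed after those comments, based only on this discussion and not on the later commits themselves (the helper name is hypothetical):

def _convert_content_item_sketch(item: dict) -> dict:
    # Hedged sketch reflecting the review replies, not the committed code.
    if item.get("type") == "image_url":
        # The Responses API takes the URL (or data: URI) directly as a string.
        return {"type": "input_image", "image_url": item.get("image_url", {}).get("url", "")}
    if item.get("type") == "text":
        # Chat API "text" blocks become "input_text" blocks.
        return {"type": "input_text", "text": item.get("text", "")}
    return item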


def _get_headers(headers: dict[str, Any] | None = None):
    headers = headers or {}
    return {
134 changes: 132 additions & 2 deletions tests/clients/test_lm.py
@@ -343,8 +343,8 @@ def test_reasoning_model_requirements(model_name):
    lm = dspy.LM(
        model=model_name,
    )
-   assert lm.kwargs["temperature"] == None
-   assert lm.kwargs["max_completion_tokens"] == None
+   assert lm.kwargs["temperature"] is None
+   assert lm.kwargs["max_completion_tokens"] is None


def test_dump_state():
@@ -633,3 +633,133 @@ def test_api_key_not_saved_in_json():
    assert saved_state["lm"]["model"] == "openai/gpt-4o-mini"
    assert saved_state["lm"]["temperature"] == 1.0
    assert saved_state["lm"]["max_tokens"] == 100


def test_responses_api_converts_images_correctly():
"""Test that image_url format is converted to input_image format for Responses API."""
Collaborator:

@copilot can we remove the docstrings for the test cases?

Contributor Author:

Done in 43c8fc9

    from dspy.clients.lm import _convert_chat_request_to_responses_request

    # Test with base64 image
    request_with_base64_image = {
        "model": "openai/gpt-5-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
                        }
                    }
                ]
            }
        ]
    }

    result = _convert_chat_request_to_responses_request(request_with_base64_image)

    assert "input" in result
    assert len(result["input"]) == 1
    assert result["input"][0]["role"] == "user"

    content = result["input"][0]["content"]
    assert len(content) == 2

    # First item should be text (passed through as-is since it's already in correct format)
    assert content[0]["type"] == "text"
    assert content[0]["text"] == "What's in this image?"

    # Second item should be converted to input_image format
    assert content[1]["type"] == "input_image"
    assert content[1]["source"]["type"] == "base64"
    assert content[1]["source"]["media_type"] == "image/png"
    assert content[1]["source"]["data"] == "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="

    # Test with URL image
    request_with_url_image = {
        "model": "openai/gpt-5-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://example.com/image.jpg"
                        }
                    }
                ]
            }
        ]
    }

    result = _convert_chat_request_to_responses_request(request_with_url_image)

    content = result["input"][0]["content"]
    assert len(content) == 1
    assert content[0]["type"] == "input_image"
    assert content[0]["source"]["type"] == "url"
    assert content[0]["source"]["url"] == "https://example.com/image.jpg"
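As an aside, the hard-coded image in these tests is a data URI of the form data:<media type>;base64,<payload>, which is exactly what the converter parses. A small hedged helper for producing one from a local file with only the standard library (the function name and file path are illustrative):

import base64

def to_data_uri(path: str, media_type: str = "image/png") -> str:
    # Build a "data:<media type>;base64,<payload>" URI like the one hard-coded above.
    with open(path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("ascii")
    return f"data:{media_type};base64,{payload}"

# Example: to_data_uri("pixel.png") would yield "data:image/png;base64,iVBOR..."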


def test_responses_api_with_image_input():
"""Test that LM with model_type='responses' handles Image inputs correctly."""
api_response = make_response(
output_blocks=[
ResponseOutputMessage(
**{
"id": "msg_1",
"type": "message",
"role": "assistant",
"status": "completed",
"content": [
{"type": "output_text", "text": "This is a test answer with image input.", "annotations": []}
],
},
),
]
)

with mock.patch("litellm.responses", autospec=True, return_value=api_response) as dspy_responses:
lm = dspy.LM(
model="openai/gpt-5-mini",
model_type="responses",
cache=False,
temperature=1.0,
max_tokens=16000,
)

# Test with messages containing an image
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image"},
{
"type": "image_url",
"image_url": {
"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
}
}
]
}
]

lm_result = lm(messages=messages)

assert lm_result == [{"text": "This is a test answer with image input."}]

dspy_responses.assert_called_once()
call_args = dspy_responses.call_args.kwargs

# Verify the request was converted correctly
assert "input" in call_args
content = call_args["input"][0]["content"]

# Check that image was converted to input_image format
image_content = [c for c in content if c.get("type") == "input_image"]
assert len(image_content) == 1
assert image_content[0]["source"]["type"] == "base64"
assert image_content[0]["source"]["media_type"] == "image/png"
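For completeness, the request shape exercised by the mocked test above could be run against a real backend roughly as follows; a hedged sketch that assumes a valid OpenAI API key, network access, availability of the model named in the tests, and a reachable image URL (the one below is a placeholder):

import dspy

lm = dspy.LM(model="openai/gpt-5-mini", model_type="responses", cache=False)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }
]
print(lm(messages=messages))  # a list of outputs, e.g. [{"text": "..."}] per the mocked test above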