AstrBotDevs · Sjshi763 · Jan 9, 2026 · Jan 17, 2026 · Jan 17, 2026 · Jan 18, 2026
diff --git a/astrbot/core/message/components.py b/astrbot/core/message/components.py
@@ -528,6 +528,13 @@ class Reply(BaseMessageComponent):
     def __init__(self, **_):
         super().__init__(**_)
 
+    async def to_dict(self) -> dict:
+        """Serialize the reply; a missing chain becomes an empty list."""
+        parts = [] if self.chain is None else self.chain
+        chain_data = [await part.to_dict() for part in parts]
+        payload = {"id": self.id, "chain": chain_data}
+        return {"type": self.type.lower(), "data": payload}
+
 
 class Poke(BaseMessageComponent):
     type: str = ComponentType.Poke
@@ -630,11 +637,30 @@ async def to_dict(self) -> dict:
 class Json(BaseMessageComponent):
     type = ComponentType.Json
     data: dict
+    raw_data: str | None = None  # original string input; None when built from a dict
 
     def __init__(self, data: str | dict, **_):
+        raw_data = None
         if isinstance(data, str):
-            data = json.loads(data)
-        super().__init__(data=data, **_)
+            raw_data = data
+            try:
+                data = json.loads(data)
+            except json.JSONDecodeError:
+                data = {"raw": data}  # not valid JSON: keep the text under "raw"
+        super().__init__(data=data, raw_data=raw_data, **_)
+
+    async def to_dict(self) -> dict:
+        # Built from a string: emit that string unchanged, wrapped under "content"
+        if self.raw_data is not None:
+            return {
+                "type": self.type.lower(),
+                "data": {"content": self.raw_data},
+            }
+        # Built from a dict: emit the dict itself as the payload
+        return {
+            "type": self.type.lower(),
+            "data": self.data,
+        }
 
 
 class Unknown(BaseMessageComponent):

diff --git a/astrbot/core/provider/sources/openai_source.py b/astrbot/core/provider/sources/openai_source.py
@@ -235,6 +235,31 @@ def _extract_usage(self, usage: CompletionUsage) -> TokenUsage:
             output=completion_tokens,
         )
 
+    def _parse_image_url_part(self, image_field) -> str | None:
+        """Extract the image reference from an OpenAI ``image_url`` part.
+
+        Args:
+            image_field: The ``image_url`` value: a dict carrying a ``url``
+                key, or the URL string itself.
+
+        Returns:
+            An http(s) URL unchanged; the bare payload for ``...base64,``
+            or ``base64://`` input; None when the value is empty or is
+            not a string.
+        """
+        url = image_field.get("url") if isinstance(image_field, dict) else image_field
+        # The declared return type is str | None, so reject anything else.
+        if not url or not isinstance(url, str):
+            return None
+        # A remote URL may contain "base64," in its path or query; keep it whole.
+        if url.startswith(("http://", "https://")):
+            return url
+        # Data-URI style: keep only what follows the first "base64," marker.
+        if "base64," in url:
+            return url.split("base64,", 1)[1]
+        # Strip just the leading scheme rather than every occurrence.
+        return url.removeprefix("base64://")
+
     async def _parse_openai_completion(
         self, completion: ChatCompletion, tools: ToolSet | None
     ) -> LLMResponse:
@@ -247,18 +272,56 @@ async def _parse_openai_completion(
 
         # parse the text completion
         if choice.message.content is not None:
-            # text completion
-            completion_text = str(choice.message.content).strip()
-            # specially, some providers may set <think> tags around reasoning content in the completion text,
-            # we use regex to remove them, and store then in reasoning_content field
-            reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
-            matches = reasoning_pattern.findall(completion_text)
-            if matches:
-                llm_response.reasoning_content = "\n".join(
-                    [match.strip() for match in matches],
-                )
-                completion_text = reasoning_pattern.sub("", completion_text).strip()
-            llm_response.result_chain = MessageChain().message(completion_text)
+            # content can be either a plain string or a multimodal list
+            content = choice.message.content
+            # handle multimodal content returned as a list of parts
+            if isinstance(content, list):
+                reasoning_parts = []
+                mc = MessageChain()
+                for part in content:
+                    if not isinstance(part, dict):
+                        # fallback: append as plain text
+                        mc.message(str(part))
+                        continue
+                    ptype = part.get("type")
+                    if ptype == "text":
+                        mc.message(part.get("text", ""))
+                    elif ptype == "image_url":
+                        image_field = part.get("image_url")
+                        url = self._parse_image_url_part(image_field)
+                        if url:
+                            # 判断是 base64 数据还是 URL
+                            if url.startswith("http"):
+                                mc.url_image(url)
+                            else:
+                                mc.base64_image(url)
+                    elif ptype == "think":
+                        # collect reasoning parts for later extraction
+                        think_val = part.get("think")
+                        if think_val:
+                            reasoning_parts.append(str(think_val))
+                    else:
+                        # unknown part type, append its textual representation
+                        mc.message(json.dumps(part, ensure_ascii=False))
+
+                if reasoning_parts:
+                    llm_response.reasoning_content = "\n".join(
+                        [rp.strip() for rp in reasoning_parts]
+                    )
+                llm_response.result_chain = mc
+            else:
+                # text completion (string)
+                completion_text = str(content).strip()
+                # specially, some providers may set <think> tags around reasoning content in the completion text,
+                # we use regex to remove them, and store then in reasoning_content field
+                reasoning_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
+                matches = reasoning_pattern.findall(completion_text)
+                if matches:
+                    llm_response.reasoning_content = "\n".join(
+                        [match.strip() for match in matches],
+                    )
+                    completion_text = reasoning_pattern.sub("", completion_text).strip()
+                llm_response.result_chain = MessageChain().message(completion_text)
 
         # parse the reasoning content if any
         # the priority is higher than the <think> tag extraction