From 531d785629461beaef298717f13a2939e9bb1747 Mon Sep 17 00:00:00 2001
From: Timebomb2018 <18868801967@163.com>
Date: Mon, 27 Apr 2026 17:56:58 +0800
Subject: [PATCH] fix(multimodal): support HTML image tags in document
extraction and chat responses
- Replace plain image URLs with `<img src="...">` HTML tags in multimodal and document extractor services
- Propagate citations from workflow end events to client responses
- Update system prompts to instruct LLMs to render images using Markdown `![alt](url)`, copying the URL from the `src` attribute verbatim so the UUID is preserved
---
api/app/controllers/service/app_api_controller.py | 2 +-
.../core/workflow/nodes/document_extractor/node.py | 2 +-
api/app/services/app_chat_service.py | 10 ++++++++--
api/app/services/draft_run_service.py | 10 ++++++++--
api/app/services/multimodal_service.py | 2 +-
api/app/services/workflow_service.py | 13 ++++++++-----
6 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/api/app/controllers/service/app_api_controller.py b/api/app/controllers/service/app_api_controller.py
index 93e88dc5..c2755bdc 100644
--- a/api/app/controllers/service/app_api_controller.py
+++ b/api/app/controllers/service/app_api_controller.py
@@ -296,7 +296,7 @@ async def chat(
}
)
- # 多 Agent 非流式返回
+ # workflow 非流式返回
result = await app_chat_service.workflow_chat(
message=payload.message,
diff --git a/api/app/core/workflow/nodes/document_extractor/node.py b/api/app/core/workflow/nodes/document_extractor/node.py
index ea1070f4..5fefbc94 100644
--- a/api/app/core/workflow/nodes/document_extractor/node.py
+++ b/api/app/core/workflow/nodes/document_extractor/node.py
@@ -182,7 +182,7 @@ class DocExtractorNode(BaseNode):
mime_type=f"image/{ext}",
is_file=True,
).model_dump())
- text = text + f"\n{placeholder}: {url}"
+ text = text + f"\n{placeholder}:
"
except Exception as e:
logger.error(f"Node {self.node_id}: failed to save image {placeholder}: {e}")
diff --git a/api/app/services/app_chat_service.py b/api/app/services/app_chat_service.py
index 12f54c03..cc2b02f1 100644
--- a/api/app/services/app_chat_service.py
+++ b/api/app/services/app_chat_service.py
@@ -161,7 +161,10 @@ class AppChatService:
f.type == FileType.DOCUMENT for f in files
):
system_prompt += (
- "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]: http://...,请在回答中用 Markdown 格式  展示对应图片。"
+ "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]:
,"
+ "请在回答中用 Markdown 格式  展示对应图片。"
+ "重要:图片 URL 中包含 UUID(如 /storage/permanent/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx),"
+ "必须将 src 属性的值原封不动复制到 Markdown 的括号中,不得增删任何字符。"
)
# 创建 LangChain Agent
@@ -448,7 +451,10 @@ class AppChatService:
):
from langchain.agents import create_agent
system_prompt += (
- "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]: http://...,请在回答中用 Markdown 格式  展示对应图片。"
+ "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]:
,"
+ "请在回答中用 Markdown 格式  展示对应图片。"
+ "重要:图片 URL 中包含 UUID(如 /storage/permanent/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx),"
+ "必须将 src 属性的值原封不动复制到 Markdown 的括号中,不得增删任何字符。"
)
# 创建 LangChain Agent
diff --git a/api/app/services/draft_run_service.py b/api/app/services/draft_run_service.py
index 2566a50f..16d856ca 100644
--- a/api/app/services/draft_run_service.py
+++ b/api/app/services/draft_run_service.py
@@ -650,7 +650,10 @@ class AgentRunService:
)
if has_doc_with_images:
system_prompt += (
- "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]: http://...,请在回答中用 Markdown 格式  展示对应图片。"
+ "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]:
,"
+ "请在回答中用 Markdown 格式  展示对应图片。"
+ "重要:图片 URL 中包含 UUID(如 /storage/permanent/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx),"
+ "必须将 src 属性的值原封不动复制到 Markdown 的括号中,不得增删任何字符。"
)
agent = LangChainAgent(
@@ -924,7 +927,10 @@ class AgentRunService:
)
if has_doc_with_images:
system_prompt += (
- "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]: http://...,请在回答中用 Markdown 格式  展示对应图片。"
+ "\n\n文档文字中包含图片位置标记如 [图片 第2页 第1张]:
,"
+ "请在回答中用 Markdown 格式  展示对应图片。"
+ "重要:图片 URL 中包含 UUID(如 /storage/permanent/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx),"
+ "必须将 src 属性的值原封不动复制到 Markdown 的括号中,不得增删任何字符。"
)
# 创建 LangChain Agent
diff --git a/api/app/services/multimodal_service.py b/api/app/services/multimodal_service.py
index c362158c..dd021357 100644
--- a/api/app/services/multimodal_service.py
+++ b/api/app/services/multimodal_service.py
@@ -400,7 +400,7 @@ class MultimodalService:
# 在文本内容中追加图片位置标记
if result and result[-1].get("type") in ("text", "document"):
key = "text" if "text" in result[-1] else list(result[-1].keys())[-1]
- result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]: {img_url}"
+ result[-1][key] = result[-1].get(key, "") + f"\n[图片 {placeholder}]:
"
# 将图片以视觉格式追加到消息内容中
img_file = FileInput(
type=FileType.IMAGE,
diff --git a/api/app/services/workflow_service.py b/api/app/services/workflow_service.py
index b35656d9..27327e99 100644
--- a/api/app/services/workflow_service.py
+++ b/api/app/services/workflow_service.py
@@ -554,13 +554,16 @@ class WorkflowService:
}
}
case "workflow_end":
+ data = {
+ "elapsed_time": payload.get("elapsed_time"),
+ "message_length": len(payload.get("output", "")),
+ "error": payload.get("error", "")
+ }
+ if "citations" in payload and payload["citations"]:
+ data["citations"] = payload["citations"]
return {
"event": "end",
- "data": {
- "elapsed_time": payload.get("elapsed_time"),
- "message_length": len(payload.get("output", "")),
- "error": payload.get("error", "")
- }
+ "data": data
}
case "node_start" | "node_end" | "node_error" | "cycle_item":
return None