Merge pull request #1012 from SuanmoSuanyangTechnology/fix/wxy-032

feat(workflow): augment logging queries and ameliorate error handling
2026-04-27 16:00:49 +08:00
parent c12d06bb07 adb7f873b5
commit ce4a3daec7
7 changed files with 126 additions and 19 deletions
--- a/api/app/core/workflow/executor.py
+++ b/api/app/core/workflow/executor.py
@@ -16,6 +16,7 @@ from app.core.workflow.engine.runtime_schema import ExecutionContext
 from app.core.workflow.engine.state_manager import WorkflowStateManager
 from app.core.workflow.engine.stream_output_coordinator import StreamOutputCoordinator
 from app.core.workflow.engine.variable_pool import VariablePool, VariablePoolInitializer
+from app.core.workflow.nodes.base_node import NodeExecutionError

 logger = logging.getLogger(__name__)

@@ -326,10 +327,43 @@ class WorkflowExecutor:

            logger.error(f"Workflow execution failed: execution_id={self.execution_context.execution_id}, error={e}",
                         exc_info=True)
+
+            # 1) 尝试从 checkpoint 回补已成功节点的 node_outputs
+            recovered: dict[str, Any] = {}
+            try:
+                if self.graph is not None:
+                    recovered = self.graph.get_state(
+                        self.execution_context.checkpoint_config
+                    ).values or {}
+            except Exception as recover_err:
+                logger.warning(
+                    f"Recover state on failure failed: {recover_err}, "
+                    f"execution_id={self.execution_context.execution_id}"
+                )
+
            if result is None:
-                result = {"error": str(e)}
+                result = dict(recovered) if recovered else {}
            else:
-                result["error"] = str(e)
+                # 已有 result 与 recovered 合并，node_outputs 深度合并
+                for k, v in recovered.items():
+                    if k == "node_outputs" and isinstance(v, dict):
+                        existing = result.get("node_outputs") or {}
+                        result["node_outputs"] = {**v, **existing}
+                    else:
+                        result.setdefault(k, v)
+
+            # 2) 如果是节点抛出的 NodeExecutionError，把失败节点的 node_output 注入 node_outputs
+            failed_node_id: str | None = None
+            if isinstance(e, NodeExecutionError):
+                failed_node_id = e.node_id
+                node_outputs = result.setdefault("node_outputs", {})
+                # 不覆盖已有（理论上不会有），保底写入失败节点记录
+                node_outputs.setdefault(e.node_id, e.node_output)
+
+            result["error"] = str(e)
+            if failed_node_id:
+                result["error_node"] = failed_node_id
+
            yield {
                "event": "workflow_end",
                "data": self.result_builder.build_final_output(
--- a/api/app/core/workflow/nodes/base_node.py
+++ b/api/app/core/workflow/nodes/base_node.py
@@ -1,5 +1,6 @@
 import asyncio
 import logging
+import time
 import uuid
 from abc import ABC, abstractmethod
 from datetime import datetime
@@ -22,6 +23,20 @@ from app.services.multimodal_service import MultimodalService
 logger = logging.getLogger(__name__)


+class NodeExecutionError(Exception):
+    """节点执行失败异常。
+
+    携带失败节点的完整 node_output，供 executor 兜底注入 node_outputs，
+    保证 workflow_executions.output_data 里能看到失败节点的日志记录。
+    """
+
+    def __init__(self, node_id: str, node_output: dict[str, Any], error_message: str):
+        super().__init__(f"Node {node_id} execution failed: {error_message}")
+        self.node_id = node_id
+        self.node_output = node_output
+        self.error_message = error_message
+
+
 class BaseNode(ABC):
    """Base class for workflow nodes.

@@ -396,6 +411,8 @@ class BaseNode(ABC):
            "elapsed_time": elapsed_time,
            "token_usage": token_usage,
            "error": None,
+            # 单调递增序号，用于日志按执行顺序排序（JSONB 不保证 key 顺序）
+            "execution_order": time.monotonic_ns(),
            **self._extract_extra_fields(business_result),
        }
        final_output = {
@@ -444,7 +461,9 @@ class BaseNode(ABC):
            "output": None,
            "elapsed_time": elapsed_time,
            "token_usage": None,
-            "error": error_message
+            "error": error_message,
+            # 单调递增序号，用于日志按执行顺序排序
+            "execution_order": time.monotonic_ns(),
        }

        # if error_edge:
@@ -466,7 +485,12 @@ class BaseNode(ABC):
            **node_output
        })
        logger.error(f"Node {self.node_id} execution failed, stopping workflow: {error_message}")
-        raise Exception(f"Node {self.node_id} execution failed: {error_message}")
+        # 抛出自定义异常，把 node_output 带给 executor，供其写入 node_outputs
+        raise NodeExecutionError(
+            node_id=self.node_id,
+            node_output=node_output,
+            error_message=error_message,
+        )

    def _extract_input(self, state: WorkflowState, variable_pool: VariablePool) -> dict[str, Any]:
        """Extracts the input data for this node (used for logging or audit).
--- a/api/app/core/workflow/nodes/cycle_graph/iteration.py
+++ b/api/app/core/workflow/nodes/cycle_graph/iteration.py
@@ -174,6 +174,9 @@ class IterationRuntime:
                            continue
                        node_type = result.get("node_outputs", {}).get(node_name, {}).get("node_type")
                        cycle_variable = {"item": item} if node_type == NodeType.CYCLE_START else None
+                        node_cfg = next(
+                            (n for n in self.cycle_nodes if n.get("id") == node_name), None
+                        )
                        self.event_write({
                            "type": "cycle_item",
                            "data": {
--- a/api/app/core/workflow/nodes/http_request/node.py
+++ b/api/app/core/workflow/nodes/http_request/node.py
@@ -255,9 +255,18 @@ class HttpRequestNode(BaseNode):
            case HttpContentType.NONE:
                return {}
            case HttpContentType.JSON:
-                content["json"] = json.loads(self._render_template(
+                rendered = self._render_template(
                    self.typed_config.body.data, variable_pool
-                ))
+                )
+                if not rendered or not rendered.strip():
+                    # 第三方导入的工作流可能出现 content_type=json 但 data 为空的情况，视为无 body
+                    return {}
+                try:
+                    content["json"] = json.loads(rendered)
+                except json.JSONDecodeError as e:
+                    raise RuntimeError(
+                        f"Invalid JSON body for HTTP request node: {e.msg} (data={rendered!r})"
+                    )
            case HttpContentType.FROM_DATA:
                data = {}
                files = []