From b7ea362efc3c6d398a1e0601e9cf8c1a727beda3 Mon Sep 17 00:00:00 2001 From: OG T Date: Tue, 7 Apr 2026 13:13:42 +0800 Subject: [PATCH] =?UTF-8?q?fix(api):=20Review=20#2=20=E6=8A=80=E8=A1=93?= =?UTF-8?q?=E5=82=B5=E6=B8=85=E7=90=86=20=E2=80=94=20I1/S1/S2/S3=20?= =?UTF-8?q?=E5=85=A8=E6=95=B8=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I1: error_type 欄位補全 - AnomalyCounter.derive_key_from_incident() 新增 從 signal.labels 提取 reason/error_type,確保四欄位完整 S1: 三處 signature 建構邏輯統一 - auto_repair_service._derive_anomaly_key() → 委託 derive_key_from_incident() - approval_execution._get_anomaly_key_from_approval() → 同上 - incident_service.resolve_incident() B4 → 同上 - 消除 3 處重複的 signature 建構程式碼 S2: Redis Pipeline 批次查詢 - get_all_disposition_stats() 從 N+1 hgetall 改為 2 次 Pipeline - Pipeline 1: 批次 hgetall 所有 disposition key - Pipeline 2: 批次 hget metadata (alert_name) - 效能從 O(2N) Redis round-trip 降至 O(2) S3: auto_repair.py get_incident AttributeError 修復 - get_incident() → get_from_working_memory() (pre-existing bug) Co-Authored-By: Claude Haiku 4.5 --- apps/api/src/api/v1/auto_repair.py | 4 +- apps/api/src/services/anomaly_counter.py | 51 ++++++++++++++++++-- apps/api/src/services/approval_execution.py | 19 ++------ apps/api/src/services/auto_repair_service.py | 17 ++----- apps/api/src/services/incident_service.py | 14 ++---- 5 files changed, 58 insertions(+), 47 deletions(-) diff --git a/apps/api/src/api/v1/auto_repair.py b/apps/api/src/api/v1/auto_repair.py index 926dcfb0..46b55781 100644 --- a/apps/api/src/api/v1/auto_repair.py +++ b/apps/api/src/api/v1/auto_repair.py @@ -81,7 +81,7 @@ async def evaluate_auto_repair(incident_id: str) -> EvaluateResponse: """ # 取得 Incident incident_service = get_incident_service() - incident = await incident_service.get_incident(incident_id) + incident = await incident_service.get_from_working_memory(incident_id) if not incident: raise HTTPException( @@ -116,7 +116,7 @@ async def execute_auto_repair(request: ExecuteRequest) -> ExecuteResponse: """ # 取得 Incident incident_service = get_incident_service() - incident = await incident_service.get_incident(request.incident_id) + incident = await incident_service.get_from_working_memory(request.incident_id) if not incident: raise HTTPException( diff --git a/apps/api/src/services/anomaly_counter.py b/apps/api/src/services/anomaly_counter.py index 259298d7..92f9c5a4 100644 --- a/apps/api/src/services/anomaly_counter.py +++ b/apps/api/src/services/anomaly_counter.py @@ -118,6 +118,30 @@ class AnomalyCounter: def __init__(self, redis_client: redis.Redis) -> None: self.redis = redis_client + @staticmethod + def derive_key_from_incident(incident) -> str | None: + """ + 從 Incident 推導 anomaly_key,統一使用 hash_signature() 四欄位。 + 2026-04-07 Claude Code: I1+S1 Fix — 統一推導邏輯,補全 error_type + + Args: + incident: Incident model (需有 signals, affected_services) + + Returns: + anomaly_key (16 char hex) or None + """ + if not incident.signals: + return None + signal = incident.signals[0] + labels = signal.labels or {} + signature = { + "alert_name": signal.alert_name, + "service": incident.affected_services[0] if incident.affected_services else "", + "namespace": labels.get("namespace", ""), + "error_type": labels.get("reason", labels.get("error_type", "")), + } + return AnomalyCounter.hash_signature(signature) + @staticmethod def hash_signature(signature: dict) -> str: """ @@ -446,15 +470,34 @@ class AnomalyCounter: by_anomaly: list[dict] = [] try: + # S2 Fix: 使用 Pipeline 批次查詢,消除 N+1 問題 pattern = f"{self.PREFIX_DISPOSITION}*" keys: list = [] async for key in self.redis.scan_iter(match=pattern, count=100): keys.append(key) + if not keys: + return total_summary, by_anomaly + + # Pipeline 1: 批次取得所有 disposition hash + pipe = self.redis.pipeline(transaction=False) + for key in keys: + pipe.hgetall(key) + results = await pipe.execute() + + # Pipeline 2: 批次取得所有 metadata (alert_name) + anomaly_keys_str = [] for key in keys: - raw = await self.redis.hgetall(key) key_str = key.decode() if isinstance(key, bytes) else key - anomaly_key = key_str.replace(self.PREFIX_DISPOSITION, "") + anomaly_keys_str.append(key_str.replace(self.PREFIX_DISPOSITION, "")) + + meta_pipe = self.redis.pipeline(transaction=False) + for ak in anomaly_keys_str: + meta_pipe.hget(f"{self.PREFIX_METADATA}{ak}", "signature") + meta_results = await meta_pipe.execute() + + for i, raw in enumerate(results): + anomaly_key = anomaly_keys_str[i] d = { "auto_repair": int(raw.get(b"auto_repair", raw.get("auto_repair", 0))), @@ -467,11 +510,9 @@ class AnomalyCounter: for k in total_summary: total_summary[k] += d[k] - # 嘗試取得 alert_name alert_name = "" try: - meta_key = f"{self.PREFIX_METADATA}{anomaly_key}" - meta_raw = await self.redis.hget(meta_key, "signature") + meta_raw = meta_results[i] if meta_raw: sig = json.loads(meta_raw) alert_name = sig.get("alert_name", "") diff --git a/apps/api/src/services/approval_execution.py b/apps/api/src/services/approval_execution.py index c47195fa..28a558a5 100644 --- a/apps/api/src/services/approval_execution.py +++ b/apps/api/src/services/approval_execution.py @@ -219,11 +219,8 @@ class ApprovalExecutionService: async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None: """ - 從 approval → incident → anomaly_signature → hash。 - 2026-04-07 Claude Code: Sprint 4 B3 - - Returns: - anomaly_key or None if not derivable + 從 approval → incident → anomaly_key。 + 2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident() """ try: if not approval.incident_id: @@ -231,18 +228,10 @@ class ApprovalExecutionService: from src.services.incident_service import get_incident_service incident_service = get_incident_service() incident = await incident_service.get_from_working_memory(approval.incident_id) - if not incident or not incident.signals: + if not incident: return None - # 從第一個 signal 建立 anomaly signature - # P0-1 Fix: namespace 從 signal.labels 取,非 getattr - signal = incident.signals[0] - signature = { - "alert_name": signal.alert_name, - "service": incident.affected_services[0] if incident.affected_services else "", - "namespace": signal.labels.get("namespace", "") if signal.labels else "", - } from src.services.anomaly_counter import AnomalyCounter - return AnomalyCounter.hash_signature(signature) + return AnomalyCounter.derive_key_from_incident(incident) except Exception as e: logger.warning("get_anomaly_key_from_approval_failed", error=str(e)) return None diff --git a/apps/api/src/services/auto_repair_service.py b/apps/api/src/services/auto_repair_service.py index 61338b84..4e3cce8f 100644 --- a/apps/api/src/services/auto_repair_service.py +++ b/apps/api/src/services/auto_repair_service.py @@ -485,22 +485,11 @@ class AutoRepairService: @staticmethod def _derive_anomaly_key(incident: Incident) -> str | None: """ - 從 Incident 推導 anomaly_key,統一使用 AnomalyCounter.hash_signature()。 - 2026-04-07 Claude Code: P0-1 Fix — 統一 hash 演算法 - - Returns: - anomaly_key or None if not derivable + 從 Incident 推導 anomaly_key。 + 2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident() """ - if not incident.signals: - return None - signal = incident.signals[0] - signature = { - "alert_name": signal.alert_name, - "service": incident.affected_services[0] if incident.affected_services else "", - "namespace": signal.labels.get("namespace", "") if signal.labels else "", - } from src.services.anomaly_counter import AnomalyCounter - return AnomalyCounter.hash_signature(signature) + return AnomalyCounter.derive_key_from_incident(incident) def _extract_symptoms(self, incident: Incident) -> SymptomPattern: """從 Incident 提取症狀模式""" diff --git a/apps/api/src/services/incident_service.py b/apps/api/src/services/incident_service.py index 106bc4e2..9e0c6ef7 100644 --- a/apps/api/src/services/incident_service.py +++ b/apps/api/src/services/incident_service.py @@ -693,21 +693,13 @@ class IncidentService: logger.exception("kb_extract_task_create_failed", incident_id=incident_id) # 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷 - # 若 resolved 但沒有系統修復紀錄 → manual_resolved + # I1+S1 Fix: 委託 derive_key_from_incident() 統一推導 try: from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter counter = get_anomaly_counter() - if incident.signals: - # P0-1 Fix: namespace 從 signal.labels 取 - signal = incident.signals[0] - signature = { - "alert_name": signal.alert_name, - "service": incident.affected_services[0] if incident.affected_services else "", - "namespace": signal.labels.get("namespace", "") if signal.labels else "", - } - anomaly_key = AnomalyCounter.hash_signature(signature) + anomaly_key = AnomalyCounter.derive_key_from_incident(incident) + if anomaly_key: disposition = await counter.get_disposition_stats(anomaly_key) - # 排除法: 如果已有 auto/human/cold_start 紀錄,代表系統已處理 has_system_resolution = ( disposition["auto_repair"] > 0 or disposition["human_approved"] > 0