fix(api): Review #2 技術債清理 — I1/S1/S2/S3 全數修正
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 12m13s

I1: error_type 欄位補全
- AnomalyCounter.derive_key_from_incident() 新增
  從 signal.labels 提取 error_type(優先取 reason 標籤,缺少時改取 error_type 標籤),確保四欄位完整

S1: 三處 signature 建構邏輯統一
- auto_repair_service._derive_anomaly_key() → 委託 derive_key_from_incident()
- approval_execution._get_anomaly_key_from_approval() → 同上
- incident_service.resolve_incident() B4 → 同上
- 消除 3 處重複的 signature 建構程式碼

S2: Redis Pipeline 批次查詢
- get_all_disposition_stats() 從逐 key 查詢(每個 key 各一次 hgetall 與 hget 的 N+1 問題)改為 2 次 Pipeline
- Pipeline 1: 批次 hgetall 所有 disposition key
- Pipeline 2: 批次 hget metadata (alert_name)
- Redis round-trip 由約 2N 次(N 次 hgetall + N 次 hget)降至 2 次 Pipeline 執行(SCAN 迭代不計入)

S3: auto_repair.py get_incident AttributeError 修復
- get_incident() → get_from_working_memory() (pre-existing bug)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-07 13:13:42 +08:00
parent b20a619a3d
commit b7ea362efc
5 changed files with 58 additions and 47 deletions

View File

@@ -81,7 +81,7 @@ async def evaluate_auto_repair(incident_id: str) -> EvaluateResponse:
"""
# 取得 Incident
incident_service = get_incident_service()
incident = await incident_service.get_incident(incident_id)
incident = await incident_service.get_from_working_memory(incident_id)
if not incident:
raise HTTPException(
@@ -116,7 +116,7 @@ async def execute_auto_repair(request: ExecuteRequest) -> ExecuteResponse:
"""
# 取得 Incident
incident_service = get_incident_service()
incident = await incident_service.get_incident(request.incident_id)
incident = await incident_service.get_from_working_memory(request.incident_id)
if not incident:
raise HTTPException(

View File

@@ -118,6 +118,30 @@ class AnomalyCounter:
def __init__(self, redis_client: redis.Redis) -> None:
self.redis = redis_client
@staticmethod
def derive_key_from_incident(incident) -> str | None:
    """
    Derive the anomaly_key for an Incident from its first signal.

    Builds the four-field signature (alert_name, service, namespace,
    error_type) and passes it to AnomalyCounter.hash_signature(), so that
    every caller derives the key in exactly the same way.

    2026-04-07 Claude Code: I1+S1 Fix - unified derivation logic and added
    the error_type field.

    Args:
        incident: Incident model; must expose ``signals`` and
            ``affected_services``.

    Returns:
        The anomaly_key produced by hash_signature() (described upstream
        as a 16-character hex string), or None when the incident has no
        signals.
    """
    signals = incident.signals
    if not signals:
        return None

    first_signal = signals[0]
    labels = first_signal.labels or {}
    services = incident.affected_services

    # "reason" wins whenever that label key is present (even if its value
    # is empty); "error_type" is used only when "reason" is absent.
    if "reason" in labels:
        error_type = labels["reason"]
    else:
        error_type = labels.get("error_type", "")

    return AnomalyCounter.hash_signature(
        {
            "alert_name": first_signal.alert_name,
            "service": services[0] if services else "",
            "namespace": labels.get("namespace", ""),
            "error_type": error_type,
        }
    )
@staticmethod
def hash_signature(signature: dict) -> str:
"""
@@ -446,15 +470,34 @@ class AnomalyCounter:
by_anomaly: list[dict] = []
try:
# S2 Fix: 使用 Pipeline 批次查詢,消除 N+1 問題
pattern = f"{self.PREFIX_DISPOSITION}*"
keys: list = []
async for key in self.redis.scan_iter(match=pattern, count=100):
keys.append(key)
if not keys:
return total_summary, by_anomaly
# Pipeline 1: 批次取得所有 disposition hash
pipe = self.redis.pipeline(transaction=False)
for key in keys:
pipe.hgetall(key)
results = await pipe.execute()
# Pipeline 2: 批次取得所有 metadata (alert_name)
anomaly_keys_str = []
for key in keys:
raw = await self.redis.hgetall(key)
key_str = key.decode() if isinstance(key, bytes) else key
anomaly_key = key_str.replace(self.PREFIX_DISPOSITION, "")
anomaly_keys_str.append(key_str.replace(self.PREFIX_DISPOSITION, ""))
meta_pipe = self.redis.pipeline(transaction=False)
for ak in anomaly_keys_str:
meta_pipe.hget(f"{self.PREFIX_METADATA}{ak}", "signature")
meta_results = await meta_pipe.execute()
for i, raw in enumerate(results):
anomaly_key = anomaly_keys_str[i]
d = {
"auto_repair": int(raw.get(b"auto_repair", raw.get("auto_repair", 0))),
@@ -467,11 +510,9 @@ class AnomalyCounter:
for k in total_summary:
total_summary[k] += d[k]
# 嘗試取得 alert_name
alert_name = ""
try:
meta_key = f"{self.PREFIX_METADATA}{anomaly_key}"
meta_raw = await self.redis.hget(meta_key, "signature")
meta_raw = meta_results[i]
if meta_raw:
sig = json.loads(meta_raw)
alert_name = sig.get("alert_name", "")

View File

@@ -219,11 +219,8 @@ class ApprovalExecutionService:
async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
"""
從 approval → incident → anomaly_signature → hash
2026-04-07 Claude Code: Sprint 4 B3
Returns:
anomaly_key or None if not derivable
從 approval → incident → anomaly_key
2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident()
"""
try:
if not approval.incident_id:
@@ -231,18 +228,10 @@ class ApprovalExecutionService:
from src.services.incident_service import get_incident_service
incident_service = get_incident_service()
incident = await incident_service.get_from_working_memory(approval.incident_id)
if not incident or not incident.signals:
if not incident:
return None
# 從第一個 signal 建立 anomaly signature
# P0-1 Fix: namespace 從 signal.labels 取,非 getattr
signal = incident.signals[0]
signature = {
"alert_name": signal.alert_name,
"service": incident.affected_services[0] if incident.affected_services else "",
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
}
from src.services.anomaly_counter import AnomalyCounter
return AnomalyCounter.hash_signature(signature)
return AnomalyCounter.derive_key_from_incident(incident)
except Exception as e:
logger.warning("get_anomaly_key_from_approval_failed", error=str(e))
return None

View File

@@ -485,22 +485,11 @@ class AutoRepairService:
@staticmethod
def _derive_anomaly_key(incident: Incident) -> str | None:
"""
從 Incident 推導 anomaly_key,統一使用 AnomalyCounter.hash_signature()
2026-04-07 Claude Code: P0-1 Fix — 統一 hash 演算法
Returns:
anomaly_key or None if not derivable
從 Incident 推導 anomaly_key。
2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident()
"""
if not incident.signals:
return None
signal = incident.signals[0]
signature = {
"alert_name": signal.alert_name,
"service": incident.affected_services[0] if incident.affected_services else "",
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
}
from src.services.anomaly_counter import AnomalyCounter
return AnomalyCounter.hash_signature(signature)
return AnomalyCounter.derive_key_from_incident(incident)
def _extract_symptoms(self, incident: Incident) -> SymptomPattern:
"""從 Incident 提取症狀模式"""

View File

@@ -693,21 +693,13 @@ class IncidentService:
logger.exception("kb_extract_task_create_failed", incident_id=incident_id)
# 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷
# 若 resolved 但沒有系統修復紀錄 → manual_resolved
# I1+S1 Fix: 委託 derive_key_from_incident() 統一推導
try:
from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter
counter = get_anomaly_counter()
if incident.signals:
# P0-1 Fix: namespace 從 signal.labels 取
signal = incident.signals[0]
signature = {
"alert_name": signal.alert_name,
"service": incident.affected_services[0] if incident.affected_services else "",
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
}
anomaly_key = AnomalyCounter.hash_signature(signature)
anomaly_key = AnomalyCounter.derive_key_from_incident(incident)
if anomaly_key:
disposition = await counter.get_disposition_stats(anomaly_key)
# 排除法: 如果已有 auto/human/cold_start 紀錄,代表系統已處理
has_system_resolution = (
disposition["auto_repair"] > 0
or disposition["human_approved"] > 0