fix(api): Review #2 技術債清理 — I1/S1/S2/S3 全數修正
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 12m13s
All checks were successful
CD Pipeline / build-and-deploy (push) Successful in 12m13s
I1: error_type 欄位補全
- AnomalyCounter.derive_key_from_incident() 新增:從 signal.labels 提取 reason/error_type,確保 signature 四欄位 (alert_name / service / namespace / error_type) 完整

S1: 三處 signature 建構邏輯統一
- auto_repair_service._derive_anomaly_key() → 委託 derive_key_from_incident()
- approval_execution._get_anomaly_key_from_approval() → 同上
- incident_service.resolve_incident() B4 → 同上
- 消除 3 處重複的 signature 建構程式碼

S2: Redis Pipeline 批次查詢
- get_all_disposition_stats() 從 N+1 hgetall 改為 2 次 Pipeline
- Pipeline 1: 批次 hgetall 所有 disposition key
- Pipeline 2: 批次 hget metadata 的 signature 欄位 (用以解析 alert_name)
- 效能: Redis round-trip 從 2N 次降至 2 次 (不含 SCAN 取得 key 清單)

S3: auto_repair.py get_incident AttributeError 修復
- get_incident() → get_from_working_memory() (pre-existing bug)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -81,7 +81,7 @@ async def evaluate_auto_repair(incident_id: str) -> EvaluateResponse:
|
||||
"""
|
||||
# 取得 Incident
|
||||
incident_service = get_incident_service()
|
||||
incident = await incident_service.get_incident(incident_id)
|
||||
incident = await incident_service.get_from_working_memory(incident_id)
|
||||
|
||||
if not incident:
|
||||
raise HTTPException(
|
||||
@@ -116,7 +116,7 @@ async def execute_auto_repair(request: ExecuteRequest) -> ExecuteResponse:
|
||||
"""
|
||||
# 取得 Incident
|
||||
incident_service = get_incident_service()
|
||||
incident = await incident_service.get_incident(request.incident_id)
|
||||
incident = await incident_service.get_from_working_memory(request.incident_id)
|
||||
|
||||
if not incident:
|
||||
raise HTTPException(
|
||||
|
||||
@@ -118,6 +118,30 @@ class AnomalyCounter:
|
||||
def __init__(self, redis_client: redis.Redis) -> None:
|
||||
self.redis = redis_client
|
||||
|
||||
@staticmethod
def derive_key_from_incident(incident) -> str | None:
    """
    Derive the anomaly_key for an Incident from its first signal.

    Builds the four-field signature (alert_name, service, namespace,
    error_type) and delegates hashing to AnomalyCounter.hash_signature(),
    so that every caller derives the key the same way.

    2026-04-07 Claude Code: I1+S1 Fix — unified derivation, error_type added.

    Args:
        incident: Incident model; must expose ``signals`` and
            ``affected_services``.

    Returns:
        The value produced by hash_signature() (described by the original
        docstring as a 16-char hex string), or None when the incident has
        no signals.
    """
    if not incident.signals:
        return None

    signal = incident.signals[0]
    labels = signal.labels or {}
    services = incident.affected_services

    # dict.get(key, default) only uses the default when the key is ABSENT,
    # so a "reason" label that is present but empty/None used to mask a
    # populated "error_type". Fall back on truthiness instead so the fourth
    # signature field is filled whenever either label carries a value.
    error_type = labels.get("reason") or labels.get("error_type") or ""

    signature = {
        "alert_name": signal.alert_name,
        "service": services[0] if services else "",
        "namespace": labels.get("namespace", ""),
        "error_type": error_type,
    }
    return AnomalyCounter.hash_signature(signature)
|
||||
|
||||
@staticmethod
|
||||
def hash_signature(signature: dict) -> str:
|
||||
"""
|
||||
@@ -446,15 +470,34 @@ class AnomalyCounter:
|
||||
by_anomaly: list[dict] = []
|
||||
|
||||
try:
|
||||
# S2 Fix: 使用 Pipeline 批次查詢,消除 N+1 問題
|
||||
pattern = f"{self.PREFIX_DISPOSITION}*"
|
||||
keys: list = []
|
||||
async for key in self.redis.scan_iter(match=pattern, count=100):
|
||||
keys.append(key)
|
||||
|
||||
if not keys:
|
||||
return total_summary, by_anomaly
|
||||
|
||||
# Pipeline 1: 批次取得所有 disposition hash
|
||||
pipe = self.redis.pipeline(transaction=False)
|
||||
for key in keys:
|
||||
pipe.hgetall(key)
|
||||
results = await pipe.execute()
|
||||
|
||||
# Pipeline 2: 批次取得所有 metadata (alert_name)
|
||||
anomaly_keys_str = []
|
||||
for key in keys:
|
||||
raw = await self.redis.hgetall(key)
|
||||
key_str = key.decode() if isinstance(key, bytes) else key
|
||||
anomaly_key = key_str.replace(self.PREFIX_DISPOSITION, "")
|
||||
anomaly_keys_str.append(key_str.replace(self.PREFIX_DISPOSITION, ""))
|
||||
|
||||
meta_pipe = self.redis.pipeline(transaction=False)
|
||||
for ak in anomaly_keys_str:
|
||||
meta_pipe.hget(f"{self.PREFIX_METADATA}{ak}", "signature")
|
||||
meta_results = await meta_pipe.execute()
|
||||
|
||||
for i, raw in enumerate(results):
|
||||
anomaly_key = anomaly_keys_str[i]
|
||||
|
||||
d = {
|
||||
"auto_repair": int(raw.get(b"auto_repair", raw.get("auto_repair", 0))),
|
||||
@@ -467,11 +510,9 @@ class AnomalyCounter:
|
||||
for k in total_summary:
|
||||
total_summary[k] += d[k]
|
||||
|
||||
# 嘗試取得 alert_name
|
||||
alert_name = ""
|
||||
try:
|
||||
meta_key = f"{self.PREFIX_METADATA}{anomaly_key}"
|
||||
meta_raw = await self.redis.hget(meta_key, "signature")
|
||||
meta_raw = meta_results[i]
|
||||
if meta_raw:
|
||||
sig = json.loads(meta_raw)
|
||||
alert_name = sig.get("alert_name", "")
|
||||
|
||||
@@ -219,11 +219,8 @@ class ApprovalExecutionService:
|
||||
|
||||
async def _get_anomaly_key_from_approval(self, approval: ApprovalRequest) -> str | None:
|
||||
"""
|
||||
從 approval → incident → anomaly_signature → hash。
|
||||
2026-04-07 Claude Code: Sprint 4 B3
|
||||
|
||||
Returns:
|
||||
anomaly_key or None if not derivable
|
||||
從 approval → incident → anomaly_key。
|
||||
2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident()
|
||||
"""
|
||||
try:
|
||||
if not approval.incident_id:
|
||||
@@ -231,18 +228,10 @@ class ApprovalExecutionService:
|
||||
from src.services.incident_service import get_incident_service
|
||||
incident_service = get_incident_service()
|
||||
incident = await incident_service.get_from_working_memory(approval.incident_id)
|
||||
if not incident or not incident.signals:
|
||||
if not incident:
|
||||
return None
|
||||
# 從第一個 signal 建立 anomaly signature
|
||||
# P0-1 Fix: namespace 從 signal.labels 取,非 getattr
|
||||
signal = incident.signals[0]
|
||||
signature = {
|
||||
"alert_name": signal.alert_name,
|
||||
"service": incident.affected_services[0] if incident.affected_services else "",
|
||||
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
|
||||
}
|
||||
from src.services.anomaly_counter import AnomalyCounter
|
||||
return AnomalyCounter.hash_signature(signature)
|
||||
return AnomalyCounter.derive_key_from_incident(incident)
|
||||
except Exception as e:
|
||||
logger.warning("get_anomaly_key_from_approval_failed", error=str(e))
|
||||
return None
|
||||
|
||||
@@ -485,22 +485,11 @@ class AutoRepairService:
|
||||
@staticmethod
|
||||
def _derive_anomaly_key(incident: Incident) -> str | None:
|
||||
"""
|
||||
從 Incident 推導 anomaly_key,統一使用 AnomalyCounter.hash_signature()。
|
||||
2026-04-07 Claude Code: P0-1 Fix — 統一 hash 演算法
|
||||
|
||||
Returns:
|
||||
anomaly_key or None if not derivable
|
||||
從 Incident 推導 anomaly_key。
|
||||
2026-04-07 Claude Code: I1+S1 Fix — 委託 AnomalyCounter.derive_key_from_incident()
|
||||
"""
|
||||
if not incident.signals:
|
||||
return None
|
||||
signal = incident.signals[0]
|
||||
signature = {
|
||||
"alert_name": signal.alert_name,
|
||||
"service": incident.affected_services[0] if incident.affected_services else "",
|
||||
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
|
||||
}
|
||||
from src.services.anomaly_counter import AnomalyCounter
|
||||
return AnomalyCounter.hash_signature(signature)
|
||||
return AnomalyCounter.derive_key_from_incident(incident)
|
||||
|
||||
def _extract_symptoms(self, incident: Incident) -> SymptomPattern:
|
||||
"""從 Incident 提取症狀模式"""
|
||||
|
||||
@@ -693,21 +693,13 @@ class IncidentService:
|
||||
logger.exception("kb_extract_task_create_failed", incident_id=incident_id)
|
||||
|
||||
# 2026-04-07 Claude Code: Sprint 4 B4 — 手動處理推斷
|
||||
# 若 resolved 但沒有系統修復紀錄 → manual_resolved
|
||||
# I1+S1 Fix: 委託 derive_key_from_incident() 統一推導
|
||||
try:
|
||||
from src.services.anomaly_counter import AnomalyCounter, get_anomaly_counter
|
||||
counter = get_anomaly_counter()
|
||||
if incident.signals:
|
||||
# P0-1 Fix: namespace 從 signal.labels 取
|
||||
signal = incident.signals[0]
|
||||
signature = {
|
||||
"alert_name": signal.alert_name,
|
||||
"service": incident.affected_services[0] if incident.affected_services else "",
|
||||
"namespace": signal.labels.get("namespace", "") if signal.labels else "",
|
||||
}
|
||||
anomaly_key = AnomalyCounter.hash_signature(signature)
|
||||
anomaly_key = AnomalyCounter.derive_key_from_incident(incident)
|
||||
if anomaly_key:
|
||||
disposition = await counter.get_disposition_stats(anomaly_key)
|
||||
# 排除法: 如果已有 auto/human/cold_start 紀錄,代表系統已處理
|
||||
has_system_resolution = (
|
||||
disposition["auto_repair"] > 0
|
||||
or disposition["human_approved"] > 0
|
||||
|
||||
Reference in New Issue
Block a user