fix(api): Phase 18 P0 修復 - 全域熔斷 + Dry-run 驗證
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled

2026-03-31 首席架構師審查要求 (91/100 條件通過)

P0-1 修復: 全域自動修復熔斷 (ADR-040)
- 整合 check_global_repair_cooldown() 前置檢查
- 有狀態服務黑名單 (PostgreSQL/Redis/ClickHouse)
- 15 分鐘窗口內全域修復動作超過 5 次則凍結自動修復
- 成功修復後 record_global_repair_action()

P0-2 修復: Dry-run 驗證
- restart_deployment 前驗證 Deployment 存在
- delete_pod 前驗證 Pod 存在
- 驗證失敗立即返回，不執行危險操作

安全閉環:
全域熔斷 → 單資源冷卻 → Dry-run → 執行 → 記錄

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-03-31 12:23:02 +08:00
parent c7132a6f07
commit 138a56a432

View File

@@ -165,7 +165,36 @@ class FailureWatcherService(IFailureWatcher):
"analysis": analysis,
}
# Phase 18.3: 安全檢查 - 防止修復風暴
# =====================================================================
# P0-1 修復: 全域自動修復熔斷檢查 (ADR-040)
# 2026-03-31 首席架構師審查要求
# =====================================================================
from src.services.global_repair_cooldown import check_global_repair_cooldown
can_global_repair, global_reason = await check_global_repair_cooldown(
incident_id=audit_log_id,
affected_services=[target_resource],
)
if not can_global_repair:
logger.warning(
"global_repair_cooldown_blocked",
audit_log_id=audit_log_id,
target_resource=target_resource,
reason=global_reason,
)
# 強制升級為 CRITICAL，必須人工授權
risk_level = "CRITICAL"
result["risk_level"] = "CRITICAL"
result["next_action"] = "blocked_by_global_cooldown"
await self._request_human_approval(
audit_log_id=audit_log_id,
analysis=analysis,
reason=global_reason,
)
return result
# Phase 18.3: 單資源安全檢查 - 防止修復風暴
can_auto_repair = await self._check_repair_cooldown(
target_resource=target_resource,
namespace=failure_data.get("namespace", "awoooi"),
@@ -202,6 +231,10 @@ class FailureWatcherService(IFailureWatcher):
)
if success:
# P0-1 補充: 記錄全域修復動作 (ADR-040)
from src.services.global_repair_cooldown import record_global_repair_action
await record_global_repair_action()
# 推送揭露通知 (自動修復成功)
await self._push_repair_notification(
audit_log_id=audit_log_id,
@@ -316,14 +349,30 @@ class FailureWatcherService(IFailureWatcher):
# =====================================================================
# Phase 18.3: 實際執行 K8s 修復操作
# P0-2 修復: 加入 Dry-run 驗證 (首席架構師審查要求)
# =====================================================================
if "restart" in repair_strategy.lower() and resource_name:
from src.services.executor import get_executor
from src.services.executor import OperationType, get_executor
executor = get_executor()
# P0-2: Dry-run 驗證資源存在
if resource_type == "deployment":
dry_run = await executor.validate_action(
operation_type=OperationType.RESTART_DEPLOYMENT,
resource_name=resource_name,
namespace=namespace,
)
if not dry_run.passed:
logger.warning(
"auto_repair_dry_run_failed",
audit_log_id=audit_log_id,
resource=f"{resource_type}/{resource_name}",
reason=dry_run.message,
)
return False, f"Dry-run 失敗: {dry_run.message}"
# 重啟 Deployment
result = await executor.restart_deployment(
name=resource_name,
@@ -341,6 +390,21 @@ class FailureWatcherService(IFailureWatcher):
return False, f"❌ 重啟失敗: {result.message}"
elif resource_type == "pod":
# P0-2: Dry-run 驗證 Pod 存在
dry_run = await executor.validate_action(
operation_type=OperationType.DELETE_POD,
resource_name=resource_name,
namespace=namespace,
)
if not dry_run.passed:
logger.warning(
"auto_repair_dry_run_failed",
audit_log_id=audit_log_id,
resource=f"{resource_type}/{resource_name}",
reason=dry_run.message,
)
return False, f"Dry-run 失敗: {dry_run.message}"
# 刪除 Pod 觸發重建
result = await executor.delete_pod(
name=resource_name,