fix(api): Phase 18 P0 修復 - 全域熔斷 + Dry-run 驗證
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled
2026-03-31 首席架構師審查要求 (91/100 條件通過)

P0-1 修復: 全域自動修復熔斷 (ADR-040)
- 整合 check_global_repair_cooldown() 前置檢查
- 有狀態服務黑名單 (PostgreSQL/Redis/ClickHouse)
- 15 分鐘窗口 >5 次則凍結
- 成功修復後 record_global_repair_action()

P0-2 修復: Dry-run 驗證
- restart_deployment 前驗證 Deployment 存在
- delete_pod 前驗證 Pod 存在
- 驗證失敗立即返回,不執行危險操作

安全閉環: 全域熔斷 → 單資源冷卻 → Dry-run → 執行 → 記錄

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -165,7 +165,36 @@ class FailureWatcherService(IFailureWatcher):
|
||||
"analysis": analysis,
|
||||
}
|
||||
|
||||
# Phase 18.3: 安全檢查 - 防止修復風暴
|
||||
# =====================================================================
|
||||
# P0-1 修復: 全域自動修復熔斷檢查 (ADR-040)
|
||||
# 2026-03-31 首席架構師審查要求
|
||||
# =====================================================================
|
||||
from src.services.global_repair_cooldown import check_global_repair_cooldown
|
||||
|
||||
can_global_repair, global_reason = await check_global_repair_cooldown(
|
||||
incident_id=audit_log_id,
|
||||
affected_services=[target_resource],
|
||||
)
|
||||
|
||||
if not can_global_repair:
|
||||
logger.warning(
|
||||
"global_repair_cooldown_blocked",
|
||||
audit_log_id=audit_log_id,
|
||||
target_resource=target_resource,
|
||||
reason=global_reason,
|
||||
)
|
||||
# 強制升級為 CRITICAL,必須人工授權
|
||||
risk_level = "CRITICAL"
|
||||
result["risk_level"] = "CRITICAL"
|
||||
result["next_action"] = "blocked_by_global_cooldown"
|
||||
await self._request_human_approval(
|
||||
audit_log_id=audit_log_id,
|
||||
analysis=analysis,
|
||||
reason=global_reason,
|
||||
)
|
||||
return result
|
||||
|
||||
# Phase 18.3: 單資源安全檢查 - 防止修復風暴
|
||||
can_auto_repair = await self._check_repair_cooldown(
|
||||
target_resource=target_resource,
|
||||
namespace=failure_data.get("namespace", "awoooi"),
|
||||
@@ -202,6 +231,10 @@ class FailureWatcherService(IFailureWatcher):
|
||||
)
|
||||
|
||||
if success:
|
||||
# P0-1 補充: 記錄全域修復動作 (ADR-040)
|
||||
from src.services.global_repair_cooldown import record_global_repair_action
|
||||
await record_global_repair_action()
|
||||
|
||||
# 推送揭露通知 (自動修復成功)
|
||||
await self._push_repair_notification(
|
||||
audit_log_id=audit_log_id,
|
||||
@@ -316,14 +349,30 @@ class FailureWatcherService(IFailureWatcher):
|
||||
|
||||
# =====================================================================
|
||||
# Phase 18.3: 實際執行 K8s 修復操作
|
||||
# P0-2 修復: 加入 Dry-run 驗證 (首席架構師審查要求)
|
||||
# =====================================================================
|
||||
|
||||
if "restart" in repair_strategy.lower() and resource_name:
|
||||
from src.services.executor import get_executor
|
||||
from src.services.executor import OperationType, get_executor
|
||||
|
||||
executor = get_executor()
|
||||
|
||||
# P0-2: Dry-run 驗證資源存在
|
||||
if resource_type == "deployment":
|
||||
dry_run = await executor.validate_action(
|
||||
operation_type=OperationType.RESTART_DEPLOYMENT,
|
||||
resource_name=resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
if not dry_run.passed:
|
||||
logger.warning(
|
||||
"auto_repair_dry_run_failed",
|
||||
audit_log_id=audit_log_id,
|
||||
resource=f"{resource_type}/{resource_name}",
|
||||
reason=dry_run.message,
|
||||
)
|
||||
return False, f"Dry-run 失敗: {dry_run.message}"
|
||||
|
||||
# 重啟 Deployment
|
||||
result = await executor.restart_deployment(
|
||||
name=resource_name,
|
||||
@@ -341,6 +390,21 @@ class FailureWatcherService(IFailureWatcher):
|
||||
return False, f"❌ 重啟失敗: {result.message}"
|
||||
|
||||
elif resource_type == "pod":
|
||||
# P0-2: Dry-run 驗證 Pod 存在
|
||||
dry_run = await executor.validate_action(
|
||||
operation_type=OperationType.DELETE_POD,
|
||||
resource_name=resource_name,
|
||||
namespace=namespace,
|
||||
)
|
||||
if not dry_run.passed:
|
||||
logger.warning(
|
||||
"auto_repair_dry_run_failed",
|
||||
audit_log_id=audit_log_id,
|
||||
resource=f"{resource_type}/{resource_name}",
|
||||
reason=dry_run.message,
|
||||
)
|
||||
return False, f"Dry-run 失敗: {dry_run.message}"
|
||||
|
||||
# 刪除 Pod 觸發重建
|
||||
result = await executor.delete_pod(
|
||||
name=resource_name,
|
||||
|
||||
Reference in New Issue
Block a user