diff --git a/apps/api/src/services/failure_watcher.py b/apps/api/src/services/failure_watcher.py index ad8786a8..c5426a36 100644 --- a/apps/api/src/services/failure_watcher.py +++ b/apps/api/src/services/failure_watcher.py @@ -165,7 +165,36 @@ class FailureWatcherService(IFailureWatcher): "analysis": analysis, } - # Phase 18.3: 安全檢查 - 防止修復風暴 + # ===================================================================== + # P0-1 修復: 全域自動修復熔斷檢查 (ADR-040) + # 2026-03-31 首席架構師審查要求 + # ===================================================================== + from src.services.global_repair_cooldown import check_global_repair_cooldown + + can_global_repair, global_reason = await check_global_repair_cooldown( + incident_id=audit_log_id, + affected_services=[target_resource], + ) + + if not can_global_repair: + logger.warning( + "global_repair_cooldown_blocked", + audit_log_id=audit_log_id, + target_resource=target_resource, + reason=global_reason, + ) + # 強制升級為 CRITICAL,必須人工授權 + risk_level = "CRITICAL" + result["risk_level"] = "CRITICAL" + result["next_action"] = "blocked_by_global_cooldown" + await self._request_human_approval( + audit_log_id=audit_log_id, + analysis=analysis, + reason=global_reason, + ) + return result + + # Phase 18.3: 單資源安全檢查 - 防止修復風暴 can_auto_repair = await self._check_repair_cooldown( target_resource=target_resource, namespace=failure_data.get("namespace", "awoooi"), @@ -202,6 +231,10 @@ class FailureWatcherService(IFailureWatcher): ) if success: + # P0-1 補充: 記錄全域修復動作 (ADR-040) + from src.services.global_repair_cooldown import record_global_repair_action + await record_global_repair_action() + # 推送揭露通知 (自動修復成功) await self._push_repair_notification( audit_log_id=audit_log_id, @@ -316,14 +349,30 @@ class FailureWatcherService(IFailureWatcher): # ===================================================================== # Phase 18.3: 實際執行 K8s 修復操作 + # P0-2 修復: 加入 Dry-run 驗證 (首席架構師審查要求) # ===================================================================== if "restart" in repair_strategy.lower() and resource_name: - from src.services.executor import get_executor + from src.services.executor import OperationType, get_executor executor = get_executor() + # P0-2: Dry-run 驗證資源存在 if resource_type == "deployment": + dry_run = await executor.validate_action( + operation_type=OperationType.RESTART_DEPLOYMENT, + resource_name=resource_name, + namespace=namespace, + ) + if not dry_run.passed: + logger.warning( + "auto_repair_dry_run_failed", + audit_log_id=audit_log_id, + resource=f"{resource_type}/{resource_name}", + reason=dry_run.message, + ) + return False, f"Dry-run 失敗: {dry_run.message}" + # 重啟 Deployment result = await executor.restart_deployment( name=resource_name, @@ -341,6 +390,21 @@ class FailureWatcherService(IFailureWatcher): return False, f"❌ 重啟失敗: {result.message}" elif resource_type == "pod": + # P0-2: Dry-run 驗證 Pod 存在 + dry_run = await executor.validate_action( + operation_type=OperationType.DELETE_POD, + resource_name=resource_name, + namespace=namespace, + ) + if not dry_run.passed: + logger.warning( + "auto_repair_dry_run_failed", + audit_log_id=audit_log_id, + resource=f"{resource_type}/{resource_name}", + reason=dry_run.message, + ) + return False, f"Dry-run 失敗: {dry_run.message}" + # 刪除 Pod 觸發重建 result = await executor.delete_pod( name=resource_name,