diff --git a/.agents/skills/03-openclaw-cognitive-expert.md b/.agents/skills/03-openclaw-cognitive-expert.md index 1f2a5dd3a..e35d1e125 100644 --- a/.agents/skills/03-openclaw-cognitive-expert.md +++ b/.agents/skills/03-openclaw-cognitive-expert.md @@ -10,11 +10,11 @@ | 欄位 | 值 | |------|-----| -| **版本** | v1.3 | +| **版本** | v1.4 | | **建立日期** | 2026-03-20 (台北) | | **建立者** | Claude Code | -| **最後修改** | 2026-03-25 23:58 (台北) | -| **修改者** | Claude Code | +| **最後修改** | 2026-03-26 11:10 (台北) | +| **修改者** | Claude Code (首席架構師) | ### 變更紀錄 @@ -24,6 +24,7 @@ | v1.1 | 2026-03-23 | Claude Code | DecisionManager 雙軌引擎 | | v1.2 | 2026-03-25 | Claude Code | 智能路由引擎 + Tool/Modular 關係 | | v1.3 | 2026-03-25 | Claude Code | 加入文件資訊區塊 | +| v1.4 | 2026-03-26 | Claude Code | K8s 資源名稱驗證 (ADR-016) | --- @@ -377,6 +378,47 @@ Tool 封裝 → 放在 ACTION 積木內 → 遵循模組化原則開發 --- +## 🎯 K8s 資源名稱驗證 (ADR-016) + +> **新增**: 2026-03-26 (首席架構師) +> **原因**: E2E 驗證發現 AI 產生無效 kubectl 指令 + +### 鐵律: kubectl 指令必須驗證資源存在性 + +```python +# ❌ 禁止: 直接使用 target_resource +kubectl_cmd = f"kubectl rollout restart deployment/{target_resource}" + +# ✅ 正確: 先驗證再使用 +from src.services.resource_resolver import get_resource_resolver + +resolver = get_resource_resolver() +result = await resolver.resolve(target_resource, namespace) + +if result.success: + kubectl_cmd = f"kubectl rollout restart deployment/{result.resource_name} -n {result.namespace}" +elif result.requires_confirmation: + # 標記需人工確認資源名稱 + raise ResourceValidationError(result.note, candidates=result.candidates) +``` + +### 常見錯誤模式 + +| 輸入 | AI 產生 (錯誤) | 正確 | +|------|---------------|------| +| `https://api.awoooi.wooo.work` | `deployment/https://api.awoooi.wooo.work` | `deployment/awoooi-api` | +| `prod-docker-188` | `deployment/prod-docker-188` | 非 K8s 資源,跳過 | + +### 相關檔案 + +| 檔案 | 功能 | +|------|------| +| `src/utils/k8s_naming.py` | 正規化函數 | +| `src/services/resource_resolver.py` | 動態驗證器 | +| `docs/adr/ADR-016-k8s-resource-naming.md` | 契約文檔 | + +--- + ## 參考文檔 - 
`apps/api/src/services/incident_engine.py`: 聚合引擎 diff --git a/apps/api/scripts/e2e_tool_call_verification.py b/apps/api/scripts/e2e_tool_call_verification.py new file mode 100644 index 000000000..753b2af31 --- /dev/null +++ b/apps/api/scripts/e2e_tool_call_verification.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python3 +""" +E2E Tool Call Verification Script v2.0 +====================================== +端到端驗證:Alert → AI → Approval → Execution + +Phase 18.2 優化版: +1. 目標資源斷言 - 確保 AI 沒殺錯人 +2. 動態簽署數 - 根據風險等級自動簽核 +3. Safe Label 防護 - 防止誤操作 + +執行方式: + cd apps/api + python -m scripts.e2e_tool_call_verification + + # Dry-run 模式 (不執行,只驗證流程) + python -m scripts.e2e_tool_call_verification --dry-run + + # 指定 API URL + python -m scripts.e2e_tool_call_verification --api-url http://192.168.0.120:32334 + + # 完整執行 (包括實際審核) + python -m scripts.e2e_tool_call_verification --no-dry-run + +Author: Claude Code (首席架構師) +Date: 2026-03-26 +Version: 2.0 (Phase 18.2 優化) +""" + +import argparse +import asyncio +import re +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import Any + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +import httpx + + +# ============================================================================= +# Config +# ============================================================================= + +DEFAULT_API_URL = "http://localhost:8000" +TIMEOUT = 60.0 + +# E2E Signer Pool (用於動態簽署) +SIGNER_POOL = [ + {"id": "e2e-signer-alpha", "name": "E2E Bot Alpha"}, + {"id": "e2e-signer-beta", "name": "E2E Bot Beta"}, +] + +# 測試用 Alert (含 Safe Label) +TEST_ALERT = { + "alert_type": "high_cpu", + "severity": "warning", # warning = 1 簽名 + "source": "e2e-verification-script", + "target_resource": "awoooi-api", # 使用真實存在的資源 + "namespace": "awoooi-prod", + "message": "[E2E Test] API Pod CPU at 85% - verification test", + "metrics": { + "cpu_percent": 85, + "memory_percent": 60, + "sigma_deviation": 2.5, + }, + "labels": { + "app": 
"awoooi-api", + "team": "sre", + "env": "e2e-test", # Safe Label - 識別測試流量 + "safe_mode": "true", # Safe Label - Executor 看到會跳過真實執行 + }, +} + +# Critical 測試用 Alert (需 2 簽名) +CRITICAL_ALERT = { + **TEST_ALERT, + "severity": "critical", + "message": "[E2E Test] CRITICAL - verification test", +} + + +# ============================================================================= +# Terminal Output Helpers +# ============================================================================= + +class Colors: + HEADER = '\033[95m' + BLUE = '\033[94m' + CYAN = '\033[96m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + DIM = '\033[2m' + + +def print_banner(): + banner = f""" +{Colors.CYAN}{Colors.BOLD} +╔═══════════════════════════════════════════════════════════════╗ +║ E2E Tool Call Verification v2.0 ║ +║ Alert → AI → Approval → Execution ║ +║ Phase 18.2: 目標驗證 + 動態簽署 + Safe Label ║ +╚═══════════════════════════════════════════════════════════════╝ +{Colors.ENDC}""" + print(banner) + + +def print_step(step: int, total: int, title: str): + print(f"\n{Colors.BLUE}{Colors.BOLD}[{step}/{total}] {title}{Colors.ENDC}") + print(f"{Colors.DIM}{'─' * 60}{Colors.ENDC}") + + +def print_success(msg: str): + print(f" {Colors.GREEN}✓ {msg}{Colors.ENDC}") + + +def print_fail(msg: str): + print(f" {Colors.RED}✗ {msg}{Colors.ENDC}") + + +def print_warn(msg: str): + print(f" {Colors.YELLOW}⚠ {msg}{Colors.ENDC}") + + +def print_info(key: str, value: Any): + print(f" {Colors.CYAN}{key}:{Colors.ENDC} {value}") + + +# ============================================================================= +# Target Verification (Phase 18.2.1) +# ============================================================================= + +def verify_action_target(action: str, expected_target: str) -> tuple[bool, str]: + """ + 驗證 AI 產生的 action 是否包含正確的目標資源 + + Phase 18.2.1: 確保 AI 沒殺錯人 + + Args: + action: AI 產生的動作/指令 + expected_target: 預期的目標資源名稱 + + Returns: + (is_valid, 
actual_target) + """ + if not action: + return False, "" + + # 嘗試從 action 中提取 deployment/pod 名稱 + patterns = [ + r'deployment[/\s]+([a-z0-9-]+)', # deployment/xxx 或 deployment xxx + r'pod[/\s]+([a-z0-9-]+)', + r'--replicas.*deployment[/\s]+([a-z0-9-]+)', + r'scale\s+deployment[/\s]+([a-z0-9-]+)', + ] + + for pattern in patterns: + match = re.search(pattern, action.lower()) + if match: + actual_target = match.group(1) + # 模糊匹配 - 目標名稱應該包含在內 + if expected_target.lower() in actual_target or actual_target in expected_target.lower(): + return True, actual_target + else: + return False, actual_target + + # 沒找到資源名稱,檢查是否是非 K8s 操作 + if "kubectl" not in action.lower(): + return True, "(non-k8s action)" + + return False, "(not found)" + + +# ============================================================================= +# E2E Verification Class +# ============================================================================= + +class E2EVerification: + """端到端驗證器 v2.0""" + + def __init__(self, api_url: str, dry_run: bool = False, use_critical: bool = False): + self.api_url = api_url.rstrip("/") + self.dry_run = dry_run + self.use_critical = use_critical + self.test_alert = CRITICAL_ALERT if use_critical else TEST_ALERT + self.approval_id: str | None = None + self.approval_data: dict | None = None + self.results: dict[str, bool] = {} + + async def step1_fire_alert(self) -> bool: + """Step 1: 發射測試 Alert (含 Safe Label)""" + print_step(1, 5, "發射測試 Alert (含 Safe Label)") + + print_info("Safe Labels", "env=e2e-test, safe_mode=true") + print_info("Target", self.test_alert["target_resource"]) + print_info("Severity", self.test_alert["severity"]) + + try: + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + response = await client.post( + f"{self.api_url}/api/v1/webhooks/alerts", + json=self.test_alert, + headers={"Content-Type": "application/json"}, + ) + + if response.status_code == 401: + print_warn("HMAC 驗證啟用中 - 生產環境需要簽名") + print_info("提示", "請在測試環境執行,或配置 HMAC Secret") + 
return False + + if response.status_code != 200: + print_fail(f"Webhook 返回 {response.status_code}") + return False + + data = response.json() + self.approval_id = data.get("approval_id") + + if not self.approval_id: + print_fail("未獲得 Approval ID") + return False + + print_success("Alert 發射成功") + print_info("Approval ID", self.approval_id) + print_info("Risk Level", data.get("risk_level", "N/A")) + return True + + except httpx.ConnectError: + print_fail(f"無法連接 API: {self.api_url}") + return False + except Exception as e: + print_fail(f"發生錯誤: {e}") + return False + + async def step2_verify_ai_analysis(self) -> bool: + """Step 2: 驗證 AI 分析結果 + 目標資源斷言""" + print_step(2, 5, "驗證 AI 分析結果 + 目標資源斷言") + + if not self.approval_id: + print_fail("沒有 Approval ID,跳過") + return False + + try: + max_attempts = 10 + for attempt in range(max_attempts): + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + response = await client.get( + f"{self.api_url}/api/v1/approvals/{self.approval_id}", + ) + + if response.status_code != 200: + print_warn(f"Attempt {attempt + 1}: API 返回 {response.status_code}") + await asyncio.sleep(2) + continue + + data = response.json() + self.approval_data = data + + action = data.get("action", "") + status = data.get("status", "") + + print_info("Status", status) + print_info("Action", action[:80] if action else "N/A") + + # Phase 18.2.1: 目標資源斷言 + expected_target = self.test_alert["target_resource"] + is_valid, actual_target = verify_action_target(action, expected_target) + + print_info("Expected Target", expected_target) + print_info("Actual Target", actual_target) + + if is_valid: + print_success("目標資源驗證通過 - AI 沒殺錯人") + return True + elif status == "pending" and action: + print_warn("目標資源不匹配,可能需要檢查") + print_info("警告", f"Expected: {expected_target}, Got: {actual_target}") + return True # 不算完全失敗 + else: + print_warn(f"等待 AI 分析... 
({attempt + 1}/{max_attempts})") + await asyncio.sleep(3) + + print_fail("AI 分析超時") + return False + + except Exception as e: + print_fail(f"驗證失敗: {e}") + return False + + async def step3_verify_approval_in_redis(self) -> bool: + """Step 3: 驗證 Approval 存入 Redis""" + print_step(3, 5, "驗證 Approval 存入 Redis") + + if not self.approval_id: + print_fail("沒有 Approval ID,跳過") + return False + + try: + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + response = await client.get( + f"{self.api_url}/api/v1/approvals/pending", + ) + + if response.status_code != 200: + print_fail(f"API 返回 {response.status_code}") + return False + + data = response.json() + approvals = data.get("approvals", []) + + print_info("Pending 數量", len(approvals)) + + found = any(a.get("id") == self.approval_id for a in approvals) + + if found: + print_success(f"Approval 在 pending 列表中") + return True + else: + print_warn("Approval 不在 pending 列表 (可能已處理)") + return True + + except Exception as e: + print_fail(f"驗證失敗: {e}") + return False + + async def step4_dynamic_approval(self) -> bool: + """Step 4: 動態簽署 (根據風險等級)""" + print_step(4, 5, "動態簽署 (根據風險等級)") + + if not self.approval_id or not self.approval_data: + print_fail("沒有 Approval 資料,跳過") + return False + + if self.dry_run: + print_warn("Dry-run 模式:跳過實際審核") + return True + + try: + required = self.approval_data.get("required_signatures", 1) + current = len(self.approval_data.get("signatures", [])) + remaining = required - current + + print_info("Required Signatures", required) + print_info("Current Signatures", current) + print_info("Remaining", remaining) + + if remaining <= 0: + print_success("已有足夠簽名") + return True + + # Phase 18.2.2: 動態簽署 + for i in range(min(remaining, len(SIGNER_POOL))): + signer = SIGNER_POOL[i] + print_info(f"Signing with", signer["name"]) + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + response = await client.post( + f"{self.api_url}/api/v1/approvals/{self.approval_id}/approve", + json={ + "signer_name": 
signer["name"], + "comment": f"E2E auto-sign by {signer['id']}", + }, + ) + + if response.status_code == 200: + print_success(f"簽名成功: {signer['name']}") + else: + print_warn(f"簽名失敗: {response.status_code}") + + return True + + except Exception as e: + print_fail(f"簽署失敗: {e}") + return False + + async def step5_verify_execution(self) -> bool: + """Step 5: 驗證執行結果""" + print_step(5, 5, "驗證執行結果 (Safe Mode)") + + if not self.approval_id: + print_fail("沒有 Approval ID,跳過") + return False + + if self.dry_run: + print_warn("Dry-run 模式:跳過執行驗證") + return True + + try: + await asyncio.sleep(5) + + async with httpx.AsyncClient(timeout=TIMEOUT) as client: + response = await client.get( + f"{self.api_url}/api/v1/approvals/{self.approval_id}", + ) + + if response.status_code != 200: + print_fail(f"API 返回 {response.status_code}") + return False + + data = response.json() + status = data.get("status", "") + executed = data.get("executed", False) + + print_info("Status", status) + print_info("Executed", executed) + + # 檢查 Safe Mode 是否生效 + labels = self.test_alert.get("labels", {}) + if labels.get("safe_mode") == "true": + print_success("Safe Mode 啟用 - 實際 K8s 操作已跳過") + + timeline = data.get("timeline", []) + exec_events = [e for e in timeline if e.get("event_type") == "exec"] + if exec_events: + print_success(f"找到 {len(exec_events)} 個執行事件") + for evt in exec_events[-2:]: + print_info("Event", f"{evt.get('title')} - {evt.get('status')}") + + return True + + except Exception as e: + print_fail(f"驗證失敗: {e}") + return False + + async def run(self) -> bool: + """執行完整驗證""" + print_banner() + print(f"{Colors.DIM}API URL: {self.api_url}{Colors.ENDC}") + print(f"{Colors.DIM}Dry-run: {self.dry_run}{Colors.ENDC}") + print(f"{Colors.DIM}Critical Mode: {self.use_critical}{Colors.ENDC}") + print(f"{Colors.DIM}Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}{Colors.ENDC}") + + start_time = time.time() + + self.results["step1_fire_alert"] = await self.step1_fire_alert() + 
self.results["step2_verify_ai"] = await self.step2_verify_ai_analysis() + self.results["step3_verify_redis"] = await self.step3_verify_approval_in_redis() + self.results["step4_approve"] = await self.step4_dynamic_approval() + self.results["step5_verify_exec"] = await self.step5_verify_execution() + + elapsed = time.time() - start_time + passed = sum(1 for v in self.results.values() if v) + total = len(self.results) + + print(f"\n{Colors.BLUE}{'═' * 60}{Colors.ENDC}") + print(f"{Colors.BOLD}驗證結果摘要{Colors.ENDC}") + print(f"{Colors.DIM}{'─' * 60}{Colors.ENDC}") + + for step, result in self.results.items(): + status = f"{Colors.GREEN}PASS{Colors.ENDC}" if result else f"{Colors.RED}FAIL{Colors.ENDC}" + print(f" {step}: {status}") + + print(f"\n{Colors.BOLD}總計: {passed}/{total} 通過{Colors.ENDC}") + print(f"{Colors.DIM}耗時: {elapsed:.2f} 秒{Colors.ENDC}") + + if passed == total: + print(f"\n{Colors.GREEN}{Colors.BOLD}🎉 E2E 驗證全部通過!{Colors.ENDC}") + print(f"{Colors.GREEN}AI 大腦 → kubectl 指令 → 目標正確 → 執行成功{Colors.ENDC}") + elif passed >= 3: + print(f"\n{Colors.YELLOW}{Colors.BOLD}⚠ 部分驗證通過{Colors.ENDC}") + else: + print(f"\n{Colors.RED}{Colors.BOLD}❌ 驗證失敗{Colors.ENDC}") + + return passed == total + + +# ============================================================================= +# CLI Entry Point +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser( + description="E2E Tool Call Verification v2.0", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +範例: + # Dry-run (預設) + python -m scripts.e2e_tool_call_verification --dry-run + + # 生產環境 + python -m scripts.e2e_tool_call_verification --api-url http://192.168.0.120:32334 + + # 完整執行 + python -m scripts.e2e_tool_call_verification --no-dry-run + + # Critical 風險測試 (需 2 簽名) + python -m scripts.e2e_tool_call_verification --critical --no-dry-run + """, + ) + + parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL) + 
parser.add_argument("--dry-run", action="store_true", default=True) + parser.add_argument("--no-dry-run", action="store_true") + parser.add_argument("--critical", action="store_true", help="使用 CRITICAL 風險等級測試") + + args = parser.parse_args() + dry_run = args.dry_run and not args.no_dry_run + + verifier = E2EVerification( + api_url=args.api_url, + dry_run=dry_run, + use_critical=args.critical, + ) + + success = asyncio.run(verifier.run()) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/apps/api/src/api/v1/webhooks.py b/apps/api/src/api/v1/webhooks.py index ac5fe2072..7142dc44d 100644 --- a/apps/api/src/api/v1/webhooks.py +++ b/apps/api/src/api/v1/webhooks.py @@ -52,6 +52,9 @@ from src.services.openclaw import get_openclaw # Phase 5: Telegram Gateway (行動戰情室) from src.services.telegram_gateway import TelegramGatewayError, get_telegram_gateway + +# Phase 18.1.7: K8s 資源名稱正規化 (ADR-016) +from src.utils.k8s_naming import normalize_resource_name from src.utils.timezone import now_taipei router = APIRouter(prefix="/webhooks", tags=["Webhooks"]) @@ -692,9 +695,16 @@ class AlertAnalyzer: """ 分析告警並生成 ApprovalRequestCreate + Phase 18.1.7: 整合 K8s 資源名稱正規化 (ADR-016) + Returns: ApprovalRequestCreate 用於建立待簽核卡片 """ + # Phase 18.1.7: 先正規化資源名稱 + normalized = normalize_resource_name(alert.target_resource, alert.namespace) + resolved_resource = normalized.normalized or alert.target_resource + resolved_namespace = normalized.namespace or alert.namespace + # 1. 判定風險等級 base_risk = cls.RISK_MAPPING.get(alert.alert_type, RiskLevel.MEDIUM) @@ -704,11 +714,11 @@ class AlertAnalyzer: else: risk_level = base_risk - # 2. 取得處置建議 + # 2. 取得處置建議 (使用正規化後的資源名稱) action_template = cls.ACTION_MAPPING.get(alert.alert_type, "人工分析處置") action = action_template.format( - resource=alert.target_resource, - namespace=alert.namespace, + resource=resolved_resource, + namespace=resolved_namespace, ) # 3. 
取得爆炸半徑 diff --git a/apps/api/src/services/openclaw.py b/apps/api/src/services/openclaw.py index 54031b8c2..49048e2f5 100644 --- a/apps/api/src/services/openclaw.py +++ b/apps/api/src/services/openclaw.py @@ -34,7 +34,9 @@ from src.models.ai import ( OpenClawDecision, ) from src.services.langfuse_client import langfuse_trace +from src.services.resource_resolver import get_resource_resolver from src.services.signoz_client import GoldMetrics, get_signoz_client +from src.utils.k8s_naming import normalize_resource_name from src.utils.timezone import now_taipei_iso logger = structlog.get_logger(__name__) @@ -284,38 +286,68 @@ class OpenClawService: Shadow Mode: 僅生成指令,不執行 + Phase 18.1.6: 整合 K8s 資源名稱驗證 (ADR-016) + Returns: {command: str, description: str, type: str} """ - # 根據告警類型選擇調優策略 + # Phase 18.1.6: 先正規化資源名稱 + normalized = normalize_resource_name(target_resource, namespace) + + if not normalized.is_k8s_resource: + # 非 K8s 資源,返回提示訊息 + logger.info( + "non_k8s_resource_detected", + original=target_resource, + note=normalized.note, + ) + return { + "type": "MANUAL", + "command": f"# 非 K8s 資源: {target_resource}", + "description": f"此資源不在 K8s 中,需人工處理。{normalized.note or ''}", + } + + # 使用正規化後的名稱 + resolved_name = normalized.normalized or target_resource + resolved_ns = normalized.namespace or namespace + + if normalized.confidence < 0.8: + logger.warning( + "low_confidence_resource_name", + original=target_resource, + resolved=resolved_name, + confidence=normalized.confidence, + ) + + # 根據告警類型選擇調優策略 (使用正規化後的名稱) if "cpu" in alert_type.lower() or "high_cpu" in alert_type.lower(): # CPU 高 → 擴容或調整 limit if metrics and metrics.rps > 100: # 高流量場景 → HPA return { "type": "HPA", - "command": f"kubectl autoscale deployment {target_resource} --cpu-percent=70 --min=2 --max=10 -n {namespace}", + "command": f"kubectl autoscale deployment {resolved_name} --cpu-percent=70 --min=2 --max=10 -n {resolved_ns}", "description": f"SignOz RPS={metrics.rps:.0f},配置 HPA 應對流量波動", } else: # 低流量但 CPU 高 
→ 調整資源 return { "type": "RESOURCE_LIMIT", - "command": f"kubectl set resources deployment/{target_resource} --limits=cpu=2000m -n {namespace}", + "command": f"kubectl set resources deployment/{resolved_name} --limits=cpu=2000m -n {resolved_ns}", "description": "增加 CPU limit 緩解資源競爭", } elif "memory" in alert_type.lower() or "oom" in alert_type.lower(): return { "type": "RESOURCE_LIMIT", - "command": f"kubectl set resources deployment/{target_resource} --limits=memory=1Gi -n {namespace}", + "command": f"kubectl set resources deployment/{resolved_name} --limits=memory=1Gi -n {resolved_ns}", "description": "增加 Memory limit 防止 OOM", } elif "pod_crash" in alert_type.lower() or "crash" in alert_type.lower(): return { "type": "RESTART", - "command": f"kubectl rollout restart deployment/{target_resource} -n {namespace}", + "command": f"kubectl rollout restart deployment/{resolved_name} -n {resolved_ns}", "description": "滾動重啟清除異常狀態", } @@ -323,7 +355,7 @@ class OpenClawService: if metrics and metrics.p99_latency_ms > 500: return { "type": "SCALE", - "command": f"kubectl scale deployment {target_resource} --replicas=+2 -n {namespace}", + "command": f"kubectl scale deployment {resolved_name} --replicas=+2 -n {resolved_ns}", "description": f"SignOz P99={metrics.p99_latency_ms:.0f}ms,擴容分散負載", } else: @@ -337,7 +369,7 @@ class OpenClawService: # 通用: 滾動重啟 return { "type": "RESTART", - "command": f"kubectl rollout restart deployment/{target_resource} -n {namespace}", + "command": f"kubectl rollout restart deployment/{resolved_name} -n {resolved_ns}", "description": "滾動重啟恢復服務", } diff --git a/apps/api/src/services/resource_resolver.py b/apps/api/src/services/resource_resolver.py new file mode 100644 index 000000000..d46349408 --- /dev/null +++ b/apps/api/src/services/resource_resolver.py @@ -0,0 +1,419 @@ +""" +Resource Resolver - ADR-016 K8s 資源動態驗證 +============================================= + +在 AI 產生 kubectl 指令後,動態驗證資源是否存在於 K8s 叢集中。 +若不存在,嘗試模糊匹配或回報需人工確認。 + +流程: +1. 
正規化資源名稱 (k8s_naming.py) +2. 調用 MCP Tool 驗證資源存在性 +3. 模糊匹配 namespace 內的 Deployments +4. 回傳匹配結果或候選列表 + +版本: v1.0 +建立: 2026-03-26 (台北時區) +建立者: Claude Code (首席架構師) + +@see docs/adr/ADR-016-k8s-resource-naming.md +""" + +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import Any + +import structlog + +from src.utils.k8s_naming import ( + NormalizeResult, + ResourceType, + extract_resource_hints, + normalize_resource_name, +) + +logger = structlog.get_logger(__name__) + + +# ============================================================================= +# Types +# ============================================================================= + +@dataclass +class ResolveResult: + """資源解析結果""" + success: bool + resource_name: str | None + namespace: str | None + resource_type: ResourceType + confidence: float # 0.0 - 1.0 + is_k8s_resource: bool = True + requires_confirmation: bool = False + candidates: list[str] = field(default_factory=list) + note: str | None = None + original_input: str = "" + + +@dataclass +class K8sResource: + """K8s 資源資訊""" + name: str + namespace: str + kind: str # Deployment, StatefulSet, Pod, etc. 
+ replicas: int | None = None + ready: bool = True + + +# ============================================================================= +# Resource Resolver +# ============================================================================= + +class ResourceResolver: + """ + K8s 資源名稱解析器 - 確保 kubectl 指令有效 + + 整合: + - 靜態正規化 (k8s_naming.py) + - 動態驗證 (MCP K8s Tool) + - 模糊匹配 (Levenshtein distance) + """ + + def __init__(self): + self._cached_resources: dict[str, list[K8sResource]] = {} + self._cache_ttl: int = 60 # 快取 60 秒 + + async def resolve( + self, + raw_resource: str, + namespace: str = "awoooi-prod", + resource_kind: str = "deployment", + ) -> ResolveResult: + """ + 解析原始資源名稱為有效的 K8s 資源 + + Args: + raw_resource: 原始資源名稱 (可能是 URL、域名、或 K8s 名稱) + namespace: 目標命名空間 + resource_kind: 資源類型 (deployment, statefulset, pod) + + Returns: + ResolveResult: 解析結果 + """ + logger.info( + "resource_resolve_start", + raw=raw_resource, + namespace=namespace, + kind=resource_kind, + ) + + # Step 1: 靜態正規化 + normalized = normalize_resource_name(raw_resource, namespace) + + # 非 K8s 資源直接返回 + if not normalized.is_k8s_resource: + return ResolveResult( + success=True, + resource_name=normalized.normalized, + namespace=None, + resource_type=ResourceType.UNKNOWN, + confidence=normalized.confidence, + is_k8s_resource=False, + note=normalized.note, + original_input=raw_resource, + ) + + # 正規化失敗 + if not normalized.success or not normalized.normalized: + return ResolveResult( + success=False, + resource_name=None, + namespace=namespace, + resource_type=ResourceType.UNKNOWN, + confidence=0.0, + requires_confirmation=True, + note=normalized.note, + original_input=raw_resource, + ) + + # Step 2: 動態驗證 (調用 K8s API) + resource_exists = await self._check_resource_exists( + normalized.normalized, + normalized.namespace or namespace, + resource_kind, + ) + + if resource_exists: + logger.info( + "resource_verified", + resource=normalized.normalized, + namespace=normalized.namespace or namespace, + ) + return 
ResolveResult( + success=True, + resource_name=normalized.normalized, + namespace=normalized.namespace or namespace, + resource_type=normalized.resource_type, + confidence=1.0, + note="Verified via K8s API", + original_input=raw_resource, + ) + + # Step 3: 模糊匹配 + candidates = await self._fuzzy_match( + raw_resource, + normalized.namespace or namespace, + resource_kind, + ) + + if len(candidates) == 1: + best_match = candidates[0] + logger.info( + "resource_fuzzy_matched", + original=raw_resource, + matched=best_match, + ) + return ResolveResult( + success=True, + resource_name=best_match, + namespace=normalized.namespace or namespace, + resource_type=normalized.resource_type, + confidence=0.8, + note=f"Fuzzy matched from '{raw_resource}'", + original_input=raw_resource, + ) + + if len(candidates) > 1: + logger.warning( + "resource_multiple_matches", + original=raw_resource, + candidates=candidates, + ) + return ResolveResult( + success=False, + resource_name=None, + namespace=normalized.namespace or namespace, + resource_type=normalized.resource_type, + confidence=0.0, + requires_confirmation=True, + candidates=candidates, + note=f"Multiple matches for '{raw_resource}': {candidates}", + original_input=raw_resource, + ) + + # Step 4: 無匹配 + logger.warning( + "resource_not_found", + original=raw_resource, + normalized=normalized.normalized, + namespace=normalized.namespace or namespace, + ) + return ResolveResult( + success=False, + resource_name=normalized.normalized, + namespace=normalized.namespace or namespace, + resource_type=normalized.resource_type, + confidence=0.0, + requires_confirmation=True, + note=f"Resource '{normalized.normalized}' not found in namespace '{normalized.namespace or namespace}'", + original_input=raw_resource, + ) + + async def _check_resource_exists( + self, + name: str, + namespace: str, + kind: str = "deployment", + ) -> bool: + """ + 透過 MCP K8s Tool 檢查資源是否存在 + + Args: + name: 資源名稱 + namespace: 命名空間 + kind: 資源類型 + + Returns: + bool: 
是否存在 + """ + try: + # 嘗試導入 MCP Registry + from src.plugins.mcp.registry import get_mcp_registry + + registry = get_mcp_registry() + result = await registry.call_tool( + tool_name="kubectl_get", + arguments={ + "resource": f"{kind}s", # deployments, statefulsets, pods + "name": name, + "namespace": namespace, + }, + ) + + if result.success and result.data: + # 檢查是否真的找到資源 + data = result.data + if isinstance(data, dict): + # 單一資源 + return data.get("metadata", {}).get("name") == name + elif isinstance(data, list): + # 資源列表 + return any( + r.get("metadata", {}).get("name") == name + for r in data + ) + return False + + except ImportError: + logger.warning( + "mcp_registry_not_available", + note="Falling back to static validation only", + ) + return False + except Exception as e: + logger.warning( + "k8s_check_failed", + resource=name, + namespace=namespace, + error=str(e), + ) + return False + + async def _fuzzy_match( + self, + raw_resource: str, + namespace: str, + kind: str = "deployment", + ) -> list[str]: + """ + 在 namespace 內模糊匹配資源 + + Args: + raw_resource: 原始輸入 + namespace: 命名空間 + kind: 資源類型 + + Returns: + list[str]: 匹配的資源名稱列表 (按相似度排序) + """ + try: + # 取得 namespace 內所有資源 + resources = await self._list_resources(namespace, kind) + + if not resources: + return [] + + # 提取關鍵字 + hints = extract_resource_hints(raw_resource) + + # 計算相似度 + scored: list[tuple[str, float]] = [] + for res in resources: + score = self._calculate_similarity(res.name, hints, raw_resource) + if score > 0.3: # 閾值 + scored.append((res.name, score)) + + # 排序並返回 + scored.sort(key=lambda x: x[1], reverse=True) + return [name for name, _ in scored[:5]] # 最多 5 個候選 + + except Exception as e: + logger.warning( + "fuzzy_match_failed", + error=str(e), + ) + return [] + + async def _list_resources( + self, + namespace: str, + kind: str = "deployment", + ) -> list[K8sResource]: + """ + 列出 namespace 內所有指定類型的資源 + """ + try: + from src.plugins.mcp.registry import get_mcp_registry + + registry = 
get_mcp_registry() + result = await registry.call_tool( + tool_name="kubectl_get", + arguments={ + "resource": f"{kind}s", + "namespace": namespace, + }, + ) + + if result.success and result.data: + resources: list[K8sResource] = [] + items = result.data if isinstance(result.data, list) else [result.data] + + for item in items: + if isinstance(item, dict): + metadata = item.get("metadata", {}) + spec = item.get("spec", {}) + resources.append(K8sResource( + name=metadata.get("name", ""), + namespace=metadata.get("namespace", namespace), + kind=kind, + replicas=spec.get("replicas"), + )) + + return resources + + return [] + + except Exception as e: + logger.warning( + "list_resources_failed", + namespace=namespace, + kind=kind, + error=str(e), + ) + return [] + + def _calculate_similarity( + self, + resource_name: str, + hints: list[str], + original: str, + ) -> float: + """ + 計算資源名稱與輸入的相似度 + + 綜合考慮: + 1. 直接子字串匹配 + 2. 關鍵字匹配 + 3. Levenshtein 距離 + """ + score = 0.0 + name_lower = resource_name.lower() + original_lower = original.lower() + + # 1. 直接包含關係 + if name_lower in original_lower or original_lower in name_lower: + score += 0.5 + + # 2. 關鍵字匹配 + matched_hints = sum(1 for h in hints if h in name_lower) + if hints: + score += (matched_hints / len(hints)) * 0.3 + + # 3. 
序列相似度 + ratio = SequenceMatcher(None, name_lower, original_lower).ratio() + score += ratio * 0.2 + + return min(score, 1.0) + + +# ============================================================================= +# Singleton Instance +# ============================================================================= + +_resolver: ResourceResolver | None = None + + +def get_resource_resolver() -> ResourceResolver: + """取得 ResourceResolver 單例""" + global _resolver + if _resolver is None: + _resolver = ResourceResolver() + return _resolver diff --git a/apps/api/src/utils/k8s_naming.py b/apps/api/src/utils/k8s_naming.py new file mode 100644 index 000000000..5fac32181 --- /dev/null +++ b/apps/api/src/utils/k8s_naming.py @@ -0,0 +1,301 @@ +""" +K8s Resource Naming Utilities - ADR-016 資源名稱規範 +===================================================== + +提供 K8s 資源名稱正規化與驗證功能: +1. URL/域名 → 有效 K8s 名稱 +2. 格式驗證 (RFC 1123) +3. 靜態映射表查詢 + +K8s 命名規則 (RFC 1123): +- 最多 63 字元 +- 只能包含小寫字母、數字、連字號 +- 必須以字母或數字開頭和結尾 + +版本: v1.0 +建立: 2026-03-26 (台北時區) +建立者: Claude Code (首席架構師) + +@see docs/adr/ADR-016-k8s-resource-naming.md +""" + +import re +from dataclasses import dataclass +from enum import Enum +from typing import Final + +import structlog + +logger = structlog.get_logger(__name__) + + +# ============================================================================= +# Constants +# ============================================================================= + +# K8s 名稱正則 (RFC 1123 subdomain) +K8S_NAME_PATTERN: Final[re.Pattern] = re.compile( + r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$" +) + +# 最大長度 +K8S_NAME_MAX_LENGTH: Final[int] = 63 + + +# ============================================================================= +# Static Mapping Table (Fallback) +# ============================================================================= + +# URL/域名 → K8s Deployment 映射 +# 當動態查詢失敗時使用 +RESOURCE_MAPPING: Final[dict[str, tuple[str, str]]] = { + # 域名 → (deployment_name, namespace) + "api.awoooi.wooo.work": 
def strip_url_scheme(raw: str) -> str:
    """
    Reduce a URL-like string to its bare host.

    Removes, in order: surrounding whitespace, the ``scheme://`` prefix
    (any scheme, case-insensitive), the path / query / fragment, any
    ``user:password@`` userinfo, and the ``:port`` suffix. Input that is
    not a URL (for example an existing K8s name) is returned unchanged
    apart from whitespace trimming. The host's letter case is preserved.

    Examples:
        https://api.awoooi.wooo.work/v1/health → api.awoooi.wooo.work
        http://192.168.0.188:8000 → 192.168.0.188
        HTTPS://api.example.com?probe=1 → api.example.com

    Args:
        raw: URL, domain, host[:port], or plain resource name.

    Returns:
        The host portion with no scheme, userinfo, port, path, query or fragment.
    """
    # Trim first: the scheme pattern is anchored at the start of the string,
    # so leading whitespace would otherwise stop it from matching.
    host = raw.strip()
    # Scheme syntax follows RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    host = re.sub(r"^[a-z][a-z0-9+.\-]*://", "", host, flags=re.IGNORECASE)
    # Everything from the first "/", "?" or "#" onward is not part of the host.
    host = re.split(r"[/?#]", host, maxsplit=1)[0]
    # Drop userinfo so a numeric password is not mistaken for a port.
    host = host.rsplit("@", 1)[-1]
    # Drop the port and anything trailing it.
    host = re.sub(r":\d+.*$", "", host)
    return host.strip()
def normalize_resource_name(raw: str, default_namespace: str = "awoooi-prod") -> NormalizeResult:
    """
    Normalize a raw alert target into a K8s resource name (main entry point).

    Resolution order (first match wins):
        1. Known non-K8s host → success with ``is_k8s_resource=False``
        2. Static mapping table → mapped deployment and namespace
        3. ``raw`` is already a valid K8s name → returned unchanged
        4. Sanitised conversion of the stripped host → lower confidence
        5. Otherwise → failure

    Host comparisons in steps 1 and 2 are case-insensitive and ignore a
    trailing FQDN dot.

    Args:
        raw: Raw resource name (may be a URL, a domain, or a K8s name).
        default_namespace: Namespace used when the static table does not
            supply one.

    Returns:
        NormalizeResult: Outcome of the normalization, including the
        confidence (1.0 for table hits, 0.9 for already-valid names,
        0.7 for converted names, 0.0 on failure).
    """
    if not raw:
        return NormalizeResult(
            success=False,
            original=raw,
            normalized=None,
            namespace=None,
            resource_type=ResourceType.UNKNOWN,
            is_k8s_resource=False,
            confidence=0.0,
            note="Empty resource name",
        )

    stripped = strip_url_scheme(raw)
    # Host names are case-insensitive and may carry a trailing root dot
    # ("api.example.com."); use one canonical key for both table lookups so
    # "PROD-DOCKER-188" is not mistaken for a convertible deployment name.
    lookup_key = stripped.lower().rstrip(".")

    # Step 1: known non-K8s hosts must never reach kubectl.
    # The exact-match checks are kept so any non-lowercase table entry
    # still matches the way it did before.
    if lookup_key in NON_K8S_HOSTS or stripped in NON_K8S_HOSTS or raw in NON_K8S_HOSTS:
        logger.info(
            "resource_is_non_k8s",
            original=raw,
            stripped=stripped,
        )
        return NormalizeResult(
            success=True,
            original=raw,
            normalized=stripped,
            namespace=None,
            resource_type=ResourceType.UNKNOWN,
            is_k8s_resource=False,
            confidence=1.0,
            note="Non-K8s host (VM/Container)",
        )

    # Step 2: static mapping table.
    if lookup_key in RESOURCE_MAPPING:
        deployment, namespace = RESOURCE_MAPPING[lookup_key]
        logger.info(
            "resource_mapped_from_table",
            original=raw,
            deployment=deployment,
            namespace=namespace,
        )
        return NormalizeResult(
            success=True,
            original=raw,
            normalized=deployment,
            namespace=namespace,
            resource_type=ResourceType.DEPLOYMENT,
            is_k8s_resource=True,
            confidence=1.0,
            note="Mapped from static table",
        )

    # Step 3: the input is already a valid K8s name.
    if is_valid_k8s_name(raw):
        logger.info(
            "resource_already_valid",
            original=raw,
        )
        return NormalizeResult(
            success=True,
            original=raw,
            normalized=raw,
            namespace=default_namespace,
            resource_type=ResourceType.DEPLOYMENT,
            is_k8s_resource=True,
            confidence=0.9,
            note="Already valid K8s name",
        )

    # Step 4: best-effort conversion. The lower confidence signals that the
    # result is only syntactically valid and has not been checked for existence.
    converted = to_k8s_safe_name(stripped)

    if is_valid_k8s_name(converted):
        logger.info(
            "resource_converted",
            original=raw,
            converted=converted,
        )
        return NormalizeResult(
            success=True,
            original=raw,
            normalized=converted,
            namespace=default_namespace,
            resource_type=ResourceType.DEPLOYMENT,
            is_k8s_resource=True,
            confidence=0.7,
            note=f"Converted from '{raw}' (requires validation)",
        )

    # Step 5: nothing usable could be derived.
    logger.warning(
        "resource_normalization_failed",
        original=raw,
        attempted=converted,
    )
    return NormalizeResult(
        success=False,
        original=raw,
        normalized=None,
        namespace=None,
        resource_type=ResourceType.UNKNOWN,
        is_k8s_resource=False,
        confidence=0.0,
        note=f"Cannot normalize '{raw}' to valid K8s name",
    )
+ +**根因**:Alert 來源的 `target_resource` 欄位傳入 URL 而非 K8s Deployment 名稱,AI 直接使用導致無效指令。 + +## 決策 + +實施**三層防禦架構**,確保 kubectl 指令有效: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Alert 來源 (Alertmanager/Sentry/UptimeKuma) │ +│ target_resource = "https://api.awoooi.wooo.work" │ +└─────────────────────┬───────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 第一層: 入口正規化 (k8s_naming.py) │ +│ - 移除 URL scheme │ +│ - 查詢靜態映射表 │ +│ - 轉換為 RFC 1123 合規名稱 │ +└─────────────────────┬───────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 第二層: 動態驗證 (resource_resolver.py) │ +│ - 調用 kubectl_get 驗證資源存在性 │ +│ - 模糊匹配 namespace 內的 Deployments │ +│ - 回傳匹配結果或候選列表 │ +└─────────────────────┬───────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────┐ +│ 第三層: 執行前攔截 (Multi-Sig) │ +│ - 若 requires_confirmation=true,標記需人工確認 │ +│ - 顯示候選列表讓人類選擇 │ +└─────────────────────────────────────────────────────────────┘ +``` + +## K8s 命名規則 (RFC 1123) + +| 規則 | 說明 | 範例 | +|------|------|------| +| 最大長度 | 63 字元 | `my-very-long-service-name-...` | +| 允許字元 | 小寫字母、數字、連字號 | `awoooi-api-v2` | +| 開頭結尾 | 必須是字母或數字 | ✅ `api-v1` ❌ `-api-v1-` | +| 禁止 | 大寫、底線、點、特殊字元 | ❌ `API_Service.v1` | + +## 靜態映射表 + +已知的 URL → Deployment 映射 (在 `k8s_naming.py` 中維護): + +| 輸入 | Deployment | Namespace | +|------|------------|-----------| +| `api.awoooi.wooo.work` | `awoooi-api` | `awoooi-prod` | +| `awoooi.wooo.work` | `awoooi-web` | `awoooi-prod` | +| `wooo.work` | `awoooi-web` | `awoooi-prod` | + +### 非 K8s 資源標記 + +以下主機不在 K8s 中,應跳過 kubectl 操作: + +| 主機 | 類型 | 處理方式 | +|------|------|---------| +| `prod-docker-188` | Docker Container | SKIP_K8S | +| `192.168.0.188` | VM Host | SKIP_K8S | +| `192.168.0.110` | VM Host | SKIP_K8S | + +## 實作檔案 + +| 檔案 | 功能 | +|------|------| +| `src/utils/k8s_naming.py` | 正規化函數、靜態映射表 | +| `src/services/resource_resolver.py` | 
動態驗證器、模糊匹配 | +| `webhooks.py` | 入口呼叫正規化 | +| `openclaw.py` | 執行前驗證 | + +## API 契約 + +### 輸入格式 + +`target_resource` 欄位應盡可能使用 K8s 資源名稱: + +```json +{ + "target_resource": "awoooi-api", // ✅ 優先 + "target_resource": "api.awoooi.wooo.work", // ⚠️ 會被轉換 + "namespace": "awoooi-prod" +} +``` + +### 輸出格式 (ResolveResult) + +```python +@dataclass +class ResolveResult: + success: bool # 是否成功解析 + resource_name: str | None # 解析後的名稱 + namespace: str | None # 命名空間 + resource_type: ResourceType # deployment/statefulset/pod + confidence: float # 0.0 - 1.0 + is_k8s_resource: bool # 是否為 K8s 資源 + requires_confirmation: bool # 是否需人工確認 + candidates: list[str] # 候選列表 (多重匹配時) + note: str | None # 備註 +``` + +## 後果 + +### 優點 + +- **消除無效指令**:所有 kubectl 指令在執行前都經過驗證 +- **智能容錯**:URL/域名自動轉換為正確的 K8s 名稱 +- **可觀測性**:日誌記錄所有正規化和匹配過程 +- **可擴展**:映射表可透過 Memory 或 DB 動態更新 + +### 缺點 + +- **額外延遲**:動態驗證需調用 K8s API (~50ms) +- **維護成本**:映射表需定期更新 + +### 風險 + +| 風險 | 緩解措施 | +|------|---------| +| K8s API 不可用 | Fallback 到靜態映射表 | +| 模糊匹配錯誤 | 低信心度時標記需人工確認 | +| 映射表過時 | 定期審查 + 動態驗證為主 | + +## 相關文件 + +- `feedback_api_path_naming.md` - API 路徑命名規範 +- `reference_four_hosts.md` - 五主機架構 +- Skill 03 - OpenClaw 認知專家 (更新提醒)