def _classify_ollama_endpoint_failure(
    provider_name: str,
    error: str | None,
) -> str:
    """Return a stable diagnosis code for UI/alert rendering.

    The free-text ``error`` is matched case-insensitively against known
    substrings. Checks run in a fixed order and the first match wins, so an
    error mentioning both a 502 and a timeout is reported as a proxy problem.

    Args:
        provider_name: Name of the Ollama provider being checked. Only used
            to distinguish the ``"ollama_local"`` proxy from other proxies
            when the failure is a 502 / Bad Gateway.
        error: Error text of the failed health check, or ``None``.

    Returns:
        One of ``"endpoint_cooldown"``, ``"local_proxy_upstream_unreachable"``,
        ``"proxy_upstream_unreachable"``, ``"endpoint_timeout"``,
        ``"endpoint_connection_refused"``, ``"endpoint_network_unreachable"``
        or, when nothing matches (including an empty/missing error),
        ``"endpoint_unreachable"``.
    """
    normalized_error = (error or "").lower()

    if "cooldown" in normalized_error:
        return "endpoint_cooldown"

    if "502" in normalized_error or "bad gateway" in normalized_error:
        if provider_name == "ollama_local":
            return "local_proxy_upstream_unreachable"
        return "proxy_upstream_unreachable"

    # Timeouts are not always spelled "timeout": socket/OS level messages are
    # typically phrased "timed out" (e.g. "Connection timed out"), which the
    # plain "timeout" substring would miss and misreport as generic
    # "endpoint_unreachable".
    timeout_markers = ("timeout", "timed out")
    if any(marker in normalized_error for marker in timeout_markers):
        return "endpoint_timeout"

    if "connection refused" in normalized_error:
        return "endpoint_connection_refused"

    network_markers = ("no route to host", "network is unreachable")
    if any(marker in normalized_error for marker in network_markers):
        return "endpoint_network_unreachable"

    return "endpoint_unreachable"
def test_ollama_failure_classifier_marks_network_unreachable() -> None:
    """A "No route to host" error maps to the network-unreachable code."""
    diagnosis = health._classify_ollama_endpoint_failure(
        "ollama_local",
        "No route to host",
    )
    assert diagnosis == "endpoint_network_unreachable"
/**
 * Build the short detail label shown next to a model's name.
 *
 * Precedence, first match wins:
 *  1. model is `up` and has a numeric latency -> rounded latency, e.g. `42ms`
 *  2. model is in cooldown -> cooldown label, with the remaining seconds when
 *     they round to a positive number
 *  3. a diagnosis code that has a dedicated label -> that label
 *  4. otherwise -> the translated role name
 *
 * @param model Model row to describe.
 * @param t Translator returned by `useTranslations`.
 * @returns The text to render for this model.
 */
function modelDetail(model: ModelInfo, t: ReturnType<typeof useTranslations>) {
  if (model.status === 'up' && typeof model.latencyMs === 'number') {
    return `${Math.round(model.latencyMs)}ms`
  }

  if (model.isCooldown) {
    // A missing or negative retry hint is clamped to 0 and shown without seconds.
    const seconds = Math.max(0, Math.round(model.retryAfterSeconds ?? 0))
    if (seconds > 0) {
      return t('aiModelHealth.cooldownSeconds', { seconds })
    }
    return t('aiModelHealth.cooldown')
  }

  switch (model.diagnosisCode) {
    case 'local_proxy_upstream_unreachable':
      return t('aiModelHealth.localProxy')
    case 'endpoint_timeout':
      return t('aiModelHealth.timeout')
    case 'endpoint_network_unreachable':
      return t('aiModelHealth.network')
    case 'endpoint_connection_refused':
      return t('aiModelHealth.refused')
    default:
      // Codes without a dedicated label fall back to the role name.
      return t(`aiModelRoles.${model.role}` as never)
  }
}
`${Math.round(m.latencyMs)}ms` - : t(`aiModelRoles.${m.role}` as never)} + {modelDetail(m, t)} ))} diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index fa6866d93..0270fdbdd 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51,6 +51,49 @@ http://127.0.0.1:3107/zh-TW/awooop/work-items?project_id=awoooi&incident_id=INC- - 整體 AI 自動化飛輪:約 73%;仍不能宣稱 24h 全自動 repair 閉環,需以 production evidence 持續補齊。 - 24h 完整 AI Agent 自動修復 production claim:0%;仍維持嚴格口徑,只能宣稱「已驗證的特定 controlled apply / drill-down 能被追蹤」。 +## 2026-05-31|Ollama health 診斷碼與前台冷卻顯示 + +**背景**: + +- 111 local fallback 復原時,production health 曾短暫顯示 `recent endpoint failure cooldown`;這是 API pod 內 60 秒短期抑制,不等於 endpoint 當下仍不可用。 +- 既有 `/api/v1/health` 只回 `status/latency/error`,前台與告警只能把 cooldown / 502 upstream / timeout 都看成同一種 down。 + +**本次調整**: + +- `apps/api/src/api/v1/health.py` 的 `ComponentHealth` 新增: + - `provider_name` + - `diagnosis_code` + - `retry_after_seconds` + - `cooldown_remaining_seconds` + - `is_cooldown` +- Ollama endpoint health 會穩定分類: + - `endpoint_reachable` + - `endpoint_cooldown` + - `local_proxy_upstream_unreachable` + - `proxy_upstream_unreachable` + - `endpoint_timeout` + - `endpoint_connection_refused` + - `endpoint_network_unreachable` + - `endpoint_unreachable` +- `AIModelStatus` 前台卡片讀取這些欄位,將 cooldown 顯示成 `冷卻 Ns`,111 proxy / timeout / network unreachable / refused 也顯示短標籤。 + +**驗證**: + +```text +python3 -m py_compile apps/api/src/api/v1/health.py apps/api/tests/test_health_ollama_provider_chain.py -> pass +pytest tests/test_health_ollama_provider_chain.py -q -> 7 passed +ruff check src/api/v1/health.py tests/test_health_ollama_provider_chain.py --select E9,F401,F821,F841 -> pass +python3 -m json.tool apps/web/messages/zh-TW.json / en.json -> pass +cmp apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass +pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-ai-model-health-diagnosis-20260531.tsbuildinfo -> pass +git diff --check -> pass +``` + +**判讀**: + +- 
這不是改路由策略,也不是新增修復動作;只是把健康檢查結果從「一段文字」升級成可被 Telegram / 前台穩定解讀的診斷欄位。 +- 後續若 111 再短暫抖動,前台應能看到「冷卻」或「111 proxy」而不是只看到泛化的 Ollama down。 + ## 2026-05-31|Ollama 111 local fallback 復原確認 **背景**: diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index a544dc93a..6fdb8e2e6 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -2727,6 +2727,12 @@ Phase 6 完成後 - Verification:`110 -> 192.168.0.111:11434 /api/tags -> 200`、`110 -> 127.0.0.1:11437 /api/tags -> 200`、production `/api/v1/health` cooldown expiry 後 `ollama_local=up`;`11437 hermes3:latest /api/generate -> 200 response=OK`,同輪 `11435/11436 hermes3:latest` 也皆成功。111 host 上 `com.momo.ollama111-allow-proxy` 為 running,allowlist 包含 `192.168.0.110/32`,`pmset sleep=0`。 - 判讀:11437 目前已恢復,三層 Ollama health 已全綠;中間的 `recent endpoint failure cooldown` 是 API pod 內 60 秒短期抑制殘留,不代表 endpoint 當下仍不可用。若再復發,告警需把 111 reachability / wake evidence 顯示出來,避免被誤讀成 GCP-A/B Ollama 全掛。 +**T154j Ollama health diagnosis codes surfaced(2026-05-31 台北)**: +- 觸發:T154i 暴露 health surface 只剩 `status/latency/error`,無法讓 Telegram / 前台穩定區分 cooldown、111 proxy upstream、timeout、network unreachable。 +- 修正:`ComponentHealth` 新增 `provider_name`、`diagnosis_code`、`retry_after_seconds`、`cooldown_remaining_seconds`、`is_cooldown`;Ollama endpoint health 分類 `endpoint_reachable`、`endpoint_cooldown`、`local_proxy_upstream_unreachable`、`endpoint_timeout`、`endpoint_network_unreachable` 等。`AIModelStatus` 讀取診斷欄位,前台顯示 `冷卻 Ns`、`111 proxy`、`逾時`、`網路不可達`、`拒絕`。 +- Verification:focused `py_compile` pass、pytest `test_health_ollama_provider_chain.py` 7 passed、ruff `E9/F401/F821/F841` pass、i18n JSON parse/cmp pass、web `tsc --noEmit` pass、`git diff --check` pass。 +- 判讀:T154j 不改 ADR-110 路由策略、不新增自動修復,只把 health truth 變成可機器判讀的欄位,避免未來把 cooldown 或 111 upstream 抖動誤報成「所有 Ollama 掛」。 + **T152 Ansible 
runtime readiness surfaced(2026-05-24 台北)**: - 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。 - 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。