fix(health): surface ollama endpoint diagnosis
This commit is contained in:
@@ -47,6 +47,11 @@ class ComponentHealth(BaseModel):
|
||||
status: Literal["up", "down", "degraded"]
|
||||
latency_ms: float | None = None
|
||||
error: str | None = None
|
||||
provider_name: str | None = None
|
||||
diagnosis_code: str | None = None
|
||||
retry_after_seconds: float | None = None
|
||||
cooldown_remaining_seconds: float | None = None
|
||||
is_cooldown: bool = False
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
@@ -194,16 +199,47 @@ async def _ollama_endpoint_health_check(name: str, url: str) -> ComponentHealth:
|
||||
return ComponentHealth(
|
||||
status="down",
|
||||
error=f"recent endpoint failure cooldown: {cooldown_remaining:.0f}s",
|
||||
provider_name=name,
|
||||
diagnosis_code="endpoint_cooldown",
|
||||
retry_after_seconds=round(cooldown_remaining, 1),
|
||||
cooldown_remaining_seconds=round(cooldown_remaining, 1),
|
||||
is_cooldown=True,
|
||||
)
|
||||
|
||||
result = await _http_health_check(name, url, "/api/tags")
|
||||
result.provider_name = name
|
||||
if result.status == "up":
|
||||
result.diagnosis_code = "endpoint_reachable"
|
||||
record_ollama_endpoint_success(url)
|
||||
else:
|
||||
result.diagnosis_code = _classify_ollama_endpoint_failure(name, result.error)
|
||||
record_ollama_endpoint_failure(url)
|
||||
return result
|
||||
|
||||
|
||||
def _classify_ollama_endpoint_failure(
|
||||
provider_name: str,
|
||||
error: str | None,
|
||||
) -> str:
|
||||
"""Return a stable diagnosis code for UI/alert rendering."""
|
||||
normalized_error = (error or "").lower()
|
||||
if "cooldown" in normalized_error:
|
||||
return "endpoint_cooldown"
|
||||
if "502" in normalized_error or "bad gateway" in normalized_error:
|
||||
return (
|
||||
"local_proxy_upstream_unreachable"
|
||||
if provider_name == "ollama_local"
|
||||
else "proxy_upstream_unreachable"
|
||||
)
|
||||
if "timeout" in normalized_error:
|
||||
return "endpoint_timeout"
|
||||
if "connection refused" in normalized_error:
|
||||
return "endpoint_connection_refused"
|
||||
if "no route to host" in normalized_error or "network is unreachable" in normalized_error:
|
||||
return "endpoint_network_unreachable"
|
||||
return "endpoint_unreachable"
|
||||
|
||||
|
||||
async def check_openclaw() -> ComponentHealth:
    """Run the shared HTTP health probe against OpenClaw's ``/health`` path."""
    openclaw_health = await _http_health_check(
        "openclaw",
        settings.OPENCLAW_URL,
        "/health",
    )
    return openclaw_health
|
||||
|
||||
@@ -43,7 +43,10 @@ async def test_ollama_provider_chain_reports_fallback_when_primary_down(monkeypa
|
||||
assert aggregate.latency_ms == 42.0
|
||||
assert aggregate.error == "primary unavailable; fallback active: ollama_gcp_b"
|
||||
assert details["ollama_gcp_a"].status == "down"
|
||||
assert details["ollama_gcp_a"].provider_name == "ollama_gcp_a"
|
||||
assert details["ollama_gcp_a"].diagnosis_code == "endpoint_timeout"
|
||||
assert details["ollama_gcp_b"].status == "up"
|
||||
assert details["ollama_gcp_b"].diagnosis_code == "endpoint_reachable"
|
||||
assert details["ollama_local"].status == "up"
|
||||
|
||||
|
||||
@@ -108,6 +111,9 @@ async def test_ollama_provider_chain_uses_cooldown_after_failure(
|
||||
assert aggregate.status == "degraded"
|
||||
assert details["ollama_gcp_a"].status == "down"
|
||||
assert "cooldown" in (details["ollama_gcp_a"].error or "")
|
||||
assert details["ollama_gcp_a"].diagnosis_code == "endpoint_cooldown"
|
||||
assert details["ollama_gcp_a"].is_cooldown is True
|
||||
assert details["ollama_gcp_a"].retry_after_seconds is not None
|
||||
assert "ollama_gcp_a" not in calls
|
||||
assert {"ollama_gcp_b", "ollama_local"} == set(calls)
|
||||
|
||||
@@ -132,3 +138,23 @@ async def test_ollama_provider_chain_success_clears_cooldown(
|
||||
await health.check_ollama_provider_chain()
|
||||
|
||||
assert not is_ollama_endpoint_blocked("http://gcp-a:11434")
|
||||
|
||||
|
||||
def test_ollama_failure_classifier_marks_local_proxy_502() -> None:
    """A 502 error text on ``ollama_local`` yields the local-proxy code."""
    diagnosis = health._classify_ollama_endpoint_failure(
        "ollama_local",
        "Server error '502 Bad Gateway' for url",
    )
    assert diagnosis == "local_proxy_upstream_unreachable"
|
||||
|
||||
|
||||
def test_ollama_failure_classifier_marks_network_unreachable() -> None:
    """A "No route to host" error text yields the network-unreachable code."""
    diagnosis = health._classify_ollama_endpoint_failure(
        "ollama_local",
        "No route to host",
    )
    assert diagnosis == "endpoint_network_unreachable"
|
||||
|
||||
@@ -230,6 +230,14 @@
|
||||
"agent": "Agent",
|
||||
"provider": "Provider"
|
||||
},
|
||||
"aiModelHealth": {
|
||||
"cooldown": "冷卻",
|
||||
"cooldownSeconds": "冷卻 {seconds}s",
|
||||
"localProxy": "111 proxy",
|
||||
"timeout": "逾時",
|
||||
"network": "網路不可達",
|
||||
"refused": "拒絕"
|
||||
},
|
||||
"loading": "載入中...",
|
||||
"trendUp": "↑{pct}%",
|
||||
"searchPlaceholderShort": "搜尋...",
|
||||
|
||||
@@ -230,6 +230,14 @@
|
||||
"agent": "Agent",
|
||||
"provider": "Provider"
|
||||
},
|
||||
"aiModelHealth": {
|
||||
"cooldown": "冷卻",
|
||||
"cooldownSeconds": "冷卻 {seconds}s",
|
||||
"localProxy": "111 proxy",
|
||||
"timeout": "逾時",
|
||||
"network": "網路不可達",
|
||||
"refused": "拒絕"
|
||||
},
|
||||
"loading": "載入中...",
|
||||
"trendUp": "↑{pct}%",
|
||||
"searchPlaceholderShort": "搜尋...",
|
||||
|
||||
@@ -16,11 +16,18 @@ interface ModelInfo {
|
||||
role: 'primary' | 'backup' | 'local' | 'agent' | 'provider'
|
||||
status: 'up' | 'down' | 'degraded' | 'unknown'
|
||||
latencyMs?: number | null
|
||||
diagnosisCode?: string | null
|
||||
retryAfterSeconds?: number | null
|
||||
isCooldown?: boolean
|
||||
}
|
||||
|
||||
// One entry of the health payload's `components` map, in the API's
// snake_case field names. Every member is optional.
interface HealthComponent {
  // Basic probe result.
  status?: 'up' | 'down' | 'degraded'
  latency_ms?: number | null
  error?: string | null
  // Structured diagnosis fields.
  diagnosis_code?: string | null
  is_cooldown?: boolean
  retry_after_seconds?: number | null
}
|
||||
|
||||
interface HealthResponse {
|
||||
@@ -49,6 +56,31 @@ function statusColor(status: ModelInfo['status']) {
|
||||
return '#87867f'
|
||||
}
|
||||
|
||||
/**
 * Pick the short right-hand label for one model row.
 *
 * Precedence: latency (only while the model is up), then the cooldown label,
 * then a label for a recognised diagnosis code, and finally the role label.
 */
function modelDetail(model: ModelInfo, t: ReturnType<typeof useTranslations>) {
  const { status, latencyMs, isCooldown, retryAfterSeconds, diagnosisCode, role } = model

  if (status === 'up' && typeof latencyMs === 'number') {
    return `${Math.round(latencyMs)}ms`
  }

  if (isCooldown) {
    // A missing or non-positive retry hint falls back to the plain cooldown label.
    const seconds = Math.max(0, Math.round(retryAfterSeconds ?? 0))
    if (seconds > 0) {
      return t('aiModelHealth.cooldownSeconds', { seconds })
    }
    return t('aiModelHealth.cooldown')
  }

  switch (diagnosisCode) {
    case 'local_proxy_upstream_unreachable':
      return t('aiModelHealth.localProxy')
    case 'endpoint_timeout':
      return t('aiModelHealth.timeout')
    case 'endpoint_network_unreachable':
      return t('aiModelHealth.network')
    case 'endpoint_connection_refused':
      return t('aiModelHealth.refused')
    default:
      // Any other (or absent) diagnosis code shows the role label.
      return t(`aiModelRoles.${role}` as never)
  }
}
|
||||
|
||||
export function AIModelStatus() {
|
||||
const t = useTranslations('dashboard')
|
||||
const [models, setModels] = useState<ModelInfo[]>([
|
||||
@@ -72,6 +104,9 @@ export function AIModelStatus() {
|
||||
role: PROVIDER_ROLES[key] ?? 'provider',
|
||||
status: d.components?.[key]?.status ?? 'unknown',
|
||||
latencyMs: d.components?.[key]?.latency_ms,
|
||||
diagnosisCode: d.components?.[key]?.diagnosis_code,
|
||||
retryAfterSeconds: d.components?.[key]?.retry_after_seconds,
|
||||
isCooldown: d.components?.[key]?.is_cooldown,
|
||||
})))
|
||||
})
|
||||
.catch(() => {})
|
||||
@@ -98,9 +133,7 @@ export function AIModelStatus() {
|
||||
<span style={{ width: 5, height: 5, borderRadius: '50%', background: statusColor(m.status), flexShrink: 0 }} />
|
||||
<span style={{ fontSize: 12, fontWeight: 500, color: '#141413' }}>{m.name}</span>
|
||||
<span style={{ fontSize: 10, color: '#87867f', marginLeft: 'auto' }}>
|
||||
{typeof m.latencyMs === 'number'
|
||||
? `${Math.round(m.latencyMs)}ms`
|
||||
: t(`aiModelRoles.${m.role}` as never)}
|
||||
{modelDetail(m, t)}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
|
||||
@@ -51,6 +51,49 @@ http://127.0.0.1:3107/zh-TW/awooop/work-items?project_id=awoooi&incident_id=INC-
|
||||
- 整體 AI 自動化飛輪:約 73%;仍不能宣稱 24h 全自動 repair 閉環,需以 production evidence 持續補齊。
|
||||
- 24h 完整 AI Agent 自動修復 production claim:0%;仍維持嚴格口徑,只能宣稱「已驗證的特定 controlled apply / drill-down 能被追蹤」。
|
||||
|
||||
## 2026-05-31|Ollama health 診斷碼與前台冷卻顯示
|
||||
|
||||
**背景**:
|
||||
|
||||
- 111 local fallback 復原時,production health 曾短暫顯示 `recent endpoint failure cooldown`;這是 API pod 內 60 秒短期抑制,不等於 endpoint 當下仍不可用。
|
||||
- 既有 `/api/v1/health` 只回 `status/latency/error`,前台與告警只能把 cooldown / 502 upstream / timeout 都看成同一種 down。
|
||||
|
||||
**本次調整**:
|
||||
|
||||
- `apps/api/src/api/v1/health.py` 的 `ComponentHealth` 新增:
|
||||
- `provider_name`
|
||||
- `diagnosis_code`
|
||||
- `retry_after_seconds`
|
||||
- `cooldown_remaining_seconds`
|
||||
- `is_cooldown`
|
||||
- Ollama endpoint health 會穩定分類:
|
||||
- `endpoint_reachable`
|
||||
- `endpoint_cooldown`
|
||||
- `local_proxy_upstream_unreachable`
|
||||
- `proxy_upstream_unreachable`
|
||||
- `endpoint_timeout`
|
||||
- `endpoint_connection_refused`
|
||||
- `endpoint_network_unreachable`
|
||||
- `endpoint_unreachable`
|
||||
- `AIModelStatus` 前台卡片讀取這些欄位,將 cooldown 顯示成 `冷卻 Ns`,111 proxy / timeout / network unreachable / refused 也顯示短標籤。
|
||||
|
||||
**驗證**:
|
||||
|
||||
```text
|
||||
python3 -m py_compile apps/api/src/api/v1/health.py apps/api/tests/test_health_ollama_provider_chain.py -> pass
|
||||
pytest tests/test_health_ollama_provider_chain.py -q -> 7 passed
|
||||
ruff check src/api/v1/health.py tests/test_health_ollama_provider_chain.py --select E9,F401,F821,F841 -> pass
|
||||
python3 -m json.tool apps/web/messages/zh-TW.json / en.json -> pass
|
||||
cmp apps/web/messages/zh-TW.json apps/web/messages/en.json -> pass
|
||||
pnpm --dir apps/web exec tsc --noEmit --tsBuildInfoFile /tmp/awoooi-ai-model-health-diagnosis-20260531.tsbuildinfo -> pass
|
||||
git diff --check -> pass
|
||||
```
|
||||
|
||||
**判讀**:
|
||||
|
||||
- 這不是改路由策略,也不是新增修復動作;只是把健康檢查結果從「一段文字」升級成可被 Telegram / 前台穩定解讀的診斷欄位。
|
||||
- 後續若 111 再短暫抖動,前台應能看到「冷卻」或「111 proxy」而不是只看到泛化的 Ollama down。
|
||||
|
||||
## 2026-05-31|Ollama 111 local fallback 復原確認
|
||||
|
||||
**背景**:
|
||||
|
||||
@@ -2727,6 +2727,12 @@ Phase 6 完成後
|
||||
- Verification:`110 -> 192.168.0.111:11434 /api/tags -> 200`、`110 -> 127.0.0.1:11437 /api/tags -> 200`、production `/api/v1/health` cooldown expiry 後 `ollama_local=up`;`11437 hermes3:latest /api/generate -> 200 response=OK`,同輪 `11435/11436 hermes3:latest` 也皆成功。111 host 上 `com.momo.ollama111-allow-proxy` 為 running,allowlist 包含 `192.168.0.110/32`,`pmset sleep=0`。
|
||||
- 判讀:11437 目前已恢復,三層 Ollama health 已全綠;中間的 `recent endpoint failure cooldown` 是 API pod 內 60 秒短期抑制殘留,不代表 endpoint 當下仍不可用。若再復發,告警需把 111 reachability / wake evidence 顯示出來,避免被誤讀成 GCP-A/B Ollama 全掛。
|
||||
|
||||
**T154j Ollama health diagnosis codes surfaced(2026-05-31 台北)**:
|
||||
- 觸發:T154i 暴露 health surface 只剩 `status/latency/error`,無法讓 Telegram / 前台穩定區分 cooldown、111 proxy upstream、timeout、network unreachable。
|
||||
- 修正:`ComponentHealth` 新增 `provider_name`、`diagnosis_code`、`retry_after_seconds`、`cooldown_remaining_seconds`、`is_cooldown`;Ollama endpoint health 分類 `endpoint_reachable`、`endpoint_cooldown`、`local_proxy_upstream_unreachable`、`endpoint_timeout`、`endpoint_network_unreachable` 等。`AIModelStatus` 讀取診斷欄位,前台顯示 `冷卻 Ns`、`111 proxy`、`逾時`、`網路不可達`、`拒絕`。
|
||||
- Verification:focused `py_compile` pass、pytest `test_health_ollama_provider_chain.py` 7 passed、ruff `E9/F401/F821/F841` pass、i18n JSON parse/cmp pass、web `tsc --noEmit` pass、`git diff --check` pass。
|
||||
- 判讀:T154j 不改 ADR-110 路由策略、不新增自動修復,只把 health truth 變成可機器判讀的欄位,避免未來把 cooldown 或 111 upstream 抖動誤報成「所有 Ollama 掛」。
|
||||
|
||||
**T152 Ansible runtime readiness surfaced(2026-05-24 台北)**:
|
||||
- 觸發:T151 已讓首頁看到 execution backend / Ansible attribution,但 operator 仍看不到 runtime 端缺什麼,容易把「Ansible 有候選」誤解成「Ansible 已能自動修復」。
|
||||
- 修正:API image 複製 `infra/ansible/` 作 read-only catalog;`truth-chain/quality/summary` 新增 `ansible_runtime`,回報 playbook binary、catalog、inventory、playbook_count、can_run_check_mode、blockers。首頁 execution evidence 同步顯示 runtime 狀態;目前 production 顯示 `runtime 未就緒:ansible_playbook_binary_missing`。未安裝 `ansible-core`、未啟用 check-mode / apply。
|
||||
|
||||
Reference in New Issue
Block a user