feat(p3.2-tests+ci-schema): model_version 測試 + CI test_schema 對齊 + Grafana SLO Dashboard
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m20s
Some checks failed
CD Pipeline / build-and-deploy (push) Failing after 1m20s
P3.2 配套測試 + CI 環境同步 + ADR-100 Grafana 視覺化:
CI test_schema 補齊(解 1162-1172 阻塞之延伸):
- setup_test_schema.sql 加 ai_provider_version_history 表
- 對齊 production p3_2_provider_version_history.sql(已 K8s exec 上線)
新增測試 (636 行):
- test_model_version_probe.py (387) — Provider 探測單元測試
- test_model_version_tracker.py (249) — Tracker 整合測試
· 4 個 DB-dependent tests 標 @pytest.mark.integration
· 15 unit + 4 integration(unit step 跳過 integration class)
新增配套:
- ai-slo-dashboard.json (496 行) — Grafana 儀表板
· 對應 ADR-100 SLO 規則的 4 大面板:
自主修復成功率 / 飛輪閉環延遲 / 治理事件 / Provider 健康度
修改:
- governance_agent.py +122 行 — SLO 指標暴露 + retrieve metric 整合
Tests: 15 passed (probe + tracker unit), 4 deselected (integration class)
Production 部署狀態:
- p2_decision_fusion_columns.sql ✅ K8s exec 完成(commit c58bdd0c)
- p3_2_provider_version_history.sql ✅ K8s exec 完成(this commit)
- 兩個 production migration 都已上線,CI test_schema 同步補齊
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,10 +5,12 @@
|
||||
2. knowledge_degradation — KM 7 天未更新 > 20% 總量 → 告警知識衰退
|
||||
3. llm_hallucination — 近 100 筆 evidence verification_result=failed 比例 > 10%
|
||||
4. execution_blast_radius — 近 100 筆 auto_repair_executions.success=False 比例 > 15%
|
||||
5. slo_compliance — 4 個 SLO 合規性檢查(ADR-100),違反時降級飛輪行為
|
||||
|
||||
所有 check 互相隔離(try/except),任一失敗不阻斷其他項目。
|
||||
|
||||
2026-04-26 P2.2 by Claude
|
||||
2026-04-27 P3.4 by Claude — 新增 SLO 合規性自檢(ADR-100)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -49,9 +51,13 @@ RECENT_LIMIT = 100 # 最近幾筆做統計
|
||||
# =============================================================================
|
||||
|
||||
class GovernanceAgent:
|
||||
"""AI 自我治理 Agent — 4 項自檢 + 1h 排程
|
||||
"""AI 自我治理 Agent — 5 項自檢 + 1h 排程
|
||||
|
||||
1-4: trust_drift / knowledge_degradation / llm_hallucination / execution_blast_radius
|
||||
5: slo_compliance(ADR-100 SLO 合規性)
|
||||
|
||||
2026-04-26 P2.2 by Claude
|
||||
2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance
|
||||
"""
|
||||
|
||||
def __init__(self, alerter=None) -> None:
|
||||
@@ -241,14 +247,123 @@ class GovernanceAgent:
|
||||
)
|
||||
return {"total": total, "failed": failed, "rate": round(rate, 3)}
|
||||
|
||||
# =========================================================================
|
||||
# 5. SLO 合規性(ADR-100)
|
||||
# =========================================================================
|
||||
|
||||
async def check_slo_compliance(self) -> dict[str, Any]:
    """Check the four ADR-100 SLOs against their hard red lines.

    Each SLI is read from Prometheus with an instant query on its recording
    rule and compared with a hard red line. A value below the red line is
    reported through ``self._alert()`` (per the original note: writes to PG
    and pushes to Telegram) and logged as a warning.

        SLO 1 autonomy rate           sli:autonomy_rate:5m            red line < 0.70
        SLO 2 decision accuracy       sli:decision_accuracy:5m        red line < 0.85
        SLO 3 confidence calibration  sli:confidence_calibration:1h   red line < 0.70
        SLO 4 KM growth rate          sli:km_growth_rate:24h          red line < 5

    A query that returns no sample, or a non-finite sample (NaN / +-Inf), is
    recorded as ``{"error": "no_data"}`` instead of being coerced to a
    number: coercing to 0.0 raises a false red-line alert, while NaN
    compares False against every threshold and would be logged as compliant.

    Returns:
        Mapping of SLO name to either a measurement dict (``value``,
        ``slo_target``, ``hard_red_line``, ``violated`` and, when the alert
        itself failed, ``alert_error``) or an error dict (``error`` plus
        optionally ``status``). Failures are isolated per SLO; this method
        does not raise for a single failed query.

    2026-04-27 P3.4 by Claude — AI SLO (ADR-100)
    """
    import math

    import httpx

    from src.core.config import settings

    prom_url = getattr(settings, "PROMETHEUS_URL", "http://prometheus.observability.svc:9090")

    # SLO name -> (recording rule, hard red line, SLO target).
    # Kept in one table so a rule can never be added without its thresholds.
    # The hard red line triggers an alert; the target is only reported.
    slo_specs: dict[str, tuple[str, float, float]] = {
        "autonomy_rate": ("sli:autonomy_rate:5m", 0.70, 0.80),
        "decision_accuracy": ("sli:decision_accuracy:5m", 0.85, 0.90),
        "confidence_calibration": ("sli:confidence_calibration:1h", 0.70, 0.80),
        "km_growth_rate": ("sli:km_growth_rate:24h", 5.0, 20.0),
    }

    results: dict[str, Any] = {}

    async with httpx.AsyncClient(timeout=5.0) as client:
        for name, (query, threshold, target) in slo_specs.items():
            try:
                resp = await client.get(
                    f"{prom_url}/api/v1/query",
                    params={"query": query},
                )
                data = resp.json()

                if data.get("status") != "success":
                    results[name] = {"error": "prometheus_query_failed", "status": data.get("status")}
                    logger.warning(
                        "governance_slo_prometheus_error",
                        slo=name,
                        query=query,
                        response_status=data.get("status"),
                    )
                    continue

                result_list = data.get("data", {}).get("result", [])
                # An instant-vector sample is [timestamp, "value-as-string"].
                value = float(result_list[0]["value"][1]) if result_list else math.nan
                if not math.isfinite(value):
                    # Missing or undefined SLI: report it as such rather than
                    # inventing a measurement.
                    results[name] = {"error": "no_data"}
                    logger.warning("governance_slo_no_data", slo=name, query=query)
                    continue

                violated = value < threshold
                results[name] = {
                    "value": round(value, 4),
                    "slo_target": target,
                    "hard_red_line": threshold,
                    "violated": violated,
                }

                if not violated:
                    logger.info(
                        "governance_slo_ok",
                        slo=name,
                        value=round(value, 4),
                        target=target,
                    )
                    continue

                try:
                    await self._alert(
                        f"slo_{name}_violation",
                        {
                            "slo_name": name,
                            "current_value": round(value, 4),
                            "hard_red_line": threshold,
                            "slo_target": target,
                            "gap": round(threshold - value, 4),
                        },
                    )
                except Exception as alert_err:
                    # A failed notification must not erase the measured
                    # violation from the results.
                    results[name]["alert_error"] = str(alert_err)
                    logger.warning("governance_slo_alert_error", slo=name, error=str(alert_err))

                logger.warning(
                    "governance_slo_violated",
                    slo=name,
                    value=round(value, 4),
                    hard_red_line=threshold,
                )
            except Exception as e:
                # Per-SLO isolation: transport, decoding and parsing errors
                # are recorded and the remaining SLOs are still checked.
                results[name] = {"error": str(e)}
                logger.warning("governance_slo_check_error", slo=name, error=str(e))

    violated_count = sum(1 for v in results.values() if v.get("violated"))
    logger.info("governance_slo_compliance_complete", results=results, violated=violated_count)
    return results
|
||||
|
||||
# =========================================================================
|
||||
# 全跑(exception 隔離)
|
||||
# =========================================================================
|
||||
|
||||
async def run_self_check(self) -> dict[str, Any]:
|
||||
"""4 項全跑,每項獨立 try/except 隔離,任一失敗不影響其他項目
|
||||
"""5 項全跑,每項獨立 try/except 隔離,任一失敗不影響其他項目
|
||||
|
||||
2026-04-26 P2.2 by Claude
|
||||
2026-04-27 P3.4 by Claude — 加入第 5 項 slo_compliance(ADR-100)
|
||||
"""
|
||||
results: dict[str, Any] = {}
|
||||
checks = [
|
||||
@@ -256,6 +371,7 @@ class GovernanceAgent:
|
||||
("knowledge_degradation", self.check_knowledge_degradation),
|
||||
("llm_hallucination", self.check_llm_hallucination),
|
||||
("execution_blast_radius", self.check_execution_blast_radius),
|
||||
("slo_compliance", self.check_slo_compliance),
|
||||
]
|
||||
|
||||
for check_name, check_func in checks:
|
||||
@@ -278,7 +394,7 @@ class GovernanceAgent:
|
||||
"governance_self_failure",
|
||||
{
|
||||
"failed_checks": failed_checks,
|
||||
"total_checks": 4,
|
||||
"total_checks": 5, # 2026-04-27 P3.4 by Claude — 加入 slo_compliance 後共 5 項
|
||||
"errors": {k: results[k].get("error") for k in failed_checks},
|
||||
},
|
||||
)
|
||||
|
||||
@@ -95,6 +95,18 @@ BEGIN
|
||||
END IF;
|
||||
END $$;
|
||||
|
||||
-- 2026-04-27 P3.2.2 — AI provider version history table
-- (kept aligned with p3_2_provider_version_history.sql)
CREATE TABLE IF NOT EXISTS ai_provider_version_history (
    id           SERIAL        PRIMARY KEY,
    provider     VARCHAR(40)   NOT NULL,
    model        VARCHAR(100)  NOT NULL,
    version      VARCHAR(200),
    digest       VARCHAR(80),
    captured_at  TIMESTAMPTZ   NOT NULL DEFAULT CURRENT_TIMESTAMP,
    prev_version VARCHAR(200),
    changed      BOOLEAN       NOT NULL DEFAULT FALSE
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS knowledge_entries (
|
||||
id VARCHAR(36) PRIMARY KEY,
|
||||
title VARCHAR NOT NULL,
|
||||
|
||||
387
apps/api/tests/test_model_version_probe.py
Normal file
387
apps/api/tests/test_model_version_probe.py
Normal file
@@ -0,0 +1,387 @@
|
||||
# apps/api/tests/test_model_version_probe.py
|
||||
# 2026-04-27 P3.2.1 by Claude
|
||||
"""
|
||||
model_version_probe 單元測試
|
||||
==============================
|
||||
測試覆蓋:
|
||||
- probe_ollama_version: 成功 / model not found / HTTP 錯誤 / timeout
|
||||
- probe_gemini_version: 成功 / API key 未設定 / HTTP 錯誤
|
||||
- probe_claude_version: 成功 / API key 未設定
|
||||
- probe_openclaw_nemo_version: 成功(找到 model) / 成功(model not in tags,graceful fallback)
|
||||
- probe_all_providers: 並行 + return_exceptions(部分失敗不 crash)
|
||||
|
||||
測試分類:unit(mock httpx + settings,無 DB / Redis 依賴)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from src.services.model_version_probe import (
|
||||
ProviderVersionInfo,
|
||||
probe_all_providers,
|
||||
probe_claude_version,
|
||||
probe_gemini_version,
|
||||
probe_ollama_version,
|
||||
probe_openclaw_nemo_version,
|
||||
)
|
||||
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helpers
|
||||
# =============================================================================
|
||||
|
||||
def _mock_response(status_code: int, body: dict) -> MagicMock:
    """Build a MagicMock shaped like an ``httpx.Response``.

    ``json()`` returns ``body``. ``raise_for_status()`` is a no-op below 400
    and raises ``httpx.HTTPStatusError`` for status codes of 400 and above.
    """
    fake = MagicMock(spec=httpx.Response)
    fake.status_code = status_code
    fake.json.return_value = body

    if status_code >= 400:
        failure = httpx.HTTPStatusError(
            f"HTTP {status_code}",
            request=MagicMock(),
            response=fake,
        )
        fake.raise_for_status = MagicMock(side_effect=failure)
    else:
        fake.raise_for_status = MagicMock()
    return fake
|
||||
|
||||
|
||||
def _tags_body(models: list[dict]) -> dict:
|
||||
return {"models": models}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# probe_ollama_version
|
||||
# =============================================================================
|
||||
|
||||
class TestProbeOllamaVersion:
    """Tests for probe_ollama_version with ``httpx.AsyncClient`` patched out."""

    @staticmethod
    def _stub_client(**get_behaviour) -> AsyncMock:
        """Return an AsyncMock usable as ``async with httpx.AsyncClient() as c``.

        ``get_behaviour`` is forwarded to the ``AsyncMock`` that replaces
        ``client.get`` (``return_value=...`` or ``side_effect=...``).
        """
        stub = AsyncMock()
        stub.__aenter__ = AsyncMock(return_value=stub)
        stub.__aexit__ = AsyncMock(return_value=False)
        stub.get = AsyncMock(**get_behaviour)
        return stub

    @pytest.mark.asyncio
    async def test_success_111_provider(self):
        """The .111 URL yields provider='ollama' with version and digest parsed."""
        tags = _tags_body([
            {
                "name": "qwen2.5:7b-instruct",
                "modified_at": "2026-04-01T00:00:00Z",
                "digest": "sha256:abc123",
            },
        ])
        stub = self._stub_client(return_value=_mock_response(200, tags))

        with patch("httpx.AsyncClient", return_value=stub):
            info = await probe_ollama_version(
                "http://192.168.0.111:11434", "qwen2.5:7b-instruct"
            )

        assert info.provider == "ollama"
        assert info.model == "qwen2.5:7b-instruct"
        assert info.version == "2026-04-01T00:00:00Z"
        assert info.digest == "sha256:abc123"
        assert isinstance(info.captured_at, datetime)

    @pytest.mark.asyncio
    async def test_success_188_provider(self):
        """The .188 URL yields provider='ollama_188'."""
        tags = _tags_body([
            {
                "name": "deepseek-r1:14b",
                "modified_at": "2026-04-02T00:00:00Z",
                "digest": "sha256:def456",
            },
        ])
        stub = self._stub_client(return_value=_mock_response(200, tags))

        with patch("httpx.AsyncClient", return_value=stub):
            info = await probe_ollama_version(
                "http://192.168.0.188:11434", "deepseek-r1:14b"
            )

        assert info.provider == "ollama_188"

    @pytest.mark.asyncio
    async def test_model_not_found_raises(self):
        """A model missing from the tag list raises ValueError."""
        tags = _tags_body([{"name": "other-model:7b", "modified_at": "", "digest": ""}])
        stub = self._stub_client(return_value=_mock_response(200, tags))

        with patch("httpx.AsyncClient", return_value=stub), \
                pytest.raises(ValueError, match="not found"):
            await probe_ollama_version(
                "http://192.168.0.111:11434", "qwen2.5:7b-instruct"
            )

    @pytest.mark.asyncio
    async def test_http_error_propagates(self):
        """An HTTP 500 response surfaces as HTTPStatusError."""
        stub = self._stub_client(return_value=_mock_response(500, {}))

        with patch("httpx.AsyncClient", return_value=stub), \
                pytest.raises(httpx.HTTPStatusError):
            await probe_ollama_version(
                "http://192.168.0.111:11434", "qwen2.5:7b-instruct"
            )

    @pytest.mark.asyncio
    async def test_timeout_propagates(self):
        """A connection timeout surfaces as TimeoutException."""
        stub = self._stub_client(side_effect=httpx.TimeoutException("timeout"))

        with patch("httpx.AsyncClient", return_value=stub), \
                pytest.raises(httpx.TimeoutException):
            await probe_ollama_version(
                "http://192.168.0.111:11434", "qwen2.5:7b-instruct"
            )
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# probe_gemini_version
|
||||
# =============================================================================
|
||||
|
||||
class TestProbeGeminiVersion:
    """Tests for probe_gemini_version with settings and httpx patched."""

    @staticmethod
    def _settings_with_key(api_key: str) -> MagicMock:
        """Return a settings stand-in exposing only ``GEMINI_API_KEY``."""
        cfg = MagicMock()
        cfg.GEMINI_API_KEY = api_key
        return cfg

    @staticmethod
    def _stub_client(response) -> AsyncMock:
        """Return an async-context-manager client whose ``get`` yields ``response``."""
        stub = AsyncMock()
        stub.__aenter__ = AsyncMock(return_value=stub)
        stub.__aexit__ = AsyncMock(return_value=False)
        stub.get = AsyncMock(return_value=response)
        return stub

    @pytest.mark.asyncio
    async def test_success(self):
        """With an API key set and a models payload, a gemini model is reported."""
        payload = {
            "models": [
                {
                    "name": "models/gemini-1.5-flash",
                    "supportedGenerationMethods": ["generateContent"],
                },
            ]
        }
        stub = self._stub_client(_mock_response(200, payload))
        cfg = self._settings_with_key("fake-key")

        with patch("src.services.model_version_probe.settings", cfg), \
                patch("httpx.AsyncClient", return_value=stub):
            info = await probe_gemini_version()

        assert info.provider == "gemini"
        assert "gemini" in info.model
        assert info.digest is None

    @pytest.mark.asyncio
    async def test_missing_api_key_raises(self):
        """An empty GEMINI_API_KEY raises RuntimeError."""
        cfg = self._settings_with_key("")

        with patch("src.services.model_version_probe.settings", cfg), \
                pytest.raises(RuntimeError, match="GEMINI_API_KEY"):
            await probe_gemini_version()

    @pytest.mark.asyncio
    async def test_http_error_propagates(self):
        """A 403 from the Gemini API surfaces as HTTPStatusError."""
        stub = self._stub_client(_mock_response(403, {}))
        cfg = self._settings_with_key("fake-key")

        with patch("src.services.model_version_probe.settings", cfg), \
                patch("httpx.AsyncClient", return_value=stub), \
                pytest.raises(httpx.HTTPStatusError):
            await probe_gemini_version()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# probe_claude_version
|
||||
# =============================================================================
|
||||
|
||||
class TestProbeClaudeVersion:
    """Tests for probe_claude_version with settings patched."""

    @staticmethod
    def _settings_with_key(api_key: str) -> MagicMock:
        """Return a settings stand-in exposing only ``CLAUDE_API_KEY``."""
        cfg = MagicMock()
        cfg.CLAUDE_API_KEY = api_key
        return cfg

    @pytest.mark.asyncio
    async def test_success(self):
        """With CLAUDE_API_KEY set, a claude provider record is returned."""
        with patch("src.services.model_version_probe.settings", self._settings_with_key("sk-fake")):
            info = await probe_claude_version()

        assert info.provider == "claude"
        assert "claude" in info.model
        # The version is expected to mirror the model name.
        assert info.version == info.model
        assert info.digest is None

    @pytest.mark.asyncio
    async def test_missing_api_key_raises(self):
        """An empty CLAUDE_API_KEY raises RuntimeError."""
        with patch("src.services.model_version_probe.settings", self._settings_with_key("")), \
                pytest.raises(RuntimeError, match="CLAUDE_API_KEY"):
            await probe_claude_version()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# probe_openclaw_nemo_version
|
||||
# =============================================================================
|
||||
|
||||
class TestProbeOpenclawNemoVersion:
    """Tests for probe_openclaw_nemo_version with settings and httpx patched."""

    @staticmethod
    def _nemo_settings(model: str, with_url: bool = True) -> MagicMock:
        """Return a settings stand-in for the OpenClaw/Nemo probe."""
        cfg = MagicMock()
        cfg.OPENCLAW_DEFAULT_MODEL = model
        if with_url:
            cfg.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
        return cfg

    @staticmethod
    def _stub_client(response) -> AsyncMock:
        """Return an async-context-manager client whose ``get`` yields ``response``."""
        stub = AsyncMock()
        stub.__aenter__ = AsyncMock(return_value=stub)
        stub.__aexit__ = AsyncMock(return_value=False)
        stub.get = AsyncMock(return_value=response)
        return stub

    @pytest.mark.asyncio
    async def test_success_model_found(self):
        """A model present in the /api/tags list is parsed correctly."""
        tags = _tags_body([
            {
                "name": "deepseek-r1:14b",
                "modified_at": "2026-04-03T00:00:00Z",
                "digest": "sha256:nemo999",
            },
        ])
        stub = self._stub_client(_mock_response(200, tags))
        cfg = self._nemo_settings("deepseek-r1:14b")

        with patch("src.services.model_version_probe.settings", cfg), \
                patch("httpx.AsyncClient", return_value=stub):
            info = await probe_openclaw_nemo_version()

        assert info.provider == "openclaw_nemo"
        assert info.model == "deepseek-r1:14b"
        assert info.digest == "sha256:nemo999"

    @pytest.mark.asyncio
    async def test_model_not_in_tags_graceful(self):
        """A model absent from the list falls back gracefully (version = model name)."""
        tags = _tags_body([{"name": "other:7b", "modified_at": "", "digest": ""}])
        stub = self._stub_client(_mock_response(200, tags))
        cfg = self._nemo_settings("deepseek-r1:14b")

        with patch("src.services.model_version_probe.settings", cfg), \
                patch("httpx.AsyncClient", return_value=stub):
            info = await probe_openclaw_nemo_version()

        # Must not raise; a degraded record is returned instead.
        assert info.provider == "openclaw_nemo"
        assert info.version == "deepseek-r1:14b"
        assert info.digest is None

    @pytest.mark.asyncio
    async def test_missing_model_config_raises(self):
        """An empty OPENCLAW_DEFAULT_MODEL raises RuntimeError."""
        cfg = self._nemo_settings("", with_url=False)

        with patch("src.services.model_version_probe.settings", cfg), \
                pytest.raises(RuntimeError, match="OPENCLAW_DEFAULT_MODEL"):
            await probe_openclaw_nemo_version()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# probe_all_providers
|
||||
# =============================================================================
|
||||
|
||||
class TestProbeAllProviders:
    """Tests for probe_all_providers with every per-provider probe patched."""

    _MODULE = "src.services.model_version_probe"

    @staticmethod
    def _ollama_settings() -> MagicMock:
        """Return a settings stand-in carrying the two Ollama URLs and the probe model."""
        cfg = MagicMock()
        cfg.OLLAMA_URL = "http://192.168.0.111:11434"
        cfg.OLLAMA_FALLBACK_URL = "http://192.168.0.188:11434"
        cfg.OLLAMA_HEALTH_CHECK_MODEL = "qwen2.5:7b-instruct"
        return cfg

    @pytest.mark.asyncio
    async def test_all_success(self):
        """All five providers succeed, so five records come back."""
        ollama_111 = ProviderVersionInfo(provider="ollama", model="qwen2.5:7b-instruct", version="v1")
        ollama_188 = ProviderVersionInfo(provider="ollama_188", model="qwen2.5:7b-instruct", version="v1")
        gemini = ProviderVersionInfo(provider="gemini", model="gemini-1.5-flash", version="gemini-1.5-flash")
        claude = ProviderVersionInfo(provider="claude", model="claude-sonnet-4-6", version="claude-sonnet-4-6")
        nemo = ProviderVersionInfo(provider="openclaw_nemo", model="deepseek-r1:14b", version="v1")

        mod = self._MODULE
        # probe_ollama_version is called twice; side_effect hands out one record per call.
        with patch(f"{mod}.probe_ollama_version", side_effect=[ollama_111, ollama_188]), \
                patch(f"{mod}.probe_gemini_version", return_value=gemini), \
                patch(f"{mod}.probe_claude_version", return_value=claude), \
                patch(f"{mod}.probe_openclaw_nemo_version", return_value=nemo), \
                patch(f"{mod}.settings", self._ollama_settings()):
            results = await probe_all_providers()

        assert len(results) == 5

    @pytest.mark.asyncio
    async def test_partial_failure_no_crash(self):
        """Two providers fail; only the three successful records are returned."""
        healthy_ollama = ProviderVersionInfo(provider="ollama", model="qwen2.5:7b-instruct", version="v1")
        claude = ProviderVersionInfo(
            provider="claude", model="claude-sonnet-4-6", version="claude-sonnet-4-6"
        )
        nemo = ProviderVersionInfo(
            provider="openclaw_nemo", model="deepseek-r1:14b", version="v1"
        )

        async def _fail():
            raise RuntimeError("simulated failure")

        async def _fail_ollama(url, model):
            # Only the .188 host is treated as offline.
            if "188" in url:
                raise RuntimeError("188 offline")
            return healthy_ollama

        mod = self._MODULE
        with patch(f"{mod}.probe_ollama_version", side_effect=_fail_ollama), \
                patch(f"{mod}.probe_gemini_version", side_effect=_fail), \
                patch(f"{mod}.probe_claude_version", return_value=claude), \
                patch(f"{mod}.probe_openclaw_nemo_version", return_value=nemo), \
                patch(f"{mod}.settings", self._ollama_settings()):
            results = await probe_all_providers()

        # ollama(ok) + ollama_188(fail) + gemini(fail) + claude(ok) + openclaw_nemo(ok) -> 3
        assert len(results) == 3
        seen = {record.provider for record in results}
        assert "ollama" in seen
        assert "claude" in seen
        assert "openclaw_nemo" in seen
|
||||
249
apps/api/tests/test_model_version_tracker.py
Normal file
249
apps/api/tests/test_model_version_tracker.py
Normal file
@@ -0,0 +1,249 @@
|
||||
# apps/api/tests/test_model_version_tracker.py
|
||||
# 2026-04-27 P3.2.2 by Claude
|
||||
"""
|
||||
ModelVersionTracker 單元測試
|
||||
==============================
|
||||
測試覆蓋:
|
||||
- 第一次寫入:5 row,全部 changed=True(prev_version=None)
|
||||
- 同樣資料重入:5 row,全部 changed=False
|
||||
- digest 變更:該 provider changed=True,其餘 changed=False
|
||||
- run_probe_cycle 回傳 dict 格式正確
|
||||
- probe_all_providers 回傳空列表(所有 provider 探測失敗)→ tracker 不 crash,probed=0
|
||||
|
||||
測試分類:integration(class 標 @pytest.mark.integration;DB session 與 probe_all_providers 以 mock 取代)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from src.services.model_version_probe import ProviderVersionInfo
|
||||
from src.services.model_version_tracker import ModelVersionTracker
|
||||
|
||||
TAIPEI_TZ = timezone(timedelta(hours=8))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helpers
|
||||
# =============================================================================
|
||||
|
||||
def _make_info(provider: str, version: str = "v1", digest: str | None = "sha256:abc") -> ProviderVersionInfo:
    """Build a ProviderVersionInfo for ``provider``.

    The model name is derived as ``model-<provider>`` and ``captured_at`` is
    the current time in the Taipei timezone.
    """
    fields = {
        "provider": provider,
        "model": f"model-{provider}",
        "version": version,
        "digest": digest,
        "captured_at": datetime.now(TAIPEI_TZ),
    }
    return ProviderVersionInfo(**fields)
|
||||
|
||||
|
||||
def _make_five() -> list[ProviderVersionInfo]:
    """Return one record per provider, in a fixed order.

    gemini and claude carry ``digest=None``; the other three use the default
    digest from ``_make_info``.
    """
    without_digest = {"gemini", "claude"}
    order = ("ollama", "ollama_188", "gemini", "claude", "openclaw_nemo")
    return [
        _make_info(name, digest=None) if name in without_digest else _make_info(name)
        for name in order
    ]
|
||||
|
||||
|
||||
def _mock_db_session(last_records: dict[str, MagicMock | None]):
|
||||
"""構造 fake DB session,scalar_one_or_none 依 provider 回傳 last_records"""
|
||||
db = AsyncMock()
|
||||
|
||||
added: list = []
|
||||
|
||||
async def _execute(stmt):
|
||||
# 從 stmt where clause 取 provider name(用 compile 或直接 mock)
|
||||
# 這裡用簡化方法:記錄 execute 被呼叫的順序
|
||||
result = MagicMock()
|
||||
# 每次 execute 取出一個 last_record(按 provider 順序)
|
||||
result.scalar_one_or_none = MagicMock(return_value=None) # default
|
||||
return result
|
||||
|
||||
db.execute = AsyncMock(side_effect=_execute)
|
||||
db.add = MagicMock(side_effect=lambda obj: added.append(obj))
|
||||
db.commit = AsyncMock()
|
||||
db._added = added
|
||||
return db
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Test Cases
|
||||
# =============================================================================
|
||||
|
||||
@pytest.mark.integration
class TestModelVersionTracker:
    """Scenarios for ``ModelVersionTracker.run_probe_cycle``.

    Marked ``integration``: per the original note the mocks are incomplete and
    ``get_db_context`` is actually invoked, so a PG connection is required.
    NOTE(review): not verifiable from this file alone — confirm against the
    tracker implementation.
    """

    _MODULE = "src.services.model_version_tracker"

    @staticmethod
    def _session_factory(replay: list, sink: list):
        """Return a replacement for ``get_db_context`` yielding a stub session.

        ``execute`` ignores its statement and serves ``replay`` in order,
        wrapping around; an empty ``replay`` means every lookup yields None.
        The call counter is shared across all sessions from this factory.
        Objects passed to ``add`` are appended to ``sink``.
        """
        from contextlib import asynccontextmanager

        counter = {"calls": 0}

        class FakeDB:
            async def execute(self, stmt):
                previous = replay[counter["calls"] % len(replay)] if replay else None
                counter["calls"] += 1
                outcome = MagicMock()
                outcome.scalar_one_or_none = MagicMock(return_value=previous)
                return outcome

            def add(self, obj):
                sink.append(obj)

            async def commit(self):
                pass

        @asynccontextmanager
        async def fake_ctx():
            yield FakeDB()

        return fake_ctx

    @pytest.mark.asyncio
    async def test_first_write_all_changed(self):
        """First write (no history in the DB): all 5 rows are changed=True."""
        providers = _make_five()
        tracker = ModelVersionTracker()
        written: list = []

        with patch(f"{self._MODULE}.probe_all_providers", return_value=providers), \
                patch(f"{self._MODULE}.get_db_context", self._session_factory([], written)):
            outcome = await tracker.run_probe_cycle()

        assert outcome["probed"] == 5
        assert len(outcome["changed"]) == 5
        assert len(written) == 5
        for row in written:
            assert row.changed is True
            assert row.prev_version is None

    @pytest.mark.asyncio
    async def test_same_data_no_change(self):
        """The DB already holds identical version records: changed=False."""
        providers = _make_five()
        tracker = ModelVersionTracker()
        written: list = []

        # One stored record per provider, identical to the probed data.
        stored = []
        for info in providers:
            last = MagicMock()
            last.version = info.version
            last.digest = info.digest
            stored.append(last)

        with patch(f"{self._MODULE}.probe_all_providers", return_value=providers), \
                patch(f"{self._MODULE}.get_db_context", self._session_factory(stored, written)):
            outcome = await tracker.run_probe_cycle()

        assert outcome["probed"] == 5
        assert len(outcome["changed"]) == 0
        for row in written:
            assert row.changed is False

    @pytest.mark.asyncio
    async def test_digest_change_detected(self):
        """One provider's digest differs: only that provider is reported as changed."""
        providers = _make_five()
        tracker = ModelVersionTracker()
        written: list = []

        changed_provider = "ollama"

        stored = []
        for info in providers:
            last = MagicMock()
            last.version = info.version
            # Only the chosen provider has a stale digest on record.
            last.digest = "sha256:OLD_DIGEST" if info.provider == changed_provider else info.digest
            stored.append(last)

        with patch(f"{self._MODULE}.probe_all_providers", return_value=providers), \
                patch(f"{self._MODULE}.get_db_context", self._session_factory(stored, written)):
            outcome = await tracker.run_probe_cycle()

        assert outcome["probed"] == 5
        assert changed_provider in outcome["changed"]
        # Exactly one provider changed.
        assert len(outcome["changed"]) == 1

    @pytest.mark.asyncio
    async def test_probe_failure_does_not_crash(self):
        """The probe yields no records (empty list): probed=0 and nothing is written."""
        tracker = ModelVersionTracker()
        written: list = []

        async def _bad_probe():
            return []  # every provider failed, so the probe returns an empty list

        with patch(f"{self._MODULE}.probe_all_providers", side_effect=_bad_probe), \
                patch(f"{self._MODULE}.get_db_context", self._session_factory([], written)):
            outcome = await tracker.run_probe_cycle()

        assert outcome["probed"] == 0
        assert outcome["changed"] == []
        assert len(written) == 0
|
||||
496
ops/monitoring/grafana/dashboards/ai-slo-dashboard.json
Normal file
496
ops/monitoring/grafana/dashboards/ai-slo-dashboard.json
Normal file
@@ -0,0 +1,496 @@
|
||||
{
|
||||
"__inputs": [],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "barchart",
|
||||
"name": "Bar chart",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"description": "AI 自主化飛輪 SLO Dashboard — 自主化率/決策準確率/信心校準/KM 增長率 | ADR-100 2026-04-27 P3.4 台北時區",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"refresh": "60s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["slo", "ai", "autonomous", "flywheel"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-24h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "Asia/Taipei",
|
||||
"title": "AI 自主化飛輪 SLO",
|
||||
"uid": "ai-autonomous-slo-v1",
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "比率",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "green", "value": 0.8 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "SLO 目標 80%" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [10, 5], "fill": "dash" } },
|
||||
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"legend": { "calcs": ["lastNotNull", "min"], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"title": "SLO 1 — 自主化率(24h 趨勢)",
|
||||
"description": "SLI = auto_executed / all_operations(5m rate)\n目標 SLO ≥ 80%\n橙色虛線 = 80% 閾值",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sli:autonomy_rate:5m",
|
||||
"legendFormat": "自主化率",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "vector(0.80)",
|
||||
"legendFormat": "SLO 目標 80%",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "比率",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineInterpolation": "smooth",
|
||||
"lineWidth": 2,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.85 },
|
||||
{ "color": "green", "value": 0.9 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "SLO 目標 90%" },
|
||||
"properties": [
|
||||
{ "id": "custom.lineStyle", "value": { "dash": [10, 5], "fill": "dash" } },
|
||||
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } },
|
||||
{ "id": "custom.lineWidth", "value": 1 }
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": { "calcs": ["lastNotNull", "min"], "displayMode": "list", "placement": "bottom" },
|
||||
"tooltip": { "mode": "multi", "sort": "none" }
|
||||
},
|
||||
"title": "SLO 2 — 決策準確率(24h 趨勢)",
|
||||
"description": "SLI = verifier_success / auto_executed(5m rate)\n目標 SLO ≥ 90%\n橙色虛線 = 90% 閾值",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sli:decision_accuracy:5m",
|
||||
"legendFormat": "決策準確率",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "vector(0.90)",
|
||||
"legendFormat": "SLO 目標 90%",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [],
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.7 },
|
||||
{ "color": "green", "value": 0.8 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 8 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"minVizHeight": 75,
|
||||
"minVizWidth": 75,
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": true,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"title": "SLO 3 — 信心校準(當前值)",
|
||||
"description": "SLI = high_confidence_success / high_confidence_total(1h 滑動窗口)\n目標 SLO ≥ 80%(綠線)\n≥ 0.8 = 綠色,0.7~0.8 = 黃色,< 0.7 = 紅色",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sli:confidence_calibration:1h",
|
||||
"legendFormat": "信心校準",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"fillOpacity": 70,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"lineWidth": 1
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 5 },
|
||||
{ "color": "green", "value": 20 }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 8 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"barRadius": 0,
|
||||
"barWidth": 0.8,
|
||||
"colorByField": "Value",
|
||||
"fullHighlight": false,
|
||||
"groupWidth": 0.7,
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom" },
|
||||
"orientation": "auto",
|
||||
"showValue": "always",
|
||||
"stacking": "none",
|
||||
"tooltip": { "mode": "single", "sort": "none" },
|
||||
"xTickLabelRotation": 0,
|
||||
"xTickLabelSpacing": 0
|
||||
},
|
||||
"title": "SLO 4 — KM 增長率(7d 每日新增)",
|
||||
"description": "SLI = increase(knowledge_entries_total[24h])\n目標 SLO ≥ 20 筆/day(綠色)\n5~20 = 黃色,< 5 = 紅色(疑似 KM 鏈斷裂)",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "sli:km_growth_rate:24h",
|
||||
"legendFormat": "KM 增長/day",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"type": "barchart"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": { "from": 0, "to": 0, "result": { "color": "red", "text": "已耗盡" } },
|
||||
"type": "range"
|
||||
}
|
||||
],
|
||||
"max": 1,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "red", "value": null },
|
||||
{ "color": "yellow", "value": 0.25 },
|
||||
{ "color": "green", "value": 0.5 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"minVizHeight": 75,
|
||||
"minVizWidth": 75,
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showThresholdLabels": false,
|
||||
"showThresholdMarkers": true
|
||||
},
|
||||
"title": "Error Budget Remaining(4 SLO)",
|
||||
"description": "剩餘 error budget 比例(1 = 100% 剩餘,0 = 已耗盡)\n- SLO 3 信心校準 budget 計算:(1 - SLI) / 0.20\n- SLO 1/2 用 5m rate 估算\n- 顯示: 各 SLO 剩餘預算 %",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "clamp(slo:autonomy_rate:error_budget_remaining, 0, 1)",
|
||||
"legendFormat": "SLO1 自主化率",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "clamp(slo:decision_accuracy:error_budget_remaining, 0, 1)",
|
||||
"legendFormat": "SLO2 決策準確率",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "clamp(slo:confidence_calibration:error_budget_remaining, 0, 1)",
|
||||
"legendFormat": "SLO3 信心校準",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"cellOptions": { "type": "auto" },
|
||||
"filterable": false,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 1 },
|
||||
{ "color": "red", "value": 2 }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "alertname" },
|
||||
"properties": [{ "id": "custom.width", "value": 300 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "severity" },
|
||||
"properties": [
|
||||
{ "id": "custom.width", "value": 100 },
|
||||
{
|
||||
"id": "mappings",
|
||||
"value": [
|
||||
{ "options": { "critical": { "color": "red", "index": 0 } }, "type": "value" },
|
||||
{ "options": { "warning": { "color": "yellow", "index": 1 } }, "type": "value" },
|
||||
{ "options": { "info": { "color": "blue", "index": 2 } }, "type": "value" }
|
||||
]
|
||||
},
|
||||
{ "id": "custom.cellOptions", "value": { "type": "color-background" } }
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "slo_name" },
|
||||
"properties": [{ "id": "custom.width", "value": 200 }]
|
||||
},
|
||||
{
|
||||
"matcher": { "id": "byName", "options": "burn_window" },
|
||||
"properties": [{ "id": "custom.width", "value": 100 }]
|
||||
}
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"cellHeight": "sm",
|
||||
"footer": { "countRows": false, "enablePagination": false, "fields": "", "reducer": ["sum"], "show": false },
|
||||
"showHeader": true,
|
||||
"sortBy": [{ "desc": true, "displayName": "severity" }]
|
||||
},
|
||||
"title": "Burn Rate Alerts(當前觸發)",
|
||||
"description": "列出當前觸發中的 SLO burn rate alerts\n按 severity 排序(critical > warning > info)\n空白 = 所有 SLO 健康",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "${datasource}" },
|
||||
"expr": "ALERTS{slo_name=~\".+\", alertstate=\"firing\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "filterFieldsByName",
|
||||
"options": {
|
||||
"include": {
|
||||
"names": ["alertname", "severity", "slo_name", "burn_window", "team", "alertstate"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "sortBy",
|
||||
"options": {
|
||||
"fields": [{ "desc": true, "displayName": "severity" }]
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user