feat(metrics): expose AI automation health gauges
Some checks failed
CD Pipeline / deploy (push) Has been cancelled

This commit is contained in:
ogt
2026-07-02 14:50:42 +08:00
parent 32e3c4baa5
commit 3e83973db3
4 changed files with 100 additions and 1 deletions

View File

@@ -94,6 +94,7 @@
- 2026-07-02 起 PChome safe mapping lane expansion 必須先從 direct mapping candidate decision lane 開始;`/api/ai/pchome-growth/mapping-backlog/direct-mapping-candidate-decision-lane-closeout-package` 會把 candidate decision package 收斂成 lane receipt、receipt replay、drift verifier 與 product readiness,輸出 `primary_human_gate_count=0`、`drift_count`、`next_machine_action` 與 hash evidence。此 endpoint 預設不執行搜尋、不開 DB、不寫 DB、不持久化候選,只在 `execute_search=1` 時走 controlled read-only candidate search。
- 2026-07-02 起 AI automation scheduled health summary 必須提供 machine-readable endpoint:`/api/ai-automation/scheduled-health-summary` 會只讀 smoke history,並可選擇 `include_current_smoke=1` 執行不寫 history 的 current smoke,收斂 AI smoke、PChome drift monitor、history freshness、daily summary delivery readiness 四個 family,輸出 `primary_human_gate_count=0`、`writes_database_count=0`、`next_machine_actions` 與 scheduled output endpoints。此 endpoint 不寄 Telegram、不寫 DB、不改排程,只提供排程/監控可消費的健康摘要。
- 2026-07-02 起 PChome controlled apply rollback evidence 必須提供聚合 endpoint:`/api/ai/pchome-growth/mapping-backlog/direct-mapping-retry-candidate-exception-controlled-apply-rollback-evidence-package` 會聚合 receipt replay、drift verifier、drift recovery、compact readback、artifact retention 五類 evidence,輸出 rollback required / ready actions / protected chain / next machine action。此 endpoint 不執行 rollback、不執行 re-apply、不執行 SQL、不寫 DB;0 drift 時必須輸出 no-op evidence,drift detected 時才輸出 check-mode reapply action。
- 2026-07-02 起 `/metrics` 必須匯出 AI automation scheduled health summary gauges:`momo_ai_automation_scheduled_health_summary_total`、`momo_ai_automation_scheduled_health_family_status`、`momo_ai_automation_scheduled_health_primary_human_gate_count`、`momo_ai_automation_scheduled_health_writes_database_count`。Prometheus scrape 不得寄 Telegram、不寫 DB、不執行 current smoke,只讀 scheduled health summary history。
- V10.644 起 `/ai_intelligence` 的商品明細列不得只用句子描述比價;每列必須顯示 PChome 價格、MOMO 參考價、差距、可信度四格價格證據,並保留下一步按鈕。單位價候選需顯示單位價與單位,候選待確認或缺資料則以「待補 / 候選待確認」呈現,不得捏造價格。
- V10.645 起 `/ai_intelligence` 的商品明細分流切換後,必須顯示「這類商品怎麼處理」的行動摘要,包含件數、近 7 天業績、平均可信度、最大價差、代表商品與主按鈕;使用者不得只能看到商品列表而不知道下一步。
- V10.646 起 `/ai_intelligence` 的商品明細必須提供搜尋與排序;搜尋至少涵蓋商品、分類、商品編號與 MOMO 候選資訊,排序至少支援優先級、近 7 天業績、價差、下滑幅度與可信度。搜尋/排序後的行動摘要與明細列表必須使用同一批結果。

View File

@@ -101,7 +101,7 @@
## P1 - Product Visibility And Professional Website Experience
狀態: 進行中
狀態: 已完成
目的: 讓 AI 自動化在產品裡可見,成為專業營運工作流,而不是只藏在後端。
@@ -223,6 +223,7 @@
| P3.1 | Extend receipt / replay / drift pattern to more lanes | 已完成 | direct mapping candidate decision lane closeout route + focused tests | P3.2 scheduled automation health summaries |
| P3.2 | Scheduled automation health summaries | 已完成 | `/api/ai-automation/scheduled-health-summary` + smoke service focused tests | P3.3 rollback evidence packages |
| P3.3 | Rollback evidence packages | 已完成 | controlled apply rollback evidence route + focused tests | P3.4 observability metrics integration |
| P3.4 | Observability metrics integration | 已完成 | `/metrics` exports scheduled health summary gauges + focused tests | P4 source / deployment governance ongoing |
## 後續回報格式

View File

@@ -342,6 +342,20 @@ def prometheus_metrics():
except Exception as e:
sys_log.warning(f"[Metrics] 無法取得 AI 自動化指標: {e}")
try:
from services.ai_automation_smoke_service import build_scheduled_automation_health_summary
_register_ai_automation_health_summary_metrics(
registry,
Gauge,
build_scheduled_automation_health_summary(
history_limit=50,
include_current_smoke=False,
),
)
except Exception as e:
sys_log.warning(f"[Metrics] 無法取得 AI 自動化健康摘要指標: {e}")
return Response(generate_latest(registry), mimetype=CONTENT_TYPE_LATEST)
except ImportError:
@@ -472,6 +486,50 @@ def _register_ai_automation_metrics(registry, gauge_cls, metrics_snapshot):
gauge.labels(**{name: label_values.get(name, "unknown") for name in label_names}).set(values.get(suffix, 0))
def _register_ai_automation_health_summary_metrics(registry, gauge_cls, health_summary):
    """Export the scheduled AI automation health summary as Prometheus gauges.

    Four gauges are registered on ``registry``:

    * ``momo_ai_automation_scheduled_health_summary_total`` -- one sample per
      ``status`` label (``ok`` / ``warning`` / ``critical`` / ``total``).
    * ``momo_ai_automation_scheduled_health_family_status`` -- value 1 for
      every ``(family, status)`` pair listed in the payload.
    * ``momo_ai_automation_scheduled_health_primary_human_gate_count`` and
      ``momo_ai_automation_scheduled_health_writes_database_count`` --
      unlabelled counts taken from the summary.

    Args:
        registry: Collector registry handed to every gauge via ``registry=``.
        gauge_cls: Gauge factory, called as
            ``gauge_cls(name, documentation, [labelnames], registry=...)``.
        health_summary: Mapping with an optional ``"summary"`` mapping of
            counts and an optional ``"families"`` list of mappings carrying
            ``"key"`` and ``"status"``. ``None`` or a non-dict payload is
            treated as empty.

    Missing or falsy counts are exported as 0. Family entries that are not
    dicts are skipped, so one malformed entry cannot prevent the human-gate
    and database-write gauges (created after the family loop) from being
    registered.
    """
    payload = health_summary if isinstance(health_summary, dict) else {}

    summary = payload.get("summary")
    if not isinstance(summary, dict):
        summary = {}

    families = payload.get("families")
    if not isinstance(families, (list, tuple)):
        families = ()

    def _count(key):
        # Missing, None and other falsy values all collapse to 0.
        return int(summary.get(key) or 0)

    status_gauge = gauge_cls(
        "momo_ai_automation_scheduled_health_summary_total",
        "AI automation scheduled health family counts",
        ["status"],
        registry=registry,
    )
    for status in ("ok", "warning", "critical", "total"):
        status_gauge.labels(status=status).set(_count(status))

    family_status = gauge_cls(
        "momo_ai_automation_scheduled_health_family_status",
        "AI automation scheduled health family status. Value is 1 for the current status label.",
        ["family", "status"],
        registry=registry,
    )
    for family in families:
        if not isinstance(family, dict):
            # Skip malformed entries instead of aborting the whole export.
            continue
        # Label values are truncated so an unexpected payload cannot emit
        # arbitrarily long label strings.
        family_key = str(family.get("key") or "unknown")[:80]
        status = str(family.get("status") or "unknown")[:40]
        family_status.labels(family=family_key, status=status).set(1)

    human_gate_gauge = gauge_cls(
        "momo_ai_automation_scheduled_health_primary_human_gate_count",
        "AI automation scheduled health primary human gate count",
        registry=registry,
    )
    human_gate_gauge.set(_count("primary_human_gate_count"))

    write_gauge = gauge_cls(
        "momo_ai_automation_scheduled_health_writes_database_count",
        "AI automation scheduled health database write count",
        registry=registry,
    )
    write_gauge.set(_count("writes_database_count"))
@system_public_bp.route('/settings')
def settings():
"""分類設定頁面"""

View File

@@ -97,6 +97,45 @@ def test_system_metrics_exports_ai_automation_zero_baseline():
)
def test_system_metrics_exports_scheduled_health_summary():
    """A scheduled health summary payload is rendered as the expected gauge samples."""
    from prometheus_client import CollectorRegistry, Gauge, generate_latest

    from routes.system_public_routes import _register_ai_automation_health_summary_metrics

    # Representative payload: four families counted, two of them listed.
    health_summary = {
        "summary": {
            "ok": 3,
            "warning": 1,
            "critical": 0,
            "total": 4,
            "primary_human_gate_count": 0,
            "writes_database_count": 0,
        },
        "families": [
            {"key": "ai_automation_smoke", "status": "ok"},
            {"key": "pchome_controlled_apply_drift_monitor", "status": "warning"},
        ],
    }

    scrape_registry = CollectorRegistry()
    _register_ai_automation_health_summary_metrics(scrape_registry, Gauge, health_summary)
    exposition = generate_latest(scrape_registry).decode("utf-8")

    # Each entry is one exact sample line from the text exposition output.
    expected_samples = (
        'momo_ai_automation_scheduled_health_summary_total{status="ok"} 3.0',
        'momo_ai_automation_scheduled_health_summary_total{status="warning"} 1.0',
        'momo_ai_automation_scheduled_health_family_status{family="ai_automation_smoke",status="ok"} 1.0',
        'momo_ai_automation_scheduled_health_family_status{family="pchome_controlled_apply_drift_monitor",status="warning"} 1.0',
        "momo_ai_automation_scheduled_health_primary_human_gate_count 0.0",
        "momo_ai_automation_scheduled_health_writes_database_count 0.0",
    )
    for sample in expected_samples:
        assert sample in exposition
def test_system_metrics_counts_sales_records_with_raw_count_query():
from prometheus_client import CollectorRegistry, Gauge, generate_latest
from routes.system_public_routes import _set_database_record_counts