feat(monitoring): Phase O-5 Wave C.1 generate_monitoring.py 自動發現
Some checks failed
E2E Health Check / e2e-health (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

- 查詢 Prometheus targets API 取得全量 scrape 狀態
- 10 個預期服務覆蓋率計算 (門檻 70%)
- 已知 DOWN targets 豁免清單 (不影響健康判斷)
- --json 機器可讀輸出 / --check CI 模式 (exit 1 if coverage < threshold)
- 首次執行: 100% 覆蓋率,無真實問題

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
OG T
2026-04-02 21:33:28 +08:00
parent 28bd06d7b3
commit 827923b9b9

View File

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""
generate_monitoring.py — monitoring coverage auto-discovery
Phase O-5 Wave C.1 (2026-04-02 ogt)

What it does:
    1. Queries the Prometheus targets API for the scrape state of every target.
    2. Scans K8s Services to find unmonitored services.
       NOTE(review): get_k8s_services() implements the kubectl query, but
       main() does not call it, so this step is not part of the report yet.
    3. Prints a coverage report (JSON or human-readable).

Usage:
    python3 scripts/generate_monitoring.py
    python3 scripts/generate_monitoring.py --json
    python3 scripts/generate_monitoring.py --check   # CI mode: exit 1 if coverage < threshold
                                                     # or a non-exempt job has no UP target
"""
import argparse
import json
import subprocess
import sys
from datetime import datetime
import requests
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------

# Base URL of the Prometheus server whose targets API is queried.
PROMETHEUS_URL: str = "http://192.168.0.110:9090"

# CI mode: a coverage percentage below this value makes the script exit 1.
COVERAGE_THRESHOLD: int = 70

# Jobs that are expected to be scraped (job name -> human description).
# Coverage is the share of these jobs that have at least one UP target.
EXPECTED_JOBS: dict[str, str] = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}

# Jobs that are allowed to be down (job name -> reason). These are known
# issues; build_report() leaves them out of its "real down" list.
KNOWN_DOWN_TARGETS: dict[str, str] = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}
def get_prometheus_targets() -> dict:
    """Fetch the ``data`` object of Prometheus' ``/api/v1/targets`` response.

    Returns:
        The ``data`` member of the decoded JSON response (a dict).

    Exits:
        Prints a diagnostic to stderr and calls ``sys.exit(1)`` when the
        request fails, the server answers with an HTTP error status, the body
        is not valid JSON, or the JSON has no dict-valued ``data`` member.
    """
    try:
        resp = requests.get(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10)
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)
    except ValueError as e:
        # Non-JSON body. Newer `requests` releases raise a RequestException
        # subclass here (handled above); older ones raise a plain ValueError.
        print(f"❌ Prometheus 回應不是有效的 JSON ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)

    # A proxy or an error page can return valid JSON that is not the expected
    # envelope; fail with a message instead of an uncaught KeyError/TypeError.
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, dict):
        print(f"❌ Prometheus 回應缺少 data 欄位 ({PROMETHEUS_URL})", file=sys.stderr)
        sys.exit(1)
    return data
def get_k8s_services() -> list[dict]:
    """List Kubernetes Service objects from all namespaces via ``kubectl``.

    Returns:
        The ``items`` list of ``kubectl get services --all-namespaces -o json``.
        An empty list is returned when kubectl is not installed, exits with a
        non-zero status, does not finish within 15 seconds, or prints output
        that is not valid JSON.
    """
    command = ["kubectl", "get", "services", "--all-namespaces", "-o", "json"]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=15)
        if completed.returncode == 0:
            return json.loads(completed.stdout).get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        # Best effort: any of these failures simply yields "no services".
        pass
    return []
def analyze_targets(targets_data: dict) -> dict:
    """Group active scrape targets by job and health state.

    Args:
        targets_data: The ``data`` object of a Prometheus targets response.
            Only its ``activeTargets`` list is read; a missing or null list
            is treated as empty.

    Returns:
        A dict mapping each job name to
        ``{"up": [...], "down": [...], "unknown": [...]}``, where every list
        holds the ``instance`` labels of the targets in that state. Targets
        without a ``job`` label are grouped under ``"unknown"``; a missing
        ``instance`` label is recorded as ``"?"``.
    """
    jobs: dict[str, dict] = {}
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        states = jobs.setdefault(job, {"up": [], "down": [], "unknown": []})
        # A missing or unrecognised health value must not crash the report;
        # such a target is simply not known to be up or down.
        health = target.get("health")
        if health not in ("up", "down", "unknown"):
            health = "unknown"
        states[health].append(instance)
    return jobs
def build_report(jobs: dict) -> dict:
    """Build the coverage report from the per-job target states.

    Args:
        jobs: Mapping of job name to a dict with "up" and "down" instance
            lists, as produced by ``analyze_targets``.

    Returns:
        A dict with the generation time, the Prometheus URL, a ``summary`` of
        counters, the ``jobs`` mapping itself, the expected and known-down
        tables, the names of jobs that have no UP target and are not in
        ``KNOWN_DOWN_TARGETS``, and the expected jobs absent from ``jobs``.
    """
    fully_up = partly_up = fully_down = 0
    unexplained_down = []
    for name, state in jobs.items():
        has_up = bool(state["up"])
        has_down = bool(state["down"])
        if has_up and has_down:
            partly_up += 1
        elif has_up:
            fully_up += 1
        elif has_down:
            fully_down += 1
        # A job with no UP target is a real problem unless it is exempted.
        if not has_up and name not in KNOWN_DOWN_TARGETS:
            unexplained_down.append(name)

    # An expected job counts as covered when at least one of its targets is UP.
    expected_covered = 0
    absent_expected = []
    for name in EXPECTED_JOBS:
        if name not in jobs:
            absent_expected.append(name)
        elif jobs[name]["up"]:
            expected_covered += 1
    coverage_pct = round(expected_covered / len(EXPECTED_JOBS) * 100, 1)

    summary = {
        "total_jobs": len(jobs),
        "up_jobs": fully_up,
        "partial_jobs": partly_up,
        "down_jobs": fully_down,
        "real_down_jobs": len(unexplained_down),
        "expected_coverage_pct": coverage_pct,
    }
    return {
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": summary,
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": unexplained_down,
        "missing_expected": absent_expected,
    }
def print_human_report(report: dict) -> None:
    """Print the coverage report to stdout in human-readable form.

    Args:
        report: A report dict as returned by ``build_report``. The keys read
            are ``summary``, ``generated_at``, ``jobs``, ``expected_jobs``,
            ``known_down``, ``real_down_jobs`` and ``missing_expected``.

    Sections printed: header, summary counters, status of every expected job,
    exempted jobs that currently have no UP target, jobs that need attention,
    expected jobs with no targets at all, and a one-line verdict against
    ``COVERAGE_THRESHOLD``.
    """
    s = report["summary"]
    jobs = report["jobs"]
    rule = "=" * 60

    print(f"\n{rule}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(rule)

    print("\n📊 總覽")
    print(f" Jobs 總數: {s['total_jobs']}")
    print(f" 全部 UP: {s['up_jobs']}")
    print(f" 部分 UP: {s['partial_jobs']}")
    print(f" 全部 DOWN: {s['down_jobs']}")
    print(f" 真實問題 (非已知): {s['real_down_jobs']}")
    print(f" 預期覆蓋率: {s['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    print("\n✅ 預期服務狀態")
    for job, desc in report["expected_jobs"].items():
        if job not in jobs:
            status = "❌ 缺失"
        elif jobs[job]["up"] and not jobs[job]["down"]:
            status = "✅ UP"
        elif jobs[job]["up"]:
            status = f"⚠️ 部分 UP ({len(jobs[job]['up'])} up, {len(jobs[job]['down'])} down)"
        else:
            status = "❌ DOWN"
        print(f" {status:<30} {job:<25} {desc}")

    # Only list exempted jobs that are actually scraped and have no UP target;
    # an exempted job that has recovered must not be reported as DOWN, and the
    # heading is skipped when there is nothing to show under it.
    exempted_down = [
        (job, reason)
        for job, reason in report["known_down"].items()
        if job in jobs and not jobs[job].get("up")
    ]
    if exempted_down:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in exempted_down:
            print(f" {job:<30} {reason}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job in report["real_down_jobs"]:
            # These jobs have no UP target, so every instance is either down
            # or in the unknown state; show both so the line is never empty.
            state = jobs[job]
            instances = list(state.get("down", [])) + list(state.get("unknown", []))
            print(f" {job}: {', '.join(instances)}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job in report["missing_expected"]:
            print(f" {job}: {report['expected_jobs'][job]}")

    pct = s["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    if pct >= threshold and not report["real_down_jobs"]:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
    elif pct >= threshold:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {s['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")
def main() -> None:
    """Command-line entry point: fetch targets, build the report, print it.

    Flags:
        --json   Print the report as JSON instead of the human-readable form.
        --check  CI mode: after printing, exit with status 1 when the expected
                 coverage is below ``COVERAGE_THRESHOLD`` or when at least one
                 non-exempt job has no UP target.
    """
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse %-formats help strings, so a literal percent sign has to be
        # written as "%%"; a bare "%" makes --help raise ValueError.
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 或有非豁免的 DOWN job 則 exit 1",
    )
    args = parser.parse_args()

    targets_data = get_prometheus_targets()
    jobs = analyze_targets(targets_data)
    report = build_report(jobs)

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        summary = report["summary"]
        below_threshold = summary["expected_coverage_pct"] < COVERAGE_THRESHOLD
        if below_threshold or summary["real_down_jobs"] > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()