#!/usr/bin/env python3
"""
generate_monitoring.py — monitoring coverage auto-discovery
Phase O-5 Wave C.1 (2026-04-02 ogt)

What it does:
  1. Queries the Prometheus targets API for the scrape state of every target.
  2. Provides a helper that lists K8s Services through kubectl
     (``get_k8s_services``; ``main`` does not call it yet, so the report is
     currently built from Prometheus data only).
  3. Emits a coverage report (JSON or human-readable).

Usage:
    python3 scripts/generate_monitoring.py
    python3 scripts/generate_monitoring.py --json
    python3 scripts/generate_monitoring.py --check   # CI mode: exit 1 if coverage
                                                     # is below the threshold or an
                                                     # unexpected job is down
"""

import argparse
import json
import subprocess
import sys
from datetime import datetime

# ============================================================
# Configuration
# ============================================================

PROMETHEUS_URL = "http://192.168.0.110:9090"
COVERAGE_THRESHOLD = 70  # CI mode: exit 1 when coverage drops below this value

# Jobs we expect Prometheus to scrape (job name -> description)
EXPECTED_JOBS = {
    "awoooi-api": "AWOOOI API (K8s)",
    "clawbot": "OpenClaw 188:8088",
    "node-exporter-110": "Node Exporter 110",
    "node-exporter-112": "Node Exporter 112 (Kali)",
    "node-exporter-188": "Node Exporter 188",
    "cadvisor-110": "cAdvisor 110",
    "prometheus": "Prometheus self-scrape",
    "blackbox-http": "Blackbox HTTP probe",
    "blackbox-tcp": "Blackbox TCP probe",
    "github-actions": "GitHub Actions exporter",
}

# Jobs that are allowed to be down (known issues; they are excluded from the
# "real down" list and therefore do not fail --check)
KNOWN_DOWN_TARGETS = {
    "federation-k8s": "K8s federation — SigNoz 內部 Prometheus,非外部暴露",
    "kube-state-metrics": "kube-state-metrics NodePort 30180 — 僅 OTEL Collector 內部存取",
    "node-exporter-120": "node-exporter 120 — K8s master 節點防火牆規則",
    "node-exporter-121": "node-exporter 121 — K8s worker 節點防火牆規則",
}

# Health buckets every job entry carries.
_HEALTH_STATES = ("up", "down", "unknown")


def get_prometheus_targets() -> dict:
    """Fetch the ``data`` payload of the Prometheus ``/api/v1/targets`` endpoint.

    Returns:
        The ``data`` object of the JSON response.

    Exits the process with status 1 (after printing to stderr) when the request
    fails, the HTTP status is an error, the body is not valid JSON, or the
    response has no ``data`` key.
    """
    # Imported here rather than at module level so the pure analysis helpers
    # stay importable on machines without the third-party HTTP client.
    import requests

    try:
        resp = requests.get(f"{PROMETHEUS_URL}/api/v1/targets", timeout=10)
        resp.raise_for_status()
        return resp.json()["data"]
    except (requests.RequestException, ValueError, KeyError) as e:
        # ValueError: body was not JSON; KeyError: JSON without a "data" key.
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)


def get_k8s_services() -> list[dict]:
    """List K8s services in all namespaces via ``kubectl`` (must be on PATH).

    Returns:
        The ``items`` list of ``kubectl get services -o json``, or an empty
        list when kubectl is missing, times out, exits non-zero, or prints
        output that is not valid JSON.

    NOTE(review): nothing in this script calls this helper yet.
    """
    try:
        result = subprocess.run(
            ["kubectl", "get", "services", "--all-namespaces", "-o", "json"],
            capture_output=True,
            text=True,
            timeout=15,
        )
        if result.returncode != 0:
            return []
        data = json.loads(result.stdout)
        return data.get("items", [])
    except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError):
        return []


def analyze_targets(targets_data: dict) -> dict:
    """Group active targets by job and health.

    Args:
        targets_data: The ``data`` payload of the targets API; only the
            ``activeTargets`` list is read.

    Returns:
        ``{job: {"up": [...], "down": [...], "unknown": [...]}}`` where each
        list holds instance labels. Targets without a ``job`` label are grouped
        under ``"unknown"``; a missing ``instance`` label is recorded as ``"?"``.
    """
    jobs: dict[str, dict] = {}
    for target in targets_data.get("activeTargets") or []:
        labels = target.get("labels") or {}
        job = labels.get("job", "unknown")
        instance = labels.get("instance", "?")
        health = target.get("health", "unknown")

        bucket = jobs.setdefault(job, {state: [] for state in _HEALTH_STATES})
        # A missing or unrecognised health value is filed under "unknown"
        # instead of raising KeyError.
        bucket[health if health in bucket else "unknown"].append(instance)

    return jobs


def build_report(jobs: dict) -> dict:
    """Build the coverage report from the per-job health mapping.

    Args:
        jobs: Output of ``analyze_targets``.

    Returns:
        A JSON-serialisable dict with a ``summary`` section, the raw ``jobs``
        mapping, the expected/known-down tables, the list of unexpected down
        jobs (``real_down_jobs``) and the expected jobs Prometheus does not
        know about (``missing_expected``).
    """
    total_jobs = len(jobs)
    up_jobs = sum(1 for j in jobs.values() if j["up"] and not j["down"])
    partial_jobs = sum(1 for j in jobs.values() if j["up"] and j["down"])
    down_jobs = sum(1 for j in jobs.values() if not j["up"] and j["down"])

    # Only jobs with no healthy target that are NOT on the known-down list
    # count as real problems.
    real_down_jobs = {
        job: data
        for job, data in jobs.items()
        if not data["up"] and job not in KNOWN_DOWN_TARGETS
    }

    # An expected job counts as covered as soon as one of its targets is up.
    expected_covered = sum(1 for j in EXPECTED_JOBS if j in jobs and jobs[j]["up"])
    coverage_pct = (
        round(expected_covered / len(EXPECTED_JOBS) * 100, 1) if EXPECTED_JOBS else 0.0
    )

    return {
        # Local wall-clock time of report generation.
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "prometheus_url": PROMETHEUS_URL,
        "summary": {
            "total_jobs": total_jobs,
            "up_jobs": up_jobs,
            "partial_jobs": partial_jobs,
            "down_jobs": down_jobs,
            "real_down_jobs": len(real_down_jobs),
            "expected_coverage_pct": coverage_pct,
        },
        "jobs": jobs,
        "expected_jobs": EXPECTED_JOBS,
        "known_down": KNOWN_DOWN_TARGETS,
        "real_down_jobs": list(real_down_jobs.keys()),
        "missing_expected": [j for j in EXPECTED_JOBS if j not in jobs],
    }


def print_human_report(report: dict) -> None:
    """Print the report produced by ``build_report`` in human-readable form.

    Args:
        report: Output of ``build_report``.
    """
    s = report["summary"]
    jobs = report["jobs"]

    print(f"\n{'='*60}")
    print(" AWOOOI 監控覆蓋率報告")
    print(f" 生成時間: {report['generated_at']}")
    print(f"{'='*60}")
    print("\n📊 總覽")
    print(f" Jobs 總數: {s['total_jobs']}")
    print(f" 全部 UP: {s['up_jobs']}")
    print(f" 部分 UP: {s['partial_jobs']}")
    print(f" 全部 DOWN: {s['down_jobs']}")
    print(f" 真實問題 (非已知): {s['real_down_jobs']}")
    print(f" 預期覆蓋率: {s['expected_coverage_pct']}% ({COVERAGE_THRESHOLD}% 門檻)")

    print("\n✅ 預期服務狀態")
    for job, desc in report["expected_jobs"].items():
        data = jobs.get(job)
        if data is None:
            status = "❌ 缺失"
        elif data["up"] and not data["down"]:
            status = "✅ UP"
        elif data["up"]:
            status = f"⚠️ 部分 UP ({len(data['up'])} up, {len(data['down'])} down)"
        else:
            status = "❌ DOWN"
        print(f" {status:<30} {job:<25} {desc}")

    # List a known-down job only when Prometheus reports it and it is not
    # fully up; a recovered job must not be shown under the "known DOWN" header.
    known_down_now = [
        (job, reason)
        for job, reason in report["known_down"].items()
        if job in jobs and (jobs[job]["down"] or not jobs[job]["up"])
    ]
    if known_down_now:
        print("\n⚠️ 已知 DOWN (不影響覆蓋率)")
        for job, reason in known_down_now:
            print(f" {job:<30} {reason}")

    if report["real_down_jobs"]:
        print("\n🔴 需處理的 DOWN targets")
        for job in report["real_down_jobs"]:
            data = jobs[job]
            # A job lands here when it has no "up" target, so its offending
            # instances may be "down" or still "unknown"; show both.
            instances = data.get("down", []) + data.get("unknown", [])
            print(f" {job}: {', '.join(instances)}")

    if report["missing_expected"]:
        print("\n🔴 缺少預期服務監控")
        for job in report["missing_expected"]:
            print(f" {job}: {report['expected_jobs'][job]}")

    pct = s["expected_coverage_pct"]
    threshold = COVERAGE_THRESHOLD
    if pct >= threshold and not report["real_down_jobs"]:
        print(f"\n✅ 監控健康: 覆蓋率 {pct}% >= {threshold}%,無真實問題\n")
    elif pct >= threshold:
        print(f"\n⚠️ 覆蓋率達標 ({pct}%),但有 {s['real_down_jobs']} 個真實 DOWN 需處理\n")
    else:
        print(f"\n❌ 覆蓋率不足: {pct}% < {threshold}%\n")


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="AWOOOI 監控覆蓋率自動發現")
    parser.add_argument("--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument(
        "--check",
        action="store_true",
        # argparse %-formats help text, so a literal percent sign must be
        # written as "%%"; a bare "%" makes --help raise ValueError.
        help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}%% 或有非已知 DOWN job 則 exit 1",
    )
    return parser


def main() -> None:
    """Command-line entry point: fetch targets, build and print the report.

    With ``--check`` the process exits with status 1 when the expected-job
    coverage is below ``COVERAGE_THRESHOLD`` or any job outside
    ``KNOWN_DOWN_TARGETS`` has no healthy target.
    """
    args = _build_parser().parse_args()

    targets_data = get_prometheus_targets()
    jobs = analyze_targets(targets_data)
    report = build_report(jobs)

    if args.json:
        print(json.dumps(report, ensure_ascii=False, indent=2))
    else:
        print_human_report(report)

    if args.check:
        pct = report["summary"]["expected_coverage_pct"]
        real_down = report["summary"]["real_down_jobs"]
        if pct < COVERAGE_THRESHOLD or real_down > 0:
            sys.exit(1)


if __name__ == "__main__":
    main()