fix(monitoring): stabilize post-deploy target coverage
This commit is contained in:
@@ -16,11 +16,14 @@ Phase O-5 Wave C.1 (2026-04-02 ogt)
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from typing import Callable
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import urlopen
|
||||
|
||||
# ============================================================
|
||||
# Configuration
|
||||
@@ -28,6 +31,8 @@ import requests
|
||||
|
||||
# Base URL of the Prometheus server whose /api/v1/targets endpoint is queried.
PROMETHEUS_URL = "http://192.168.0.110:9090"
|
||||
COVERAGE_THRESHOLD = 70  # CI mode: exit 1 when coverage falls below this percentage
|
||||
# Default for --stabilization-attempts: how many times target state is queried.
DEFAULT_STABILIZATION_ATTEMPTS = 3
|
||||
# Default for --stabilization-sleep-seconds: pause between two queries.
DEFAULT_STABILIZATION_SLEEP_SECONDS = 10.0
|
||||
|
||||
# Known service list (job name → description)
|
||||
EXPECTED_JOBS = {
|
||||
@@ -52,13 +57,27 @@ KNOWN_DOWN_TARGETS = {
|
||||
}
|
||||
|
||||
|
||||
def _int_env(name: str, default: int) -> int:
    """Read a positive integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or does not parse
            as an integer.

    Returns:
        The parsed value, or ``default`` as a fallback; never less than 1.
    """
    raw = os.environ.get(name)
    if raw is None:
        return max(1, default)
    try:
        return max(1, int(raw))
    except ValueError:
        # Unparseable override: fall back, but keep the ">= 1" guarantee
        # on this path too instead of returning the default unclamped.
        return max(1, default)
|
||||
|
||||
|
||||
def _float_env(name: str, default: float) -> float:
    """Read a non-negative, finite float setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, does not parse as
            a float, or parses to a non-finite value (``nan`` / ``inf``).

    Returns:
        The parsed value clamped to be at least 0.0, or the fallback.
    """
    import math  # local import: the module does not import math at file level

    fallback = max(0.0, float(default))
    raw = os.environ.get(name)
    if raw is None:
        return fallback
    try:
        value = float(raw)
    except ValueError:
        return fallback
    # float() accepts "inf" and "nan"; an infinite value used as a sleep
    # interval makes time.sleep() raise, and nan silently collapses to 0.0
    # under max(). Treat both as invalid configuration.
    if not math.isfinite(value):
        return fallback
    return max(0.0, value)
|
||||
|
||||
|
||||
def get_prometheus_targets() -> dict:
    """Query the Prometheus targets API and return its ``data`` payload.

    Returns:
        The value stored under the ``"data"`` key of the JSON response.

    Exits:
        Prints an error to stderr and calls ``sys.exit(1)`` when the request
        fails or the response cannot be decoded into the expected shape.
    """
    from http.client import HTTPException  # local: not imported at file level

    url = f"{PROMETHEUS_URL}/api/v1/targets"
    try:
        with urlopen(url, timeout=10) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
        return payload["data"]
    # OSError covers HTTPError, URLError, TimeoutError and connection resets
    # raised while reading the body; ValueError covers json.JSONDecodeError
    # and UnicodeDecodeError; TypeError covers a JSON payload that is not an
    # object; HTTPException covers truncated or malformed HTTP responses.
    except (OSError, HTTPException, ValueError, KeyError, TypeError) as e:
        print(f"❌ 無法連接 Prometheus ({PROMETHEUS_URL}): {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
@@ -133,6 +152,76 @@ def build_report(jobs: dict) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def build_report_from_targets(targets_data: dict) -> dict:
    """Build the coverage report from a Prometheus targets API payload."""
    analyzed_jobs = analyze_targets(targets_data)
    return build_report(analyzed_jobs)
|
||||
|
||||
|
||||
def report_needs_stabilization(report: dict) -> bool:
    """Tell whether the report should be re-queried.

    A report with any real-down job or any missing expected job is treated
    as possibly reflecting a transient post-deploy scrape state, so it is
    re-checked to avoid a false red.
    """
    if report["real_down_jobs"]:
        return True
    return bool(report["missing_expected"])
|
||||
|
||||
|
||||
def stabilization_reason(report: dict) -> str:
    """Summarize why the report is not yet stable.

    Returns ``"stable"`` when there are neither real-down jobs nor missing
    expected jobs; otherwise a ``"; "``-separated list of ``label=job,job``
    fragments, real-down jobs first.
    """
    labelled_keys = (
        ("real_down", "real_down_jobs"),
        ("missing_expected", "missing_expected"),
    )
    fragments = [
        f"{label}={','.join(report[key])}"
        for label, key in labelled_keys
        if report[key]
    ]
    return "; ".join(fragments) or "stable"
|
||||
|
||||
|
||||
def build_stabilized_report(
    fetch_targets: Callable[[], dict],
    attempts: int,
    sleep_seconds: float,
    emit_status: bool = True,
) -> dict:
    """Re-query Prometheus targets until the report is stable or attempts run out.

    This keeps the CI gate from failing on a momentary rollout or
    scrape-freshness state right after a deploy.

    Args:
        fetch_targets: Callable returning a fresh targets API payload.
        attempts: Maximum number of queries; values below 1 are treated as 1.
        sleep_seconds: Pause between queries; negative values are treated as 0.
        emit_status: When true, progress messages are written to stderr.

    Returns:
        The report from the last query, with a ``"stabilization"`` entry
        recording the attempt number, the configured limits, a status
        (``stable``, ``retrying``, ``failed`` or ``cleared``) and the reason.
    """
    attempts = max(1, attempts)
    sleep_seconds = max(0.0, sleep_seconds)

    attempt = 0
    # attempts >= 1, so the loop always reaches a return on or before the
    # final attempt; no post-loop fallback is needed.
    while True:
        attempt += 1
        is_last = attempt == attempts

        report = build_report_from_targets(fetch_targets())
        needs_retry = report_needs_stabilization(report)
        reason = stabilization_reason(report)

        if needs_retry:
            status = "failed" if is_last else "retrying"
        elif attempt > 1:
            # A previous attempt was unstable and this one is clean.
            status = "cleared"
        else:
            status = "stable"

        report["stabilization"] = {
            "attempt": attempt,
            "attempts": attempts,
            "sleep_seconds": sleep_seconds,
            "status": status,
            "reason": reason,
        }

        if needs_retry and not is_last:
            if emit_status:
                print(
                    f"⏳ Prometheus target stabilization {attempt}/{attempts}: {reason}",
                    file=sys.stderr,
                )
            time.sleep(sleep_seconds)
            continue

        if emit_status and status == "cleared":
            print(
                "✅ Prometheus target stabilization cleared transient coverage drift",
                file=sys.stderr,
            )
        return report
|
||||
|
||||
|
||||
def print_human_report(report: dict) -> None:
|
||||
"""輸出人可讀格式報告"""
|
||||
s = report["summary"]
|
||||
@@ -161,11 +250,15 @@ def print_human_report(report: dict) -> None:
|
||||
status = "❌ DOWN"
|
||||
print(f" {status:<30} {job:<25} {desc}")
|
||||
|
||||
if report["known_down"]:
|
||||
known_down_present = [
|
||||
(job, reason)
|
||||
for job, reason in report["known_down"].items()
|
||||
if job in report["jobs"] and report["jobs"][job]["down"]
|
||||
]
|
||||
if known_down_present:
|
||||
print(f"\n⚠️ 已知 DOWN (不影響覆蓋率)")
|
||||
for job, reason in report["known_down"].items():
|
||||
if job in report["jobs"]:
|
||||
print(f" {job:<30} {reason}")
|
||||
for job, reason in known_down_present:
|
||||
print(f" {job:<30} {reason}")
|
||||
|
||||
if report["real_down_jobs"]:
|
||||
print(f"\n🔴 需處理的 DOWN targets")
|
||||
@@ -178,6 +271,15 @@ def print_human_report(report: dict) -> None:
|
||||
for job in report["missing_expected"]:
|
||||
print(f" {job}: {report['expected_jobs'][job]}")
|
||||
|
||||
stabilization = report.get("stabilization")
|
||||
if stabilization and stabilization["attempt"] > 1:
|
||||
print(f"\n⏱️ Prometheus target 穩定化")
|
||||
print(
|
||||
" "
|
||||
f"{stabilization['status']} after "
|
||||
f"{stabilization['attempt']}/{stabilization['attempts']} attempts"
|
||||
)
|
||||
|
||||
pct = s["expected_coverage_pct"]
|
||||
threshold = COVERAGE_THRESHOLD
|
||||
if pct >= threshold and not report["real_down_jobs"]:
|
||||
@@ -196,11 +298,34 @@ def main() -> None:
|
||||
action="store_true",
|
||||
help=f"CI 模式: 覆蓋率 < {COVERAGE_THRESHOLD}% 則 exit 1",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--stabilization-attempts",
|
||||
type=int,
|
||||
default=_int_env(
|
||||
"AWOOOI_MONITORING_TARGET_STABILIZATION_ATTEMPTS",
|
||||
DEFAULT_STABILIZATION_ATTEMPTS,
|
||||
),
|
||||
help="CI 模式: Prometheus target 狀態重查次數",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--stabilization-sleep-seconds",
|
||||
type=float,
|
||||
default=_float_env(
|
||||
"AWOOOI_MONITORING_TARGET_STABILIZATION_SLEEP_SECONDS",
|
||||
DEFAULT_STABILIZATION_SLEEP_SECONDS,
|
||||
),
|
||||
help="CI 模式: Prometheus target 重查間隔秒數",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
targets_data = get_prometheus_targets()
|
||||
jobs = analyze_targets(targets_data)
|
||||
report = build_report(jobs)
|
||||
if args.check:
|
||||
report = build_stabilized_report(
|
||||
get_prometheus_targets,
|
||||
attempts=args.stabilization_attempts,
|
||||
sleep_seconds=args.stabilization_sleep_seconds,
|
||||
)
|
||||
else:
|
||||
report = build_report_from_targets(get_prometheus_targets())
|
||||
|
||||
if args.json:
|
||||
print(json.dumps(report, ensure_ascii=False, indent=2))
|
||||
|
||||
Reference in New Issue
Block a user