diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c09d82c6..6ef329c7 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -12,6 +12,7 @@ - `/home/wooo/monitoring/alerts.yml` - `/home/wooo/monitoring/alerts-unified.canonical.yml` - deploy 後新增 canonical hash 驗證,避免 active rules 與 drift guard canonical 再次分叉。 +- 新增版控版 `scripts/ops/prometheus-rule-drift-guard.sh`,並由 deploy script 一併同步到 110;guard 改為從 canonical 規則檔自動解析 alert/record 名稱,不再硬編過期 required rule 清單。 - 已即時執行 deploy,Prometheus reload 後: - `NoAlertsReceived2Hours` query 已限制 `source="alertmanager"`。 - `SourceProviderIngestionStale` query 已限制 `source=~"sentry|signoz"` 且 24h。 diff --git a/scripts/ops/deploy-alerts.sh b/scripts/ops/deploy-alerts.sh index db226c62..ce8eaf0a 100755 --- a/scripts/ops/deploy-alerts.sh +++ b/scripts/ops/deploy-alerts.sh @@ -8,10 +8,12 @@ set -eo pipefail ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml" SLO_RULES_FILE="ops/monitoring/slo-rules.yml" +DRIFT_GUARD_SCRIPT="scripts/ops/prometheus-rule-drift-guard.sh" TARGET_HOST="192.168.0.110" TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml" TARGET_ALERTS_CANONICAL_PATH="/home/wooo/monitoring/alerts-unified.canonical.yml" TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml" +TARGET_DRIFT_GUARD_PATH="/home/wooo/scripts/prometheus-rule-drift-guard.sh" PROMETHEUS_URL="http://${TARGET_HOST}:9090" DRY_RUN="${1:-}" @@ -35,7 +37,7 @@ remote_file_sha() { } # 確認檔案存在 -for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do +for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE" "$DRIFT_GUARD_SCRIPT"; do if [ ! 
-f "$file" ]; then echo "ERROR: $file not found" exit 1 @@ -60,6 +62,7 @@ if [ "$DRY_RUN" = "--dry-run" ]; then log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}" log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH}" log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}" + log "DRY RUN: would deploy $DRIFT_GUARD_SCRIPT to ${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH}" ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE") SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE") SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE") @@ -71,13 +74,16 @@ fi ssh wooo@${TARGET_HOST} "\ cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \ cp ${TARGET_ALERTS_CANONICAL_PATH} ${TARGET_ALERTS_CANONICAL_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \ - cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true" + cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \ + cp ${TARGET_DRIFT_GUARD_PATH} ${TARGET_DRIFT_GUARD_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true" log "✅ 現有規則已備份" # 部署新規則 scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH} scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH} scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH} +scp "$DRIFT_GUARD_SCRIPT" wooo@${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH} +ssh wooo@${TARGET_HOST} "chmod 0755 ${TARGET_DRIFT_GUARD_PATH}" log "✅ 規則已複製到 ${TARGET_HOST}" LOCAL_ALERTS_SHA="$(file_sha "$ALERT_RULES_FILE")" @@ -85,6 +91,8 @@ REMOTE_ALERTS_SHA="$(remote_file_sha "$TARGET_ALERTS_PATH")" REMOTE_ALERTS_CANONICAL_SHA="$(remote_file_sha "$TARGET_ALERTS_CANONICAL_PATH")" LOCAL_SLO_SHA="$(file_sha "$SLO_RULES_FILE")" REMOTE_SLO_SHA="$(remote_file_sha "$TARGET_SLO_PATH")" +LOCAL_DRIFT_GUARD_SHA="$(file_sha "$DRIFT_GUARD_SCRIPT")" +REMOTE_DRIFT_GUARD_SHA="$(remote_file_sha 
"$TARGET_DRIFT_GUARD_PATH")" if [ "$LOCAL_ALERTS_SHA" != "$REMOTE_ALERTS_SHA" ]; then echo "ERROR: 遠端 alerts.yml hash 不一致 local=${LOCAL_ALERTS_SHA} remote=${REMOTE_ALERTS_SHA}" exit 1 @@ -97,6 +105,10 @@ if [ "$LOCAL_SLO_SHA" != "$REMOTE_SLO_SHA" ]; then echo "ERROR: 遠端 slo-rules.yml hash 不一致 local=${LOCAL_SLO_SHA} remote=${REMOTE_SLO_SHA}" exit 1 fi +if [ "$LOCAL_DRIFT_GUARD_SHA" != "$REMOTE_DRIFT_GUARD_SHA" ]; then + echo "ERROR: 遠端 prometheus-rule-drift-guard.sh hash 不一致 local=${LOCAL_DRIFT_GUARD_SHA} remote=${REMOTE_DRIFT_GUARD_SHA}" + exit 1 +fi log "✅ 遠端規則 hash 驗證通過" # Reload Prometheus diff --git a/scripts/ops/prometheus-rule-drift-guard.sh b/scripts/ops/prometheus-rule-drift-guard.sh new file mode 100755 index 00000000..d83635dd --- /dev/null +++ b/scripts/ops/prometheus-rule-drift-guard.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +# Guard 110 Prometheus alert rules against stale deploys. +# +# The canonical file is the source of truth. The guard restores active +# alerts.yml only when the active file differs from canonical or when +# Prometheus is missing rule names declared by canonical. 
# Deliberately no `-e`: every failure path below must still reach
# write_textfile so the guard's own state stays observable.
set -uo pipefail

# All settings can be overridden from the environment.
HOST_LABEL="${HOST_LABEL:-110}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://127.0.0.1:9090}"
CURRENT_RULES="${CURRENT_RULES:-/home/wooo/monitoring/alerts.yml}"
CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonical.yml}"
TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"

#######################################
# Append a timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
#######################################
log() {
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
}

#######################################
# Succeed only when $1 is a non-negative decimal integer.
# Arguments: $1 - candidate string
#######################################
_is_uint() {
  [[ "${1:-}" =~ ^[0-9]+$ ]]
}

#######################################
# Publish the guard state as a metrics textfile, replacing TEXTFILE via a
# temp file + rename so a reader never sees a half-written file.
# Best effort: a failure to write never fails the caller.
# NOTE(review): the metric names below were reconstructed (the original
# heredoc body was lost); confirm them against the alert rules that are
# expected to consume this file before deploying.
# Globals:   TEXTFILE, HOST_LABEL (read)
# Arguments: $1 - status label (e.g. ok, drifted, query_failed)
#            $2 - 1 if a restore was performed in this run, else 0
#            $3 - number of canonical rules missing from Prometheus
#            $4 - 1 if the active file is byte-identical to canonical, else 0
#######################################
write_textfile() {
  local status="$1"
  local repaired="$2"
  local missing_count="$3"
  local matches_canonical="$4"
  local tmp
  mkdir -p "$(dirname "$TEXTFILE")" 2>/dev/null || true
  tmp="$(mktemp "${TEXTFILE}.tmp.XXXXXX")" || return 0
  cat >"$tmp" <<EOF
# HELP prometheus_rule_drift_guard_status Outcome of the last drift guard run (always 1, outcome is in the status label).
# TYPE prometheus_rule_drift_guard_status gauge
prometheus_rule_drift_guard_status{host="${HOST_LABEL}",status="${status}"} 1
# HELP prometheus_rule_drift_guard_repaired Whether the last run restored the active rules from canonical.
# TYPE prometheus_rule_drift_guard_repaired gauge
prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
# HELP prometheus_rule_drift_guard_missing_rules Canonical rule names not loaded in Prometheus.
# TYPE prometheus_rule_drift_guard_missing_rules gauge
prometheus_rule_drift_guard_missing_rules{host="${HOST_LABEL}"} ${missing_count}
# HELP prometheus_rule_drift_guard_matches_canonical Whether the active rules file equals the canonical file.
# TYPE prometheus_rule_drift_guard_matches_canonical gauge
prometheus_rule_drift_guard_matches_canonical{host="${HOST_LABEL}"} ${matches_canonical}
# HELP prometheus_rule_drift_guard_last_run_timestamp_seconds Unix time of the last drift guard run.
# TYPE prometheus_rule_drift_guard_last_run_timestamp_seconds gauge
prometheus_rule_drift_guard_last_run_timestamp_seconds{host="${HOST_LABEL}"} $(date +%s)
EOF
  # mktemp creates the file 0600; widen it so a collector running as a
  # different user can read it.
  chmod 0644 "$tmp" 2>/dev/null || true
  mv "$tmp" "$TEXTFILE" 2>/dev/null || rm -f "$tmp"
}

#######################################
# Count rule names declared in CANONICAL_RULES that Prometheus has not loaded.
# Globals:   PROMETHEUS_URL, CANONICAL_RULES (read)
# Outputs:   on stdout, exactly one of:
#              <non-negative integer>          number of missing rules
#              CANONICAL_PARSE_FAILED:<reason> canonical unreadable or empty
#              QUERY_FAILED:<reason>           rules API (or python3) unavailable
#######################################
rules_missing_count() {
  if ! command -v python3 >/dev/null 2>&1; then
    echo "QUERY_FAILED:python3 not found"
    return 0
  fi
  python3 - "$PROMETHEUS_URL" "$CANONICAL_RULES" <<'PY'
import json
import re
import sys
import urllib.request

base_url = sys.argv[1].rstrip("/")
canonical_path = sys.argv[2]

# Matches list items of the form "- alert: Name" / "- record: name",
# optionally quoted and optionally followed by a trailing comment.
name_pattern = re.compile(r"^\s*-\s*(?:alert|record):\s*['\"]?([^'\"#]+?)['\"]?\s*(?:#.*)?$")
# No "set[str]" annotation here: module-level annotations are evaluated at
# runtime and that spelling raises TypeError on Python < 3.9.
required = set()
try:
    with open(canonical_path, encoding="utf-8") as handle:
        for line in handle:
            match = name_pattern.match(line)
            if match:
                required.add(match.group(1).strip())
except Exception as exc:
    print(f"CANONICAL_PARSE_FAILED:{exc}")
    raise SystemExit(0)

# A canonical file without any rule is treated as damaged, so the caller
# never restores an empty/truncated file over the active rules.
if not required:
    print("CANONICAL_PARSE_FAILED:no alert/record names found in canonical rules")
    raise SystemExit(0)

try:
    with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise RuntimeError(payload)
    loaded = {
        str(rule.get("name") or rule.get("alert") or rule.get("record"))
        for group in payload.get("data", {}).get("groups") or []
        for rule in group.get("rules") or []
    }
    print(len(required - loaded))
except Exception as exc:
    print(f"QUERY_FAILED:{exc}")
PY
}

#######################################
# Report whether the active rules file is byte-identical to canonical.
# Globals:   CURRENT_RULES, CANONICAL_RULES (read)
# Outputs:   "1" when both files exist and are identical, otherwise "0"
#######################################
matches_canonical() {
  if [ ! -f "$CURRENT_RULES" ] || [ ! -f "$CANONICAL_RULES" ]; then
    echo 0
    return
  fi
  if cmp -s "$CURRENT_RULES" "$CANONICAL_RULES"; then
    echo 1
  else
    echo 0
  fi
}

#######################################
# Overwrite the active rules with canonical and ask Prometheus to reload.
# Globals:   CURRENT_RULES, CANONICAL_RULES, PROMETHEUS_URL (read)
# Returns:   0 only if both the copy and the reload request succeeded
#######################################
restore_rules() {
  local backup_path
  backup_path="${CURRENT_RULES}.guard.bak.$(date +%Y%m%d%H%M%S)"
  # Backup is best effort: the active file may not exist yet.
  cp "$CURRENT_RULES" "$backup_path" 2>/dev/null || true
  # Without `set -e` a failed copy would otherwise be hidden by the exit
  # status of the reload call below and be reported as a successful repair.
  cp "$CANONICAL_RULES" "$CURRENT_RULES" || return 1
  # Bounded so a hung Prometheus cannot hang the guard indefinitely.
  curl -fsS --max-time 30 -X POST "${PROMETHEUS_URL}/-/reload" >/dev/null
}

#######################################
# Detect drift, restore from canonical when needed, re-check, and publish
# the outcome to the log and the metrics textfile.
# Returns:   0 when Prometheus has every canonical rule and the active file
#            equals canonical at the end of the run; 1 otherwise
#######################################
main() {
  if [ ! -f "$CANONICAL_RULES" ]; then
    log "canonical rules missing: ${CANONICAL_RULES}"
    write_textfile "canonical_missing" 0 999 0
    return 1
  fi

  local missing before_matches repaired after_missing after_matches
  missing="$(rules_missing_count)"
  before_matches="$(matches_canonical)"
  repaired=0

  # Anything that is not a plain count (QUERY_FAILED:*, CANONICAL_PARSE_FAILED:*,
  # or empty output from a crashed helper) means the state is unknown; do not
  # touch the active rules in that case.
  if ! _is_uint "$missing"; then
    log "Prometheus/canonical query failed: ${missing}"
    write_textfile "query_failed" 0 999 "$before_matches"
    return 1
  fi

  if [ "$missing" -gt 0 ] || [ "$before_matches" -eq 0 ]; then
    log "rule drift detected: missing=${missing} current_matches_canonical=${before_matches}; restoring"
    if restore_rules; then
      repaired=1
      # Give Prometheus a moment to finish loading before re-querying.
      sleep 3
    else
      log "restore failed"
      write_textfile "restore_failed" 0 "$missing" "$before_matches"
      return 1
    fi
  fi

  after_missing="$(rules_missing_count)"
  after_matches="$(matches_canonical)"
  if ! _is_uint "$after_missing"; then
    log "post-restore Prometheus/canonical query failed: ${after_missing}"
    write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
    return 1
  fi

  if [ "$after_missing" -eq 0 ] && [ "$after_matches" -eq 1 ]; then
    write_textfile "ok" "$repaired" "$after_missing" "$after_matches"
    log "ok repaired=${repaired}"
    return 0
  fi

  log "still drifted after check: missing=${after_missing} current_matches_canonical=${after_matches}"
  write_textfile "drifted" "$repaired" "$after_missing" "$after_matches"
  return 1
}

main "$@"