fix(alerts): deploy drift guard with canonical rules
This commit is contained in:
@@ -12,6 +12,7 @@
|
||||
- `/home/wooo/monitoring/alerts.yml`
|
||||
- `/home/wooo/monitoring/alerts-unified.canonical.yml`
|
||||
- deploy 後新增 canonical hash 驗證,避免 active rules 與 drift guard canonical 再次分叉。
|
||||
- 新增版控版 `scripts/ops/prometheus-rule-drift-guard.sh`,並由 deploy script 一併同步到 110;guard 改為從 canonical 規則檔自動解析 alert/record 名稱,不再硬編碼容易過期的 required rule 清單。
|
||||
- 已即時執行 deploy,Prometheus reload 後:
|
||||
- `NoAlertsReceived2Hours` query 已限制 `source="alertmanager"`。
|
||||
- `SourceProviderIngestionStale` query 已限制 `source=~"sentry|signoz"` 且 24h。
|
||||
|
||||
@@ -8,10 +8,12 @@ set -eo pipefail
|
||||
|
||||
ALERT_RULES_FILE="ops/monitoring/alerts-unified.yml"
|
||||
SLO_RULES_FILE="ops/monitoring/slo-rules.yml"
|
||||
DRIFT_GUARD_SCRIPT="scripts/ops/prometheus-rule-drift-guard.sh"
|
||||
TARGET_HOST="192.168.0.110"
|
||||
TARGET_ALERTS_PATH="/home/wooo/monitoring/alerts.yml"
|
||||
TARGET_ALERTS_CANONICAL_PATH="/home/wooo/monitoring/alerts-unified.canonical.yml"
|
||||
TARGET_SLO_PATH="/home/wooo/monitoring/slo-rules.yml"
|
||||
TARGET_DRIFT_GUARD_PATH="/home/wooo/scripts/prometheus-rule-drift-guard.sh"
|
||||
PROMETHEUS_URL="http://${TARGET_HOST}:9090"
|
||||
DRY_RUN="${1:-}"
|
||||
|
||||
@@ -35,7 +37,7 @@ remote_file_sha() {
|
||||
}
|
||||
|
||||
# 確認檔案存在
|
||||
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE"; do
|
||||
for file in "$ALERT_RULES_FILE" "$SLO_RULES_FILE" "$DRIFT_GUARD_SCRIPT"; do
|
||||
if [ ! -f "$file" ]; then
|
||||
echo "ERROR: $file not found"
|
||||
exit 1
|
||||
@@ -60,6 +62,7 @@ if [ "$DRY_RUN" = "--dry-run" ]; then
|
||||
log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_PATH}"
|
||||
log "DRY RUN: would deploy $ALERT_RULES_FILE to ${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH}"
|
||||
log "DRY RUN: would deploy $SLO_RULES_FILE to ${TARGET_HOST}:${TARGET_SLO_PATH}"
|
||||
log "DRY RUN: would deploy $DRIFT_GUARD_SCRIPT to ${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH}"
|
||||
ALERT_COUNT=$(grep -c "alert:" "$ALERT_RULES_FILE")
|
||||
SLO_RECORD_COUNT=$(grep -c "record:" "$SLO_RULES_FILE")
|
||||
SLO_ALERT_COUNT=$(grep -c "alert:" "$SLO_RULES_FILE")
|
||||
@@ -71,13 +74,16 @@ fi
|
||||
ssh wooo@${TARGET_HOST} "\
|
||||
cp ${TARGET_ALERTS_PATH} ${TARGET_ALERTS_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \
|
||||
cp ${TARGET_ALERTS_CANONICAL_PATH} ${TARGET_ALERTS_CANONICAL_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \
|
||||
cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
|
||||
cp ${TARGET_SLO_PATH} ${TARGET_SLO_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true; \
|
||||
cp ${TARGET_DRIFT_GUARD_PATH} ${TARGET_DRIFT_GUARD_PATH}.bak.\$(date +%Y%m%d%H%M%S) 2>/dev/null || true"
|
||||
log "✅ 現有規則已備份"
|
||||
|
||||
# 部署新規則
|
||||
scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_PATH}
|
||||
scp "$ALERT_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_ALERTS_CANONICAL_PATH}
|
||||
scp "$SLO_RULES_FILE" wooo@${TARGET_HOST}:${TARGET_SLO_PATH}
|
||||
scp "$DRIFT_GUARD_SCRIPT" wooo@${TARGET_HOST}:${TARGET_DRIFT_GUARD_PATH}
|
||||
ssh wooo@${TARGET_HOST} "chmod 0755 ${TARGET_DRIFT_GUARD_PATH}"
|
||||
log "✅ 規則已複製到 ${TARGET_HOST}"
|
||||
|
||||
LOCAL_ALERTS_SHA="$(file_sha "$ALERT_RULES_FILE")"
|
||||
@@ -85,6 +91,8 @@ REMOTE_ALERTS_SHA="$(remote_file_sha "$TARGET_ALERTS_PATH")"
|
||||
REMOTE_ALERTS_CANONICAL_SHA="$(remote_file_sha "$TARGET_ALERTS_CANONICAL_PATH")"
|
||||
LOCAL_SLO_SHA="$(file_sha "$SLO_RULES_FILE")"
|
||||
REMOTE_SLO_SHA="$(remote_file_sha "$TARGET_SLO_PATH")"
|
||||
LOCAL_DRIFT_GUARD_SHA="$(file_sha "$DRIFT_GUARD_SCRIPT")"
|
||||
REMOTE_DRIFT_GUARD_SHA="$(remote_file_sha "$TARGET_DRIFT_GUARD_PATH")"
|
||||
if [ "$LOCAL_ALERTS_SHA" != "$REMOTE_ALERTS_SHA" ]; then
|
||||
echo "ERROR: 遠端 alerts.yml hash 不一致 local=${LOCAL_ALERTS_SHA} remote=${REMOTE_ALERTS_SHA}"
|
||||
exit 1
|
||||
@@ -97,6 +105,10 @@ if [ "$LOCAL_SLO_SHA" != "$REMOTE_SLO_SHA" ]; then
|
||||
echo "ERROR: 遠端 slo-rules.yml hash 不一致 local=${LOCAL_SLO_SHA} remote=${REMOTE_SLO_SHA}"
|
||||
exit 1
|
||||
fi
|
||||
if [ "$LOCAL_DRIFT_GUARD_SHA" != "$REMOTE_DRIFT_GUARD_SHA" ]; then
|
||||
echo "ERROR: 遠端 prometheus-rule-drift-guard.sh hash 不一致 local=${LOCAL_DRIFT_GUARD_SHA} remote=${REMOTE_DRIFT_GUARD_SHA}"
|
||||
exit 1
|
||||
fi
|
||||
log "✅ 遠端規則 hash 驗證通過"
|
||||
|
||||
# Reload Prometheus
|
||||
|
||||
155
scripts/ops/prometheus-rule-drift-guard.sh
Executable file
155
scripts/ops/prometheus-rule-drift-guard.sh
Executable file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env bash
# Guard 110 Prometheus alert rules against stale deploys.
#
# The canonical file is the source of truth. The guard restores active
# alerts.yml only when the active file differs from canonical or when
# Prometheus is missing rule names declared by canonical.
#
# Every setting below keeps its default unless an environment variable of
# the same name is already set and non-empty:
#   HOST_LABEL       value of the host="..." label on the emitted metrics
#   PROMETHEUS_URL   base URL used for /api/v1/rules and /-/reload
#   CURRENT_RULES    active rule file that is compared and overwritten
#   CANONICAL_RULES  reference copy the active file is compared against
#   TEXTFILE         metrics output file (presumably picked up by the
#                    node_exporter textfile collector; confirm on the host)
#   LOG_FILE         file that log lines are appended to

# errexit (-e) is deliberately not part of this set line, so every function
# has to check the commands whose failure matters.
set -uo pipefail

HOST_LABEL="${HOST_LABEL:-110}"
PROMETHEUS_URL="${PROMETHEUS_URL:-http://127.0.0.1:9090}"
CURRENT_RULES="${CURRENT_RULES:-/home/wooo/monitoring/alerts.yml}"
CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonical.yml}"
TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"
|
||||
|
||||
#######################################
# Append one timestamped line to the guard log.
# Globals:   LOG_FILE (read)
# Arguments: $* - message text
# Outputs:   "[YYYY-mm-dd HH:MM:SS] message" appended to $LOG_FILE; if that
#            file cannot be written, the same line goes to stderr instead so
#            the message is not lost.
#######################################
log() {
  local line
  printf -v line '[%s] %s' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"

  # Best effort: the directory may already exist or may not be creatable.
  mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true

  # The braces make 2>/dev/null also cover the shell's own complaint about a
  # failed >> redirection; the fallback below reports the message itself.
  if ! { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null; then
    printf '%s\n' "$line" >&2
  fi
}
|
||||
|
||||
#######################################
# Publish the outcome of one guard run as a Prometheus text-format file.
# Globals:   TEXTFILE (read), HOST_LABEL (read)
# Arguments: $1 - status label attached to the last-run timestamp metric
#            $2 - value of the "repaired" gauge
#            $3 - value of the "missing required count" gauge
#            $4 - value of the "current matches canonical" gauge
# Returns:   0 in practice; publishing is best effort and a failure to
#            create or write the temporary file leaves the previous
#            $TEXTFILE untouched.
#######################################
write_textfile() {
  local status="$1"
  local repaired="$2"
  local missing_count="$3"
  local matches_canonical="$4"
  local tmp

  mkdir -p "$(dirname "$TEXTFILE")" 2>/dev/null || true
  # The temporary file is created next to the target so the final mv is a
  # rename within one directory rather than a copy.
  tmp="$(mktemp "${TEXTFILE}.tmp.XXXXXX")" || return 0

  # Only publish a completely written file: if the write fails part-way,
  # drop the temporary file instead of moving a truncated one into place.
  if ! cat >"$tmp" <<EOF
# HELP awoooi_prometheus_rule_drift_guard_last_run_timestamp Unix timestamp of the last Prometheus rule drift guard run.
# TYPE awoooi_prometheus_rule_drift_guard_last_run_timestamp gauge
awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",status="${status}"} $(date +%s)
# HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run.
# TYPE awoooi_prometheus_rule_drift_guard_repaired gauge
awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of canonical live rules missing after the last check.
# TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge
awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count}
# HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy.
# TYPE awoooi_prometheus_rule_drift_guard_current_matches_canonical gauge
awoooi_prometheus_rule_drift_guard_current_matches_canonical{host="${HOST_LABEL}"} ${matches_canonical}
EOF
  then
    rm -f -- "$tmp"
    return 0
  fi

  # mktemp creates the file with owner-only permissions; widen them so a
  # reader running as another user can open the published file.
  chmod 0644 "$tmp" 2>/dev/null || true
  mv -- "$tmp" "$TEXTFILE" 2>/dev/null || rm -f -- "$tmp"
}
|
||||
|
||||
#######################################
# Count the alert/record names declared in the canonical rule file that
# Prometheus does not report as loaded.
# Globals:   PROMETHEUS_URL (read), CANONICAL_RULES (read)
# Arguments: none
# Outputs:   exactly one result on stdout:
#              <integer>                    number of missing rule names
#              CANONICAL_PARSE_FAILED:<why> canonical file unreadable, or no
#                                           rule names could be extracted
#              QUERY_FAILED:<why>           the rules API call failed, or
#                                           python3 itself could not run
#######################################
rules_missing_count() {
  # The `||` branch covers a missing interpreter (status 127) and any crash
  # the script does not handle itself, so callers never see empty output.
  python3 - "$PROMETHEUS_URL" "$CANONICAL_RULES" <<'PY' || printf 'QUERY_FAILED:python3 exited with status %s\n' "$?"
import json
import re
import sys
import urllib.request

base_url = sys.argv[1].rstrip("/")
canonical_path = sys.argv[2]

# Matches list items of the form "- alert: Name" / "- record: name", with
# optional quotes around the name and an optional trailing comment.
name_pattern = re.compile(r"^\s*-\s*(?:alert|record):\s*['\"]?([^'\"#]+?)['\"]?\s*(?:#.*)?$")

# Deliberately not annotated as set[str]: a module-level annotation is
# evaluated at runtime and subscripting the builtin set fails before 3.9.
required = set()
try:
    with open(canonical_path, encoding="utf-8") as handle:
        for line in handle:
            match = name_pattern.match(line)
            if match:
                required.add(match.group(1).strip())
except Exception as exc:
    print(f"CANONICAL_PARSE_FAILED:{exc}")
    raise SystemExit(0)

# With nothing to require, "0 missing" would be reported no matter what
# Prometheus has loaded, so treat an empty result as a parse failure.
if not required:
    print("CANONICAL_PARSE_FAILED:no alert/record names found in canonical rules")
    raise SystemExit(0)

try:
    with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise RuntimeError(payload)
    loaded = {
        str(rule.get("name") or rule.get("alert") or rule.get("record"))
        for group in payload.get("data", {}).get("groups") or []
        for rule in group.get("rules") or []
    }
    print(len(required - loaded))
except Exception as exc:
    print(f"QUERY_FAILED:{exc}")
PY
}
|
||||
|
||||
#######################################
# Report whether the active rule file is byte-identical to the canonical one.
# Globals:   CURRENT_RULES (read), CANONICAL_RULES (read)
# Arguments: none
# Outputs:   "1" when both are regular files with identical content,
#            "0" otherwise (including when either file is missing)
#######################################
matches_canonical() {
  if [[ -f "$CURRENT_RULES" && -f "$CANONICAL_RULES" ]] \
    && cmp -s -- "$CURRENT_RULES" "$CANONICAL_RULES"; then
    echo 1
  else
    echo 0
  fi
}
|
||||
|
||||
#######################################
# Overwrite the active rule file with the canonical copy and ask Prometheus
# to reload its configuration.
# Globals:   CURRENT_RULES (overwritten), CANONICAL_RULES (read),
#            PROMETHEUS_URL (read)
# Arguments: none
# Returns:   0 only when the copy succeeded AND the reload request succeeded;
#            non-zero otherwise.
#######################################
restore_rules() {
  local backup_path

  # Keep a timestamped copy of what is about to be replaced. This is skipped
  # when the active file is already identical to canonical (the restore was
  # triggered by rules missing in Prometheus), where a copy adds nothing.
  if [[ -f "$CURRENT_RULES" ]] && ! cmp -s -- "$CURRENT_RULES" "$CANONICAL_RULES"; then
    backup_path="${CURRENT_RULES}.guard.bak.$(date +%Y%m%d%H%M%S)"
    cp -- "$CURRENT_RULES" "$backup_path" 2>/dev/null || true
  fi

  # The script does not run under errexit, so a failed copy has to be turned
  # into a failure here; otherwise the reload status alone would decide the
  # result and a restore that never happened would be reported as a repair.
  # The file is overwritten in place rather than renamed over.
  cp -- "$CANONICAL_RULES" "$CURRENT_RULES" || return 1

  # Bound the request so an unresponsive Prometheus cannot hang the guard.
  curl -fsS --max-time 30 -X POST "${PROMETHEUS_URL}/-/reload" >/dev/null
}
|
||||
|
||||
#######################################
# Run one guard pass: detect drift, restore canonical rules when needed,
# re-check, and publish the outcome through log and write_textfile.
# Globals:   CANONICAL_RULES (read)
# Arguments: none used
# Returns:   0 when, after the pass, no canonical rule is missing and the
#            active file matches canonical; 1 in every other case.
#######################################
main() {
  if [ ! -f "$CANONICAL_RULES" ]; then
    log "canonical rules missing: ${CANONICAL_RULES}"
    write_textfile "canonical_missing" 0 999 0
    return 1
  fi

  local missing before_matches repaired after_missing after_matches
  missing="$(rules_missing_count)"
  before_matches="$(matches_canonical)"
  repaired=0

  # Anything that is not a plain integer is a failure: the QUERY_FAILED: /
  # CANONICAL_PARSE_FAILED: markers, but also empty or unexpected output,
  # which would otherwise break the numeric comparisons below.
  if [[ ! "$missing" =~ ^[0-9]+$ ]]; then
    log "Prometheus/canonical query failed: ${missing:-<no output>}"
    write_textfile "query_failed" 0 999 "$before_matches"
    return 1
  fi

  if [ "$missing" -gt 0 ] || [ "$before_matches" -eq 0 ]; then
    log "rule drift detected: missing=${missing} current_matches_canonical=${before_matches}; restoring"
    if restore_rules; then
      repaired=1
      # Pause before re-querying so the reload can take effect.
      sleep 3
    else
      log "restore failed"
      write_textfile "restore_failed" 0 "$missing" "$before_matches"
      return 1
    fi
  fi

  after_missing="$(rules_missing_count)"
  after_matches="$(matches_canonical)"
  if [[ ! "$after_missing" =~ ^[0-9]+$ ]]; then
    log "post-restore Prometheus/canonical query failed: ${after_missing:-<no output>}"
    write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
    return 1
  fi

  if [ "$after_missing" -eq 0 ] && [ "$after_matches" -eq 1 ]; then
    write_textfile "ok" "$repaired" "$after_missing" "$after_matches"
    log "ok repaired=${repaired}"
    return 0
  fi

  log "still drifted after check: missing=${after_missing} current_matches_canonical=${after_matches}"
  write_textfile "drifted" "$repaired" "$after_missing" "$after_matches"
  return 1
}
|
||||
|
||||
# Entry point: run one guard pass. When this is the final command of the
# script, main's return status becomes the script's exit status.
main "$@"
|
||||
Reference in New Issue
Block a user