fix(ops): harden reboot recovery and backup alerts
This commit is contained in:
260
scripts/ops/backup-alert-label-contract-check.py
Executable file
260
scripts/ops/backup-alert-label-contract-check.py
Executable file
@@ -0,0 +1,260 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate the backup alert label contract.
|
||||
|
||||
Node exporter textfile metrics use labels such as job="backup_all" locally, but
|
||||
Prometheus rewrites that metric label to exported_job because the scrape target
|
||||
already has job="node-exporter-110". Backup alerts must therefore use
|
||||
$labels.exported_job in user-facing text and exported_job="..." in expressions.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
# Default locations of the alert rules file and the backup baseline; both can
# be overridden with the --rules / --baseline command-line options.
DEFAULT_RULES = Path("ops") / "monitoring" / "alerts-unified.yml"
DEFAULT_BASELINE = Path("ops") / "reboot-recovery" / "full-stack-backup-baseline.yml"
|
||||
|
||||
|
||||
class ContractError(RuntimeError):
    """Raised when the backup alert label contract is violated."""
|
||||
|
||||
|
||||
def _load_alerts(path: Path) -> dict[str, dict[str, Any]]:
    """Index the alerting rules of a Prometheus rules file by alert name.

    Args:
        path: YAML rules file with a top-level ``groups`` list.

    Returns:
        Mapping of alert name to its rule definition. Rules without a truthy
        ``alert`` key are skipped; when a name occurs more than once the last
        definition wins.
    """
    document = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return {
        rule["alert"]: rule
        for group in document.get("groups") or []
        for rule in group.get("rules") or []
        if rule.get("alert")
    }
|
||||
|
||||
|
||||
def _annotation_text(rule: dict[str, Any]) -> str:
|
||||
annotations = rule.get("annotations") or {}
|
||||
return "\n".join(str(value) for value in annotations.values())
|
||||
|
||||
|
||||
def _require_alert(alerts: dict[str, dict[str, Any]], name: str) -> dict[str, Any]:
|
||||
if name not in alerts:
|
||||
raise ContractError(f"missing alert: {name}")
|
||||
return alerts[name]
|
||||
|
||||
|
||||
def _require_contains(value: str, expected: str, label: str) -> None:
|
||||
if expected not in value:
|
||||
raise ContractError(f"{label} must contain {expected!r}")
|
||||
|
||||
|
||||
def _require_not_contains(value: str, forbidden: str, label: str) -> None:
|
||||
if forbidden in value:
|
||||
raise ContractError(f"{label} must not contain {forbidden!r}")
|
||||
|
||||
|
||||
def _expected_backup_alerts(path: Path) -> list[str]:
    """Read the alert names the backup baseline expects to be defined.

    Args:
        path: Baseline YAML file carrying
            ``monitoring_contract.prometheus_alerts``.

    Returns:
        The listed alert names, each coerced to ``str``.

    Raises:
        ContractError: if the list is missing or empty.
    """
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    # A ``monitoring_contract:`` key with no value parses to None; treat it
    # like an absent section so the failure is reported as a ContractError
    # rather than an AttributeError on ``None.get``.
    contract = data.get("monitoring_contract") or {}
    alerts = contract.get("prometheus_alerts") or []
    if not alerts:
        raise ContractError(f"missing monitoring_contract.prometheus_alerts in {path}")
    return [str(alert) for alert in alerts]
|
||||
|
||||
|
||||
def static_check(path: Path, baseline_path: Path) -> list[str]:
    """Validate the rules file against the backup alert label contract.

    Args:
        path: Prometheus rules file to inspect.
        baseline_path: Baseline file naming the alerts that must exist.

    Returns:
        One ``OK ...`` line per satisfied check, in the order checked.

    Raises:
        ContractError: at the first check that does not hold.
    """
    alerts = _load_alerts(path)
    report: list[str] = []

    def expr_of(rule: dict[str, Any]) -> str:
        # A rule without ``expr`` is treated as having an empty expression.
        return str(rule.get("expr", ""))

    # Every alert named by the baseline must be defined in the rules file.
    absent = sorted(set(_expected_backup_alerts(baseline_path)).difference(alerts))
    if absent:
        raise ContractError(f"alerts-unified.yml missing baseline backup alerts: {absent}")
    report.append("OK alerts-unified.yml contains every baseline backup alert")

    # Annotations must reference exported_job, never the plain job label.
    job_missing = _require_alert(alerts, "BackupExpectedJobMissing")
    _require_contains(expr_of(job_missing), "awoooi_backup_job_configured", "BackupExpectedJobMissing expr")
    annotations = _annotation_text(job_missing)
    _require_contains(annotations, "$labels.exported_job", "BackupExpectedJobMissing annotations")
    _require_not_contains(annotations, "$labels.job", "BackupExpectedJobMissing annotations")
    report.append("OK BackupExpectedJobMissing uses exported_job label")

    job_stale = _require_alert(alerts, "BackupJobStale")
    _require_contains(expr_of(job_stale), "awoooi_backup_job_fresh", "BackupJobStale expr")
    annotations = _annotation_text(job_stale)
    _require_contains(annotations, "$labels.exported_job", "BackupJobStale annotations")
    _require_not_contains(annotations, "$labels.job", "BackupJobStale annotations")
    for template_ref in ("$labels.max_age_hours", "$labels.source", "$labels.target"):
        _require_contains(annotations, template_ref, "BackupJobStale annotations")
    report.append("OK BackupJobStale uses exported_job/source/target labels")

    run_failed = _require_alert(alerts, "BackupAggregateRunFailed")
    _require_contains(
        expr_of(run_failed),
        'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}',
        "BackupAggregateRunFailed expr",
    )
    report.append("OK BackupAggregateRunFailed filters exported_job=backup_all")

    capture_partial = _require_alert(alerts, "BackupConfigCapturePartial")
    _require_contains(expr_of(capture_partial), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr")
    annotations = _annotation_text(capture_partial)
    for template_ref in ("$labels.target", "$labels.source"):
        _require_contains(annotations, template_ref, "BackupConfigCapturePartial annotations")
    report.append("OK BackupConfigCapturePartial uses target/source labels")

    capture_stale = _require_alert(alerts, "BackupConfigCaptureStatusStale")
    _require_contains(
        expr_of(capture_stale),
        "awoooi_backup_config_capture_status_timestamp",
        "BackupConfigCaptureStatusStale expr",
    )
    report.append("OK BackupConfigCaptureStatusStale checks config capture status timestamp")

    script_missing = _require_alert(alerts, "BackupScriptMissing")
    _require_contains(_annotation_text(script_missing), "$labels.script", "BackupScriptMissing annotations")
    report.append("OK BackupScriptMissing uses script label")

    escrow_missing = _require_alert(alerts, "BackupCredentialEscrowEvidenceMissing")
    _require_contains(
        _annotation_text(escrow_missing),
        "$labels.item",
        "BackupCredentialEscrowEvidenceMissing annotations",
    )
    report.append("OK BackupCredentialEscrowEvidenceMissing uses item label")

    return report
|
||||
|
||||
|
||||
def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]:
    """Send ``expr`` to the Prometheus ``/api/v1/query`` endpoint.

    Args:
        base_url: Prometheus base URL; a trailing slash is ignored.
        expr: Query expression, URL-encoded before sending.

    Returns:
        The ``data.result`` list of the response, or ``[]`` when absent.

    Raises:
        ContractError: if the response status is not ``success``.
    """
    endpoint = base_url.rstrip("/") + "/api/v1/query?" + urllib.parse.urlencode({"query": expr})
    with urllib.request.urlopen(endpoint, timeout=8) as response:
        body = response.read()
    payload = json.loads(body.decode("utf-8"))
    if payload.get("status") == "success":
        return payload.get("data", {}).get("result") or []
    raise ContractError(f"Prometheus query failed for {expr}: {payload}")
|
||||
|
||||
|
||||
def _prom_rules(base_url: str) -> list[dict[str, Any]]:
    """Fetch the rules loaded by Prometheus as flat summary records.

    Args:
        base_url: Prometheus base URL; a trailing slash is ignored.

    Returns:
        One dict per named rule with string values under ``name``,
        ``health``, ``state`` and ``group``; absent fields become ``""``.

    Raises:
        ContractError: if the response status is not ``success``.
    """
    endpoint = base_url.rstrip("/") + "/api/v1/rules"
    with urllib.request.urlopen(endpoint, timeout=8) as response:
        payload = json.loads(response.read().decode("utf-8"))
    if payload.get("status") != "success":
        raise ContractError(f"Prometheus rules query failed: {payload}")

    summaries: list[dict[str, Any]] = []
    for group in payload.get("data", {}).get("groups") or []:
        group_name = str(group.get("name", ""))
        for rule in group.get("rules") or []:
            # A rule is identified by "name", falling back to "alert";
            # rules carrying neither are ignored.
            rule_name = rule.get("name") or rule.get("alert")
            if rule_name:
                summaries.append(
                    {
                        "name": str(rule_name),
                        "health": str(rule.get("health", "")),
                        "state": str(rule.get("state", "")),
                        "group": group_name,
                    }
                )
    return summaries
|
||||
|
||||
|
||||
def _require_live_label(base_url: str, expr: str, labels: set[str]) -> str:
    """Check that a live series for ``expr`` carries all of ``labels``.

    Only the first series returned by the query is inspected.

    Args:
        base_url: Prometheus base URL.
        expr: Selector to query.
        labels: Label names that must be present on the series.

    Returns:
        An ``OK live ...`` report line.

    Raises:
        ContractError: if no series is returned or a label is missing.
    """
    series = _prom_query(base_url, expr)
    if not series:
        raise ContractError(f"Prometheus query returned no series: {expr}")

    metric = series[0].get("metric") or {}
    absent = [name for name in labels if name not in metric]
    absent.sort()
    if absent:
        raise ContractError(f"{expr} missing labels {absent}; labels={sorted(metric)}")

    joined = ",".join(sorted(labels))
    return f"OK live {expr} exposes labels {joined}"
|
||||
|
||||
|
||||
def _require_live_rules(base_url: str, expected_alerts: list[str]) -> list[str]:
    """Check that Prometheus has loaded every expected alert rule healthily.

    A rule counts as healthy when its ``health`` is ``"ok"`` or empty.

    Args:
        base_url: Prometheus base URL.
        expected_alerts: Rule names that must be loaded.

    Returns:
        Two report lines: the number of expected rules and a per-state tally
        (an empty state is counted as ``unknown``).

    Raises:
        ContractError: if a rule is not loaded or is unhealthy.
    """
    loaded = {rule["name"]: rule for rule in _prom_rules(base_url)}

    absent = sorted(set(expected_alerts).difference(loaded))
    if absent:
        raise ContractError(f"Prometheus missing loaded backup alert rules: {absent}")

    unhealthy: list[str] = []
    for rule in loaded.values():
        if rule["name"] not in expected_alerts:
            continue
        if rule["health"] in {"", "ok"}:
            continue
        unhealthy.append(f"{rule['name']} health={rule['health']} group={rule['group']}")
    if unhealthy:
        raise ContractError(f"Prometheus backup alert rule health is not ok: {unhealthy}")

    tally: dict[str, int] = {}
    for name in expected_alerts:
        state = loaded[name]["state"] or "unknown"
        tally.setdefault(state, 0)
        tally[state] += 1
    state_summary = ",".join(f"{state}={count}" for state, count in sorted(tally.items()))

    return [
        f"OK live Prometheus loaded {len(expected_alerts)} baseline backup alert rules",
        f"OK live Prometheus backup alert rule states {state_summary}",
    ]
|
||||
|
||||
|
||||
def live_check(base_url: str, baseline_path: Path) -> list[str]:
    """Run the live label and rule checks against Prometheus.

    Args:
        base_url: Prometheus base URL.
        baseline_path: Baseline file naming the alerts that must be loaded.

    Returns:
        One report line per label check, followed by the rule-load lines.

    Raises:
        ContractError: at the first failing check.
    """
    # (selector, labels the live series must expose), checked in this order.
    label_checks: list[tuple[str, set[str]]] = [
        (
            'awoooi_backup_job_configured{host="110"}',
            {"exported_job", "host", "job"},
        ),
        (
            'awoooi_backup_job_fresh{host="110"}',
            {"exported_job", "host", "job", "source", "target", "max_age_hours"},
        ),
        (
            'awoooi_backup_last_run_failed_count{host="110"}',
            {"exported_job", "host", "job"},
        ),
        (
            'awoooi_backup_dr_next_step_info{host="110"}',
            {"host", "next_step"},
        ),
        (
            'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone"}',
            {"host", "provider", "scope", "max_age_hours"},
        ),
        (
            'awoooi_backup_config_capture_ok{host="110"}',
            {"host", "target", "source", "critical"},
        ),
    ]
    report = [_require_live_label(base_url, expr, labels) for expr, labels in label_checks]
    report += _require_live_rules(base_url, _expected_backup_alerts(baseline_path))
    return report
|
||||
|
||||
|
||||
def main() -> int:
    """Run the static contract check and, when requested, the live check.

    Command-line options:
        --rules: rules file to validate (default ``DEFAULT_RULES``).
        --baseline: baseline file listing expected alerts
            (default ``DEFAULT_BASELINE``).
        --prometheus-url: when non-empty, also run the live checks against it.

    Returns:
        0 after printing ``BACKUP_ALERT_LABEL_CONTRACT_OK``; 1 after printing
        ``BACKUP_ALERT_LABEL_CONTRACT_FAILED <reason>`` to stderr.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--rules", type=Path, default=DEFAULT_RULES)
    parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE)
    parser.add_argument("--prometheus-url", default="")
    args = parser.parse_args()

    # ValueError covers json.JSONDecodeError and also the UnicodeDecodeError
    # raised when a file or HTTP body is not valid UTF-8, so those failures are
    # reported through the FAILED line instead of escaping as a traceback.
    try:
        for line in static_check(args.rules, args.baseline):
            print(line)
        if args.prometheus_url:
            for line in live_check(args.prometheus_url, args.baseline):
                print(line)
    except (ContractError, OSError, yaml.YAMLError, ValueError) as exc:
        print(f"BACKUP_ALERT_LABEL_CONTRACT_FAILED {exc}", file=sys.stderr)
        return 1

    print("BACKUP_ALERT_LABEL_CONTRACT_OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
242
scripts/ops/backup-alert-live-visibility-check.py
Executable file
242
scripts/ops/backup-alert-live-visibility-check.py
Executable file
@@ -0,0 +1,242 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Verify live visibility for backup gap alerts.
|
||||
|
||||
This read-only check closes the gap between "metrics exist" and "alerts are
|
||||
actually visible". If the offsite or credential-escrow gap metrics are present,
|
||||
the corresponding Prometheus firing alerts must be visible. When Alertmanager is
|
||||
provided, those same alerts must also be active there.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
class VisibilityError(RuntimeError):
    """Raised when a required backup gap alert is not visible."""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RequiredAlert:
    """An alert that must be visible, identified by name and label values."""

    # Value the alert's ``alertname`` label must have.
    name: str
    # Further label name/value pairs the alert must carry.
    labels: dict[str, str]
|
||||
|
||||
|
||||
# Labels every required backup gap alert is expected to carry. Callers copy
# this mapping and override "host" with the host being checked.
COMMON_LABELS: dict[str, str] = {
    "host": "110",
    "auto_repair": "false",
    "alert_category": "infrastructure",
    "notification_type": "TYPE-1",
    "severity": "warning",
}
|
||||
|
||||
|
||||
def _json_get(url: str, timeout: int) -> Any:
    """Fetch ``url`` and return its body parsed as UTF-8 encoded JSON.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))
|
||||
|
||||
|
||||
def _prom_query(base_url: str, expr: str, timeout: int) -> list[dict[str, Any]]:
    """Send ``expr`` to the Prometheus ``/api/v1/query`` endpoint.

    Args:
        base_url: Prometheus base URL; a trailing slash is ignored.
        expr: Query expression, URL-encoded before sending.
        timeout: Request timeout in seconds.

    Returns:
        The ``data.result`` list of the response, or ``[]`` when absent.

    Raises:
        VisibilityError: if the response status is not ``success``.
    """
    encoded = urllib.parse.urlencode({"query": expr})
    payload = _json_get(base_url.rstrip("/") + "/api/v1/query?" + encoded, timeout)
    if payload.get("status") == "success":
        return payload.get("data", {}).get("result") or []
    raise VisibilityError(f"Prometheus query failed for {expr}: {payload}")
|
||||
|
||||
|
||||
def _prom_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]:
    """Return the alerts reported by the Prometheus ``/api/v1/alerts`` endpoint.

    Args:
        base_url: Prometheus base URL; a trailing slash is ignored.
        timeout: Request timeout in seconds.

    Returns:
        The ``data.alerts`` list of the response, or ``[]`` when absent.

    Raises:
        VisibilityError: if the response status is not ``success``.
    """
    payload = _json_get(base_url.rstrip("/") + "/api/v1/alerts", timeout)
    if payload.get("status") == "success":
        return payload.get("data", {}).get("alerts") or []
    raise VisibilityError(f"Prometheus alerts query failed: {payload}")
|
||||
|
||||
|
||||
def _alertmanager_alerts(base_url: str, timeout: int) -> list[dict[str, Any]]:
    """Return the alerts reported by the Alertmanager ``/api/v2/alerts`` endpoint.

    Args:
        base_url: Alertmanager base URL; a trailing slash is ignored.
        timeout: Request timeout in seconds.

    Raises:
        VisibilityError: if the response body is not a JSON list.
    """
    payload = _json_get(base_url.rstrip("/") + "/api/v2/alerts", timeout)
    if isinstance(payload, list):
        return payload
    raise VisibilityError(f"Alertmanager alerts query returned unexpected payload: {payload}")
|
||||
|
||||
|
||||
def _float_value(row: dict[str, Any], expr: str) -> float:
|
||||
value = row.get("value")
|
||||
if not isinstance(value, list) or len(value) < 2:
|
||||
raise VisibilityError(f"Prometheus query returned unexpected value for {expr}: {row}")
|
||||
try:
|
||||
return float(value[1])
|
||||
except (TypeError, ValueError) as exc:
|
||||
raise VisibilityError(f"Prometheus query returned non-numeric value for {expr}: {row}") from exc
|
||||
|
||||
|
||||
def _metric_labels(row: dict[str, Any]) -> dict[str, str]:
|
||||
metric = row.get("metric") or {}
|
||||
return {str(key): str(value) for key, value in metric.items()}
|
||||
|
||||
|
||||
def _labels_match(actual: dict[str, str], expected: dict[str, str]) -> bool:
|
||||
return all(actual.get(key) == value for key, value in expected.items())
|
||||
|
||||
|
||||
def _find_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None:
    """Return the first firing Prometheus alert matching ``required``.

    An alert matches when its ``state`` is ``firing`` and its labels include
    ``alertname`` equal to ``required.name`` plus every pair in
    ``required.labels``. Returns ``None`` when nothing matches.
    """
    wanted = {"alertname": required.name, **required.labels}
    for candidate in alerts:
        if str(candidate.get("state", "")) == "firing":
            raw_labels = candidate.get("labels") or {}
            labels = {str(key): str(value) for key, value in raw_labels.items()}
            if _labels_match(labels, wanted):
                return candidate
    return None
|
||||
|
||||
|
||||
def _find_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> dict[str, Any] | None:
    """Return the first active Alertmanager alert matching ``required``.

    An alert matches when its ``status.state`` is ``active`` and its labels
    include ``alertname`` equal to ``required.name`` plus every pair in
    ``required.labels``. Returns ``None`` when nothing matches.
    """
    wanted = {"alertname": required.name, **required.labels}
    for candidate in alerts:
        state = (candidate.get("status") or {}).get("state", "")
        if str(state) == "active":
            raw_labels = candidate.get("labels") or {}
            labels = {str(key): str(value) for key, value in raw_labels.items()}
            if _labels_match(labels, wanted):
                return candidate
    return None
|
||||
|
||||
|
||||
def _require_prom_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None:
    """Fail unless ``required`` is among the firing Prometheus ``alerts``.

    Raises:
        VisibilityError: if no firing alert matches ``required``.
    """
    match = _find_prom_alert(alerts, required)
    if match is not None:
        return
    raise VisibilityError(
        f"missing Prometheus firing alert {required.name} with labels {required.labels}"
    )
|
||||
|
||||
|
||||
def _require_alertmanager_alert(alerts: list[dict[str, Any]], required: RequiredAlert) -> None:
    """Fail unless ``required`` is among the active Alertmanager ``alerts``.

    Raises:
        VisibilityError: if no active alert matches ``required``.
    """
    match = _find_alertmanager_alert(alerts, required)
    if match is not None:
        return
    raise VisibilityError(
        f"missing Alertmanager active alert {required.name} with labels {required.labels}"
    )
|
||||
|
||||
|
||||
def _sum_query_values(prometheus_url: str, expr: str, timeout: int) -> float:
    """Return the sum of the sample values of every series matching ``expr``.

    The result is 0 when the query returns no series.
    """
    rows = _prom_query(prometheus_url, expr, timeout)
    samples = [_float_value(row, expr) for row in rows]
    return sum(samples)
|
||||
|
||||
|
||||
def _max_query_value(prometheus_url: str, expr: str, timeout: int) -> float:
    """Return the largest sample value among the series matching ``expr``.

    The result is 0 when the query returns no series.
    """
    rows = _prom_query(prometheus_url, expr, timeout)
    return max((_float_value(row, expr) for row in rows), default=0)
|
||||
|
||||
|
||||
def _offsite_required_alerts(prometheus_url: str, host: str, timeout: int) -> tuple[list[RequiredAlert], str]:
    """Decide which offsite backup gap alert, if any, must be visible.

    Decision order:
      1. Configured metric sums to 0: ``BackupOffsiteCopyNotConfigured``.
      2. Fresh metric sums above 0: no alert required.
      3. Full sync enabled, with a non-zero enable timestamp at most 30 hours
         old: no alert required yet.
      4. Otherwise: ``BackupOffsiteCopyStale``.

    Args:
        prometheus_url: Prometheus base URL.
        host: Value of the ``host`` label to select and to require.
        timeout: Request timeout in seconds.

    Returns:
        The required alerts (empty or one element) and a report line.

    Raises:
        VisibilityError: if the configured metric has no series for ``host``.
    """

    def gap_alert(name: str) -> RequiredAlert:
        return RequiredAlert(name, {**COMMON_LABELS, "host": host})

    expr = f'awoooi_backup_offsite_configured{{host="{host}"}}'
    configured_rows = _prom_query(prometheus_url, expr, timeout)
    if not configured_rows:
        raise VisibilityError(f"Prometheus query returned no offsite configured series: {expr}")
    if sum(_float_value(row, expr) for row in configured_rows) == 0:
        return (
            [gap_alert("BackupOffsiteCopyNotConfigured")],
            "OK offsite gap metric requires BackupOffsiteCopyNotConfigured visibility",
        )

    fresh_expr = f'awoooi_backup_offsite_fresh{{host="{host}"}}'
    fresh_total = _sum_query_values(prometheus_url, fresh_expr, timeout)
    if fresh_total > 0:
        return [], "OK offsite full marker is fresh; no offsite gap alert required"

    enabled_expr = f'awoooi_backup_offsite_full_sync_enabled{{host="{host}"}}'
    if _sum_query_values(prometheus_url, enabled_expr, timeout) > 0:
        timestamp_expr = f'awoooi_backup_offsite_full_sync_enabled_timestamp{{host="{host}"}}'
        enabled_timestamp = _max_query_value(prometheus_url, timestamp_expr, timeout)
        # A zero timestamp means no usable enable time, so no grace applies.
        if enabled_timestamp:
            enabled_age = int(time.time() - enabled_timestamp)
            if enabled_age <= 30 * 3600:
                return (
                    [],
                    f"OK offsite full sync enabled within grace window; BackupOffsiteCopyStale not required yet age_seconds={enabled_age}",
                )

    return (
        [gap_alert("BackupOffsiteCopyStale")],
        "OK offsite full marker gap requires BackupOffsiteCopyStale visibility",
    )
|
||||
|
||||
|
||||
def _escrow_required_alerts(prometheus_url: str, host: str, timeout: int) -> list[RequiredAlert]:
    """List the credential escrow alerts that must be visible for ``host``.

    One ``BackupCredentialEscrowEvidenceMissing`` alert is required for every
    series returned by ``awoooi_backup_credential_escrow_fresh{host=...} == 0``.

    Args:
        prometheus_url: Prometheus base URL.
        host: Value of the ``host`` label to select and to require.
        timeout: Request timeout in seconds.

    Returns:
        The required alerts sorted by their ``item`` label; empty when the
        query returns no series.

    Raises:
        VisibilityError: if a returned series has no ``item`` label.
    """
    expr = f'awoooi_backup_credential_escrow_fresh{{host="{host}"}} == 0'
    gaps: list[RequiredAlert] = []
    for row in _prom_query(prometheus_url, expr, timeout):
        item = _metric_labels(row).get("item")
        if not item:
            raise VisibilityError(f"Credential escrow gap metric missing item label: {row}")
        gaps.append(
            RequiredAlert(
                "BackupCredentialEscrowEvidenceMissing",
                {**COMMON_LABELS, "host": host, "item": item},
            )
        )
    gaps.sort(key=lambda alert: alert.labels["item"])
    return gaps
|
||||
|
||||
|
||||
def live_check(prometheus_url: str, alertmanager_url: str, host: str, timeout: int) -> list[str]:
    """Verify that every required backup gap alert is visible.

    Args:
        prometheus_url: Prometheus base URL.
        alertmanager_url: Alertmanager base URL; when empty the Alertmanager
            check is skipped.
        host: Value of the ``host`` label to check.
        timeout: Request timeout in seconds.

    Returns:
        Report lines describing which alerts were required and found.

    Raises:
        VisibilityError: if a required alert is not firing in Prometheus or
            not active in Alertmanager.
    """
    offsite_alerts, offsite_line = _offsite_required_alerts(prometheus_url, host, timeout)
    escrow_alerts = _escrow_required_alerts(prometheus_url, host, timeout)
    required_alerts = [*offsite_alerts, *escrow_alerts]

    report = [offsite_line]
    if escrow_alerts:
        escrow_items = ", ".join(alert.labels["item"] for alert in escrow_alerts)
        report.append(
            f"OK credential escrow gap metrics require {len(escrow_alerts)} alert(s): {escrow_items}"
        )
    else:
        report.append("OK credential escrow markers are fresh; no escrow gap alert required")

    firing = _prom_alerts(prometheus_url, timeout)
    for required in required_alerts:
        _require_prom_alert(firing, required)
    report.append(f"OK Prometheus exposes {len(required_alerts)} required backup gap firing alert(s)")

    if not alertmanager_url:
        return report

    active = _alertmanager_alerts(alertmanager_url, timeout)
    for required in required_alerts:
        _require_alertmanager_alert(active, required)
    report.append(f"OK Alertmanager exposes {len(required_alerts)} required backup gap active alert(s)")
    return report
|
||||
|
||||
|
||||
def main() -> int:
    """Run the live visibility check from the command line.

    Command-line options:
        --prometheus-url: Prometheus base URL (required).
        --alertmanager-url: Alertmanager base URL; empty skips that check.
        --host: ``host`` label value to check (default ``110``).
        --timeout: request timeout in seconds (default 8).

    Returns:
        0 after printing ``BACKUP_ALERT_LIVE_VISIBILITY_OK``; 1 after printing
        ``BACKUP_ALERT_LIVE_VISIBILITY_FAILED <reason>`` to stderr.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--prometheus-url", required=True)
    parser.add_argument("--alertmanager-url", default="")
    parser.add_argument("--host", default="110")
    parser.add_argument("--timeout", type=int, default=8)
    args = parser.parse_args()

    # ValueError covers json.JSONDecodeError and also the UnicodeDecodeError
    # raised when an HTTP body is not valid UTF-8, so those failures are
    # reported through the FAILED line instead of escaping as a traceback.
    try:
        for line in live_check(args.prometheus_url, args.alertmanager_url, args.host, args.timeout):
            print(line)
    except (VisibilityError, OSError, ValueError) as exc:
        print(f"BACKUP_ALERT_LIVE_VISIBILITY_FAILED {exc}", file=sys.stderr)
        return 1

    print("BACKUP_ALERT_LIVE_VISIBILITY_OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -1,9 +1,9 @@
|
||||
#!/usr/bin/env bash
|
||||
# Guard 110 Prometheus alert rules against stale deploys.
|
||||
#
|
||||
# The canonical file is the source of truth. The guard restores active
|
||||
# alerts.yml only when the active file differs from canonical or when
|
||||
# Prometheus is missing rule names declared by canonical.
|
||||
# This script is intentionally narrow: it only restores the canonical alert
|
||||
# rules file when required recovery/backup rules disappear from live Prometheus
|
||||
# or when the active file differs from the canonical copy.
|
||||
|
||||
set -uo pipefail
|
||||
|
||||
@@ -14,6 +14,14 @@ CANONICAL_RULES="${CANONICAL_RULES:-/home/wooo/monitoring/alerts-unified.canonic
|
||||
TEXTFILE="${TEXTFILE:-/home/wooo/node_exporter_textfiles/prometheus_rule_drift_guard.prom}"
|
||||
LOG_FILE="${LOG_FILE:-/home/wooo/logs/prometheus-rule-drift-guard.log}"
|
||||
|
||||
REQUIRED_RULES=(
|
||||
"BackupCredentialEscrowEvidenceMissing"
|
||||
"BackupExpectedJobMissing"
|
||||
"awoooi_recovery_core_ready"
|
||||
"awoooi_recovery_dr_offsite_ready"
|
||||
"ColdStartRecoveryBlocked"
|
||||
)
|
||||
|
||||
log() {
|
||||
mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null || true
|
||||
printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >>"$LOG_FILE"
|
||||
@@ -34,7 +42,7 @@ awoooi_prometheus_rule_drift_guard_last_run_timestamp{host="${HOST_LABEL}",statu
|
||||
# HELP awoooi_prometheus_rule_drift_guard_repaired Whether the guard restored canonical Prometheus rules on the last run.
|
||||
# TYPE awoooi_prometheus_rule_drift_guard_repaired gauge
|
||||
awoooi_prometheus_rule_drift_guard_repaired{host="${HOST_LABEL}"} ${repaired}
|
||||
# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of canonical live rules missing after the last check.
|
||||
# HELP awoooi_prometheus_rule_drift_guard_missing_required_count Number of required live rules missing after the last check.
|
||||
# TYPE awoooi_prometheus_rule_drift_guard_missing_required_count gauge
|
||||
awoooi_prometheus_rule_drift_guard_missing_required_count{host="${HOST_LABEL}"} ${missing_count}
|
||||
# HELP awoooi_prometheus_rule_drift_guard_current_matches_canonical Whether active alerts.yml matches canonical copy.
|
||||
@@ -46,27 +54,13 @@ EOF
|
||||
}
|
||||
|
||||
rules_missing_count() {
|
||||
python3 - "$PROMETHEUS_URL" "$CANONICAL_RULES" <<'PY'
|
||||
python3 - "$PROMETHEUS_URL" "${REQUIRED_RULES[@]}" <<'PY'
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
base_url = sys.argv[1].rstrip("/")
|
||||
canonical_path = sys.argv[2]
|
||||
|
||||
name_pattern = re.compile(r"^\s*-\s*(?:alert|record):\s*['\"]?([^'\"#]+?)['\"]?\s*(?:#.*)?$")
|
||||
required: set[str] = set()
|
||||
try:
|
||||
with open(canonical_path, encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
match = name_pattern.match(line)
|
||||
if match:
|
||||
required.add(match.group(1).strip())
|
||||
except Exception as exc:
|
||||
print(f"CANONICAL_PARSE_FAILED:{exc}")
|
||||
raise SystemExit(0)
|
||||
|
||||
required = set(sys.argv[2:])
|
||||
try:
|
||||
with urllib.request.urlopen(f"{base_url}/api/v1/rules", timeout=8) as response:
|
||||
payload = json.loads(response.read().decode("utf-8"))
|
||||
@@ -115,8 +109,8 @@ main() {
|
||||
before_matches="$(matches_canonical)"
|
||||
repaired=0
|
||||
|
||||
if [[ "$missing" == QUERY_FAILED:* || "$missing" == CANONICAL_PARSE_FAILED:* ]]; then
|
||||
log "Prometheus/canonical query failed: ${missing}"
|
||||
if [[ "$missing" == QUERY_FAILED:* ]]; then
|
||||
log "Prometheus query failed: ${missing}"
|
||||
write_textfile "query_failed" 0 999 "$before_matches"
|
||||
return 1
|
||||
fi
|
||||
@@ -135,8 +129,8 @@ main() {
|
||||
|
||||
after_missing="$(rules_missing_count)"
|
||||
after_matches="$(matches_canonical)"
|
||||
if [[ "$after_missing" == QUERY_FAILED:* || "$after_missing" == CANONICAL_PARSE_FAILED:* ]]; then
|
||||
log "post-restore Prometheus/canonical query failed: ${after_missing}"
|
||||
if [[ "$after_missing" == QUERY_FAILED:* ]]; then
|
||||
log "post-restore Prometheus query failed: ${after_missing}"
|
||||
write_textfile "post_query_failed" "$repaired" 999 "$after_matches"
|
||||
return 1
|
||||
fi
|
||||
|
||||
148
scripts/ops/recovery-scorecard-contract-check.py
Executable file
148
scripts/ops/recovery-scorecard-contract-check.py
Executable file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate recovery scorecard recording-rule contract."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
# Default locations of the rules file and the backup baseline; both can be
# overridden with the --rules / --baseline command-line options.
DEFAULT_RULES = Path("ops") / "monitoring" / "alerts-unified.yml"
DEFAULT_BASELINE = Path("ops") / "reboot-recovery" / "full-stack-backup-baseline.yml"

# Selectors of the two scorecard series read by the live check.
EXPECTED_CORE = 'awoooi_recovery_core_ready{host="110",scope="110_120_121_188"}'
EXPECTED_DR = 'awoooi_recovery_dr_offsite_ready{host="110"}'
|
||||
|
||||
|
||||
class ContractError(RuntimeError):
    """Raised when the recovery scorecard contract is violated."""
|
||||
|
||||
|
||||
def _rules(path: Path) -> list[dict[str, Any]]:
    """Return every rule of every group in a rules file as one flat list.

    Args:
        path: YAML rules file with a top-level ``groups`` list.

    Returns:
        The rules in file order; empty when the file has no groups.
    """
    document = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    return [
        rule
        for group in document.get("groups") or []
        for rule in group.get("rules") or []
    ]
|
||||
|
||||
|
||||
def _expected_recording_rules(path: Path) -> list[str]:
    """Read the recording rule names the backup baseline expects.

    Args:
        path: Baseline YAML file carrying
            ``monitoring_contract.prometheus_recording_rules``.

    Returns:
        The listed rule names, each coerced to ``str``.

    Raises:
        ContractError: if the list is missing or empty.
    """
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    # A ``monitoring_contract:`` key with no value parses to None; treat it
    # like an absent section so the failure is reported as a ContractError
    # rather than an AttributeError on ``None.get``.
    contract = data.get("monitoring_contract") or {}
    rules = contract.get("prometheus_recording_rules") or []
    if not rules:
        raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}")
    return [str(rule) for rule in rules]
|
||||
|
||||
|
||||
def static_check(rules_path: Path, baseline_path: Path) -> list[str]:
    """Validate the recovery scorecard recording rules in the rules file.

    Args:
        rules_path: Prometheus rules file to inspect.
        baseline_path: Baseline file naming the recording rules that must exist.

    Returns:
        The three ``OK ...`` report lines when every check holds.

    Raises:
        ContractError: if a recording rule is missing or its expression does
            not reference a required metric.
    """
    rules = _rules(rules_path)
    by_record = {str(rule.get("record")): rule for rule in rules if rule.get("record")}
    expected = _expected_recording_rules(baseline_path)
    missing = sorted(set(expected) - set(by_record))
    if missing:
        raise ContractError(f"alerts-unified.yml missing recovery recording rules: {missing}")

    # Metric names each scorecard rule's expression must mention.
    required_metrics = {
        "awoooi_recovery_core_ready": [
            "awoooi_cold_start_last_result",
            "awoooi_cold_start_warn_gates",
            "awoooi_cold_start_blocked_gates",
            "awoooi_cold_start_last_green_timestamp",
        ],
        "awoooi_recovery_dr_offsite_ready": [
            "awoooi_backup_offsite_configured",
            "awoooi_backup_offsite_fresh",
            "awoooi_backup_credential_escrow_fresh",
        ],
    }
    for record, metrics in required_metrics.items():
        rule = by_record.get(record)
        if rule is None:
            # The baseline list may not name this rule, so the check above can
            # pass while the rule is absent; report that as a contract failure
            # instead of letting a KeyError escape.
            raise ContractError(f"alerts-unified.yml missing recovery recording rules: {[record]}")
        expr = str(rule.get("expr", ""))
        for required in metrics:
            if required not in expr:
                raise ContractError(f"{record} expr missing {required}")

    return [
        "OK alerts-unified.yml contains every recovery scorecard recording rule",
        "OK recovery core rule depends on cold-start green/warn/blocked/last-green metrics",
        "OK recovery DR rule depends on provider-neutral offsite freshness and credential escrow freshness",
    ]
|
||||
|
||||
|
||||
def _prom_query(base_url: str, expr: str) -> list[dict[str, Any]]:
    """Send ``expr`` to the Prometheus ``/api/v1/query`` endpoint.

    Args:
        base_url: Prometheus base URL; a trailing slash is ignored.
        expr: Query expression, URL-encoded before sending.

    Returns:
        The ``data.result`` list of the response, or ``[]`` when absent.

    Raises:
        ContractError: if the response status is not ``success``.
    """
    encoded = urllib.parse.urlencode({"query": expr})
    endpoint = f"{base_url.rstrip('/')}/api/v1/query?{encoded}"
    with urllib.request.urlopen(endpoint, timeout=8) as response:
        body = response.read()
    payload = json.loads(body.decode("utf-8"))
    if payload.get("status") == "success":
        return payload.get("data", {}).get("result") or []
    raise ContractError(f"Prometheus query failed for {expr}: {payload}")
|
||||
|
||||
|
||||
def _single_value(base_url: str, expr: str) -> float:
    """Query ``expr`` and return its single 0-or-1 sample value.

    Args:
        base_url: Prometheus base URL.
        expr: Selector expected to match exactly one series.

    Returns:
        ``0.0`` or ``1.0``.

    Raises:
        ContractError: if the query does not return exactly one series, the
            sample is malformed or non-numeric, or the value is neither 0
            nor 1.
    """
    rows = _prom_query(base_url, expr)
    if len(rows) != 1:
        raise ContractError(f"Prometheus query expected one series for {expr}, got {len(rows)}")

    row = rows[0]
    sample = row.get("value") or []
    if len(sample) < 2:
        raise ContractError(f"Prometheus query returned malformed value for {expr}: {row}")
    try:
        number = float(sample[1])
    except (TypeError, ValueError) as exc:
        raise ContractError(f"Prometheus query returned non-numeric value for {expr}: {row}") from exc

    if number in (0.0, 1.0):
        return number
    raise ContractError(f"Prometheus recovery scorecard metric must be 0 or 1: {expr}={number}")
|
||||
|
||||
|
||||
def live_check(
    base_url: str,
    expect_core_ready: bool = False,
    expect_dr_ready: bool = False,
) -> list[str]:
    """Read the two live scorecard series and optionally require readiness.

    Args:
        base_url: Prometheus base URL.
        expect_core_ready: When true, the core series must equal 1.
        expect_dr_ready: When true, the DR offsite series must equal 1.

    Returns:
        One ``OK live ...`` line per series with its integer value.

    Raises:
        ContractError: if a series is invalid or an expected readiness
            does not hold.
    """
    core = _single_value(base_url, EXPECTED_CORE)
    dr = _single_value(base_url, EXPECTED_DR)

    # Core readiness is checked before DR readiness.
    if expect_core_ready and core != 1.0:
        raise ContractError(f"expected core recovery ready, got {core}")
    if expect_dr_ready and dr != 1.0:
        raise ContractError(f"expected DR offsite ready, got {dr}")

    return [
        f"OK live {EXPECTED_CORE} value={int(core)}",
        f"OK live {EXPECTED_DR} value={int(dr)}",
    ]
|
||||
|
||||
|
||||
def main() -> int:
    """Run the static scorecard check and, when requested, the live check.

    Command-line options:
        --rules: rules file to validate (default ``DEFAULT_RULES``).
        --baseline: baseline file listing expected recording rules
            (default ``DEFAULT_BASELINE``).
        --prometheus-url: when non-empty, also run the live check against it.
        --expect-core-ready: require the live core series to equal 1.
        --expect-dr-ready: require the live DR offsite series to equal 1.

    Returns:
        0 after printing ``RECOVERY_SCORECARD_CONTRACT_OK``; 1 after printing
        ``RECOVERY_SCORECARD_CONTRACT_FAILED <reason>`` to stderr.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--rules", type=Path, default=DEFAULT_RULES)
    parser.add_argument("--baseline", type=Path, default=DEFAULT_BASELINE)
    parser.add_argument("--prometheus-url", default="")
    parser.add_argument("--expect-core-ready", action="store_true")
    parser.add_argument("--expect-dr-ready", action="store_true")
    args = parser.parse_args()

    # ValueError covers json.JSONDecodeError and also the UnicodeDecodeError
    # raised when a file or HTTP body is not valid UTF-8, so those failures are
    # reported through the FAILED line instead of escaping as a traceback.
    try:
        for line in static_check(args.rules, args.baseline):
            print(line)
        if args.prometheus_url:
            for line in live_check(
                args.prometheus_url,
                args.expect_core_ready,
                args.expect_dr_ready,
            ):
                print(line)
    except (ContractError, OSError, yaml.YAMLError, ValueError) as exc:
        print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr)
        return 1

    print("RECOVERY_SCORECARD_CONTRACT_OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user