Files
awoooi/scripts/ops/tests/test_host_pressure_alert_contract.py
Your Name 068c18e2f0
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 58s
CD Pipeline / build-and-deploy (push) Has started running
CD Pipeline / post-deploy-checks (push) Has been cancelled
fix(reboot): expose active blocker alert metrics
2026-07-02 22:24:51 +08:00

81 lines
3.2 KiB
Python

from __future__ import annotations
from pathlib import Path
import yaml
# Fourth ancestor of this file (parents[0] is the containing directory);
# presumably the repository root -- confirm against the checkout layout.
ROOT = Path(__file__).resolve().parents[3]
# Location of the unified alert rule file that the tests in this module read.
ALERTS = ROOT.joinpath("ops", "monitoring", "alerts-unified.yml")
def load_alerts() -> dict[str, dict]:
    """Parse the unified alert file and index its alerting rules by name.

    Returns:
        A mapping from each rule's ``alert`` value to the complete rule
        mapping. Rules that have no ``alert`` key are left out. When the same
        alert name appears more than once, the last rule read is the one kept.

    Raises:
        KeyError: If the parsed document has no top-level ``groups`` key.
    """
    document = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))
    return {
        entry["alert"]: entry
        for section in document["groups"]
        # A group without a "rules" key contributes nothing.
        for entry in section.get("rules", [])
        if "alert" in entry
    }
def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
    """Pin the contract of the ``Host110SustainedModeratePressure`` rule.

    Checked in order: the rule expression contains the expected host-110
    metric selectors, thresholds and names; the rule has ``for: 1m`` and is
    labelled ``auto_repair: "true"``; the auto-repair action references the
    controller script under ``/home/wooo/scripts`` with the expected threshold
    flags; and the runbook annotation contains the two required safety phrases.
    """
    pressure_rule = load_alerts()["Host110SustainedModeratePressure"]
    promql = str(pressure_rule["expr"])
    notes = pressure_rule["annotations"]
    repair_command = notes["auto_repair_action"]

    # Fragments that must appear somewhere in the rule expression.
    expected_in_expr = (
        'awoooi_host_load5_per_core{host="110"} > 0.75',
        'docker_container_cpu_cores{host="110"',
        'awoooi_host_process_family_cpu_percent{host="110"',
        "> 1.0",
        "> 50",
        "systemd_control_plane",
        "gitea",
        "stockplatform-v2-postgres-1",
    )
    for fragment in expected_in_expr:
        assert fragment in promql

    assert pressure_rule["for"] == "1m"
    assert pressure_rule["labels"]["auto_repair"] == "true"

    # Script path and threshold flags that must appear in the repair action.
    expected_in_action = (
        "/home/wooo/scripts/host-sustained-load-controller.py",
        "--load5-per-core-threshold 0.75",
        "--hot-container-cpu-threshold 1.0",
        "--container-cpu-threshold 2.0",
        "--process-family-cpu-threshold 50",
    )
    for fragment in expected_in_action:
        assert fragment in repair_command

    # Runbook wording, roughly: "does not read secrets" and
    # "Docker / systemd / Nginx / DB restarts are forbidden".
    required_runbook_phrases = (
        "不讀 secret",
        "禁止 Docker / systemd / Nginx / DB restart",
    )
    for phrase in required_runbook_phrases:
        assert phrase in notes["runbook"]
def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
    """Check which controller path ``HostLoadAverageSustainedHigh`` invokes.

    The auto-repair action must mention the script under
    ``/home/wooo/scripts`` and must not mention the ``scripts/ops`` path.
    """
    sustained_high = load_alerts()["HostLoadAverageSustainedHigh"]
    repair_command = sustained_high["annotations"]["auto_repair_action"]

    deployed_script = "/home/wooo/scripts/host-sustained-load-controller.py"
    in_repo_script = "scripts/ops/host-sustained-load-controller.py"

    assert deployed_script in repair_command
    assert in_repo_script not in repair_command
def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
    """Check the job filter used by the ``BackupAggregateRunFailed`` rule.

    The expression must select the failed-count metric for host 110 with
    ``exported_job!="backup_all"`` and must not contain a positive
    ``exported_job="backup_all"} > 0`` comparison.
    """
    promql = str(load_alerts()["BackupAggregateRunFailed"]["expr"])

    required_selector = (
        'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}'
    )
    forbidden_comparison = 'exported_job="backup_all"} > 0'

    assert required_selector in promql
    assert forbidden_comparison not in promql
def test_reboot_slo_alerts_project_named_blockers_to_telegram() -> None:
    """Pin the two reboot auto-recovery blocker alert rules.

    ``RebootAutoRecoveryActiveBlocker`` must use the active-blocker metric,
    carry ``notification_type: TYPE-3``, mention ``$labels.blocker`` in both
    its summary and description, and contain the expected runbook phrases.
    ``RebootAutoRecoveryActiveBlockerMetricMissing`` must reference both
    blocker metrics and carry ``notification_type: TYPE-1``.
    """
    rules = load_alerts()
    active_blocker = rules["RebootAutoRecoveryActiveBlocker"]
    missing_metric = rules["RebootAutoRecoveryActiveBlockerMetricMissing"]

    assert "awoooi_reboot_auto_recovery_slo_active_blocker" in str(active_blocker["expr"])
    assert active_blocker["labels"]["notification_type"] == "TYPE-3"

    # The blocker label must be mentioned in both human-readable annotations.
    for field in ("summary", "description"):
        assert "$labels.blocker" in active_blocker["annotations"][field]

    # Runbook phrases; the last one reads roughly "reboot is forbidden".
    for phrase in ("Windows99", "backup-status", "禁止 reboot"):
        assert phrase in active_blocker["annotations"]["runbook"]

    # The metric-missing rule must reference both blocker metrics.
    expected_metrics = (
        "awoooi_reboot_auto_recovery_slo_blocker_count",
        "awoooi_reboot_auto_recovery_slo_active_blocker",
    )
    for metric in expected_metrics:
        assert metric in str(missing_metric["expr"])
    assert missing_metric["labels"]["notification_type"] == "TYPE-1"