Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 58s
CD Pipeline / build-and-deploy (push) Has started running
CD Pipeline / post-deploy-checks (push) Has been cancelled
81 lines
3.2 KiB
Python
81 lines
3.2 KiB
Python
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
|
|
# Fourth ancestor of this file's resolved path (parents[0] is the directory
# containing the file). NOTE(review): presumably the repository root - confirm.
ROOT = Path(__file__).resolve().parents[3]
# Alert rules file that every test in this module loads and inspects.
ALERTS = ROOT.joinpath("ops", "monitoring", "alerts-unified.yml")
|
|
|
|
|
|
def load_alerts() -> dict[str, dict]:
    """Load the alert rules file and index its alerting rules by name.

    Reads ``ALERTS`` as UTF-8 text, parses it with ``yaml.safe_load`` and
    walks every entry of the top-level ``groups`` list. Only rules that carry
    an ``alert`` key are collected; any other rule entry is skipped.

    Returns:
        Mapping of alert name to the rule mapping exactly as parsed from YAML.
        If the same alert name occurs more than once, the last one wins.

    Raises:
        KeyError: If the parsed document has no top-level ``groups`` key.
    """
    payload = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))
    alerts: dict[str, dict] = {}
    for group in payload["groups"]:
        # A bare ``rules:`` key parses as None, and ``.get("rules", [])`` would
        # hand that None back because the key exists. ``or []`` treats a
        # missing key and an empty one the same way instead of raising
        # TypeError while iterating.
        for rule in group.get("rules") or []:
            if "alert" in rule:
                alerts[rule["alert"]] = rule
    return alerts
|
|
|
|
|
|
def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
    """Pin the ``Host110SustainedModeratePressure`` rule to the controller script.

    Checks, in order: the fragments the rule expression must contain, the
    ``for`` window and ``auto_repair`` label, the controller path and flags in
    the ``auto_repair_action`` annotation, and two required runbook phrases.
    """
    rule = load_alerts()["Host110SustainedModeratePressure"]
    expr = str(rule["expr"])
    annotations = rule["annotations"]
    action = annotations["auto_repair_action"]

    # Metric selectors, thresholds and workload names expected in the expression.
    for fragment in (
        'awoooi_host_load5_per_core{host="110"} > 0.75',
        'docker_container_cpu_cores{host="110"',
        'awoooi_host_process_family_cpu_percent{host="110"',
        "> 1.0",
        "> 50",
        "systemd_control_plane",
        "gitea",
        "stockplatform-v2-postgres-1",
    ):
        assert fragment in expr

    assert rule["for"] == "1m"
    assert rule["labels"]["auto_repair"] == "true"

    # Controller script path plus the threshold flags passed to it.
    for fragment in (
        "/home/wooo/scripts/host-sustained-load-controller.py",
        "--load5-per-core-threshold 0.75",
        "--hot-container-cpu-threshold 1.0",
        "--container-cpu-threshold 2.0",
        "--process-family-cpu-threshold 50",
    ):
        assert fragment in action

    # Runbook phrases: "does not read secret" and
    # "Docker / systemd / Nginx / DB restart is forbidden".
    runbook = annotations["runbook"]
    for phrase in ("不讀 secret", "禁止 Docker / systemd / Nginx / DB restart"):
        assert phrase in runbook
|
|
|
|
|
|
def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
    """Check which controller path ``HostLoadAverageSustainedHigh`` invokes.

    The ``auto_repair_action`` annotation must contain the
    ``/home/wooo/scripts`` path and must not contain the ``scripts/ops`` one.
    """
    rule = load_alerts()["HostLoadAverageSustainedHigh"]
    action = rule["annotations"]["auto_repair_action"]

    expected_path = "/home/wooo/scripts/host-sustained-load-controller.py"
    rejected_path = "scripts/ops/host-sustained-load-controller.py"

    assert expected_path in action
    assert rejected_path not in action
|
|
|
|
|
|
def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
    """Check that ``BackupAggregateRunFailed`` filters out ``backup_all``.

    The expression must contain the selector that excludes
    ``exported_job="backup_all"`` and must not contain a positive match on it.
    """
    rule = load_alerts()["BackupAggregateRunFailed"]
    expr = str(rule["expr"])

    required_selector = (
        'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}'
    )
    rejected_selector = 'exported_job="backup_all"} > 0'

    assert required_selector in expr
    assert rejected_selector not in expr
|
|
|
|
|
|
def test_reboot_slo_alerts_project_named_blockers_to_telegram() -> None:
    """Check the two reboot auto-recovery SLO blocker alert rules.

    ``RebootAutoRecoveryActiveBlocker`` must reference the active-blocker
    metric, carry ``notification_type`` ``TYPE-3``, use ``$labels.blocker`` in
    its summary and description, and mention three phrases in its runbook.
    ``RebootAutoRecoveryActiveBlockerMetricMissing`` must reference both
    blocker metrics and carry ``notification_type`` ``TYPE-1``.
    """
    alerts = load_alerts()
    blocker = alerts["RebootAutoRecoveryActiveBlocker"]
    metric_missing = alerts["RebootAutoRecoveryActiveBlockerMetricMissing"]

    assert "awoooi_reboot_auto_recovery_slo_active_blocker" in str(blocker["expr"])
    assert blocker["labels"]["notification_type"] == "TYPE-3"

    # The blocker label must be templated into both human-readable annotations.
    for field in ("summary", "description"):
        assert "$labels.blocker" in blocker["annotations"][field]

    # The last phrase translates to "reboot is forbidden".
    for phrase in ("Windows99", "backup-status", "禁止 reboot"):
        assert phrase in blocker["annotations"]["runbook"]

    for metric in (
        "awoooi_reboot_auto_recovery_slo_blocker_count",
        "awoooi_reboot_auto_recovery_slo_active_blocker",
    ):
        assert metric in str(metric_missing["expr"])

    assert metric_missing["labels"]["notification_type"] == "TYPE-1"
|