fix(ops): close 110 pressure and backup alert gaps
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m55s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 23:32:55 +08:00
parent 7a53a5287f
commit d658f03ac5
13 changed files with 465 additions and 33 deletions

View File

@@ -114,3 +114,35 @@ def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_sta
)
assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered
assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered
def test_dr_phase_ready_when_full_offsite_is_fresh_and_escrow_is_complete(
    tmp_path: Path, monkeypatch
) -> None:
    """Check the rendered DR metrics when full offsite and escrow are current.

    Scenario built here:
      * the clock is pinned to a fixed timestamp;
      * B2 is reported as not configured, rclone as configured;
      * the full rclone success marker is one hour old;
      * the partial rclone success marker is 72 hours old;
      * every item in ``exporter.ESCROW_ITEMS`` was verified 60 seconds ago.

    The rendered output must report the full offsite copy as fresh, the
    partial copy as not fresh, zero missing escrow items, and DR phase 5
    with ``next_step="offsite_and_escrow_ready"``.
    """
    exporter = load_exporter()

    offsite_dir = tmp_path / "offsite"
    escrow_dir = tmp_path / "escrow"
    for directory in (offsite_dir, escrow_dir):
        directory.mkdir()

    now = 1_782_900_000
    one_hour = 3600

    # Redirect the exporter to the temporary directories and freeze its clock
    # so the marker ages below are deterministic.
    monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", offsite_dir)
    monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", escrow_dir)
    monkeypatch.setattr(exporter.time, "time", lambda: now)
    monkeypatch.setattr(exporter, "_b2_configured", lambda: False)
    monkeypatch.setattr(exporter, "_rclone_configured", lambda: True)

    full_marker = offsite_dir / "rclone-last-success"
    full_marker.write_text(str(now - one_hour), encoding="utf-8")
    partial_marker = offsite_dir / "rclone-partial-last-success"
    partial_marker.write_text(str(now - 72 * one_hour), encoding="utf-8")

    verified_at = str(now - 60)
    for item in exporter.ESCROW_ITEMS:
        evidence = escrow_dir / f"{item}.last_verified"
        evidence.write_text(verified_at, encoding="utf-8")

    rendered = "\n".join(exporter._offsite_and_escrow_metric_lines("110"))

    expected_lines = (
        'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1',
        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0',
        'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 0',
        'awoooi_backup_dr_phase{host="110",next_step="offsite_and_escrow_ready"} 5',
    )
    for line in expected_lines:
        assert line in rendered

View File

@@ -0,0 +1,56 @@
from __future__ import annotations
from pathlib import Path
import yaml
# Fourth ancestor directory of this file (parents[0] is the directory that
# contains it). NOTE(review): presumably the repository root — confirm against
# where this test module lives.
ROOT = Path(__file__).resolve().parents[3]
# Alert rule file that every test in this module parses.
ALERTS = ROOT / "ops" / "monitoring" / "alerts-unified.yml"
def load_alerts() -> dict[str, dict]:
    """Parse the alert file at ``ALERTS`` and index its alerting rules by name.

    Returns:
        A mapping from each rule's ``alert`` value to the rule dictionary.
        Rules without an ``alert`` key are left out. If two rules share a
        name, the one appearing later in the file wins.
    """
    document = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))
    return {
        rule["alert"]: rule
        for group in document["groups"]
        for rule in group.get("rules", [])
        if "alert" in rule
    }
def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
    """Pin the shape of the ``Host110SustainedModeratePressure`` alert rule.

    Checks that the expression contains the expected load and container-CPU
    fragments, that the rule fires after ``1m`` with ``auto_repair`` enabled,
    that the repair action invokes the controller script under
    ``/home/wooo/scripts`` with the 0.75 threshold flag, and that the runbook
    annotation contains the two required guard-rail phrases.
    """
    rule = load_alerts()["Host110SustainedModeratePressure"]
    expr = str(rule["expr"])
    annotations = rule["annotations"]
    action = annotations["auto_repair_action"]

    expr_fragments = (
        'awoooi_host_load5_per_core{host="110"} > 0.75',
        'docker_container_cpu_cores{host="110"',
        "> 2.0",
        "gitea",
        "stockplatform-v2-postgres-1",
    )
    for fragment in expr_fragments:
        assert fragment in expr

    assert rule["for"] == "1m"
    assert rule["labels"]["auto_repair"] == "true"

    action_fragments = (
        "/home/wooo/scripts/host-sustained-load-controller.py",
        "--load5-per-core-threshold 0.75",
    )
    for fragment in action_fragments:
        assert fragment in action

    runbook_phrases = (
        "不讀 secret",
        "禁止 Docker / systemd / Nginx / DB restart",
    )
    for phrase in runbook_phrases:
        assert phrase in annotations["runbook"]
def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
    """Check which controller path ``HostLoadAverageSustainedHigh`` invokes.

    The ``auto_repair_action`` annotation must reference the script under
    ``/home/wooo/scripts`` and must not contain the ``scripts/ops/`` form of
    the path.
    """
    rule = load_alerts()["HostLoadAverageSustainedHigh"]
    action = rule["annotations"]["auto_repair_action"]

    required_path = "/home/wooo/scripts/host-sustained-load-controller.py"
    rejected_path = "scripts/ops/host-sustained-load-controller.py"

    assert required_path in action
    assert rejected_path not in action
def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
    """Check the job selector used by the ``BackupAggregateRunFailed`` alert.

    The expression must select the failed-run counter for host 110 with
    ``exported_job!="backup_all"`` and must not contain a positive match on
    ``exported_job="backup_all"`` compared against zero.
    """
    rule = load_alerts()["BackupAggregateRunFailed"]
    expr = str(rule["expr"])

    required_selector = (
        'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}'
    )
    rejected_fragment = 'exported_job="backup_all"} > 0'

    assert required_selector in expr
    assert rejected_fragment not in expr