fix(ops): close 110 pressure and backup alert gaps
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m55s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m55s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -114,3 +114,35 @@ def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_sta
|
||||
)
|
||||
assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered
|
||||
assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered
|
||||
|
||||
|
||||
def test_dr_phase_ready_when_full_offsite_is_fresh_and_escrow_is_complete(
    tmp_path: Path, monkeypatch
) -> None:
    """Check the DR phase metric when full offsite and escrow are both fresh.

    Setup: the clock is frozen, only rclone is reported as configured, the
    full rclone success marker is one hour old, the partial marker is 72
    hours old, and every escrow item was verified 60 seconds ago.

    Expected metric lines for host "110": full offsite fresh = 1, partial
    fresh = 0, escrow missing count = 0, and DR phase 5 with
    ``next_step="offsite_and_escrow_ready"``.
    """
    exporter = load_exporter()
    frozen_now = 1_782_900_000
    one_hour = 3600

    offsite_dir = tmp_path / "offsite"
    escrow_dir = tmp_path / "escrow"
    for directory in (offsite_dir, escrow_dir):
        directory.mkdir()

    # (target, attribute, replacement) triples, applied in this order.
    patches = (
        (exporter, "OFFSITE_STATUS_DIR", offsite_dir),
        (exporter, "ESCROW_EVIDENCE_DIR", escrow_dir),
        (exporter.time, "time", lambda: frozen_now),
        (exporter, "_b2_configured", lambda: False),
        (exporter, "_rclone_configured", lambda: True),
    )
    for target, attribute, replacement in patches:
        monkeypatch.setattr(target, attribute, replacement)

    # Marker files hold the epoch second of the last successful run.
    marker_timestamps = {
        "rclone-last-success": frozen_now - one_hour,
        "rclone-partial-last-success": frozen_now - 72 * one_hour,
    }
    for marker_name, timestamp in marker_timestamps.items():
        (offsite_dir / marker_name).write_text(str(timestamp), encoding="utf-8")

    verified_at = str(frozen_now - 60)
    for item in exporter.ESCROW_ITEMS:
        (escrow_dir / f"{item}.last_verified").write_text(verified_at, encoding="utf-8")

    rendered = "\n".join(exporter._offsite_and_escrow_metric_lines("110"))

    expected_lines = (
        'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1',
        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0',
        'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 0',
        'awoooi_backup_dr_phase{host="110",next_step="offsite_and_escrow_ready"} 5',
    )
    for expected in expected_lines:
        assert expected in rendered
|
||||
|
||||
56
scripts/ops/tests/test_host_pressure_alert_contract.py
Normal file
56
scripts/ops/tests/test_host_pressure_alert_contract.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
# Directory three levels above this test file's own directory.
ROOT = Path(__file__).resolve().parents[3]
# Alert rules file parsed by the tests in this module.
ALERTS = ROOT.joinpath("ops", "monitoring", "alerts-unified.yml")
|
||||
|
||||
|
||||
def load_alerts() -> dict[str, dict]:
    """Parse the alert rules file and index its alerting rules by name.

    Returns:
        Mapping from each rule's ``alert`` value to the full rule mapping.
        Rules without an ``alert`` key are skipped. If the same alert name
        appears more than once, the last occurrence wins.
    """
    document = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))
    return {
        entry["alert"]: entry
        for section in document["groups"]
        for entry in section.get("rules", [])
        if "alert" in entry
    }
|
||||
|
||||
|
||||
def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
    """Pin the contract of the ``Host110SustainedModeratePressure`` alert.

    Checks the expression fragments, the ``for`` duration, the
    ``auto_repair`` label, the controller command in the
    ``auto_repair_action`` annotation, and two phrases in the runbook.
    """
    rule = load_alerts()["Host110SustainedModeratePressure"]

    expression = str(rule["expr"])
    notes = rule["annotations"]
    repair_action = notes["auto_repair_action"]

    expression_fragments = (
        'awoooi_host_load5_per_core{host="110"} > 0.75',
        'docker_container_cpu_cores{host="110"',
        "> 2.0",
        "gitea",
        "stockplatform-v2-postgres-1",
    )
    for fragment in expression_fragments:
        assert fragment in expression

    assert rule["for"] == "1m"
    assert rule["labels"]["auto_repair"] == "true"

    action_fragments = (
        "/home/wooo/scripts/host-sustained-load-controller.py",
        "--load5-per-core-threshold 0.75",
    )
    for fragment in action_fragments:
        assert fragment in repair_action

    runbook_phrases = (
        "不讀 secret",
        "禁止 Docker / systemd / Nginx / DB restart",
    )
    for phrase in runbook_phrases:
        assert phrase in notes["runbook"]
|
||||
|
||||
|
||||
def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
    """Check which controller path ``HostLoadAverageSustainedHigh`` invokes.

    The ``auto_repair_action`` annotation must contain the path under
    ``/home/wooo/scripts`` and must not contain the ``scripts/ops`` path.
    """
    rule = load_alerts()["HostLoadAverageSustainedHigh"]
    repair_action = rule["annotations"]["auto_repair_action"]

    deployed_path = "/home/wooo/scripts/host-sustained-load-controller.py"
    in_repo_path = "scripts/ops/host-sustained-load-controller.py"

    assert deployed_path in repair_action
    assert in_repo_path not in repair_action
|
||||
|
||||
|
||||
def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
    """Check the job filter in the ``BackupAggregateRunFailed`` expression.

    The expression must select the failed-run count for host "110" with
    ``exported_job!="backup_all"`` and must not contain a positive match on
    ``exported_job="backup_all"``.
    """
    expression = str(load_alerts()["BackupAggregateRunFailed"]["expr"])

    required_selector = 'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}'
    forbidden_selector = 'exported_job="backup_all"} > 0'

    assert required_selector in expression
    assert forbidden_selector not in expression
|
||||
Reference in New Issue
Block a user