fix(ops): close 110 pressure and backup alert gaps

2026-07-01 23:32:55 +08:00
parent 7a53a5287f
commit d658f03ac5
13 changed files with 465 additions and 33 deletions
--- a/scripts/backup/backup-status.sh
+++ b/scripts/backup/backup-status.sh
@@ -116,6 +116,19 @@ metric_sum() {
    ' "${file}"
 }

+# Sum every labelled sample of a metric, skipping the backup_all wrapper series.
+#   $1 = metrics textfile path, $2 = metric name. Prints 0 for a missing or empty file.
+metric_sum_excluding_backup_all() {
+    local file="$1" metric="$2"
+    [ -s "${file}" ] || { echo 0; return 0; }
+    # Only lines of the form metric{...} are summed; a job or exported_job label equal to
+    # backup_all excludes the line.
+    awk -v metric="${metric}" '
+        $1 ~ ("^" metric "\\{") && $1 !~ /(exported_job|job)="backup_all"/ { sum += $2 }
+        END { print sum + 0 }
+    ' "${file}"
+}
+
 metric_first() {
    local file="$1"
    local metric="$2"
@@ -214,6 +227,8 @@ stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)"
 stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)"
 failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
 failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
+component_failed_110="$(metric_sum_excluding_backup_all "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
+component_failed_188="$(metric_sum_excluding_backup_all "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
 integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)"
 offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")"
 offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")"
@@ -221,7 +236,7 @@ offsite_rclone_configured="$(awk '/^awoooi_backup_offsite_configured\{.*provider
 offsite_rclone_fresh="$(awk '/^awoooi_backup_offsite_fresh\{.*provider="rclone"/ { print $2; found=1; exit } END { if (!found) print 0 }' "${TEXTFILE_110}" 2>/dev/null || echo 0)"
 escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")"

-core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_total_110 + failed_total_188 + integrity_stale_110))
+core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + component_failed_110 + component_failed_188 + integrity_stale_110))
 dr_warnings=0
 if [ "${offsite_configured%.*}" -lt 1 ] 2>/dev/null; then
    dr_warnings=$((dr_warnings + 1))
@@ -250,7 +265,7 @@ missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_
 backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")"
 last_backup_all="$(human_timestamp "${backup_all_ts}")"

-message="${headline}; 110備份=${fresh_total_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"
+message="${headline}; 110備份=${fresh_total_110}/13 fresh component_failed=${component_failed_110} aggregate_failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh component_failed=${component_failed_188} aggregate_failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"

 if [ "${core_blockers}" -gt 0 ]; then
    message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}"
--- a/scripts/backup/tests/test_backup_status_contract.py
+++ b/scripts/backup/tests/test_backup_status_contract.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[3]
+BACKUP_STATUS = ROOT / "scripts" / "backup" / "backup-status.sh"
+
+
+def test_backup_status_keeps_aggregate_failure_out_of_core_blockers() -> None:
+    """Pin that core_blockers adds component_failed_* and never failed_total_*."""
+    script = BACKUP_STATUS.read_text(encoding="utf-8")
+    for needle in ("metric_sum_excluding_backup_all", "component_failed_110", "component_failed_188"):
+        assert needle in script
+    assert "aggregate_failed=${failed_total_110}" in script
+    # Only the assignment line is inspected; failed_total_* may still appear elsewhere.
+    blockers = next(row for row in script.splitlines() if row.startswith("core_blockers="))
+    for wanted in ("component_failed_110", "component_failed_188"):
+        assert wanted in blockers
+    for banned in ("failed_total_110", "failed_total_188"):
+        assert banned not in blockers
--- a/scripts/ops/backup-alert-label-contract-check.py
+++ b/scripts/ops/backup-alert-label-contract-check.py
@@ -97,10 +97,10 @@ def static_check(path: Path, baseline_path: Path) -> list[str]:
    rule = _require_alert(alerts, "BackupAggregateRunFailed")
    _require_contains(
        str(rule.get("expr", "")),
-        'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}',
+        'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}',
        "BackupAggregateRunFailed expr",
    )
-    lines.append("OK BackupAggregateRunFailed filters exported_job=backup_all")
+    lines.append("OK BackupAggregateRunFailed excludes aggregate-only backup_all noise")

    rule = _require_alert(alerts, "BackupConfigCapturePartial")
    _require_contains(str(rule.get("expr", "")), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr")
--- a/scripts/ops/backup-health-textfile-exporter.py
+++ b/scripts/ops/backup-health-textfile-exporter.py
@@ -643,6 +643,9 @@ def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
    if not offsite_configured:
        next_step = "configure_google_drive_rclone_on_110_tty"
        phase = 1
+    elif escrow_missing_count == 0 and full_fresh:
+        next_step = "offsite_and_escrow_ready"
+        phase = 5
    elif escrow_missing_count > 0 and full_fresh:
        next_step = "complete_credential_escrow_review"
        phase = 3
--- a/scripts/ops/tests/test_backup_health_textfile_exporter.py
+++ b/scripts/ops/tests/test_backup_health_textfile_exporter.py
@@ -114,3 +114,35 @@ def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_sta
    )
    assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered
    assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered
+
+
+def test_dr_phase_ready_when_full_offsite_is_fresh_and_escrow_is_complete(
+    tmp_path: Path, monkeypatch
+) -> None:
+    """Expect DR phase 5 when the full rclone sync is fresh and every escrow item is verified."""
+    exporter = load_exporter()
+    frozen_now = 1_782_900_000
+    status_root, evidence_root = tmp_path / "offsite", tmp_path / "escrow"
+    for folder in (status_root, evidence_root):
+        folder.mkdir()
+
+    monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", status_root)
+    monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", evidence_root)
+    monkeypatch.setattr(exporter.time, "time", lambda: frozen_now)
+    monkeypatch.setattr(exporter, "_b2_configured", lambda: False)
+    monkeypatch.setattr(exporter, "_rclone_configured", lambda: True)
+    # Full sync stamp is 1h old; the partial stamp is 72h old and is asserted stale below.
+    (status_root / "rclone-last-success").write_text(str(frozen_now - 3600), encoding="utf-8")
+    (status_root / "rclone-partial-last-success").write_text(str(frozen_now - 72 * 3600), encoding="utf-8")
+    for item in exporter.ESCROW_ITEMS:
+        (evidence_root / f"{item}.last_verified").write_text(str(frozen_now - 60), encoding="utf-8")
+
+    rendered = "\n".join(exporter._offsite_and_escrow_metric_lines("110"))
+
+    partial_stale = (
+        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0'
+    )
+    assert 'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1' in rendered
+    assert partial_stale in rendered
+    assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 0' in rendered
+    assert 'awoooi_backup_dr_phase{host="110",next_step="offsite_and_escrow_ready"} 5' in rendered
--- a/scripts/ops/tests/test_host_pressure_alert_contract.py
+++ b/scripts/ops/tests/test_host_pressure_alert_contract.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+
+ROOT = Path(__file__).resolve().parents[3]
+ALERTS = ROOT / "ops" / "monitoring" / "alerts-unified.yml"
+
+
+def load_alerts() -> dict[str, dict]:
+    """Index the alerting rules found in the ALERTS YAML file by alert name.
+
+    Rule entries without an "alert" key are skipped.
+    """
+    groups = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))["groups"]
+    rules = (entry for section in groups for entry in section.get("rules", []))
+    return {entry["alert"]: entry for entry in rules if "alert" in entry}
+
+
+def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
+    """Pin the 110 moderate-pressure alert: expr, hold time, repair action, runbook."""
+    rule = load_alerts()["Host110SustainedModeratePressure"]
+    expr, notes = str(rule["expr"]), rule["annotations"]
+    action = notes["auto_repair_action"]
+    for fragment in (
+        'awoooi_host_load5_per_core{host="110"} > 0.75',
+        'docker_container_cpu_cores{host="110"',
+        "> 2.0",
+        "gitea",
+        "stockplatform-v2-postgres-1",
+    ):
+        assert fragment in expr
+    assert rule["for"] == "1m"
+    assert rule["labels"]["auto_repair"] == "true"
+    assert "/home/wooo/scripts/host-sustained-load-controller.py" in action
+    assert "--load5-per-core-threshold 0.75" in action
+    assert "不讀 secret" in notes["runbook"]
+    assert "禁止 Docker / systemd / Nginx / DB restart" in notes["runbook"]
+
+
+def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
+    """HostLoadAverageSustainedHigh must call the /home/wooo/scripts controller, not scripts/ops."""
+    notes = load_alerts()["HostLoadAverageSustainedHigh"]["annotations"]
+    repair = notes["auto_repair_action"]
+    assert "/home/wooo/scripts/host-sustained-load-controller.py" in repair
+    assert "scripts/ops/host-sustained-load-controller.py" not in repair
+
+
+def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
+    """BackupAggregateRunFailed must use exported_job!="backup_all" and drop the equality matcher."""
+    rule = load_alerts()["BackupAggregateRunFailed"]
+    expr = str(rule["expr"])
+    assert 'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}' in expr
+    assert 'exported_job="backup_all"} > 0' not in expr
--- a/scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh
+++ b/scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+# Apply the committed redacted P0-005 credential escrow closeout receipt to
+# host 110 as non-secret marker files. This script never reads credential
+# values; it only forwards non-secret evidence refs that already passed the
+# repository-side closeout contract.
+
+set -euo pipefail
+
+TARGET_HOST="${TARGET_HOST:-wooo@192.168.0.110}"
+SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-15}"
+SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
+RECEIPT_PATH="${RECEIPT_PATH:-docs/operations/awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json}"
+REMOTE_MARKER_SCRIPT="${REMOTE_MARKER_SCRIPT:-/backup/scripts/mark-credential-escrow-verified.sh}"
+REMOTE_ESCROW_DIR="${REMOTE_ESCROW_DIR:-/backup/escrow-evidence}"
+NOTE="${NOTE:-p0-005-controlled-closeout-receipt}"
+MODE="check"
+ROLLBACK_DIR=""
+
+SSH_OPTS=(
+  -n
+  -o BatchMode=yes
+  -o ConnectTimeout="$SSH_CONNECT_TIMEOUT"
+  -o ConnectionAttempts=1
+  -o ServerAliveInterval=5
+  -o ServerAliveCountMax=1
+  -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
+)
+
+usage() {  # Print the CLI synopsis and the no-secret rules to stdout.
+  cat <<'USAGE'
+Usage:
+  apply-credential-escrow-closeout-receipt-to-110.sh --check
+  apply-credential-escrow-closeout-receipt-to-110.sh --dry-run
+  apply-credential-escrow-closeout-receipt-to-110.sh --apply
+  apply-credential-escrow-closeout-receipt-to-110.sh --rollback --rollback-dir <dir>
+
+Rules:
+  - Reads only the committed redacted closeout receipt.
+  - Sends only non-secret evidence refs to host 110.
+  - Does not read passwords, tokens, private keys, .env, auth, sessions, or SQLite.
+USAGE
+}
+
+while [ "$#" -gt 0 ]; do  # parse CLI flags; when several mode flags are given the last one wins
+  case "$1" in
+    --check|--dry-run|--apply|--rollback)
+      MODE="${1#--}"  # strip the leading dashes: --dry-run -> dry-run
+      shift
+      ;;
+    --rollback-dir)
+      ROLLBACK_DIR="${2:?--rollback-dir requires a value}"  # a bare shift 2 would die silently under set -e
+      shift 2
+      ;;
+    --receipt)
+      RECEIPT_PATH="${2:?--receipt requires a value}"  # same guard: report the missing value, then exit
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [ ! -f "$RECEIPT_PATH" ]; then
+  echo "receipt not found: $RECEIPT_PATH" >&2
+  exit 66
+fi
+
+receipt_items() {  # Validate the receipt; print one "item_id<TAB>evidence_ref" line per item.
+  python3 - "$RECEIPT_PATH" <<'PY'
+import json
+import re
+import sys
+from pathlib import Path
+
+path = Path(sys.argv[1])
+data = json.loads(path.read_text(encoding="utf-8"))
+expected = [
+    "restic_repository_password",
+    "offsite_provider_credentials",
+    "break_glass_admin_credentials",
+    "dns_registrar_recovery",
+    "oauth_ai_provider_recovery",
+]
+if data.get("schema_version") != "credential_escrow_evidence_controlled_closeout_receipt_v1":
+    raise SystemExit("receipt schema mismatch")
+if data.get("status") != "ready_for_p0_005_controlled_closeout":
+    raise SystemExit("receipt status is not closeout-ready")
+result = data.get("result") or {}
+if result.get("accepted_item_count") != len(expected):
+    raise SystemExit("receipt accepted item count mismatch")
+if result.get("projected_effective_escrow_missing_count") != 0:
+    raise SystemExit("receipt projected missing count is not zero")
+if result.get("secret_value_collection_allowed") is not False:
+    raise SystemExit("receipt secret collection flag is not false")
+if result.get("runtime_gate_count") != 0:
+    raise SystemExit("receipt runtime gate count is not zero")
+items = data.get("evidence_refs") or []
+if [item.get("item_id") for item in items] != expected:
+    raise SystemExit("receipt item order mismatch")
+safe_ref = re.compile(r"^[A-Za-z0-9._:-]{6,160}$")
+for item in items:
+    item_id = str(item.get("item_id") or "")
+    ref = str(item.get("non_secret_evidence_ref") or "")
+    if item.get("contains_secret_value") is not False:
+        raise SystemExit(f"{item_id}: contains_secret_value must be false")
+    if not safe_ref.fullmatch(ref):  # match() lets "$" accept a trailing newline, adding a blank row
+        raise SystemExit(f"{item_id}: unsafe evidence ref")
+    print(f"{item_id}\t{ref}")
+PY
+}
+
+remote_status() {  # Run the remote marker script with --status over SSH and relay its output.
+  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "$REMOTE_MARKER_SCRIPT --status"
+}
+
+remote_prepare_rollback() {  # Snapshot each item's current marker state on the target host.
+  local items_shell="$1"  # space-separated item ids; expanded unquoted into the remote for-loop
+  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "set -euo pipefail
+rollback=\"${REMOTE_ESCROW_DIR}/.rollback-p0-005-closeout-\$(date +%Y%m%d-%H%M%S)\"
+mkdir -p \"\$rollback\"
+for item in ${items_shell}; do
+  marker=\"${REMOTE_ESCROW_DIR}/\${item}.last_verified\"
+  if [ -f \"\$marker\" ]; then
+    cp -p \"\$marker\" \"\$rollback/\${item}.last_verified\"
+  else
+    : > \"\$rollback/\${item}.was_missing\"
+  fi
+done
+echo \"ROLLBACK_DIR=\$rollback\""
+}
+
+# Invoke the remote marker script for one escrow item.
+#   $1 item id, $2 non-secret evidence ref, $3 extra flag (for example --dry-run, or empty).
+run_marker_command() {
+  local item="$1" evidence_ref="$2" mode_flag="$3"
+  local safe_item safe_ref safe_note
+  # %q keeps each value a single word when the remote shell re-parses the command line.
+  printf -v safe_item '%q' "$item"
+  printf -v safe_ref '%q' "$evidence_ref"
+  printf -v safe_note '%q' "$NOTE"
+  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "$REMOTE_MARKER_SCRIPT --item $safe_item --evidence-id $safe_ref --note $safe_note $mode_flag"
+}
+
+rollback_markers() {  # Restore or delete remote markers using a rollback dir printed by --apply.
+  [ -n "$ROLLBACK_DIR" ] || { echo "--rollback requires --rollback-dir" >&2; exit 2; }
+  local q_dir q_escrow  # %q-quoted like run_marker_command, so no character can end the quoting early
+  printf -v q_dir '%q' "$ROLLBACK_DIR"
+  printf -v q_escrow '%q' "$REMOTE_ESCROW_DIR"
+  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "set -euo pipefail
+test -d $q_dir
+for marker in $q_dir/*.last_verified; do
+  [ -e \"\$marker\" ] || continue
+  item=\$(basename \"\$marker\" .last_verified)
+  cp -p \"\$marker\" $q_escrow/\"\${item}.last_verified\"
+done
+for missing in $q_dir/*.was_missing; do
+  [ -e \"\$missing\" ] || continue
+  item=\$(basename \"\$missing\" .was_missing)
+  rm -f $q_escrow/\"\${item}.last_verified\"
+done
+echo ROLLBACK_APPLIED $q_dir
+${REMOTE_MARKER_SCRIPT} --status"
+}
+
+items_tsv="$(receipt_items)"  # validated item<TAB>ref rows; with set -e a rejected receipt stops the script here
+items_shell="$(awk '{printf "%s ", $1}' <<<"$items_tsv")"  # item ids only, space-separated
+
+case "$MODE" in
+  check)
+    echo "RECEIPT_OK $RECEIPT_PATH"
+    echo "$items_tsv" | awk '{print "RECEIPT_ITEM item="$1" evidence_ref="$2}'
+    remote_status
+    ;;
+  dry-run)
+    echo "RECEIPT_OK $RECEIPT_PATH"
+    while IFS=$'\t' read -r item evidence_ref; do
+      run_marker_command "$item" "$evidence_ref" "--dry-run"
+    done <<<"$items_tsv"  # SSH_OPTS includes -n, so ssh leaves the remaining rows on stdin alone
+    ;;
+  apply)
+    echo "RECEIPT_OK $RECEIPT_PATH"
+    remote_prepare_rollback "$items_shell"  # snapshot first; prints ROLLBACK_DIR=... for a later --rollback
+    while IFS=$'\t' read -r item evidence_ref; do
+      run_marker_command "$item" "$evidence_ref" ""  # empty third argument: no --dry-run flag
+    done <<<"$items_tsv"
+    remote_status
+    ;;
+  rollback)
+    rollback_markers  # NOTE(review): the receipt is still validated above before a rollback can run
+    ;;
+  *)
+    echo "unsupported mode: $MODE" >&2
+    exit 2
+    ;;
+esac
--- a/scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py
+++ b/scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py
@@ -9,6 +9,12 @@ from pathlib import Path
 ROOT = Path(__file__).resolve().parents[3]
 SCRIPT = ROOT / "scripts" / "reboot-recovery" / "dr-escrow-evidence-checklist.py"
 PREFLIGHT_SCRIPT = ROOT / "scripts" / "reboot-recovery" / "post-reboot-owner-response-preflight.py"
+APPLY_RECEIPT_SCRIPT = (
+    ROOT
+    / "scripts"
+    / "reboot-recovery"
+    / "apply-credential-escrow-closeout-receipt-to-110.sh"
+)

 ITEMS = {
    "restic_repository_password",
@@ -98,6 +104,25 @@ def test_checklist_outputs_marker_dry_run_commands_only() -> None:
        assert "password=" not in command.lower()


+def test_apply_closeout_receipt_script_is_no_secret_controlled_marker_write() -> None:
+    """Pin the apply script's modes, receipt guards, ssh -n option and banned secret reads."""
+    text = APPLY_RECEIPT_SCRIPT.read_text(encoding="utf-8")
+    assert "--dry-run" in text
+    assert "--apply" in text
+    assert "--rollback" in text
+    assert "awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json" in text
+    assert "contains_secret_value" in text
+    assert "secret_value_collection_allowed" in text
+    assert "mark-credential-escrow-verified.sh" in text
+    assert "ROLLBACK_DIR=" in text
+    assert any(line.strip() == "-n" for line in text.splitlines())  # bare "-n" also matches "--note"
+    assert "cat ~/.ssh" not in text
+    assert "cat .env" not in text
+    assert "auth.json" not in text
+    assert ".sqlite" not in text
+    assert "cat ~/.codex/sessions" not in text
+
+
 def test_unfilled_skeleton_fails_closed_against_preflight(tmp_path: Path) -> None:
    payload = load_checklist()