fix(ops): close 110 pressure and backup alert gaps
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m55s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 23:32:55 +08:00
parent 7a53a5287f
commit d658f03ac5
13 changed files with 465 additions and 33 deletions

View File

@@ -116,6 +116,19 @@ metric_sum() {
' "${file}"
}
# Sum every labelled sample of a metric in a Prometheus textfile, skipping the
# series that carry job="backup_all" or exported_job="backup_all".
#   $1  path of the textfile to read
#   $2  metric name; only samples written as `<metric>{...} <value>` are counted
# Prints the sum on stdout; prints 0 when the file is missing or empty.
metric_sum_excluding_backup_all() {
  local file="$1"
  local metric="$2"
  if [ ! -s "${file}" ]; then
    echo 0
    return 0
  fi
  awk -v metric="${metric}" '
    # The label name is anchored on "{" or "," so that only the job and
    # exported_job labels are tested, not every label whose name ends in "job".
    $1 ~ ("^" metric "\\{") && $1 !~ /[{,](exported_job|job)="backup_all"/ { sum += $2 }
    END { print sum + 0 }
  ' "${file}"
}
metric_first() {
local file="$1"
local metric="$2"
@@ -214,6 +227,8 @@ stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)"
stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)"
failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
component_failed_110="$(metric_sum_excluding_backup_all "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")"
component_failed_188="$(metric_sum_excluding_backup_all "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")"
integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)"
offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")"
offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")"
@@ -221,7 +236,7 @@ offsite_rclone_configured="$(awk '/^awoooi_backup_offsite_configured\{.*provider
offsite_rclone_fresh="$(awk '/^awoooi_backup_offsite_fresh\{.*provider="rclone"/ { print $2; found=1; exit } END { if (!found) print 0 }' "${TEXTFILE_110}" 2>/dev/null || echo 0)"
escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")"
core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_total_110 + failed_total_188 + integrity_stale_110))
core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + component_failed_110 + component_failed_188 + integrity_stale_110))
dr_warnings=0
if [ "${offsite_configured%.*}" -lt 1 ] 2>/dev/null; then
dr_warnings=$((dr_warnings + 1))
@@ -250,7 +265,7 @@ missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_
backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")"
last_backup_all="$(human_timestamp "${backup_all_ts}")"
message="${headline}; 110備份=${fresh_total_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"
message="${headline}; 110備份=${fresh_total_110}/13 fresh component_failed=${component_failed_110} aggregate_failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh component_failed=${component_failed_188} aggregate_failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}"
if [ "${core_blockers}" -gt 0 ]; then
message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}"

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
from pathlib import Path
ROOT = Path(__file__).resolve().parents[3]
BACKUP_STATUS = ROOT / "scripts" / "backup" / "backup-status.sh"
def test_backup_status_keeps_aggregate_failure_out_of_core_blockers() -> None:
    """Check that ``core_blockers`` sums component failures, not the aggregate.

    This is a static check on the text of ``backup-status.sh``: the helper and
    the per-host component counters must exist, the status message must still
    report the aggregate counter for both hosts, and the ``core_blockers``
    assignment must use only the component counters.
    """
    text = BACKUP_STATUS.read_text(encoding="utf-8")

    assert "metric_sum_excluding_backup_all" in text
    assert "component_failed_110" in text
    assert "component_failed_188" in text
    # The aggregate counter stays visible in the message for both hosts.
    assert "aggregate_failed=${failed_total_110}" in text
    assert "aggregate_failed=${failed_total_188}" in text

    core_line = next(
        (line for line in text.splitlines() if line.startswith("core_blockers=")),
        None,
    )
    # Fail with a readable message instead of a bare StopIteration.
    assert core_line is not None, "no line starting with 'core_blockers=' found"
    assert "component_failed_110" in core_line
    assert "component_failed_188" in core_line
    assert "failed_total_110" not in core_line
    assert "failed_total_188" not in core_line

View File

@@ -97,10 +97,10 @@ def static_check(path: Path, baseline_path: Path) -> list[str]:
rule = _require_alert(alerts, "BackupAggregateRunFailed")
_require_contains(
str(rule.get("expr", "")),
'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}',
'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}',
"BackupAggregateRunFailed expr",
)
lines.append("OK BackupAggregateRunFailed filters exported_job=backup_all")
lines.append("OK BackupAggregateRunFailed excludes aggregate-only backup_all noise")
rule = _require_alert(alerts, "BackupConfigCapturePartial")
_require_contains(str(rule.get("expr", "")), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr")

View File

@@ -643,6 +643,9 @@ def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
if not offsite_configured:
next_step = "configure_google_drive_rclone_on_110_tty"
phase = 1
elif escrow_missing_count == 0 and full_fresh:
next_step = "offsite_and_escrow_ready"
phase = 5
elif escrow_missing_count > 0 and full_fresh:
next_step = "complete_credential_escrow_review"
phase = 3

View File

@@ -114,3 +114,35 @@ def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_sta
)
assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered
assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered
def test_dr_phase_ready_when_full_offsite_is_fresh_and_escrow_is_complete(
    tmp_path: Path, monkeypatch
) -> None:
    """Expect DR phase 5 when full offsite is fresh and no escrow item is missing.

    The full rclone marker is one hour old and the partial marker is 72 hours
    old, while every escrow item has a marker written 60 seconds before the
    frozen clock. The rendered metrics must report the full offsite as fresh,
    the partial scope as stale, zero missing escrow items, and the
    ``offsite_and_escrow_ready`` next step.
    """
    exporter = load_exporter()
    frozen_now = 1_782_900_000

    offsite_dir = tmp_path / "offsite"
    escrow_dir = tmp_path / "escrow"
    for directory in (offsite_dir, escrow_dir):
        directory.mkdir()

    # Point the exporter at the temporary directories and freeze its clock.
    monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", offsite_dir)
    monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", escrow_dir)
    monkeypatch.setattr(exporter.time, "time", lambda: frozen_now)
    monkeypatch.setattr(exporter, "_b2_configured", lambda: False)
    monkeypatch.setattr(exporter, "_rclone_configured", lambda: True)

    marker_ages = {
        "rclone-last-success": 3600,
        "rclone-partial-last-success": 72 * 3600,
    }
    for marker_name, age_seconds in marker_ages.items():
        (offsite_dir / marker_name).write_text(str(frozen_now - age_seconds), encoding="utf-8")
    for escrow_item in exporter.ESCROW_ITEMS:
        marker = escrow_dir / f"{escrow_item}.last_verified"
        marker.write_text(str(frozen_now - 60), encoding="utf-8")

    rendered = "\n".join(exporter._offsite_and_escrow_metric_lines("110"))

    expected_lines = (
        'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1',
        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0',
        'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 0',
        'awoooi_backup_dr_phase{host="110",next_step="offsite_and_escrow_ready"} 5',
    )
    for expected in expected_lines:
        assert expected in rendered

View File

@@ -0,0 +1,56 @@
from __future__ import annotations
from pathlib import Path
import yaml
ROOT = Path(__file__).resolve().parents[3]
ALERTS = ROOT / "ops" / "monitoring" / "alerts-unified.yml"
def load_alerts() -> dict[str, dict]:
    """Return the rules in ``ALERTS`` that define an alert, keyed by alert name.

    Rules without an ``alert`` key are skipped. If an alert name occurs more
    than once, the last definition in file order wins.
    """
    document = yaml.safe_load(ALERTS.read_text(encoding="utf-8"))
    return {
        rule["alert"]: rule
        for group in document["groups"]
        for rule in group.get("rules", [])
        if "alert" in rule
    }
def test_110_moderate_pressure_alert_routes_to_live_controller() -> None:
    """Pin the expression, timing, labels and repair action of the 110 pressure alert."""
    rule = load_alerts()["Host110SustainedModeratePressure"]
    expr = str(rule["expr"])
    annotations = rule["annotations"]
    action = annotations["auto_repair_action"]

    expr_fragments = (
        'awoooi_host_load5_per_core{host="110"} > 0.75',
        'docker_container_cpu_cores{host="110"',
        "> 2.0",
        "gitea",
        "stockplatform-v2-postgres-1",
    )
    for fragment in expr_fragments:
        assert fragment in expr

    assert rule["for"] == "1m"
    assert rule["labels"]["auto_repair"] == "true"

    action_fragments = (
        "/home/wooo/scripts/host-sustained-load-controller.py",
        "--load5-per-core-threshold 0.75",
    )
    for fragment in action_fragments:
        assert fragment in action

    # The runbook must spell out the no-secret and no-restart guard rails.
    runbook = annotations["runbook"]
    for fragment in ("不讀 secret", "禁止 Docker / systemd / Nginx / DB restart"):
        assert fragment in runbook
def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None:
    """The critical load alert must call the controller by its /home/wooo path."""
    annotations = load_alerts()["HostLoadAverageSustainedHigh"]["annotations"]
    repair_action = annotations["auto_repair_action"]

    deployed_path = "/home/wooo/scripts/host-sustained-load-controller.py"
    repo_relative_path = "scripts/ops/host-sustained-load-controller.py"
    assert deployed_path in repair_action
    assert repo_relative_path not in repair_action
def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None:
    """The aggregate alert must select on ``exported_job!="backup_all"``."""
    rule = load_alerts()["BackupAggregateRunFailed"]
    expression = str(rule["expr"])

    required_selector = 'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}'
    forbidden_selector = 'exported_job="backup_all"} > 0'
    assert required_selector in expression
    assert forbidden_selector not in expression

View File

@@ -0,0 +1,202 @@
#!/usr/bin/env bash
# Apply the committed redacted P0-005 credential escrow closeout receipt to
# host 110 as non-secret marker files. This script never reads credential
# values; it only forwards non-secret evidence refs that already passed the
# repository-side closeout contract.
set -euo pipefail
# Settings below that use the ${NAME:-default} form can be overridden from the
# environment; the default applies when the variable is unset or empty.
TARGET_HOST="${TARGET_HOST:-wooo@192.168.0.110}"
SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-15}"
SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}"
RECEIPT_PATH="${RECEIPT_PATH:-docs/operations/awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json}"
REMOTE_MARKER_SCRIPT="${REMOTE_MARKER_SCRIPT:-/backup/scripts/mark-credential-escrow-verified.sh}"
REMOTE_ESCROW_DIR="${REMOTE_ESCROW_DIR:-/backup/escrow-evidence}"
NOTE="${NOTE:-p0-005-controlled-closeout-receipt}"
# MODE and ROLLBACK_DIR always start from these values regardless of the
# environment; only the command-line flags change them.
MODE="check"
ROLLBACK_DIR=""
# Options passed to every ssh call. -n redirects ssh's stdin from /dev/null so
# it cannot consume the input of an enclosing `while read` loop; BatchMode=yes
# disables interactive prompts; the connect and keep-alive options bound how
# long an unreachable host can stall the script.
SSH_OPTS=(
-n
-o BatchMode=yes
-o ConnectTimeout="$SSH_CONNECT_TIMEOUT"
-o ConnectionAttempts=1
-o ServerAliveInterval=5
-o ServerAliveCountMax=1
-o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING"
)
# Print the command-line synopsis and the no-secret handling rules to stdout.
# Callers redirect the output to stderr when reporting an argument error.
usage() {
cat <<'USAGE'
Usage:
apply-credential-escrow-closeout-receipt-to-110.sh --check
apply-credential-escrow-closeout-receipt-to-110.sh --dry-run
apply-credential-escrow-closeout-receipt-to-110.sh --apply
apply-credential-escrow-closeout-receipt-to-110.sh --rollback --rollback-dir <dir>
Rules:
- Reads only the committed redacted closeout receipt.
- Sends only non-secret evidence refs to host 110.
- Does not read passwords, tokens, private keys, .env, auth, sessions, or SQLite.
USAGE
}
# Parse the command line.
#   --check | --dry-run | --apply | --rollback   select MODE (the last one wins)
#   --rollback-dir <dir>                         remote snapshot directory for --rollback
#   --receipt <path>                             override RECEIPT_PATH
#   -h | --help                                  print usage and exit 0
# Unknown arguments and value flags without a value exit with status 2.
while [ "$#" -gt 0 ]; do
  case "$1" in
    --check|--dry-run|--apply|--rollback)
      MODE="${1#--}"
      shift
      ;;
    --rollback-dir|--receipt)
      # Without this guard a trailing value flag makes `shift 2` fail, and
      # `set -e` then ends the script without printing any explanation.
      if [ "$#" -lt 2 ]; then
        echo "$1 requires a value" >&2
        usage >&2
        exit 2
      fi
      if [ "$1" = "--rollback-dir" ]; then
        ROLLBACK_DIR="$2"
      else
        RECEIPT_PATH="$2"
      fi
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

# Stop before any remote call when the receipt file is not present locally.
if [ ! -f "$RECEIPT_PATH" ]; then
  echo "receipt not found: $RECEIPT_PATH" >&2
  exit 66
fi
# Validate the closeout receipt at RECEIPT_PATH and print one
# "<item_id><TAB><non_secret_evidence_ref>" line per escrow item.
#
# The embedded Python exits non-zero with a message (no output lines for the
# failing item or any after it) unless the receipt has the expected schema
# version and status, the expected result counters and flags, exactly the five
# expected item ids in the expected order, contains_secret_value set to false
# on every item, and an evidence ref made only of [A-Za-z0-9._:-] with a length
# of 6 to 160 characters.
receipt_items() {
  python3 - "$RECEIPT_PATH" <<'PY'
import json
import re
import sys
from pathlib import Path

path = Path(sys.argv[1])
data = json.loads(path.read_text(encoding="utf-8"))
expected = [
    "restic_repository_password",
    "offsite_provider_credentials",
    "break_glass_admin_credentials",
    "dns_registrar_recovery",
    "oauth_ai_provider_recovery",
]
if data.get("schema_version") != "credential_escrow_evidence_controlled_closeout_receipt_v1":
    raise SystemExit("receipt schema mismatch")
if data.get("status") != "ready_for_p0_005_controlled_closeout":
    raise SystemExit("receipt status is not closeout-ready")
result = data.get("result") or {}
if result.get("accepted_item_count") != len(expected):
    raise SystemExit("receipt accepted item count mismatch")
if result.get("projected_effective_escrow_missing_count") != 0:
    raise SystemExit("receipt projected missing count is not zero")
if result.get("secret_value_collection_allowed") is not False:
    raise SystemExit("receipt secret collection flag is not false")
if result.get("runtime_gate_count") != 0:
    raise SystemExit("receipt runtime gate count is not zero")
items = data.get("evidence_refs") or []
if [item.get("item_id") for item in items] != expected:
    raise SystemExit("receipt item order mismatch")
safe_ref = re.compile(r"^[A-Za-z0-9._:-]{6,160}$")
for item in items:
    item_id = str(item.get("item_id") or "")
    ref = str(item.get("non_secret_evidence_ref") or "")
    if item.get("contains_secret_value") is not False:
        raise SystemExit(f"{item_id}: contains_secret_value must be false")
    # fullmatch, not match: with match() the "$" anchor also accepts a ref that
    # ends in a newline, which would add a blank line to the TSV output.
    if not safe_ref.fullmatch(ref):
        raise SystemExit(f"{item_id}: unsafe evidence ref")
    print(f"{item_id}\t{ref}")
PY
}
# Run the remote marker script with --status on TARGET_HOST over ssh; its
# output is passed through unchanged.
remote_status() {
ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "$REMOTE_MARKER_SCRIPT --status"
}
# Snapshot the current marker state on TARGET_HOST before markers are rewritten.
#   $1  space-separated item ids; the list is spliced unquoted into the remote
#       `for` loop, so it must contain only shell-safe words
# On the remote host this creates a timestamped
# ".rollback-p0-005-closeout-<YYYYmmdd-HHMMSS>" directory under
# REMOTE_ESCROW_DIR. For each item it either copies the existing
# <item>.last_verified marker into it (cp -p) or creates an empty
# <item>.was_missing placeholder. The last output line is
# "ROLLBACK_DIR=<path>".
# Escaped \$ and \" are evaluated by the remote shell; unescaped ${...} are
# expanded locally before the command is sent.
remote_prepare_rollback() {
local items_shell="$1"
ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "set -euo pipefail
rollback=\"${REMOTE_ESCROW_DIR}/.rollback-p0-005-closeout-\$(date +%Y%m%d-%H%M%S)\"
mkdir -p \"\$rollback\"
for item in ${items_shell}; do
marker=\"${REMOTE_ESCROW_DIR}/\${item}.last_verified\"
if [ -f \"\$marker\" ]; then
cp -p \"\$marker\" \"\$rollback/\${item}.last_verified\"
else
: > \"\$rollback/\${item}.was_missing\"
fi
done
echo \"ROLLBACK_DIR=\$rollback\""
}
# Invoke the remote marker script for a single escrow item.
#   $1  item id
#   $2  non-secret evidence ref
#   $3  extra flag appended verbatim to the remote command (for example
#       --dry-run); may be empty
# The item, the evidence ref and NOTE are shell-quoted with printf %q before
# they are placed into the remote command line.
run_marker_command() {
  local item="$1"
  local evidence_ref="$2"
  local mode_flag="$3"
  local item_arg ref_arg note_arg remote_command

  printf -v note_arg '%q' "$NOTE"
  printf -v ref_arg '%q' "$evidence_ref"
  printf -v item_arg '%q' "$item"

  remote_command="$REMOTE_MARKER_SCRIPT --item $item_arg --evidence-id $ref_arg --note $note_arg $mode_flag"
  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "$remote_command"
}
# Restore the marker state saved in ROLLBACK_DIR on TARGET_HOST.
# Every <item>.last_verified file in the snapshot is copied back into
# REMOTE_ESCROW_DIR, and every <item>.was_missing placeholder causes the
# matching marker to be removed. The remote marker script is then run with
# --status. Exits with status 2 when ROLLBACK_DIR is empty.
rollback_markers() {
  if [ -z "$ROLLBACK_DIR" ]; then
    echo "--rollback requires --rollback-dir" >&2
    exit 2
  fi

  # Quote both directories with printf %q, as run_marker_command does for its
  # arguments, so that a quote character in either path cannot end the quoting
  # and inject extra words into the remote command.
  local quoted_rollback quoted_escrow
  printf -v quoted_rollback '%q' "$ROLLBACK_DIR"
  printf -v quoted_escrow '%q' "$REMOTE_ESCROW_DIR"

  ssh "${SSH_OPTS[@]}" "$TARGET_HOST" "set -euo pipefail
test -d ${quoted_rollback}
for marker in ${quoted_rollback}/*.last_verified; do
  [ -e \"\$marker\" ] || continue
  item=\$(basename \"\$marker\" .last_verified)
  cp -p \"\$marker\" ${quoted_escrow}/\"\${item}.last_verified\"
done
for missing in ${quoted_rollback}/*.was_missing; do
  [ -e \"\$missing\" ] || continue
  item=\$(basename \"\$missing\" .was_missing)
  rm -f ${quoted_escrow}/\"\${item}.last_verified\"
done
echo ROLLBACK_APPLIED ${quoted_rollback}
${REMOTE_MARKER_SCRIPT} --status"
}
# Validate the receipt once, before the mode dispatch; this runs for every
# mode, including rollback. items_tsv holds one "<item_id><TAB><ref>" line per
# escrow item.
items_tsv="$(receipt_items)"
# First TSV column of every line, joined with spaces, for the remote snapshot loop.
items_shell="$(awk '{printf "%s ", $1}' <<<"$items_tsv")"
case "$MODE" in
check)
# Print the validated receipt items, then run the remote marker script with --status.
echo "RECEIPT_OK $RECEIPT_PATH"
echo "$items_tsv" | awk '{print "RECEIPT_ITEM item="$1" evidence_ref="$2}'
remote_status
;;
dry-run)
# Call the remote marker script once per item with --dry-run.
echo "RECEIPT_OK $RECEIPT_PATH"
while IFS=$'\t' read -r item evidence_ref; do
run_marker_command "$item" "$evidence_ref" "--dry-run"
done <<<"$items_tsv"
;;
apply)
# Snapshot the existing markers first so the run can be undone with --rollback,
# then call the marker script once per item and finish with a status call.
echo "RECEIPT_OK $RECEIPT_PATH"
remote_prepare_rollback "$items_shell"
while IFS=$'\t' read -r item evidence_ref; do
run_marker_command "$item" "$evidence_ref" ""
done <<<"$items_tsv"
remote_status
;;
rollback)
# Restore the snapshot named by --rollback-dir.
rollback_markers
;;
*)
echo "unsupported mode: $MODE" >&2
exit 2
;;
esac

View File

@@ -9,6 +9,12 @@ from pathlib import Path
ROOT = Path(__file__).resolve().parents[3]
SCRIPT = ROOT / "scripts" / "reboot-recovery" / "dr-escrow-evidence-checklist.py"
PREFLIGHT_SCRIPT = ROOT / "scripts" / "reboot-recovery" / "post-reboot-owner-response-preflight.py"
APPLY_RECEIPT_SCRIPT = (
ROOT
/ "scripts"
/ "reboot-recovery"
/ "apply-credential-escrow-closeout-receipt-to-110.sh"
)
ITEMS = {
"restic_repository_password",
@@ -98,6 +104,25 @@ def test_checklist_outputs_marker_dry_run_commands_only() -> None:
assert "password=" not in command.lower()
def test_apply_closeout_receipt_script_is_no_secret_controlled_marker_write() -> None:
    """Statically check the apply-receipt script for its modes and guard rails.

    The script text must offer the dry-run, apply and rollback modes, refer to
    the committed receipt snapshot and the remote marker script, validate the
    no-secret flags, pass ``-n`` to ssh, and contain none of the listed
    secret-reading commands or file names.
    """
    text = APPLY_RECEIPT_SCRIPT.read_text(encoding="utf-8")

    required_fragments = (
        "--dry-run",
        "--apply",
        "--rollback",
        "awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json",
        "contains_secret_value",
        "secret_value_collection_allowed",
        "mark-credential-escrow-verified.sh",
        "ROLLBACK_DIR=",
    )
    for fragment in required_fragments:
        assert fragment in text

    # A plain substring check for "-n" is always true because "--note" contains
    # it; require the ssh option to appear as a line of its own instead.
    assert any(line.strip() == "-n" for line in text.splitlines())

    forbidden_fragments = (
        "cat ~/.ssh",
        "cat .env",
        "auth.json",
        ".sqlite",
        "cat ~/.codex/sessions",
    )
    for fragment in forbidden_fragments:
        assert fragment not in text
def test_unfilled_skeleton_fails_closed_against_preflight(tmp_path: Path) -> None:
payload = load_checklist()