diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index bb3d7bc4..b571aacf 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -478,8 +478,16 @@ jobs: ;; scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh) ;; + scripts/reboot-recovery/full-stack-cold-start-check.sh) + ;; + scripts/reboot-recovery/full-stack-recovery-scorecard.sh) + ;; + scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh) + ;; scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py) ;; + scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py) + ;; scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.service) ;; scripts/reboot-recovery/awoooi-reboot-auto-recovery-slo.timer) @@ -648,7 +656,10 @@ jobs: ../../scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh \ ../../scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh \ ../../scripts/reboot-recovery/post-start-quick-check.sh \ - ../../scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh + ../../scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh \ + ../../scripts/reboot-recovery/full-stack-cold-start-check.sh \ + ../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh \ + ../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh DATABASE_URL="${DATABASE_URL:-postgresql+asyncpg://ci:ci@localhost/ci}" \ PYTHONFAULTHANDLER=1 python3.11 -m pytest \ tests/test_agent_replay_normalizer.py \ @@ -685,6 +696,7 @@ jobs: ../../ops/runner/test_cd_controlled_runtime_profile.py \ ../../ops/runner/test_verify_awoooi_non110_cd_closure.py \ ../../scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py \ + ../../scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py \ ../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py \ ../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py \ ../../scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py \ diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 274afe70..9d3a6565 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,23 @@ +## 2026-06-30 — 09:07 P0-006 cold-start monitor bounded probe hardening + +**照主線處理的問題**: +- Production Prometheus 顯示 `awoooi_cold_start_last_result{result="check_failed"}=1`,`warn_gates=0`、`blocked_gates=0`,Alertmanager 仍有 `ColdStartLastGreenTooOld` firing;因此目前不能宣稱 10 分鐘自動恢復 SLO ready,真正問題是 cold-start monitor 無法穩定產生 GREEN summary。 +- 本機重跑 `full-stack-recovery-scorecard.sh` 與 `full-stack-cold-start-check.sh` 時,110 read-only SSH / offsite evidence probe 會卡住;已中止檢查 wrapper 並清掉本輪留下的 verify-only 孤兒程序,未進行 runtime 寫入。 +- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 已對 SSH probe 加上 `ConnectionAttempts=1`、`ServerAliveInterval=5`、`ServerAliveCountMax=1` 與每個遠端命令 45 秒 timeout,避免單一 host/evidence probe 卡住整個 scorecard。 +- `scripts/reboot-recovery/full-stack-recovery-scorecard.sh` 已對 110 offsite escrow evidence readback 加同等 bounded SSH / remote command timeout,讓 DR scorecard fail-visible 而不是無界等待。 +- `scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh` 已對 deploy parity 的 hash/pattern SSH readback 加同等 bounded timeout,避免 recovery scorecard 卡在 parity verifier。 +- `.gitea/workflows/cd.yaml` controlled-runtime profile 已納入三支 recovery shell、`bash -n` 與 bounded probe contract test,避免此類 P0 monitor 修正掉到重型 B5 / runner 壓力路徑。 + +**驗證**: +- `bash -n`:三支 recovery shell 與 `.gitea/workflows/cd.yaml` 通過。 +- Bounded probe contract + CD profile guard:`28 passed`。 +- Gitea runner pressure guard:`workflow_files=11`、`auto_branch_events_on_110=0`、`generic_runner_labels=0`。 +- Gitea step env secret guard:`no Gitea run/with secrets or legacy Telegram routes`。 +- `git diff --check`:通過。 +- 短 timeout 實跑 `full-stack-recovery-scorecard.sh` 已返回,不再卡死;目前 truth 為 `CORE_COLD_START_GREEN=0`、`CORE_COLD_START_FIRING_ALERTS=1`、`CORE_COLD_START_DEPLOY_PARITY=0`、offsite evidence `unknown`、`RECOVERY_STATE=CORE_NOT_READY_DR_OFFSITE_PENDING`。 + +**邊界**:未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未 workflow_dispatch,未操作 K8s / DB 寫入,未讀 secret / token / raw sessions / SQLite / `.env`,未使用 GitHub / `gh` / GitHub API。 + ## 2026-06-30 — 08:55 P0 mainline priority-order production readback **照主線完成的實作**: diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index e4e10d6f..fbb2a6d1 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -321,9 +321,14 @@ def test_reboot_auto_recovery_slo_sources_stay_on_controlled_runtime_profile() - "scripts/reboot-recovery/reboot-auto-recovery-host-probe.sh)", "scripts/reboot-recovery/reboot-auto-recovery-slo-exporter.sh)", "scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py)", + "scripts/reboot-recovery/full-stack-cold-start-check.sh)", + "scripts/reboot-recovery/full-stack-recovery-scorecard.sh)", + "scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)", + "scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py)", "scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py)", "scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py)", "../../scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py", + "../../scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py", "../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_installer.py", "../../scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py", ] @@ -336,12 +341,20 @@ def test_post_start_recovery_verifiers_stay_on_controlled_runtime_profile() -> N expected_sources = [ "scripts/reboot-recovery/post-start-quick-check.sh)", "scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh)", + "scripts/reboot-recovery/full-stack-cold-start-check.sh)", + "scripts/reboot-recovery/full-stack-recovery-scorecard.sh)", + "scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh)", "scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py)", "scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py)", + "scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py)", "../../scripts/reboot-recovery/post-start-quick-check.sh", "../../scripts/reboot-recovery/188-host-hygiene-maintenance-checklist.sh", + "../../scripts/reboot-recovery/full-stack-cold-start-check.sh", + "../../scripts/reboot-recovery/full-stack-recovery-scorecard.sh", + "../../scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh", "../../scripts/reboot-recovery/tests/test_188_host_hygiene_checklist.py", "../../scripts/reboot-recovery/tests/test_post_start_quick_check_contract.py", + "../../scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py", ] for source in expected_sources: assert source in text diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 9e6e3629..98b97227 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -4,7 +4,14 @@ set -uo pipefail -SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=6) +SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}" +SSH_OPTS=( + -o BatchMode=yes + -o ConnectTimeout=6 + -o ConnectionAttempts=1 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=1 +) SEND_ALERT_TEST=0 MONITOR_READ_ONLY=0 NO_COLOR_FLAG=0 @@ -129,10 +136,12 @@ ssh_cmd() { local user_host="$1" local cmd="$2" local prefix="" + local quoted_cmd="" if [ -n "${REMOTE_SUDO_PASSWORD:-}" ]; then printf -v prefix 'REMOTE_SUDO_PASSWORD=%q ' "$REMOTE_SUDO_PASSWORD" fi - ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}${cmd}" + printf -v quoted_cmd '%q' "$cmd" + ssh "${SSH_OPTS[@]}" "$user_host" "${prefix}if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_cmd}; else bash -lc ${quoted_cmd}; fi" } host_has_ip() { diff --git a/scripts/reboot-recovery/full-stack-recovery-scorecard.sh b/scripts/reboot-recovery/full-stack-recovery-scorecard.sh index edd9cf7e..7c55640c 100755 --- a/scripts/reboot-recovery/full-stack-recovery-scorecard.sh +++ b/scripts/reboot-recovery/full-stack-recovery-scorecard.sh @@ -9,6 +9,7 @@ PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}" ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}" SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}" SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" +REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}" REQUIRE_CORE=0 REQUIRE_DR=0 @@ -56,7 +57,14 @@ while [ "$#" -gt 0 ]; do esac done -ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING") +ssh_opts=( + -o BatchMode="$SSH_BATCH_MODE" + -o ConnectTimeout=6 + -o ConnectionAttempts=1 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=1 + -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING" +) status_value() { local key="$1" @@ -98,6 +106,13 @@ except Exception: PY } +remote_110_read() { + local command="$1" + local quoted_command="" + printf -v quoted_command '%q' "$command" + ssh "${ssh_opts[@]}" "$REMOTE_110" "if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi" +} + echo "AWOOOI full-stack recovery scorecard" date '+%Y-%m-%d %H:%M:%S %Z' echo @@ -127,7 +142,7 @@ else status_value BACKUP_GAP_ALERT_VISIBILITY 0 fi -evidence_report="$(ssh "${ssh_opts[@]}" "$REMOTE_110" '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)" +evidence_report="$(remote_110_read '/backup/scripts/offsite-escrow-evidence-report.sh --no-color' 2>/tmp/awoooi-scorecard-offsite-evidence.err || true)" extract_report_value() { local key="$1" diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py new file mode 100644 index 00000000..8fe11e91 --- /dev/null +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +COLD_START_CHECK = ROOT / "scripts" / "reboot-recovery" / "full-stack-cold-start-check.sh" +RECOVERY_SCORECARD = ( + ROOT / "scripts" / "reboot-recovery" / "full-stack-recovery-scorecard.sh" +) +VERIFY_DEPLOY = ROOT / "scripts" / "reboot-recovery" / "verify-cold-start-monitor-deploy.sh" + + +def test_full_stack_cold_start_check_bounds_ssh_probes() -> None: + text = COLD_START_CHECK.read_text(encoding="utf-8") + + assert 'SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}"' in text + assert "-o ConnectionAttempts=1" in text + assert "-o ServerAliveInterval=5" in text + assert "-o ServerAliveCountMax=1" in text + assert "timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc" in text + assert "printf -v quoted_cmd '%q' \"$cmd\"" in text + + +def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None: + text = RECOVERY_SCORECARD.read_text(encoding="utf-8") + + assert 'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"' in text + assert "-o ConnectionAttempts=1" in text + assert "-o ServerAliveInterval=5" in text + assert "-o ServerAliveCountMax=1" in text + assert "remote_110_read()" in text + assert "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc" in text + assert "offsite-escrow-evidence-report.sh --no-color" in text + + +def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None: + text = VERIFY_DEPLOY.read_text(encoding="utf-8") + + assert 'REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}"' in text + assert "-o ConnectionAttempts=1" in text + assert "-o ServerAliveInterval=5" in text + assert "-o ServerAliveCountMax=1" in text + assert "remote_read()" in text + assert "timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc" in text + assert 'remote_read "sha256sum' in text + assert 'if remote_read "grep -Fq' in text diff --git a/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh b/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh index e69ce784..b26d28f7 100755 --- a/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh +++ b/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh @@ -7,9 +7,17 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" REMOTE="${REMOTE:-wooo@192.168.0.110}" SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}" SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" +REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}" PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}" -ssh_opts=(-o BatchMode="$SSH_BATCH_MODE" -o ConnectTimeout=6 -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING") +ssh_opts=( + -o BatchMode="$SSH_BATCH_MODE" + -o ConnectTimeout=6 + -o ConnectionAttempts=1 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=1 + -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING" +) local_sha256() { if command -v sha256sum >/dev/null 2>&1; then @@ -20,7 +28,14 @@ local_sha256() { } remote_sha256() { - ssh "${ssh_opts[@]}" "$REMOTE" "sha256sum '$1' 2>/dev/null | awk '{print \$1}'" + remote_read "sha256sum '$1' 2>/dev/null | awk '{print \$1}'" +} + +remote_read() { + local command="$1" + local quoted_command="" + printf -v quoted_command '%q' "$command" + ssh "${ssh_opts[@]}" "$REMOTE" "if command -v timeout >/dev/null 2>&1; then timeout ${REMOTE_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi" } require_same_hash() { @@ -46,7 +61,7 @@ require_remote_pattern() { local pattern="$1" local path="$2" local label="$3" - if ssh "${ssh_opts[@]}" "$REMOTE" "grep -Fq '$pattern' '$path'"; then + if remote_read "grep -Fq '$pattern' '$path'"; then echo "OK $label" else echo "BLOCKED $label missing in $path" >&2