From 85d8eeb0db2dfdc8a0a5c93c42c4facc4e27599b Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 3 Jul 2026 00:45:28 +0800 Subject: [PATCH] fix(reboot): reject unknown uptime as fresh boot --- docs/LOGBOOK.md | 20 +++++++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 4 +- .../reboot-recovery/reboot-event-detector.py | 24 +++++--- .../tests/test_reboot_event_detector.py | 60 +++++++++++++++++++ 4 files changed, 100 insertions(+), 8 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 8eb369cb3..53f5fec85 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,23 @@ +## 2026-07-03 — 00:43 P0-006 reboot-event detector unknown uptime false-fresh fix + +**完成內容**: +- 依 production action matrix primary lane `reboot_event_detector_and_host_probe` 執行 verify-only host probe / event detector;未重啟、未 restart、未寫 state。 +- Live host probe 證據:99 reachable 但 `uptime_seconds=unknown`、110 reachable / systemd running / startup unit inactive、111 `reachable=0`、112/120/121 reachable、188 reachable 但 `systemd_state=degraded` 且 `awoooi-startup.service failed`。 +- 發現並修正 `scripts/reboot-recovery/reboot-event-detector.py`:舊邏輯把 `uptime_seconds=unknown` 解析成 `-1`,造成 99 ping-only readback 被誤判成 `fresh_boot_hosts=['99']`;同時把 `reachable_unknown_boot` 從 boot-id change 判定排除。 +- 新規則:fresh boot 必須 `reachable=true`、`uptime_seconds >= 0` 且在 target window 內;boot-id change 必須前後 boot id 都不是 `unknown` / `reachable_unknown_boot` placeholder。 +- `FULL-STACK-COLD-START-SOP.md` 升到 v1.98,固定 unknown uptime 不得當 fresh reboot 證據。 + +**驗證**: +- `python3.11 -m pytest scripts/reboot-recovery/tests/test_reboot_event_detector.py scripts/reboot-recovery/tests/test_reboot_auto_recovery_slo_scorecard.py -q -p no:cacheprovider`:`15 passed`。 +- `DATABASE_URL=postgresql+asyncpg://test:test@localhost/test PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_reboot_auto_recovery_slo_scorecard_api.py -q -p no:cacheprovider`:`8 passed`。 +- `python3.11 -m py_compile scripts/reboot-recovery/reboot-event-detector.py scripts/reboot-recovery/reboot-auto-recovery-slo-scorecard.py apps/api/src/services/reboot_auto_recovery_slo_scorecard.py`:通過。 +- `git diff --check`:通過。 +- 修正後 live no-write detector artifact `/tmp/awoooi-reboot-detector-fix-20260703-004256`:`reboot_detected=false`、`fresh_boot_hosts=[]`、`rebooted_hosts=[]`、`unreachable_hosts=['111']`、`all_required_hosts_observed=false`、`all_required_hosts_in_reboot_window=false`、`recovery_deadline_status=target_window_elapsed`、`state_written=false`。 + +**仍維持**: +- P0-006 仍未達 10 分鐘全主機自動恢復 SLO;111 unreachable、188 degraded/startup failed、99 uptime unknown、Windows99 VMware verifier / no-secret remote execution channel 未 ready 都必須繼續作為 blocker。 +- 未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / gh;未 workflow_dispatch;未重啟 host / VM / service;未 Docker / Nginx / K3s / DB / firewall restart;未 DROP / TRUNCATE / restore / prune / delete / force push。 + ## 2026-07-03 — 00:36 P0-006 reboot SLO action-matrix alert routing **完成內容**: diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index b5159acbb..e4921fe61 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -1,6 +1,6 @@ # AWOOOI 全棧冷啟動與主機重啟 SOP -> Version: v1.97 +> Version: v1.98 > Last updated: 2026-07-03 Asia/Taipei > Scope: 110 / 120 / 121 / 188 full-stack reboot recovery. 112 Kali is recorded as P3 optional and is not part of this recovery path. @@ -28,6 +28,8 @@ v1.96 reboot SLO per-blocker alert projection rule:重啟後不得只看 `awoo v1.97 reboot SLO action-matrix routing rule:重啟後不得只把 active blocker 丟給人工判讀。`reboot-auto-recovery-slo-scorecard.py`、production `/api/v1/agents/reboot-auto-recovery-slo-scorecard` 與 exporter artifact 必須輸出 `active_blocker_action_matrix`,每個 blocker 固定含 `category`、`owner_lane`、`telegram_severity`、`evidence_inputs`、`next_safe_action`、`post_verifier`、`controlled_apply_mode` 與 `forbidden_actions`。`awoooi_reboot_auto_recovery_slo_active_blocker` metric 必須帶 `category`、`owner_lane`、`severity`、`primary` labels,Telegram / Alertmanager 優先讀這些 labels;若 labels 缺失,只能判為 exporter/action-matrix drift。`windows99_remote_execution_channel_unavailable` 必須排在 `windows99_vmware_autostart_readback_missing` 前,因為未恢復 no-secret collection channel 時不可只等 VMware verifier。所有 action matrix row 的 `controlled_apply_authorized_by_scorecard=false`,表示 scorecard 只授權 verifier / check-mode 路由,不授權重啟、VM power change、Docker / Nginx / K3s / DB / firewall restart、restore、prune、delete 或 secret 讀取;低風險 controlled apply 必須進各自 lane 的 check-mode、rollback、post-verifier。 +v1.98 reboot-event detector unknown-uptime rule:`reboot-event-detector.py` 不得把 `uptime_seconds=unknown`、`boot_id=unknown` 或 `boot_id=reachable_unknown_boot` 當作 fresh reboot 或 boot id changed。只有 `reachable=true` 且 `uptime_seconds >= 0` 且 `uptime_seconds <= target_seconds` 才能進 `fresh_boot_hosts`;只有前後兩個 boot id 都不是 placeholder 時才能判定 `boot_id_changed`。若 99 只剩 ping / TCP reachable、111 unreachable、188 degraded 或 startup failed,必須維持 `all_required_hosts_in_reboot_window=false` 與 SLO blocked;不得用 ping-only host 當 10 分鐘內重啟證據。 + 2026-07-02 110 control-path / Harbor recovery receipt rule:若 Gitea Harbor repair queue 仍保留 `harbor_110_remote_ssh_publickey_auth_stalled`、remote-control unavailable、jobs stale 或 historical failure,但同一輪本地證據同時證明 `wooo` command path ready、110 local Harbor `/v2/` ready、public/internal registry `/v2/` 回 `401`,則該 Gitea Harbor repair 失敗只能列為 historical queue metadata,不得再當成 current SSH blocker。必須用 `/api/v1/agents/harbor-registry-controlled-recovery-receipt` 或同等 validator 合併 `diagnose-110-ssh-publickey-auth.sh`、`recover-110-control-path-and-harbor-local.sh --check`、public Gitea queue readback 與 registry `/v2/` verifier,並把機器可讀結果寫入 `docs/operations/harbor-110-control-path-recovery-readback-2026-07-02.snapshot.json` 類型的 snapshot。2026-07-02 live receipt 顯示:public/internal registry `/v2/` 均為 `401`、latest visible CD `#4335` 為 `Success`、Gitea Harbor repair failure 已是 `historical_after_latest_cd_success=true`;active blockers 收斂為 110 controlled CD lane config / binary / registration / service guardrail、active action container pressure,以及 Gitea CD jobs head-SHA / stale readback mismatch。若 local-console output 只有 `AWOOOI_110_CONTROLLED_CD_LANE_READY` marker,non110 runner parser 不得從 110 `BLOCKER` 行推導 non110 blocker;non110 只有看到 `AWOOOI_NON110_RUNNER_READY` marker 才能列入 active blocker。 2026-07-02 110 controlled CD lane fail-closed enforcer staging rule:110 runner 壓力事故後,legacy / generic runner 仍必須 fail-closed;但 `awoooi-cd-lane-drain.service` 的非 secret staging artifact 不得再被 enforcer 無差別封回 stub。`scripts/reboot-recovery/enforce-110-runner-failclosed.sh` 只有在 `config.yaml` 符合 `capacity <= 1`、只含 `awoooi-host:host` 與 `awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04`、binary 是 executable ELF、systemd unit 具備 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`、`CPUAccounting` / `MemoryAccounting` / `TasksAccounting` / `NoNewPrivileges` 等 guardrail,且 service `inactive`、`MainPID=0`、未 enabled / 未 masked 時,才可保留 drain config / binary / unit,並輸出 `CONTROLLED_DRAIN_STAGING_ALLOWED=1` 與 textfile metric。此 staging 規則不得讀 token、不得讀 `.runner` 內容、不得註冊 runner、不得啟動 service;若 registration 缺失,readiness verifier 仍必須只留下 `controlled_cd_lane_registration_missing` / `controlled_cd_lane_service_not_active` 類 blocker。若 `CONTROLLED_DRAIN_STAGING_ALLOWED=0` 且 config / binary 又被搬走,優先修 source enforcer / unit guardrail,不要手工反覆補同一組 artifact。 diff --git a/scripts/reboot-recovery/reboot-event-detector.py b/scripts/reboot-recovery/reboot-event-detector.py index e7b68d498..b720f8eed 100644 --- a/scripts/reboot-recovery/reboot-event-detector.py +++ b/scripts/reboot-recovery/reboot-event-detector.py @@ -45,6 +45,13 @@ def int_value(value: Any, default: int = -1) -> int: return default +def known_boot_id(value: Any) -> str: + boot_id = str(value or "") + if boot_id in {"", "unknown", "reachable_unknown_boot"}: + return "" + return boot_id + + def parse_host_probe(text: str) -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for raw_line in text.splitlines(): @@ -114,19 +121,22 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: if not current["reachable"]: unreachable_hosts.append(alias) previous_boot_id = ( - str(previous_host.get("boot_id")) - if isinstance(previous_host, dict) and previous_host.get("boot_id") + known_boot_id(previous_host.get("boot_id")) + if isinstance(previous_host, dict) else "" ) - current_boot_id = str(current.get("boot_id") or "") + current_boot_id = known_boot_id(current.get("boot_id")) boot_id_changed = bool( previous_boot_id - and previous_boot_id != "unknown" and current_boot_id - and current_boot_id != "unknown" and previous_boot_id != current_boot_id ) - fresh_boot = bool(current.get("reachable") and int_value(current.get("uptime_seconds")) <= target_seconds) + uptime_seconds = int_value(current.get("uptime_seconds")) + fresh_boot = bool( + current.get("reachable") + and uptime_seconds >= 0 + and uptime_seconds <= target_seconds + ) if boot_id_changed: changed_boot_id_hosts.append(alias) if fresh_boot: @@ -142,7 +152,7 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]: "uptime_seconds": current.get("uptime_seconds"), "deadline_at": ( observed_at - + timedelta(seconds=max(0, target_seconds - int_value(current.get("uptime_seconds"), 0))) + + timedelta(seconds=max(0, target_seconds - uptime_seconds)) ).isoformat(timespec="seconds"), } ) diff --git a/scripts/reboot-recovery/tests/test_reboot_event_detector.py b/scripts/reboot-recovery/tests/test_reboot_event_detector.py index 12569be12..b31ca859a 100644 --- a/scripts/reboot-recovery/tests/test_reboot_event_detector.py +++ b/scripts/reboot-recovery/tests/test_reboot_event_detector.py @@ -110,3 +110,63 @@ def test_reboot_detector_fails_visible_when_windows_or_vm_host_missing(tmp_path: assert "99" in payload["missing_hosts"] assert payload["all_required_hosts_observed"] is False assert payload["all_required_hosts_in_reboot_window"] is False + + +def test_reboot_detector_does_not_treat_unknown_uptime_as_fresh_boot( + tmp_path: Path, +) -> None: + probe_path = tmp_path / "host-probe.txt" + state_path = tmp_path / "state.json" + output_path = tmp_path / "event.json" + probe_path.write_text( + "\n".join( + [ + "AWOOOI_REBOOT_AUTO_RECOVERY_HOST_PROBE=1", + "TARGET_HOSTS=99", + ( + "HOST_BOOT alias=99 target=192.168.0.99 " + "startup_unit=vmware-host-autostart reachable=1 " + "boot_id=reachable_unknown_boot uptime_seconds=unknown " + "systemd_state=ping_reachable startup_enabled=unknown " + "startup_active=unknown" + ), + ] + ) + + "\n", + encoding="utf-8", + ) + state_path.write_text( + json.dumps({"hosts": {"99": {"boot_id": "win-boot-1"}}}), + encoding="utf-8", + ) + + subprocess.run( + [ + sys.executable, + str(SCRIPT), + "--host-probe-file", + str(probe_path), + "--state-file", + str(state_path), + "--target-minutes", + "10", + "--generated-at", + "2026-06-30T18:00:00+08:00", + "--output", + str(output_path), + "--required-host", + "99", + "--no-write-state", + ], + check=True, + ) + + payload = json.loads(output_path.read_text(encoding="utf-8")) + + assert payload["observed_hosts"] == ["99"] + assert payload["reboot_detected"] is False + assert payload["fresh_boot_hosts"] == [] + assert payload["rebooted_hosts"] == [] + assert payload["all_required_hosts_observed"] is True + assert payload["all_required_hosts_in_reboot_window"] is False + assert payload["state_written"] is False