From acf1d0e127cc8bfa3238da9db31a430dbe31932c Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 23:33:19 +0800 Subject: [PATCH] fix(recovery): auto-start controlled 110 drain lane --- docs/LOGBOOK.md | 13 +++++++++++++ ...eboot-cold-start-backup-recovery-workplan.md | 2 +- ops/runner/README.md | 17 ++++++++++++----- scripts/reboot-recovery/awoooi-startup-110.sh | 9 ++++++--- .../test_cold_start_monitor_bounded_probes.py | 4 ++-- 5 files changed, 34 insertions(+), 11 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e2e0a201..c4547b02 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,16 @@ +## 2026-06-30 — 23:30 110 startup guarded auto-starts controlled drain lane + +**照主線修正的問題**: +- 23:27 live queue 已證明 Harbor repair `#4115 Waiting` 卡在 `awoooi-host` no matching runner;`awoooi-startup-110.sh` 雖已有 controlled drain lane guardrails,但預設 `AWOOOI_START_CONTROLLED_CD_LANE=0`,重啟後即使 config / binary / labels / root restore-source 全部合格,也不會自動恢復 `awoooi-host` repair lane。 +- 將 startup 預設改成 guarded auto-start:`START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-1}"`。legacy host / generic runner 仍維持 fail-closed;drain lane 只有在 `capacity=1`、labels 僅 `awoooi-ubuntu` / `awoooi-host`、無泛用重型 label、binary / config 可用、root restore-source left `0` 時才會 `enable --now awoooi-cd-lane-drain.service`。 +- 仍保留明確關閉開關:`AWOOOI_START_CONTROLLED_CD_LANE=0` 可讓 startup 不拉起 drain lane;不得用此變更恢復 legacy runner、generic label、`ubuntu-latest`、StockPlatform/headless/Playwright 重型 label。 + +**驗證**: +- 更新 `scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py`,斷言 controlled drain lane 預設 guard-on,且關閉/guard fail 會保持 closed。 +- 更新 `ops/runner/README.md`,把「startup 不自動重開 runner」改成「legacy 不啟動;受控 drain lane guard 後自動拉起」。 + +**邊界**:只改 110 startup source / test / runner README / LOGBOOK;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未 SSH 寫入、未重啟主機、未 restart Docker / Nginx / K3s / DB / firewall。 + ## 2026-06-30 — 23:27 Post-push cold-start / Harbor / Stock readback **照主線修正的問題**: diff --git a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md index 4e2a28cd..13570bd8 100644 --- a/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md +++ b/docs/workplans/2026-06-04-reboot-cold-start-backup-recovery-workplan.md @@ -19,7 +19,7 @@ | P0-2 | DONE_THIS_INCIDENT | 使用者可見 502:Tsenyang | `www.tsenyang.com` / `tsenyang.com` 由 502 恢復為 200;188 `tsenyang-website` container running;local `127.0.0.1:3000` 回 200。 | 下次同類 502 先查 release symlink / image / container;不先動 Nginx、DNS、DB、主機重啟。 | | P0-3 | BLOCKED | StockPlatform data freshness | 23:27 public route / health 為 200,但 `/api/v1/system/freshness` 與 `/api/v1/system/ingestion` 回 `status=not_configured`、`blockers=["postgres_not_ready"]`、`latest_trading_date=null`;網站可達不代表資料最新。 | 恢復 110 control path 後,read-only 查 `/home/wooo/stockplatform-v2` compose / DB schema / migration status;禁止 fake freshness、manual DB rows、restore/prune。 | | P0-4 | BLOCKED | AWOOOI production 版本最新性 | Gitea SSH `main` 最新已到 `68084470 feat(agent): expose deploy marker receipt input`,且包含 `3de828f97 fix(agent): surface harbor cd retry receipt blocker`。Public Gitea queue 23:27 讀到 latest CD `#4117 Running` for `68084470`;production 尚不能證明已部署最新 source。 | 補 deploy marker / runtime SHA / endpoint readback 一致;Harbor `/v2` 恢復前 CD 無法把最新 source 發到 production,未一致前不可宣稱 AWOOOI 最新。 | -| P0-5 | BLOCKED | 110 control path / Harbor registry `/v2` | 23:27 queue readback 回 `status=blocked_harbor_110_repair_no_matching_runner`;Harbor repair workflow `#4115` 仍 `Waiting`,no-matching label `awoooi-host`,jobs API 仍 stale/mismatched。Live route probe 同步回 public registry `/v2=502`、internal `192.168.0.110:5000/v2=502`、Harbor health `502`、SignOz `502`。Harbor receipt validator 對 live queue 回 `status=blocked_waiting_harbor_controlled_recovery_receipt`,active blockers 含 `gitea_queue_harbor_110_repair_no_matching_runner`、`gitea_queue_harbor_110_repair_jobs_stale_or_mismatched`、`public_registry_v2_verifier_not_green`、`internal_registry_v2_verifier_not_green`。23:27 cold-start 同步證明 110 registry `/v2` blocked、110 SSH read-only check blocked、K3s pull refused by `110:5000`。 | 讓 110-local repair workflow 或 110 console/local script 真正執行 `recover-110-control-path-and-harbor-local.sh --check` / `--apply-all`,並讀回 public/internal `/v2` 為 `200/401`。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 | +| P0-5 | BLOCKED_SOURCE_ADVANCED | 110 control path / Harbor registry `/v2` | 23:27 queue readback 回 `status=blocked_harbor_110_repair_no_matching_runner`;Harbor repair workflow `#4115` 仍 `Waiting`,no-matching label `awoooi-host`,jobs API 仍 stale/mismatched。Live route probe 同步回 public registry `/v2=502`、internal `192.168.0.110:5000/v2=502`、Harbor health `502`、SignOz `502`。Harbor receipt validator 對 live queue 回 `status=blocked_waiting_harbor_controlled_recovery_receipt`,active blockers 含 `gitea_queue_harbor_110_repair_no_matching_runner`、`gitea_queue_harbor_110_repair_jobs_stale_or_mismatched`、`public_registry_v2_verifier_not_green`、`internal_registry_v2_verifier_not_green`。23:27 cold-start 同步證明 110 registry `/v2` blocked、110 SSH read-only check blocked、K3s pull refused by `110:5000`。23:30 source 修正讓 `awoooi-startup-110.sh` 預設 guard-on 啟動 controlled drain lane;legacy runner / generic label 仍 fail-closed。 | 部署 / 讀回 110 startup source 後,確認 `awoooi-cd-lane-drain.service` 只在 guardrails 通過時 active,讓 110-local repair workflow 或 110 console/local script 真正執行 `recover-110-control-path-and-harbor-local.sh --check` / `--apply-all`,並讀回 public/internal `/v2` 為 `200/401`。恢復 SSH read-only command path 後才能驗證 Stock DB、Gitea dump、110 backup completeness。 | | P0-6 | BLOCKED_BACKUP_COMPLETENESS | Gitea repo visibility 與完整備份 | Gitea version API 200;public repo search 只列 4 個 public repo;`stockplatform-v2` public page/API 404,但 internal `git ls-remote` 成功;188 `/home/ollama/backup/110/gitea` 起初為空。已建立 verified emergency bundle `/home/ollama/backup/110/gitea/git-bundles/20260630-190931`:4 個 public/internal repo bundle verify + checksum 成功,`AwoooGo`、`stockplatform-v2`、`vibework` 因 private auth fail-closed。20:18 summary 因 110 `backup-status` 不可讀回,`BACKUP_CORE_GREEN=0`、`DR_ESCROW_BLOCKED=1`、`DR_ESCROW_EVIDENCE_UNKNOWN=1`。 | 188 `gitea_repo_mirror_from_110` subtree metric / alert 已補;下一步仍是恢復 110 SSH command path 後跑正式 `gitea dump`、private repo 非互動備份、repo count、backup-status 與 restore drill readback。unknown 不得當作 backup / DR green。 | | P0-7 | SOURCE_READY_RUNTIME_BLOCKED | 99 VMware / VM autostart | repo 已有 `windows99-vmware-autostart.ps1`;22:05 host probe 讀到 99 ping reachable 但 `boot_id=reachable_unknown_boot` / uptime unknown,111 不可達,112/120/121/188 可讀,188 startup unit failed/degraded。先前只讀 readback 顯示 99 RDP 3389 / SSH 22 可達、WinRM 5985 fail,`administrator@192.168.0.99` SSH publickey denied。 | 恢復 99 可控通道或由 console 套用腳本;完成後讀回 111/188/120/121/112 boot evidence,要求 all-host required observed/reachable 且 99 不再是 unknown uptime。 | | P0-8 | SOURCE_READY_RUNTIME_BLOCKED | 502 maintenance fallback / Telegram / backup alert | L0/L1 fallback runbook、Nginx snippet、reboot / backup alert rules 已在 source;runtime 尚需部署與外部 L1 provider readback。 | L0 以測試 vhost 驗證 `X-AWOOOI-Fallback`;L1 需外部雲端/CDN probe;Telegram 以脫敏 alert receipt 驗證。 | diff --git a/ops/runner/README.md b/ops/runner/README.md index 6196c20e..a3350bdc 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -229,18 +229,25 @@ AWOOI 的 Docker lock,會和 AWOOI Web image 內的 Next production build 疊 長期方向仍是 runner 隔離或 build offload;此 gate 是在 shared runner 尚未 拆分前,降低重型前端 build 互相踩踏的保守保護層。 -### 第四層補充: startup 不自動重開 Gitea runner +### 第四層補充: startup 只自動拉起受控 drain lane 2026-06-27 110 CPU 事故止血後,`gitea-act-runner-host.service` 維持 inactive 是 刻意降壓狀態。`scripts/reboot-recovery/awoooi-startup-110.sh` 仍可修正 runner -`shutdown_timeout` 與 labels,也會停用 legacy Docker runner,但預設不會啟動 -host runner。只有明確設定下列開關時才允許 startup 拉起 runner: +`shutdown_timeout` 與 labels,也會停用 legacy Docker runner,但不會啟動 legacy +host runner。2026-06-30 全主機重啟後,`awoooi-host` repair lane 卡在 no matching +runner,確認 post-boot 自動恢復必須讓受控 drain lane 走 guardrails 後自動拉起。 +因此 startup 預設會嘗試開啟 `awoooi-cd-lane-drain.service`,但只在 +`capacity=1`、labels 僅 `awoooi-ubuntu` / `awoooi-host`、無泛用重型 label、 +binary / config / `.runner` path 可用、root restore-source left `0` 且 systemd +limit verifier 成立時放行;不成立就維持 fail-closed。 ```bash -AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 /usr/local/bin/awoooi-startup-110.sh +AWOOOI_START_CONTROLLED_CD_LANE=0 /usr/local/bin/awoooi-startup-110.sh ``` -未完成 runner 限流 / 搬遷前,不要把這個開關加入 systemd environment。 +上方只用於明確關閉受控 drain lane;不得用它恢復 legacy runner。未完成 runner +限流 / 搬遷前,仍不得加入 `AWOOOI_START_GITEA_RUNNER_ON_BOOT=1` 或解除 legacy +runner mask。 ### 第五層修復: legacy Docker runner drain diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 4329a766..0db0ad71 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -282,7 +282,7 @@ fi # 2026-06-27 Codex: 110 是 production / registry / observability 主機; # legacy runner 預設維持停用降壓;controlled drain lane 可在受控授權下啟動。 # ────────────────────────────────────────────── -log "[6/6] 檢查 Gitea Act Runner(預設不自動啟動)..." +log "[6/6] 檢查 Gitea Act Runner(legacy fail-closed;controlled drain 守門自動啟動)..." RUNNER_DIR="/home/wooo/act-runner" RUNNER_SERVICE="gitea-act-runner-host.service" RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled" @@ -296,7 +296,7 @@ CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled" CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml" CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled" START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}" -START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}" +START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-1}" START_GITEA_RUNNER_ALLOWED=0 START_CD_LANE_ALLOWED=0 RUNNER_FAIL_CLOSED_SERVICES=( @@ -317,6 +317,9 @@ RUNNER_FAIL_CLOSED_BINARY_PATHS=( ) # Host runner still needs both keys. The direct cd-lane stays fail-closed until # it is migrated or hard-limited outside this production host pressure lane. +# The drain lane is a narrow post-reboot recovery path; it auto-starts only when +# config/binary/label/restore-source guardrails pass, and can be explicitly +# disabled with AWOOOI_START_CONTROLLED_CD_LANE=0. if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then START_GITEA_RUNNER_ALLOWED=1 fi @@ -759,7 +762,7 @@ if [ "$START_CD_LANE_ALLOWED" = "1" ]; then systemctl enable --now "$CD_LANE_DRAIN_SERVICE" >/dev/null 2>&1 || true ensure_controlled_cd_lane_open else - log "✅ controlled cd-lane remains closed unless AWOOOI_START_CONTROLLED_CD_LANE=1 passes guardrails" + log "✅ controlled cd-lane remains closed because guardrails failed or AWOOOI_START_CONTROLLED_CD_LANE=0" fi # ────────────────────────────────────────────── diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 9c798623..98cbbb0a 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -79,7 +79,7 @@ def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None: def test_startup_110_opens_only_controlled_cd_lane_after_guardrails() -> None: text = STARTUP_110.read_text(encoding="utf-8") - assert 'START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}"' in text + assert 'START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-1}"' in text assert "cd_lane_root_restore_sources_left()" in text assert 'CD_LANE_ROOT_RESTORE_LEFT="$(cd_lane_root_restore_sources_left)"' in text assert 'START_CD_LANE_ALLOWED=1' in text @@ -91,7 +91,7 @@ def test_startup_110_opens_only_controlled_cd_lane_after_guardrails() -> None: assert 'if [ "$START_CD_LANE_ALLOWED" = "1" ] && [ "$binary" = "$CD_LANE_DRAIN_BINARY" ]; then' in text assert 'systemctl enable --now "$RUNNER_SERVICE"' in text assert "legacy runner 仍維持 fail-closed" in text - assert "controlled cd-lane remains closed unless AWOOOI_START_CONTROLLED_CD_LANE=1 passes guardrails" in text + assert "controlled cd-lane remains closed because guardrails failed or AWOOOI_START_CONTROLLED_CD_LANE=0" in text def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None: