From e580954e82fb7933ad2853e56ad873f30e47cb77 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 13:58:22 +0800 Subject: [PATCH] fix(recovery): surface 110 local package blocker --- docs/LOGBOOK.md | 16 ++++++++++++++++ .../full-stack-cold-start-check.sh | 5 +++++ .../test_cold_start_monitor_bounded_probes.py | 3 +++ 3 files changed, 24 insertions(+) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 50483b15..d797c0cf 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -27,6 +27,22 @@ **邊界**:只改 metadata-only LOG tagging contract、AI Loop current blocker evidence 投影、tests 與 LOGBOOK;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未執行 KM/RAG/PlayBook/MCP live write;未執行 110 runtime apply。 +## 2026-07-01 — 14:50 110 control path / recovery package blocker readback + +**照主線處理的問題**: +- 使用最新 Gitea main 後重跑 110 bounded diagnosis:`NODE_LOAD1=7.69`、`NODE_LOAD5=13.03`、`NODE_LOAD1_PER_CPU=0.64`、`NODE_LOAD_CLASSIFIER=load_not_high`,表示 110 CPU/load 已從高壓狀態下降;但 `wooo` publickey 仍是 `publickey_offer_timeout`,legacy runner units 仍回 `systemctl_show_timeout`,所以問題已收斂為 110 SSH/control path blocker,不是繼續泛稱 CPU 過高。 +- public route / queue readback 仍 blocked:`https://registry.wooo.work/v2/` 502、`https://harbor.wooo.work/api/v2.0/health` 502;Gitea public queue 回 `status=blocked_harbor_110_remote_ssh_publickey_auth_stalled`,current CD `#4247 Waiting`,Harbor repair `#4237 Running` 但 jobs API stale/cross-workflow mismatch。 +- 13:46 cold-start artifact `/tmp/awoooi-full-stack-cold-start-after-110-console-20260701-134632.log` 回 `PASS=62 WARN=8 BLOCKED=6`。仍不可宣稱全服務 10 分鐘自動恢復;hard blockers 含 110 registry `/v2`、110 SSH read-only check、K3s registry pull refused by `110:5000`、SignOz TLS / 502、Stock route 502、188 MOMO daily sales stale。 +- 99 VMware console 已可進入 110 local shell;現場只讀確認 110 home 有舊版 `full-stack-cold-start-check.sh`、`harbor-watchdog.sh`、`systemd-units-textfile-exporter.py` 等腳本,但沒有 `recover-110-control-path-and-harbor-local.sh`;`/home/wooo/awoooi` 也不存在。代表目前 scorecard 指向的 local recovery package 未在 110 live console 可用,這是自動恢復閉環缺口。 +- `scripts/reboot-recovery/full-stack-cold-start-check.sh` 已補 machine-readable blocker:SSH 110 blocked 時輸出 `/usr/local/bin/recover-110-control-path-and-harbor-local.sh`、`/usr/local/bin/repair-110-ssh-publickey-auth-local.sh`、`/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh` expected paths,並固定 `SSH_110_RECOVERY_PACKAGE_BLOCKER=package_presence_not_verifiable_while_remote_control_channel_unavailable` / `SSH_110_RECOVERY_PACKAGE_NEXT_ACTION=verify_or_preinstall_local_recovery_package_from_console_before_harbor_repair_retry`。 + +**仍維持**: +- 未讀 secret / token / `.env` / raw sessions / SQLite / auth;未讀 authorized_keys 內容或 `.runner` 內容;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch。 +- 未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall;console 曾觸發 `sudo` password prompt 但已中斷,未輸入密碼。 + +**下一步**: +- P0 不切支線:先讓 110 local recovery package 可在 110 console 或已恢復 SSH control path 執行;接著跑 `recover-110-control-path-and-harbor-local.sh --check`,只有 check 指向 metadata drift 時才做 `--apply-ssh-control-path`,再驗證 Harbor `/v2` 200/401、Gitea queue、cold-start、Stock freshness 與 SLO scorecard。 + ## 2026-07-01 — 14:15 Harbor receipt 納入 stale repair running blocker **照主線修正的問題**: diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 3e1a99bd..7430a32e 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -451,6 +451,11 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120 fail "ssh 110 read-only check" echo "SSH_110_BLOCKER remote_control_channel_unavailable" echo "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check" + echo "SSH_110_RECOVERY_PACKAGE_EXPECTED /usr/local/bin/recover-110-control-path-and-harbor-local.sh" + echo "SSH_110_RECOVERY_PACKAGE_EXPECTED /usr/local/bin/repair-110-ssh-publickey-auth-local.sh" + echo "SSH_110_RECOVERY_PACKAGE_EXPECTED /usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh" + echo "SSH_110_RECOVERY_PACKAGE_BLOCKER package_presence_not_verifiable_while_remote_control_channel_unavailable" + echo "SSH_110_RECOVERY_PACKAGE_NEXT_ACTION verify_or_preinstall_local_recovery_package_from_console_before_harbor_repair_retry" echo "$out" return fi diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 6e7f0a17..8b4b05e2 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -30,6 +30,9 @@ def test_full_stack_cold_start_check_bounds_ssh_probes() -> None: assert 'timeout 3 systemctl "$@"' in text assert "SSH_110_BLOCKER remote_control_channel_unavailable" in text assert "SSH_110_NEXT_ACTION local_console_run_recover_110_control_path_and_harbor_local_check" in text + assert "SSH_110_RECOVERY_PACKAGE_EXPECTED /usr/local/bin/recover-110-control-path-and-harbor-local.sh" in text + assert "SSH_110_RECOVERY_PACKAGE_BLOCKER package_presence_not_verifiable_while_remote_control_channel_unavailable" in text + assert "SSH_110_RECOVERY_PACKAGE_NEXT_ACTION verify_or_preinstall_local_recovery_package_from_console_before_harbor_repair_retry" in text def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None: