diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c9b7ef46..8ca5bef0 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,22 @@ +## 2026-07-02 — 10:52 110 startup SSH control path metadata 自動修復 + +**照主線修正的問題**: +- 全主機 cold-start readback 顯示 110 唯一 BLOCKED 是 read-only SSH command path timeout:`SSH_110_BLOCKER remote_control_channel_unavailable`;node-exporter / runner pressure evidence 顯示 110 runner job count 為 0、load classifier 不是高負載,因此主線問題收斂為重啟後 control path metadata / session path 需能自動修復。 +- `scripts/reboot-recovery/awoooi-startup-110.sh` 新增 `repair_ssh_control_path_metadata`,在 Docker / Harbor / runner 啟動前先呼叫 `/usr/local/bin/repair-110-ssh-publickey-auth-local.sh --apply`,預設 target user `wooo`、`RELOAD_SSH=0`,只修復 `.ssh` / `authorized_keys` ownership / mode 與 `sshd -t` syntax guard。 +- 這條路徑不讀、不列印、不建立 key material;helper 缺失或修復失敗時記錄 warning 並繼續既有 startup recovery,避免 SSH guard 反過來阻斷 Docker / Harbor recovery。 +- `scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py` 新增 contract,鎖定 SSH metadata repair 必須在 Docker 前執行、必須使用 local-only repair helper、預設不 reload SSH、不得包含 `cat authorized_keys` / `ssh-keygen` / `ssh-copy-id`。 + +**驗證**: +- `bash -n scripts/reboot-recovery/awoooi-startup-110.sh scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh`:通過。 +- `python3.11 -m pytest scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py -q`:`20 passed`。 +- `python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q`:`43 passed`。 +- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`:`GITEA_RUNNER_PRESSURE_GUARD_OK workflow_files=12 scheduled_workflows=4 auto_branch_events_on_110=0 generic_runner_labels=0`。 +- `git diff --check`:通過。 + +**邊界**:未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機,未 restart Docker / 
Nginx / K3s / DB / firewall,未 workflow_dispatch,未恢復 generic runner。當前 110 live SSH command path 仍需等此版部署到 startup 後由本機 startup / console path 套用,不能把 source 修正宣稱成 runtime 已解。
+
+**下一步**:commit / push Gitea `main`,讀回 Gitea CD controlled-runtime;CD 成功後再重跑 cold-start scorecard,確認 110 command path 是否已恢復,若仍 blocked 則走 local console recovery package 驗證,不展開支線。
+
 ## 2026-07-01 — 23:52 Gitea CD #4315 B5 socket / queue historical blocker 修正
 
 **照主線修正的問題**:
diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh
index 0db0ad71..7bd73517 100644
--- a/scripts/reboot-recovery/awoooi-startup-110.sh
+++ b/scripts/reboot-recovery/awoooi-startup-110.sh
@@ -99,7 +99,58 @@ quarantine_corrupt_docker_container_metadata() {
     log "Docker corrupt metadata quarantine dir=$qdir count=$count"
 }
 
+# Repair SSH control-path metadata ahead of the Docker / Harbor / runner steps.
+#
+# Runs the local repair helper with --apply, handing it TARGET_USER and
+# RELOAD_SSH through the environment. Best-effort by design: a missing helper,
+# a failed run, or a timeout is logged as a warning and startup continues.
+# Always returns 0.
+#
+# Environment overrides:
+#   AWOOOI_110_SSH_REPAIR_SCRIPT           helper path
+#   AWOOOI_110_SSH_REPAIR_USER             TARGET_USER value (default: wooo)
+#   AWOOOI_110_SSH_RELOAD_AFTER_REPAIR     RELOAD_SSH value (default: 0)
+#   AWOOOI_110_SSH_REPAIR_TIMEOUT_SECONDS  helper time budget (default: 30)
+repair_ssh_control_path_metadata() {
+    local script="${AWOOOI_110_SSH_REPAIR_SCRIPT:-/usr/local/bin/repair-110-ssh-publickey-auth-local.sh}"
+    local target_user="${AWOOOI_110_SSH_REPAIR_USER:-wooo}"
+    local reload_ssh="${AWOOOI_110_SSH_RELOAD_AFTER_REPAIR:-0}"
+    local timeout_seconds="${AWOOOI_110_SSH_REPAIR_TIMEOUT_SECONDS:-30}"
+    local rc=0
+
+    log "[0/6] 修復 110 SSH control path metadata..."
+    if [ ! -x "$script" ]; then
+        log "⚠️ SSH control path metadata repair helper missing or not executable: $script"
+        return 0
+    fi
+
+    # Bound the helper so a hung repair cannot stall the Docker / Harbor steps
+    # that follow. "|| rc=$?" records the status without tripping errexit,
+    # should the script enable it.
+    if command -v timeout >/dev/null 2>&1; then
+        timeout -k 5 "$timeout_seconds" env TARGET_USER="$target_user" RELOAD_SSH="$reload_ssh" "$script" --apply || rc=$?
+    else
+        # No timeout(1) available: run unbounded rather than skip the repair.
+        TARGET_USER="$target_user" RELOAD_SSH="$reload_ssh" "$script" --apply || rc=$?
+    fi
+
+    case "$rc" in
+        0)
+            log "✅ 110 SSH control path metadata guard completed target_user=$target_user reload_ssh=$reload_ssh"
+            ;;
+        124 | 137)
+            # 124: timeout(1) expired; 137: the follow-up SIGKILL was delivered.
+            log "⚠️ 110 SSH control path metadata guard timed out or was killed rc=$rc timeout_seconds=$timeout_seconds; continuing startup"
+            ;;
+        *)
+            log "⚠️ 110 SSH control path metadata guard failed; continuing startup"
+            ;;
+    esac
+    return 0
+}
+
 log "=== 192.168.0.110 啟動序列開始 ==="
+repair_ssh_control_path_metadata
 
 # ──────────────────────────────────────────────
 # STEP 1: Docker 修復(若 BoltDB 損壞)
diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
index 1ea5c0fe..c2a1eb93 100644
--- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
+++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py
@@ -129,6 +129,41 @@ def test_startup_110_quarantines_corrupt_docker_container_metadata() -> None:
     assert 'run_bounded "$DOCKER_START_TIMEOUT_SECONDS" systemctl start docker.socket docker.service' in text
 
 
+def test_startup_110_repairs_ssh_control_path_metadata_before_docker() -> None:
+    """Lock the SSH control-path metadata repair contract of the 110 startup script.
+
+    The repair function must be defined and then invoked before the Docker
+    step, use the local repair helper with its default user / no-reload
+    settings, bound the helper with a timeout default, and never read or
+    generate key material.
+    """
+    import re  # function-scope import; the module's import block is outside this hunk
+
+    text = STARTUP_110.read_text(encoding="utf-8")
+
+    repair_fn = text.index("repair_ssh_control_path_metadata()")
+    # Require a real call statement at the start of a line after the definition,
+    # so a comment or log message that merely mentions the name cannot satisfy
+    # the ordering check.
+    call_pattern = re.compile(r"^[ \t]*repair_ssh_control_path_metadata(?![\w(])", re.MULTILINE)
+    call_match = call_pattern.search(text, repair_fn + 1)
+    assert call_match is not None, "startup script never invokes the SSH metadata repair"
+    repair_call = call_match.start()
+    docker_step = text.index('log "[1/5] 檢查 Docker..."')
+
+    assert repair_fn < repair_call < docker_step
+    assert 'AWOOOI_110_SSH_REPAIR_SCRIPT:-/usr/local/bin/repair-110-ssh-publickey-auth-local.sh' in text
+    assert 'AWOOOI_110_SSH_REPAIR_USER:-wooo' in text
+    assert 'AWOOOI_110_SSH_RELOAD_AFTER_REPAIR:-0' in text
+    assert 'AWOOOI_110_SSH_REPAIR_TIMEOUT_SECONDS:-30' in text
+    assert 'TARGET_USER="$target_user" RELOAD_SSH="$reload_ssh" "$script" --apply' in text
+    assert "SSH control path metadata guard completed" in text
+    assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text
+    assert "cat ~/.ssh/authorized_keys" not in text
+    assert "ssh-keygen" not in text
+    assert "ssh-copy-id" not in text
+
+
 def test_startup_110_opens_only_controlled_cd_lane_after_guardrails() -> None:
     text = STARTUP_110.read_text(encoding="utf-8")
 