From 214f4c1e88ad7cc75c5295ec5600b11de3366a6f Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 21:41:31 +0800 Subject: [PATCH] fix(recovery): include 110 controlled lane verifier --- docs/LOGBOOK.md | 1 + .../test_cd_controlled_runtime_profile.py | 2 ++ scripts/reboot-recovery/deploy-to-110.sh | 4 +++ ...cover-110-control-path-and-harbor-local.sh | 32 ++++++++++++++++++- ...cover_110_control_path_and_harbor_local.py | 17 +++++++++- 5 files changed, 54 insertions(+), 2 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2bec1424..056d5b59 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -50672,6 +50672,7 @@ production browser smoke: - verifier 僅讀 metadata,不讀 `.runner` 內容、不印 runner token;檢查 110 host selector、controlled drain lane `capacity=1`、`awoooi-host:host` / `awoooi-ubuntu` labels、ELF binary、registration metadata 存在、systemd CPU / memory / tasks / `NoNewPrivileges` guardrails、legacy runner fail-closed、root restore-source left `0`、active action container / heavy process / load 壓力。 - `awoooi-cd-lane-drain.service` 與 `awoooi-startup-110.sh` 產生的 controlled drain unit 新增 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`,避免 service active 但未註冊時假裝可承接 `awoooi-host` queue。 - `ops/runner/verify-awoooi-non110-cd-closure.py` 的 Harbor 110 no-matching next action 改為先在 110 跑 `check-awoooi-110-controlled-cd-lane-readiness.sh`,通過後再恢復 `awoooi-host` control path 並重讀 queue/closure。 +- `recover-110-control-path-and-harbor-local.sh` 的 `--check` 串入 controlled lane verifier;`deploy-to-110.sh` 同步安裝 `/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh`,讓 110 local console / root shell recovery 包同時涵蓋 SSH metadata、Harbor watchdog 與 `awoooi-host` lane readiness。 **本地驗證結果**: - `pytest ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py ops/runner/test_verify_awoooi_non110_cd_closure.py ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_guard_gitea_runner_pressure.py scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py -q`:`56 passed`。 diff --git a/ops/runner/test_cd_controlled_runtime_profile.py b/ops/runner/test_cd_controlled_runtime_profile.py index bcb15e31..3b8dc72f 100644 --- a/ops/runner/test_cd_controlled_runtime_profile.py +++ b/ops/runner/test_cd_controlled_runtime_profile.py @@ -171,8 +171,10 @@ def test_deploy_to_110_syncs_local_control_path_recovery_helpers() -> None: assert "repair-110-ssh-publickey-auth-local.sh" in text assert "recover-110-control-path-and-harbor-local.sh" in text + assert "check-awoooi-110-controlled-cd-lane-readiness.sh" in text assert "/usr/local/bin/repair-110-ssh-publickey-auth-local.sh" in text assert "/usr/local/bin/recover-110-control-path-and-harbor-local.sh" in text + assert "/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh" in text def test_onboarding_warning_step_template_stays_on_controlled_runtime_profile() -> None: diff --git a/scripts/reboot-recovery/deploy-to-110.sh b/scripts/reboot-recovery/deploy-to-110.sh index b887a352..ff1b19c9 100644 --- a/scripts/reboot-recovery/deploy-to-110.sh +++ b/scripts/reboot-recovery/deploy-to-110.sh @@ -9,6 +9,7 @@ set -euo pipefail HOST="wooo@192.168.0.110" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" echo "=== 部署 awoooi-startup-110 + harbor-watchdog 到 192.168.0.110 ===" @@ -18,6 +19,7 @@ scp "$SCRIPT_DIR/awoooi-startup-110.sh" "$HOST:/tmp/awoooi-startup-110.sh" scp "$SCRIPT_DIR/awoooi-startup-110.service" "$HOST:/tmp/awoooi-startup-110.service" scp "$SCRIPT_DIR/repair-110-ssh-publickey-auth-local.sh" "$HOST:/tmp/repair-110-ssh-publickey-auth-local.sh" scp "$SCRIPT_DIR/recover-110-control-path-and-harbor-local.sh" "$HOST:/tmp/recover-110-control-path-and-harbor-local.sh" +scp "$ROOT_DIR/ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh" "$HOST:/tmp/check-awoooi-110-controlled-cd-lane-readiness.sh" # 2. 上傳 watchdog echo "[2/5] 上傳 harbor-watchdog..." @@ -32,6 +34,8 @@ ssh "$HOST" "sudo cp /tmp/awoooi-startup-110.sh /usr/local/bin/awoooi-startup-11 sudo chmod +x /usr/local/bin/repair-110-ssh-publickey-auth-local.sh && \ sudo cp /tmp/recover-110-control-path-and-harbor-local.sh /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \ sudo chmod +x /usr/local/bin/recover-110-control-path-and-harbor-local.sh && \ + sudo cp /tmp/check-awoooi-110-controlled-cd-lane-readiness.sh /usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh && \ + sudo chmod +x /usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh && \ sudo cp /tmp/awoooi-startup-110.service /etc/systemd/system/awoooi-startup-110.service && \ sudo systemctl daemon-reload && \ sudo systemctl enable awoooi-startup-110.service && \ diff --git a/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh b/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh index a4a6c97b..4e1293e8 100644 --- a/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh +++ b/scripts/reboot-recovery/recover-110-control-path-and-harbor-local.sh @@ -13,6 +13,7 @@ EXPECTED_HOST_IP="${AWOOOI_110_EXPECTED_HOST_IP:-192.168.0.110}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SSH_REPAIR_SCRIPT="${AWOOOI_110_SSH_REPAIR_SCRIPT:-}" HARBOR_WATCHDOG_SCRIPT="${AWOOOI_HARBOR_WATCHDOG_SCRIPT:-}" +CONTROLLED_LANE_VERIFIER_SCRIPT="${AWOOOI_110_CONTROLLED_LANE_VERIFIER_SCRIPT:-}" RELOAD_SSH="${RELOAD_SSH:-0}" usage() { @@ -20,7 +21,7 @@ usage() { Usage: recover-110-control-path-and-harbor-local.sh [--check|--apply-ssh-metadata|--repair-harbor-once|--apply-all] Modes: - --check Read-only checks for SSH metadata and Harbor readiness. + --check Read-only checks for SSH metadata, Harbor readiness, and controlled CD lane readiness. --apply-ssh-metadata Fix TARGET_USER home/.ssh/authorized_keys metadata only. --repair-harbor-once Run one bounded Harbor watchdog repair cycle only. --apply-all Apply SSH metadata repair, then one Harbor repair cycle. @@ -120,6 +121,22 @@ resolve_harbor_watchdog_script() { return 1 } +resolve_controlled_lane_verifier_script() { + if [ -n "$CONTROLLED_LANE_VERIFIER_SCRIPT" ] && [ -x "$CONTROLLED_LANE_VERIFIER_SCRIPT" ]; then + printf '%s\n' "$CONTROLLED_LANE_VERIFIER_SCRIPT" + return 0 + fi + if [ -x "/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh" ]; then + printf '%s\n' "/usr/local/bin/check-awoooi-110-controlled-cd-lane-readiness.sh" + return 0 + fi + if [ -x "$SCRIPT_DIR/../../ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh" ]; then + printf '%s\n' "$SCRIPT_DIR/../../ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh" + return 0 + fi + return 1 +} + run_ssh_check() { local script if ! script="$(resolve_ssh_repair_script)"; then @@ -158,9 +175,20 @@ run_harbor_repair_once() { "$script" --repair-once } +run_controlled_lane_check() { + local script + if ! script="$(resolve_controlled_lane_verifier_script)"; then + echo "CONTROLLED_LANE_VERIFIER_SCRIPT_STATUS=missing" + return 1 + fi + TARGET_HOST_IP="$EXPECTED_HOST_IP" "$script" +} + echo "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=${MODE} target_user=${TARGET_USER}" echo "expected_host_ip=${EXPECTED_HOST_IP}" echo "operation_boundary_secret_value_read=false" +echo "operation_boundary_runner_token_read=false" +echo "operation_boundary_raw_runner_registration_read=false" echo "operation_boundary_host_reboot_performed=false" echo "operation_boundary_docker_daemon_restart_performed=false" echo "operation_boundary_node_drain_performed=false" @@ -169,6 +197,7 @@ case "$MODE" in check) run_ssh_check || true run_harbor_check || true + run_controlled_lane_check || true ;; apply_ssh_metadata) run_ssh_apply @@ -179,6 +208,7 @@ case "$MODE" in apply_all) run_ssh_apply run_harbor_repair_once + run_controlled_lane_check || true ;; *) echo "Unknown internal mode: $MODE" >&2 diff --git a/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py b/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py index ebb9fc86..23e56183 100644 --- a/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py +++ b/scripts/reboot-recovery/tests/test_recover_110_control_path_and_harbor_local.py @@ -18,11 +18,15 @@ def test_recover_110_orchestrator_contracts() -> None: assert "--repair-harbor-once" in text assert "--apply-all" in text assert "operation_boundary_secret_value_read=false" in text + assert "operation_boundary_runner_token_read=false" in text + assert "operation_boundary_raw_runner_registration_read=false" in text assert "operation_boundary_host_reboot_performed=false" in text assert "operation_boundary_docker_daemon_restart_performed=false" in text assert "repair-110-ssh-publickey-auth-local.sh" in text assert "harbor-watchdog.sh" in text + assert "check-awoooi-110-controlled-cd-lane-readiness.sh" in text assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text + assert "cat \"$CD_LANE_DRAIN_DIR/data/.runner\"" not in text forbidden = [ "systemctl restart docker", @@ -41,6 +45,7 @@ def test_recover_110_orchestrator_contracts() -> None: def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> None: ssh_helper = tmp_path / "ssh-helper.sh" harbor_helper = tmp_path / "harbor-helper.sh" + lane_helper = tmp_path / "lane-helper.sh" ssh_helper.write_text( "#!/usr/bin/env bash\n" "echo SSH_HELPER_MODE=$1\n" @@ -53,7 +58,13 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N "echo HARBOR_RUNTIME_WRITE=false\n", encoding="utf-8", ) - for helper in (ssh_helper, harbor_helper): + lane_helper.write_text( + "#!/usr/bin/env bash\n" + "echo LANE_VERIFIER_MODE=check\n" + "echo LANE_VERIFIER_REGISTRATION_CONTENT_READ=false\n", + encoding="utf-8", + ) + for helper in (ssh_helper, harbor_helper, lane_helper): helper.chmod(helper.stat().st_mode | stat.S_IXUSR) env = { @@ -61,6 +72,7 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N "ALLOW_NON_110": "1", "AWOOOI_110_SSH_REPAIR_SCRIPT": str(ssh_helper), "AWOOOI_HARBOR_WATCHDOG_SCRIPT": str(harbor_helper), + "AWOOOI_110_CONTROLLED_LANE_VERIFIER_SCRIPT": str(lane_helper), } result = subprocess.run( ["bash", str(RECOVERY), "--check"], @@ -75,6 +87,9 @@ def test_recover_110_check_uses_fake_helpers_without_writes(tmp_path: Path) -> N assert "AWOOOI_110_CONTROL_PATH_AND_HARBOR_LOCAL_RECOVERY mode=check" in result.stdout assert "SSH_HELPER_MODE=--check" in result.stdout assert "HARBOR_HELPER_MODE=--check" in result.stdout + assert "LANE_VERIFIER_MODE=check" in result.stdout assert "SSH_METADATA_WRITE=false" in result.stdout assert "HARBOR_RUNTIME_WRITE=false" in result.stdout + assert "LANE_VERIFIER_REGISTRATION_CONTENT_READ=false" in result.stdout assert "operation_boundary_secret_value_read=false" in result.stdout + assert "operation_boundary_raw_runner_registration_read=false" in result.stdout