From b7b475983fa47450dc284838d354c29a95e0c96b Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 28 Jun 2026 10:42:35 +0800 Subject: [PATCH] fix(runner): accept controlled cd lane drain readback --- docs/LOGBOOK.md | 20 +++++++++ scripts/reboot-recovery/awoooi-startup-110.sh | 44 ++++++++++++++++--- .../full-stack-cold-start-check.sh | 34 +++++++++++++- .../p3-controlled-release-gate.sh | 33 +++++++++++++- .../reboot-recovery/post-start-quick-check.sh | 34 +++++++++++++- 5 files changed, 156 insertions(+), 9 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 3c98d604..ad7554e4 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -48384,3 +48384,23 @@ production browser smoke: **仍未做**: - 尚未推 main;等待本修正進 feature、再於 main CD idle 時 normal push。 + +## 2026-06-28 — 10:45 controlled cd-lane drain source verifier 補齊 + +**時間與來源**: +- 2026-06-28 10:33-10:45 Asia/Taipei。 +- 來源:110 live `awoooi-cd-lane-drain.service` readback、feature branch `codex/source-controlled-cd-lane-drain-20260628`。 + +**完成內容**: +- `scripts/reboot-recovery/awoooi-startup-110.sh` 新增 drain lane verifier:當 `/home/wooo/awoooi-cd-lane-drain` 符合 `capacity=1`、AWOOOI 專用 labels、binary 為 ELF、systemd active 時,保留 drain lane 並只 fail-close regular `awoooi-cd-lane.service`。 +- `full-stack-cold-start-check.sh`、`post-start-quick-check.sh`、`p3-controlled-release-gate.sh` 新增 `CD_LANE_DRAIN_CONTROLLED` 與 `CD_LANE_GUARDRAILS_OK` readback,接受 regular controlled/failclosed 或 drain controlled-open 任一通過。 +- legacy direct/Gitea runner fail-closed、direct runner process count、runner binary stub 檢查維持原樣;未恢復泛用 runner。 + +**本地與 live 驗證結果**: +- `bash -n scripts/reboot-recovery/awoooi-startup-110.sh scripts/reboot-recovery/full-stack-cold-start-check.sh scripts/reboot-recovery/post-start-quick-check.sh scripts/reboot-recovery/p3-controlled-release-gate.sh`:通過。 +- `git diff --check`:通過。 +- 110 live readback:`CD_LANE_DRAIN_CONTROLLED mode=controlled_open load=loaded unitfile=enabled active=active capacity=1 labels=1 binary_elf=1 ok=1`、`CD_LANE_GUARDRAILS_OK 1`。 + +**仍維持**: +- regular `awoooi-cd-lane.service` masked/inactive;legacy direct runner units fail-closed。 +- 不讀 `.runner`、SQLite、raw session、auth、`.env`;只驗 systemd、capacity/labels 與 binary kind。 diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 73162107..13bb5966 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -198,6 +198,10 @@ CD_LANE_DIR="/home/wooo/awoooi-cd-lane" CD_LANE_SERVICE="awoooi-cd-lane.service" CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane" CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml" +CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain" +CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service" +CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled" +CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml" CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled" START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}" START_CD_LANE_ALLOWED=0 @@ -300,18 +304,46 @@ EOF chattr +i "$unit_file" >/dev/null 2>&1 || true } -cd_lane_config_is_controlled() { - [ -f "$CD_LANE_CONFIG" ] || return 1 - grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$CD_LANE_CONFIG" || return 1 - grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$CD_LANE_CONFIG" || return 1 - grep -q 'awoooi-host:host' "$CD_LANE_CONFIG" || return 1 - if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$CD_LANE_CONFIG"; then +cd_lane_config_path_is_controlled() { + local config_path="$1" + [ -f "$config_path" ] || return 1 + grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1 + grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1 + grep -q 'awoooi-host:host' "$config_path" || return 1 + if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then return 1 fi return 0 } +cd_lane_config_is_controlled() { + cd_lane_config_path_is_controlled "$CD_LANE_CONFIG" +} + +cd_lane_drain_config_is_controlled() { + cd_lane_config_path_is_controlled "$CD_LANE_DRAIN_CONFIG" +} + +cd_lane_drain_is_controlled_open() { + local active + active="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)" + [ "$active" = "active" ] || return 1 + cd_lane_drain_config_is_controlled || return 1 + file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1 + return 0 +} + ensure_cd_lane_fail_closed() { + if cd_lane_drain_is_controlled_open; then + log "✅ controlled cd-lane drain verifier passed; preserving drain lane and fail-closing regular lane only" + systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true + install_cd_lane_fail_closed_unit + pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true + systemctl daemon-reload >/dev/null 2>&1 || true + return 0 + fi if { [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ -e "/run/awoooi-cd-lane-controlled-open" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; } \ && cd_lane_config_is_controlled \ && file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 9fa0eebf..1237a516 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -327,6 +327,38 @@ elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && cd_lane_mode=controlled_open fi echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok" +cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) +cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) +cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) +cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true) +cd_lane_drain_capacity_ok=0 +cd_lane_drain_labels_ok=0 +if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_capacity_ok=1 +fi +if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_labels_ok=1 +fi +cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing) +cd_lane_drain_binary_elf=0 +echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1 +cd_lane_drain_ok=0 +cd_lane_drain_mode=absent +if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then + cd_lane_drain_mode=blocked +fi +if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open +fi +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok" +cd_lane_guard_ok=0 +if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then + cd_lane_guard_ok=1 +fi +echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do @@ -362,7 +394,7 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120 else fail "110 legacy direct/Gitea runner units are not fail-closed" fi - grep -q "CD_LANE_CONTROLLED .*ok=1" <<<"$out" && ok "110 controlled cd-lane is safe or fail-closed" || fail "110 controlled cd-lane is neither safe-open nor fail-closed" + grep -q "CD_LANE_GUARDRAILS_OK 1" <<<"$out" && ok "110 controlled cd-lane is safe, drained, or fail-closed" || fail "110 controlled cd-lane is neither safe-open/drained nor fail-closed" grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 legacy direct runner process count is zero" || fail "110 legacy direct runner process detected" grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out" && fail "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting" diff --git a/scripts/reboot-recovery/p3-controlled-release-gate.sh b/scripts/reboot-recovery/p3-controlled-release-gate.sh index 67a40e3f..68a25911 100755 --- a/scripts/reboot-recovery/p3-controlled-release-gate.sh +++ b/scripts/reboot-recovery/p3-controlled-release-gate.sh @@ -346,7 +346,38 @@ elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && cd_lane_mode=controlled_open fi echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok" -[ "$cd_lane_ok" = "1" ] || bad=1 +cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) +cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) +cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) +cd_lane_drain_capacity_ok=0 +cd_lane_drain_labels_ok=0 +if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_capacity_ok=1 +fi +if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_labels_ok=1 +fi +cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing) +cd_lane_drain_binary_elf=0 +echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1 +cd_lane_drain_ok=0 +cd_lane_drain_mode=absent +if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then + cd_lane_drain_mode=blocked +fi +if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open +fi +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok" +cd_lane_guard_ok=0 +if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then + cd_lane_guard_ok=1 +fi +echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" +[ "$cd_lane_guard_ok" = "1" ] || bad=1 direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" [ "$direct_runner_count" = "0" ] || bad=1 diff --git a/scripts/reboot-recovery/post-start-quick-check.sh b/scripts/reboot-recovery/post-start-quick-check.sh index b93dbdcf..332e3eb5 100755 --- a/scripts/reboot-recovery/post-start-quick-check.sh +++ b/scripts/reboot-recovery/post-start-quick-check.sh @@ -579,6 +579,38 @@ elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && cd_lane_mode=controlled_open fi echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok" +cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) +cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) +cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) +cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true) +cd_lane_drain_capacity_ok=0 +cd_lane_drain_labels_ok=0 +if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_capacity_ok=1 +fi +if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \ + && ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then + cd_lane_drain_labels_ok=1 +fi +cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing) +cd_lane_drain_binary_elf=0 +echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1 +cd_lane_drain_ok=0 +cd_lane_drain_mode=absent +if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then + cd_lane_drain_mode=blocked +fi +if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then + cd_lane_drain_ok=1 + cd_lane_drain_mode=controlled_open +fi +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok" +cd_lane_guard_ok=0 +if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then + cd_lane_guard_ok=1 +fi +echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do @@ -599,7 +631,7 @@ if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' else blocked "110 legacy direct/Gitea runner units are not fail-closed" fi -grep -q "CD_LANE_CONTROLLED .*ok=1" "$runner_tmp" && ok "110 controlled cd-lane is safe or fail-closed" || blocked "110 controlled cd-lane is neither safe-open nor fail-closed" +grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp" && ok "110 controlled cd-lane is safe, drained, or fail-closed" || blocked "110 controlled cd-lane is neither safe-open/drained nor fail-closed" grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 legacy direct runner process count is zero" || blocked "110 legacy direct runner process detected" grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing" grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking"