From 04fdf9d07ac41cc7021ca9a1c409a581bc92422c Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Jul 2026 11:10:36 +0800 Subject: [PATCH] fix(runner): preserve guarded active cd drain lane --- .../enforce-110-runner-failclosed.sh | 75 ++++++++++++++++--- .../test_cold_start_monitor_bounded_probes.py | 17 ++++- 2 files changed, 76 insertions(+), 16 deletions(-) diff --git a/scripts/reboot-recovery/enforce-110-runner-failclosed.sh b/scripts/reboot-recovery/enforce-110-runner-failclosed.sh index 4f1a17f4..42ce24b9 100755 --- a/scripts/reboot-recovery/enforce-110-runner-failclosed.sh +++ b/scripts/reboot-recovery/enforce-110-runner-failclosed.sh @@ -111,6 +111,16 @@ as_root() { fi } +bounded_chattr_recursive_clear() { + local path="$1" + [ -e "$path" ] || return 0 + if command -v timeout >/dev/null 2>&1; then + as_root timeout 5s chattr -R -i "$path" >/dev/null 2>&1 || true + return 0 + fi + as_root chattr -R -i "$path" >/dev/null 2>&1 || true +} + host_is_110() { if command -v ip >/dev/null 2>&1; then ip -o -4 addr show 2>/dev/null | awk '{print $4}' | grep -q '^192\.168\.0\.110/' @@ -255,6 +265,22 @@ controlled_drain_service_inactive() { [ "$unitfile" != "enabled" ] || return 1 } +controlled_drain_registration_present() { + [ -s "$CONTROLLED_DRAIN_REGISTRATION" ] +} + +controlled_drain_service_active_guarded() { + local load active unitfile mainpid + load="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p LoadState --value 2>/dev/null || true)" + active="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p ActiveState --value 2>/dev/null || true)" + unitfile="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p UnitFileState --value 2>/dev/null || true)" + mainpid="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p MainPID --value 2>/dev/null || true)" + [ "$load" = "loaded" ] || return 1 + [ "$active" = "active" ] || return 1 + [ "${mainpid:-0}" != "0" ] || return 1 + [ "$unitfile" != "masked" ] || return 1 +} + controlled_drain_staging_allowed() { controlled_drain_config_safe \ && controlled_drain_binary_safe \ @@ -262,6 +288,22 @@ controlled_drain_staging_allowed() { && controlled_drain_service_inactive } +controlled_drain_active_allowed() { + controlled_drain_config_safe \ + && controlled_drain_binary_safe \ + && controlled_drain_unit_safe \ + && controlled_drain_registration_present \ + && controlled_drain_service_active_guarded +} + +controlled_drain_preserve_allowed() { + controlled_drain_staging_allowed || controlled_drain_active_allowed +} + +lane_process_count_ok() { + [ "$(count_lane_processes)" = "0" ] || controlled_drain_active_allowed +} + list_action_runner_units() { { systemctl list-unit-files 'actions.runner.*' --no-legend --plain 2>/dev/null | awk '{print $1}' @@ -272,9 +314,11 @@ list_action_runner_units() { stop_and_mask_units() { local unit for unit in "${RUNNER_UNITS[@]}"; do - if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then + if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_preserve_allowed; then as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true - as_root systemctl disable "$unit" >/dev/null 2>&1 || true + if controlled_drain_staging_allowed; then + as_root systemctl disable "$unit" >/dev/null 2>&1 || true + fi continue fi as_root systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true @@ -301,7 +345,9 @@ stop_and_mask_action_runner_units() { kill_runner_processes() { pkill -KILL -f '^/home/wooo/awoooi-cd-lane/awoooi_cd_lane' >/dev/null 2>&1 || true - pkill -KILL -f '^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled' >/dev/null 2>&1 || true + if ! controlled_drain_active_allowed; then + pkill -KILL -f '^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled' >/dev/null 2>&1 || true + fi pkill -KILL -f '^/home/wooo/act-runner/act_runner' >/dev/null 2>&1 || true pkill -KILL -f '^/home/wooo/act-runner-controlled/act_runner' >/dev/null 2>&1 || true pkill -KILL -f '^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner' >/dev/null 2>&1 || true @@ -348,7 +394,7 @@ seal_lane_binary_restore_sources() { local path while IFS= read -r -d '' path; do [ -e "$path" ] || continue - if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then + if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_preserve_allowed; then continue fi write_failclosed_stub "$path" @@ -367,7 +413,7 @@ quarantine_lane_registration_sources() { local target for lane_dir in "/home/wooo/awoooi-cd-lane" "/home/wooo/awoooi-cd-lane-drain"; do [ -d "$lane_dir" ] || continue - if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then + if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_preserve_allowed; then continue fi quarantine_dir="$lane_dir/quarantine-failclosed-${STAMP}" @@ -393,7 +439,7 @@ quarantine_lane_registration_sources() { seal_live_binary_paths() { local path for path in "${LIVE_BINARY_PATHS[@]}"; do - if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then + if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_preserve_allowed; then continue fi write_failclosed_stub "$path" @@ -631,7 +677,7 @@ seal_enforcer_disabler_artifacts() { while IFS= read -r -d '' path; do [ -e "$path" ] || [ -L "$path" ] || continue as_root mkdir -p "$target_root" >/dev/null 2>&1 || true - as_root chattr -R -i "$path" >/dev/null 2>&1 || true + bounded_chattr_recursive_clear "$path" as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true done < <( as_root find /etc/systemd/system -maxdepth 1 -type d \( \ @@ -740,7 +786,7 @@ seal_root_restore_sources() { as_root mkdir -p "$target_root" >/dev/null 2>&1 || true moved=1 fi - as_root chattr -R -i "$path" >/dev/null 2>&1 || true + bounded_chattr_recursive_clear "$path" as_root mv "$path" "$target_root/" >/dev/null 2>&1 || true done < <( as_root find /root -maxdepth 1 -type d \( \ @@ -772,7 +818,7 @@ EOF while IFS= read -r -d '' path; do [ -d "$path" ] || continue as_root mkdir -p "$target_root" >/dev/null 2>&1 || true - as_root chattr -R -i "$path" >/dev/null 2>&1 || true + bounded_chattr_recursive_clear "$path" as_root mv "$path" "$target_root/$(basename "$path").sealed" >/dev/null 2>&1 || true as_root mkdir -p "$path" >/dev/null 2>&1 || true if [ -f "$current" ]; then @@ -805,7 +851,7 @@ mask_unit_file_to_devnull() { seal_lane_unit_files() { mask_unit_file_to_devnull "awoooi-cd-lane.service" - if controlled_drain_staging_allowed; then + if controlled_drain_preserve_allowed; then return 0 fi mask_unit_file_to_devnull "$CONTROLLED_DRAIN_UNIT" @@ -822,7 +868,7 @@ root_restore_sources_left() { unit_ok() { local unit="$1" local load active unitfile mainpid - if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then + if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_preserve_allowed; then return 0 fi load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)" @@ -877,6 +923,9 @@ awoooi_runner_failclosed_enforcer_apply_performed $APPLY_PERFORMED # HELP awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed Controlled drain lane non-secret guardrail staging allowance. # TYPE awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed gauge awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed $(controlled_drain_staging_allowed && echo 1 || echo 0) +# HELP awoooi_runner_failclosed_enforcer_controlled_drain_active_allowed Controlled drain lane guarded active allowance. +# TYPE awoooi_runner_failclosed_enforcer_controlled_drain_active_allowed gauge +awoooi_runner_failclosed_enforcer_controlled_drain_active_allowed $(controlled_drain_active_allowed && echo 1 || echo 0) EOF as_root install -o root -g root -m 0644 "$tmp" "$dir/awoooi_runner_failclosed_enforcer.prom" >/dev/null 2>&1 || true rm -f "$tmp" @@ -892,6 +941,8 @@ print_readback() { echo "RUNNER_PROCESS_COUNT=$(count_runner_processes)" echo "ROOT_RESTORE_SOURCES_LEFT=$(root_restore_sources_left)" echo "CONTROLLED_DRAIN_STAGING_ALLOWED=$(controlled_drain_staging_allowed && echo 1 || echo 0)" + echo "CONTROLLED_DRAIN_ACTIVE_ALLOWED=$(controlled_drain_active_allowed && echo 1 || echo 0)" + echo "CONTROLLED_DRAIN_PRESERVE_ALLOWED=$(controlled_drain_preserve_allowed && echo 1 || echo 0)" echo "RUNNER_UNITS_BAD_COUNT=$(runner_units_bad_count)" for unit in "${RUNNER_UNITS[@]}"; do load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)" @@ -952,7 +1003,7 @@ write_metrics "/home/wooo/node_exporter_textfiles" print_readback if [ "$(count_active_job_containers)" = "0" ] \ - && [ "$(count_lane_processes)" = "0" ] \ + && lane_process_count_ok \ && [ "$(count_runner_processes)" = "0" ] \ && [ "$(root_restore_sources_left)" = "0" ] \ && [ "$(runner_units_bad_count)" = "0" ]; then diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index c2a1eb93..e829af36 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -226,14 +226,21 @@ def test_runner_failclosed_enforcer_does_not_seal_live_startup_recovery_script() assert "awoooi-startup-110.sh.*controlled*" in text -def test_runner_failclosed_enforcer_preserves_controlled_drain_staging_only() -> None: +def test_runner_failclosed_enforcer_preserves_controlled_drain_staging_or_guarded_active() -> None: text = FAILCLOSED_ENFORCER.read_text(encoding="utf-8") assert "controlled_drain_staging_allowed()" in text + assert "controlled_drain_active_allowed()" in text + assert "controlled_drain_preserve_allowed()" in text assert "controlled_drain_config_safe" in text assert "controlled_drain_binary_safe" in text assert "controlled_drain_unit_safe" in text assert "controlled_drain_service_inactive" in text + assert "controlled_drain_registration_present" in text + assert "controlled_drain_service_active_guarded" in text + assert 'lane_process_count_ok \\' in text + assert "bounded_chattr_recursive_clear()" in text + assert "as_root timeout 5s chattr -R -i" in text assert "awoooi-host:host" in text assert ( "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" @@ -245,10 +252,12 @@ def test_runner_failclosed_enforcer_preserves_controlled_drain_staging_only() -> assert 'grep -Eq \'^[[:space:]]*MemoryAccounting=true\'' in text assert 'grep -Eq \'^[[:space:]]*TasksAccounting=true\'' in text assert '[ "$unitfile" != "enabled" ] || return 1' in text - assert 'if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then' in text - assert 'if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then' in text - assert 'if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then' in text + assert 'if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_preserve_allowed; then' in text + assert 'if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_preserve_allowed; then' in text + assert 'if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_preserve_allowed; then' in text assert "CONTROLLED_DRAIN_STAGING_ALLOWED=" in text + assert "CONTROLLED_DRAIN_ACTIVE_ALLOWED=" in text + assert "CONTROLLED_DRAIN_PRESERVE_ALLOWED=" in text def test_controlled_cd_lane_unit_source_has_required_accounting_guardrails() -> None: