fix(reboot): fail closed direct cd lane pressure path [skip ci]
This commit is contained in:
@@ -195,6 +195,7 @@ RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
|
||||
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
|
||||
START_GITEA_RUNNER_ALLOWED=0
|
||||
RUNNER_FAIL_CLOSED_SERVICES=(
|
||||
"awoooi-cd-lane.service"
|
||||
"awoooi-direct-runner-open.service"
|
||||
"awoooi-direct-runner.service"
|
||||
"gitea-act-runner-host.service"
|
||||
@@ -203,6 +204,7 @@ RUNNER_FAIL_CLOSED_SERVICES=(
|
||||
"gitea-act-runner-awoooi-open.service"
|
||||
)
|
||||
RUNNER_FAIL_CLOSED_BINARY_PATHS=(
|
||||
"/home/wooo/awoooi-cd-lane/awoooi_cd_lane"
|
||||
"/home/wooo/act-runner/act_runner"
|
||||
"/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
|
||||
"/home/wooo/act-runner-controlled/act_runner"
|
||||
@@ -264,6 +266,33 @@ EOF
|
||||
chattr +i "$path" >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
#######################################
# Replace the awoooi-cd-lane systemd unit with a fail-closed stub.
#
# If a unit file (or symlink) already exists and does not contain the
# fail-closed marker text, it is moved aside under a timestamped
# ".quarantined-runner-incident-<stamp>" name. A stub unit whose ExecStart is
# /bin/false and which is conditioned on /run/awoooi-cd-lane-enabled is then
# installed as root:root mode 0444 and flagged immutable with chattr.
#
# Every step is best-effort: no failure aborts the caller. Unlike a silent
# swallow, a failure to stage or install the stub is reported on stderr so
# that a missing fail-closed unit can be noticed.
#
# Globals:   none
# Arguments: none
# Outputs:   warnings on stderr when the stub cannot be staged or installed
# Returns:   0 always
#######################################
install_cd_lane_fail_closed_unit() {
  local unit_file="/etc/systemd/system/awoooi-cd-lane.service"
  local marker="AWOOOI direct CD lane fail-closed"
  local tmp
  local quarantine_stamp

  if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
    # The immutable flag has to go first, or neither mv nor install can
    # touch the file.
    chattr -i "$unit_file" >/dev/null 2>&1 || true
    if ! grep -qF -- "$marker" "$unit_file" 2>/dev/null; then
      # Only a unit that is not already our stub gets quarantined.
      quarantine_stamp="$(date +%Y%m%d%H%M%S)"
      mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true
    fi
  fi

  # Stage the stub only when a temp file really exists; otherwise the
  # redirect and install would operate on an empty path.
  if tmp="$(mktemp)" && [ -n "$tmp" ]; then
    cat >"$tmp" <<'EOF'
[Unit]
Description=AWOOOI direct CD lane fail-closed after 2026-06-28 pressure incident
ConditionPathExists=/run/awoooi-cd-lane-enabled

[Service]
Type=oneshot
ExecStart=/bin/false
EOF
    if ! install -o root -g root -m 0444 "$tmp" "$unit_file" >/dev/null 2>&1; then
      printf 'warn: could not install fail-closed unit %s\n' "$unit_file" >&2
    fi
    rm -f -- "$tmp"
  else
    printf 'warn: mktemp failed; fail-closed unit %s not installed\n' "$unit_file" >&2
  fi

  # Re-apply the immutable flag to whatever is at the unit path now.
  chattr +i "$unit_file" >/dev/null 2>&1 || true
  return 0
}
|
||||
|
||||
ensure_host_runner_fail_closed() {
|
||||
local unit
|
||||
local binary
|
||||
@@ -273,8 +302,12 @@ ensure_host_runner_fail_closed() {
|
||||
systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
|
||||
systemctl reset-failed "$unit" >/dev/null 2>&1 || true
|
||||
systemctl disable "$unit" >/dev/null 2>&1 || true
|
||||
systemctl mask "$unit" >/dev/null 2>&1 || mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
if [ "$unit" = "awoooi-cd-lane.service" ]; then
|
||||
install_cd_lane_fail_closed_unit
|
||||
else
|
||||
systemctl mask "$unit" >/dev/null 2>&1 || mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
fi
|
||||
done
|
||||
systemctl daemon-reload >/dev/null 2>&1 || true
|
||||
|
||||
@@ -289,6 +322,7 @@ ensure_host_runner_fail_closed() {
|
||||
fi
|
||||
|
||||
pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
|
||||
pkill -KILL -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane daemon" >/dev/null 2>&1 || true
|
||||
for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
|
||||
guard_runner_binary_fail_closed "$binary"
|
||||
done
|
||||
|
||||
@@ -286,16 +286,24 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
|
||||
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
|
||||
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
|
||||
done
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
for u in awoooi-cd-lane.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid"
|
||||
execstart=$(systemctl show "$u" -p ExecStart --value 2>/dev/null || true)
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
if [ "$u" = "awoooi-cd-lane.service" ] && [ "$active" = "inactive" ] && echo "$execstart" | grep -q "/bin/false"; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
|
||||
done
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
|
||||
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
kind=$(file -b "$p" 2>/dev/null || echo missing)
|
||||
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
|
||||
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
|
||||
@@ -323,12 +331,12 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -120
|
||||
else
|
||||
warn "runner watchdog state not confirmed"
|
||||
fi
|
||||
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && ($3 != "load=masked" || $4 != "unitfile=masked") {bad=1} END {exit bad}' <<<"$out"; then
|
||||
ok "110 direct/Gitea runner fail-closed units are masked"
|
||||
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' <<<"$out"; then
|
||||
ok "110 direct runner/CD lane units are fail-closed"
|
||||
else
|
||||
fail "110 direct/Gitea runner fail-closed units are not all masked"
|
||||
fail "110 direct runner/CD lane units are not fail-closed"
|
||||
fi
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 direct runner process count is zero" || fail "110 direct runner process detected"
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" <<<"$out" && ok "110 direct runner/CD lane process count is zero" || fail "110 direct runner/CD lane process detected"
|
||||
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" <<<"$out" && fail "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
|
||||
grep -q "sentry-self-hosted-clickhouse-1.*Restarting" <<<"$out" && warn "Sentry ClickHouse restarting" || ok "Sentry ClickHouse not visibly restarting"
|
||||
}
|
||||
|
||||
@@ -306,17 +306,25 @@ check_runner_guardrails() {
|
||||
local out bad
|
||||
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
|
||||
bad=0
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
for u in awoooi-cd-lane.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active"
|
||||
[ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ] || bad=1
|
||||
execstart=$(systemctl show "$u" -p ExecStart --value 2>/dev/null || true)
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
if [ "$u" = "awoooi-cd-lane.service" ] && [ "$active" = "inactive" ] && echo "$execstart" | grep -q "/bin/false"; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active ok=$unit_ok"
|
||||
[ "$unit_ok" = "1" ] || bad=1
|
||||
done
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
|
||||
[ "$direct_runner_count" = "0" ] || bad=1
|
||||
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
kind=$(file -b "$p" 2>/dev/null || echo missing)
|
||||
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
|
||||
echo "$kind" | grep -qi "ELF" && bad=1
|
||||
@@ -338,7 +346,7 @@ echo "BAD_RUNNER_GUARDRAILS $bad"
|
||||
return
|
||||
fi
|
||||
echo "$out"
|
||||
grep -q "BAD_RUNNER_GUARDRAILS 0" <<<"$out" && ok "all discovered runner units have watchdog disabled and CPU/memory limits" || blocked "runner guardrails incomplete"
|
||||
grep -q "BAD_RUNNER_GUARDRAILS 0" <<<"$out" && ok "runner/CD lane fail-closed guardrails complete" || blocked "runner/CD lane guardrails incomplete"
|
||||
}
|
||||
|
||||
check_job_containers() {
|
||||
|
||||
@@ -538,16 +538,24 @@ fi
|
||||
section "110 runner fail-closed guard"
|
||||
runner_tmp="$(mktemp -t post-start-runner.XXXXXX)"
|
||||
if ssh_read "wooo@192.168.0.110" '
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
for u in awoooi-cd-lane.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid"
|
||||
execstart=$(systemctl show "$u" -p ExecStart --value 2>/dev/null || true)
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
if [ "$u" = "awoooi-cd-lane.service" ] && [ "$active" = "inactive" ] && echo "$execstart" | grep -q "/bin/false"; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
|
||||
done
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
|
||||
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
for p in /home/wooo/awoooi-cd-lane/awoooi_cd_lane /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
kind=$(file -b "$p" 2>/dev/null || echo missing)
|
||||
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
|
||||
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
|
||||
@@ -560,12 +568,12 @@ else
|
||||
blocked "110 runner fail-closed readback failed"
|
||||
fi
|
||||
cat "$runner_tmp"
|
||||
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && ($3 != "load=masked" || $4 != "unitfile=masked") {bad=1} END {exit bad}' "$runner_tmp"; then
|
||||
ok "110 direct/Gitea runner fail-closed units are masked"
|
||||
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" && $NF != "ok=1" {bad=1} END {exit bad}' "$runner_tmp"; then
|
||||
ok "110 direct runner/CD lane units are fail-closed"
|
||||
else
|
||||
blocked "110 direct/Gitea runner fail-closed units are not all masked"
|
||||
blocked "110 direct runner/CD lane units are not fail-closed"
|
||||
fi
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 direct runner process count is zero" || blocked "110 direct runner process detected"
|
||||
grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp" && ok "110 direct runner/CD lane process count is zero" || blocked "110 direct runner/CD lane process detected"
|
||||
grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp" && blocked "110 runner fail-closed binary path restored to ELF" || ok "110 runner binary paths are fail-closed stubs or missing"
|
||||
grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp" && ok "110 host pressure gate returned 0" || blocked "110 host pressure gate is blocking"
|
||||
rm -f "$runner_tmp"
|
||||
|
||||
Reference in New Issue
Block a user