fix(ops): preserve controlled drain lane staging
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-02 01:20:35 +08:00
parent fe5bc42210
commit c1823b5f62
5 changed files with 233 additions and 4 deletions

View File

@@ -96,6 +96,13 @@ LIVE_BINARY_PATHS=(
"/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
)
# Controlled drain lane settings. Every value can be overridden from the
# environment; the default after ":-" applies when the variable is unset or
# empty.
# Name of the systemd unit for the controlled drain lane.
CONTROLLED_DRAIN_UNIT="${CONTROLLED_DRAIN_UNIT:-awoooi-cd-lane-drain.service}"
# Base directory from which the binary, config and registration defaults are derived.
CONTROLLED_DRAIN_DIR="${CONTROLLED_DRAIN_DIR:-/home/wooo/awoooi-cd-lane-drain}"
# Path of the drain lane executable.
CONTROLLED_DRAIN_BINARY="${CONTROLLED_DRAIN_BINARY:-$CONTROLLED_DRAIN_DIR/awoooi_cd_lane_controlled}"
# Path of the drain lane YAML configuration.
CONTROLLED_DRAIN_CONFIG="${CONTROLLED_DRAIN_CONFIG:-$CONTROLLED_DRAIN_DIR/config.yaml}"
# Path of the registration file (data/.runner under the base directory by default).
CONTROLLED_DRAIN_REGISTRATION="${CONTROLLED_DRAIN_REGISTRATION:-$CONTROLLED_DRAIN_DIR/data/.runner}"
# Upper bound for the configured runner capacity.
CONTROLLED_DRAIN_MAX_CAPACITY="${CONTROLLED_DRAIN_MAX_CAPACITY:-1}"
as_root() {
if [ "${EUID:-$(id -u)}" -eq 0 ]; then
"$@"
@@ -137,6 +144,124 @@ count_runner_processes() {
pgrep -f '^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner|Runner.Listener|Runner.Worker' 2>/dev/null | wc -l | tr -d ' '
}
#######################################
# Print the `capacity:` value found inside the top-level `runner:` mapping
# of a runner YAML config.
# Arguments: $1 - path to the config file
# Outputs:   the first capacity value on stdout, with quotes, a trailing
#            inline comment, and trailing whitespace (including a CR from
#            CRLF files) removed; prints nothing when no value is found
# Returns:   the exit status of awk
#######################################
extract_runner_capacity() {
  local config_path="$1"
  awk '
    # A full-line comment at column 0 does not close the runner mapping.
    /^#/ {
      next
    }
    /^runner:[[:space:]]*$/ {
      in_runner = 1
      next
    }
    # Any other non-indented line starts a different top-level key.
    in_runner && /^[^[:space:]]/ {
      in_runner = 0
    }
    in_runner && /^[[:space:]]*capacity:[[:space:]]*/ {
      line = $0
      sub(/^[[:space:]]*capacity:[[:space:]]*/, "", line)
      # Drop an inline comment (whitespace followed by a hash) before unquoting.
      sub(/[[:space:]]+#.*$/, "", line)
      gsub(/["'\'']/, "", line)
      sub(/[[:space:]]+$/, "", line)
      print line
      exit
    }
  ' "$config_path"
}
#######################################
# Print the items of the `labels:` list in a runner YAML config, one per
# line.
# A list starts at a line consisting of `labels:` (any indentation) and ends
# at the first line that is neither a list item, a blank line, nor a
# full-line comment, so list items belonging to a following key are not
# reported as labels.
# Items wrapped in double or single quotes are printed without the quotes;
# unquoted items have a trailing inline comment and trailing whitespace
# removed.
# Arguments: $1 - path to the config file
# Outputs:   one label per line on stdout
# Returns:   the exit status of awk
#######################################
extract_runner_labels() {
  local config_path="$1"
  awk '
    /^[[:space:]]*labels:[[:space:]]*$/ {
      in_labels = 1
      next
    }
    !in_labels {
      next
    }
    # Blank lines and full-line comments may sit between list items.
    /^[[:space:]]*$/ || /^[[:space:]]*#/ {
      next
    }
    /^[[:space:]]*-[[:space:]]*/ {
      line = $0
      sub(/^[[:space:]]*-[[:space:]]*/, "", line)
      quote = substr(line, 1, 1)
      if (quote == "\"" || quote == "\047") {
        # Quoted scalar: keep the text up to the matching closing quote.
        rest = substr(line, 2)
        close_at = index(rest, quote)
        if (close_at > 0) {
          line = substr(rest, 1, close_at - 1)
        }
      } else {
        sub(/[[:space:]]+#.*$/, "", line)
        sub(/[[:space:]]+$/, "", line)
      }
      print line
      next
    }
    # Anything else is a different key, so the list is over.
    {
      in_labels = 0
    }
  ' "$config_path"
}
# Print the name part of a runner label: everything before the first ":".
# A label without ":" is printed unchanged. No trailing newline is written.
# Arguments: $1 - label in "name" or "name:backend" form
label_name() {
  local full_label="$1"
  printf '%s' "${full_label%%:*}"
}
# Check that the controlled drain runner config stays inside the staging
# guardrails.
# Globals: CONTROLLED_DRAIN_CONFIG (read), CONTROLLED_DRAIN_MAX_CAPACITY (read)
# Returns: 0 only when the config is readable, the runner capacity is a
#          digits-only value not greater than the maximum, and the label
#          list contains both pinned labels and no other label; non-zero
#          otherwise.
controlled_drain_config_safe() {
local capacity labels label name has_host=0 has_ubuntu=0
[ -r "$CONTROLLED_DRAIN_CONFIG" ] || return 1
# Only the first line printed by the extractor is considered.
capacity="$(extract_runner_capacity "$CONTROLLED_DRAIN_CONFIG" | head -1)"
# Reject an empty or non-digit capacity before the numeric comparison.
printf '%s' "${capacity:-}" | grep -Eq '^[0-9]+$' || return 1
[ "$capacity" -le "$CONTROLLED_DRAIN_MAX_CAPACITY" ] || return 1
# An extractor failure is tolerated here; an empty result is rejected next.
labels="$(extract_runner_labels "$CONTROLLED_DRAIN_CONFIG" || true)"
[ -n "$labels" ] || return 1
while IFS= read -r label; do
[ -n "$label" ] || continue
name="$(label_name "$label")"
case "$name" in
awoooi-host)
# The label must match exactly, not just by name.
[ "$label" = "awoooi-host:host" ] || return 1
has_host=1
;;
awoooi-ubuntu)
# The label must match this exact image reference.
[ "$label" = "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" ] || return 1
has_ubuntu=1
;;
# NOTE(review): this arm has the same effect as the catch-all below; it
# spells out label names that must never be accepted.
ubuntu-latest|ubuntu-*|self-hosted|stockplatform*|stock-platform*|headless*|playwright*)
return 1
;;
# Any label name not handled above is rejected.
*)
return 1
;;
esac
done <<<"$labels"
# Both pinned labels must have been seen.
[ "$has_host" -eq 1 ] && [ "$has_ubuntu" -eq 1 ]
}
#######################################
# Check that the controlled drain binary is a regular, executable ELF file.
# The `file -b` description must begin with "ELF ". A match anywhere in the
# description is not enough, because descriptions such as
# "symbolic link to /x/self-stub.sh" or "... self-extracting archive"
# contain the letters "elf" without being ELF binaries.
# NOTE(review): descriptions that `file` prefixes with another word before
# "ELF" (for example for setuid binaries) are rejected by this check.
# Globals: CONTROLLED_DRAIN_BINARY (read)
# Returns: 0 when the binary looks like an ELF executable, 1 otherwise
#######################################
controlled_drain_binary_safe() {
  local kind
  [ -f "$CONTROLLED_DRAIN_BINARY" ] || return 1
  [ -x "$CONTROLLED_DRAIN_BINARY" ] || return 1
  # A missing or failing `file` command is treated as "not safe".
  kind="$(file -b -- "$CONTROLLED_DRAIN_BINARY" 2>/dev/null)" || return 1
  case "$kind" in
    ELF\ *) return 0 ;;
    *) return 1 ;;
  esac
}
# Check that the controlled drain unit definition carries the required
# guardrail directives.
# Globals: CONTROLLED_DRAIN_UNIT, CONTROLLED_DRAIN_REGISTRATION,
#          CONTROLLED_DRAIN_BINARY, CONTROLLED_DRAIN_CONFIG (all read)
# Returns: 0 when every check below matches the text printed by
#          `systemctl cat`, 1 when the text is empty or any check fails.
controlled_drain_unit_safe() {
local text
# Errors from systemctl are swallowed; an empty result is rejected below.
text="$(systemctl cat "$CONTROLLED_DRAIN_UNIT" 2>/dev/null || true)"
[ -n "$text" ] || return 1
# The next two checks are fixed-string matches anywhere in the text (not
# anchored to the start of a line).
grep -Fq -- "ConditionPathExists=$CONTROLLED_DRAIN_REGISTRATION" <<<"$text" || return 1
grep -Fq -- "$CONTROLLED_DRAIN_BINARY daemon --config $CONTROLLED_DRAIN_CONFIG" <<<"$text" || return 1
# The remaining checks require the directive at the start of a line,
# optionally preceded by whitespace.
grep -Eq '^[[:space:]]*CPUAccounting=true' <<<"$text" || return 1
grep -Eq '^[[:space:]]*CPUQuota=' <<<"$text" || return 1
grep -Eq '^[[:space:]]*MemoryAccounting=true' <<<"$text" || return 1
# Either MemoryHigh= or MemoryMax= satisfies this check.
grep -Eq '^[[:space:]]*Memory(High|Max)=' <<<"$text" || return 1
grep -Eq '^[[:space:]]*TasksAccounting=true' <<<"$text" || return 1
grep -Eq '^[[:space:]]*TasksMax=' <<<"$text" || return 1
grep -Eq '^[[:space:]]*NoNewPrivileges=true' <<<"$text" || return 1
}
# Check that the controlled drain unit is not running, not masked and not
# enabled.
# Globals: CONTROLLED_DRAIN_UNIT (read)
# Returns: 0 when all guards below pass, 1 otherwise.
controlled_drain_service_inactive() {
local load active unitfile mainpid
# Each property is read separately; a failing systemctl call yields an
# empty value instead of an error.
load="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p LoadState --value 2>/dev/null || true)"
active="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p ActiveState --value 2>/dev/null || true)"
unitfile="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p UnitFileState --value 2>/dev/null || true)"
mainpid="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p MainPID --value 2>/dev/null || true)"
# Accepted states: inactive, failed, unknown, or no value at all.
{ [ "$active" = "inactive" ] || [ "$active" = "failed" ] || [ "$active" = "unknown" ] || [ -z "$active" ]; } || return 1
# An empty MainPID is treated as 0 (no main process).
[ "${mainpid:-0}" = "0" ] || return 1
[ "$load" != "masked" ] || return 1
[ "$unitfile" != "masked" ] || return 1
[ "$unitfile" != "enabled" ] || return 1
}
# Report whether the controlled drain lane may be left staged.
# Runs the config, binary, unit and service-state checks in that order and
# stops at the first one that fails.
# Returns: 0 when all four checks pass, otherwise the status of the first
#          failing check.
controlled_drain_staging_allowed() {
  controlled_drain_config_safe || return
  controlled_drain_binary_safe || return
  controlled_drain_unit_safe || return
  controlled_drain_service_inactive
}
list_action_runner_units() {
{
systemctl list-unit-files 'actions.runner.*' --no-legend --plain 2>/dev/null | awk '{print $1}'
@@ -147,6 +272,11 @@ list_action_runner_units() {
stop_and_mask_units() {
local unit
for unit in "${RUNNER_UNITS[@]}"; do
if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
as_root systemctl disable "$unit" >/dev/null 2>&1 || true
continue
fi
as_root systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
as_root systemctl stop "$unit" >/dev/null 2>&1 || true
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
@@ -218,6 +348,9 @@ seal_lane_binary_restore_sources() {
local path
while IFS= read -r -d '' path; do
[ -e "$path" ] || continue
if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then
continue
fi
write_failclosed_stub "$path"
done < <(
{
@@ -234,6 +367,9 @@ quarantine_lane_registration_sources() {
local target
for lane_dir in "/home/wooo/awoooi-cd-lane" "/home/wooo/awoooi-cd-lane-drain"; do
[ -d "$lane_dir" ] || continue
if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then
continue
fi
quarantine_dir="$lane_dir/quarantine-failclosed-${STAMP}"
as_root chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
as_root mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true
@@ -257,6 +393,9 @@ quarantine_lane_registration_sources() {
# Replace every path in LIVE_BINARY_PATHS with a fail-closed stub.
# The controlled drain binary is left alone while staging is allowed; the
# staging check runs only for the entry equal to CONTROLLED_DRAIN_BINARY.
# Globals: LIVE_BINARY_PATHS (read), CONTROLLED_DRAIN_BINARY (read)
seal_live_binary_paths() {
  local candidate
  for candidate in "${LIVE_BINARY_PATHS[@]}"; do
    if [ "$candidate" != "$CONTROLLED_DRAIN_BINARY" ] || ! controlled_drain_staging_allowed; then
      write_failclosed_stub "$candidate"
    fi
  done
}
@@ -666,7 +805,10 @@ mask_unit_file_to_devnull() {
#######################################
# Mask the lane unit files, leaving the controlled drain unit untouched
# while its staging is allowed.
# The default-named drain unit is masked unconditionally only when it is
# not the configured controlled drain unit; masking it before the staging
# check would make the check pointless for the default configuration.
# Globals: CONTROLLED_DRAIN_UNIT (read)
# Returns: 0 when staging is allowed, otherwise the status of the final
#          mask_unit_file_to_devnull call
#######################################
seal_lane_unit_files() {
  local default_drain_unit="awoooi-cd-lane-drain.service"
  mask_unit_file_to_devnull "awoooi-cd-lane.service"
  if [ "$default_drain_unit" != "$CONTROLLED_DRAIN_UNIT" ]; then
    mask_unit_file_to_devnull "$default_drain_unit"
  fi
  if controlled_drain_staging_allowed; then
    return 0
  fi
  mask_unit_file_to_devnull "$CONTROLLED_DRAIN_UNIT"
}
root_restore_sources_left() {
@@ -680,6 +822,9 @@ root_restore_sources_left() {
unit_ok() {
local unit="$1"
local load active unitfile mainpid
if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then
return 0
fi
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
active="$(systemctl show "$unit" -p ActiveState --value 2>/dev/null || true)"
unitfile="$(systemctl show "$unit" -p UnitFileState --value 2>/dev/null || true)"
@@ -729,6 +874,9 @@ awoooi_runner_failclosed_enforcer_root_restore_sources_left $(root_restore_sourc
# HELP awoooi_runner_failclosed_enforcer_apply_performed Whether this run used apply mode.
# TYPE awoooi_runner_failclosed_enforcer_apply_performed gauge
awoooi_runner_failclosed_enforcer_apply_performed $APPLY_PERFORMED
# HELP awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed Controlled drain lane non-secret guardrail staging allowance.
# TYPE awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed gauge
awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed $(controlled_drain_staging_allowed && echo 1 || echo 0)
EOF
as_root install -o root -g root -m 0644 "$tmp" "$dir/awoooi_runner_failclosed_enforcer.prom" >/dev/null 2>&1 || true
rm -f "$tmp"
@@ -743,6 +891,7 @@ print_readback() {
echo "LANE_PROCESS_COUNT=$(count_lane_processes)"
echo "RUNNER_PROCESS_COUNT=$(count_runner_processes)"
echo "ROOT_RESTORE_SOURCES_LEFT=$(root_restore_sources_left)"
echo "CONTROLLED_DRAIN_STAGING_ALLOWED=$(controlled_drain_staging_allowed && echo 1 || echo 0)"
echo "RUNNER_UNITS_BAD_COUNT=$(runner_units_bad_count)"
for unit in "${RUNNER_UNITS[@]}"; do
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"

View File

@@ -22,6 +22,7 @@ REPAIR_STARTUP_STUB = (
FAILCLOSED_ENFORCER = (
ROOT / "scripts" / "reboot-recovery" / "enforce-110-runner-failclosed.sh"
)
CONTROLLED_CD_LANE_DRAIN_UNIT = ROOT / "ops" / "runner" / "awoooi-cd-lane-drain.service"
SSH_AUTH_DIAGNOSE = (
ROOT / "scripts" / "reboot-recovery" / "diagnose-110-ssh-publickey-auth.sh"
)
@@ -206,6 +207,47 @@ def test_runner_failclosed_enforcer_does_not_seal_live_startup_recovery_script()
assert "awoooi-startup-110.sh.*controlled*" in text
def test_runner_failclosed_enforcer_preserves_controlled_drain_staging_only() -> None:
    """The enforcer script must contain every controlled-drain staging guard.

    Each entry below is a literal snippet that has to appear verbatim in the
    script text; the entries are checked in the listed order.
    """
    script = FAILCLOSED_ENFORCER.read_text(encoding="utf-8")
    required_snippets = (
        # Guard functions.
        "controlled_drain_staging_allowed()",
        "controlled_drain_config_safe",
        "controlled_drain_binary_safe",
        "controlled_drain_unit_safe",
        "controlled_drain_service_inactive",
        # Pinned labels and the explicit reject pattern.
        "awoooi-host:host",
        "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04",
        "ubuntu-latest|ubuntu-*|self-hosted|stockplatform*|stock-platform*|headless*|playwright*)",
        # Unit-file checks.
        'grep -Fq -- "ConditionPathExists=$CONTROLLED_DRAIN_REGISTRATION"',
        'grep -Eq \'^[[:space:]]*CPUAccounting=true\'',
        'grep -Eq \'^[[:space:]]*MemoryAccounting=true\'',
        'grep -Eq \'^[[:space:]]*TasksAccounting=true\'',
        '[ "$unitfile" != "enabled" ] || return 1',
        # Call sites that skip sealing while staging is allowed.
        'if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then',
        'if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then',
        'if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then',
        # Readback line.
        "CONTROLLED_DRAIN_STAGING_ALLOWED=",
    )
    for snippet in required_snippets:
        assert snippet in script
def test_controlled_cd_lane_unit_source_has_required_accounting_guardrails() -> None:
    """The drain lane unit source must declare every listed directive verbatim."""
    unit_source = CONTROLLED_CD_LANE_DRAIN_UNIT.read_text(encoding="utf-8")
    required_directives = (
        "ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner",
        "CPUAccounting=true",
        "CPUQuota=250%",
        "MemoryAccounting=true",
        "MemoryHigh=8G",
        "MemoryMax=12G",
        "TasksAccounting=true",
        "TasksMax=512",
        "IOAccounting=true",
        "IOWeight=100",
        "NoNewPrivileges=true",
    )
    for directive in required_directives:
        assert directive in unit_source
def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None:
text = SSH_AUTH_DIAGNOSE.read_text(encoding="utf-8")