fix(ops): preserve controlled drain lane staging
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -96,6 +96,13 @@ LIVE_BINARY_PATHS=(
|
||||
"/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
|
||||
)
|
||||
|
||||
# Controlled drain lane settings. Each variable keeps a non-empty value that is
# already set (for example from the environment) and otherwise receives the
# default written here.
: "${CONTROLLED_DRAIN_UNIT:=awoooi-cd-lane-drain.service}"
: "${CONTROLLED_DRAIN_DIR:=/home/wooo/awoooi-cd-lane-drain}"
# The binary, config and registration paths default to locations inside
# CONTROLLED_DRAIN_DIR.
: "${CONTROLLED_DRAIN_BINARY:=$CONTROLLED_DRAIN_DIR/awoooi_cd_lane_controlled}"
: "${CONTROLLED_DRAIN_CONFIG:=$CONTROLLED_DRAIN_DIR/config.yaml}"
: "${CONTROLLED_DRAIN_REGISTRATION:=$CONTROLLED_DRAIN_DIR/data/.runner}"
# Upper bound accepted for the runner capacity read from the config file.
: "${CONTROLLED_DRAIN_MAX_CAPACITY:=1}"
|
||||
|
||||
as_root() {
|
||||
if [ "${EUID:-$(id -u)}" -eq 0 ]; then
|
||||
"$@"
|
||||
@@ -137,6 +144,124 @@ count_runner_processes() {
|
||||
pgrep -f '^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner|Runner.Listener|Runner.Worker' 2>/dev/null | wc -l | tr -d ' '
|
||||
}
|
||||
|
||||
extract_runner_capacity() {
  # Print the scalar value of the first `capacity:` key found inside the
  # top-level `runner:` mapping of the YAML file at $1. Quotes, a trailing
  # YAML comment and trailing whitespace (including a CR from CRLF files) are
  # removed from the value. Prints nothing when no such key is found.
  #
  # $1 - path to the runner config file.
  local config_path="$1"
  awk '
    # A bare top-level "runner:" line opens the mapping.
    /^runner:[[:space:]]*$/ {
      in_runner = 1
      next
    }
    # Any other non-indented line closes it again.
    in_runner && /^[^[:space:]]/ {
      in_runner = 0
    }
    in_runner && /^[[:space:]]*capacity:[[:space:]]*/ {
      value = $0
      sub(/^[[:space:]]*capacity:[[:space:]]*/, "", value)
      # "capacity: 1  # note" must yield "1", not "1  # note".
      sub(/(^|[[:space:]]+)#.*$/, "", value)
      sub(/[[:space:]]+$/, "", value)
      gsub(/["'\'']/, "", value)
      print value
      exit
    }
  ' "$config_path"
}
|
||||
|
||||
extract_runner_labels() {
  # Print, one per line, the entries of every block-style `labels:` list in
  # the YAML file at $1, with a surrounding pair of double quotes and any
  # trailing whitespace removed. A list ends at the first line that is neither
  # a `- entry`, a blank line nor a comment, so list entries that belong to a
  # later key are not reported as labels.
  #
  # $1 - path to the runner config file.
  local config_path="$1"
  awk '
    # A bare "labels:" key, at any indentation, opens a list.
    /^[[:space:]]*labels:[[:space:]]*$/ {
      in_labels = 1
      next
    }
    in_labels && /^[[:space:]]*-[[:space:]]*/ {
      item = $0
      sub(/^[[:space:]]*-[[:space:]]*"/, "", item)
      sub(/^[[:space:]]*-[[:space:]]*/, "", item)
      # Trailing blanks, or a CR from CRLF files, are not part of the label.
      sub(/[[:space:]]+$/, "", item)
      sub(/"$/, "", item)
      print item
      next
    }
    # Blank lines and comment lines may sit between entries.
    in_labels && /^[[:space:]]*(#.*)?$/ {
      next
    }
    # Anything else is another key, so the list is over.
    in_labels {
      in_labels = 0
    }
  ' "$config_path"
}
|
||||
|
||||
label_name() {
  # Emit the part of label $1 that precedes its first colon (the whole label
  # when it has no colon), without a trailing newline.
  local full_label="$1"
  printf '%s' "${full_label%%:*}"
}
|
||||
|
||||
controlled_drain_config_safe() {
  # Succeeds only when CONTROLLED_DRAIN_CONFIG is readable, its runner
  # capacity is a plain integer no greater than CONTROLLED_DRAIN_MAX_CAPACITY,
  # and its labels consist solely of the two pinned entries below, with both
  # of them present. Any other label name fails the check.
  local capacity labels label name seen_host=0 seen_ubuntu=0

  if [ ! -r "$CONTROLLED_DRAIN_CONFIG" ]; then
    return 1
  fi

  capacity="$(extract_runner_capacity "$CONTROLLED_DRAIN_CONFIG" | head -1)"
  if ! printf '%s' "${capacity:-}" | grep -Eq '^[0-9]+$'; then
    return 1
  fi
  if ! [ "$capacity" -le "$CONTROLLED_DRAIN_MAX_CAPACITY" ]; then
    return 1
  fi

  labels="$(extract_runner_labels "$CONTROLLED_DRAIN_CONFIG" || true)"
  if [ -z "$labels" ]; then
    return 1
  fi

  while IFS= read -r label; do
    if [ -z "$label" ]; then
      continue
    fi
    name="$(label_name "$label")"
    case "$name" in
      awoooi-host)
        if [ "$label" != "awoooi-host:host" ]; then
          return 1
        fi
        seen_host=1
        ;;
      awoooi-ubuntu)
        if [ "$label" != "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" ]; then
          return 1
        fi
        seen_ubuntu=1
        ;;
      # Explicit deny list, kept alongside the catch-all that follows it.
      ubuntu-latest|ubuntu-*|self-hosted|stockplatform*|stock-platform*|headless*|playwright*)
        return 1
        ;;
      *)
        return 1
        ;;
    esac
  done <<<"$labels"

  [ "$seen_host" -eq 1 ] && [ "$seen_ubuntu" -eq 1 ]
}
|
||||
|
||||
controlled_drain_binary_safe() {
  # Succeeds only when CONTROLLED_DRAIN_BINARY is an executable regular file
  # that `file -b` describes as an ELF object.
  local kind

  if [ ! -f "$CONTROLLED_DRAIN_BINARY" ] || [ ! -x "$CONTROLLED_DRAIN_BINARY" ]; then
    return 1
  fi

  kind="$(file -b "$CONTROLLED_DRAIN_BINARY" 2>/dev/null || echo missing)"

  # The description must begin with the ELF type tag. A substring or
  # case-insensitive match would also accept descriptions that merely contain
  # "elf", e.g. "self-extracting ..." or "symbolic link to /x/shelf/y".
  case "$kind" in
    ELF\ *) return 0 ;;
    *) return 1 ;;
  esac
}
|
||||
|
||||
controlled_drain_unit_safe() {
  # Succeeds only when `systemctl cat` returns text for CONTROLLED_DRAIN_UNIT
  # and that text contains every required guardrail. The first two checks are
  # literal substring matches anywhere in the text; the remaining ones must
  # start a line (after optional whitespace).
  local unit_text
  unit_text="$(systemctl cat "$CONTROLLED_DRAIN_UNIT" 2>/dev/null || true)"
  if [ -z "$unit_text" ]; then
    return 1
  fi

  {
    grep -Fq -- "ConditionPathExists=$CONTROLLED_DRAIN_REGISTRATION" <<<"$unit_text" \
      && grep -Fq -- "$CONTROLLED_DRAIN_BINARY daemon --config $CONTROLLED_DRAIN_CONFIG" <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*CPUAccounting=true' <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*CPUQuota=' <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*MemoryAccounting=true' <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*Memory(High|Max)=' <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*TasksAccounting=true' <<<"$unit_text" \
      && grep -Eq '^[[:space:]]*TasksMax=' <<<"$unit_text"
  } || return 1

  # The final check supplies the return status of the function.
  grep -Eq '^[[:space:]]*NoNewPrivileges=true' <<<"$unit_text"
}
|
||||
|
||||
controlled_drain_service_inactive() {
  # Succeeds only when systemd reports CONTROLLED_DRAIN_UNIT as not running
  # (ActiveState inactive, failed, unknown or empty, and MainPID 0 or empty),
  # not masked (neither LoadState nor UnitFileState), and not enabled.
  local load active unitfile mainpid

  load="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p LoadState --value 2>/dev/null || true)"
  active="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p ActiveState --value 2>/dev/null || true)"
  unitfile="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p UnitFileState --value 2>/dev/null || true)"
  mainpid="$(systemctl show "$CONTROLLED_DRAIN_UNIT" -p MainPID --value 2>/dev/null || true)"

  case "$active" in
    inactive|failed|unknown|"") ;;
    *) return 1 ;;
  esac

  if [ "${mainpid:-0}" != "0" ]; then
    return 1
  fi
  if [ "$load" = "masked" ]; then
    return 1
  fi
  if [ "$unitfile" = "masked" ]; then
    return 1
  fi
  [ "$unitfile" != "enabled" ] || return 1
}
|
||||
|
||||
controlled_drain_staging_allowed() {
  # Runs the four drain lane checks in order and stops at the first failure,
  # returning that check's exit status; succeeds only when all four pass.
  controlled_drain_config_safe || return
  controlled_drain_binary_safe || return
  controlled_drain_unit_safe || return
  controlled_drain_service_inactive
}
|
||||
|
||||
list_action_runner_units() {
|
||||
{
|
||||
systemctl list-unit-files 'actions.runner.*' --no-legend --plain 2>/dev/null | awk '{print $1}'
|
||||
@@ -147,6 +272,11 @@ list_action_runner_units() {
|
||||
stop_and_mask_units() {
|
||||
local unit
|
||||
for unit in "${RUNNER_UNITS[@]}"; do
|
||||
if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then
|
||||
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
|
||||
as_root systemctl disable "$unit" >/dev/null 2>&1 || true
|
||||
continue
|
||||
fi
|
||||
as_root systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
|
||||
as_root systemctl stop "$unit" >/dev/null 2>&1 || true
|
||||
as_root systemctl reset-failed "$unit" >/dev/null 2>&1 || true
|
||||
@@ -218,6 +348,9 @@ seal_lane_binary_restore_sources() {
|
||||
local path
|
||||
while IFS= read -r -d '' path; do
|
||||
[ -e "$path" ] || continue
|
||||
if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then
|
||||
continue
|
||||
fi
|
||||
write_failclosed_stub "$path"
|
||||
done < <(
|
||||
{
|
||||
@@ -234,6 +367,9 @@ quarantine_lane_registration_sources() {
|
||||
local target
|
||||
for lane_dir in "/home/wooo/awoooi-cd-lane" "/home/wooo/awoooi-cd-lane-drain"; do
|
||||
[ -d "$lane_dir" ] || continue
|
||||
if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then
|
||||
continue
|
||||
fi
|
||||
quarantine_dir="$lane_dir/quarantine-failclosed-${STAMP}"
|
||||
as_root chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
|
||||
as_root mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true
|
||||
@@ -257,6 +393,9 @@ quarantine_lane_registration_sources() {
|
||||
seal_live_binary_paths() {
  # Passes every entry of LIVE_BINARY_PATHS to write_failclosed_stub, except
  # the controlled drain binary while its staging is allowed.
  local path
  for path in "${LIVE_BINARY_PATHS[@]}"; do
    if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then
      # Staged drain binary: leave it untouched.
      :
    else
      write_failclosed_stub "$path"
    fi
  done
}
|
||||
@@ -666,7 +805,10 @@ mask_unit_file_to_devnull() {
|
||||
|
||||
seal_lane_unit_files() {
  # Masks the lane unit files via mask_unit_file_to_devnull. The primary lane
  # unit is always masked. The controlled drain unit is masked only when its
  # staging is not allowed; masking it before that check would defeat the
  # staging allowance.
  mask_unit_file_to_devnull "awoooi-cd-lane.service"

  # When CONTROLLED_DRAIN_UNIT has been pointed at another unit, the legacy
  # drain unit name has no staging allowance and is still masked.
  if [ "$CONTROLLED_DRAIN_UNIT" != "awoooi-cd-lane-drain.service" ]; then
    mask_unit_file_to_devnull "awoooi-cd-lane-drain.service"
  fi

  if controlled_drain_staging_allowed; then
    return 0
  fi
  mask_unit_file_to_devnull "$CONTROLLED_DRAIN_UNIT"
}
|
||||
|
||||
root_restore_sources_left() {
|
||||
@@ -680,6 +822,9 @@ root_restore_sources_left() {
|
||||
unit_ok() {
|
||||
local unit="$1"
|
||||
local load active unitfile mainpid
|
||||
if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then
|
||||
return 0
|
||||
fi
|
||||
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
|
||||
active="$(systemctl show "$unit" -p ActiveState --value 2>/dev/null || true)"
|
||||
unitfile="$(systemctl show "$unit" -p UnitFileState --value 2>/dev/null || true)"
|
||||
@@ -729,6 +874,9 @@ awoooi_runner_failclosed_enforcer_root_restore_sources_left $(root_restore_sourc
|
||||
# HELP awoooi_runner_failclosed_enforcer_apply_performed Whether this run used apply mode.
|
||||
# TYPE awoooi_runner_failclosed_enforcer_apply_performed gauge
|
||||
awoooi_runner_failclosed_enforcer_apply_performed $APPLY_PERFORMED
|
||||
# HELP awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed Controlled drain lane non-secret guardrail staging allowance.
|
||||
# TYPE awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed gauge
|
||||
awoooi_runner_failclosed_enforcer_controlled_drain_staging_allowed $(controlled_drain_staging_allowed && echo 1 || echo 0)
|
||||
EOF
|
||||
as_root install -o root -g root -m 0644 "$tmp" "$dir/awoooi_runner_failclosed_enforcer.prom" >/dev/null 2>&1 || true
|
||||
rm -f "$tmp"
|
||||
@@ -743,6 +891,7 @@ print_readback() {
|
||||
echo "LANE_PROCESS_COUNT=$(count_lane_processes)"
|
||||
echo "RUNNER_PROCESS_COUNT=$(count_runner_processes)"
|
||||
echo "ROOT_RESTORE_SOURCES_LEFT=$(root_restore_sources_left)"
|
||||
echo "CONTROLLED_DRAIN_STAGING_ALLOWED=$(controlled_drain_staging_allowed && echo 1 || echo 0)"
|
||||
echo "RUNNER_UNITS_BAD_COUNT=$(runner_units_bad_count)"
|
||||
for unit in "${RUNNER_UNITS[@]}"; do
|
||||
load="$(systemctl show "$unit" -p LoadState --value 2>/dev/null || true)"
|
||||
|
||||
@@ -22,6 +22,7 @@ REPAIR_STARTUP_STUB = (
|
||||
# Script that enforces the fail-closed runner state on host 110.
FAILCLOSED_ENFORCER = ROOT.joinpath(
    "scripts", "reboot-recovery", "enforce-110-runner-failclosed.sh"
)
# Source of the systemd unit for the controlled CD drain lane.
CONTROLLED_CD_LANE_DRAIN_UNIT = ROOT.joinpath(
    "ops", "runner", "awoooi-cd-lane-drain.service"
)
# Script that diagnoses SSH public-key authentication on host 110.
SSH_AUTH_DIAGNOSE = ROOT.joinpath(
    "scripts", "reboot-recovery", "diagnose-110-ssh-publickey-auth.sh"
)
|
||||
@@ -206,6 +207,47 @@ def test_runner_failclosed_enforcer_does_not_seal_live_startup_recovery_script()
|
||||
assert "awoooi-startup-110.sh.*controlled*" in text
|
||||
|
||||
|
||||
def test_runner_failclosed_enforcer_preserves_controlled_drain_staging_only() -> None:
    """The enforcer script must contain every controlled-drain staging snippet."""
    script = FAILCLOSED_ENFORCER.read_text(encoding="utf-8")

    # Checked in order; the first missing snippet fails the test.
    required_snippets = (
        "controlled_drain_staging_allowed()",
        "controlled_drain_config_safe",
        "controlled_drain_binary_safe",
        "controlled_drain_unit_safe",
        "controlled_drain_service_inactive",
        "awoooi-host:host",
        "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04",
        "ubuntu-latest|ubuntu-*|self-hosted|stockplatform*|stock-platform*|headless*|playwright*)",
        'grep -Fq -- "ConditionPathExists=$CONTROLLED_DRAIN_REGISTRATION"',
        'grep -Eq \'^[[:space:]]*CPUAccounting=true\'',
        'grep -Eq \'^[[:space:]]*MemoryAccounting=true\'',
        'grep -Eq \'^[[:space:]]*TasksAccounting=true\'',
        '[ "$unitfile" != "enabled" ] || return 1',
        'if [ "$unit" = "$CONTROLLED_DRAIN_UNIT" ] && controlled_drain_staging_allowed; then',
        'if [ "$path" = "$CONTROLLED_DRAIN_BINARY" ] && controlled_drain_staging_allowed; then',
        'if [ "$lane_dir" = "$CONTROLLED_DRAIN_DIR" ] && controlled_drain_staging_allowed; then',
        "CONTROLLED_DRAIN_STAGING_ALLOWED=",
    )
    for snippet in required_snippets:
        assert snippet in script
|
||||
|
||||
|
||||
def test_controlled_cd_lane_unit_source_has_required_accounting_guardrails() -> None:
    """The drain lane unit source must declare each pinned resource guardrail."""
    unit_source = CONTROLLED_CD_LANE_DRAIN_UNIT.read_text(encoding="utf-8")

    # Checked in order; the first missing directive fails the test.
    required_directives = (
        "ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner",
        "CPUAccounting=true",
        "CPUQuota=250%",
        "MemoryAccounting=true",
        "MemoryHigh=8G",
        "MemoryMax=12G",
        "TasksAccounting=true",
        "TasksMax=512",
        "IOAccounting=true",
        "IOWeight=100",
        "NoNewPrivileges=true",
    )
    for directive in required_directives:
        assert directive in unit_source
|
||||
|
||||
|
||||
def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None:
|
||||
text = SSH_AUTH_DIAGNOSE.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user