From f1758cdf0c823d84bad11c3941dc40445150db71 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 30 Jun 2026 21:37:24 +0800 Subject: [PATCH] fix(runner): verify 110 controlled cd lane readiness --- docs/LOGBOOK.md | 23 + ops/runner/awoooi-cd-lane-drain.service | 1 + ...awoooi-110-controlled-cd-lane-readiness.sh | 453 ++++++++++++++++++ ...awoooi_110_controlled_cd_lane_readiness.py | 267 +++++++++++ .../test_verify_awoooi_non110_cd_closure.py | 6 + ops/runner/verify-awoooi-non110-cd-closure.py | 3 +- scripts/reboot-recovery/awoooi-startup-110.sh | 1 + .../test_cold_start_monitor_bounded_probes.py | 1 + 8 files changed, 754 insertions(+), 1 deletion(-) create mode 100755 ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh create mode 100644 ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index c4f9f442..2bec1424 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -50664,3 +50664,26 @@ production browser smoke: **下一步**: - commit/push 後等待新的 Gitea CD run;deploy marker 更新後讀回 `delivery-closure-workbench`、`awoooi-priority-work-order-readback`、`stockplatform-public-api-controlled-recovery-preflight`。 + +## 2026-06-30 — 21:42 110 controlled `awoooi-host` lane readiness verifier + +**完成內容**: +- 新增 `ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh`,把目前 Gitea `harbor-110-local-repair` 顯示 `No matching online runner with label: awoooi-host` 的主線 blocker 收斂成可重跑 verifier。 +- verifier 僅讀 metadata,不讀 `.runner` 內容、不印 runner token;檢查 110 host selector、controlled drain lane `capacity=1`、`awoooi-host:host` / `awoooi-ubuntu` labels、ELF binary、registration metadata 存在、systemd CPU / memory / tasks / `NoNewPrivileges` guardrails、legacy runner fail-closed、root restore-source left `0`、active action container / heavy process / load 壓力。 +- `awoooi-cd-lane-drain.service` 與 `awoooi-startup-110.sh` 產生的 controlled drain unit 新增 `ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner`,避免 service active 
但未註冊時假裝可承接 `awoooi-host` queue。 +- `ops/runner/verify-awoooi-non110-cd-closure.py` 的 Harbor 110 no-matching next action 改為先在 110 跑 `check-awoooi-110-controlled-cd-lane-readiness.sh`,通過後再恢復 `awoooi-host` control path 並重讀 queue/closure。 + +**本地驗證結果**: +- `pytest ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py ops/runner/test_verify_awoooi_non110_cd_closure.py ops/runner/test_cd_controlled_runtime_profile.py ops/runner/test_guard_gitea_runner_pressure.py scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py -q`:`56 passed`。 +- `python3.11 -m ruff check ...`:通過。 +- `bash -n ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh scripts/reboot-recovery/awoooi-startup-110.sh`:通過。 +- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`:通過,`auto_branch_events_on_110=0`、`generic_runner_labels=0`。 +- `node scripts/ci/check-gitea-step-env-secrets.js`:通過。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 runtime write。 + +**下一步**: +- commit/push 後讀回 Gitea queue / non110 CD closure / registry;若仍是 `awoooi-host` no-matching,下一個 controlled apply target 是在 110 上跑此 verifier,依 safe_next_step 恢復 controlled drain lane 或補 registration metadata,再重讀 Harbor repair queue。 diff --git a/ops/runner/awoooi-cd-lane-drain.service b/ops/runner/awoooi-cd-lane-drain.service index 13f2f67a..9b2ce163 100644 --- a/ops/runner/awoooi-cd-lane-drain.service +++ b/ops/runner/awoooi-cd-lane-drain.service @@ -3,6 +3,7 @@ Description=AWOOOI controlled CD lane drain After=network-online.target docker.service Wants=network-online.target Requires=docker.service +ConditionPathExists=/home/wooo/awoooi-cd-lane-drain/data/.runner [Service] Type=simple diff --git a/ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh b/ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh new file mode 100755 
index 00000000..d0ff5b23 --- /dev/null +++ b/ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh @@ -0,0 +1,453 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Read-only verifier for the 110 controlled AWOOOI CD lane. It only prints +# metadata needed to decide whether the awoooi-host queue can be served safely. +# It never reads or prints runner registration contents. + +TARGET_HOST_IP="${TARGET_HOST_IP:-192.168.0.110}" +CD_LANE_DRAIN_DIR="${CD_LANE_DRAIN_DIR:-/home/wooo/awoooi-cd-lane-drain}" +CD_LANE_DRAIN_SERVICE="${CD_LANE_DRAIN_SERVICE:-awoooi-cd-lane-drain.service}" +CD_LANE_DRAIN_BINARY="${CD_LANE_DRAIN_BINARY:-${CD_LANE_DRAIN_DIR}/awoooi_cd_lane_controlled}" +CD_LANE_DRAIN_CONFIG="${CD_LANE_DRAIN_CONFIG:-${CD_LANE_DRAIN_DIR}/config.yaml}" +CD_LANE_DRAIN_REGISTRATION_PATHS="${CD_LANE_DRAIN_REGISTRATION_PATHS:-${CD_LANE_DRAIN_DIR}/data/.runner ${CD_LANE_DRAIN_DIR}/.runner}" +PRIMARY_CD_LANE_SERVICE="${PRIMARY_CD_LANE_SERVICE:-awoooi-cd-lane.service}" +ROOT_RESTORE_PARENT="${ROOT_RESTORE_PARENT:-/root}" +MAX_CAPACITY="${MAX_CAPACITY:-1}" +MAX_HEAVY_PROCESS_COUNT="${MAX_HEAVY_PROCESS_COUNT:-0}" +MAX_ACTIVE_ACTION_CONTAINERS="${MAX_ACTIVE_ACTION_CONTAINERS:-0}" +MAX_LOAD_PER_CORE="${MAX_LOAD_PER_CORE:-1.25}" +REQUIRE_ACTIVE_SERVICE="${REQUIRE_ACTIVE_SERVICE:-1}" +REQUIRE_PRIMARY_LANE_FAILCLOSED="${REQUIRE_PRIMARY_LANE_FAILCLOSED:-1}" +LEGACY_RUNNER_SERVICE_NAMES="${LEGACY_RUNNER_SERVICE_NAMES:-awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service}" +LEGACY_RUNNER_BINARY_PATHS="${LEGACY_RUNNER_BINARY_PATHS:-/home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner}" 
+FORBIDDEN_LABEL_RE="${FORBIDDEN_LABEL_RE:-^(ubuntu-latest|ubuntu-[0-9].*|self-hosted|stockplatform.*|stock-platform.*|headless.*|playwright.*)$}" + +BLOCKERS=() +WARNINGS=() +CONFIG_READY=0 +BINARY_READY=0 +REGISTRATION_READY=0 +SERVICE_READY=0 +LEGACY_FAILCLOSED=0 +PRIMARY_LANE_FAILCLOSED=0 +ROOT_RESTORE_LEFT="unknown" + +section() { + printf '\n== %s ==\n' "$1" +} + +blocker() { + BLOCKERS+=("$1") + printf 'BLOCKER %s\n' "$1" +} + +warning() { + WARNINGS+=("$1") + printf 'WARNING %s\n' "$1" +} + +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +host_ips() { + if command_exists ip; then + ip -o -4 addr show 2>/dev/null | awk '{print $4}' | sed 's#/.*##' | sort -u + return 0 + fi + hostname -I 2>/dev/null | tr ' ' '\n' | awk 'NF' | sort -u || true +} + +host_has_ip() { + local ip="$1" + grep -Fxq -- "$ip" <<<"$(host_ips)" # literal whole-line match; here-string avoids a pipefail/SIGPIPE false negative +} + +systemd_cat() { + local unit="$1" + if systemctl cat "$unit" >/dev/null 2>&1; then + systemctl cat "$unit" 2>/dev/null + return 0 + fi + return 1 +} + +systemd_show() { + local unit="$1" + systemctl show "$unit" \ + -p LoadState \ + -p ActiveState \ + -p UnitFileState \ + -p MainPID \ + --no-pager 2>/dev/null || true +} + +systemd_value() { + local unit="$1" + local key="$2" + systemd_show "$unit" | awk -F= -v k="$key" '$1 == k {print $2; found=1} END {if (!found) print ""}' +} + +unit_failclosed_or_absent() { + local unit="$1" + local state load unitfile active mainpid + state="$(systemd_show "$unit" | tr '\n' ' ')" + load="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "LoadState" {print $2; exit}')" + unitfile="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "UnitFileState" {print $2; exit}')" + active="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "ActiveState" {print $2; exit}')" + mainpid="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "MainPID" {print $2; exit}')" + printf 'FAILCLOSED_UNIT unit=%s load=%s unitfile=%s active=%s mainpid=%s\n' \ + "$unit" "${load:-unknown}" "${unitfile:-unknown}" 
"${active:-unknown}" "${mainpid:-unknown}" + if [ "${load:-}" = "not-found" ]; then + return 0 + fi + if [ "${load:-}" = "masked" ] && [ "${unitfile:-}" = "masked" ] && [ "${active:-}" != "active" ] && [ "${mainpid:-0}" = "0" ]; then + return 0 + fi + return 1 +} + +extract_runner_capacity() { + local config_path="$1" + awk ' + /^runner:[[:space:]]*$/ { + in_runner=1 + next + } + in_runner && /^[^[:space:]]/ && $0 !~ /^runner:[[:space:]]*$/ { + in_runner=0 + } + in_runner && /^[[:space:]]*capacity:[[:space:]]*/ { + line=$0 + sub(/^[[:space:]]*capacity:[[:space:]]*/, "", line) + gsub(/["'\'']/, "", line) + print line + exit + } + ' "$config_path" +} + +extract_runner_labels() { + local config_path="$1" + awk ' + /^[[:space:]]*labels:[[:space:]]*$/ { + in_labels=1 + next + } + in_labels && /^[[:space:]]*-[[:space:]]*/ { + line=$0 + sub(/^[[:space:]]*-[[:space:]]*"/, "", line) + sub(/^[[:space:]]*-[[:space:]]*/, "", line) + sub(/"[[:space:]]*$/, "", line) + print line + next + } + in_labels && /^[^[:space:]]/ { + in_labels=0 + } + ' "$config_path" +} + +label_name() { + printf '%s' "${1%%:*}" +} + +active_action_container_count() { + if ! command_exists docker; then + echo 0 + return 0 + fi + docker ps --format '{{.Names}}' 2>/dev/null | grep -Ec '^GITEA-ACTIONS-TASK-' || true +} + +heavy_process_count() { + { + pgrep -f '(^|/)(chrome|chromium|chromium-browser)( |$)' 2>/dev/null || true + pgrep -f 'playwright|stockplatform.*smoke|next build|turbo build|vite build' 2>/dev/null || true + } | sort -u | wc -l | tr -d ' ' +} + +load_per_core_ok() { + if [ ! -r /proc/loadavg ] || ! 
command_exists awk; then + warning "loadavg_unavailable" + return 0 + fi + local cores load1 ratio + cores="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)" + load1="$(awk '{print $1}' /proc/loadavg)" + ratio="$(awk -v load_value="$load1" -v cores="${cores:-1}" 'BEGIN { if (cores < 1) cores=1; printf "%.6f", load_value / cores }')" + printf 'LOAD_READBACK load1=%s cores=%s load_per_core=%s max=%s\n' "$load1" "$cores" "$ratio" "$MAX_LOAD_PER_CORE" + awk -v ratio="$ratio" -v max="$MAX_LOAD_PER_CORE" 'BEGIN { exit !(ratio <= max) }' +} + +check_host_selector() { + section "host selector" + printf 'target_host_ip=%s\n' "$TARGET_HOST_IP" + printf 'host_ips=%s\n' "$(host_ips | paste -sd, -)" + if [ -n "$TARGET_HOST_IP" ] && ! host_has_ip "$TARGET_HOST_IP"; then + blocker "target_host_ip_not_present_${TARGET_HOST_IP}" + fi +} + +check_config() { + section "controlled lane config" + local capacity labels label name has_host=0 has_ubuntu=0 forbidden=0 + if [ ! -r "$CD_LANE_DRAIN_CONFIG" ]; then + printf 'CD_LANE_CONFIG path=%s readable=0\n' "$CD_LANE_DRAIN_CONFIG" + blocker "controlled_cd_lane_config_missing" + return 0 + fi + capacity="$(extract_runner_capacity "$CD_LANE_DRAIN_CONFIG" | head -1)" + printf 'CD_LANE_CONFIG path=%s readable=1 capacity=%s max_capacity=%s\n' "$CD_LANE_DRAIN_CONFIG" "${capacity:-missing}" "$MAX_CAPACITY" + if ! 
printf '%s' "${capacity:-}" | grep -Eq '^[0-9]+$'; then + blocker "controlled_cd_lane_capacity_missing" + elif [ "$capacity" -gt "$MAX_CAPACITY" ]; then + blocker "controlled_cd_lane_capacity_too_high:${capacity}" + fi + + labels="$(extract_runner_labels "$CD_LANE_DRAIN_CONFIG" || true)" + if [ -z "$labels" ]; then + blocker "controlled_cd_lane_labels_missing" + fi + while IFS= read -r label; do + [ -n "$label" ] || continue + name="$(label_name "$label")" + printf 'CD_LANE_LABEL label=%s name=%s\n' "$label" "$name" + if [ "$label" = "awoooi-host:host" ]; then + has_host=1 + fi + case "$label" in + awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04) + has_ubuntu=1 + ;; + esac + if printf '%s' "$name" | grep -Eq "$FORBIDDEN_LABEL_RE"; then + forbidden=1 + blocker "controlled_cd_lane_forbidden_label:${name}" + fi + if [ "$name" != "awoooi-host" ] && [ "$name" != "awoooi-ubuntu" ]; then + forbidden=1 + blocker "controlled_cd_lane_unexpected_label:${name}" + fi + done <<<"$labels" + + [ "$has_host" -eq 1 ] || blocker "controlled_cd_lane_awoooi_host_label_missing" + [ "$has_ubuntu" -eq 1 ] || blocker "controlled_cd_lane_awoooi_ubuntu_label_missing" + if [ "$has_host" -eq 1 ] && [ "$has_ubuntu" -eq 1 ] && [ "$forbidden" -eq 0 ] \ + && printf '%s' "${capacity:-}" | grep -Eq '^[0-9]+$' && [ "$capacity" -le "$MAX_CAPACITY" ]; then + CONFIG_READY=1 + fi +} + +check_binary() { + section "controlled lane binary" + local kind + kind="$(file -b "$CD_LANE_DRAIN_BINARY" 2>/dev/null || echo missing)" + printf 'CD_LANE_BINARY path=%s executable=%s kind=%s\n' \ + "$CD_LANE_DRAIN_BINARY" "$([ -x "$CD_LANE_DRAIN_BINARY" ] && [ -f "$CD_LANE_DRAIN_BINARY" ] && echo 1 || echo 0)" "$kind" + if [ -x "$CD_LANE_DRAIN_BINARY" ] && [ -f "$CD_LANE_DRAIN_BINARY" ] && grep -qi "ELF" <<<"$kind"; then + BINARY_READY=1 + else + blocker "controlled_cd_lane_binary_not_ready" + fi +} + +check_registration() { + section "controlled lane registration metadata" + local registration 
found=0 mode size + for registration in $CD_LANE_DRAIN_REGISTRATION_PATHS; do + if [ -f "$registration" ] && [ -s "$registration" ]; then + mode="$(stat -c '%a' "$registration" 2>/dev/null || stat -f '%Lp' "$registration" 2>/dev/null || echo unknown)" + size="$(stat -c '%s' "$registration" 2>/dev/null || stat -f '%z' "$registration" 2>/dev/null || echo unknown)" + printf 'CD_LANE_REGISTRATION path=%s present=1 size_bytes=%s mode=%s content_read=false\n' "$registration" "$size" "$mode" + found=1 + else + printf 'CD_LANE_REGISTRATION path=%s present=0 content_read=false\n' "$registration" + fi + done + if [ "$found" -eq 1 ]; then + REGISTRATION_READY=1 + else + blocker "controlled_cd_lane_registration_missing" + fi +} + +unit_has_required_limits() { + local text="$1" + grep -Eq '^[[:space:]]*CPUAccounting=true' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*CPUQuota=' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*MemoryAccounting=true' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*Memory(Max|High)=' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*TasksAccounting=true' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*TasksMax=' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*NoNewPrivileges=true' <<<"$text" || return 1 + grep -Eq '^[[:space:]]*Restart=' <<<"$text" || return 1 + return 0 +} + +unit_has_target_match() { + local text="$1" + grep -Fq -- "$CD_LANE_DRAIN_BINARY" <<<"$text" || return 1 + grep -Fq -- "$CD_LANE_DRAIN_CONFIG" <<<"$text" || return 1 + return 0 +} + +unit_has_registration_condition() { + local text="$1" + local registration + for registration in $CD_LANE_DRAIN_REGISTRATION_PATHS; do + if WANT="ConditionPathExists=${registration}" awk '{ line = $0; gsub(/^[[:space:]]+|[[:space:]]+$/, "", line); if (line == ENVIRON["WANT"]) found = 1 } END { exit !found }' <<<"$text"; then # literal compare of the trimmed line: the path is data, not an ERE + return 0 + fi + done + return 1 +} + +check_service() { + section "controlled lane service" + local text state active mainpid limits_ok=0 target_ok=0 condition_ok=0 + if ! 
text="$(systemd_cat "$CD_LANE_DRAIN_SERVICE" 2>/dev/null)"; then + printf 'CD_LANE_SERVICE unit=%s installed=0\n' "$CD_LANE_DRAIN_SERVICE" + blocker "controlled_cd_lane_service_missing" + return 0 + fi + state="$(systemd_show "$CD_LANE_DRAIN_SERVICE" | tr '\n' ' ')" + active="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "ActiveState" {print $2; exit}')" + mainpid="$(printf '%s\n' "$state" | tr ' ' '\n' | awk -F= '$1 == "MainPID" {print $2; exit}')" + printf 'CD_LANE_SERVICE unit=%s installed=1 %s\n' "$CD_LANE_DRAIN_SERVICE" "$state" + if unit_has_required_limits "$text"; then + limits_ok=1 + else + blocker "controlled_cd_lane_service_limits_missing" + fi + if unit_has_target_match "$text"; then + target_ok=1 + else + blocker "controlled_cd_lane_service_target_mismatch" + fi + if unit_has_registration_condition "$text"; then + condition_ok=1 + else + blocker "controlled_cd_lane_service_registration_condition_missing" + fi + printf 'CD_LANE_SERVICE_GUARDRAILS unit=%s active=%s main_pid=%s limits=%s target_match=%s registration_condition=%s\n' \ + "$CD_LANE_DRAIN_SERVICE" "${active:-unknown}" "${mainpid:-0}" "$limits_ok" "$target_ok" "$condition_ok" + if [ "$REQUIRE_ACTIVE_SERVICE" = "1" ]; then + if [ "${active:-}" != "active" ] || ! 
printf '%s' "${mainpid:-0}" | grep -Eq '^[1-9][0-9]*$'; then + blocker "controlled_cd_lane_service_not_active" + fi + fi + if [ "$limits_ok" -eq 1 ] && [ "$target_ok" -eq 1 ] && [ "$condition_ok" -eq 1 ]; then + if [ "$REQUIRE_ACTIVE_SERVICE" != "1" ] || { [ "${active:-}" = "active" ] && printf '%s' "${mainpid:-0}" | grep -Eq '^[1-9][0-9]*$'; }; then + SERVICE_READY=1 + fi + fi +} + +check_failclosed_boundaries() { + section "legacy fail-closed boundary" + local unit legacy_blockers=0 path kind + if [ "$REQUIRE_PRIMARY_LANE_FAILCLOSED" = "1" ]; then + if unit_failclosed_or_absent "$PRIMARY_CD_LANE_SERVICE"; then + PRIMARY_LANE_FAILCLOSED=1 + else + legacy_blockers=$((legacy_blockers + 1)) + blocker "primary_cd_lane_not_failclosed:${PRIMARY_CD_LANE_SERVICE}" + fi + else + PRIMARY_LANE_FAILCLOSED=1 + fi + + for unit in $LEGACY_RUNNER_SERVICE_NAMES; do + if ! unit_failclosed_or_absent "$unit"; then + legacy_blockers=$((legacy_blockers + 1)) + blocker "legacy_runner_unit_not_failclosed:${unit}" + fi + done + for path in $LEGACY_RUNNER_BINARY_PATHS; do + kind="$(file -b "$path" 2>/dev/null || echo missing)" + printf 'FAILCLOSED_BINARY path=%s kind=%s\n' "$path" "$kind" + if grep -qi "ELF" <<<"$kind"; then + legacy_blockers=$((legacy_blockers + 1)) + blocker "legacy_runner_binary_restored:${path}" + fi + done + if [ "$legacy_blockers" -eq 0 ]; then + LEGACY_FAILCLOSED=1 + fi +} + +check_pressure() { + section "pressure readback" + local containers heavy + containers="$(active_action_container_count)" + heavy="$(heavy_process_count)" + printf 'ACTIVE_ACTION_CONTAINERS=%s max=%s\n' "$containers" "$MAX_ACTIVE_ACTION_CONTAINERS" + printf 'HEAVY_PROCESS_COUNT=%s max=%s\n' "$heavy" "$MAX_HEAVY_PROCESS_COUNT" + [ "$containers" -le "$MAX_ACTIVE_ACTION_CONTAINERS" ] || blocker "active_action_containers_present:${containers}" + [ "$heavy" -le "$MAX_HEAVY_PROCESS_COUNT" ] || blocker "heavy_processes_present:${heavy}" + load_per_core_ok || blocker "host_load_per_core_too_high" +} + 
+check_restore_sources() { + section "restore-source readback" + if [ -d "$ROOT_RESTORE_PARENT" ]; then + ROOT_RESTORE_LEFT="$(find "$ROOT_RESTORE_PARENT" -maxdepth 1 -type d \( -name 'awoooi-cd-lane-disabled-*' -o -name 'awoooi-cd-lane-drain-disabled-*' \) -print 2>/dev/null | wc -l | tr -d ' ')" + else + ROOT_RESTORE_LEFT=0 + fi + printf 'CD_LANE_ROOT_RESTORE_SOURCES parent=%s left=%s\n' "$ROOT_RESTORE_PARENT" "$ROOT_RESTORE_LEFT" + [ "$ROOT_RESTORE_LEFT" = "0" ] || blocker "cd_lane_root_restore_sources_left:${ROOT_RESTORE_LEFT}" +} + +print_verdict() { + section "verdict" + printf 'CONFIG_READY=%s\n' "$CONFIG_READY" + printf 'BINARY_READY=%s\n' "$BINARY_READY" + printf 'REGISTRATION_READY=%s\n' "$REGISTRATION_READY" + printf 'SERVICE_READY=%s\n' "$SERVICE_READY" + printf 'LEGACY_FAILCLOSED=%s\n' "$LEGACY_FAILCLOSED" + printf 'PRIMARY_LANE_FAILCLOSED=%s\n' "$PRIMARY_LANE_FAILCLOSED" + printf 'WARNING_COUNT=%s\n' "${#WARNINGS[@]}" + printf 'BLOCKER_COUNT=%s\n' "${#BLOCKERS[@]}" + if [ "${#BLOCKERS[@]}" -eq 0 ]; then + printf 'AWOOOI_110_CONTROLLED_CD_LANE_READY=1\n' + printf 'safe_next_step=rerun_harbor_110_local_repair_queue_readback_and_non110_cd_closure_verifier\n' + return 0 + fi + printf 'AWOOOI_110_CONTROLLED_CD_LANE_READY=0\n' + if [ "$REGISTRATION_READY" -eq 0 ]; then + printf 'safe_next_step=restore_or_register_awoooi_cd_lane_drain_registration_without_printing_token_then_rerun_this_verifier\n' + elif [ "$ROOT_RESTORE_LEFT" != "0" ]; then + printf 'safe_next_step=quarantine_cd_lane_root_restore_sources_then_rerun_this_verifier\n' + elif [ "$SERVICE_READY" -eq 0 ] && [ "$CONFIG_READY" -eq 1 ] && [ "$BINARY_READY" -eq 1 ]; then + printf 'safe_next_step=start_awoooi_cd_lane_drain_service_after_apply_window_then_rerun_this_verifier\n' + else + printf 'safe_next_step=fix_controlled_cd_lane_guardrail_blockers_then_rerun_this_verifier\n' + fi + return 1 +} + +main() { + section "audit metadata" + printf 'read_only=true\n' + printf 
'secret_values_collected=false\n' + printf 'runner_token_read=false\n' + printf 'raw_runner_registration_read=false\n' + printf 'timestamp=%s\n' "$(date -Is 2>/dev/null || date)" + printf 'host=%s\n' "$(hostname 2>/dev/null || echo unknown)" + printf 'user=%s\n' "$(id -un 2>/dev/null || echo unknown)" + + check_host_selector + check_config + check_binary + check_registration + check_service + check_failclosed_boundaries + check_restore_sources + check_pressure + print_verdict +} + +main "$@" diff --git a/ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py b/ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py new file mode 100644 index 00000000..2df31b0b --- /dev/null +++ b/ops/runner/test_check_awoooi_110_controlled_cd_lane_readiness.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import subprocess +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +VERIFIER = ROOT / "ops/runner/check-awoooi-110-controlled-cd-lane-readiness.sh" + + +def _write_fake_bin(path: Path, name: str, body: str) -> None: + target = path / name + target.write_text(body, encoding="utf-8") + target.chmod(0o755) + + +def _write_config(path: Path, *, forbidden_label: bool = False) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + labels = [ + ' - "awoooi-host:host"', + ' - "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"', + ] + if forbidden_label: + labels.append(' - "ubuntu-latest:docker://node:22"') + path.write_text( + "\n".join( + [ + "runner:", + " capacity: 1", + " labels:", + *labels, + "", + ] + ), + encoding="utf-8", + ) + + +def _write_unit( + path: Path, + *, + binary: Path, + config: Path, + registration: Path, + include_registration_condition: bool = True, +) -> None: + condition = f"ConditionPathExists={registration}\n" if include_registration_condition else "" + path.write_text( + f""" +[Unit] +Description=AWOOOI controlled CD lane drain 
+After=network-online.target docker.service +Wants=network-online.target +Requires=docker.service +{condition} +[Service] +Type=simple +User=wooo +WorkingDirectory={binary.parent}/data +Environment=HOME=/home/wooo +Environment=AWOOOI_CONTROLLED_RUNNER_OPEN=1 +ExecStart={binary} daemon --config {config} +Restart=always +RestartSec=10 +KillSignal=SIGINT +TimeoutStopSec=3700 +SuccessExitStatus=0 130 143 +CPUAccounting=true +CPUQuota=250% +MemoryAccounting=true +MemoryHigh=8G +MemoryMax=12G +TasksAccounting=true +TasksMax=512 +NoNewPrivileges=true +""".strip() + + "\n", + encoding="utf-8", + ) + + +def _run_verifier( + tmp_path: Path, + *, + registration_present: bool = True, + forbidden_label: bool = False, + active_service: bool = True, + include_registration_condition: bool = True, + legacy_active: bool = False, +) -> subprocess.CompletedProcess[str]: + fake_bin = tmp_path / "bin" + unit_dir = tmp_path / "units" + root_restore = tmp_path / "root" + lane_dir = tmp_path / "awoooi-cd-lane-drain" + data_dir = lane_dir / "data" + fake_bin.mkdir() + unit_dir.mkdir() + root_restore.mkdir() + data_dir.mkdir(parents=True) + + binary = lane_dir / "awoooi_cd_lane_controlled" + config = lane_dir / "config.yaml" + registration = data_dir / ".runner" + binary.write_bytes(b"\x7fELF controlled test binary\n") + binary.chmod(0o755) + _write_config(config, forbidden_label=forbidden_label) + if registration_present: + registration.write_text("secret-token-like-content-not-printed\n", encoding="utf-8") + _write_unit( + unit_dir / "awoooi-cd-lane-drain.service", + binary=binary, + config=config, + registration=registration, + include_registration_condition=include_registration_condition, + ) + + legacy_state = "active" if legacy_active else "inactive" + legacy_pid = "4321" if legacy_active else "0" + _write_fake_bin( + fake_bin, + "systemctl", + f"""#!/usr/bin/env bash +set -euo pipefail +cmd="${{1:-}}"; unit="${{2:-}}" +case "$cmd" in + show) + unit="${{2:-}}" + if [ -f 
"{unit_dir}/$unit" ]; then + printf 'LoadState=loaded\\nActiveState={"active" if active_service else "inactive"}\\nUnitFileState=enabled\\nMainPID={"1234" if active_service else "0"}\\n' + exit 0 + fi + case "$unit" in + awoooi-cd-lane.service) + printf 'LoadState=masked\\nActiveState=inactive\\nUnitFileState=masked\\nMainPID=0\\n' + exit 0 + ;; + awoooi-direct-runner-open.service|awoooi-direct-runner.service|gitea-act-runner-host.service|gitea-act-runner-awoooi-controlled.service|gitea-awoooi-controlled-runner.service|gitea-act-runner-awoooi-open.service) + printf 'LoadState=%s\\nActiveState=%s\\nUnitFileState=%s\\nMainPID=%s\\n' '{"loaded" if legacy_active else "masked"}' "{legacy_state}" '{"enabled" if legacy_active else "masked"}' "{legacy_pid}" + exit 0 + ;; + esac + printf 'LoadState=not-found\\nActiveState=inactive\\nUnitFileState=\\nMainPID=0\\n' + exit 0 + ;; + cat) + if [ -f "{unit_dir}/$unit" ]; then cat "{unit_dir}/$unit"; exit 0; fi + exit 1 + ;; +esac +exit 1 +""", + ) + _write_fake_bin( + fake_bin, + "ip", + """#!/usr/bin/env bash +if [ "${1:-}" = "-o" ] && [ "${2:-}" = "-4" ] && [ "${3:-}" = "addr" ]; then + printf '2: eth0 inet 192.168.0.110/24 brd 192.168.0.255 scope global eth0\\n' + exit 0 +fi +exit 1 +""", + ) + _write_fake_bin( + fake_bin, + "docker", + """#!/usr/bin/env bash +if [ "${1:-}" = "ps" ]; then exit 0; fi +exit 0 +""", + ) + _write_fake_bin( + fake_bin, + "pgrep", + """#!/usr/bin/env bash +exit 1 +""", + ) + _write_fake_bin( + fake_bin, + "file", + f"""#!/usr/bin/env bash +case "${{*:-}}" in + *"{binary}") printf 'ELF 64-bit LSB executable\\n'; exit 0 ;; +esac +printf 'POSIX shell script\\n' +""", + ) + + env = { + **os.environ, + "PATH": f"{fake_bin}:{os.environ['PATH']}", + "CD_LANE_DRAIN_DIR": str(lane_dir), + "CD_LANE_DRAIN_BINARY": str(binary), + "CD_LANE_DRAIN_CONFIG": str(config), + "CD_LANE_DRAIN_REGISTRATION_PATHS": str(registration), + "ROOT_RESTORE_PARENT": str(root_restore), + "LEGACY_RUNNER_BINARY_PATHS": str(tmp_path / 
"legacy_act_runner"), + "MAX_HEAVY_PROCESS_COUNT": "0", + "MAX_ACTIVE_ACTION_CONTAINERS": "0", "MAX_LOAD_PER_CORE": "1000000", # pin the load threshold so real host load cannot flake the tests + } + return subprocess.run( + ["bash", str(VERIFIER)], + check=False, + env=env, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + +def test_110_controlled_cd_lane_ready_without_printing_registration_content( + tmp_path: Path, +) -> None: + result = _run_verifier(tmp_path) + + assert result.returncode == 0, result.stdout + result.stderr + assert "AWOOOI_110_CONTROLLED_CD_LANE_READY=1" in result.stdout + assert "runner_token_read=false" in result.stdout + assert "raw_runner_registration_read=false" in result.stdout + assert "content_read=false" in result.stdout + assert "secret-token-like-content" not in result.stdout + assert "CD_LANE_SERVICE_GUARDRAILS" in result.stdout + assert "registration_condition=1" in result.stdout + assert ( + "safe_next_step=rerun_harbor_110_local_repair_queue_readback_and_non110_cd_closure_verifier" + in result.stdout + ) + + +def test_110_controlled_cd_lane_blocks_missing_registration(tmp_path: Path) -> None: + result = _run_verifier(tmp_path, registration_present=False) + + assert result.returncode == 1 + assert "BLOCKER controlled_cd_lane_registration_missing" in result.stdout + assert "AWOOOI_110_CONTROLLED_CD_LANE_READY=0" in result.stdout + assert ( + "safe_next_step=restore_or_register_awoooi_cd_lane_drain_registration_without_printing_token_then_rerun_this_verifier" + in result.stdout + ) + + +def test_110_controlled_cd_lane_blocks_forbidden_generic_label(tmp_path: Path) -> None: + result = _run_verifier(tmp_path, forbidden_label=True) + + assert result.returncode == 1 + assert "BLOCKER controlled_cd_lane_forbidden_label:ubuntu-latest" in result.stdout + assert "BLOCKER controlled_cd_lane_unexpected_label:ubuntu-latest" in result.stdout + assert "AWOOOI_110_CONTROLLED_CD_LANE_READY=0" in result.stdout + + +def test_110_controlled_cd_lane_requires_registration_condition(tmp_path: Path) -> None: + result = 
_run_verifier(tmp_path, include_registration_condition=False) + + assert result.returncode == 1 + assert "BLOCKER controlled_cd_lane_service_registration_condition_missing" in result.stdout + assert "AWOOOI_110_CONTROLLED_CD_LANE_READY=0" in result.stdout + + +def test_110_controlled_cd_lane_blocks_active_legacy_runner(tmp_path: Path) -> None: + result = _run_verifier(tmp_path, legacy_active=True) + + assert result.returncode == 1 + assert "BLOCKER legacy_runner_unit_not_failclosed:gitea-act-runner-host.service" in result.stdout + assert "AWOOOI_110_CONTROLLED_CD_LANE_READY=0" in result.stdout diff --git a/ops/runner/test_verify_awoooi_non110_cd_closure.py b/ops/runner/test_verify_awoooi_non110_cd_closure.py index 5a79edfd..c8826100 100644 --- a/ops/runner/test_verify_awoooi_non110_cd_closure.py +++ b/ops/runner/test_verify_awoooi_non110_cd_closure.py @@ -204,7 +204,13 @@ def test_closure_verifier_prioritizes_harbor_110_runner_label_blocker() -> None: == "awoooi-host" ) assert payload["progress"]["next_blocked_step_id"] == "public_queue_runner_match" + assert "check_awoooi_110_controlled_cd_lane_readiness" in payload[ + "next_actions" + ][0] assert "awoooi_host_runner_control_path" in payload["next_actions"][0] + assert "check_awoooi_110_controlled_cd_lane_readiness" in payload["progress"][ + "next_blocked_step_action" + ] assert "awoooi_host_runner_control_path" in payload["progress"][ "next_blocked_step_action" ] diff --git a/ops/runner/verify-awoooi-non110-cd-closure.py b/ops/runner/verify-awoooi-non110-cd-closure.py index fe3b7876..2185fcf7 100755 --- a/ops/runner/verify-awoooi-non110-cd-closure.py +++ b/ops/runner/verify-awoooi-non110-cd-closure.py @@ -316,7 +316,8 @@ def build_closure_verifier( or bool(harbor_110_repair_no_matching_runner_label) ) queue_runner_match_next_action = ( - "restore_awoooi_host_runner_control_path_without_legacy_or_generic_labels_" + "run_ops_runner_check_awoooi_110_controlled_cd_lane_readiness_on_110_" + 
"then_restore_awoooi_host_runner_control_path_without_legacy_or_generic_labels_" "then_rerun_harbor_110_repair_queue_readback" if harbor_110_repair_no_matching_runner else "rerun_public_queue_readback_until_no_matching_runner_is_absent" diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 01c97f1d..4329a766 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -442,6 +442,7 @@ Description=AWOOOI controlled CD lane drain bypass for old queued guards After=network-online.target docker.service Wants=network-online.target Requires=docker.service +ConditionPathExists=${CD_LANE_DRAIN_DIR}/data/.runner [Service] Type=simple diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 0c44b1eb..9c798623 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -84,6 +84,7 @@ def test_startup_110_opens_only_controlled_cd_lane_after_guardrails() -> None: assert 'CD_LANE_ROOT_RESTORE_LEFT="$(cd_lane_root_restore_sources_left)"' in text assert 'START_CD_LANE_ALLOWED=1' in text assert 'install_controlled_cd_lane_drain_unit' in text + assert 'ConditionPathExists=${CD_LANE_DRAIN_DIR}/data/.runner' in text assert 'systemctl unmask "$CD_LANE_DRAIN_SERVICE"' in text assert 'systemctl enable --now "$CD_LANE_DRAIN_SERVICE"' in text assert 'ensure_controlled_cd_lane_open' in text