From 54eea881e39a4bdff645d2daa725b916fe250df4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 28 Jun 2026 08:38:15 +0800 Subject: [PATCH] fix(ci): fail closed host runner pressure guards [skip ci] --- .gitea/workflows/cd.yaml | 24 +++++------ scripts/ci/wait-host-web-build-pressure.sh | 14 +++---- scripts/reboot-recovery/awoooi-startup-110.sh | 40 +++++++------------ 3 files changed, 32 insertions(+), 46 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 3e49cd55..65068e82 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -81,8 +81,8 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Host Web Build Pressure - # 2026-06-28 Codex: commander controlled automation keeps this - # non-mutating pressure check as evidence + warn-only by default. + # 2026-06-28 Codex: 110 runner pressure is incident-grade; default + # behavior stays fail-closed until CI is relocated or rate-limited. run: bash scripts/ci/wait-host-web-build-pressure.sh - name: Guard Workflow Secret Surfaces @@ -379,18 +379,16 @@ jobs: # building, the job container can disappear and Docker reports RWLayer=nil. # A Docker-network lock is global to the host daemon and survives container # namespaces, unlike /tmp/flock inside the transient job container. - # 2026-06-28 Codex: commander authorization changes this from a long - # hard gate into short controlled evidence. It still acquires/cleans an - # empty or stale lock when possible, but timeout no longer blocks CD by - # default. Set DOCKER_BUILD_LOCK_WARN_ONLY=0 to restore fail-closed mode. + # 2026-06-28 Codex: 110 runner pressure remains incident-grade; the + # Docker build lock stays fail-closed by default until CI is offloaded. - name: Acquire Docker Build Lock run: | LOCK_NAME="awoooi-cd-docker-build-lock" - LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-1}" - STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-900}" - EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-30}" - WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-3}" - WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-5}" + LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-0}" + STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-7200}" + EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-300}" + WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-180}" + WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-10}" for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do if docker network create \ @@ -1260,8 +1258,8 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Host Web Build Pressure - # 2026-06-28 Codex: post-deploy keeps pressure evidence but no longer - # treats host contention as the default terminal state. + # 2026-06-28 Codex: post-deploy is browser-heavy; fail closed on host + # pressure until runner load is isolated from production. run: bash scripts/ci/wait-host-web-build-pressure.sh - name: Get Commit Info diff --git a/scripts/ci/wait-host-web-build-pressure.sh b/scripts/ci/wait-host-web-build-pressure.sh index 038fa1e4..29e78384 100755 --- a/scripts/ci/wait-host-web-build-pressure.sh +++ b/scripts/ci/wait-host-web-build-pressure.sh @@ -10,14 +10,12 @@ set -euo pipefail # 2026-06-28 Codex: CD trigger after opening the AWOOI direct runner warn-only guard. # 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary. # 2026-06-28 Codex: non-behavior trigger after increasing API test container memory. -# 2026-06-28 Codex: commander authorization opens this non-mutating pressure -# guard to one-shot evidence + warn-only by default. Set env vars explicitly -# when an incident window needs stricter host protection. Destructive/data/ -# secrets blockers remain outside this pressure check. +# 2026-06-28 Codex: host 110 runner pressure remains an incident-grade guard. +# Controlled apply is open, but this pressure gate stays fail-closed by default. -ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-1}}" -SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-3}}" -WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}" +ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}}" +SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}" +WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}" MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}" MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}" # One Gitea Actions task container/process group is the current job itself. @@ -187,7 +185,7 @@ done echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks" if [ "$WARN_ONLY" = "1" ]; then - echo "⚠️ continuing under commander controlled automation; pressure evidence was captured" + echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan" exit 0 fi diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index f4fb7545..aaca6054 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -185,24 +185,19 @@ fi # STEP 6: Gitea Act Runner(CI/CD 核心) # 2026-04-05 Claude Code: 加入 — 解決重開機後 Gitea runner 離線、CD 失效 # 2026-06-27 Codex: 110 runner labels 收斂,避免接泛用 shared CI。 -# 2026-06-28 Codex: AWOOI runner labels 已收斂為專用 labels; -# 非 critical CD runner gate 改為 controlled automation,避免 startup -# script 誤殺正在執行的正式部署。sentinel 僅在明確要求時作為第二鑰匙。 +# 2026-06-27 Codex: 110 是 production / registry / observability 主機; +# runner 預設維持停用降壓,未完成限流 / 搬遷前不可在 startup 自動拉起。 # ────────────────────────────────────────────── -log "[6/6] 檢查 Gitea Act Runner(預設受控啟動)..." +log "[6/6] 檢查 Gitea Act Runner(預設不自動啟動)..." RUNNER_DIR="/home/wooo/act-runner" RUNNER_SERVICE="gitea-act-runner-host.service" -RUNNER_ENABLE_SENTINEL="${AWOOOI_RUNNER_ENABLE_SENTINEL:-/run/awoooi-runner-host-enabled}" -START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-1}" -REQUIRE_RUNNER_ENABLE_SENTINEL="${AWOOOI_REQUIRE_RUNNER_ENABLE_SENTINEL:-0}" -STOP_GITEA_RUNNER_WHEN_DISABLED="${AWOOOI_STOP_GITEA_RUNNER_WHEN_DISABLED:-0}" +RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled" +START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}" START_GITEA_RUNNER_ALLOWED=0 -if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ]; then - if [ "$REQUIRE_RUNNER_ENABLE_SENTINEL" = "1" ] && [ ! -e "$RUNNER_ENABLE_SENTINEL" ]; then - START_GITEA_RUNNER_ALLOWED=0 - else - START_GITEA_RUNNER_ALLOWED=1 - fi +# The runtime operator sentinel is the second key for an authorized deployment +# window. A single env var or a stale sentinel alone must not reopen host CI. +if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then + START_GITEA_RUNNER_ALLOWED=1 fi if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then # 若舊的 .runner 配置指向過期 hostname,只有在明確允許啟動 runner @@ -271,19 +266,14 @@ PY nohup "$RUNNER_DIR/run-host-runner.sh" >> "$RUNNER_DIR/host-runner.log" 2>&1 & fi else - if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ "$REQUIRE_RUNNER_ENABLE_SENTINEL" = "1" ]; then - log "⛔ AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 但缺少 $RUNNER_ENABLE_SENTINEL;runner startup 暫停" + if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ]; then + log "⛔ AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 但缺少 $RUNNER_ENABLE_SENTINEL;runner fail-closed" else - log "⏸️ Gitea host runner 本次不啟動;AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 可重新打開" - fi - if [ "$STOP_GITEA_RUNNER_WHEN_DISABLED" = "1" ]; then - log "⚠️ AWOOI_STOP_GITEA_RUNNER_WHEN_DISABLED=1,停止 runner" - systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true - systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true - pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true - else - log "✅ 不停止既有 runner;避免中斷正在執行的 CD / post-deploy job" + log "⏸️ Gitea host runner 維持停用;需同時設定 AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 與建立 $RUNNER_ENABLE_SENTINEL 才允許 startup 啟動" fi + systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true + systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true + pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true fi # 已停用 Docker-wrapped runner;避免它搶走 host label job。