fix(ci): fail closed host runner pressure guards [skip ci]
This commit is contained in:
@@ -81,8 +81,8 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-28 Codex: commander controlled automation keeps this
|
||||
# non-mutating pressure check as evidence + warn-only by default.
|
||||
# 2026-06-28 Codex: 110 runner pressure is incident-grade; default
|
||||
# behavior stays fail-closed until CI is relocated or rate-limited.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Guard Workflow Secret Surfaces
|
||||
@@ -379,18 +379,16 @@ jobs:
|
||||
# building, the job container can disappear and Docker reports RWLayer=nil.
|
||||
# A Docker-network lock is global to the host daemon and survives container
|
||||
# namespaces, unlike /tmp/flock inside the transient job container.
|
||||
# 2026-06-28 Codex: commander authorization changes this from a long
|
||||
# hard gate into short controlled evidence. It still acquires/cleans an
|
||||
# empty or stale lock when possible, but timeout no longer blocks CD by
|
||||
# default. Set DOCKER_BUILD_LOCK_WARN_ONLY=0 to restore fail-closed mode.
|
||||
# 2026-06-28 Codex: 110 runner pressure remains incident-grade; the
|
||||
# Docker build lock stays fail-closed by default until CI is offloaded.
|
||||
- name: Acquire Docker Build Lock
|
||||
run: |
|
||||
LOCK_NAME="awoooi-cd-docker-build-lock"
|
||||
LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-1}"
|
||||
STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-900}"
|
||||
EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-30}"
|
||||
WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-3}"
|
||||
WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-5}"
|
||||
LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-0}"
|
||||
STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-7200}"
|
||||
EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-300}"
|
||||
WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-180}"
|
||||
WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-10}"
|
||||
|
||||
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
|
||||
if docker network create \
|
||||
@@ -1260,8 +1258,8 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-28 Codex: post-deploy keeps pressure evidence but no longer
|
||||
# treats host contention as the default terminal state.
|
||||
# 2026-06-28 Codex: post-deploy is browser-heavy; fail closed on host
|
||||
# pressure until runner load is isolated from production.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Get Commit Info
|
||||
|
||||
@@ -10,14 +10,12 @@ set -euo pipefail
|
||||
# 2026-06-28 Codex: CD trigger after opening the AWOOOI direct runner warn-only guard.
|
||||
# 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary.
|
||||
# 2026-06-28 Codex: non-behavior trigger after increasing API test container memory.
|
||||
# 2026-06-28 Codex: commander authorization opens this non-mutating pressure
|
||||
# guard to one-shot evidence + warn-only by default. Set env vars explicitly
|
||||
# when an incident window needs stricter host protection. Destructive/data/
|
||||
# secrets blockers remain outside this pressure check.
|
||||
# 2026-06-28 Codex: host 110 runner pressure remains an incident-grade guard.
|
||||
# Controlled apply is open, but this pressure gate stays fail-closed by default.
|
||||
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-1}}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-3}}"
|
||||
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"
|
||||
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}"
|
||||
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
|
||||
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
|
||||
# One Gitea Actions task container/process group is the current job itself.
|
||||
@@ -187,7 +185,7 @@ done
|
||||
|
||||
echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
|
||||
if [ "$WARN_ONLY" = "1" ]; then
|
||||
echo "⚠️ continuing under commander controlled automation; pressure evidence was captured"
|
||||
echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
@@ -185,24 +185,19 @@ fi
|
||||
# STEP 6: Gitea Act Runner(CI/CD 核心)
|
||||
# 2026-04-05 Claude Code: 加入 — 解決重開機後 Gitea runner 離線、CD 失效
|
||||
# 2026-06-27 Codex: 110 runner labels 收斂,避免接泛用 shared CI。
|
||||
# 2026-06-28 Codex: AWOOOI runner labels 已收斂為專用 labels;
|
||||
# 非 critical CD runner gate 改為 controlled automation,避免 startup
|
||||
# script 誤殺正在執行的正式部署。sentinel 僅在明確要求時作為第二鑰匙。
|
||||
# 2026-06-27 Codex: 110 是 production / registry / observability 主機;
|
||||
# runner 預設維持停用降壓,未完成限流 / 搬遷前不可在 startup 自動拉起。
|
||||
# ──────────────────────────────────────────────
|
||||
log "[6/6] 檢查 Gitea Act Runner(預設受控啟動)..."
|
||||
log "[6/6] 檢查 Gitea Act Runner(預設不自動啟動)..."
|
||||
RUNNER_DIR="/home/wooo/act-runner"
|
||||
RUNNER_SERVICE="gitea-act-runner-host.service"
|
||||
RUNNER_ENABLE_SENTINEL="${AWOOOI_RUNNER_ENABLE_SENTINEL:-/run/awoooi-runner-host-enabled}"
|
||||
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-1}"
|
||||
REQUIRE_RUNNER_ENABLE_SENTINEL="${AWOOOI_REQUIRE_RUNNER_ENABLE_SENTINEL:-0}"
|
||||
STOP_GITEA_RUNNER_WHEN_DISABLED="${AWOOOI_STOP_GITEA_RUNNER_WHEN_DISABLED:-0}"
|
||||
RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
|
||||
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
|
||||
START_GITEA_RUNNER_ALLOWED=0
|
||||
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ]; then
|
||||
if [ "$REQUIRE_RUNNER_ENABLE_SENTINEL" = "1" ] && [ ! -e "$RUNNER_ENABLE_SENTINEL" ]; then
|
||||
START_GITEA_RUNNER_ALLOWED=0
|
||||
else
|
||||
START_GITEA_RUNNER_ALLOWED=1
|
||||
fi
|
||||
# The runtime operator sentinel is the second key for an authorized deployment
|
||||
# window. A single env var or a stale sentinel alone must not reopen host CI.
|
||||
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then
|
||||
START_GITEA_RUNNER_ALLOWED=1
|
||||
fi
|
||||
if [ -x "$RUNNER_DIR/act_runner" ] && [ -f "$RUNNER_DIR/config.yaml" ]; then
|
||||
# 若舊的 .runner 配置指向過期 hostname,只有在明確允許啟動 runner
|
||||
@@ -271,19 +266,14 @@ PY
|
||||
nohup "$RUNNER_DIR/run-host-runner.sh" >> "$RUNNER_DIR/host-runner.log" 2>&1 &
|
||||
fi
|
||||
else
|
||||
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ "$REQUIRE_RUNNER_ENABLE_SENTINEL" = "1" ]; then
|
||||
log "⛔ AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 但缺少 $RUNNER_ENABLE_SENTINEL;runner startup 暫停"
|
||||
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ]; then
|
||||
log "⛔ AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 但缺少 $RUNNER_ENABLE_SENTINEL;runner fail-closed"
|
||||
else
|
||||
log "⏸️ Gitea host runner 本次不啟動;AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 可重新打開"
|
||||
fi
|
||||
if [ "$STOP_GITEA_RUNNER_WHEN_DISABLED" = "1" ]; then
|
||||
log "⚠️ AWOOI_STOP_GITEA_RUNNER_WHEN_DISABLED=1,停止 runner"
|
||||
systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true
|
||||
pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true
|
||||
else
|
||||
log "✅ 不停止既有 runner;避免中斷正在執行的 CD / post-deploy job"
|
||||
log "⏸️ Gitea host runner 維持停用;需同時設定 AWOOOI_START_GITEA_RUNNER_ON_BOOT=1 與建立 $RUNNER_ENABLE_SENTINEL 才允許 startup 啟動"
|
||||
fi
|
||||
systemctl disable --now "$RUNNER_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl kill -s SIGKILL "$RUNNER_SERVICE" >/dev/null 2>&1 || true
|
||||
pkill -KILL -f "$RUNNER_DIR/act_runner daemon" >/dev/null 2>&1 || true
|
||||
fi
|
||||
|
||||
# 已停用 Docker-wrapped runner;避免它搶走 host label job。
|
||||
|
||||
Reference in New Issue
Block a user