fix(ci): isolate 110 runner labels and gate host pressure [skip ci]

This commit is contained in:
ogt
2026-06-27 19:37:09 +08:00
parent 551227f3bb
commit 5f37de539c
11 changed files with 195 additions and 21 deletions

View File

@@ -18,7 +18,7 @@ env:
jobs:
market-watch:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
timeout-minutes: 10
steps:
- uses: actions/checkout@v4

View File

@@ -14,7 +14,7 @@ on:
jobs:
ai-technology-watch:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
timeout-minutes: 10
steps:
- uses: actions/checkout@v4

View File

@@ -26,7 +26,7 @@ env:
jobs:
build-and-deploy-dev:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
steps:
- uses: actions/checkout@v4

View File

@@ -79,6 +79,12 @@ jobs:
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
# 2026-06-27 Codex: fail closed before tests too. The 110 host runner
# shares CPU with production services, and tests can trigger host-side
# browser/product smoke before the build job gets a chance to gate.
run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Guard Workflow Secret Surfaces
run: node scripts/ci/check-gitea-step-env-secrets.js
@@ -325,6 +331,11 @@ jobs:
- uses: actions/checkout@v4
- name: Wait for Host Web Build Pressure
# 2026-06-27 Codex: post-deploy smoke is also browser-heavy. Refuse to
# add another smoke run while active CI/build/smoke pressure is present.
run: bash scripts/ci/wait-host-web-build-pressure.sh
- name: Get Commit Info
id: commit
run: |

View File

@@ -23,7 +23,7 @@ env:
jobs:
ai-code-review:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
timeout-minutes: 8
steps:
- uses: actions/checkout@v4

View File

@@ -22,7 +22,7 @@ env:
jobs:
deploy-alerts:
name: "Deploy Prometheus Alert Rules"
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
timeout-minutes: 5
steps:
- uses: actions/checkout@v4

View File

@@ -23,7 +23,7 @@ env:
jobs:
e2e-health:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
steps:
- uses: actions/checkout@v4

View File

@@ -24,7 +24,7 @@ env:
jobs:
migrate:
runs-on: ubuntu-latest # 或 self-hosted runner on 110
runs-on: awoooi-ubuntu # 或 self-hosted runner on 110
steps:
- name: Checkout

View File

@@ -25,7 +25,7 @@ on:
jobs:
check-type-sync:
runs-on: ubuntu-latest
runs-on: awoooi-ubuntu
steps:
- uses: actions/checkout@v4

View File

@@ -3,13 +3,31 @@ set -euo pipefail
# 2026-05-21 Codex: protect the shared 110 host runner from overlapping
# host-side frontend production builds launched by other repositories.
# 2026-06-27 Codex: make the gate enforce real host pressure too. 110 is both a
# production host and a CI host, so CD must not start a new Docker/Next build
# while load, BuildKit, Gitea Actions, or headless smoke pressure is already high.
# This is intentionally a wait gate, not an auto-repair step: it never kills,
# renices, or rewrites another repo's process tree.
#
# Every knob below can be overridden from the environment; the value after
# ":-" is the default used when the variable is unset or empty.

# Polling budget: how many checks to run and how long to sleep between them.
# The *_MAX_ATTEMPTS / *_INTERVAL names are accepted as fallback spellings.
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}}"
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"

# Fail closed by default: only the exact value "1" lets the job continue once
# the polling budget is exhausted while pressure is still present.
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}"

# Pressure thresholds. Each one is compared with a strict "greater than", so a
# limit of 0 means any non-zero reading counts as pressure.
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GROUPS:-0}"
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-0}"
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"

# Prometheus textfile the metric helpers read, and the exporter script that is
# run (when executable) before each check to refresh it.
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}}"
EXPORTER="${HOST_RUNAWAY_PROCESS_EXPORTER:-/home/wooo/scripts/host-runaway-process-exporter.py}"
# Print the ps command line this script should use to list processes.
# Probes whether the local ps accepts the "--sort=-pcpu" form; if it does,
# that CPU-sorted form is printed, otherwise the "-axo ... command=" form
# (no sorting) is printed instead.
# Outputs: exactly one line on stdout.
default_ps_command() {
  local sorted_form="ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu"
  local fallback_form="ps -axo pid=,ppid=,pcpu=,pmem=,command="
  local chosen="$fallback_form"

  if ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu >/dev/null 2>&1; then
    chosen="$sorted_form"
  fi
  printf '%s\n' "$chosen"
}

# An explicit HOST_WEB_BUILD_PRESSURE_PS_COMMAND always wins over the probe.
PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-$(default_ps_command)}"
list_foreign_web_builds() {
bash -c "$PS_COMMAND" | awk '
@@ -25,23 +43,141 @@ list_foreign_web_builds() {
'
}
# Best-effort refresh of the metrics textfile before a pressure check.
# Globals: EXPORTER (read) - path of the exporter script.
# Does nothing when EXPORTER is not an executable file. Otherwise runs it with
# defaults filled in for the AIOPS_* / NODE_EXPORTER_TEXTFILE_DIR variables
# (values already present in the environment are kept). The exporter's output
# is discarded and its failure is ignored on purpose: the gate then works
# with whatever metrics file is already on disk.
# Returns: always 0.
refresh_metrics() {
  [ -x "$EXPORTER" ] || return 0

  # Subshell so the exported defaults never leak into the calling script.
  (
    export AIOPS_HOST_LABEL="${AIOPS_HOST_LABEL:-110}"
    export NODE_EXPORTER_TEXTFILE_DIR="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}"
    export AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS="${AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS:-1800}"
    export AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT="${AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT:-50}"
    "$EXPORTER" >/dev/null 2>&1
  ) || true
}
# Print the value of one metric from the metrics textfile.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - metric name; a line matches when its first field is the name
#                 itself or the name followed by a "{...}" label set.
# Outputs:   the last whitespace-separated field of the LAST matching line.
# Returns:   1 when the file is unreadable or no line matches.
metric_value() {
  local wanted="$1"

  [ -r "$METRICS_FILE" ] || return 1

  awk -v metric="$wanted" '
    BEGIN { pattern = "^" metric "(\\{|$)" }
    $1 ~ pattern {
      latest = $NF
      seen = 1
    }
    END {
      if (!seen) exit 1
      print latest
    }
  ' "$METRICS_FILE"
}
# Print the sum of every sample of one metric in the metrics textfile.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - metric name; a line matches when its first field is the name
#                 itself or the name followed by a "{...}" label set.
# Outputs:   the sum of the last field of all matching lines.
# Returns:   1 when the file is unreadable or no line matches.
metric_sum() {
  local wanted="$1"

  [ -r "$METRICS_FILE" ] || return 1

  awk -v metric="$wanted" '
    BEGIN { pattern = "^" metric "(\\{|$)" }
    $1 ~ pattern {
      total += $NF
      hits++
    }
    END {
      if (!hits) exit 1
      print total
    }
  ' "$METRICS_FILE"
}
# Print the host's load-per-core figure used by the pressure gate.
# Prefers the exported "awoooi_host_load5_per_core" metric. When that is not
# available, falls back to /proc: the second field of /proc/loadavg divided by
# the number of "processor" entries in /proc/cpuinfo (at least 1), printed
# with six decimals.
# Returns: non-zero when neither source yields a value.
load5_per_core() {
  if metric_value "awoooi_host_load5_per_core" 2>/dev/null; then
    return 0
  fi

  awk '
    BEGIN {
      cpu_count = 0
      while ((getline entry < "/proc/cpuinfo") > 0) {
        if (entry ~ /^processor[[:space:]]*:/) cpu_count++
      }
      close("/proc/cpuinfo")
      if (cpu_count < 1) cpu_count = 1

      if ((getline loadavg_line < "/proc/loadavg") <= 0) exit 1
      split(loadavg_line, fields, " ")
      printf "%.6f\n", fields[2] / cpu_count
    }
  '
}
# Succeed (exit 0) when $1 is strictly greater than $2, fail (exit 1) otherwise.
# The comparison is delegated to awk so decimal values such as 0.85 work,
# which shell integer arithmetic cannot handle.
greater_than() {
  awk -v lhs="$1" -v rhs="$2" 'BEGIN { if (lhs > rhs) exit 0; exit 1 }'
}
# List processes that look like active headless browser / smoke-test runs.
# Globals: PS_COMMAND (read) - command line executed through "bash -c" to
#          produce the process listing.
# Outputs: every listing line (unchanged) that matches one of the smoke
#          patterns, except lines belonging to this gate script itself.
#
# Matching is case-insensitive. The previous version relied on IGNORECASE,
# which only gawk honours; other awk implementations silently treat it as an
# ordinary variable, leaving the match case-sensitive. Lower-casing the line
# first behaves the same on every awk.
#
# The patterns keep the bracketed first letter so that the awk process's own
# command line (which contains the pattern text) cannot match itself. For the
# same reason no explanatory comments are placed inside the awk program.
list_headless_smoke_pressure() {
  bash -c "$PS_COMMAND" | awk '
    {
      lowered = tolower($0)
      if (lowered ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
      if (lowered ~ /[c]hrome.*\/tmp\/stockplatform|[s]tockplatform-[[:alnum:]_-]*smoke|[h]eadless=new/) print
    }
  '
}
# Build a human-readable report of every pressure signal over its limit.
# Globals: MAX_LOAD5_PER_CORE, MAX_CI_CPU_PERCENT, MAX_ACTIVE_CI_PROCESS_GROUPS,
#          MAX_ACTIVE_CI_CONTAINERS, MAX_ORPHAN_BROWSER_GROUPS (all read)
# Outputs: one line per exceeded limit, followed, when headless smoke
#          processes are found, by a heading and up to six of their listing
#          lines. Prints nothing when no signal is over its limit.
# A reading that cannot be obtained is treated as 0, so a missing metric never
# counts as pressure by itself.
pressure_report() {
  local findings=""
  local ratio ci_cpu ci_groups ci_containers orphans smoke

  ratio="$(load5_per_core 2>/dev/null || echo 0)"
  ci_cpu="$(metric_value "awoooi_host_gitea_actions_active_process_cpu_percent" 2>/dev/null || echo 0)"
  ci_groups="$(metric_value "awoooi_host_gitea_actions_active_process_group_count" 2>/dev/null || echo 0)"
  ci_containers="$(metric_value "awoooi_host_gitea_actions_active_container_count" 2>/dev/null || echo 0)"
  # Two metric names are tried for the orphan browser count, in this order.
  orphans="$(
    metric_sum "awoooi_host_runaway_browser_orphan_group_count" 2>/dev/null \
      || metric_sum "awoooi_host_orphan_browser_group_count" 2>/dev/null \
      || echo 0
  )"

  greater_than "$ratio" "$MAX_LOAD5_PER_CORE" \
    && findings+="host load5/core ${ratio} > ${MAX_LOAD5_PER_CORE}"$'\n'
  greater_than "$ci_cpu" "$MAX_CI_CPU_PERCENT" \
    && findings+="active CI/BuildKit CPU ${ci_cpu}% > ${MAX_CI_CPU_PERCENT}%"$'\n'
  greater_than "$ci_groups" "$MAX_ACTIVE_CI_PROCESS_GROUPS" \
    && findings+="active CI/BuildKit process groups ${ci_groups} > ${MAX_ACTIVE_CI_PROCESS_GROUPS}"$'\n'
  greater_than "$ci_containers" "$MAX_ACTIVE_CI_CONTAINERS" \
    && findings+="active Gitea Actions containers ${ci_containers} > ${MAX_ACTIVE_CI_CONTAINERS}"$'\n'
  greater_than "$orphans" "$MAX_ORPHAN_BROWSER_GROUPS" \
    && findings+="orphan browser/smoke groups ${orphans} > ${MAX_ORPHAN_BROWSER_GROUPS}"$'\n'

  smoke="$(list_headless_smoke_pressure || true)"
  if [[ -n "$smoke" ]]; then
    findings+="active headless smoke pressure detected"$'\n'
    # Cap the process listing so the CI log stays readable.
    findings+="$(head -n 6 <<<"$smoke")"$'\n'
  fi

  printf '%s' "$findings"
}
# Main gate: poll until the host is quiet or the polling budget runs out.
# Each round refreshes the metrics file, then checks two independent signals:
# foreign web builds found in the process list, and the host pressure report.
# The script exits 0 as soon as both are empty.
#
# An arithmetic loop replaces "seq" so the bound is evaluated by the shell
# itself and an ATTEMPTS of 0 simply runs no rounds.
for ((attempt = 1; attempt <= ATTEMPTS; attempt++)); do
  refresh_metrics
  # "|| true": a failing probe must not abort the gate under "set -e"; an
  # empty result is simply treated as "nothing detected" for that signal.
  active_builds="$(list_foreign_web_builds || true)"
  host_pressure="$(pressure_report || true)"
  if [ -z "$active_builds" ] && [ -z "$host_pressure" ]; then
    echo "✅ no host web/build/smoke pressure detected"
    exit 0
  fi

  echo "⏳ host web/build/smoke pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
  # Both listings are truncated to keep the CI log short.
  if [ -n "$host_pressure" ]; then
    printf '%s\n' "$host_pressure" | sed -n '1,12p'
  fi
  if [ -n "$active_builds" ]; then
    printf '%s\n' "$active_builds" | head -n 8
  fi
  sleep "$SLEEP_SECONDS"
done

# Budget exhausted while pressure is still present.
echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
if [ "$WARN_ONLY" = "1" ]; then
  # Explicit opt-out of fail-closed behaviour: let the job continue anyway.
  echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
  exit 0
fi
echo "❌ refusing to start AWOOI image build while host web/build/smoke pressure is still active"
exit 1

View File

@@ -219,6 +219,33 @@ else:
lines.insert(idx + 1, " shutdown_timeout: 1h")
break
path.write_text("\n".join(lines) + "\n")
PY
# 110 is the production / registry / observability host, so it no longer
# accepts generic "ubuntu-latest"-style jobs. A generic label lets builds from
# other repositories (StockPlatform, for example) land on 110 and cause long
# CPU spikes. AWOOOI jobs that do not need the host use "awoooi-ubuntu";
# CD / post-deploy jobs keep "awoooi-host".
#
# The rewrite is best-effort: a failure is reported on stderr but does not
# abort the rest of the setup.
python3 - "$RUNNER_DIR/config.yaml" <<'PY' || printf '%s\n' "WARN: could not rewrite runner labels in $RUNNER_DIR/config.yaml" >&2
import sys
from pathlib import Path

path = Path(sys.argv[1])
lines = path.read_text().splitlines()
output = []
idx = 0
while idx < len(lines):
    line = lines[idx]
    if line.strip() == "labels:":
        # Align the new entries with the "labels:" key so the result stays
        # valid YAML whatever nesting depth the key sits at.
        indent = line[: len(line) - len(line.lstrip())]
        output.append(line)
        output.append(f'{indent}  - "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"')
        output.append(f'{indent}  - "awoooi-host:host"')
        idx += 1
        # Drop the previous label entries (and blank lines between them),
        # regardless of how deeply they were indented.
        while idx < len(lines) and (
            lines[idx].lstrip().startswith("- ") or not lines[idx].strip()
        ):
            idx += 1
        continue
    output.append(line)
    idx += 1
path.write_text("\n".join(output) + "\n")
PY
if systemctl list-unit-files "$RUNNER_SERVICE" >/dev/null 2>&1; then
@@ -307,7 +334,7 @@ log "Harbor: http://192.168.0.110:5000"
log "Gitea: http://192.168.0.110:3001"
log "Grafana: http://192.168.0.110:3002"
log "Alertmanager: http://192.168.0.110:9093"
log "Gitea Runner: docker logs gitea-runner"
log "Gitea Runner: systemctl --user status gitea-act-runner-host.service"
log "Sentry: http://192.168.0.110:9000"
exit 0