fix(ci): isolate 110 runner labels and gate host pressure [skip ci]
This commit is contained in:
@@ -18,7 +18,7 @@ env:
|
||||
|
||||
jobs:
|
||||
market-watch:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -14,7 +14,7 @@ on:
|
||||
|
||||
jobs:
|
||||
ai-technology-watch:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -26,7 +26,7 @@ env:
|
||||
|
||||
jobs:
|
||||
build-and-deploy-dev:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
@@ -79,6 +79,12 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-27 Codex: fail closed before tests too. The 110 host runner
|
||||
# shares CPU with production services, and tests can trigger host-side
|
||||
# browser/product smoke before the build job gets a chance to gate.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Guard Workflow Secret Surfaces
|
||||
run: node scripts/ci/check-gitea-step-env-secrets.js
|
||||
|
||||
@@ -325,6 +331,11 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-27 Codex: post-deploy smoke is also browser-heavy. Refuse to
|
||||
# add another smoke run while active CI/build/smoke pressure is present.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Get Commit Info
|
||||
id: commit
|
||||
run: |
|
||||
|
||||
@@ -23,7 +23,7 @@ env:
|
||||
|
||||
jobs:
|
||||
ai-code-review:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
timeout-minutes: 8
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -22,7 +22,7 @@ env:
|
||||
jobs:
|
||||
deploy-alerts:
|
||||
name: "Deploy Prometheus Alert Rules"
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@@ -23,7 +23,7 @@ env:
|
||||
|
||||
jobs:
|
||||
e2e-health:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ env:
|
||||
|
||||
jobs:
|
||||
migrate:
|
||||
runs-on: ubuntu-latest # 或 self-hosted runner on 110
|
||||
runs-on: awoooi-ubuntu # 或 self-hosted runner on 110
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
@@ -25,7 +25,7 @@ on:
|
||||
|
||||
jobs:
|
||||
check-type-sync:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: awoooi-ubuntu
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
@@ -3,13 +3,31 @@ set -euo pipefail
|
||||
|
||||
# 2026-05-21 Codex: protect the shared 110 host runner from overlapping
|
||||
# host-side frontend production builds launched by other repositories.
|
||||
# This is intentionally a wait gate, not an auto-repair step: it never kills,
|
||||
# renices, or rewrites another repo's process tree.
|
||||
# 2026-06-27 Codex: make the gate enforce real host pressure too. 110 is both a
|
||||
# production host and a CI host, so CD must not start a new Docker/Next build
|
||||
# while load, BuildKit, Gitea Actions, or headless smoke pressure is already high.
|
||||
# This gate never kills, renices, or rewrites another repo's process tree.
|
||||
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-60}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-10}"
|
||||
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"
|
||||
PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu}"
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-60}}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"
|
||||
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-0}"
|
||||
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
|
||||
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
|
||||
MAX_ACTIVE_CI_PROCESS_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_PROCESS_GROUPS:-0}"
|
||||
MAX_ACTIVE_CI_CONTAINERS="${HOST_WEB_BUILD_PRESSURE_MAX_ACTIVE_CI_CONTAINERS:-0}"
|
||||
MAX_ORPHAN_BROWSER_GROUPS="${HOST_WEB_BUILD_PRESSURE_MAX_ORPHAN_BROWSER_GROUPS:-0}"
|
||||
METRICS_FILE="${HOST_RUNAWAY_PROCESS_METRICS_FILE:-${HOST_WEB_BUILD_PRESSURE_METRICS_FILE:-/home/wooo/node_exporter_textfiles/host_runaway_process.prom}}"
|
||||
EXPORTER="${HOST_RUNAWAY_PROCESS_EXPORTER:-/home/wooo/scripts/host-runaway-process-exporter.py}"
|
||||
|
||||
#######################################
# Pick the process-listing command line this host's ps understands.
# Probes the GNU/procps invocation (sorted by CPU, highest first); when that
# invocation fails, the BSD-style column list is printed instead.
# Outputs: the chosen command line on stdout, followed by a newline.
# Returns: 0.
#######################################
default_ps_command() {
  local sorted_form="ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu"
  local plain_form="ps -axo pid=,ppid=,pcpu=,pmem=,command="

  # The probe actually runs ps once; only its exit status matters here.
  if ps -eo pid=,ppid=,pcpu=,pmem=,args= --sort=-pcpu >/dev/null 2>&1; then
    printf '%s\n' "$sorted_form"
  else
    printf '%s\n' "$plain_form"
  fi
}
|
||||
|
||||
PS_COMMAND="${HOST_WEB_BUILD_PRESSURE_PS_COMMAND:-$(default_ps_command)}"
|
||||
|
||||
list_foreign_web_builds() {
|
||||
bash -c "$PS_COMMAND" | awk '
|
||||
@@ -25,23 +43,141 @@ list_foreign_web_builds() {
|
||||
'
|
||||
}
|
||||
|
||||
#######################################
# Best-effort refresh of the host pressure metrics before they are read.
# Runs the exporter at $EXPORTER (when it is an executable file) with default
# values for its environment knobs; caller-provided values win.
# Globals:   EXPORTER (read), AIOPS_HOST_LABEL, NODE_EXPORTER_TEXTFILE_DIR,
#            AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS,
#            AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT (read, defaulted)
# Outputs:   nothing on stdout; a warning on stderr when the exporter fails,
#            because the gate would otherwise silently decide on stale data.
# Returns:   always 0 - a broken exporter must not abort the gate itself.
#######################################
refresh_metrics() {
  if [ ! -x "$EXPORTER" ]; then
    return 0
  fi

  local exporter_status=0
  # Exporter output stays suppressed; only its exit status is surfaced.
  AIOPS_HOST_LABEL="${AIOPS_HOST_LABEL:-110}" \
  NODE_EXPORTER_TEXTFILE_DIR="${NODE_EXPORTER_TEXTFILE_DIR:-/home/wooo/node_exporter_textfiles}" \
  AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS="${AIOPS_RUNAWAY_PROCESS_MIN_AGE_SECONDS:-1800}" \
  AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT="${AIOPS_RUNAWAY_PROCESS_MIN_CPU_PERCENT:-50}" \
    "$EXPORTER" >/dev/null 2>&1 || exporter_status=$?

  if [ "$exporter_status" -ne 0 ]; then
    printf '%s\n' "⚠️ host pressure exporter ${EXPORTER} exited ${exporter_status}; metrics may be stale" >&2
  fi
  return 0
}
|
||||
|
||||
#######################################
# Print the value of a metric from the Prometheus text file $METRICS_FILE.
# A line matches when its first field is exactly the metric name or the name
# followed by a label set. When several series match, the last one wins.
# Sample lines have the shape: name[{labels}] value [timestamp]. The value is
# taken as the first token after the name/label set, so an optional trailing
# timestamp is never mistaken for the value, and a line with no value at all
# is ignored.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - metric name
# Outputs:   the value on stdout
# Returns:   0 when found; 1 when the file is unreadable or nothing matched
#######################################
metric_value() {
  local name="$1"
  if [ ! -r "$METRICS_FILE" ]; then
    return 1
  fi
  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") {
      sample = $0
      if (index(sample, "}") > 0) {
        # Greedy on purpose: label values may themselves contain a brace.
        sub(/^.*[}]/, "", sample)
      } else {
        sub(/^[ \t]*[^ \t]+/, "", sample)
      }
      if (split(sample, tokens, " ") >= 1) {
        value = tokens[1]
      }
    }
    END {
      if (value != "") print value
      else exit 1
    }
  ' "$METRICS_FILE"
}
|
||||
|
||||
#######################################
# Print the sum of every series of a metric in the Prometheus text file
# $METRICS_FILE. A line matches when its first field is exactly the metric
# name or the name followed by a label set.
# Sample lines have the shape: name[{labels}] value [timestamp]. The value is
# taken as the first token after the name/label set, so an optional trailing
# timestamp is never added into the total, and a line with no value at all
# is ignored.
# Globals:   METRICS_FILE (read)
# Arguments: $1 - metric name
# Outputs:   the sum on stdout
# Returns:   0 when at least one series matched; 1 when the file is
#            unreadable or nothing matched
#######################################
metric_sum() {
  local name="$1"
  if [ ! -r "$METRICS_FILE" ]; then
    return 1
  fi
  awk -v metric="$name" '
    $1 ~ ("^" metric "(\\{|$)") {
      sample = $0
      if (index(sample, "}") > 0) {
        # Greedy on purpose: label values may themselves contain a brace.
        sub(/^.*[}]/, "", sample)
      } else {
        sub(/^[ \t]*[^ \t]+/, "", sample)
      }
      if (split(sample, tokens, " ") >= 1) {
        sum += tokens[1]
        found = 1
      }
    }
    END {
      if (found) print sum
      else exit 1
    }
  ' "$METRICS_FILE"
}
|
||||
|
||||
#######################################
# Print the 5-minute load average divided by the number of CPU cores.
# The exported metric awoooi_host_load5_per_core is preferred; when it cannot
# be read, the ratio is computed from /proc/loadavg and /proc/cpuinfo.
# Outputs: the ratio on stdout (six decimals when computed from /proc)
# Returns: 0 on success; non-zero when neither source is available
#######################################
load5_per_core() {
  if metric_value "awoooi_host_load5_per_core" 2>/dev/null; then
    return 0
  fi

  awk '
    BEGIN {
      cpu_count = 0
      while ((getline cpu_line < "/proc/cpuinfo") > 0) {
        if (cpu_line ~ /^processor[[:space:]]*:/) cpu_count += 1
      }
      close("/proc/cpuinfo")
      # Never divide by zero when cpuinfo is missing or has another layout.
      if (cpu_count < 1) cpu_count = 1

      if ((getline load_line < "/proc/loadavg") <= 0) exit 1
      # Field 2 of /proc/loadavg is the 5-minute average.
      split(load_line, load_fields, " ")
      printf "%.6f\n", load_fields[2] / cpu_count
    }
  '
}
|
||||
|
||||
#######################################
# Compare two values with awk so that decimals work (shell arithmetic is
# integer-only).
# Arguments: $1 - left operand, $2 - right operand
# Returns:   0 when $1 > $2, 1 otherwise
#######################################
greater_than() {
  awk -v lhs="$1" -v rhs="$2" 'BEGIN {
    if (lhs > rhs) exit 0
    exit 1
  }'
}
|
||||
|
||||
#######################################
# List processes that look like browser-based smoke runs: Chrome working under
# a /tmp/stockplatform path, StockPlatform "...-smoke" runners, or anything
# started with the new headless flag. Lines belonging to this gate script
# itself are dropped.
# Matching is case-insensitive. It is done by lower-casing each line instead
# of setting IGNORECASE, which only gawk honours; on mawk (the default awk on
# Debian/Ubuntu) that assignment is a no-op and matching silently became
# case-sensitive.
# The bracketed first letters in the pattern keep the awk process's own
# command line (which contains the pattern text) from matching. For the same
# reason no comments live inside the awk program.
# Globals:   PS_COMMAND (read) - command line that prints the process table
# Outputs:   matching process lines, unmodified, on stdout
#######################################
list_headless_smoke_pressure() {
  bash -c "$PS_COMMAND" | awk '
    {
      lowered = tolower($0)
      if (lowered ~ /scripts\/ci\/wait-host-web-build-pressure\.sh/) next
      if (lowered ~ /[c]hrome.*\/tmp\/stockplatform|[s]tockplatform-[a-z0-9_-]*smoke|[h]eadless=new/) print
    }
  '
}
|
||||
|
||||
#######################################
# Build a human-readable report of everything currently over its limit.
# Each reading falls back to 0 when its source cannot be read, so a missing
# metric never counts as pressure.
# Globals:   MAX_LOAD5_PER_CORE, MAX_CI_CPU_PERCENT,
#            MAX_ACTIVE_CI_PROCESS_GROUPS, MAX_ACTIVE_CI_CONTAINERS,
#            MAX_ORPHAN_BROWSER_GROUPS (read)
# Outputs:   one line per exceeded limit, then (when smoke processes are
#            running) a heading plus at most six of their process lines.
#            Nothing at all when the host is calm.
#######################################
pressure_report() {
  local findings=""
  local load_ratio ci_cpu ci_groups ci_containers orphan_groups
  local smoke_lines smoke_head

  # Gather every reading first; declaration and assignment stay separate.
  load_ratio="$(load5_per_core 2>/dev/null || echo 0)"
  ci_cpu="$(metric_value "awoooi_host_gitea_actions_active_process_cpu_percent" 2>/dev/null || echo 0)"
  ci_groups="$(metric_value "awoooi_host_gitea_actions_active_process_group_count" 2>/dev/null || echo 0)"
  ci_containers="$(metric_value "awoooi_host_gitea_actions_active_container_count" 2>/dev/null || echo 0)"
  # Two metric names are tried in order for the orphan browser group count.
  orphan_groups="$(
    metric_sum "awoooi_host_runaway_browser_orphan_group_count" 2>/dev/null \
      || metric_sum "awoooi_host_orphan_browser_group_count" 2>/dev/null \
      || echo 0
  )"

  if greater_than "$load_ratio" "$MAX_LOAD5_PER_CORE"; then
    findings+="host load5/core ${load_ratio} > ${MAX_LOAD5_PER_CORE}"$'\n'
  fi
  if greater_than "$ci_cpu" "$MAX_CI_CPU_PERCENT"; then
    findings+="active CI/BuildKit CPU ${ci_cpu}% > ${MAX_CI_CPU_PERCENT}%"$'\n'
  fi
  if greater_than "$ci_groups" "$MAX_ACTIVE_CI_PROCESS_GROUPS"; then
    findings+="active CI/BuildKit process groups ${ci_groups} > ${MAX_ACTIVE_CI_PROCESS_GROUPS}"$'\n'
  fi
  if greater_than "$ci_containers" "$MAX_ACTIVE_CI_CONTAINERS"; then
    findings+="active Gitea Actions containers ${ci_containers} > ${MAX_ACTIVE_CI_CONTAINERS}"$'\n'
  fi
  if greater_than "$orphan_groups" "$MAX_ORPHAN_BROWSER_GROUPS"; then
    findings+="orphan browser/smoke groups ${orphan_groups} > ${MAX_ORPHAN_BROWSER_GROUPS}"$'\n'
  fi

  smoke_lines="$(list_headless_smoke_pressure || true)"
  if [[ -n "$smoke_lines" ]]; then
    # Cap the listing so one noisy browser tree cannot flood the CI log.
    smoke_head="$(head -n 6 <<<"$smoke_lines")"
    findings+="active headless smoke pressure detected"$'\n'"${smoke_head}"$'\n'
  fi

  printf '%s' "$findings"
}
|
||||
|
||||
for attempt in $(seq 1 "$ATTEMPTS"); do
|
||||
refresh_metrics
|
||||
active_builds="$(list_foreign_web_builds || true)"
|
||||
if [ -z "$active_builds" ]; then
|
||||
echo "✅ no foreign host web build pressure detected"
|
||||
host_pressure="$(pressure_report || true)"
|
||||
if [ -z "$active_builds" ] && [ -z "$host_pressure" ]; then
|
||||
echo "✅ no host web/build/smoke pressure detected"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "⏳ foreign host web build pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
|
||||
printf '%s\n' "$active_builds" | head -n 8
|
||||
echo "⏳ host web/build/smoke pressure detected (attempt ${attempt}/${ATTEMPTS}); waiting ${SLEEP_SECONDS}s"
|
||||
if [ -n "$host_pressure" ]; then
|
||||
printf '%s\n' "$host_pressure" | sed -n '1,12p'
|
||||
fi
|
||||
if [ -n "$active_builds" ]; then
|
||||
printf '%s\n' "$active_builds" | head -n 8
|
||||
fi
|
||||
sleep "$SLEEP_SECONDS"
|
||||
done
|
||||
|
||||
echo "⚠️ foreign host web build pressure still active after ${ATTEMPTS} checks"
|
||||
echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
|
||||
if [ "$WARN_ONLY" = "1" ]; then
|
||||
echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "❌ refusing to start AWOOI image build while foreign host web builds are still active"
|
||||
echo "❌ refusing to start AWOOI image build while host web/build/smoke pressure is still active"
|
||||
exit 1
|
||||
|
||||
@@ -219,6 +219,33 @@ else:
|
||||
lines.insert(idx + 1, " shutdown_timeout: 1h")
|
||||
break
|
||||
path.write_text("\n".join(lines) + "\n")
|
||||
PY
|
||||
|
||||
# 110 是 production / registry / observability 主機,不再接泛用
|
||||
# ubuntu-latest 類 job。泛用 label 會讓 StockPlatform 等其他 repo 的
|
||||
# build 跑到 110,造成 CPU 長時間尖峰。AWOOOI 非 host job 改用
|
||||
# awoooi-ubuntu,CD / post-deploy 保留 awoooi-host。
|
||||
python3 - "$RUNNER_DIR/config.yaml" <<'PY' || true
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
path = Path(sys.argv[1])
|
||||
lines = path.read_text().splitlines()
|
||||
output = []
|
||||
idx = 0
|
||||
while idx < len(lines):
|
||||
line = lines[idx]
|
||||
if line.strip() == "labels:":
|
||||
output.append(line)
|
||||
output.append(' - "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04"')
|
||||
output.append(' - "awoooi-host:host"')
|
||||
idx += 1
|
||||
while idx < len(lines) and (lines[idx].startswith(" - ") or not lines[idx].strip()):
|
||||
idx += 1
|
||||
continue
|
||||
output.append(line)
|
||||
idx += 1
|
||||
path.write_text("\n".join(output) + "\n")
|
||||
PY
|
||||
|
||||
if systemctl list-unit-files "$RUNNER_SERVICE" >/dev/null 2>&1; then
|
||||
@@ -307,7 +334,7 @@ log "Harbor: http://192.168.0.110:5000"
|
||||
log "Gitea: http://192.168.0.110:3001"
|
||||
log "Grafana: http://192.168.0.110:3002"
|
||||
log "Alertmanager: http://192.168.0.110:9093"
|
||||
log "Gitea Runner: docker logs gitea-runner"
|
||||
log "Gitea Runner: systemctl --user status gitea-act-runner-host.service"
|
||||
log "Sentry: http://192.168.0.110:9000"
|
||||
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user