From c7241365ebef83c525c2a56fe2482b08b6f572a0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Jul 2026 00:33:42 +0800 Subject: [PATCH] fix(ops): classify hostedtoolcache ci load [metadata-only] --- docs/LOGBOOK.md | 22 ++++++++++++ ...recovery-readback-2026-07-01.snapshot.json | 3 ++ docs/runbooks/FULL-STACK-COLD-START-SOP.md | 22 ++++++++++++ scripts/ops/host-runaway-process-exporter.py | 3 +- scripts/ops/host-sustained-load-evidence.py | 10 +++++- .../test_host_runaway_process_exporter.py | 36 +++++++++++++++++-- 6 files changed, 91 insertions(+), 5 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 9535817a..84377576 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51990,6 +51990,28 @@ production browser smoke: **下一步**: - 以 `[skip ci]` source commit 推回 Gitea main,避免只剩 live drift;然後回到 Gitea queue / 110 control-path / reboot SLO blocker 主線。 +## 2026-07-02 — 00:31 P0 110 active CI load classifier 修正 + +**完成內容**: +- `HostLoadAverageSustainedHigh` 再次 firing 時,SSH 診斷讀回 `node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm install --frozen-lockfile`,CPU 約 `269%`;這是 Gitea Actions 類 CI install load,不是 StockPlatform DB、orphan browser、DB deadlock 或磁碟 IO wait。 +- 同輪 `vmstat` 顯示 `b=0`、IO wait 約 `0-1%`,所以不是 disk stuck;StockPlatform DB 已被 cgroup 壓住,後續 direct stats 回到低 CPU。 +- 根因是 `host-runaway-process-exporter.py` / `host-sustained-load-evidence.py` 未把 `/opt/hostedtoolcache`、`pnpm install --frozen-lockfile`、`npm ci`、`yarn install` 歸類為 active Gitea Actions load,導致 `awoooi_host_gitea_actions_active_process_count` 可誤報 `0`。 +- 已修正 classifier,active CI install load 會走 runner / queue / pressure gate lane,不會再誤導成 unknown / Gitea backlog / Stock DB 壓力。 +- 新增 regression test,確認 sanitized evidence 不輸出 `/opt/hostedtoolcache` 或 raw `pnpm install` command。 + +**本地驗證結果**: +- `python3.11 -m pytest scripts/ops/tests/test_host_runaway_process_exporter.py scripts/ops/tests/test_host_pressure_alert_contract.py -q`:`24 passed`。 +- `python3.11 -m py_compile scripts/ops/host-runaway-process-exporter.py scripts/ops/host-sustained-load-evidence.py scripts/ops/tests/test_host_runaway_process_exporter.py`:通過。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune / process kill。 + +**下一步**: +- 同步新版 exporter / evidence script 到 110 live textfile path,跑一次 read-only exporter,讀回 active CI metrics;再 rebase / commit / push source。 + ## 2026-07-02 — P0 AI loop stale control-path blocker production readback 修正 **完成內容**: diff --git a/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json b/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json index a3ad4d73..073c367d 100644 --- a/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json +++ b/docs/operations/post-reboot-runtime-recovery-readback-2026-07-01.snapshot.json @@ -147,7 +147,10 @@ }, "source_updates": [ "scripts/ops/docker-disk-pressure-retention-cleanup.py", + "scripts/ops/host-runaway-process-exporter.py", + "scripts/ops/host-sustained-load-evidence.py", "scripts/ops/tests/test_docker_disk_pressure_retention_cleanup.py", + "scripts/ops/tests/test_host_runaway_process_exporter.py", "ops/reboot-recovery/full-stack-cold-start-baseline.yml", "docs/runbooks/FULL-STACK-COLD-START-SOP.md", "docs/LOGBOOK.md" diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 6fb7205e..7c8007df 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -2499,3 +2499,25 @@ docker update --cpus=1.5 --memory=4g --memory-swap=6g --restart unless-stopped s - 可宣稱:110 sustained moderate CPU alert 已主動觸發、已定位、已用 Docker guardrail 降壓,StockPlatform route / health / freshness 未受損。 - 不可宣稱:110 永久不會再高負載;Gitea queue / hook backlog 仍需在後續 queue / control-path lane 持續觀察。 - 不得用 DB restart、container restart、manual DB write、query kill 或 fake freshness 來處理 StockPlatform pressure。 + +### 16.5 2026-07-02 hostedtoolcache / pnpm install 必須算 active CI load + +同一輪 110 high load 診斷發現 `node /opt/hostedtoolcache/node/.../pnpm install --frozen-lockfile` 會造成短期 runnable burst。這是 Gitea Actions 類 CI install load,但舊 exporter 只認 `/.cache/act/`、Docker build、BuildKit、`next build`,導致 `awoooi_host_gitea_actions_active_process_count` 誤報 `0`,進而讓 controller 看起來像 unknown / Gitea backlog,而不是 active CI。 + +修正 source-of-truth: + +- `scripts/ops/host-runaway-process-exporter.py` +- `scripts/ops/host-sustained-load-evidence.py` +- `scripts/ops/tests/test_host_runaway_process_exporter.py` + +新增分類規則: + +- `/opt/hostedtoolcache/` +- `pnpm install` +- `npm ci` +- `yarn install` + +宣告限制: + +- active CI load 只能走 runner / queue / pressure gate lane;不得直接 `SIGKILL`、`docker restart`、`systemctl restart`、DB restart 或恢復 generic runner。 +- evidence output 仍必須保持 sanitized,不輸出 hostedtoolcache 完整路徑、raw command、URL、workspace path 或 secret。 diff --git a/scripts/ops/host-runaway-process-exporter.py b/scripts/ops/host-runaway-process-exporter.py index 1f6bc633..381efcab 100755 --- a/scripts/ops/host-runaway-process-exporter.py +++ b/scripts/ops/host-runaway-process-exporter.py @@ -83,7 +83,8 @@ DEFAULT_RULES = ( GITEA_ACTION_PROCESS_RE = re.compile( r"(/\.cache/act/|/home/wooo/\.cache/act/|\bdocker build\b|\bdocker-buildx\b|" - r"\bbuildx build\b|\bpnpm turbo build\b|\bturbo build\b|\bnext build\b)" + r"/opt/hostedtoolcache/|\bbuildx build\b|\bpnpm turbo build\b|\bturbo build\b|" + r"\bnext build\b|\bpnpm install\b|\bnpm ci\b|\byarn install\b)" ) HOST_PRESSURE_GATE_RE = re.compile(r"wait-host-web-build-pressure\.sh|awoooi-wait-host-web-build-pressure\.sh") diff --git a/scripts/ops/host-sustained-load-evidence.py b/scripts/ops/host-sustained-load-evidence.py index ef1417b7..42cb8dd1 100755 --- a/scripts/ops/host-sustained-load-evidence.py +++ b/scripts/ops/host-sustained-load-evidence.py @@ -175,7 +175,15 @@ def parse_ps_text(text: str) -> list[dict[str, Any]]: def classify_process_family(comm: str, args: str) -> str: text = f"{comm} {args}".lower() - if "act_runner" in text or "gitea-actions-task" in text or "/.cache/act/" in text: + if ( + "act_runner" in text + or "gitea-actions-task" in text + or "/.cache/act/" in text + or "/opt/hostedtoolcache/" in text + or "pnpm install" in text + or "npm ci" in text + or "yarn install" in text + ): return "gitea_actions_runner" if "docker build" in text or "buildx" in text or "buildkit" in text: return "docker_build" diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index b72d1675..9dfe1c7a 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -135,6 +135,7 @@ def test_counts_buildkit_runner_process_load() -> None: 102 101 100 100 239 2.0 S docker-buildx /home/wooo/.docker/cli-plugins/docker-buildx buildx build -f apps/web/Dockerfile . 200 150 200 200 210 12.5 S turbo turbo build --filter=@awoooi/web --concurrency=1 201 200 200 200 200 145.0 S node node /app/apps/web/node_modules/.bin/../next/dist/bin/next build + 250 150 250 250 30 269.0 R node node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm install --frozen-lockfile 300 1 300 300 9999 0.1 S act_runner act_runner daemon --config /config.yaml 400 1 400 400 120 30.0 S node node apps/web/server.js """ @@ -142,12 +143,41 @@ def test_counts_buildkit_runner_process_load() -> None: load = exporter.active_gitea_action_process_load(rows) - assert load.group_count == 2 - assert load.process_count == 5 - assert load.cpu_percent == 160.5 + assert load.group_count == 3 + assert load.process_count == 6 + assert load.cpu_percent == 429.5 assert load.oldest_age_seconds == 240 +def test_sustained_load_evidence_classifies_hostedtoolcache_pnpm_as_ci(tmp_path: Path) -> None: + ps_file = tmp_path / "ps.txt" + ps_file.write_text( + "250 150 250 30 269.0 0.7 node node /opt/hostedtoolcache/node/20.20.2/x64/bin/pnpm install --frozen-lockfile\n", + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(EVIDENCE_PATH), + "--host", + "110", + "--ps-file", + str(ps_file), + "--json", + ], + check=True, + capture_output=True, + text=True, + ) + + payload = json.loads(result.stdout) + assert payload["top_process_families"][0]["family"] == "gitea_actions_runner" + assert payload["top_processes_sanitized"][0]["family"] == "gitea_actions_runner" + assert "/opt/hostedtoolcache" not in result.stdout + assert "pnpm install" not in result.stdout + + def test_ignores_the_host_pressure_gate_process_group() -> None: exporter = load_exporter() rows = exporter.parse_ps_rows(