From 3f656a5246ec5e49a6a682d914da236e456a9dcf Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 2 Jul 2026 13:06:53 +0800 Subject: [PATCH] fix(ops): align host pressure evidence routing --- docs/LOGBOOK.md | 21 ++++++++ .../HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md | 2 + scripts/ops/host-sustained-load-evidence.py | 32 ++++++++----- .../test_host_runaway_process_exporter.py | 48 +++++++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 2c2c22b0..3ad0ba0c 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -15,6 +15,27 @@ **仍維持**: - 未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未送 Telegram;未觸發 workflow;未對 production DB 寫入。 +## 2026-07-02 — 13:05 110 CPU evidence / controller 分流一致性修正 + +**完成內容**: +- `scripts/ops/host-sustained-load-evidence.py` 的 recommendation 優先序已與 `host-sustained-load-controller.py` 對齊:若 StockPlatform Postgres/API container 或 `postgres` process-family 達 early threshold,先回 `postgres_hot_query_or_backup_export_playbook`;再判斷 Gitea。 +- 新增 regression:當 `gitea` container 略高但 `stockplatform-v2-postgres-1` 同時熱、且 `postgres` family 超過門檻時,evidence 必須回 Stock/Postgres playbook,不再讓 operator 看到 controller 說 Stock、evidence 說 Gitea 的不一致狀態。 +- 已 live 部署到 110:`/home/wooo/scripts/host-sustained-load-evidence.py` SHA `7cb8343ea407a86fd67d8eea42507ca1a752a0561495289781e611cd01f7e7d2`。 + +**live readback 證據**: +- evidence:`recommendation=postgres_hot_query_or_backup_export_playbook`,top containers 為 `stockplatform-v2-postgres-1=1.6271 cores`、`gitea=1.6196 cores`;top families 為 `postgres=155.5`、`systemd_control_plane=72.4`、`gitea_service=53.1`。 +- controller:`classification=blocked_stockplatform_hot_query_or_api_pressure_requires_playbook`,`dry_run=/home/wooo/scripts/host-sustained-load-evidence.py ... --json`,`stock_process_cpu_percent=155.5`、`gitea_process_cpu_percent=53.1`、`load5_per_core=0.5825`。 +- redaction 仍保持:不輸出 raw command line、workspace path、URL、secret。 + +**驗證**: +- `python3.11 -m pytest scripts/ops/tests/test_host_runaway_process_exporter.py scripts/ops/tests/test_gitea_queue_hook_backlog_playbook.py scripts/ops/tests/test_host_pressure_alert_contract.py ops/runner/test_cd_controlled_runtime_profile.py -q`:`76 passed`。 +- `python3.11 -m py_compile scripts/ops/host-sustained-load-evidence.py scripts/ops/tests/test_host_runaway_process_exporter.py`:通過。 +- `python3.11 ops/runner/guard-gitea-runner-pressure.py --root .`:通過,`auto_branch_events_on_110=0`、`generic_runner_labels=0`。 +- `git diff --check`:通過。 + +**下一步**: +- commit / push Gitea `main`,讀回 CD;CD 成功後接續 Stock/Postgres hot query / backup export playbook,而不是再回到 Gitea 泛查。 + ## 2026-07-02 — 12:50 110 Gitea CPU 壓力 check-mode playbook 實作 **完成內容**: diff --git a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md index 27998faa..51969267 100644 --- a/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md +++ b/docs/runbooks/HOST-RUNAWAY-PROCESS-AIOPS-PLAYBOOK.md @@ -108,6 +108,8 @@ python3 scripts/ops/gitea-queue-hook-backlog-playbook.py \ Gitea playbook 只能讀公開 `/api/healthz`、`/api/v1/version`、`/metrics` 與 sanitized host textfile。必須輸出 active Actions、Gitea health/version、`gitea_hooktasks`、Gitea container CPU、process-family CPU、Docker stats freshness 與 operation boundaries;不得讀 app.ini、secret、`.runner`、raw session、DB,也不得 restart / reload / kill。若 health 正常、active Actions 為 0、Gitea container 超過 `1.0 core` 且 `gitea_hooktasks >= 1000`,分類為 `blocked_gitea_hooktask_backlog_check_required`,再進 public queue / hook backlog 判讀。 +若 Gitea 與 StockPlatform 同時超過 early triage 門檻,controller / evidence 必須保持相同排序:StockPlatform Postgres/API container 或 `postgres` process-family 達門檻時,先回 `postgres_hot_query_or_backup_export_playbook`;只有 Stock/Postgres 未命中時才回 Gitea playbook。這是為了避免同一個 110 CPU 事件在不同 verifier 間出現相互矛盾的下一步。 + 如果只看到 `HostLoadAverageSustainedHigh`,且 orphan / active CI / swap 都無明確命中,AI 必須先跑只讀脫敏 evidence collector: ```bash diff --git a/scripts/ops/host-sustained-load-evidence.py b/scripts/ops/host-sustained-load-evidence.py index b5d4e612..060d0aa0 100755 --- a/scripts/ops/host-sustained-load-evidence.py +++ b/scripts/ops/host-sustained-load-evidence.py @@ -276,26 +276,34 @@ def top_docker_containers(samples: list[dict[str, Any]], *, host: str, top_n: in def recommend_playbook(process_families: list[dict[str, Any]], containers: list[dict[str, Any]]) -> str: - top_container = containers[0] if containers else {} - top_container_name = str(top_container.get("container_name") or "").lower() - top_container_cpu = float(top_container.get("cpu_cores") or 0.0) top_family = process_families[0] if process_families else {} family = str(top_family.get("family") or "") family_cpu = { str(item.get("family") or ""): float(item.get("cpu_percent") or 0.0) for item in process_families } + container_cpu = { + str(item.get("container_name") or "").lower(): float(item.get("cpu_cores") or 0.0) + for item in containers + } + stock_container_cpu = max( + [ + cpu + for name, cpu in container_cpu.items() + if "stockplatform-v2-postgres-1" in name + or name == "stockplatform-v2-api-1" + or ("postgres" in name and "gitea" not in name) + ] + or [0.0] + ) + gitea_container_cpu = max( + [cpu for name, cpu in container_cpu.items() if "gitea" in name] or [0.0] + ) - if "gitea" in top_container_name and top_container_cpu >= 1.0: - return "gitea_queue_or_hook_backlog_playbook" - if ( - ( - "postgres" in top_container_name - or "stockplatform-v2-postgres-1" in top_container_name - ) - and top_container_cpu >= 1.0 - ) or family_cpu.get("postgres", 0.0) >= 50.0: + if stock_container_cpu >= 1.0 or family_cpu.get("postgres", 0.0) >= 50.0: return "postgres_hot_query_or_backup_export_playbook" + if gitea_container_cpu >= 1.0: + return "gitea_queue_or_hook_backlog_playbook" if family_cpu.get("gitea_service", 0.0) >= 50.0: return "gitea_queue_or_hook_backlog_playbook" if family in {"docker_build", "web_build", "gitea_actions_runner"}: diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index 94f986f6..598e202a 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -961,6 +961,54 @@ def test_sustained_load_evidence_prioritizes_hot_gitea_container_over_control_pl assert "/home/wooo/gitea/app.ini" not in result.stdout +def test_sustained_load_evidence_aligns_stock_postgres_priority_with_controller( + tmp_path: Path, +) -> None: + ps_file = tmp_path / "ps.txt" + ps_file.write_text( + "\n".join( + [ + "100 1 100 78516 79.8 0.5 postgres postgres: stockplatform SELECT", + "200 1 200 78513 53.1 1.3 gitea /usr/local/bin/gitea web --config /home/wooo/gitea/app.ini", + ] + ), + encoding="utf-8", + ) + docker_file = tmp_path / "docker.prom" + docker_file.write_text( + "\n".join( + [ + 'docker_container_cpu_cores{host="110",container_name="gitea"} 1.6725', + 'docker_container_cpu_cores{host="110",container_name="stockplatform-v2-postgres-1"} 1.6317', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(EVIDENCE_PATH), + "--host", + "110", + "--ps-file", + str(ps_file), + "--docker-stats-file", + str(docker_file), + "--json", + ], + check=True, + capture_output=True, + text=True, + ) + + payload = json.loads(result.stdout) + assert payload["recommendation"] == "postgres_hot_query_or_backup_export_playbook" + assert payload["top_containers"][0]["container_name"] == "gitea" + assert payload["top_process_families"][0]["family"] == "postgres" + assert "/home/wooo/gitea/app.ini" not in result.stdout + + def test_sustained_load_evidence_keeps_stale_container_samples_untrusted(tmp_path: Path) -> None: metrics_file = tmp_path / "host.prom" metrics_file.write_text(