From c095a5db4fe4253e7afaafb7b1656196c5328e49 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 1 Jul 2026 08:50:01 +0800 Subject: [PATCH] fix(agent): classify gitea sustained load pressure --- .../ai_agent_autonomous_runtime_control.py | 15 ++++++ ...est_ai_agent_autonomous_runtime_control.py | 3 +- docs/LOGBOOK.md | 4 +- ops/monitoring/alerts-unified.yml | 2 +- ops/monitoring/alerts.yml | 2 +- scripts/ops/host-sustained-load-controller.py | 45 ++++++++++++++++ .../test_host_runaway_process_exporter.py | 51 +++++++++++++++++++ 7 files changed, 117 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/ai_agent_autonomous_runtime_control.py b/apps/api/src/services/ai_agent_autonomous_runtime_control.py index afb6d674..493ddfd2 100644 --- a/apps/api/src/services/ai_agent_autonomous_runtime_control.py +++ b/apps/api/src/services/ai_agent_autonomous_runtime_control.py @@ -1515,6 +1515,21 @@ def _build_host_sustained_load_controlled_automation_readback() -> dict[str, Any "destructive_prune", ], }, + { + "class_id": "gitea_queue_or_hook_backlog", + "alertnames": ["HostLoadAverageSustainedHigh"], + "classifier": "host-sustained-load-controller.py:blocked_gitea_queue_or_hook_backlog_requires_playbook", + "controlled_action": "run sanitized evidence collector, then use Gitea queue/hook backlog playbook in check-mode", + "controlled_apply_allowed": False, + "post_apply_verifier": "host-sustained-load-evidence.py readback plus Gitea queue/load verifier", + "rollback": "source-specific rollback required before apply; no generic runner restore", + "forbidden_actions": [ + "gitea_container_restart_without_playbook", + "legacy_runner_restore", + "generic_runner_label_restore", + "warn_only_pressure_gate", + ], + }, { "class_id": "unknown_sustained_load", "alertnames": ["HostLoadAverageSustainedHigh"], diff --git a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py index 0dd4e761..fe035613 100644 --- a/apps/api/tests/test_ai_agent_autonomous_runtime_control.py +++ b/apps/api/tests/test_ai_agent_autonomous_runtime_control.py @@ -772,7 +772,7 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows(): assert host_load["current_work_item_id"] == ( "P1-D2-host-sustained-load-controlled-automation" ) - assert host_load["rollups"]["action_class_count"] == 4 + assert host_load["rollups"]["action_class_count"] == 5 assert host_load["rollups"]["controlled_apply_class_count"] == 2 assert host_load["rollups"]["required_asset_count"] == 6 assert host_load["rollups"]["ready_asset_count"] == 6 @@ -780,6 +780,7 @@ def test_runtime_receipt_readback_summarizes_live_executor_closure_rows(): "orphan_browser_smoke_runaway_process", "ci_runner_load_saturation", "memory_or_swap_pressure", + "gitea_queue_or_hook_backlog", "unknown_sustained_load", } == {item["class_id"] for item in host_load["action_classes"]} assert host_load["operation_boundaries"]["executes_on_read"] is False diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 08dbdfcd..703036d5 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,7 +1,7 @@ ## 2026-07-01 — 08:37 Host sustained-load AI controlled automation **照主線修正的問題**: -- 新增 `scripts/ops/host-sustained-load-controller.py`,把 `HostLoadAverageSustainedHigh` 從「SSH 看 top / 人工判斷」改成可機器讀取的 AI controlled packet:orphan browser / smoke load、合法 Gitea Actions / BuildKit saturation、memory / swap pressure、unknown sustained load 四類分流。 +- 新增 `scripts/ops/host-sustained-load-controller.py`,把 `HostLoadAverageSustainedHigh` 從「SSH 看 top / 人工判斷」改成可機器讀取的 AI controlled packet:orphan browser / smoke load、合法 Gitea Actions / BuildKit saturation、Gitea queue / hook backlog、memory / swap pressure、unknown sustained load 五類分流。 - 新增 `scripts/ops/host-sustained-load-evidence.py`,unknown sustained load 不再回到 raw SSH top,而是產生脫敏 process-family / container evidence 給 source-specific PlayBook、KM、RAG 與後續 controller decision 使用;不輸出 raw command line、workspace path、URL 或 secret value。 - `host-runaway-process-remediation.py` 的 apply gate 從 owner / maintenance-window 必填改成 controlled apply receipt 必填:`--controlled-apply-id`、`--evidence-ref`、`--post-apply-verifier`、`--confirm-apply`;owner / maintenance-window 只保留為可選 evidence。若目標 process group 已消失,回報 `already_exited` / `missing_process_group_count`,不再 traceback。 - `ops/monitoring/alerts.yml` 與 `ops/monitoring/alerts-unified.yml` 將 sustained load 的 `auto_repair_action` 指向 controller,runbook 改為 AI controlled packet / dry-run / controlled SIGTERM / verifier;清掉 orphan browser 與 remediation authorization 告警中的人工批准語意。 @@ -12,7 +12,7 @@ - `PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest scripts/ops/tests/test_host_runaway_process_exporter.py -q --tb=short -p no:cacheprovider` 通過(16 passed)。 - `DATABASE_URL=sqlite:///test.db PYTHONDONTWRITEBYTECODE=1 python3.11 -m pytest apps/api/tests/test_ai_agent_autonomous_runtime_control.py -q --tb=short -p no:cacheprovider` 通過(10 passed)。 - `git diff --check` 通過。 -- 110 live readback:從 `http://192.168.0.110:9100/metrics` 只篩 `awoooi_host_*` 指標到 `/tmp/awoooi-host110-load.prom`;controller 回 `classification=blocked_unknown_sustained_load_requires_source_specific_playbook`、`load5_per_core=1.56`、`monitor_up=1`、active CI `0`、orphan rule `null`,並給出 `host-sustained-load-evidence.py --json` 只讀脫敏證據指令;本次未執行 host write / signal / restart。 +- 110 live readback:從 `http://192.168.0.110:9100/metrics` 只篩 `awoooi_host_*` / Docker CPU 指標;controller / evidence readback 顯示 `load5_per_core=2.563333`、active CI `0`、orphan group `0`、`gitea=3.4019` CPU cores,classification / recommendation 指向 `gitea_queue_or_hook_backlog`;本次未執行 host write / signal / restart。 **邊界**:本段只做 source / test / Prometheus rule / API readback 實作;未對 live host 送 SIGTERM;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未 workflow_dispatch;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未恢復 legacy / generic runner label。 diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 55edfcbc..93f5089f 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -101,7 +101,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 991522d3..97dad7fe 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -69,7 +69,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --json'" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory diff --git a/scripts/ops/host-sustained-load-controller.py b/scripts/ops/host-sustained-load-controller.py index af6a1568..33428fab 100755 --- a/scripts/ops/host-sustained-load-controller.py +++ b/scripts/ops/host-sustained-load-controller.py @@ -25,6 +25,7 @@ from typing import Any DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom") +DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom") SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1" LABEL_RE = re.compile(r"(?P[A-Za-z_][A-Za-z0-9_]*)=\"(?P(?:[^\"\\\\]|\\\\.)*)\"") METRIC_RE = re.compile( @@ -39,6 +40,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--host", default="110") parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE) + parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE) parser.add_argument("--load5-per-core-threshold", type=float, default=1.5) parser.add_argument("--ci-stale-age-seconds", type=int, default=1800) parser.add_argument("--json", action="store_true", help="Print JSON only.") @@ -133,10 +135,30 @@ def _top_orphan_rule(samples: list[dict[str, Any]], *, host: str) -> dict[str, A return sorted(candidates, key=lambda item: (-item["cpu_percent"], item["rule"]))[0] +def _top_container_cpu(samples: list[dict[str, Any]], *, host: str) -> dict[str, Any] | None: + candidates = [] + for sample in samples: + if sample["name"] != "docker_container_cpu_cores": + continue + labels = sample["labels"] + if labels.get("host", host) != host: + continue + candidates.append( + { + "container_name": labels.get("container_name") or labels.get("name") or "unknown", + "cpu_cores": round(float(sample["value"]), 6), + } + ) + if not candidates: + return None + return sorted(candidates, key=lambda item: (-item["cpu_cores"], item["container_name"]))[0] + + def build_packet( *, host: str, samples: list[dict[str, Any]], + docker_samples: list[dict[str, Any]], load5_per_core_threshold: float, ci_stale_age_seconds: int, ) -> dict[str, Any]: @@ -187,6 +209,9 @@ def build_packet( ) ) top_orphan = _top_orphan_rule(samples, host=host) + top_container = _top_container_cpu(docker_samples, host=host) + top_container_name = str((top_container or {}).get("container_name") or "").lower() + top_container_cpu = float((top_container or {}).get("cpu_cores") or 0.0) classification = "observing_load_within_threshold" severity = "info" @@ -244,6 +269,19 @@ def build_packet( if controlled_apply_allowed else "keep_pressure_gate_fail_closed_until_ci_load_clears" ) + elif ( + load5_per_core > load5_per_core_threshold + and top_container_name == "gitea" + and top_container_cpu >= 2.0 + ): + classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook" + severity = "critical" + dry_run_command = ( + "scripts/ops/host-sustained-load-evidence.py " + f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " + f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + ) + next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode" elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85: classification = "blocked_memory_or_swap_pressure_requires_service_playbook" severity = "critical" @@ -278,6 +316,7 @@ def build_packet( "active_ci_process_cpu_percent": round(active_ci_cpu, 3), "active_ci_oldest_age_seconds": active_ci_oldest_age, "top_orphan_rule": top_orphan, + "top_container_cpu": top_container, }, "commands": { "dry_run": dry_run_command, @@ -316,9 +355,15 @@ def main() -> int: samples = parse_prometheus_text(text) except FileNotFoundError: samples = [] + try: + docker_text = args.docker_stats_file.read_text(encoding="utf-8") + docker_samples = parse_prometheus_text(docker_text) + except FileNotFoundError: + docker_samples = [] packet = build_packet( host=args.host, samples=samples, + docker_samples=docker_samples, load5_per_core_threshold=args.load5_per_core_threshold, ci_stale_age_seconds=args.ci_stale_age_seconds, ) diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index a97da0ea..d1d8259d 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -374,6 +374,57 @@ def test_sustained_load_controller_blocks_monitor_authority_violation(tmp_path: assert payload["controlled_apply_allowed"] is False +def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="110"} 2.5', + 'awoooi_host_swap_used_ratio{host="110"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="110"} 0', + 'awoooi_host_gitea_actions_active_container_count{host="110"} 0', + 'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0', + 'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0', + ] + ), + encoding="utf-8", + ) + docker_file = tmp_path / "docker.prom" + docker_file.write_text( + "\n".join( + [ + 'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4', + 'docker_container_cpu_cores{host="110",container_name="redis"} 0.2', + ] + ), + encoding="utf-8", + ) + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "110", + "--metrics-file", + str(metrics_file), + "--docker-stats-file", + str(docker_file), + "--json", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 75 + payload = json.loads(result.stdout) + assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook" + assert payload["readback"]["top_container_cpu"]["container_name"] == "gitea" + assert payload["controlled_apply_allowed"] is False + assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"] + + def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None: metrics_file = tmp_path / "host.prom" metrics_file.write_text(