fix(agent): classify gitea sustained load pressure
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m10s
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m10s
CD Pipeline / build-and-deploy (push) Has been cancelled
CD Pipeline / post-deploy-checks (push) Has been cancelled
This commit is contained in:
@@ -25,6 +25,7 @@ from typing import Any
|
||||
|
||||
|
||||
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
|
||||
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
|
||||
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
|
||||
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\\\]|\\\\.)*)\"")
|
||||
METRIC_RE = re.compile(
|
||||
@@ -39,6 +40,7 @@ def parse_args() -> argparse.Namespace:
|
||||
)
|
||||
parser.add_argument("--host", default="110")
|
||||
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE)
|
||||
parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
|
||||
parser.add_argument("--load5-per-core-threshold", type=float, default=1.5)
|
||||
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
|
||||
parser.add_argument("--json", action="store_true", help="Print JSON only.")
|
||||
@@ -133,10 +135,30 @@ def _top_orphan_rule(samples: list[dict[str, Any]], *, host: str) -> dict[str, A
|
||||
return sorted(candidates, key=lambda item: (-item["cpu_percent"], item["rule"]))[0]
|
||||
|
||||
|
||||
def _top_container_cpu(samples: list[dict[str, Any]], *, host: str) -> dict[str, Any] | None:
|
||||
candidates = []
|
||||
for sample in samples:
|
||||
if sample["name"] != "docker_container_cpu_cores":
|
||||
continue
|
||||
labels = sample["labels"]
|
||||
if labels.get("host", host) != host:
|
||||
continue
|
||||
candidates.append(
|
||||
{
|
||||
"container_name": labels.get("container_name") or labels.get("name") or "unknown",
|
||||
"cpu_cores": round(float(sample["value"]), 6),
|
||||
}
|
||||
)
|
||||
if not candidates:
|
||||
return None
|
||||
return sorted(candidates, key=lambda item: (-item["cpu_cores"], item["container_name"]))[0]
|
||||
|
||||
|
||||
def build_packet(
|
||||
*,
|
||||
host: str,
|
||||
samples: list[dict[str, Any]],
|
||||
docker_samples: list[dict[str, Any]],
|
||||
load5_per_core_threshold: float,
|
||||
ci_stale_age_seconds: int,
|
||||
) -> dict[str, Any]:
|
||||
@@ -187,6 +209,9 @@ def build_packet(
|
||||
)
|
||||
)
|
||||
top_orphan = _top_orphan_rule(samples, host=host)
|
||||
top_container = _top_container_cpu(docker_samples, host=host)
|
||||
top_container_name = str((top_container or {}).get("container_name") or "").lower()
|
||||
top_container_cpu = float((top_container or {}).get("cpu_cores") or 0.0)
|
||||
|
||||
classification = "observing_load_within_threshold"
|
||||
severity = "info"
|
||||
@@ -244,6 +269,19 @@ def build_packet(
|
||||
if controlled_apply_allowed
|
||||
else "keep_pressure_gate_fail_closed_until_ci_load_clears"
|
||||
)
|
||||
elif (
|
||||
load5_per_core > load5_per_core_threshold
|
||||
and top_container_name == "gitea"
|
||||
and top_container_cpu >= 2.0
|
||||
):
|
||||
classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook"
|
||||
severity = "critical"
|
||||
dry_run_command = (
|
||||
"scripts/ops/host-sustained-load-evidence.py "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
|
||||
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
|
||||
classification = "blocked_memory_or_swap_pressure_requires_service_playbook"
|
||||
severity = "critical"
|
||||
@@ -278,6 +316,7 @@ def build_packet(
|
||||
"active_ci_process_cpu_percent": round(active_ci_cpu, 3),
|
||||
"active_ci_oldest_age_seconds": active_ci_oldest_age,
|
||||
"top_orphan_rule": top_orphan,
|
||||
"top_container_cpu": top_container,
|
||||
},
|
||||
"commands": {
|
||||
"dry_run": dry_run_command,
|
||||
@@ -316,9 +355,15 @@ def main() -> int:
|
||||
samples = parse_prometheus_text(text)
|
||||
except FileNotFoundError:
|
||||
samples = []
|
||||
try:
|
||||
docker_text = args.docker_stats_file.read_text(encoding="utf-8")
|
||||
docker_samples = parse_prometheus_text(docker_text)
|
||||
except FileNotFoundError:
|
||||
docker_samples = []
|
||||
packet = build_packet(
|
||||
host=args.host,
|
||||
samples=samples,
|
||||
docker_samples=docker_samples,
|
||||
load5_per_core_threshold=args.load5_per_core_threshold,
|
||||
ci_stale_age_seconds=args.ci_stale_age_seconds,
|
||||
)
|
||||
|
||||
@@ -374,6 +374,57 @@ def test_sustained_load_controller_blocks_monitor_authority_violation(tmp_path:
|
||||
assert payload["controlled_apply_allowed"] is False
|
||||
|
||||
|
||||
def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_path: Path) -> None:
    """Check that a gitea container dominating CPU under high load is routed to the backlog playbook.

    The host metrics report a load5-per-core of 2.5 with low swap usage and no
    active CI, while the docker stats report gitea at 3.4 cores. The controller
    is run as a subprocess and is expected to exit with code 75 and emit the
    gitea backlog classification.
    """
    host_metric_lines = [
        'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="110"} 2.5',
        'awoooi_host_swap_used_ratio{host="110"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
        'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
        'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
        'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
    ]
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")

    docker_metric_lines = [
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4',
        'docker_container_cpu_cores{host="110",container_name="redis"} 0.2',
    ]
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text("\n".join(docker_metric_lines), encoding="utf-8")

    # Run the controller exactly as an operator would, pointing it at the
    # fixture files and requesting JSON-only output.
    command = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "110",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    result = subprocess.run(command, capture_output=True, text=True)

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook"
    assert payload["readback"]["top_container_cpu"]["container_name"] == "gitea"
    assert payload["controlled_apply_allowed"] is False
    assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
|
||||
metrics_file = tmp_path / "host.prom"
|
||||
metrics_file.write_text(
|
||||
|
||||
Reference in New Issue
Block a user