fix(ops): prioritize live gitea pressure routing
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / build-and-deploy (push) Successful in 4m34s
CD Pipeline / post-deploy-checks (push) Successful in 6m0s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / build-and-deploy (push) Successful in 4m34s
CD Pipeline / post-deploy-checks (push) Successful in 6m0s
This commit is contained in:
@@ -537,15 +537,6 @@ def build_packet(
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
|
||||
elif control_plane_cpu >= process_family_cpu_threshold:
|
||||
classification = "blocked_control_plane_saturation_requires_playbook"
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "run_control_plane_saturation_playbook_check_mode"
|
||||
elif (
|
||||
"stockplatform-v2-postgres-1" in top_container_name
|
||||
and top_container_cpu >= hot_container_cpu_threshold
|
||||
@@ -572,6 +563,15 @@ def build_packet(
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
|
||||
elif control_plane_cpu >= process_family_cpu_threshold:
|
||||
classification = "blocked_control_plane_saturation_requires_playbook"
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "run_control_plane_saturation_playbook_check_mode"
|
||||
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
|
||||
classification = "blocked_memory_or_swap_pressure_requires_service_playbook"
|
||||
severity = "critical"
|
||||
|
||||
@@ -281,11 +281,23 @@ def recommend_playbook(process_families: list[dict[str, Any]], containers: list[
|
||||
top_container_cpu = float(top_container.get("cpu_cores") or 0.0)
|
||||
top_family = process_families[0] if process_families else {}
|
||||
family = str(top_family.get("family") or "")
|
||||
family_cpu = {
|
||||
str(item.get("family") or ""): float(item.get("cpu_percent") or 0.0)
|
||||
for item in process_families
|
||||
}
|
||||
|
||||
if "gitea" in top_container_name and top_container_cpu >= 2.0:
|
||||
if "gitea" in top_container_name and top_container_cpu >= 1.0:
|
||||
return "gitea_queue_or_hook_backlog_playbook"
|
||||
if "postgres" in top_container_name or "postgres" in family:
|
||||
if (
|
||||
(
|
||||
"postgres" in top_container_name
|
||||
or "stockplatform-v2-postgres-1" in top_container_name
|
||||
)
|
||||
and top_container_cpu >= 1.0
|
||||
) or family_cpu.get("postgres", 0.0) >= 50.0:
|
||||
return "postgres_hot_query_or_backup_export_playbook"
|
||||
if family_cpu.get("gitea_service", 0.0) >= 50.0:
|
||||
return "gitea_queue_or_hook_backlog_playbook"
|
||||
if family in {"docker_build", "web_build", "gitea_actions_runner"}:
|
||||
return "build_or_runner_pressure_playbook"
|
||||
if family in {"systemd_control_plane", "ssh_control_plane"}:
|
||||
|
||||
@@ -526,6 +526,80 @@ def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is
|
||||
assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_prioritizes_hot_gitea_container_over_control_plane_average(
    tmp_path: Path,
) -> None:
    """Controller routes to the gitea playbook when the gitea container is hot.

    Fixture shape:
      * host load5-per-core (0.70) sits below the 0.75 threshold passed on
        the command line;
      * the ``gitea`` container reports 1.59 cores, which is above the
        ``--hot-container-cpu-threshold`` of 1.0 but below the
        ``--container-cpu-threshold`` of 2.0;
      * the ps snapshot lists systemd, dbus-daemon and gitea rows; the
        asserted control-plane readback of 68.5 equals 61.8 + 6.7, i.e.
        presumably the CPU column of the systemd and dbus-daemon rows.

    The controller is expected to exit with status 75, classify the
    situation as a gitea queue/hook backlog, and keep the gitea config
    path from the ps command line out of its stdout.
    """
    # Host-level Prometheus text samples.
    host_metric_lines = [
        'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="110"} 0.70',
        'awoooi_host_swap_used_ratio{host="110"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
        'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
        'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
        'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
    ]
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")

    # Per-container CPU samples: gitea is the top container.
    container_metric_lines = [
        'docker_container_cpu_cores{host="110",container_name="gitea"} 1.59',
        'docker_container_cpu_cores{host="110",container_name="redis"} 0.2',
    ]
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text("\n".join(container_metric_lines), encoding="utf-8")

    # Process snapshot; the gitea row carries a config path that must not
    # appear in the controller output.
    process_rows = [
        "100 1 100 75507 61.8 0.0 systemd /sbin/init",
        "101 1 101 75469 6.7 0.0 dbus-daemon @dbus-daemon --system",
        "200 1 200 75348 53.1 1.3 gitea /usr/local/bin/gitea web --config /home/wooo/gitea/app.ini",
    ]
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text("\n".join(process_rows), encoding="utf-8")

    controller_command = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "110",
        "--load5-per-core-threshold",
        "0.75",
        "--hot-container-cpu-threshold",
        "1.0",
        "--container-cpu-threshold",
        "2.0",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--ps-file",
        str(ps_file),
        "--json",
    ]
    # No check=True here: a non-zero exit status is the expected outcome.
    result = subprocess.run(controller_command, capture_output=True, text=True)

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook"
    assert payload["next_action"] == "run_gitea_queue_or_hook_backlog_playbook_check_mode"
    assert payload["readback"]["control_plane_process_cpu_percent"] == 68.5
    assert payload["readback"]["top_container_cpu"]["container_name"] == "gitea"
    assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
    assert "/home/wooo/gitea/app.ini" not in result.stdout
|
||||
|
||||
|
||||
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
|
||||
metrics_file = tmp_path / "host.prom"
|
||||
metrics_file.write_text(
|
||||
@@ -842,6 +916,50 @@ def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path:
|
||||
assert "/home/wooo" not in result.stdout
|
||||
|
||||
|
||||
def test_sustained_load_evidence_prioritizes_hot_gitea_container_over_control_plane_average(
    tmp_path: Path,
) -> None:
    """Evidence script recommends the gitea playbook when the gitea container is hot.

    The ps snapshot is arranged so that the first reported process family
    is ``systemd_control_plane`` (asserted below), while the only docker
    sample is a ``gitea`` container at 1.4591 cores. The recommendation
    must still be the gitea queue/hook backlog playbook, and the gitea
    config path from the ps command line must not appear in stdout.
    """
    # Process snapshot: systemd and dbus-daemon rows plus a gitea row.
    process_rows = [
        "100 1 100 75507 61.8 0.0 systemd /sbin/init",
        "101 1 101 75469 6.7 0.0 dbus-daemon @dbus-daemon --system",
        "200 1 200 75348 53.1 1.3 gitea /usr/local/bin/gitea web --config /home/wooo/gitea/app.ini",
    ]
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text("\n".join(process_rows), encoding="utf-8")

    # Single container sample, newline-terminated.
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(
        'docker_container_cpu_cores{host="110",container_name="gitea"} 1.4591\n',
        encoding="utf-8",
    )

    evidence_command = [
        sys.executable,
        str(EVIDENCE_PATH),
        "--host",
        "110",
        "--ps-file",
        str(ps_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    # check=True: the evidence script is expected to exit successfully.
    result = subprocess.run(
        evidence_command,
        check=True,
        capture_output=True,
        text=True,
    )

    payload = json.loads(result.stdout)
    assert payload["recommendation"] == "gitea_queue_or_hook_backlog_playbook"
    assert payload["top_process_families"][0]["family"] == "systemd_control_plane"
    assert payload["top_containers"][0]["container_name"] == "gitea"
    assert "/home/wooo/gitea/app.ini" not in result.stdout
|
||||
|
||||
|
||||
def test_sustained_load_evidence_keeps_stale_container_samples_untrusted(tmp_path: Path) -> None:
|
||||
metrics_file = tmp_path / "host.prom"
|
||||
metrics_file.write_text(
|
||||
|
||||
Reference in New Issue
Block a user