fix(ops): honor host load controller readback paths
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled
This commit is contained in:
@@ -379,6 +379,8 @@ def build_packet(
|
||||
docker_samples: list[dict[str, Any]],
|
||||
docker_stats_status: dict[str, Any],
|
||||
process_summary: dict[str, Any],
|
||||
metrics_file: Path,
|
||||
docker_stats_file: Path,
|
||||
load5_per_core_threshold: float,
|
||||
container_cpu_threshold: float,
|
||||
hot_container_cpu_threshold: float,
|
||||
@@ -454,7 +456,8 @@ def build_packet(
|
||||
remediation_script = script_dir / "host-runaway-process-remediation.py"
|
||||
verifier_command = (
|
||||
f"{controller_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
|
||||
+f"--host {host} --metrics-file {metrics_file}"
|
||||
+f" --docker-stats-file {docker_stats_file}"
|
||||
)
|
||||
|
||||
if monitor_up != 1:
|
||||
@@ -478,7 +481,8 @@ def build_packet(
|
||||
"--evidence-ref ${EVIDENCE_REF} "
|
||||
"--post-apply-verifier "
|
||||
f"'{controller_script} --host "
|
||||
-f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
|
||||
+f"{host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file}' "
|
||||
"--wait-seconds 10"
|
||||
)
|
||||
next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm"
|
||||
@@ -518,8 +522,8 @@ def build_packet(
|
||||
)
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode"
|
||||
elif (
|
||||
@@ -534,8 +538,8 @@ def build_packet(
|
||||
)
|
||||
dry_run_command = (
|
||||
f"{gitea_playbook_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
|
||||
elif (
|
||||
@@ -549,8 +553,8 @@ def build_packet(
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode"
|
||||
elif (
|
||||
@@ -560,8 +564,8 @@ def build_packet(
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
f"{gitea_playbook_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
|
||||
elif control_plane_cpu >= process_family_cpu_threshold:
|
||||
@@ -569,8 +573,8 @@ def build_packet(
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "run_control_plane_saturation_playbook_check_mode"
|
||||
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
|
||||
@@ -582,8 +586,8 @@ def build_packet(
|
||||
severity = "critical"
|
||||
dry_run_command = (
|
||||
f"{evidence_script} "
|
||||
-f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
+f"--host {host} --metrics-file {metrics_file} "
|
||||
+f"--docker-stats-file {docker_stats_file} --json"
|
||||
)
|
||||
next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
|
||||
|
||||
@@ -602,6 +606,8 @@ def build_packet(
|
||||
"container_cpu_threshold": container_cpu_threshold,
|
||||
"hot_container_cpu_threshold": hot_container_cpu_threshold,
|
||||
"process_family_cpu_threshold": process_family_cpu_threshold,
|
||||
"metrics_file": str(metrics_file),
|
||||
"docker_stats_file": str(docker_stats_file),
|
||||
"script_dir": str(script_dir),
|
||||
"swap_used_ratio": round(swap_used_ratio, 6),
|
||||
"remediation_authorized": remediation_authorized,
|
||||
@@ -670,6 +676,8 @@ def main() -> int:
|
||||
max_age_seconds=args.docker_stats_max_age_seconds,
|
||||
),
|
||||
process_summary=_summarize_processes(_parse_ps_text(_collect_ps_text(args.ps_file)), top_n=args.top_n),
|
||||
metrics_file=args.metrics_file,
|
||||
docker_stats_file=args.docker_stats_file,
|
||||
load5_per_core_threshold=args.load5_per_core_threshold,
|
||||
container_cpu_threshold=args.container_cpu_threshold,
|
||||
hot_container_cpu_threshold=args.hot_container_cpu_threshold,
|
||||
|
||||
@@ -875,6 +875,60 @@ def test_sustained_load_controller_routes_control_plane_family_pressure(tmp_path
|
||||
assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_honors_custom_readback_paths(tmp_path: Path) -> None:
    """Check that caller-supplied readback paths reach the emitted packet.

    The controller script is run as a subprocess against fixture files
    created under ``tmp_path``. The JSON it prints must echo the custom
    metrics file, docker stats file and script directory in ``readback``,
    and the ``post_apply_verifier`` command must be built from those same
    paths.
    """
    # Host-level metrics fixture; joined without a trailing newline.
    host_metric_lines = [
        'awoooi_host_runaway_process_monitor_up{host="188",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="188"} 0.40',
        'awoooi_host_swap_used_ratio{host="188"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="188"} 0',
    ]
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")

    # Single container sample for the docker stats readback.
    docker_sample = 'docker_container_cpu_cores{host="188",container_name="momo-scheduler"} 0.70\n'
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(docker_sample, encoding="utf-8")

    # One-process ps snapshot.
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text("100 1 100 3600 5.0 0.0 python python worker.py\n", encoding="utf-8")

    # The directory is only referenced by path; it is never created here.
    script_dir = tmp_path / "deployed-scripts"

    argv = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "188",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--script-dir",
        str(script_dir),
        "--ps-file",
        str(ps_file),
        "--json",
    ]
    result = subprocess.run(argv, capture_output=True, text=True)

    assert result.returncode == 0
    payload = json.loads(result.stdout)
    assert payload["classification"] == "observing_load_within_threshold"

    # The readback section must report exactly the paths passed on the CLI.
    readback = payload["readback"]
    assert readback["metrics_file"] == str(metrics_file)
    assert readback["docker_stats_file"] == str(docker_file)
    assert readback["script_dir"] == str(script_dir)

    # The verifier command must be assembled from the custom paths.
    verifier = payload["commands"]["post_apply_verifier"]
    assert str(metrics_file) in verifier
    assert str(docker_file) in verifier
    assert str(script_dir / "host-sustained-load-controller.py") in verifier
    # NOTE(review): presumably this is the directory of the default metrics
    # file; confirm against DEFAULT_METRICS_FILE in the controller.
    assert "/home/wooo/node_exporter_textfiles" not in verifier
|
||||
|
||||
|
||||
def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
|
||||
ps_file = tmp_path / "ps.txt"
|
||||
ps_file.write_text(
|
||||
|
||||
Reference in New Issue
Block a user