diff --git a/scripts/ops/host-sustained-load-controller.py b/scripts/ops/host-sustained-load-controller.py index 6d3045e64..1238da4d1 100755 --- a/scripts/ops/host-sustained-load-controller.py +++ b/scripts/ops/host-sustained-load-controller.py @@ -379,6 +379,8 @@ def build_packet( docker_samples: list[dict[str, Any]], docker_stats_status: dict[str, Any], process_summary: dict[str, Any], + metrics_file: Path, + docker_stats_file: Path, load5_per_core_threshold: float, container_cpu_threshold: float, hot_container_cpu_threshold: float, @@ -454,7 +456,8 @@ def build_packet( remediation_script = script_dir / "host-runaway-process-remediation.py" verifier_command = ( f"{controller_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}" + f"--host {host} --metrics-file {metrics_file}" + f" --docker-stats-file {docker_stats_file}" ) if monitor_up != 1: @@ -478,7 +481,8 @@ def build_packet( "--evidence-ref ${EVIDENCE_REF} " "--post-apply-verifier " f"'{controller_script} --host " - f"{host} --metrics-file {DEFAULT_METRICS_FILE}' " + f"{host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file}' " "--wait-seconds 10" ) next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm" @@ -518,8 +522,8 @@ def build_packet( ) dry_run_command = ( f"{evidence_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode" elif ( @@ -534,8 +538,8 @@ def build_packet( ) dry_run_command = ( f"{gitea_playbook_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode" elif ( @@ -549,8 +553,8 @@ def build_packet( severity = "critical" if load5_per_core > load5_per_core_threshold else "warning" dry_run_command = ( f"{evidence_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode" elif ( @@ -560,8 +564,8 @@ def build_packet( severity = "critical" if load5_per_core > load5_per_core_threshold else "warning" dry_run_command = ( f"{gitea_playbook_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode" elif control_plane_cpu >= process_family_cpu_threshold: @@ -569,8 +573,8 @@ def build_packet( severity = "critical" if load5_per_core > load5_per_core_threshold else "warning" dry_run_command = ( f"{evidence_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "run_control_plane_saturation_playbook_check_mode" elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85: @@ -582,8 +586,8 @@ def build_packet( severity = "critical" dry_run_command = ( f"{evidence_script} " - f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} " - f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json" + f"--host {host} --metrics-file {metrics_file} " + f"--docker-stats-file {docker_stats_file} --json" ) next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook" @@ -602,6 +606,8 @@ def build_packet( "container_cpu_threshold": container_cpu_threshold, "hot_container_cpu_threshold": hot_container_cpu_threshold, "process_family_cpu_threshold": process_family_cpu_threshold, + "metrics_file": str(metrics_file), + "docker_stats_file": str(docker_stats_file), "script_dir": str(script_dir), "swap_used_ratio": round(swap_used_ratio, 6), "remediation_authorized": remediation_authorized, @@ -670,6 +676,8 @@ def main() -> int: max_age_seconds=args.docker_stats_max_age_seconds, ), process_summary=_summarize_processes(_parse_ps_text(_collect_ps_text(args.ps_file)), top_n=args.top_n), + metrics_file=args.metrics_file, + docker_stats_file=args.docker_stats_file, load5_per_core_threshold=args.load5_per_core_threshold, container_cpu_threshold=args.container_cpu_threshold, hot_container_cpu_threshold=args.hot_container_cpu_threshold, diff --git a/scripts/ops/tests/test_host_runaway_process_exporter.py b/scripts/ops/tests/test_host_runaway_process_exporter.py index 598e202a6..7c147e1ca 100644 --- a/scripts/ops/tests/test_host_runaway_process_exporter.py +++ b/scripts/ops/tests/test_host_runaway_process_exporter.py @@ -875,6 +875,60 @@ def test_sustained_load_controller_routes_control_plane_family_pressure(tmp_path assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"] +def test_sustained_load_controller_honors_custom_readback_paths(tmp_path: Path) -> None: + metrics_file = tmp_path / "host.prom" + metrics_file.write_text( + "\n".join( + [ + 'awoooi_host_runaway_process_monitor_up{host="188",mode="read_only"} 1', + 'awoooi_host_load5_per_core{host="188"} 0.40', + 'awoooi_host_swap_used_ratio{host="188"} 0.1', + 'awoooi_host_runaway_process_remediation_authorized{host="188"} 0', + ] + ), + encoding="utf-8", + ) + docker_file = tmp_path / "docker.prom" + docker_file.write_text( + 'docker_container_cpu_cores{host="188",container_name="momo-scheduler"} 0.70\n', + encoding="utf-8", + ) + ps_file = tmp_path / "ps.txt" + ps_file.write_text("100 1 100 3600 5.0 0.0 python python worker.py\n", encoding="utf-8") + script_dir = tmp_path / "deployed-scripts" + + result = subprocess.run( + [ + sys.executable, + str(CONTROLLER_PATH), + "--host", + "188", + "--metrics-file", + str(metrics_file), + "--docker-stats-file", + str(docker_file), + "--script-dir", + str(script_dir), + "--ps-file", + str(ps_file), + "--json", + ], + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + payload = json.loads(result.stdout) + assert payload["classification"] == "observing_load_within_threshold" + assert payload["readback"]["metrics_file"] == str(metrics_file) + assert payload["readback"]["docker_stats_file"] == str(docker_file) + assert payload["readback"]["script_dir"] == str(script_dir) + assert str(metrics_file) in payload["commands"]["post_apply_verifier"] + assert str(docker_file) in payload["commands"]["post_apply_verifier"] + assert str(script_dir / "host-sustained-load-controller.py") in payload["commands"]["post_apply_verifier"] + assert "/home/wooo/node_exporter_textfiles" not in payload["commands"]["post_apply_verifier"] + + def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None: ps_file = tmp_path / "ps.txt" ps_file.write_text(