fix(ops): honor host load controller readback paths
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
Your Name
2026-07-03 01:17:01 +08:00
parent 8caafdd06a
commit 2a83ec01b8
2 changed files with 76 additions and 14 deletions

View File

@@ -379,6 +379,8 @@ def build_packet(
docker_samples: list[dict[str, Any]],
docker_stats_status: dict[str, Any],
process_summary: dict[str, Any],
metrics_file: Path,
docker_stats_file: Path,
load5_per_core_threshold: float,
container_cpu_threshold: float,
hot_container_cpu_threshold: float,
@@ -454,7 +456,8 @@ def build_packet(
remediation_script = script_dir / "host-runaway-process-remediation.py"
verifier_command = (
f"{controller_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
f"--host {host} --metrics-file {metrics_file}"
f" --docker-stats-file {docker_stats_file}"
)
if monitor_up != 1:
@@ -478,7 +481,8 @@ def build_packet(
"--evidence-ref ${EVIDENCE_REF} "
"--post-apply-verifier "
f"'{controller_script} --host "
f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
f"{host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file}' "
"--wait-seconds 10"
)
next_action = "run_orphan_browser_remediation_dry_run_then_controlled_sigterm"
@@ -518,8 +522,8 @@ def build_packet(
)
dry_run_command = (
f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode"
elif (
@@ -534,8 +538,8 @@ def build_packet(
)
dry_run_command = (
f"{gitea_playbook_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
elif (
@@ -549,8 +553,8 @@ def build_packet(
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
dry_run_command = (
f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "run_stockplatform_hot_query_or_api_pressure_playbook_check_mode"
elif (
@@ -560,8 +564,8 @@ def build_packet(
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
dry_run_command = (
f"{gitea_playbook_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "run_gitea_queue_or_hook_backlog_playbook_check_mode"
elif control_plane_cpu >= process_family_cpu_threshold:
@@ -569,8 +573,8 @@ def build_packet(
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
dry_run_command = (
f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "run_control_plane_saturation_playbook_check_mode"
elif load5_per_core > load5_per_core_threshold and swap_used_ratio >= 0.85:
@@ -582,8 +586,8 @@ def build_packet(
severity = "critical"
dry_run_command = (
f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
f"--host {host} --metrics-file {metrics_file} "
f"--docker-stats-file {docker_stats_file} --json"
)
next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
@@ -602,6 +606,8 @@ def build_packet(
"container_cpu_threshold": container_cpu_threshold,
"hot_container_cpu_threshold": hot_container_cpu_threshold,
"process_family_cpu_threshold": process_family_cpu_threshold,
"metrics_file": str(metrics_file),
"docker_stats_file": str(docker_stats_file),
"script_dir": str(script_dir),
"swap_used_ratio": round(swap_used_ratio, 6),
"remediation_authorized": remediation_authorized,
@@ -670,6 +676,8 @@ def main() -> int:
max_age_seconds=args.docker_stats_max_age_seconds,
),
process_summary=_summarize_processes(_parse_ps_text(_collect_ps_text(args.ps_file)), top_n=args.top_n),
metrics_file=args.metrics_file,
docker_stats_file=args.docker_stats_file,
load5_per_core_threshold=args.load5_per_core_threshold,
container_cpu_threshold=args.container_cpu_threshold,
hot_container_cpu_threshold=args.hot_container_cpu_threshold,

View File

@@ -875,6 +875,60 @@ def test_sustained_load_controller_routes_control_plane_family_pressure(tmp_path
assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
def test_sustained_load_controller_honors_custom_readback_paths(tmp_path: Path) -> None:
    """Check that custom readback paths propagate into the controller's output.

    Runs the controller script as a subprocess with explicit
    ``--metrics-file``, ``--docker-stats-file``, ``--script-dir`` and
    ``--ps-file`` arguments pointing into ``tmp_path``, then asserts that the
    JSON payload echoes those paths under ``readback`` and embeds them in the
    ``post_apply_verifier`` command instead of the
    ``/home/wooo/node_exporter_textfiles`` location.
    """
    # Host metrics sample; the classification asserted below is the one the
    # controller is expected to derive from these values.
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text(
        "\n".join(
            [
                'awoooi_host_runaway_process_monitor_up{host="188",mode="read_only"} 1',
                'awoooi_host_load5_per_core{host="188"} 0.40',
                'awoooi_host_swap_used_ratio{host="188"} 0.1',
                'awoooi_host_runaway_process_remediation_authorized{host="188"} 0',
            ]
        ),
        encoding="utf-8",
    )

    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(
        'docker_container_cpu_cores{host="188",container_name="momo-scheduler"} 0.70\n',
        encoding="utf-8",
    )

    ps_file = tmp_path / "ps.txt"
    ps_file.write_text("100 1 100 3600 5.0 0.0 python python worker.py\n", encoding="utf-8")

    # The directory is never created: the test only checks that the path is
    # echoed back and used to build command strings.
    script_dir = tmp_path / "deployed-scripts"

    result = subprocess.run(
        [
            sys.executable,
            str(CONTROLLER_PATH),
            "--host",
            "188",
            "--metrics-file",
            str(metrics_file),
            "--docker-stats-file",
            str(docker_file),
            "--script-dir",
            str(script_dir),
            "--ps-file",
            str(ps_file),
            "--json",
        ],
        capture_output=True,
        text=True,
        # The exit status is asserted explicitly below so that stderr can be
        # attached to the failure message.
        check=False,
    )

    assert result.returncode == 0, f"controller exited {result.returncode}: {result.stderr}"
    payload = json.loads(result.stdout)

    assert payload["classification"] == "observing_load_within_threshold"

    readback = payload["readback"]
    assert readback["metrics_file"] == str(metrics_file)
    assert readback["docker_stats_file"] == str(docker_file)
    assert readback["script_dir"] == str(script_dir)

    verifier = payload["commands"]["post_apply_verifier"]
    assert str(metrics_file) in verifier
    assert str(docker_file) in verifier
    assert str(script_dir / "host-sustained-load-controller.py") in verifier
    assert "/home/wooo/node_exporter_textfiles" not in verifier
def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
ps_file = tmp_path / "ps.txt"
ps_file.write_text(