fix(ops): use deployed host pressure helper paths
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m3s
CD Pipeline / build-and-deploy (push) Successful in 4m58s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 1s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 10s
CD Pipeline / post-deploy-checks (push) Successful in 3m11s

This commit is contained in:
Your Name
2026-07-02 12:14:35 +08:00
parent 36f1fffbda
commit 859e407129
4 changed files with 49 additions and 24 deletions

View File

@@ -29,6 +29,7 @@ from typing import Any
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
DEFAULT_SCRIPT_DIR = Path("/home/wooo/scripts")
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\\\]|\\\\.)*)\"")
METRIC_RE = re.compile(
@@ -54,6 +55,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--hot-container-cpu-threshold", type=float, default=1.0)
parser.add_argument("--process-family-cpu-threshold", type=float, default=50.0)
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
parser.add_argument("--script-dir", type=Path, default=DEFAULT_SCRIPT_DIR)
parser.add_argument("--ps-file", type=Path)
parser.add_argument("--top-n", type=int, default=8)
parser.add_argument("--json", action="store_true", help="Print JSON only.")
@@ -382,6 +384,7 @@ def build_packet(
hot_container_cpu_threshold: float,
process_family_cpu_threshold: float,
ci_stale_age_seconds: int,
script_dir: Path = DEFAULT_SCRIPT_DIR,
) -> dict[str, Any]:
monitor_up = int(
_sample_value(
@@ -445,8 +448,11 @@ def build_packet(
next_action = "keep_read_only_monitoring"
dry_run_command = ""
controlled_apply_command = ""
controller_script = script_dir / "host-sustained-load-controller.py"
evidence_script = script_dir / "host-sustained-load-evidence.py"
remediation_script = script_dir / "host-runaway-process-remediation.py"
 verifier_command = (
-"scripts/ops/host-sustained-load-controller.py "
+f"{controller_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
)
@@ -463,14 +469,14 @@ def build_packet(
severity = "critical"
controlled_apply_allowed = True
rule = top_orphan["rule"]
-dry_run_command = f"scripts/ops/host-runaway-process-remediation.py --rule {rule}"
+dry_run_command = f"{remediation_script} --rule {rule}"
 controlled_apply_command = (
-"scripts/ops/host-runaway-process-remediation.py "
+f"{remediation_script} "
 f"--rule {rule} --apply --confirm-apply "
 "--controlled-apply-id ${CONTROLLED_APPLY_ID} "
 "--evidence-ref ${EVIDENCE_REF} "
 "--post-apply-verifier "
-"'scripts/ops/host-sustained-load-controller.py --host "
+f"'{controller_script} --host "
f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
"--wait-seconds 10"
)
@@ -510,7 +516,7 @@ def build_packet(
else "warning"
)
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
@@ -526,7 +532,7 @@ def build_packet(
else "warning"
)
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
@@ -535,7 +541,7 @@ def build_packet(
classification = "blocked_control_plane_saturation_requires_playbook"
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
@@ -550,7 +556,7 @@ def build_packet(
classification = "blocked_stockplatform_hot_query_or_api_pressure_requires_playbook"
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
@@ -561,7 +567,7 @@ def build_packet(
classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook"
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
@@ -574,10 +580,9 @@ def build_packet(
classification = "blocked_unknown_sustained_load_requires_source_specific_playbook"
severity = "critical"
 dry_run_command = (
-"scripts/ops/host-sustained-load-evidence.py "
+f"{evidence_script} "
 f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
-"--docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom "
-"--json"
+f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
)
next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
@@ -596,6 +601,7 @@ def build_packet(
"container_cpu_threshold": container_cpu_threshold,
"hot_container_cpu_threshold": hot_container_cpu_threshold,
"process_family_cpu_threshold": process_family_cpu_threshold,
"script_dir": str(script_dir),
"swap_used_ratio": round(swap_used_ratio, 6),
"remediation_authorized": remediation_authorized,
"active_ci_container_count": active_ci_containers,
@@ -668,6 +674,7 @@ def main() -> int:
hot_container_cpu_threshold=args.hot_container_cpu_threshold,
process_family_cpu_threshold=args.process_family_cpu_threshold,
ci_stale_age_seconds=args.ci_stale_age_seconds,
script_dir=args.script_dir,
)
if args.json:
print(json.dumps(packet, ensure_ascii=False, indent=2, sort_keys=True))

View File

@@ -334,8 +334,13 @@ def test_sustained_load_controller_routes_orphan_browser_to_controlled_remediati
payload = json.loads(result.stdout)
assert payload["classification"] == "controlled_orphan_browser_remediation_ready"
assert payload["controlled_apply_allowed"] is True
-assert "host-runaway-process-remediation.py --rule stockplatform_headless_smoke" in payload["commands"]["dry_run"]
+assert (
+"/home/wooo/scripts/host-runaway-process-remediation.py "
+"--rule stockplatform_headless_smoke"
+) in payload["commands"]["dry_run"]
assert "--controlled-apply-id" in payload["commands"]["controlled_apply"]
assert "scripts/ops/" not in payload["commands"]["dry_run"]
assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
assert payload["operation_boundaries"]["process_signal_performed"] is False
@@ -460,7 +465,8 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_
assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook"
assert payload["readback"]["top_container_cpu"]["container_name"] == "gitea"
assert payload["controlled_apply_allowed"] is False
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is_moderate(
@@ -516,7 +522,8 @@ def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is
assert payload["severity"] == "warning"
assert payload["readback"]["container_cpu_threshold"] == 2.0
assert payload["readback"]["top_container_cpu"]["cpu_cores"] == 2.08
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
@@ -608,7 +615,8 @@ def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp
payload = json.loads(result.stdout)
assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook"
assert payload["controlled_apply_allowed"] is False
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
assert payload["operation_boundaries"]["process_signal_performed"] is False
@@ -683,7 +691,8 @@ def test_sustained_load_controller_routes_moderate_stock_container_pressure(tmp_
assert payload["readback"]["top_container_cpu"]["container_name"] == "stockplatform-v2-postgres-1"
assert payload["readback"]["top_process_family"]["family"] == "gitea_service"
assert payload["controlled_apply_allowed"] is False
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
assert "/home/wooo/gitea/app.ini" not in result.stdout
@@ -785,6 +794,10 @@ def test_sustained_load_controller_routes_control_plane_family_pressure(tmp_path
assert payload["classification"] == "blocked_control_plane_saturation_requires_playbook"
assert payload["readback"]["control_plane_process_cpu_percent"] == 55.0
assert payload["next_action"] == "run_control_plane_saturation_playbook_check_mode"
assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
assert "/home/wooo/scripts/host-sustained-load-controller.py" in payload["commands"]["post_apply_verifier"]
assert "scripts/ops/" not in payload["commands"]["dry_run"]
assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
@@ -913,7 +926,8 @@ def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp
== "blocked_unknown_sustained_load_requires_source_specific_playbook"
)
assert payload["controlled_apply_allowed"] is False
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
assert payload["operation_boundaries"]["host_write_performed"] is False