fix(ops): use deployed host pressure helper paths
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m3s
CD Pipeline / build-and-deploy (push) Successful in 4m58s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 1s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 10s
CD Pipeline / post-deploy-checks (push) Successful in 3m11s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 1s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 1m3s
CD Pipeline / build-and-deploy (push) Successful in 4m58s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 1s
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Successful in 10s
CD Pipeline / post-deploy-checks (push) Successful in 3m11s
This commit is contained in:
@@ -29,6 +29,7 @@ from typing import Any
|
||||
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
|
||||
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
|
||||
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
|
||||
+DEFAULT_SCRIPT_DIR = Path("/home/wooo/scripts")
|
||||
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
|
||||
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\\\]|\\\\.)*)\"")
|
||||
METRIC_RE = re.compile(
|
||||
@@ -54,6 +55,7 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--hot-container-cpu-threshold", type=float, default=1.0)
|
||||
parser.add_argument("--process-family-cpu-threshold", type=float, default=50.0)
|
||||
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
|
||||
+parser.add_argument("--script-dir", type=Path, default=DEFAULT_SCRIPT_DIR)
|
||||
parser.add_argument("--ps-file", type=Path)
|
||||
parser.add_argument("--top-n", type=int, default=8)
|
||||
parser.add_argument("--json", action="store_true", help="Print JSON only.")
|
||||
@@ -382,6 +384,7 @@ def build_packet(
|
||||
hot_container_cpu_threshold: float,
|
||||
process_family_cpu_threshold: float,
|
||||
ci_stale_age_seconds: int,
|
||||
+script_dir: Path = DEFAULT_SCRIPT_DIR,
|
||||
) -> dict[str, Any]:
|
||||
monitor_up = int(
|
||||
_sample_value(
|
||||
@@ -445,8 +448,11 @@ def build_packet(
|
||||
next_action = "keep_read_only_monitoring"
|
||||
dry_run_command = ""
|
||||
controlled_apply_command = ""
|
||||
+controller_script = script_dir / "host-sustained-load-controller.py"
|
||||
+evidence_script = script_dir / "host-sustained-load-evidence.py"
|
||||
+remediation_script = script_dir / "host-runaway-process-remediation.py"
|
||||
verifier_command = (
|
||||
-"scripts/ops/host-sustained-load-controller.py "
|
||||
+f"{controller_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE}"
|
||||
)
|
||||
|
||||
@@ -463,14 +469,14 @@ def build_packet(
|
||||
severity = "critical"
|
||||
controlled_apply_allowed = True
|
||||
rule = top_orphan["rule"]
|
||||
-dry_run_command = f"scripts/ops/host-runaway-process-remediation.py --rule {rule}"
|
||||
+dry_run_command = f"{remediation_script} --rule {rule}"
|
||||
controlled_apply_command = (
|
||||
-"scripts/ops/host-runaway-process-remediation.py "
|
||||
+f"{remediation_script} "
|
||||
f"--rule {rule} --apply --confirm-apply "
|
||||
"--controlled-apply-id ${CONTROLLED_APPLY_ID} "
|
||||
"--evidence-ref ${EVIDENCE_REF} "
|
||||
"--post-apply-verifier "
|
||||
-"'scripts/ops/host-sustained-load-controller.py --host "
|
||||
+f"'{controller_script} --host "
|
||||
f"{host} --metrics-file {DEFAULT_METRICS_FILE}' "
|
||||
"--wait-seconds 10"
|
||||
)
|
||||
@@ -510,7 +516,7 @@ def build_packet(
|
||||
else "warning"
|
||||
)
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
@@ -526,7 +532,7 @@ def build_packet(
|
||||
else "warning"
|
||||
)
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
@@ -535,7 +541,7 @@ def build_packet(
|
||||
classification = "blocked_control_plane_saturation_requires_playbook"
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
@@ -550,7 +556,7 @@ def build_packet(
|
||||
classification = "blocked_stockplatform_hot_query_or_api_pressure_requires_playbook"
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
@@ -561,7 +567,7 @@ def build_packet(
|
||||
classification = "blocked_gitea_queue_or_hook_backlog_requires_playbook"
|
||||
severity = "critical" if load5_per_core > load5_per_core_threshold else "warning"
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
@@ -574,10 +580,9 @@ def build_packet(
|
||||
classification = "blocked_unknown_sustained_load_requires_source_specific_playbook"
|
||||
severity = "critical"
|
||||
dry_run_command = (
|
||||
-"scripts/ops/host-sustained-load-evidence.py "
|
||||
+f"{evidence_script} "
|
||||
f"--host {host} --metrics-file {DEFAULT_METRICS_FILE} "
|
||||
-"--docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom "
|
||||
-"--json"
|
||||
+f"--docker-stats-file {DEFAULT_DOCKER_STATS_FILE} --json"
|
||||
)
|
||||
next_action = "collect_sanitized_top_process_and_container_stats_then_select_playbook"
|
||||
|
||||
@@ -596,6 +601,7 @@ def build_packet(
|
||||
"container_cpu_threshold": container_cpu_threshold,
|
||||
"hot_container_cpu_threshold": hot_container_cpu_threshold,
|
||||
"process_family_cpu_threshold": process_family_cpu_threshold,
|
||||
+"script_dir": str(script_dir),
|
||||
"swap_used_ratio": round(swap_used_ratio, 6),
|
||||
"remediation_authorized": remediation_authorized,
|
||||
"active_ci_container_count": active_ci_containers,
|
||||
@@ -668,6 +674,7 @@ def main() -> int:
|
||||
hot_container_cpu_threshold=args.hot_container_cpu_threshold,
|
||||
process_family_cpu_threshold=args.process_family_cpu_threshold,
|
||||
ci_stale_age_seconds=args.ci_stale_age_seconds,
|
||||
+script_dir=args.script_dir,
|
||||
)
|
||||
if args.json:
|
||||
print(json.dumps(packet, ensure_ascii=False, indent=2, sort_keys=True))
|
||||
|
||||
@@ -334,8 +334,13 @@ def test_sustained_load_controller_routes_orphan_browser_to_controlled_remediati
|
||||
payload = json.loads(result.stdout)
|
||||
assert payload["classification"] == "controlled_orphan_browser_remediation_ready"
|
||||
assert payload["controlled_apply_allowed"] is True
|
||||
-assert "host-runaway-process-remediation.py --rule stockplatform_headless_smoke" in payload["commands"]["dry_run"]
|
||||
+assert (
|
||||
+"/home/wooo/scripts/host-runaway-process-remediation.py "
|
||||
+"--rule stockplatform_headless_smoke"
|
||||
+) in payload["commands"]["dry_run"]
|
||||
assert "--controlled-apply-id" in payload["commands"]["controlled_apply"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
|
||||
assert payload["operation_boundaries"]["process_signal_performed"] is False
|
||||
|
||||
|
||||
@@ -460,7 +465,8 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_
|
||||
assert payload["classification"] == "blocked_gitea_queue_or_hook_backlog_requires_playbook"
|
||||
assert payload["readback"]["top_container_cpu"]["container_name"] == "gitea"
|
||||
assert payload["controlled_apply_allowed"] is False
|
||||
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is_moderate(
|
||||
@@ -516,7 +522,8 @@ def test_sustained_load_controller_routes_gitea_quota_pressure_even_when_load_is
|
||||
assert payload["severity"] == "warning"
|
||||
assert payload["readback"]["container_cpu_threshold"] == 2.0
|
||||
assert payload["readback"]["top_container_cpu"]["cpu_cores"] == 2.08
|
||||
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
|
||||
@@ -608,7 +615,8 @@ def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp
|
||||
payload = json.loads(result.stdout)
|
||||
assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook"
|
||||
assert payload["controlled_apply_allowed"] is False
|
||||
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
assert payload["operation_boundaries"]["process_signal_performed"] is False
|
||||
|
||||
|
||||
@@ -683,7 +691,8 @@ def test_sustained_load_controller_routes_moderate_stock_container_pressure(tmp_
|
||||
assert payload["readback"]["top_container_cpu"]["container_name"] == "stockplatform-v2-postgres-1"
|
||||
assert payload["readback"]["top_process_family"]["family"] == "gitea_service"
|
||||
assert payload["controlled_apply_allowed"] is False
|
||||
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
assert "/home/wooo/gitea/app.ini" not in result.stdout
|
||||
|
||||
|
||||
@@ -785,6 +794,10 @@ def test_sustained_load_controller_routes_control_plane_family_pressure(tmp_path
|
||||
assert payload["classification"] == "blocked_control_plane_saturation_requires_playbook"
|
||||
assert payload["readback"]["control_plane_process_cpu_percent"] == 55.0
|
||||
assert payload["next_action"] == "run_control_plane_saturation_playbook_check_mode"
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-controller.py" in payload["commands"]["post_apply_verifier"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["post_apply_verifier"]
|
||||
|
||||
|
||||
def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path: Path) -> None:
|
||||
@@ -913,7 +926,8 @@ def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp
|
||||
== "blocked_unknown_sustained_load_requires_source_specific_playbook"
|
||||
)
|
||||
assert payload["controlled_apply_allowed"] is False
|
||||
-assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "/home/wooo/scripts/host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
+assert "scripts/ops/" not in payload["commands"]["dry_run"]
|
||||
assert payload["operation_boundaries"]["host_write_performed"] is False
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user