fix(recovery): ignore stale docker cpu attribution
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
CD Pipeline / build-and-deploy (push) Has been skipped
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Failing after 1m41s
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
CD Pipeline / build-and-deploy (push) Has been skipped
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Failing after 1m41s
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -20,12 +20,14 @@ from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
|
||||
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
|
||||
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
|
||||
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
|
||||
# Matches one `key="value"` pair inside a Prometheus text-format label set.
# Within this raw string `\\` is the regex for ONE literal backslash, so
# `[^\"\\]` accepts any character except a quote or a backslash and `\\.`
# consumes a single escape sequence such as `\"`, `\\` or `\n`. The previous
# pattern doubled the backslashes and therefore only recognised escapes that
# were written with two backslashes.
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\]|\\.)*)\"")
|
||||
METRIC_RE = re.compile(
|
||||
@@ -41,6 +43,11 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--host", default="110")
|
||||
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE)
|
||||
parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
|
||||
parser.add_argument(
|
||||
"--docker-stats-max-age-seconds",
|
||||
type=int,
|
||||
default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
|
||||
)
|
||||
parser.add_argument("--load5-per-core-threshold", type=float, default=1.5)
|
||||
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
|
||||
parser.add_argument("--json", action="store_true", help="Print JSON only.")
|
||||
@@ -92,6 +99,55 @@ def _sample_value(
|
||||
return default
|
||||
|
||||
|
||||
def _sample_value_any(samples: list[dict[str, Any]], name: str) -> float | None:
|
||||
for sample in samples:
|
||||
if sample["name"] == name:
|
||||
return float(sample["value"])
|
||||
return None
|
||||
|
||||
|
||||
def _textfile_mtime_seconds(samples: list[dict[str, Any]], suffix: str) -> float | None:
|
||||
for sample in samples:
|
||||
if sample["name"] != "node_textfile_mtime_seconds":
|
||||
continue
|
||||
file_label = str(sample["labels"].get("file") or "")
|
||||
if file_label.endswith(suffix):
|
||||
return float(sample["value"])
|
||||
return None
|
||||
|
||||
|
||||
def docker_stats_freshness(
    *,
    samples: list[dict[str, Any]],
    docker_stats_file: Path,
    max_age_seconds: int,
) -> dict[str, Any]:
    """Report whether the docker stats textfile is recent enough to trust.

    The modification time is read from the ``node_textfile_mtime_seconds``
    sample for ``docker_stats.prom`` and compared with ``node_time_seconds``
    from the same samples (falling back to the local clock when that sample is
    absent). When no mtime sample exists, ``docker_stats_file`` is stat'ed and
    compared with the local clock instead.

    Args:
        samples: Parsed metric samples (dicts with ``name``, ``labels`` and
            ``value`` keys).
        docker_stats_file: Path used for the on-disk mtime fallback.
        max_age_seconds: Largest age, in seconds, still considered fresh.

    Returns:
        A dict with ``fresh`` (bool), ``age_seconds`` (int, or ``None`` when
        the age cannot be determined), ``max_age_seconds`` and ``source``.
        ``source`` is ``"node_textfile_mtime_seconds"``, ``"file_stat_mtime"``,
        ``"missing"`` (file does not exist) or ``"unreadable"`` (any other
        OS-level failure while stat'ing the file).
    """
    mtime = _textfile_mtime_seconds(samples, "docker_stats.prom")
    now = _sample_value_any(samples, "node_time_seconds")
    source = "node_textfile_mtime_seconds"
    if mtime is None:
        try:
            mtime = docker_stats_file.stat().st_mtime
        except FileNotFoundError:
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "missing",
            }
        except OSError:
            # Permission problems, a non-directory path component, etc. must
            # not crash the caller: stats we cannot inspect are not trusted.
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "unreadable",
            }
        # The on-disk mtime is compared against the local wall clock rather
        # than the node_time_seconds sample.
        now = time.time()
        source = "file_stat_mtime"
    if now is None:
        now = time.time()
    try:
        # A modification time in the future is clamped to an age of zero.
        age_seconds = max(0, int(now - mtime))
    except (ValueError, OverflowError):
        # NaN or infinite sample values give no usable age; fail closed.
        return {
            "fresh": False,
            "age_seconds": None,
            "max_age_seconds": max_age_seconds,
            "source": source,
        }
    return {
        "fresh": age_seconds <= max_age_seconds,
        "age_seconds": age_seconds,
        "max_age_seconds": max_age_seconds,
        "source": source,
    }
|
||||
|
||||
|
||||
def _rule_values(samples: list[dict[str, Any]], name: str, *, host: str) -> list[dict[str, Any]]:
|
||||
values = []
|
||||
for sample in samples:
|
||||
@@ -159,6 +215,7 @@ def build_packet(
|
||||
host: str,
|
||||
samples: list[dict[str, Any]],
|
||||
docker_samples: list[dict[str, Any]],
|
||||
docker_stats_status: dict[str, Any],
|
||||
load5_per_core_threshold: float,
|
||||
ci_stale_age_seconds: int,
|
||||
) -> dict[str, Any]:
|
||||
@@ -209,7 +266,8 @@ def build_packet(
|
||||
)
|
||||
)
|
||||
top_orphan = _top_orphan_rule(samples, host=host)
|
||||
top_container = _top_container_cpu(docker_samples, host=host)
|
||||
raw_top_container = _top_container_cpu(docker_samples, host=host)
|
||||
top_container = raw_top_container if docker_stats_status.get("fresh") is True else None
|
||||
top_container_name = str((top_container or {}).get("container_name") or "").lower()
|
||||
top_container_cpu = float((top_container or {}).get("cpu_cores") or 0.0)
|
||||
|
||||
@@ -317,6 +375,8 @@ def build_packet(
|
||||
"active_ci_oldest_age_seconds": active_ci_oldest_age,
|
||||
"top_orphan_rule": top_orphan,
|
||||
"top_container_cpu": top_container,
|
||||
"top_container_cpu_untrusted": raw_top_container,
|
||||
"docker_stats": docker_stats_status,
|
||||
},
|
||||
"commands": {
|
||||
"dry_run": dry_run_command,
|
||||
@@ -364,6 +424,11 @@ def main() -> int:
|
||||
host=args.host,
|
||||
samples=samples,
|
||||
docker_samples=docker_samples,
|
||||
docker_stats_status=docker_stats_freshness(
|
||||
samples=samples,
|
||||
docker_stats_file=args.docker_stats_file,
|
||||
max_age_seconds=args.docker_stats_max_age_seconds,
|
||||
),
|
||||
load5_per_core_threshold=args.load5_per_core_threshold,
|
||||
ci_stale_age_seconds=args.ci_stale_age_seconds,
|
||||
)
|
||||
|
||||
@@ -14,12 +14,14 @@ import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
DEFAULT_HOST_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
|
||||
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
|
||||
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
|
||||
SCHEMA_VERSION = "host_sustained_load_sanitized_evidence_v1"
|
||||
# Matches one `key="value"` pair inside a Prometheus text-format label set.
# Within this raw string `\\` is the regex for ONE literal backslash, so
# `[^\"\\]` accepts any character except a quote or a backslash and `\\.`
# consumes a single escape sequence such as `\"`, `\\` or `\n`. The previous
# pattern doubled the backslashes and therefore only recognised escapes that
# were written with two backslashes.
LABEL_RE = re.compile(r"(?P<key>[A-Za-z_][A-Za-z0-9_]*)=\"(?P<value>(?:[^\"\\]|\\.)*)\"")
|
||||
METRIC_RE = re.compile(
|
||||
@@ -33,6 +35,11 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", "110"))
|
||||
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_HOST_METRICS_FILE)
|
||||
parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
|
||||
parser.add_argument(
|
||||
"--docker-stats-max-age-seconds",
|
||||
type=int,
|
||||
default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
|
||||
)
|
||||
parser.add_argument("--ps-file", type=Path)
|
||||
parser.add_argument("--top-n", type=int, default=8)
|
||||
parser.add_argument("--json", action="store_true")
|
||||
@@ -66,6 +73,55 @@ def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
|
||||
return samples
|
||||
|
||||
|
||||
def _sample_value_any(samples: list[dict[str, Any]], name: str) -> float | None:
|
||||
for sample in samples:
|
||||
if sample["name"] == name:
|
||||
return float(sample["value"])
|
||||
return None
|
||||
|
||||
|
||||
def _textfile_mtime_seconds(samples: list[dict[str, Any]], suffix: str) -> float | None:
|
||||
for sample in samples:
|
||||
if sample["name"] != "node_textfile_mtime_seconds":
|
||||
continue
|
||||
file_label = str(sample["labels"].get("file") or "")
|
||||
if file_label.endswith(suffix):
|
||||
return float(sample["value"])
|
||||
return None
|
||||
|
||||
|
||||
def docker_stats_freshness(
    *,
    samples: list[dict[str, Any]],
    docker_stats_file: Path,
    max_age_seconds: int,
) -> dict[str, Any]:
    """Report whether the docker stats textfile is recent enough to trust.

    The modification time is read from the ``node_textfile_mtime_seconds``
    sample for ``docker_stats.prom`` and compared with ``node_time_seconds``
    from the same samples (falling back to the local clock when that sample is
    absent). When no mtime sample exists, ``docker_stats_file`` is stat'ed and
    compared with the local clock instead.

    Args:
        samples: Parsed metric samples (dicts with ``name``, ``labels`` and
            ``value`` keys).
        docker_stats_file: Path used for the on-disk mtime fallback.
        max_age_seconds: Largest age, in seconds, still considered fresh.

    Returns:
        A dict with ``fresh`` (bool), ``age_seconds`` (int, or ``None`` when
        the age cannot be determined), ``max_age_seconds`` and ``source``.
        ``source`` is ``"node_textfile_mtime_seconds"``, ``"file_stat_mtime"``,
        ``"missing"`` (file does not exist) or ``"unreadable"`` (any other
        OS-level failure while stat'ing the file).
    """
    mtime = _textfile_mtime_seconds(samples, "docker_stats.prom")
    now = _sample_value_any(samples, "node_time_seconds")
    source = "node_textfile_mtime_seconds"
    if mtime is None:
        try:
            mtime = docker_stats_file.stat().st_mtime
        except FileNotFoundError:
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "missing",
            }
        except OSError:
            # Permission problems, a non-directory path component, etc. must
            # not crash the caller: stats we cannot inspect are not trusted.
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "unreadable",
            }
        # The on-disk mtime is compared against the local wall clock rather
        # than the node_time_seconds sample.
        now = time.time()
        source = "file_stat_mtime"
    if now is None:
        now = time.time()
    try:
        # A modification time in the future is clamped to an age of zero.
        age_seconds = max(0, int(now - mtime))
    except (ValueError, OverflowError):
        # NaN or infinite sample values give no usable age; fail closed.
        return {
            "fresh": False,
            "age_seconds": None,
            "max_age_seconds": max_age_seconds,
            "source": source,
        }
    return {
        "fresh": age_seconds <= max_age_seconds,
        "age_seconds": age_seconds,
        "max_age_seconds": max_age_seconds,
        "source": source,
    }
|
||||
|
||||
|
||||
def read_text(path: Path | None) -> str:
|
||||
if path is None:
|
||||
return ""
|
||||
@@ -234,8 +290,14 @@ def recommend_playbook(process_families: list[dict[str, Any]], containers: list[
|
||||
def build_payload(args: argparse.Namespace) -> dict[str, Any]:
|
||||
host_samples = parse_prometheus_text(read_text(args.metrics_file))
|
||||
docker_samples = parse_prometheus_text(read_text(args.docker_stats_file))
|
||||
docker_stats_status = docker_stats_freshness(
|
||||
samples=host_samples,
|
||||
docker_stats_file=args.docker_stats_file,
|
||||
max_age_seconds=args.docker_stats_max_age_seconds,
|
||||
)
|
||||
process_summary = summarize_processes(parse_ps_text(collect_ps_text(args.ps_file)), top_n=args.top_n)
|
||||
containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
|
||||
untrusted_containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
|
||||
containers = untrusted_containers if docker_stats_status.get("fresh") is True else []
|
||||
recommendation = recommend_playbook(process_summary["families"], containers)
|
||||
|
||||
return {
|
||||
@@ -248,10 +310,12 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"readback": {
|
||||
"host_metric_sample_count": len(host_samples),
|
||||
"docker_metric_sample_count": len(docker_samples),
|
||||
"docker_stats": docker_stats_status,
|
||||
"top_container_count": len(containers),
|
||||
"top_process_family_count": len(process_summary["families"]),
|
||||
},
|
||||
"top_containers": containers,
|
||||
"top_containers_untrusted": untrusted_containers,
|
||||
"top_process_families": process_summary["families"],
|
||||
"top_processes_sanitized": process_summary["top_processes"],
|
||||
"redaction": {
|
||||
|
||||
@@ -425,6 +425,60 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_
|
||||
assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
|
||||
|
||||
|
||||
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
    """Stale docker stats must not drive the controller's container attribution."""
    # The docker_stats.prom mtime sample (1000) is 4000 seconds older than
    # node_time_seconds (5000). No --docker-stats-max-age-seconds is passed,
    # so the script's own default threshold applies.
    host_metric_lines = [
        'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="110"} 2.5',
        'awoooi_host_swap_used_ratio{host="110"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
        'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
        'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
        'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
        'node_textfile_mtime_seconds{file="/host/home/wooo/node_exporter_textfiles/docker_stats.prom"} 1000',
        'node_time_seconds 5000',
    ]
    docker_metric_lines = [
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4',
        'docker_container_cpu_cores{host="110",container_name="redis"} 0.2',
    ]

    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text("\n".join(docker_metric_lines), encoding="utf-8")

    command = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "110",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    # No check=True: the controller is expected to exit non-zero here.
    result = subprocess.run(command, capture_output=True, text=True)

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    readback = payload["readback"]
    assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook"
    assert readback["docker_stats"]["fresh"] is False
    # The trusted slot is empty while the raw reading stays visible.
    assert readback["top_container_cpu"] is None
    assert readback["top_container_cpu_untrusted"]["container_name"] == "gitea"
    assert payload["controlled_apply_allowed"] is False
|
||||
|
||||
|
||||
def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
|
||||
metrics_file = tmp_path / "host.prom"
|
||||
metrics_file.write_text(
|
||||
@@ -506,6 +560,55 @@ def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path:
|
||||
assert "/home/wooo" not in result.stdout
|
||||
|
||||
|
||||
def test_sustained_load_evidence_keeps_stale_container_samples_untrusted(tmp_path: Path) -> None:
    """Stale docker stats are reported only under the untrusted key by the evidence script."""
    # The docker_stats.prom mtime sample (1000) is 4000 seconds older than
    # node_time_seconds (5000). No --docker-stats-max-age-seconds is passed,
    # so the script's own default threshold applies.
    host_metric_lines = [
        'node_textfile_mtime_seconds{file="/host/home/wooo/node_exporter_textfiles/docker_stats.prom"} 1000',
        'node_time_seconds 5000',
    ]

    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_metric_lines), encoding="utf-8")
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4\n',
        encoding="utf-8",
    )
    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
        "100 1 100 120 5.0 1.0 python python monitor.py\n",
        encoding="utf-8",
    )

    command = [
        sys.executable,
        str(EVIDENCE_PATH),
        "--host",
        "110",
        "--metrics-file",
        str(metrics_file),
        "--ps-file",
        str(ps_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    result = subprocess.run(command, check=True, capture_output=True, text=True)

    payload = json.loads(result.stdout)
    assert payload["recommendation"] != "gitea_queue_or_hook_backlog_playbook"
    assert payload["readback"]["docker_stats"]["fresh"] is False
    # The trusted list is empty while the raw reading stays visible.
    assert payload["top_containers"] == []
    assert payload["top_containers_untrusted"][0]["container_name"] == "gitea"
    assert payload["operation_boundaries"]["host_write_performed"] is False
|
||||
|
||||
|
||||
def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
|
||||
metrics_file = tmp_path / "host.prom"
|
||||
metrics_file.write_text(
|
||||
|
||||
Reference in New Issue
Block a user