fix(recovery): ignore stale docker cpu attribution
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
AWOOOI Harbor 110 Local Repair / workflow-shape (push) Successful in 0s
CD Pipeline / build-and-deploy (push) Has been skipped
AWOOOI Harbor 110 Local Repair / harbor-110-local-repair (push) Failing after 1m41s
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 14:06:51 +08:00
parent e580954e82
commit 1ac8808607
7 changed files with 318 additions and 4 deletions

View File

@@ -20,12 +20,14 @@ from __future__ import annotations
import argparse
import json
import re
import time
from pathlib import Path
from typing import Any
# Default textfile inputs used when --metrics-file / --docker-stats-file are not given.
DEFAULT_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
# Default for --docker-stats-max-age-seconds: docker stats older than this are
# reported as not fresh and are not used for container CPU attribution.
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
SCHEMA_VERSION = "host_sustained_load_controlled_automation_v1"
# One `key="value"` label pair. Inside the value, a backslash escapes exactly
# the next character (`\"`, `\\`, `\n`). The pattern is a raw string, so each
# regex-level backslash is written once; doubling them again would require two
# literal backslashes before every escaped character and reject `\"`.
LABEL_RE = re.compile(r'(?P<key>[A-Za-z_][A-Za-z0-9_]*)="(?P<value>(?:[^"\\]|\\.)*)"')
METRIC_RE = re.compile(
@@ -41,6 +43,11 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--host", default="110")
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_METRICS_FILE)
parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
parser.add_argument(
"--docker-stats-max-age-seconds",
type=int,
default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
)
parser.add_argument("--load5-per-core-threshold", type=float, default=1.5)
parser.add_argument("--ci-stale-age-seconds", type=int, default=1800)
parser.add_argument("--json", action="store_true", help="Print JSON only.")
@@ -92,6 +99,55 @@ def _sample_value(
return default
def _sample_value_any(samples: list[dict[str, Any]], name: str) -> float | None:
for sample in samples:
if sample["name"] == name:
return float(sample["value"])
return None
def _textfile_mtime_seconds(samples: list[dict[str, Any]], suffix: str) -> float | None:
for sample in samples:
if sample["name"] != "node_textfile_mtime_seconds":
continue
file_label = str(sample["labels"].get("file") or "")
if file_label.endswith(suffix):
return float(sample["value"])
return None
def docker_stats_freshness(
    *,
    samples: list[dict[str, Any]],
    docker_stats_file: Path,
    max_age_seconds: int,
) -> dict[str, Any]:
    """Report whether the docker stats textfile is recent enough to trust.

    The modification time is taken from a ``node_textfile_mtime_seconds``
    sample for ``docker_stats.prom`` when one is present, and is compared with
    the ``node_time_seconds`` sample (or the local clock when that sample is
    absent). Without an mtime sample, the file on disk is stat'ed and compared
    with the local clock.

    Args:
        samples: Parsed samples searched for the mtime and time metrics.
        docker_stats_file: Path stat'ed when no mtime sample is available.
        max_age_seconds: Largest age, in seconds, still considered fresh.

    Returns:
        A dict with ``fresh`` (bool), ``age_seconds`` (int, or ``None`` when the
        age cannot be determined), ``max_age_seconds`` and ``source``
        (``"node_textfile_mtime_seconds"``, ``"file_stat_mtime"`` or
        ``"missing"``).
    """
    import math  # function-scope import keeps this block self-contained

    mtime = _textfile_mtime_seconds(samples, "docker_stats.prom")
    now = _sample_value_any(samples, "node_time_seconds")
    source = "node_textfile_mtime_seconds"
    if mtime is None:
        try:
            mtime = docker_stats_file.stat().st_mtime
        except FileNotFoundError:
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "missing",
            }
        # A stat-derived mtime is on the local clock, so compare it with the
        # local clock rather than with a timestamp taken from the samples.
        now = time.time()
        source = "file_stat_mtime"
    if now is None:
        now = time.time()
    elapsed = now - mtime
    if not math.isfinite(elapsed):
        # NaN/Inf timestamps would make int() raise; an age that cannot be
        # computed is reported as not fresh instead of crashing.
        return {
            "fresh": False,
            "age_seconds": None,
            "max_age_seconds": max_age_seconds,
            "source": source,
        }
    # A modification time ahead of "now" is clamped to an age of zero.
    age_seconds = max(0, int(elapsed))
    return {
        "fresh": age_seconds <= max_age_seconds,
        "age_seconds": age_seconds,
        "max_age_seconds": max_age_seconds,
        "source": source,
    }
def _rule_values(samples: list[dict[str, Any]], name: str, *, host: str) -> list[dict[str, Any]]:
values = []
for sample in samples:
@@ -159,6 +215,7 @@ def build_packet(
host: str,
samples: list[dict[str, Any]],
docker_samples: list[dict[str, Any]],
docker_stats_status: dict[str, Any],
load5_per_core_threshold: float,
ci_stale_age_seconds: int,
) -> dict[str, Any]:
@@ -209,7 +266,8 @@ def build_packet(
)
)
top_orphan = _top_orphan_rule(samples, host=host)
top_container = _top_container_cpu(docker_samples, host=host)
raw_top_container = _top_container_cpu(docker_samples, host=host)
top_container = raw_top_container if docker_stats_status.get("fresh") is True else None
top_container_name = str((top_container or {}).get("container_name") or "").lower()
top_container_cpu = float((top_container or {}).get("cpu_cores") or 0.0)
@@ -317,6 +375,8 @@ def build_packet(
"active_ci_oldest_age_seconds": active_ci_oldest_age,
"top_orphan_rule": top_orphan,
"top_container_cpu": top_container,
"top_container_cpu_untrusted": raw_top_container,
"docker_stats": docker_stats_status,
},
"commands": {
"dry_run": dry_run_command,
@@ -364,6 +424,11 @@ def main() -> int:
host=args.host,
samples=samples,
docker_samples=docker_samples,
docker_stats_status=docker_stats_freshness(
samples=samples,
docker_stats_file=args.docker_stats_file,
max_age_seconds=args.docker_stats_max_age_seconds,
),
load5_per_core_threshold=args.load5_per_core_threshold,
ci_stale_age_seconds=args.ci_stale_age_seconds,
)

View File

@@ -14,12 +14,14 @@ import json
import os
import re
import subprocess
import time
from pathlib import Path
from typing import Any
# Default textfile inputs used when --metrics-file / --docker-stats-file are not given.
DEFAULT_HOST_METRICS_FILE = Path("/home/wooo/node_exporter_textfiles/host_runaway_process.prom")
DEFAULT_DOCKER_STATS_FILE = Path("/home/wooo/node_exporter_textfiles/docker_stats.prom")
# Default for --docker-stats-max-age-seconds: docker stats older than this are
# reported as not fresh and their containers are listed only as untrusted.
DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS = 300
SCHEMA_VERSION = "host_sustained_load_sanitized_evidence_v1"
# One `key="value"` label pair. Inside the value, a backslash escapes exactly
# the next character (`\"`, `\\`, `\n`). The pattern is a raw string, so each
# regex-level backslash is written once; doubling them again would require two
# literal backslashes before every escaped character and reject `\"`.
LABEL_RE = re.compile(r'(?P<key>[A-Za-z_][A-Za-z0-9_]*)="(?P<value>(?:[^"\\]|\\.)*)"')
METRIC_RE = re.compile(
@@ -33,6 +35,11 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--host", default=os.environ.get("AIOPS_HOST_LABEL", "110"))
parser.add_argument("--metrics-file", type=Path, default=DEFAULT_HOST_METRICS_FILE)
parser.add_argument("--docker-stats-file", type=Path, default=DEFAULT_DOCKER_STATS_FILE)
parser.add_argument(
"--docker-stats-max-age-seconds",
type=int,
default=DEFAULT_DOCKER_STATS_MAX_AGE_SECONDS,
)
parser.add_argument("--ps-file", type=Path)
parser.add_argument("--top-n", type=int, default=8)
parser.add_argument("--json", action="store_true")
@@ -66,6 +73,55 @@ def parse_prometheus_text(text: str) -> list[dict[str, Any]]:
return samples
def _sample_value_any(samples: list[dict[str, Any]], name: str) -> float | None:
for sample in samples:
if sample["name"] == name:
return float(sample["value"])
return None
def _textfile_mtime_seconds(samples: list[dict[str, Any]], suffix: str) -> float | None:
for sample in samples:
if sample["name"] != "node_textfile_mtime_seconds":
continue
file_label = str(sample["labels"].get("file") or "")
if file_label.endswith(suffix):
return float(sample["value"])
return None
def docker_stats_freshness(
    *,
    samples: list[dict[str, Any]],
    docker_stats_file: Path,
    max_age_seconds: int,
) -> dict[str, Any]:
    """Report whether the docker stats textfile is recent enough to trust.

    The modification time is taken from a ``node_textfile_mtime_seconds``
    sample for ``docker_stats.prom`` when one is present, and is compared with
    the ``node_time_seconds`` sample (or the local clock when that sample is
    absent). Without an mtime sample, the file on disk is stat'ed and compared
    with the local clock.

    Args:
        samples: Parsed samples searched for the mtime and time metrics.
        docker_stats_file: Path stat'ed when no mtime sample is available.
        max_age_seconds: Largest age, in seconds, still considered fresh.

    Returns:
        A dict with ``fresh`` (bool), ``age_seconds`` (int, or ``None`` when the
        age cannot be determined), ``max_age_seconds`` and ``source``
        (``"node_textfile_mtime_seconds"``, ``"file_stat_mtime"`` or
        ``"missing"``).
    """
    import math  # function-scope import keeps this block self-contained

    mtime = _textfile_mtime_seconds(samples, "docker_stats.prom")
    now = _sample_value_any(samples, "node_time_seconds")
    source = "node_textfile_mtime_seconds"
    if mtime is None:
        try:
            mtime = docker_stats_file.stat().st_mtime
        except FileNotFoundError:
            return {
                "fresh": False,
                "age_seconds": None,
                "max_age_seconds": max_age_seconds,
                "source": "missing",
            }
        # A stat-derived mtime is on the local clock, so compare it with the
        # local clock rather than with a timestamp taken from the samples.
        now = time.time()
        source = "file_stat_mtime"
    if now is None:
        now = time.time()
    elapsed = now - mtime
    if not math.isfinite(elapsed):
        # NaN/Inf timestamps would make int() raise; an age that cannot be
        # computed is reported as not fresh instead of crashing.
        return {
            "fresh": False,
            "age_seconds": None,
            "max_age_seconds": max_age_seconds,
            "source": source,
        }
    # A modification time ahead of "now" is clamped to an age of zero.
    age_seconds = max(0, int(elapsed))
    return {
        "fresh": age_seconds <= max_age_seconds,
        "age_seconds": age_seconds,
        "max_age_seconds": max_age_seconds,
        "source": source,
    }
def read_text(path: Path | None) -> str:
if path is None:
return ""
@@ -234,8 +290,14 @@ def recommend_playbook(process_families: list[dict[str, Any]], containers: list[
def build_payload(args: argparse.Namespace) -> dict[str, Any]:
host_samples = parse_prometheus_text(read_text(args.metrics_file))
docker_samples = parse_prometheus_text(read_text(args.docker_stats_file))
docker_stats_status = docker_stats_freshness(
samples=host_samples,
docker_stats_file=args.docker_stats_file,
max_age_seconds=args.docker_stats_max_age_seconds,
)
process_summary = summarize_processes(parse_ps_text(collect_ps_text(args.ps_file)), top_n=args.top_n)
containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
untrusted_containers = top_docker_containers(docker_samples, host=args.host, top_n=args.top_n)
containers = untrusted_containers if docker_stats_status.get("fresh") is True else []
recommendation = recommend_playbook(process_summary["families"], containers)
return {
@@ -248,10 +310,12 @@ def build_payload(args: argparse.Namespace) -> dict[str, Any]:
"readback": {
"host_metric_sample_count": len(host_samples),
"docker_metric_sample_count": len(docker_samples),
"docker_stats": docker_stats_status,
"top_container_count": len(containers),
"top_process_family_count": len(process_summary["families"]),
},
"top_containers": containers,
"top_containers_untrusted": untrusted_containers,
"top_process_families": process_summary["families"],
"top_processes_sanitized": process_summary["top_processes"],
"redaction": {

View File

@@ -425,6 +425,60 @@ def test_sustained_load_controller_routes_gitea_backlog_from_docker_metrics(tmp_
assert "host-sustained-load-evidence.py" in payload["commands"]["dry_run"]
def test_sustained_load_controller_ignores_stale_docker_stats_attribution(tmp_path: Path) -> None:
    """The controller must not attribute load to containers from stale docker stats."""
    # Host metrics report the docker stats textfile mtime as 1000 against a
    # node time of 5000, i.e. 4000 seconds old and beyond the 300s default.
    host_lines = [
        'awoooi_host_runaway_process_monitor_up{host="110",mode="read_only"} 1',
        'awoooi_host_load5_per_core{host="110"} 2.5',
        'awoooi_host_swap_used_ratio{host="110"} 0.1',
        'awoooi_host_runaway_process_remediation_authorized{host="110"} 0',
        'awoooi_host_gitea_actions_active_container_count{host="110"} 0',
        'awoooi_host_gitea_actions_active_process_group_count{host="110"} 0',
        'awoooi_host_runaway_browser_orphan_group_count{host="110",rule="stockplatform_headless_smoke",min_age_seconds="1800",min_cpu_percent="50"} 0',
        'node_textfile_mtime_seconds{file="/host/home/wooo/node_exporter_textfiles/docker_stats.prom"} 1000',
        'node_time_seconds 5000',
    ]
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_lines), encoding="utf-8")

    # The docker stats still name gitea as the top CPU consumer.
    docker_lines = [
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4',
        'docker_container_cpu_cores{host="110",container_name="redis"} 0.2',
    ]
    docker_file = tmp_path / "docker.prom"
    docker_file.write_text("\n".join(docker_lines), encoding="utf-8")

    command = [
        sys.executable,
        str(CONTROLLER_PATH),
        "--host",
        "110",
        "--metrics-file",
        str(metrics_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    result = subprocess.run(command, capture_output=True, text=True)

    assert result.returncode == 75
    payload = json.loads(result.stdout)
    assert payload["classification"] == "blocked_unknown_sustained_load_requires_source_specific_playbook"
    readback = payload["readback"]
    assert readback["docker_stats"]["fresh"] is False
    # The trusted slot is empty; the raw value survives only under the untrusted key.
    assert readback["top_container_cpu"] is None
    assert readback["top_container_cpu_untrusted"]["container_name"] == "gitea"
    assert payload["controlled_apply_allowed"] is False
def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
metrics_file = tmp_path / "host.prom"
metrics_file.write_text(
@@ -506,6 +560,55 @@ def test_sustained_load_evidence_emits_sanitized_gitea_recommendation(tmp_path:
assert "/home/wooo" not in result.stdout
def test_sustained_load_evidence_keeps_stale_container_samples_untrusted(tmp_path: Path) -> None:
    """The evidence script must list stale container samples only as untrusted."""
    # Textfile mtime 1000 against node time 5000: 4000 seconds old, beyond the
    # 300s default maximum age.
    host_lines = [
        'node_textfile_mtime_seconds{file="/host/home/wooo/node_exporter_textfiles/docker_stats.prom"} 1000',
        'node_time_seconds 5000',
    ]
    metrics_file = tmp_path / "host.prom"
    metrics_file.write_text("\n".join(host_lines), encoding="utf-8")

    docker_file = tmp_path / "docker.prom"
    docker_file.write_text(
        'docker_container_cpu_cores{host="110",container_name="gitea"} 3.4\n',
        encoding="utf-8",
    )

    ps_file = tmp_path / "ps.txt"
    ps_file.write_text(
        "100 1 100 120 5.0 1.0 python python monitor.py\n",
        encoding="utf-8",
    )

    command = [
        sys.executable,
        str(EVIDENCE_PATH),
        "--host",
        "110",
        "--metrics-file",
        str(metrics_file),
        "--ps-file",
        str(ps_file),
        "--docker-stats-file",
        str(docker_file),
        "--json",
    ]
    # check=True: a non-zero exit fails the test before any assertion runs.
    result = subprocess.run(command, check=True, capture_output=True, text=True)

    payload = json.loads(result.stdout)
    assert payload["recommendation"] != "gitea_queue_or_hook_backlog_playbook"
    assert payload["readback"]["docker_stats"]["fresh"] is False
    # Trusted list is empty; the gitea sample is kept under the untrusted key.
    assert payload["top_containers"] == []
    assert payload["top_containers_untrusted"][0]["container_name"] == "gitea"
    assert payload["operation_boundaries"]["host_write_performed"] is False
def test_sustained_load_controller_routes_unknown_load_to_sanitized_evidence(tmp_path: Path) -> None:
metrics_file = tmp_path / "host.prom"
metrics_file.write_text(