fix(ops): recognize repo-scoped CI containers in load guard [skip ci]

This commit is contained in:
ogt
2026-06-27 12:45:01 +08:00
parent c4fcd9cb12
commit 7f706feded
6 changed files with 63 additions and 9 deletions

View File

@@ -24,6 +24,9 @@ TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_
# File name of the generated .prom output (presumably written under
# TEXTFILE_DIR -- confirm against the writer).
OUTPUT_NAME = "host_runaway_process.prom"
# Host label value: AIOPS_HOST_LABEL from the environment when set, otherwise
# the node name reported by os.uname().
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Matches a double quote, a backslash, or a newline (presumably the characters
# that need escaping in label values -- confirm at the call site).
LABEL_RE = re.compile(r'["\\\n]')
# Matches container names that begin with either the "GITEA-ACTIONS-" prefix
# or a job-style prefix of the form <name>-cd-<digits>- or
# <name>-code-review-<digits>-, e.g. "awoooi-cd-5901-1-e2e-smoke".
# The pattern is anchored with ^, so the prefix must be at the start of the name.
GITEA_ACTION_CONTAINER_RE = re.compile(
    r"^(?:GITEA-ACTIONS-|[A-Za-z0-9][A-Za-z0-9_.-]*-(?:cd|code-review)-[0-9]+-)"
)
@dataclass(frozen=True)
@@ -214,7 +217,7 @@ def active_gitea_action_containers(docker_file: Path | None = None) -> int:
names = run_text(["docker", "ps", "--format", "{{.Names}}"], timeout=10).splitlines()
except Exception:
return -1
return sum(1 for name in names if "GITEA-ACTIONS-TASK-" in name)
return sum(1 for name in names if GITEA_ACTION_CONTAINER_RE.search(name))
def load5_per_core() -> float:

View File

@@ -11,7 +11,8 @@ set -euo pipefail
# bash scripts/ops/stop-stale-gitea-actions-jobs.sh --apply
#
# Safety rules:
# - Only touches Docker containers named GITEA-ACTIONS-*.
# - Only touches Docker containers named GITEA-ACTIONS-* or repo-scoped
# Gitea job containers such as awoooi-cd-5901-1-e2e-smoke.
# - Defaults to containers older than 20 minutes.
# - Known long-running workflows get a higher stop threshold than the alert threshold.
# - Skips containers with recent log output unless --force is provided.
@@ -24,17 +25,20 @@ threshold_for_name() {
local name="$1"
case "$name" in
*WORKFLOW-CD-Pipeline_JOB-deploy*)
*WORKFLOW-CD-Pipeline_JOB-deploy*|*-cd-*-deploy*)
# .gitea/workflows/cd.yaml deploy job timeout is 60m. Give act/Gitea
# cleanup a buffer before treating the container as abandoned.
echo 4500
;;
*WORKFLOW-CD-Pipeline_JOB-tests*|*WORKFLOW-CD-Pipeline_JOB-post-deploy-checks*)
*WORKFLOW-CD-Pipeline_JOB-tests*|*WORKFLOW-CD-Pipeline_JOB-post-deploy-checks*|*-cd-*-tests*|*-cd-*-post-deploy*)
echo 2400
;;
*WORKFLOW-Code-Review_JOB-ai-code-review*)
*WORKFLOW-Code-Review_JOB-ai-code-review*|*-code-review-*)
echo 720
;;
*-cd-*-e2e-smoke*|*-cd-*-source-link-smoke*)
echo 900
;;
*WORKFLOW-Deploy-Alert-Rules_JOB-deploy-alerts*)
echo 900
;;
@@ -97,7 +101,7 @@ while read -r name; do
fi
docker stop "$name"
fi
done < <(docker ps --format '{{.Names}}' | grep '^GITEA-ACTIONS-' || true)
done < <(docker ps --format '{{.Names}}' | grep -E '^(GITEA-ACTIONS-|[A-Za-z0-9][A-Za-z0-9_.-]*-(cd|code-review)-[0-9]+-)' || true)
if [[ "$found" == "0" ]]; then
echo "No stale Gitea Actions containers older than policy threshold (minimum ${MIN_AGE_SECONDS}s)."

View File

@@ -93,6 +93,26 @@ def test_renders_ci_load_and_swap_without_authorizing_repair(tmp_path: Path) ->
assert 'rule="stockplatform_headless_smoke"' in metrics
def test_counts_modern_gitea_action_container_names(tmp_path: Path) -> None:
    """Count legacy and job-style CI container names, ignoring unrelated ones."""
    exporter = load_exporter()

    # Names the exporter is expected to recognise as Gitea Actions containers.
    ci_names = (
        "GITEA-ACTIONS-TASK-123",
        "awoooi-cd-5901-1-e2e-smoke",
        "awoooi-cd-5873-1-source-link-smoke",
        "awoooi-code-review-3323-1-ai-code-review",
    )
    # Ordinary service containers that must not be counted.
    unrelated_names = ("gitea", "stockplatform-v2-api-1")

    docker_file = tmp_path / "docker.txt"
    listing = "\n".join(ci_names + unrelated_names)
    docker_file.write_text(listing, encoding="utf-8")

    counted = exporter.active_gitea_action_containers(docker_file)
    assert counted == len(ci_names)
def test_remediation_defaults_to_dry_run(tmp_path: Path) -> None:
ps_file = tmp_path / "ps.txt"
ps_file.write_text(