fix(backup): monitor gitea private bundle coverage
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -76,6 +76,8 @@ if [ "${#REPOS[@]}" -eq 0 ]; then
|
||||
"wooo/AwoooGo"
|
||||
"wooo/stockplatform-v2"
|
||||
"wooo/vibework"
|
||||
"wooo/momo-pro-system"
|
||||
"wooo/tsenyang-website"
|
||||
)
|
||||
fi
|
||||
|
||||
|
||||
@@ -25,11 +25,26 @@ TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_
|
||||
OUTPUT_NAME = "backup_health.prom"
|
||||
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
|
||||
LABEL_RE = re.compile(r'["\\\n]')
|
||||
GITEA_BUNDLE_ROOT = Path(
|
||||
os.environ.get("AIOPS_GITEA_BUNDLE_ROOT", "/home/ollama/backup/110/gitea/git-bundles/latest-private-complete")
|
||||
)
|
||||
GITEA_BUNDLE_MAX_AGE_HOURS = float(os.environ.get("AIOPS_GITEA_BUNDLE_MAX_AGE_HOURS", "25"))
|
||||
BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh"))
|
||||
BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env"))
|
||||
OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite"))
|
||||
ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence"))
|
||||
CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json"))
|
||||
DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS = [
|
||||
"wooo/awoooi",
|
||||
"wooo/ewoooc",
|
||||
"wooo/2026FIFAWorldCup",
|
||||
"wooo/agent-bounty-protocol",
|
||||
"wooo/AwoooGo",
|
||||
"wooo/stockplatform-v2",
|
||||
"wooo/vibework",
|
||||
"wooo/momo-pro-system",
|
||||
"wooo/tsenyang-website",
|
||||
]
|
||||
ESCROW_ITEMS = [
|
||||
"restic_repository_password",
|
||||
"offsite_provider_credentials",
|
||||
@@ -209,6 +224,161 @@ def _newest_tree_timestamp(root: Path, max_entries: int = 5000) -> tuple[int, in
|
||||
return newest, count
|
||||
|
||||
|
||||
def _expected_gitea_bundle_repos() -> list[str]:
    """Return the repositories that must have a private bundle backup.

    The comma-separated ``AIOPS_GITEA_BUNDLE_EXPECTED_REPOS`` environment
    variable takes precedence; surrounding whitespace is trimmed and empty
    entries are dropped. When it yields no names, the module default list
    ``DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS`` is returned instead.
    """
    raw = os.environ.get("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "")
    names = []
    for chunk in raw.split(","):
        name = chunk.strip()
        if name:
            names.append(name)
    if names:
        return names
    return DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS
|
||||
|
||||
|
||||
def _gitea_bundle_slug(repo: str) -> str:
    """Turn an ``owner/name`` repository id into a filename-safe slug.

    Every ``/`` becomes a double underscore, e.g. ``wooo/awoooi`` ->
    ``wooo__awoooi``.
    """
    return "__".join(repo.split("/"))
|
||||
|
||||
|
||||
def _read_gitea_bundle_manifest(root: Path) -> tuple[Path | None, dict[str, dict[str, str]]]:
|
||||
for manifest_path in [root / "manifest.remote.tsv", root / "manifest.tsv"]:
|
||||
try:
|
||||
rows = manifest_path.read_text(encoding="utf-8", errors="replace").splitlines()
|
||||
except OSError:
|
||||
continue
|
||||
if not rows:
|
||||
return manifest_path, {}
|
||||
header = rows[0].split("\t")
|
||||
parsed: dict[str, dict[str, str]] = {}
|
||||
for line in rows[1:]:
|
||||
if not line.strip():
|
||||
continue
|
||||
values = line.split("\t")
|
||||
row = {key: values[index] if index < len(values) else "" for index, key in enumerate(header)}
|
||||
repo = row.get("repo", "").strip()
|
||||
if repo:
|
||||
parsed[repo] = row
|
||||
return manifest_path, parsed
|
||||
return None, {}
|
||||
|
||||
|
||||
def _resolve_bundle_path(root: Path, repo: str, manifest_value: str) -> Path:
    """Locate the bundle file for *repo*, preferring what the manifest recorded.

    Lookup order:

    1. ``manifest_value`` as given, if that path exists.
    2. A file with the same basename directly under *root*.
    3. ``<root>/<slug>.bundle`` where the slug comes from
       ``_gitea_bundle_slug``.
    4. The most recently modified ``<repo-name>-local-*.bundle`` under *root*.

    Args:
        root: Directory holding the bundle backups.
        repo: Repository id in ``owner/name`` form.
        manifest_value: Bundle path recorded in the manifest, or ``""``.

    Returns:
        The first match. When nothing matches, the (non-existent) slug path is
        returned so callers can still report a bundle name.
    """
    if manifest_value:
        candidate = Path(manifest_value)
        if candidate.exists():
            return candidate
        # The manifest may have been written on another host with a different
        # directory layout; retry with just the basename under the local root.
        local_candidate = root / candidate.name
        if local_candidate.exists():
            return local_candidate
    slug_candidate = root / f"{_gitea_bundle_slug(repo)}.bundle"
    if slug_candidate.exists():
        return slug_candidate
    repo_name = repo.rsplit("/", 1)[-1]
    dated: list[tuple[float, Path]] = []
    for match in root.glob(f"{repo_name}-local-*.bundle"):
        try:
            dated.append((match.stat().st_mtime, match))
        except OSError:
            # The file vanished (or became unreadable) between glob() and
            # stat(), e.g. while a backup run rotates bundles. Skip it rather
            # than aborting the whole metrics run.
            continue
    if not dated:
        return slug_candidate
    # max() returns the first maximal entry, matching the stable
    # sorted(..., reverse=True)[0] selection used previously.
    return max(dated, key=lambda entry: entry[0])[1]
|
||||
|
||||
|
||||
def _resolve_checksum_path(root: Path, bundle_path: Path, manifest_value: str) -> Path:
    """Locate the checksum file that accompanies *bundle_path*.

    When the manifest recorded a checksum path, that path is tried first and
    then a file with the same basename under *root*. Otherwise, or when
    neither exists, the result is ``<bundle filename>.sha256`` next to the
    bundle (which may not exist).
    """
    if manifest_value:
        recorded = Path(manifest_value)
        for option in (recorded, root / recorded.name):
            if option.exists():
                return option
    return bundle_path.with_name(bundle_path.name + ".sha256")
|
||||
|
||||
|
||||
def _checksum_digest_present(path: Path) -> int:
    """Return 1 if the first token in *path* looks like a SHA-256 digest.

    The file is read as text and its first whitespace-delimited token must be
    exactly 64 hexadecimal characters. An unreadable or empty file yields 0.
    The digest is not compared against the bundle contents.
    """
    try:
        tokens = path.read_text(encoding="utf-8", errors="replace").split()
    except OSError:
        return 0
    if not tokens:
        return 0
    return int(re.fullmatch(r"[0-9a-fA-F]{64}", tokens[0]) is not None)
|
||||
|
||||
|
||||
def _gitea_bundle_metric_lines(host: str) -> tuple[list[str], int]:
    """Render Prometheus samples describing Gitea private bundle coverage.

    Inspects ``GITEA_BUNDLE_ROOT``, its manifest and the expected repository
    list, then emits root-level samples, one group of samples per expected
    repository, and summary counters.

    Args:
        host: Value for the ``host`` label on every sample.

    Returns:
        ``(lines, all_expected_ok)``. ``lines`` holds the rendered samples,
        with ``awoooi_gitea_bundle_all_expected_ok`` last. ``all_expected_ok``
        is 1 only when the root exists, a manifest was read, the newest file
        is within ``GITEA_BUNDLE_MAX_AGE_HOURS``, and every expected
        repository has a bundle file, a checksum file starting with a
        SHA-256-shaped digest, and a status of ``ok`` or ``bundle_file_only``;
        otherwise it is 0.
    """
    now = int(time.time())
    root = GITEA_BUNDLE_ROOT
    manifest_path, manifest_rows = _read_gitea_bundle_manifest(root)
    root_exists = int(root.exists())
    manifest_present = int(manifest_path is not None)
    newest_ts, file_count = _newest_tree_timestamp(root, max_entries=20000)
    # A falsy newest_ts means no timestamp was found; age is then reported as
    # 0 and freshness is forced to 0 by the newest_ts guard below.
    age = now - newest_ts if newest_ts else 0
    bundle_fresh = 1 if newest_ts and age <= int(GITEA_BUNDLE_MAX_AGE_HOURS * 3600) else 0
    root_label = _escape_label(str(root))
    labels = f'host="{_escape_label(host)}",root="{root_label}",max_age_hours="{GITEA_BUNDLE_MAX_AGE_HOURS:g}"'
    lines = [
        f"awoooi_gitea_bundle_root_exists{{{labels}}} {root_exists}",
        f"awoooi_gitea_bundle_manifest_present{{{labels}}} {manifest_present}",
        f"awoooi_gitea_bundle_newest_timestamp{{{labels}}} {newest_ts}",
        f"awoooi_gitea_bundle_age_seconds{{{labels}}} {age}",
        f"awoooi_gitea_bundle_fresh{{{labels}}} {bundle_fresh}",
        f"awoooi_gitea_bundle_file_count{{{labels}}} {file_count}",
    ]

    missing_count = 0
    failed_count = 0
    checksum_missing_count = 0
    expected_repos = _expected_gitea_bundle_repos()
    for repo in expected_repos:
        row = manifest_rows.get(repo, {})
        status = row.get("status", "manifest_row_missing")
        bundle_path = _resolve_bundle_path(root, repo, row.get("bundle", ""))
        checksum_path = _resolve_checksum_path(root, bundle_path, row.get("checksum", ""))
        bundle_present = int(bundle_path.is_file())
        checksum_present = int(checksum_path.is_file())
        checksum_digest_present = _checksum_digest_present(checksum_path) if checksum_present else 0
        # A repository with no manifest status still counts when its bundle
        # and digest are on disk; mark it so the fallback is visible.
        if status == "manifest_row_missing" and bundle_present and checksum_digest_present:
            status = "bundle_file_only"
        try:
            head_count = int(row.get("head_count", "0") or 0)
        except ValueError:
            # Non-numeric head_count in the manifest is reported as 0.
            head_count = 0
        repo_ok = int(
            bundle_present == 1
            and checksum_present == 1
            and checksum_digest_present == 1
            and status in {"ok", "bundle_file_only"}
        )
        if not bundle_present:
            missing_count += 1
        if not checksum_present or not checksum_digest_present:
            checksum_missing_count += 1
        if repo_ok == 0:
            failed_count += 1
        repo_labels = f'host="{_escape_label(host)}",repo="{_escape_label(repo)}"'
        status_labels = (
            f'{repo_labels},status="{_escape_label(status)}",'
            f'bundle="{_escape_label(bundle_path.name)}"'
        )
        lines.extend(
            [
                f"awoooi_gitea_bundle_expected_repo_info{{{repo_labels}}} 1",
                f"awoooi_gitea_bundle_repo_status_info{{{status_labels}}} 1",
                f"awoooi_gitea_bundle_repo_present{{{repo_labels}}} {bundle_present}",
                f"awoooi_gitea_bundle_repo_ok{{{repo_labels}}} {repo_ok}",
                f"awoooi_gitea_bundle_repo_head_count{{{repo_labels}}} {head_count}",
                f"awoooi_gitea_bundle_checksum_present{{{repo_labels}}} {checksum_present}",
                f"awoooi_gitea_bundle_checksum_digest_present{{{repo_labels}}} {checksum_digest_present}",
            ]
        )

    all_expected_ok = int(
        root_exists == 1
        and manifest_present == 1
        and bundle_fresh == 1
        # bool() is required: a bare empty list would make the whole chain
        # evaluate to [] and int([]) raises TypeError.
        and bool(expected_repos)
        and missing_count == 0
        and failed_count == 0
        and checksum_missing_count == 0
    )
    lines.extend(
        [
            f"awoooi_gitea_bundle_expected_repo_count{{{labels}}} {len(expected_repos)}",
            f"awoooi_gitea_bundle_expected_repo_missing_count{{{labels}}} {missing_count}",
            f"awoooi_gitea_bundle_failed_repo_count{{{labels}}} {failed_count}",
            f"awoooi_gitea_bundle_checksum_missing_count{{{labels}}} {checksum_missing_count}",
            f"awoooi_gitea_bundle_all_expected_ok{{{labels}}} {all_expected_ok}",
        ]
    )
    return lines, all_expected_ok
|
||||
|
||||
|
||||
def _read_backup_110_timestamp() -> int:
|
||||
candidates = [
|
||||
Path("/home/ollama/node_exporter_textfiles/backup.prom"),
|
||||
@@ -759,6 +929,42 @@ def _base_lines(host: str) -> list[str]:
|
||||
"# TYPE awoooi_backup_cron_singular_entry_count gauge",
|
||||
"# HELP awoooi_backup_cron_singular_entry_ok Whether a backup/offsite singleton cron pattern has exactly one active entry.",
|
||||
"# TYPE awoooi_backup_cron_singular_entry_ok gauge",
|
||||
"# HELP awoooi_gitea_bundle_root_exists Whether the 188 Gitea private bundle backup root exists.",
|
||||
"# TYPE awoooi_gitea_bundle_root_exists gauge",
|
||||
"# HELP awoooi_gitea_bundle_manifest_present Whether the 188 Gitea private bundle manifest exists.",
|
||||
"# TYPE awoooi_gitea_bundle_manifest_present gauge",
|
||||
"# HELP awoooi_gitea_bundle_newest_timestamp Unix timestamp of newest file under the 188 Gitea private bundle backup root.",
|
||||
"# TYPE awoooi_gitea_bundle_newest_timestamp gauge",
|
||||
"# HELP awoooi_gitea_bundle_age_seconds Age of newest 188 Gitea private bundle backup evidence.",
|
||||
"# TYPE awoooi_gitea_bundle_age_seconds gauge",
|
||||
"# HELP awoooi_gitea_bundle_fresh Whether the 188 Gitea private bundle backup evidence is fresh.",
|
||||
"# TYPE awoooi_gitea_bundle_fresh gauge",
|
||||
"# HELP awoooi_gitea_bundle_file_count Number of files scanned under the 188 Gitea private bundle backup root.",
|
||||
"# TYPE awoooi_gitea_bundle_file_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_expected_repo_info Expected Gitea repository that must have a private bundle backup.",
|
||||
"# TYPE awoooi_gitea_bundle_expected_repo_info gauge",
|
||||
"# HELP awoooi_gitea_bundle_repo_status_info Manifest or fallback status for an expected Gitea private bundle backup.",
|
||||
"# TYPE awoooi_gitea_bundle_repo_status_info gauge",
|
||||
"# HELP awoooi_gitea_bundle_repo_present Whether an expected Gitea repository bundle file exists.",
|
||||
"# TYPE awoooi_gitea_bundle_repo_present gauge",
|
||||
"# HELP awoooi_gitea_bundle_repo_ok Whether an expected Gitea repository has a present bundle and checksum evidence.",
|
||||
"# TYPE awoooi_gitea_bundle_repo_ok gauge",
|
||||
"# HELP awoooi_gitea_bundle_repo_head_count Head count recorded in the Gitea private bundle manifest.",
|
||||
"# TYPE awoooi_gitea_bundle_repo_head_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_checksum_present Whether checksum evidence exists for an expected Gitea private bundle.",
|
||||
"# TYPE awoooi_gitea_bundle_checksum_present gauge",
|
||||
"# HELP awoooi_gitea_bundle_checksum_digest_present Whether checksum evidence contains a SHA-256 digest.",
|
||||
"# TYPE awoooi_gitea_bundle_checksum_digest_present gauge",
|
||||
"# HELP awoooi_gitea_bundle_expected_repo_count Number of expected Gitea private bundle repositories.",
|
||||
"# TYPE awoooi_gitea_bundle_expected_repo_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_expected_repo_missing_count Number of expected Gitea private bundle repositories with no bundle file.",
|
||||
"# TYPE awoooi_gitea_bundle_expected_repo_missing_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_failed_repo_count Number of expected Gitea private bundle repositories without complete bundle/checksum evidence.",
|
||||
"# TYPE awoooi_gitea_bundle_failed_repo_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_checksum_missing_count Number of expected Gitea private bundle repositories without checksum evidence.",
|
||||
"# TYPE awoooi_gitea_bundle_checksum_missing_count gauge",
|
||||
"# HELP awoooi_gitea_bundle_all_expected_ok Whether all expected Gitea private bundle repositories are present and fresh.",
|
||||
"# TYPE awoooi_gitea_bundle_all_expected_ok gauge",
|
||||
"# HELP awoooi_velero_monitor_up Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.",
|
||||
"# TYPE awoooi_velero_monitor_up gauge",
|
||||
"# HELP awoooi_velero_schedule_count Number of Velero schedules in the velero namespace.",
|
||||
@@ -939,15 +1145,17 @@ def _collect_188(host: str) -> list[str]:
|
||||
sample_count=gitea_mirror_count,
|
||||
)
|
||||
)
|
||||
gitea_bundle_lines, gitea_bundle_ok = _gitea_bundle_metric_lines(host)
|
||||
lines.extend(gitea_bundle_lines)
|
||||
coverage_labels = (
|
||||
f'host="{_escape_label(host)}",'
|
||||
'domain="service",'
|
||||
'required_jobs="backup_from_110,gitea_repo_mirror_from_110"'
|
||||
'required_jobs="backup_from_110,gitea_repo_mirror_from_110,gitea_private_bundle_completeness"'
|
||||
)
|
||||
lines.append(f"awoooi_backup_coverage_domain_expected_info{{{coverage_labels}}} 1")
|
||||
lines.append(
|
||||
"awoooi_backup_coverage_domain_fresh"
|
||||
f"{{{coverage_labels}}} {1 if gitea_mirror_fresh else 0}"
|
||||
f"{{{coverage_labels}}} {1 if gitea_mirror_fresh and gitea_bundle_ok else 0}"
|
||||
)
|
||||
momo_ts = _newest_file_timestamp([
|
||||
"/home/ollama/momo_backups/*.sql.gz",
|
||||
|
||||
86
scripts/ops/tests/test_backup_health_textfile_exporter.py
Normal file
86
scripts/ops/tests/test_backup_health_textfile_exporter.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
SCRIPT_ROOT = Path(__file__).resolve().parents[1]
|
||||
EXPORTER_PATH = SCRIPT_ROOT / "backup-health-textfile-exporter.py"
|
||||
|
||||
|
||||
def load_exporter():
    """Import the exporter script from ``EXPORTER_PATH`` and return the module.

    The script's filename contains hyphens, so it cannot be imported with a
    normal ``import`` statement; it is loaded from its file path and
    registered in ``sys.modules`` under ``backup_health_textfile_exporter``
    before its body is executed.
    """
    module_spec = importlib.util.spec_from_file_location("backup_health_textfile_exporter", EXPORTER_PATH)
    assert module_spec and module_spec.loader
    exporter_module = importlib.util.module_from_spec(module_spec)
    sys.modules[module_spec.name] = exporter_module
    module_spec.loader.exec_module(exporter_module)
    return exporter_module
|
||||
|
||||
|
||||
def test_gitea_bundle_metrics_require_all_expected_repos(tmp_path: Path, monkeypatch) -> None:
    """Two expected repos are both covered: one via the manifest, one via files only.

    ``wooo/awoooi`` is listed in the manifest with bundle and checksum.
    ``wooo/tsenyang-website`` has no manifest row, but a ``*-local-*`` bundle
    and checksum exist on disk, so it must be reported as
    ``bundle_file_only`` and the overall result must still be OK.
    """
    exporter = load_exporter()
    bundle_root = tmp_path / "latest-private-complete"
    bundle_root.mkdir()
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root)
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_MAX_AGE_HOURS", 25)
    monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi,wooo/tsenyang-website")

    listed_bundle = bundle_root / "wooo__awoooi.bundle"
    listed_checksum = bundle_root / "wooo__awoooi.bundle.sha256"
    manifest_text = (
        "repo\tstatus\thead_count\tbundle\tchecksum\n"
        f"wooo/awoooi\tok\t2\t{listed_bundle}\t{listed_checksum}\n"
    )
    (bundle_root / "manifest.remote.tsv").write_text(manifest_text, encoding="utf-8")
    listed_bundle.write_text("bundle", encoding="utf-8")
    listed_checksum.write_text("a" * 64 + " wooo__awoooi.bundle\n", encoding="utf-8")

    # No manifest row for this repository: only the files on disk vouch for it.
    unlisted_bundle = bundle_root / "tsenyang-website-local-20260701.bundle"
    unlisted_bundle.write_text("bundle", encoding="utf-8")
    (bundle_root / "tsenyang-website-local-20260701.bundle.sha256").write_text(
        "b" * 64 + " tsenyang-website-local-20260701.bundle\n",
        encoding="utf-8",
    )

    metrics, all_ok = exporter._gitea_bundle_metric_lines("188")

    rendered = "\n".join(metrics)
    assert all_ok == 1
    expected_fragments = (
        'awoooi_gitea_bundle_expected_repo_count{host="188"',
        'awoooi_gitea_bundle_expected_repo_missing_count{host="188"',
        'awoooi_gitea_bundle_failed_repo_count{host="188"',
        'repo="wooo/tsenyang-website",status="bundle_file_only"',
        'awoooi_gitea_bundle_all_expected_ok{host="188"',
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    # The final sample is all_expected_ok, so the output must end in " 1".
    assert rendered.rstrip().endswith(" 1")
|
||||
|
||||
|
||||
def test_gitea_bundle_metrics_fail_when_checksum_missing(tmp_path: Path, monkeypatch) -> None:
    """A bundle listed as ``ok`` but lacking its checksum file must fail coverage."""
    exporter = load_exporter()
    bundle_root = tmp_path / "latest-private-complete"
    bundle_root.mkdir()
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root)
    monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi")

    listed_bundle = bundle_root / "wooo__awoooi.bundle"
    # The manifest references a checksum path that is deliberately never created.
    absent_checksum = bundle_root / "wooo__awoooi.bundle.sha256"
    manifest_text = (
        "repo\tstatus\thead_count\tbundle\tchecksum\n"
        f"wooo/awoooi\tok\t2\t{listed_bundle}\t{absent_checksum}\n"
    )
    (bundle_root / "manifest.remote.tsv").write_text(manifest_text, encoding="utf-8")
    listed_bundle.write_text("bundle", encoding="utf-8")

    metrics, all_ok = exporter._gitea_bundle_metric_lines("188")

    rendered = "\n".join(metrics)
    assert all_ok == 0
    assert 'awoooi_gitea_bundle_checksum_missing_count{host="188"' in rendered
    # The final sample is all_expected_ok, so the output must end in " 0".
    assert rendered.rstrip().endswith(" 0")
|
||||
Reference in New Issue
Block a user