fix(backup): monitor gitea private bundle coverage
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 33s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped

This commit is contained in:
Your Name
2026-07-01 15:23:21 +08:00
parent 4a128b3f48
commit b12cbc2b78
5 changed files with 332 additions and 2 deletions

View File

@@ -76,6 +76,8 @@ if [ "${#REPOS[@]}" -eq 0 ]; then
"wooo/AwoooGo"
"wooo/stockplatform-v2"
"wooo/vibework"
"wooo/momo-pro-system"
"wooo/tsenyang-website"
)
fi

View File

@@ -25,11 +25,26 @@ TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_
# File name of the generated metrics file (presumably written under TEXTFILE_DIR — confirm at the write site).
OUTPUT_NAME = "backup_health.prom"
# Host label value; AIOPS_HOST_LABEL overrides the node name reported by os.uname().
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Matches a double quote, a backslash, or a newline.
# NOTE(review): presumably the characters _escape_label has to escape in label values — confirm there.
LABEL_RE = re.compile(r'["\\\n]')
# Directory inspected for Gitea private bundle backups; override with AIOPS_GITEA_BUNDLE_ROOT.
GITEA_BUNDLE_ROOT = Path(
os.environ.get("AIOPS_GITEA_BUNDLE_ROOT", "/home/ollama/backup/110/gitea/git-bundles/latest-private-complete")
)
# Freshness threshold in hours for the newest file under GITEA_BUNDLE_ROOT.
# Parsed with float() at import time, so a non-numeric override raises ValueError on import.
GITEA_BUNDLE_MAX_AGE_HOURS = float(os.environ.get("AIOPS_GITEA_BUNDLE_MAX_AGE_HOURS", "25"))
# Backup script, offsite, escrow and config-capture locations; each is overridable
# through the environment variable named in its own lookup.
BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh"))
BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env"))
OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite"))
ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence"))
CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json"))
# Repositories expected to have a bundle when AIOPS_GITEA_BUNDLE_EXPECTED_REPOS is
# unset or contains no entries.
# NOTE(review): appears to mirror the REPOS list of the bundle backup shell script — keep both in sync.
DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS = [
"wooo/awoooi",
"wooo/ewoooc",
"wooo/2026FIFAWorldCup",
"wooo/agent-bounty-protocol",
"wooo/AwoooGo",
"wooo/stockplatform-v2",
"wooo/vibework",
"wooo/momo-pro-system",
"wooo/tsenyang-website",
]
ESCROW_ITEMS = [
"restic_repository_password",
"offsite_provider_credentials",
@@ -209,6 +224,161 @@ def _newest_tree_timestamp(root: Path, max_entries: int = 5000) -> tuple[int, in
return newest, count
def _expected_gitea_bundle_repos() -> list[str]:
    """Return the repositories that must have a Gitea private bundle.

    The list comes from the comma-separated ``AIOPS_GITEA_BUNDLE_EXPECTED_REPOS``
    environment variable. Entries are stripped, blank entries are dropped and
    repeated entries are collapsed to their first occurrence. When the variable
    is unset or yields no entries, a copy of
    ``DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS`` is returned.

    Returns:
        A new list of unique repository names in first-seen order.
    """
    configured = os.environ.get("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "")
    stripped = (item.strip() for item in configured.split(","))
    # One sample set is emitted per expected repo, so a repeated entry would
    # produce duplicate series and be counted twice. dict.fromkeys drops
    # repeats while keeping the configured order.
    repos = list(dict.fromkeys(item for item in stripped if item))
    # Hand out a copy so callers cannot mutate the module-level default.
    return repos or list(DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS)
def _gitea_bundle_slug(repo: str) -> str:
    """Return *repo* with every ``/`` replaced by ``__``.

    Example: ``"wooo/awoooi"`` becomes ``"wooo__awoooi"``.
    """
    return "__".join(repo.split("/"))
def _read_gitea_bundle_manifest(root: Path) -> tuple[Path | None, dict[str, dict[str, str]]]:
for manifest_path in [root / "manifest.remote.tsv", root / "manifest.tsv"]:
try:
rows = manifest_path.read_text(encoding="utf-8", errors="replace").splitlines()
except OSError:
continue
if not rows:
return manifest_path, {}
header = rows[0].split("\t")
parsed: dict[str, dict[str, str]] = {}
for line in rows[1:]:
if not line.strip():
continue
values = line.split("\t")
row = {key: values[index] if index < len(values) else "" for index, key in enumerate(header)}
repo = row.get("repo", "").strip()
if repo:
parsed[repo] = row
return manifest_path, parsed
return None, {}
def _resolve_bundle_path(root: Path, repo: str, manifest_value: str) -> Path:
    """Choose the bundle path to inspect for *repo*.

    The first existing path in this order wins:

    1. ``manifest_value`` exactly as given (skipped when empty);
    2. the file name of ``manifest_value`` placed under *root*;
    3. ``root/<repo with "/" replaced by "__">.bundle``;
    4. the ``root/<last path segment of repo>-local-*.bundle`` match with the
       greatest modification time.

    When none exists, the path from step 3 is returned even though it is absent,
    so the caller always receives a concrete path.
    """
    slug_path = root / (repo.replace("/", "__") + ".bundle")

    if manifest_value:
        recorded = Path(manifest_value)
        for option in (recorded, root / recorded.name):
            if option.exists():
                return option

    if slug_path.exists():
        return slug_path

    short_name = repo.rpartition("/")[2]
    return max(
        root.glob(short_name + "-local-*.bundle"),
        key=lambda entry: entry.stat().st_mtime,
        default=slug_path,
    )
def _resolve_checksum_path(root: Path, bundle_path: Path, manifest_value: str) -> Path:
    """Choose the checksum file path that belongs to *bundle_path*.

    A non-empty ``manifest_value`` is tried first exactly as given, then by its
    file name under *root*; the first of those that exists is returned.
    Otherwise the result is ``<bundle file name>.sha256`` next to the bundle,
    whether or not that file exists.
    """
    if manifest_value:
        recorded = Path(manifest_value)
        existing = next(
            (option for option in (recorded, root / recorded.name) if option.exists()),
            None,
        )
        if existing is not None:
            return existing
    return bundle_path.with_name(bundle_path.name + ".sha256")
def _checksum_digest_present(path: Path) -> int:
    """Return 1 when the file at *path* starts with a SHA-256 hex digest, else 0.

    Only the first whitespace-separated token is examined; it must be exactly
    64 hexadecimal characters. An unreadable, empty or whitespace-only file
    yields 0. The digest is not compared against any bundle content.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0

    tokens = content.split(None, 1)
    if not tokens:
        return 0
    return int(re.fullmatch(r"[0-9a-fA-F]{64}", tokens[0]) is not None)
def _gitea_bundle_metric_lines(host: str) -> tuple[list[str], int]:
    """Render the Gitea private bundle coverage samples for *host*.

    Summary samples (labels ``host``, ``root``, ``max_age_hours``) describe the
    bundle root as a whole. For every repository returned by
    ``_expected_gitea_bundle_repos()`` a set of per-repo samples (labels
    ``host``, ``repo``) describes its bundle file, checksum file and manifest
    status.

    A repository counts as ok when its bundle file and checksum file exist, the
    checksum file starts with a SHA-256 digest, and its status is ``ok`` or
    ``bundle_file_only``. The latter is assigned here when the manifest carries
    no status for the repository but bundle and digest were found on disk.

    Args:
        host: Value of the ``host`` label on every sample.

    Returns:
        ``(lines, all_expected_ok)``. ``all_expected_ok`` is 1 only when the
        root exists, a manifest was read, the newest file is within
        ``GITEA_BUNDLE_MAX_AGE_HOURS``, at least one repository is expected and
        no repository is missing, failed or lacking checksum evidence;
        otherwise 0. The same value is the last sample in ``lines``.
    """
    now = int(time.time())
    root = GITEA_BUNDLE_ROOT
    manifest_path, manifest_rows = _read_gitea_bundle_manifest(root)
    root_exists = int(root.exists())
    manifest_present = int(manifest_path is not None)
    newest_ts, file_count = _newest_tree_timestamp(root, max_entries=20000)
    # A zero timestamp means nothing was found; report age 0 and not fresh.
    age = now - newest_ts if newest_ts else 0
    bundle_fresh = 1 if newest_ts and age <= int(GITEA_BUNDLE_MAX_AGE_HOURS * 3600) else 0

    host_label = _escape_label(host)
    root_label = _escape_label(str(root))
    labels = f'host="{host_label}",root="{root_label}",max_age_hours="{GITEA_BUNDLE_MAX_AGE_HOURS:g}"'
    lines = [
        f"awoooi_gitea_bundle_root_exists{{{labels}}} {root_exists}",
        f"awoooi_gitea_bundle_manifest_present{{{labels}}} {manifest_present}",
        f"awoooi_gitea_bundle_newest_timestamp{{{labels}}} {newest_ts}",
        f"awoooi_gitea_bundle_age_seconds{{{labels}}} {age}",
        f"awoooi_gitea_bundle_fresh{{{labels}}} {bundle_fresh}",
        f"awoooi_gitea_bundle_file_count{{{labels}}} {file_count}",
    ]

    missing_count = 0
    failed_count = 0
    checksum_missing_count = 0
    expected_repos = _expected_gitea_bundle_repos()
    for repo in expected_repos:
        row = manifest_rows.get(repo, {})
        status = row.get("status", "manifest_row_missing")
        bundle_path = _resolve_bundle_path(root, repo, row.get("bundle", ""))
        checksum_path = _resolve_checksum_path(root, bundle_path, row.get("checksum", ""))
        bundle_present = int(bundle_path.is_file())
        checksum_present = int(checksum_path.is_file())
        checksum_digest_present = _checksum_digest_present(checksum_path) if checksum_present else 0
        # Files on disk without a manifest status are accepted as evidence,
        # but reported under a distinct status.
        if status == "manifest_row_missing" and bundle_present and checksum_digest_present:
            status = "bundle_file_only"
        try:
            head_count = int(row.get("head_count", "0") or 0)
        except ValueError:
            # A non-numeric head_count is reported as 0 rather than failing the run.
            head_count = 0
        repo_ok = int(
            bundle_present == 1
            and checksum_present == 1
            and checksum_digest_present == 1
            and status in {"ok", "bundle_file_only"}
        )
        if not bundle_present:
            missing_count += 1
        if not checksum_present or not checksum_digest_present:
            checksum_missing_count += 1
        if repo_ok == 0:
            failed_count += 1

        repo_labels = f'host="{host_label}",repo="{_escape_label(repo)}"'
        status_labels = (
            f'{repo_labels},status="{_escape_label(status)}",'
            f'bundle="{_escape_label(bundle_path.name)}"'
        )
        lines.extend(
            [
                f"awoooi_gitea_bundle_expected_repo_info{{{repo_labels}}} 1",
                f"awoooi_gitea_bundle_repo_status_info{{{status_labels}}} 1",
                f"awoooi_gitea_bundle_repo_present{{{repo_labels}}} {bundle_present}",
                f"awoooi_gitea_bundle_repo_ok{{{repo_labels}}} {repo_ok}",
                f"awoooi_gitea_bundle_repo_head_count{{{repo_labels}}} {head_count}",
                f"awoooi_gitea_bundle_checksum_present{{{repo_labels}}} {checksum_present}",
                f"awoooi_gitea_bundle_checksum_digest_present{{{repo_labels}}} {checksum_digest_present}",
            ]
        )

    all_expected_ok = int(
        root_exists == 1
        and manifest_present == 1
        and bundle_fresh == 1
        # bool() matters: a bare empty list would become the value of the whole
        # `and` chain and int([]) raises TypeError instead of giving 0.
        and bool(expected_repos)
        and missing_count == 0
        and failed_count == 0
        and checksum_missing_count == 0
    )
    lines.extend(
        [
            f"awoooi_gitea_bundle_expected_repo_count{{{labels}}} {len(expected_repos)}",
            f"awoooi_gitea_bundle_expected_repo_missing_count{{{labels}}} {missing_count}",
            f"awoooi_gitea_bundle_failed_repo_count{{{labels}}} {failed_count}",
            f"awoooi_gitea_bundle_checksum_missing_count{{{labels}}} {checksum_missing_count}",
            f"awoooi_gitea_bundle_all_expected_ok{{{labels}}} {all_expected_ok}",
        ]
    )
    return lines, all_expected_ok
def _read_backup_110_timestamp() -> int:
candidates = [
Path("/home/ollama/node_exporter_textfiles/backup.prom"),
@@ -759,6 +929,42 @@ def _base_lines(host: str) -> list[str]:
"# TYPE awoooi_backup_cron_singular_entry_count gauge",
"# HELP awoooi_backup_cron_singular_entry_ok Whether a backup/offsite singleton cron pattern has exactly one active entry.",
"# TYPE awoooi_backup_cron_singular_entry_ok gauge",
"# HELP awoooi_gitea_bundle_root_exists Whether the 188 Gitea private bundle backup root exists.",
"# TYPE awoooi_gitea_bundle_root_exists gauge",
"# HELP awoooi_gitea_bundle_manifest_present Whether the 188 Gitea private bundle manifest exists.",
"# TYPE awoooi_gitea_bundle_manifest_present gauge",
"# HELP awoooi_gitea_bundle_newest_timestamp Unix timestamp of newest file under the 188 Gitea private bundle backup root.",
"# TYPE awoooi_gitea_bundle_newest_timestamp gauge",
"# HELP awoooi_gitea_bundle_age_seconds Age of newest 188 Gitea private bundle backup evidence.",
"# TYPE awoooi_gitea_bundle_age_seconds gauge",
"# HELP awoooi_gitea_bundle_fresh Whether the 188 Gitea private bundle backup evidence is fresh.",
"# TYPE awoooi_gitea_bundle_fresh gauge",
"# HELP awoooi_gitea_bundle_file_count Number of files scanned under the 188 Gitea private bundle backup root.",
"# TYPE awoooi_gitea_bundle_file_count gauge",
"# HELP awoooi_gitea_bundle_expected_repo_info Expected Gitea repository that must have a private bundle backup.",
"# TYPE awoooi_gitea_bundle_expected_repo_info gauge",
"# HELP awoooi_gitea_bundle_repo_status_info Manifest or fallback status for an expected Gitea private bundle backup.",
"# TYPE awoooi_gitea_bundle_repo_status_info gauge",
"# HELP awoooi_gitea_bundle_repo_present Whether an expected Gitea repository bundle file exists.",
"# TYPE awoooi_gitea_bundle_repo_present gauge",
"# HELP awoooi_gitea_bundle_repo_ok Whether an expected Gitea repository has a present bundle and checksum evidence.",
"# TYPE awoooi_gitea_bundle_repo_ok gauge",
"# HELP awoooi_gitea_bundle_repo_head_count Head count recorded in the Gitea private bundle manifest.",
"# TYPE awoooi_gitea_bundle_repo_head_count gauge",
"# HELP awoooi_gitea_bundle_checksum_present Whether checksum evidence exists for an expected Gitea private bundle.",
"# TYPE awoooi_gitea_bundle_checksum_present gauge",
"# HELP awoooi_gitea_bundle_checksum_digest_present Whether checksum evidence contains a SHA-256 digest.",
"# TYPE awoooi_gitea_bundle_checksum_digest_present gauge",
"# HELP awoooi_gitea_bundle_expected_repo_count Number of expected Gitea private bundle repositories.",
"# TYPE awoooi_gitea_bundle_expected_repo_count gauge",
"# HELP awoooi_gitea_bundle_expected_repo_missing_count Number of expected Gitea private bundle repositories with no bundle file.",
"# TYPE awoooi_gitea_bundle_expected_repo_missing_count gauge",
"# HELP awoooi_gitea_bundle_failed_repo_count Number of expected Gitea private bundle repositories without complete bundle/checksum evidence.",
"# TYPE awoooi_gitea_bundle_failed_repo_count gauge",
"# HELP awoooi_gitea_bundle_checksum_missing_count Number of expected Gitea private bundle repositories without checksum evidence.",
"# TYPE awoooi_gitea_bundle_checksum_missing_count gauge",
"# HELP awoooi_gitea_bundle_all_expected_ok Whether all expected Gitea private bundle repositories are present and fresh.",
"# TYPE awoooi_gitea_bundle_all_expected_ok gauge",
"# HELP awoooi_velero_monitor_up Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.",
"# TYPE awoooi_velero_monitor_up gauge",
"# HELP awoooi_velero_schedule_count Number of Velero schedules in the velero namespace.",
@@ -939,15 +1145,17 @@ def _collect_188(host: str) -> list[str]:
sample_count=gitea_mirror_count,
)
)
gitea_bundle_lines, gitea_bundle_ok = _gitea_bundle_metric_lines(host)
lines.extend(gitea_bundle_lines)
coverage_labels = (
f'host="{_escape_label(host)}",'
'domain="service",'
'required_jobs="backup_from_110,gitea_repo_mirror_from_110"'
'required_jobs="backup_from_110,gitea_repo_mirror_from_110,gitea_private_bundle_completeness"'
)
lines.append(f"awoooi_backup_coverage_domain_expected_info{{{coverage_labels}}} 1")
lines.append(
"awoooi_backup_coverage_domain_fresh"
f"{{{coverage_labels}}} {1 if gitea_mirror_fresh else 0}"
f"{{{coverage_labels}}} {1 if gitea_mirror_fresh and gitea_bundle_ok else 0}"
)
momo_ts = _newest_file_timestamp([
"/home/ollama/momo_backups/*.sql.gz",

View File

@@ -0,0 +1,86 @@
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
# Parent of the directory that contains this test file.
SCRIPT_ROOT = Path(__file__).resolve().parents[1]
# Location of the exporter script; its hyphenated file name cannot be used in
# an import statement, so load_exporter() loads it by path.
EXPORTER_PATH = SCRIPT_ROOT / "backup-health-textfile-exporter.py"
def load_exporter():
    """Execute the exporter script at ``EXPORTER_PATH`` and return it as a module.

    Every call runs the script again and stores the resulting module in
    ``sys.modules`` under ``backup_health_textfile_exporter``, replacing any
    earlier entry, before the script body executes.
    """
    module_spec = importlib.util.spec_from_file_location(
        name="backup_health_textfile_exporter",
        location=EXPORTER_PATH,
    )
    assert module_spec and module_spec.loader
    exporter_module = importlib.util.module_from_spec(module_spec)
    sys.modules[module_spec.name] = exporter_module
    module_spec.loader.exec_module(exporter_module)
    return exporter_module
def test_gitea_bundle_metrics_require_all_expected_repos(tmp_path: Path, monkeypatch) -> None:
    """All-ok case: one repo is listed in the manifest, the other is found only on disk.

    ``wooo/awoooi`` has a manifest row with status ``ok``; ``wooo/tsenyang-website``
    has no row and is resolved through the ``<name>-local-*.bundle`` fallback,
    which must be reported as ``bundle_file_only`` and still count as ok.
    """
    exporter = load_exporter()
    bundle_root = tmp_path / "latest-private-complete"
    bundle_root.mkdir()
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root)
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_MAX_AGE_HOURS", 25)
    monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi,wooo/tsenyang-website")
    (bundle_root / "manifest.remote.tsv").write_text(
        "\n".join(
            [
                "repo\tstatus\thead_count\tbundle\tchecksum",
                f"wooo/awoooi\tok\t2\t{bundle_root / 'wooo__awoooi.bundle'}\t{bundle_root / 'wooo__awoooi.bundle.sha256'}",
            ]
        )
        + "\n",
        encoding="utf-8",
    )
    (bundle_root / "wooo__awoooi.bundle").write_text("bundle", encoding="utf-8")
    (bundle_root / "wooo__awoooi.bundle.sha256").write_text(
        "a" * 64 + " wooo__awoooi.bundle\n",
        encoding="utf-8",
    )
    (bundle_root / "tsenyang-website-local-20260701.bundle").write_text("bundle", encoding="utf-8")
    (bundle_root / "tsenyang-website-local-20260701.bundle.sha256").write_text(
        "b" * 64 + " tsenyang-website-local-20260701.bundle\n",
        encoding="utf-8",
    )

    metrics, all_ok = exporter._gitea_bundle_metric_lines("188")
    rendered = "\n".join(metrics)
    # Only the summary samples carry a root label; index their values by metric
    # name so the assertions pin values rather than mere presence.
    summary = {
        line.split("{", 1)[0]: line.rsplit(" ", 1)[-1]
        for line in metrics
        if ',root="' in line
    }

    assert all_ok == 1
    assert 'awoooi_gitea_bundle_expected_repo_count{host="188"' in rendered
    assert 'awoooi_gitea_bundle_expected_repo_missing_count{host="188"' in rendered
    assert 'awoooi_gitea_bundle_failed_repo_count{host="188"' in rendered
    assert summary["awoooi_gitea_bundle_expected_repo_count"] == "2"
    assert summary["awoooi_gitea_bundle_expected_repo_missing_count"] == "0"
    assert summary["awoooi_gitea_bundle_failed_repo_count"] == "0"
    assert summary["awoooi_gitea_bundle_checksum_missing_count"] == "0"
    assert 'repo="wooo/tsenyang-website",status="bundle_file_only"' in rendered
    assert 'awoooi_gitea_bundle_all_expected_ok{host="188"' in rendered
    assert summary["awoooi_gitea_bundle_all_expected_ok"] == "1"
    assert rendered.rstrip().endswith(" 1")
def test_gitea_bundle_metrics_fail_when_checksum_missing(tmp_path: Path, monkeypatch) -> None:
    """A bundle without its checksum file must fail coverage, and for that reason only.

    The bundle file and an ``ok`` manifest row are present and fresh, so the
    assertions check that the bundle is counted as present while the checksum
    is counted as missing.
    """
    exporter = load_exporter()
    bundle_root = tmp_path / "latest-private-complete"
    bundle_root.mkdir()
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root)
    # Pin the threshold so an ambient AIOPS_GITEA_BUNDLE_MAX_AGE_HOURS cannot
    # change the freshness outcome of this test.
    monkeypatch.setattr(exporter, "GITEA_BUNDLE_MAX_AGE_HOURS", 25)
    monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi")
    (bundle_root / "manifest.remote.tsv").write_text(
        "\n".join(
            [
                "repo\tstatus\thead_count\tbundle\tchecksum",
                f"wooo/awoooi\tok\t2\t{bundle_root / 'wooo__awoooi.bundle'}\t{bundle_root / 'wooo__awoooi.bundle.sha256'}",
            ]
        )
        + "\n",
        encoding="utf-8",
    )
    (bundle_root / "wooo__awoooi.bundle").write_text("bundle", encoding="utf-8")

    metrics, all_ok = exporter._gitea_bundle_metric_lines("188")
    rendered = "\n".join(metrics)
    # Only the summary samples carry a root label; index their values by metric name.
    summary = {
        line.split("{", 1)[0]: line.rsplit(" ", 1)[-1]
        for line in metrics
        if ',root="' in line
    }

    assert all_ok == 0
    assert 'awoooi_gitea_bundle_checksum_missing_count{host="188"' in rendered
    assert summary["awoooi_gitea_bundle_expected_repo_count"] == "1"
    assert summary["awoooi_gitea_bundle_expected_repo_missing_count"] == "0"
    assert summary["awoooi_gitea_bundle_checksum_missing_count"] == "1"
    assert summary["awoooi_gitea_bundle_failed_repo_count"] == "1"
    assert 'awoooi_gitea_bundle_repo_present{host="188",repo="wooo/awoooi"} 1' in rendered
    assert 'awoooi_gitea_bundle_checksum_present{host="188",repo="wooo/awoooi"} 0' in rendered
    assert 'awoooi_gitea_bundle_repo_ok{host="188",repo="wooo/awoooi"} 0' in rendered
    assert summary["awoooi_gitea_bundle_all_expected_ok"] == "0"
    assert rendered.rstrip().endswith(" 0")