Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m55s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
1210 lines
55 KiB
Python
Executable File
1210 lines
55 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Backup health textfile exporter for full-stack reboot readiness.

2026-05-06 ogt + Codex: backup coverage follow-up after the reboot incident.
Why: a green service gate is not enough if the last restorable copy is stale.
This exporter is read-only; it checks cron/script presence and the latest
successful backup evidence, then writes node-exporter textfile metrics.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import shlex
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory for the textfile metrics (the module docstring calls them
# node-exporter textfile metrics); overridable through the environment.
TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_exporter/textfile_collector"))
# Name of the metrics file; presumably created inside TEXTFILE_DIR — the
# writer is not in this part of the file, confirm there.
OUTPUT_NAME = "backup_health.prom"
# Value used for the `host` label; defaults to this machine's nodename.
HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename)
# Matches the characters that get escaped inside label values:
# double quote, backslash and newline.
LABEL_RE = re.compile(r'["\\\n]')
# Directory holding the Gitea git bundles together with their manifest and
# checksum files.
GITEA_BUNDLE_ROOT = Path(
    os.environ.get("AIOPS_GITEA_BUNDLE_ROOT", "/home/ollama/backup/110/gitea/git-bundles/latest-private-complete")
)
# Age limit, in hours, of the newest file under GITEA_BUNDLE_ROOT before the
# bundle set is reported as not fresh.
GITEA_BUNDLE_MAX_AGE_HOURS = float(os.environ.get("AIOPS_GITEA_BUNDLE_MAX_AGE_HOURS", "25"))
# Shell files that are parsed (never executed) for KEY=value settings;
# offsite.env is consulted before common.sh.
BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh"))
BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env"))
# Directory containing the off-site "last success" marker files and the
# `enable-rclone-sync` marker.
OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite"))
# Directory containing one verification marker per ESCROW_ITEMS entry.
ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence"))
# JSON status document read by the config-capture metrics.
CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json"))
# Repositories expected to have a bundle when
# AIOPS_GITEA_BUNDLE_EXPECTED_REPOS is unset or empty.
DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS = [
    "wooo/awoooi",
    "wooo/ewoooc",
    "wooo/2026FIFAWorldCup",
    "wooo/agent-bounty-protocol",
    "wooo/AwoooGo",
    "wooo/stockplatform-v2",
    "wooo/vibework",
    "wooo/momo-pro-system",
    "wooo/tsenyang-website",
]
# Credential-escrow items; each one needs a fresh marker in
# ESCROW_EVIDENCE_DIR to count as verified.
ESCROW_ITEMS = [
    "restic_repository_password",
    "offsite_provider_credentials",
    "break_glass_admin_credentials",
    "dns_registrar_recovery",
    "oauth_ai_provider_recovery",
]
|
|
|
|
|
|
def _escape_label(value: str) -> str:
    r"""Escape *value* for use inside a double-quoted metric label value.

    A backslash becomes ``\\``, a double quote becomes ``\"`` and a newline
    becomes the two characters ``\n``; everything else is left untouched.
    """
    # Backslashes go first so the ones inserted by the next two steps are
    # not doubled again.
    escaped = value.replace("\\", "\\\\")
    escaped = escaped.replace('"', '\\"')
    return escaped.replace("\n", "\\n")
|
|
|
|
|
|
def _run(command: list[str], timeout: int = 30) -> tuple[int, str, str]:
    """Run *command* without a shell and return ``(returncode, stdout, stderr)``.

    Spawn failures and timeouts are mapped to shell-style return codes
    instead of raising, so one missing or broken tool cannot abort the
    whole exporter run:

    * ``127`` - the executable was not found,
    * ``126`` - the executable exists but could not be started
      (permission denied or another ``OSError`` while spawning),
    * ``124`` - the command exceeded *timeout* seconds.
    """
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, check=False)
    except FileNotFoundError as exc:
        return 127, "", str(exc)
    except OSError as exc:
        # PermissionError, NotADirectoryError, exec format errors, ...
        return 126, "", str(exc)
    except subprocess.TimeoutExpired as exc:
        # Partial output is only passed on when it is already text.
        stdout = exc.stdout if isinstance(exc.stdout, str) else ""
        stderr = exc.stderr if isinstance(exc.stderr, str) else "timeout"
        return 124, stdout, stderr
    return result.returncode, result.stdout, result.stderr
|
|
|
|
|
|
def _parse_time(value: str) -> int:
    """Convert an ISO-8601 / RFC 3339 timestamp to Unix epoch seconds.

    Returns ``0`` for empty or unparseable input.  A trailing ``Z`` is read
    as UTC, and the fractional part is normalised to exactly six digits so
    that both nanosecond precision and short fractions such as ``.5`` are
    accepted by ``datetime.fromisoformat`` on every supported Python
    version.  Values without a UTC offset are interpreted in the local
    timezone (``astimezone`` behaviour for naive datetimes).
    """
    text = value.strip() if value else ""
    if not text:
        return 0
    if text[-1] in "Zz":
        text = f"{text[:-1]}+00:00"
    # Pad or truncate the fraction to microseconds.
    text = re.sub(r"\.(\d+)", lambda m: "." + m.group(1)[:6].ljust(6, "0"), text, count=1)
    try:
        return int(datetime.fromisoformat(text).astimezone(timezone.utc).timestamp())
    except (ValueError, OverflowError, OSError):
        # Malformed text, or a date outside the range the platform can convert.
        return 0
|
|
|
|
|
|
def _parse_marker_timestamp(text: str) -> int:
    """Extract a Unix timestamp from the content of a marker file.

    A standalone 10-digit number anywhere in *text* wins.  Otherwise each
    line is tried as an ISO timestamp via ``_parse_time`` and the first
    non-zero result is returned.  ``0`` means nothing usable was found.
    """
    epoch = re.search(r"\b(\d{10})\b", text)
    if epoch is not None:
        return int(epoch.group(1))
    # Lazy generator: lines are parsed one at a time, stopping at the first hit.
    parsed_lines = (_parse_time(row.strip()) for row in text.splitlines())
    return next((stamp for stamp in parsed_lines if stamp), 0)
|
|
|
|
|
|
def _marker_timestamp(paths: list[Path]) -> int:
    """Return the timestamp recorded by the first readable marker in *paths*.

    The marker's content is parsed with ``_parse_marker_timestamp``; if it
    holds no timestamp, the file's modification time is used instead.
    Unreadable markers are skipped, and ``0`` is returned when none of the
    paths can be read.
    """
    for marker in paths:
        try:
            content = marker.read_text(encoding="utf-8", errors="replace")
            stamp = _parse_marker_timestamp(content)
            if stamp:
                return stamp
            return int(marker.stat().st_mtime)
        except OSError:
            continue
    return 0
|
|
|
|
|
|
def _shell_export_value(path: Path, key: str) -> str:
    """Look up ``KEY=value`` in a shell file without executing it.

    Each line is tokenised with ``shlex`` (POSIX rules, ``#`` comments
    dropped), an optional leading ``export`` is ignored, and the value of
    the first ``key=`` assignment found is returned with surrounding
    whitespace stripped.  Lines that cannot be tokenised are skipped.
    Returns ``""`` when the file is unreadable or the key is absent.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    assignment = f"{key}="
    for raw in content.splitlines():
        try:
            words = shlex.split(raw, comments=True, posix=True)
        except ValueError:
            # e.g. an unterminated quote
            continue
        if words[:1] == ["export"]:
            words = words[1:]
        for word in words:
            if word.startswith(assignment):
                return word.partition("=")[2].strip()
    return ""
|
|
|
|
|
|
def _backup_config_value(key: str) -> str:
    """Return the configured value of *key* from the backup shell files.

    ``BACKUP_OFFSITE_ENV`` is consulted before ``BACKUP_COMMON_SH``.  A value
    written as a self-referencing default, ``${KEY:-fallback}``, yields
    ``fallback``.  When that default is empty (``${KEY:-}``) the file
    carries no literal value for the key, so the next file is tried instead
    of returning the unexpanded ``${KEY:-}`` text, which callers would
    otherwise mistake for a real setting.  Returns ``""`` when nothing is
    configured.
    """
    default_re = re.compile(r"\$\{" + re.escape(key) + r":-([^}]*)\}")
    for path in (BACKUP_OFFSITE_ENV, BACKUP_COMMON_SH):
        value = _shell_export_value(path, key)
        if not value:
            continue
        default_match = default_re.fullmatch(value)
        if default_match is None:
            return value
        if default_match.group(1):
            return default_match.group(1)
        # `${KEY:-}`: no literal default here, keep looking.
    return ""
|
|
|
|
|
|
def _configured_secret(value: str) -> bool:
    """Tell whether *value* is a real setting rather than a placeholder.

    After stripping whitespace, the empty string and the literal
    placeholders ``CHANGE_ME``, ``CHANGEME``, ``TODO`` and ``REDACTED``
    count as "not configured".
    """
    placeholders = ("", "CHANGE_ME", "CHANGEME", "TODO", "REDACTED")
    return value.strip() not in placeholders
|
|
|
|
|
|
def _b2_configured() -> bool:
    """Return True when all three B2 settings hold non-placeholder values.

    The settings are ``B2_ACCOUNT_ID``, ``B2_APPLICATION_KEY`` and
    ``B2_BUCKET``; checking stops at the first one that is not configured.
    """
    required = ("B2_ACCOUNT_ID", "B2_APPLICATION_KEY", "B2_BUCKET")
    return all(_configured_secret(_backup_config_value(name)) for name in required)
|
|
|
|
|
|
def _rclone_configured() -> bool:
    """Return True when the rclone remote appears to be set up.

    The remote name comes from the backup config, then the
    ``OFFSITE_RCLONE_REMOTE`` environment variable, then ``"gdrive"``.  If
    ``rclone listremotes`` succeeds, its output decides: the remote must be
    listed as ``<name>:``.  Otherwise a non-empty rclone config file in one
    of the known locations is accepted as evidence.
    """
    remote = _backup_config_value("OFFSITE_RCLONE_REMOTE")
    if not remote:
        remote = os.environ.get("OFFSITE_RCLONE_REMOTE", "gdrive")

    rc, stdout, _ = _run(["rclone", "listremotes"], timeout=10)
    if rc == 0 and remote:
        listed = {entry.strip() for entry in stdout.splitlines()}
        return f"{remote}:" in listed

    # rclone could not be queried: fall back to looking for a config file.
    config_files = (
        Path.home() / ".config/rclone/rclone.conf",
        Path("/home/wooo/.config/rclone/rclone.conf"),
        Path("/root/.config/rclone/rclone.conf"),
        Path("/etc/rclone.conf"),
    )
    for config in config_files:
        try:
            if config.is_file() and config.stat().st_size > 0:
                return True
        except OSError:
            continue
    return False
|
|
|
|
|
|
def _cron_text() -> str:
    """Return the output of ``crontab -l``, or ``""`` if the command fails."""
    rc, stdout, _ = _run(["crontab", "-l"], timeout=10)
    if rc != 0:
        return ""
    return stdout
|
|
|
|
|
|
def _active_cron_lines(cron: str) -> list[str]:
    """Return the stripped crontab lines that are neither blank nor comments."""
    active: list[str] = []
    for raw in cron.splitlines():
        entry = raw.strip()
        if entry and not entry.startswith("#"):
            active.append(entry)
    return active
|
|
|
|
|
|
def _cron_duplicate_metric_lines(host: str, cron: str) -> list[str]:
    """Build the crontab-hygiene metrics for *cron*.

    Emits the number of active lines that are exact duplicates of another
    active line, followed by, for each entry that is expected exactly once,
    how many active lines contain its command and whether that count is 1.
    """
    active = _active_cron_lines(cron)
    host_label = _escape_label(host)
    duplicates = max(0, len(active) - len(set(active)))
    metrics = [f'awoooi_backup_cron_active_duplicate_count{{host="{host_label}"}} {duplicates}']

    # (entry label, substring identifying the cron command)
    singular = (
        ("backup_health_exporter", "/home/wooo/scripts/backup-health-textfile-exporter.py"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_sync_gated", "/backup/scripts/sync-offsite-backups.sh --mode sync"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
    )
    for entry, needle in singular:
        hits = len([line for line in active if needle in line])
        labels = f'host="{host_label}",entry="{_escape_label(entry)}"'
        metrics.append(f"awoooi_backup_cron_singular_entry_count{{{labels}}} {hits}")
        metrics.append(f"awoooi_backup_cron_singular_entry_ok{{{labels}}} {int(hits == 1)}")
    return metrics
|
|
|
|
|
|
def _newest_file_timestamp(patterns: list[str]) -> int:
    """Return the newest mtime among regular files matching *patterns*.

    Each pattern is an absolute glob; it is evaluated relative to ``/``
    after its leading slashes are removed.  Entries that cannot be
    inspected are ignored.  Returns ``0`` when nothing matches.
    """
    filesystem_root = Path("/")
    latest = 0
    for pattern in patterns:
        for candidate in filesystem_root.glob(pattern.lstrip("/")):
            try:
                if not candidate.is_file():
                    continue
                modified = int(candidate.stat().st_mtime)
            except OSError:
                continue
            if modified > latest:
                latest = modified
    return latest
|
|
|
|
|
|
def _newest_tree_timestamp(root: Path, max_entries: int = 5000) -> tuple[int, int]:
    """Walk *root* recursively and return ``(newest_mtime, file_count)``.

    Only regular files are counted.  The walk stops once *max_entries*
    files have been seen, so both values are lower bounds for larger
    trees.  A missing root yields ``(0, 0)``; entries that cannot be
    inspected are skipped.
    """
    if not root.exists():
        return 0, 0
    latest = 0
    seen = 0
    for entry in root.rglob("*"):
        try:
            if entry.is_file():
                seen += 1
                latest = max(latest, int(entry.stat().st_mtime))
            else:
                continue
        except OSError:
            continue
        if seen >= max_entries:
            break
    return latest, seen
|
|
|
|
|
|
def _expected_gitea_bundle_repos() -> list[str]:
    """Return the repositories that must have a bundle.

    ``AIOPS_GITEA_BUNDLE_EXPECTED_REPOS`` (comma-separated, blanks ignored)
    overrides the built-in list when it names at least one repository.
    """
    raw = os.environ.get("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "")
    overrides = [name for name in (part.strip() for part in raw.split(",")) if name]
    if overrides:
        return overrides
    return DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS
|
|
|
|
|
|
def _gitea_bundle_slug(repo: str) -> str:
    """Turn ``owner/name`` into the file-name-safe form ``owner__name``."""
    return "__".join(repo.split("/"))
|
|
|
|
|
|
def _read_gitea_bundle_manifest(root: Path) -> tuple[Path | None, dict[str, dict[str, str]]]:
    """Load the bundle manifest found in *root*.

    ``manifest.remote.tsv`` is preferred over ``manifest.tsv``; the first
    one that can be read is used.  The first line is the tab-separated
    header; every following non-blank line becomes a ``column -> value``
    dict (missing trailing cells read as ``""``), keyed by its stripped
    ``repo`` column.  Rows without a repo are dropped.

    Returns ``(manifest_path, rows_by_repo)``, or ``(None, {})`` when no
    manifest is readable.
    """
    for name in ("manifest.remote.tsv", "manifest.tsv"):
        manifest = root / name
        try:
            content = manifest.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        records = content.splitlines()
        if not records:
            return manifest, {}
        columns = records[0].split("\t")
        by_repo: dict[str, dict[str, str]] = {}
        for record in records[1:]:
            if not record.strip():
                continue
            cells = record.split("\t")
            # Pad short rows so every header column has a value.
            padded = cells + [""] * (len(columns) - len(cells))
            entry = dict(zip(columns, padded))
            repo = entry.get("repo", "").strip()
            if repo:
                by_repo[repo] = entry
        return manifest, by_repo
    return None, {}
|
|
|
|
|
|
def _resolve_bundle_path(root: Path, repo: str, manifest_value: str) -> Path:
    """Locate the bundle file for *repo*.

    Candidates, in order of preference:

    1. the path recorded in the manifest, as written;
    2. a file with the same name directly inside *root*;
    3. ``<owner>__<name>.bundle`` inside *root*;
    4. the most recently modified ``<name>-local-*.bundle`` inside *root*.

    When nothing exists, the slug path from step 3 is returned so callers
    can report it as missing.  Files that vanish while their mtime is being
    read (for example because a backup run is rotating bundles) are ignored
    instead of aborting the exporter.
    """
    if manifest_value:
        candidate = Path(manifest_value)
        if candidate.exists():
            return candidate
        local_candidate = root / candidate.name
        if local_candidate.exists():
            return local_candidate
    slug_candidate = root / f"{_gitea_bundle_slug(repo)}.bundle"
    if slug_candidate.exists():
        return slug_candidate
    repo_name = repo.rsplit("/", 1)[-1]
    dated: list[tuple[float, Path]] = []
    for match in root.glob(f"{repo_name}-local-*.bundle"):
        try:
            dated.append((match.stat().st_mtime, match))
        except OSError:
            continue
    if not dated:
        return slug_candidate
    # max() keeps the first of equally-new files, like the stable sort did.
    return max(dated, key=lambda pair: pair[0])[1]
|
|
|
|
|
|
def _resolve_checksum_path(root: Path, bundle_path: Path, manifest_value: str) -> Path:
    """Locate the checksum file that belongs to *bundle_path*.

    The path recorded in the manifest is tried first, then a file of the
    same name inside *root*.  If neither exists (or the manifest records
    nothing), ``<bundle file name>.sha256`` next to the bundle is returned,
    whether or not it exists.
    """
    if manifest_value:
        declared = Path(manifest_value)
        for option in (declared, root / declared.name):
            if option.exists():
                return option
    return bundle_path.with_name(f"{bundle_path.name}.sha256")
|
|
|
|
|
|
def _checksum_digest_present(path: Path) -> int:
    """Return 1 when the first token in *path* is a 64-digit hex string.

    A file that is unreadable, empty, or starts with anything else yields 0.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0
    fields = content.split()
    if not fields:
        return 0
    return int(re.fullmatch(r"[0-9a-fA-F]{64}", fields[0]) is not None)
|
|
|
|
|
|
def _gitea_bundle_metric_lines(host: str) -> tuple[list[str], int]:
    """Build the Gitea bundle metrics and the overall verdict.

    The bundle directory is checked for existence, a manifest and the age
    of its newest file.  Every expected repository is then checked for a
    bundle file, a checksum file whose first token is a 64-digit hex
    string, and an acceptable status (``ok`` from the manifest, or
    ``bundle_file_only`` when the manifest has no row but bundle and
    checksum exist).

    Returns ``(metric_lines, all_expected_ok)`` where ``all_expected_ok``
    is 1 only if the directory and manifest exist, the newest file is
    within ``GITEA_BUNDLE_MAX_AGE_HOURS``, at least one repository is
    expected, and no repository is missing, failed or lacks a checksum.
    """
    now = int(time.time())
    root = GITEA_BUNDLE_ROOT
    manifest_path, manifest_rows = _read_gitea_bundle_manifest(root)
    root_exists = int(root.exists())
    manifest_present = int(manifest_path is not None)
    newest_ts, file_count = _newest_tree_timestamp(root, max_entries=20000)
    age = now - newest_ts if newest_ts else 0
    bundle_fresh = 1 if newest_ts and age <= int(GITEA_BUNDLE_MAX_AGE_HOURS * 3600) else 0
    host_label = _escape_label(host)
    root_label = _escape_label(str(root))
    labels = f'host="{host_label}",root="{root_label}",max_age_hours="{GITEA_BUNDLE_MAX_AGE_HOURS:g}"'
    lines = [
        f"awoooi_gitea_bundle_root_exists{{{labels}}} {root_exists}",
        f"awoooi_gitea_bundle_manifest_present{{{labels}}} {manifest_present}",
        f"awoooi_gitea_bundle_newest_timestamp{{{labels}}} {newest_ts}",
        f"awoooi_gitea_bundle_age_seconds{{{labels}}} {age}",
        f"awoooi_gitea_bundle_fresh{{{labels}}} {bundle_fresh}",
        f"awoooi_gitea_bundle_file_count{{{labels}}} {file_count}",
    ]

    missing_count = 0
    failed_count = 0
    checksum_missing_count = 0
    expected_repos = _expected_gitea_bundle_repos()
    for repo in expected_repos:
        row = manifest_rows.get(repo, {})
        status = row.get("status", "manifest_row_missing")
        bundle_path = _resolve_bundle_path(root, repo, row.get("bundle", ""))
        checksum_path = _resolve_checksum_path(root, bundle_path, row.get("checksum", ""))
        bundle_present = int(bundle_path.is_file())
        checksum_present = int(checksum_path.is_file())
        checksum_digest_present = _checksum_digest_present(checksum_path) if checksum_present else 0
        if status == "manifest_row_missing" and bundle_present and checksum_digest_present:
            # Files are on disk even though the manifest does not list them.
            status = "bundle_file_only"
        try:
            head_count = int(row.get("head_count", "0") or 0)
        except ValueError:
            head_count = 0
        repo_ok = int(
            bundle_present == 1
            and checksum_present == 1
            and checksum_digest_present == 1
            and status in {"ok", "bundle_file_only"}
        )
        if not bundle_present:
            missing_count += 1
        if not checksum_present or not checksum_digest_present:
            checksum_missing_count += 1
        if repo_ok == 0:
            failed_count += 1
        repo_labels = f'host="{host_label}",repo="{_escape_label(repo)}"'
        status_labels = (
            f'{repo_labels},status="{_escape_label(status)}",'
            f'bundle="{_escape_label(bundle_path.name)}"'
        )
        lines.extend(
            [
                f"awoooi_gitea_bundle_expected_repo_info{{{repo_labels}}} 1",
                f"awoooi_gitea_bundle_repo_status_info{{{status_labels}}} 1",
                f"awoooi_gitea_bundle_repo_present{{{repo_labels}}} {bundle_present}",
                f"awoooi_gitea_bundle_repo_ok{{{repo_labels}}} {repo_ok}",
                f"awoooi_gitea_bundle_repo_head_count{{{repo_labels}}} {head_count}",
                f"awoooi_gitea_bundle_checksum_present{{{repo_labels}}} {checksum_present}",
                f"awoooi_gitea_bundle_checksum_digest_present{{{repo_labels}}} {checksum_digest_present}",
            ]
        )

    # bool() matters: an `and` chain hands back its first falsy operand, so a
    # bare empty list would reach int() and raise TypeError.
    all_expected_ok = int(
        root_exists == 1
        and manifest_present == 1
        and bundle_fresh == 1
        and bool(expected_repos)
        and missing_count == 0
        and failed_count == 0
        and checksum_missing_count == 0
    )
    lines.extend(
        [
            f"awoooi_gitea_bundle_expected_repo_count{{{labels}}} {len(expected_repos)}",
            f"awoooi_gitea_bundle_expected_repo_missing_count{{{labels}}} {missing_count}",
            f"awoooi_gitea_bundle_failed_repo_count{{{labels}}} {failed_count}",
            f"awoooi_gitea_bundle_checksum_missing_count{{{labels}}} {checksum_missing_count}",
            f"awoooi_gitea_bundle_all_expected_ok{{{labels}}} {all_expected_ok}",
        ]
    )
    return lines, all_expected_ok
|
|
|
|
|
|
def _read_backup_110_timestamp(candidates: list[Path] | None = None) -> int:
    """Return the last-success Unix timestamp of the host-110 backup.

    *candidates* are tried in order; by default the textfile metrics file
    and then the plain ``last_success`` marker.  In each readable file a
    ``backup_110_last_success_timestamp`` sample line (labels allowed) is
    preferred, so an unrelated metric with a long numeric value earlier in
    the file can no longer be mistaken for the timestamp.  Without such a
    line, the first run of ten digits in the file is used, as before.
    Returns ``0`` when no candidate provides a timestamp.
    """
    if candidates is None:
        candidates = [
            Path("/home/ollama/node_exporter_textfiles/backup.prom"),
            Path("/home/ollama/backup/110/last_success"),
        ]
    metric_re = re.compile(
        r"^backup_110_last_success_timestamp(?:\{[^}]*\})?[ \t]+(\S+)",
        re.MULTILINE,
    )
    for path in candidates:
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        sample = metric_re.search(text)
        if sample:
            try:
                # float() also accepts values written as 1.7e+09.
                value = int(float(sample.group(1)))
            except (ValueError, OverflowError):
                value = 0
            if value > 0:
                return value
        match = re.search(r"(\d{10})", text)
        if match:
            return int(match.group(1))
    return 0
|
|
|
|
|
|
def _latest_restic_snapshot(repo: str) -> tuple[int, int]:
    """Return ``(newest_snapshot_timestamp, snapshot_count)`` for *repo*.

    Runs ``restic snapshots --json`` with the password file named by
    ``RESTIC_PASSWORD_FILE`` (default ``/backup/scripts/.restic-password``).
    Only snapshots whose ``time`` field parses to a positive timestamp are
    counted.  ``(0, 0)`` is returned when the repository or password file
    is missing, restic fails, or its output is not a JSON list of objects.
    """
    password_file = os.environ.get("RESTIC_PASSWORD_FILE", "/backup/scripts/.restic-password")
    if not Path(repo).exists() or not Path(password_file).exists():
        return 0, 0
    rc, stdout, _ = _run(
        ["restic", "-r", repo, "snapshots", "--json", "--password-file", password_file],
        timeout=45,
    )
    if rc != 0:
        return 0, 0
    try:
        rows = json.loads(stdout)
    except json.JSONDecodeError:
        return 0, 0
    if not isinstance(rows, list):
        # Valid JSON that is not a list (for example `null`) has no snapshots.
        return 0, 0
    timestamps = [
        stamp
        for stamp in (_parse_time(str(row.get("time", ""))) for row in rows if isinstance(row, dict))
        if stamp > 0
    ]
    return (max(timestamps), len(timestamps)) if timestamps else (0, 0)
|
|
|
|
|
|
def _backup_all_failed_count_from_log(path: Path) -> tuple[int, int]:
    """Read the most recent backup-all summary line from a log file.

    The log is scanned from the end for a line containing the completion
    marker.  Its ``[YYYY-MM-DD HH:MM:SS]`` prefix is converted to epoch
    seconds by treating the wall-clock time as UTC+8; a line without the
    prefix gives timestamp 0.  The failure count is taken from the
    "N failed" fragment, or is 0 when the line says everything succeeded.
    Summary lines carrying neither are skipped.

    Returns ``(timestamp, failed_count)``; ``(0, -1)`` when the file is
    unreadable or holds no usable summary line.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return 0, -1
    for entry in reversed(content.splitlines()):
        if "全服務備份完成" not in entry:
            continue
        stamp = 0
        prefix = re.match(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", entry)
        if prefix:
            logged = datetime.strptime(prefix.group(1), "%Y-%m-%d %H:%M:%S")
            # Log times are UTC+8 wall-clock: read as UTC, then subtract 8 h.
            stamp = int(logged.replace(tzinfo=timezone.utc).timestamp()) - 8 * 3600
        failures = re.search(r"-\s+(\d+)\s+個失敗", entry)
        if failures:
            return stamp, int(failures.group(1))
        if "全部成功" in entry:
            return stamp, 0
    return 0, -1
|
|
|
|
|
|
def _latest_backup_all_failed_count() -> tuple[int, int]:
    """Return the newest backup-all summary across both log files.

    ``/backup/logs/cron.log`` and ``/backup/logs/backup.log`` are read;
    results without a timestamp or failure count are discarded and the one
    with the latest timestamp wins.  ``(0, -1)`` means no usable summary.
    """
    log_files = ("/backup/logs/cron.log", "/backup/logs/backup.log")
    results = [_backup_all_failed_count_from_log(Path(name)) for name in log_files]
    usable = [(stamp, failed) for stamp, failed in results if stamp > 0 and failed >= 0]
    if usable:
        return max(usable, key=lambda row: row[0])
    return 0, -1
|
|
|
|
|
|
def _read_key_value_status(path: str) -> dict[str, int | str]:
    """Parse a ``key=value`` status file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored.  Keys and values are stripped.  Numeric values are stored as
    ``int`` (fractions truncated); everything else, including ``nan`` and
    ``inf``, is kept as the original string.  An unreadable file yields an
    empty dict.
    """
    values: dict[str, int | str] = {}
    try:
        lines = Path(path).read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:
        return values
    for line in lines:
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        try:
            values[key] = int(float(value))
        except (ValueError, OverflowError):
            # ValueError: not a number (or NaN); OverflowError: +/-infinity.
            values[key] = value
    return values
|
|
|
|
|
|
def _integrity_metric_lines(host: str) -> list[str]:
    """Build the integrity metrics for the restic check and restore drill.

    Each scope reads a ``key=value`` status file and reports its timestamp
    (only when ``failed_count`` is 0), age, freshness, failed-repo count
    and checked-repo count.  A scope is fresh when it has a timestamp no
    older than its limit (192 h for ``restic_check``, 744 h for
    ``restore_drill``) and ``failed_count`` is exactly 0.

    Missing or non-numeric fields fall back to their defaults (timestamp 0,
    failed_count -1, checked_repo_count 0) instead of raising, so one
    malformed status file cannot abort the exporter.
    """

    def _int_field(values: dict[str, int | str], key: str, default: int) -> int:
        # The status reader keeps non-numeric values as strings.
        try:
            return int(values.get(key, default))
        except (TypeError, ValueError):
            return default

    now = int(time.time())
    specs = [
        ("restic_check", "/backup/integrity/check.status", 192),
        ("restore_drill", "/backup/integrity/restore-drill.status", 744),
    ]
    lines: list[str] = []
    for scope, path, max_age_hours in specs:
        values = _read_key_value_status(path)
        timestamp = _int_field(values, "timestamp", 0)
        failed_count = _int_field(values, "failed_count", -1)
        checked_count = _int_field(values, "checked_repo_count", 0)
        age = now - timestamp if timestamp else 0
        fresh = 1 if timestamp and age <= max_age_hours * 3600 and failed_count == 0 else 0
        labels = f'host="{_escape_label(host)}",scope="{scope}",max_age_hours="{max_age_hours}"'
        lines.extend(
            [
                f"awoooi_backup_integrity_last_success_timestamp{{{labels}}} {timestamp if failed_count == 0 else 0}",
                f"awoooi_backup_integrity_age_seconds{{{labels}}} {age}",
                f"awoooi_backup_integrity_fresh{{{labels}}} {fresh}",
                f"awoooi_backup_integrity_failed_repo_count{{{labels}}} {failed_count}",
                f"awoooi_backup_integrity_checked_repo_count{{{labels}}} {checked_count}",
            ]
        )
    return lines
|
|
|
|
|
|
def _config_capture_metric_lines(host: str) -> list[str]:
    """Build the config-capture metrics from ``CONFIG_CAPTURE_STATUS_FILE``.

    The file is a JSON object with ``timestamp``, ``critical_failed_count``,
    ``failed_count``, ``snapshot_id``, ``duration_seconds`` and an ``items``
    list whose entries carry ``target``, ``source``, ``critical`` and ``ok``.

    When the file is unreadable, is not valid JSON or is not a JSON object,
    three placeholder lines are returned (timestamp 0, age 0, critical
    failed count -1).  Fields with a missing, null or non-numeric value
    fall back to their defaults, and entries of ``items`` that are not
    objects are skipped, so a malformed document cannot abort the exporter.
    """

    def _as_int(value: object, default: int) -> int:
        try:
            return int(value)  # type: ignore[call-overload]
        except (TypeError, ValueError, OverflowError):
            return default

    now = int(time.time())
    labels = f'host="{_escape_label(host)}"'
    unavailable = [
        f"awoooi_backup_config_capture_status_timestamp{{{labels}}} 0",
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} 0",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} -1",
    ]
    try:
        document = json.loads(CONFIG_CAPTURE_STATUS_FILE.read_text(encoding="utf-8", errors="replace"))
    except (OSError, json.JSONDecodeError):
        return unavailable
    if not isinstance(document, dict):
        return unavailable

    timestamp = _as_int(document.get("timestamp") or 0, 0)
    critical_failed = _as_int(document.get("critical_failed_count", -1), -1)
    failed_count = _as_int(document.get("failed_count", -1), -1)
    snapshot_id = str(document.get("snapshot_id") or "unknown")
    duration = _as_int(document.get("duration_seconds", 0) or 0, 0)
    age = now - timestamp if timestamp else 0
    lines = [
        f"awoooi_backup_config_capture_status_timestamp{{{labels},snapshot_id=\"{_escape_label(snapshot_id)}\"}} {timestamp}",
        f"awoooi_backup_config_capture_status_age_seconds{{{labels}}} {age}",
        f"awoooi_backup_config_capture_critical_failed_count{{{labels}}} {critical_failed}",
        f"awoooi_backup_config_capture_failed_count{{{labels}}} {failed_count}",
        f"awoooi_backup_config_capture_duration_seconds{{{labels}}} {duration}",
    ]
    items = document.get("items") or []
    if not isinstance(items, list):
        items = []
    for item in items:
        if not isinstance(item, dict):
            continue
        target = str(item.get("target") or "unknown")
        source = str(item.get("source") or "unknown")
        critical = "true" if item.get("critical") else "false"
        ok = 1 if item.get("ok") else 0
        item_labels = (
            f'host="{_escape_label(host)}",'
            f'target="{_escape_label(target)}",'
            f'source="{_escape_label(source)}",'
            f'critical="{critical}"'
        )
        lines.append(f"awoooi_backup_config_capture_ok{{{item_labels}}} {ok}")
    return lines
|
|
|
|
|
|
def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
    """Build the off-site sync, credential-escrow and DR-phase metrics.

    For the providers ``b2`` and ``rclone`` this reports whether the
    provider is configured and how old its full and partial "last success"
    markers are (fresh means configured and at most 48 h old).  It also
    reports whether the rclone full-sync enable marker exists, and for
    every item in ``ESCROW_ITEMS`` how old its verification marker is
    (fresh means at most 744 h old).

    The final lines summarise the disaster-recovery rollout as a phase
    number with a ``next_step`` label:

    1. no provider configured;
    2. configured, but neither a fresh full nor a fresh partial sync;
    3. a fresh sync exists, but escrow evidence is missing or stale;
    4. partial sync and escrow are fine, full sync is not fresh;
    5. full sync is fresh and every escrow item is fresh.
    """
    now = int(time.time())
    host_label = _escape_label(host)
    window = 48 * 3600
    providers = ("b2", "rclone")

    configured = {"b2": int(_b2_configured()), "rclone": int(_rclone_configured())}
    full_marker_paths = {
        "b2": [
            OFFSITE_STATUS_DIR / "b2-last-success",
            OFFSITE_STATUS_DIR / "b2.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/offsite-b2.status"),
        ],
        "rclone": [
            OFFSITE_STATUS_DIR / "rclone-last-success",
            OFFSITE_STATUS_DIR / "rclone.last_success",
            OFFSITE_STATUS_DIR / "last_success",
            Path("/backup/logs/rclone-sync.status"),
        ],
    }
    partial_marker_paths = {
        "b2": [
            OFFSITE_STATUS_DIR / "b2-partial-last-success",
            OFFSITE_STATUS_DIR / "b2.partial_last_success",
        ],
        "rclone": [
            OFFSITE_STATUS_DIR / "rclone-partial-last-success",
            OFFSITE_STATUS_DIR / "rclone.partial_last_success",
        ],
    }
    full_stamp: dict[str, int] = {}
    partial_stamp: dict[str, int] = {}
    for provider in providers:
        full_stamp[provider] = _marker_timestamp(full_marker_paths[provider])
        partial_stamp[provider] = _marker_timestamp(partial_marker_paths[provider])

    lines: list[str] = []

    # Full-sync freshness per provider.
    full_fresh_flags: list[int] = []
    for provider in providers:
        stamp = full_stamp[provider]
        age = now - stamp if stamp else 0
        fresh = int(bool(configured[provider] and stamp and age <= window))
        full_fresh_flags.append(fresh)
        labels = f'host="{host_label}",provider="{provider}",max_age_hours="48"'
        lines += [
            f"awoooi_backup_offsite_configured{{{labels}}} {configured[provider]}",
            f"awoooi_backup_offsite_last_success_timestamp{{{labels}}} {stamp}",
            f"awoooi_backup_offsite_age_seconds{{{labels}}} {age}",
            f"awoooi_backup_offsite_fresh{{{labels}}} {fresh}",
        ]

    # Partial-sync freshness per provider.
    partial_fresh_flags: list[int] = []
    for provider in providers:
        stamp = partial_stamp[provider]
        partial_age = now - stamp if stamp else 0
        partial_fresh = int(bool(configured[provider] and stamp and partial_age <= window))
        partial_fresh_flags.append(partial_fresh)
        partial_labels = f'host="{host_label}",provider="{provider}",scope="partial",max_age_hours="48"'
        lines += [
            f"awoooi_backup_offsite_partial_last_success_timestamp{{{partial_labels}}} {stamp}",
            f"awoooi_backup_offsite_partial_age_seconds{{{partial_labels}}} {partial_age}",
            f"awoooi_backup_offsite_partial_fresh{{{partial_labels}}} {partial_fresh}",
        ]

    # Marker whose presence is reported as "full sync enabled".
    enable_marker = OFFSITE_STATUS_DIR / "enable-rclone-sync"
    try:
        full_sync_enabled = int(enable_marker.is_file())
        full_sync_enabled_timestamp = int(enable_marker.stat().st_mtime) if full_sync_enabled else 0
    except OSError:
        full_sync_enabled = 0
        full_sync_enabled_timestamp = 0
    full_sync_labels = f'host="{host_label}",provider="rclone"'
    lines += [
        f"awoooi_backup_offsite_full_sync_enabled{{{full_sync_labels}}} {full_sync_enabled}",
        f"awoooi_backup_offsite_full_sync_enabled_timestamp{{{full_sync_labels}}} {full_sync_enabled_timestamp}",
    ]

    # Credential-escrow evidence.
    escrow_missing_count = 0
    for item in ESCROW_ITEMS:
        stamp = _marker_timestamp(
            [
                ESCROW_EVIDENCE_DIR / f"{item}.last_verified",
                ESCROW_EVIDENCE_DIR / f"{item}.verified",
                ESCROW_EVIDENCE_DIR / item,
            ]
        )
        age = now - stamp if stamp else 0
        fresh = int(bool(stamp and age <= 744 * 3600))
        if not fresh:
            escrow_missing_count += 1
        labels = f'host="{host_label}",item="{item}",max_age_hours="744"'
        lines += [
            f"awoooi_backup_credential_escrow_expected_info{{{labels}}} 1",
            f"awoooi_backup_credential_escrow_last_verified_timestamp{{{labels}}} {stamp}",
            f"awoooi_backup_credential_escrow_age_seconds{{{labels}}} {age}",
            f"awoooi_backup_credential_escrow_fresh{{{labels}}} {fresh}",
        ]

    # Rollout phase.
    offsite_configured = any(configured.values())
    full_fresh = any(full_fresh_flags)
    any_partial_fresh = any(partial_fresh_flags)
    if not offsite_configured:
        phase, next_step = 1, "configure_google_drive_rclone_on_110_tty"
    elif full_fresh:
        if escrow_missing_count == 0:
            phase, next_step = 5, "offsite_and_escrow_ready"
        else:
            phase, next_step = 3, "complete_credential_escrow_review"
    elif not any_partial_fresh:
        phase, next_step = 2, "run_small_dry_run_then_partial_sync"
    elif escrow_missing_count > 0:
        phase, next_step = 3, "complete_credential_escrow_review"
    else:
        phase, next_step = 4, "pre_full_sync_review"

    step_label = _escape_label(next_step)
    lines += [
        f'awoooi_backup_dr_credential_escrow_missing_count{{host="{host_label}"}} {escrow_missing_count}',
        f'awoooi_backup_dr_phase{{host="{host_label}",next_step="{step_label}"}} {phase}',
        f'awoooi_backup_dr_next_step_info{{host="{host_label}",next_step="{step_label}"}} 1',
    ]
    return lines
|
|
|
|
|
|
def _retention_metric_lines(host: str) -> list[str]:
    """Build the retention-policy metrics.

    ``BACKUP_RETENTION_MODE``, ``KEEP_LAST`` and ``OFFSITE_SYNC_DELETE_OLD``
    are read from the backup config, falling back to the environment.  The
    first metric is 1 when the mode is ``latest`` with ``KEEP_LAST=1``; the
    second is 1 when ``OFFSITE_SYNC_DELETE_OLD`` is ``1``.  Unset settings
    appear as ``unknown`` in the labels.
    """

    def _setting(name: str) -> str:
        return (_backup_config_value(name) or os.environ.get(name, "")).strip()

    mode = _setting("BACKUP_RETENTION_MODE")
    keep_last = _setting("KEEP_LAST")
    offsite_delete_old = _setting("OFFSITE_SYNC_DELETE_OLD")

    latest_only = int(mode == "latest" and keep_last == "1")
    offsite_mirror = int(offsite_delete_old == "1")

    host_label = _escape_label(host)
    mode_label = _escape_label(mode or "unknown")
    keep_label = _escape_label(keep_last or "unknown")
    delete_label = _escape_label(offsite_delete_old or "unknown")
    labels = f'host="{host_label}",scope="restic",mode="{mode_label}",keep_last="{keep_label}"'
    offsite_labels = f'host="{host_label}",scope="offsite",provider="rclone",delete_old="{delete_label}"'
    return [
        f"awoooi_backup_retention_latest_only{{{labels}}} {latest_only}",
        f"awoooi_backup_retention_offsite_delete_old_enabled{{{offsite_labels}}} {offsite_mirror}",
    ]
|
|
|
|
|
|
def _collect_velero_from_k8s() -> dict[str, int | str]:
    """Collect Velero backup state by running a probe over SSH.

    A small Python script is sent to each host in ``AIOPS_K8S_QUERY_HOSTS``
    (space-separated) in turn.  The script queries ``kubectl`` (first via
    ``sudo -n``, then directly) for Velero schedules, backups and the
    ``backup-restore-test`` cronjob and its jobs, and prints ``key=value``
    lines.  The first host whose output contains ``monitor_up=1`` wins.

    Returns the parsed integer values plus ``source`` (``<host>-kubectl``),
    or ``{"monitor_up": 0, "source": "unreachable"}`` when no host answers.
    """
    # NOTE(review): the script prints monitor_up=1 even when every kubectl
    # call failed (load_json then returns {}), so a host without a working
    # kubectl still counts as "up" - confirm this is intended.
    remote_script = r"""
python3 - <<'PY'
import datetime as dt
import json
import subprocess
import time


def kubectl(args):
    for prefix in (["sudo", "-n", "kubectl"], ["kubectl"]):
        result = subprocess.run(prefix + args, capture_output=True, text=True, timeout=20, check=False)
        if result.returncode == 0:
            return result.stdout
    return ""


def load_json(args):
    text = kubectl(args + ["-o", "json"])
    try:
        return json.loads(text) if text else {}
    except json.JSONDecodeError:
        return {}


def parse_ts(value):
    if not value:
        return 0
    try:
        return int(dt.datetime.fromisoformat(value.replace("Z", "+00:00")).timestamp())
    except ValueError:
        return 0


now = int(time.time())
schedules = load_json(["get", "schedules.velero.io", "-n", "velero"]).get("items") or []
backups = load_json(["get", "backups.velero.io", "-n", "velero"]).get("items") or []
cron = load_json(["get", "cronjob", "-n", "velero", "backup-restore-test"])
jobs = load_json(["get", "jobs", "-n", "velero", "-l", "component=backup-restore-test"]).get("items") or []

completed = []
for item in backups:
    if item.get("status", {}).get("phase") != "Completed":
        continue
    timestamp = parse_ts(item.get("status", {}).get("completionTimestamp") or item.get("metadata", {}).get("creationTimestamp"))
    if timestamp:
        completed.append(timestamp)

failed_jobs = 0
for job in jobs:
    conditions = job.get("status", {}).get("conditions") or []
    if any(row.get("type") == "Failed" and row.get("status") == "True" for row in conditions):
        failed_jobs += 1

last_success = parse_ts((cron.get("status") or {}).get("lastSuccessfulTime"))
latest_backup = max(completed) if completed else 0

print("monitor_up=1")
print(f"schedule_count={len(schedules)}")
print(f"schedule_paused_count={sum(1 for item in schedules if item.get('spec', {}).get('paused'))}")
print(f"latest_completed_backup_timestamp={latest_backup}")
print(f"latest_completed_backup_age_seconds={now - latest_backup if latest_backup else 0}")
print(f"latest_completed_backup_fresh={1 if latest_backup and now - latest_backup <= 90000 else 0}")
print(f"restore_test_cron_present={1 if cron.get('metadata', {}).get('name') == 'backup-restore-test' else 0}")
print(f"restore_test_last_success_timestamp={last_success}")
print(f"restore_test_last_success_age_seconds={now - last_success if last_success else 0}")
print(f"restore_test_last_success_fresh={1 if last_success and now - last_success <= 691200 else 0}")
print(f"restore_test_failed_jobs={failed_jobs}")
PY
"""
    query_hosts = os.environ.get("AIOPS_K8S_QUERY_HOSTS", "192.168.0.120 192.168.0.121 192.168.0.125").split()
    ssh_options = [
        "-o",
        "BatchMode=yes",
        "-o",
        "StrictHostKeyChecking=accept-new",
        "-o",
        "ConnectTimeout=8",
    ]
    for host in query_hosts:
        rc, stdout, _ = _run(["ssh", *ssh_options, f"wooo@{host}", remote_script], timeout=45)
        if rc != 0:
            continue
        parsed: dict[str, int | str] = {"source": f"{host}-kubectl"}
        for row in stdout.splitlines():
            key, separator, value = row.partition("=")
            if not separator:
                continue
            try:
                parsed[key.strip()] = int(float(value.strip()))
            except ValueError:
                continue
        if int(parsed.get("monitor_up", 0)) == 1:
            return parsed
    return {"monitor_up": 0, "source": "unreachable"}
|
|
|
|
|
|
def _velero_metric_lines(host: str) -> list[str]:
    """Render the values collected from Velero as metric lines.

    Every metric carries ``host``, ``source`` and ``namespace="velero"``;
    the restore-test metrics add ``cronjob="backup-restore-test"`` and the
    two freshness metrics add their ``max_age_hours`` limit.  Values that
    were not collected are reported as 0.
    """
    values = _collect_velero_from_k8s()
    source = str(values.get("source", "unreachable"))
    labels = f'host="{_escape_label(host)}",source="{_escape_label(source)}",namespace="velero"'
    cronjob = ',cronjob="backup-restore-test"'
    # (metric suffix == collected key, extra labels appended after `labels`)
    series = (
        ("monitor_up", ""),
        ("schedule_count", ""),
        ("schedule_paused_count", ""),
        ("latest_completed_backup_timestamp", ""),
        ("latest_completed_backup_age_seconds", ""),
        ("latest_completed_backup_fresh", ',max_age_hours="25"'),
        ("restore_test_cron_present", cronjob),
        ("restore_test_last_success_timestamp", cronjob),
        ("restore_test_last_success_age_seconds", cronjob),
        ("restore_test_last_success_fresh", cronjob + ',max_age_hours="192"'),
        ("restore_test_failed_jobs", cronjob),
    )
    return [f"awoooi_velero_{name}{{{labels}{extra}}} {values.get(name, 0)}" for name, extra in series]
|
|
|
|
|
|
def _metric_lines_for_job(
    *,
    host: str,
    job: str,
    source: str,
    target: str,
    backup_type: str,
    last_success: int,
    max_age_hours: float,
    sample_count: int = 0,
) -> list[str]:
    """Render the five per-job samples for one expected backup job.

    Args:
        host: Value for the ``host`` label.
        job: Value for the ``job`` label.
        source: Value for the ``source`` label.
        target: Value for the ``target`` label.
        backup_type: Value for the ``type`` label.
        last_success: Unix timestamp of the latest success evidence; a value
            that is not positive is treated as "no evidence".
        max_age_hours: Freshness window in hours; also emitted as a label
            using ``g`` formatting.
        sample_count: Value reported by ``awoooi_backup_job_snapshot_count``.

    Returns:
        The info, last-success timestamp, age, fresh and snapshot-count
        sample lines, all sharing one label set.
    """
    current = int(time.time())
    label_pairs = (
        ("host", _escape_label(host)),
        ("job", _escape_label(job)),
        ("type", _escape_label(backup_type)),
        ("source", _escape_label(source)),
        ("target", _escape_label(target)),
        ("max_age_hours", f"{max_age_hours:g}"),
    )
    label_set = ",".join(f'{key}="{value}"' for key, value in label_pairs)

    if last_success > 0:
        elapsed = current - last_success
        is_fresh = 1 if elapsed <= int(max_age_hours * 3600) else 0
    else:
        # No success evidence: report age 0 and never fresh.
        elapsed = 0
        is_fresh = 0

    samples = (
        ("awoooi_backup_expected_job_info", 1),
        ("awoooi_backup_job_last_success_timestamp", last_success),
        ("awoooi_backup_job_age_seconds", elapsed),
        ("awoooi_backup_job_fresh", is_fresh),
        ("awoooi_backup_job_snapshot_count", sample_count),
    )
    return [f"{metric}{{{label_set}}} {value}" for metric, value in samples]
|
|
|
|
|
|
def _base_lines(host: str) -> list[str]:
    """Return the HELP/TYPE preamble plus the exporter heartbeat samples.

    Every metric in the table below is declared as a gauge. The preamble is
    followed by ``awoooi_backup_health_monitor_up`` (always ``1``) and
    ``awoooi_backup_health_last_run_timestamp`` (current Unix time), both
    labelled with ``host``.

    Args:
        host: Value for the ``host`` label on the two heartbeat samples.

    Returns:
        The metadata lines in table order, then the two heartbeat samples.
    """
    stamp = int(time.time())
    # (metric name, help text) -- order here is the order in the output.
    catalog = (
        ("awoooi_backup_health_monitor_up", "Whether the backup health exporter completed."),
        ("awoooi_backup_health_last_run_timestamp", "Unix timestamp of the last backup health exporter run."),
        ("awoooi_backup_expected_job_info", "Expected backup job inventory."),
        ("awoooi_backup_coverage_domain_expected_info", "Expected backup coverage domains for host, DB, website, service, package, tool, and log recovery."),
        ("awoooi_backup_coverage_domain_fresh", "Whether every required evidence job for a backup coverage domain is fresh."),
        ("awoooi_backup_job_configured", "Whether the expected backup cron/config is present."),
        ("awoooi_backup_script_present", "Whether the backup script exists on this host."),
        ("awoooi_backup_job_last_success_timestamp", "Unix timestamp of the latest successful backup evidence."),
        ("awoooi_backup_job_age_seconds", "Age of the latest successful backup evidence."),
        ("awoooi_backup_job_fresh", "Whether the latest successful backup evidence is within max_age_hours."),
        ("awoooi_backup_job_snapshot_count", "Number of snapshots or files considered for this job."),
        ("awoooi_backup_last_run_failed_count", "Failed component count from the last aggregate backup run."),
        ("awoooi_backup_integrity_last_success_timestamp", "Unix timestamp of latest successful backup integrity or restore drill run."),
        ("awoooi_backup_integrity_age_seconds", "Age of backup integrity or restore drill status."),
        ("awoooi_backup_integrity_fresh", "Whether backup integrity or restore drill status is fresh and successful."),
        ("awoooi_backup_integrity_failed_repo_count", "Failed repository count from backup integrity or restore drill run."),
        ("awoooi_backup_integrity_checked_repo_count", "Checked repository count from backup integrity or restore drill run."),
        ("awoooi_backup_config_capture_status_timestamp", "Unix timestamp of the latest config-capture coverage status."),
        ("awoooi_backup_config_capture_status_age_seconds", "Age of the latest config-capture coverage status."),
        ("awoooi_backup_config_capture_critical_failed_count", "Critical config-capture targets missing from the latest configs backup."),
        ("awoooi_backup_config_capture_failed_count", "Total config-capture targets missing from the latest configs backup."),
        ("awoooi_backup_config_capture_duration_seconds", "Duration of the latest configs backup capture run."),
        ("awoooi_backup_config_capture_ok", "Whether the latest configs backup captured a specific target."),
        ("awoooi_backup_offsite_configured", "Whether an offsite backup provider appears configured without exposing credentials."),
        ("awoooi_backup_offsite_last_success_timestamp", "Unix timestamp of latest offsite copy success marker."),
        ("awoooi_backup_offsite_age_seconds", "Age of latest offsite copy success marker."),
        ("awoooi_backup_offsite_fresh", "Whether offsite copy success marker is fresh."),
        ("awoooi_backup_offsite_partial_last_success_timestamp", "Unix timestamp of latest partial offsite copy success marker."),
        ("awoooi_backup_offsite_partial_age_seconds", "Age of latest partial offsite copy success marker."),
        ("awoooi_backup_offsite_partial_fresh", "Whether partial offsite copy success marker is fresh."),
        ("awoooi_backup_offsite_full_sync_enabled", "Whether the gated full offsite sync enable marker exists."),
        ("awoooi_backup_offsite_full_sync_enabled_timestamp", "Unix timestamp of the gated full offsite sync enable marker."),
        ("awoooi_backup_credential_escrow_expected_info", "Expected credential escrow evidence inventory."),
        ("awoooi_backup_credential_escrow_last_verified_timestamp", "Unix timestamp of credential escrow verification evidence."),
        ("awoooi_backup_credential_escrow_age_seconds", "Age of credential escrow verification evidence."),
        ("awoooi_backup_credential_escrow_fresh", "Whether credential escrow verification evidence is fresh."),
        ("awoooi_backup_dr_credential_escrow_missing_count", "Number of credential escrow items that still need fresh human verification."),
        ("awoooi_backup_dr_phase", "Numeric DR offsite completion phase for AI/operator triage."),
        ("awoooi_backup_dr_next_step_info", "Current human-safe next step for DR offsite completion."),
        ("awoooi_backup_retention_latest_only", "Whether local restic backup retention is configured as latest-only keep-last=1."),
        ("awoooi_backup_retention_offsite_delete_old_enabled", "Whether offsite rclone sync is allowed to delete old remote backup files after successful mirror."),
        ("awoooi_backup_cron_active_duplicate_count", "Number of exact duplicate active crontab entries on the backup host."),
        ("awoooi_backup_cron_singular_entry_count", "Number of active crontab entries matching a backup/offsite singleton pattern."),
        ("awoooi_backup_cron_singular_entry_ok", "Whether a backup/offsite singleton cron pattern has exactly one active entry."),
        ("awoooi_gitea_bundle_root_exists", "Whether the 188 Gitea private bundle backup root exists."),
        ("awoooi_gitea_bundle_manifest_present", "Whether the 188 Gitea private bundle manifest exists."),
        ("awoooi_gitea_bundle_newest_timestamp", "Unix timestamp of newest file under the 188 Gitea private bundle backup root."),
        ("awoooi_gitea_bundle_age_seconds", "Age of newest 188 Gitea private bundle backup evidence."),
        ("awoooi_gitea_bundle_fresh", "Whether the 188 Gitea private bundle backup evidence is fresh."),
        ("awoooi_gitea_bundle_file_count", "Number of files scanned under the 188 Gitea private bundle backup root."),
        ("awoooi_gitea_bundle_expected_repo_info", "Expected Gitea repository that must have a private bundle backup."),
        ("awoooi_gitea_bundle_repo_status_info", "Manifest or fallback status for an expected Gitea private bundle backup."),
        ("awoooi_gitea_bundle_repo_present", "Whether an expected Gitea repository bundle file exists."),
        ("awoooi_gitea_bundle_repo_ok", "Whether an expected Gitea repository has a present bundle and checksum evidence."),
        ("awoooi_gitea_bundle_repo_head_count", "Head count recorded in the Gitea private bundle manifest."),
        ("awoooi_gitea_bundle_checksum_present", "Whether checksum evidence exists for an expected Gitea private bundle."),
        ("awoooi_gitea_bundle_checksum_digest_present", "Whether checksum evidence contains a SHA-256 digest."),
        ("awoooi_gitea_bundle_expected_repo_count", "Number of expected Gitea private bundle repositories."),
        ("awoooi_gitea_bundle_expected_repo_missing_count", "Number of expected Gitea private bundle repositories with no bundle file."),
        ("awoooi_gitea_bundle_failed_repo_count", "Number of expected Gitea private bundle repositories without complete bundle/checksum evidence."),
        ("awoooi_gitea_bundle_checksum_missing_count", "Number of expected Gitea private bundle repositories without checksum evidence."),
        ("awoooi_gitea_bundle_all_expected_ok", "Whether all expected Gitea private bundle repositories are present and fresh."),
        ("awoooi_velero_monitor_up", "Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint."),
        ("awoooi_velero_schedule_count", "Number of Velero schedules in the velero namespace."),
        ("awoooi_velero_schedule_paused_count", "Number of paused Velero schedules."),
        ("awoooi_velero_latest_completed_backup_timestamp", "Unix timestamp of latest Completed Velero backup."),
        ("awoooi_velero_latest_completed_backup_age_seconds", "Age of latest Completed Velero backup."),
        ("awoooi_velero_latest_completed_backup_fresh", "Whether latest Completed Velero backup is within max_age_hours."),
        ("awoooi_velero_restore_test_cron_present", "Whether backup-restore-test CronJob exists."),
        ("awoooi_velero_restore_test_last_success_timestamp", "Unix timestamp of backup-restore-test lastSuccessfulTime."),
        ("awoooi_velero_restore_test_last_success_age_seconds", "Age of backup-restore-test lastSuccessfulTime."),
        ("awoooi_velero_restore_test_last_success_fresh", "Whether backup-restore-test lastSuccessfulTime is within max_age_hours."),
        ("awoooi_velero_restore_test_failed_jobs", "Failed backup-restore-test jobs retained in velero namespace."),
    )

    preamble: list[str] = []
    for metric, description in catalog:
        preamble.append(f"# HELP {metric} {description}")
        preamble.append(f"# TYPE {metric} gauge")

    host_selector = f'{{host="{_escape_label(host)}"}}'
    preamble.append(f"awoooi_backup_health_monitor_up{host_selector} 1")
    preamble.append(f"awoooi_backup_health_last_run_timestamp{host_selector} {stamp}")
    return preamble
|
|
|
|
|
|
def _collect_110(host: str) -> list[str]:
    """Assemble every metric line emitted when the host label is ``"110"``.

    In order: the shared preamble, cron-entry and script presence flags,
    per-repository restic freshness, coverage-domain roll-ups, the aggregate
    ``backup_all`` result, then the integrity, config-capture, offsite/escrow,
    retention, cron-duplicate and Velero sections.

    Args:
        host: Value for the ``host`` label on every sample.

    Returns:
        The complete list of exposition lines for this host.
    """
    crontab = _cron_text()
    out = _base_lines(host)
    host_label = _escape_label(host)

    # A job counts as configured when its marker text occurs in the crontab text.
    cron_markers = (
        ("backup_all", "/backup/scripts/backup-all.sh"),
        ("awoooi_frequent", "/backup/scripts/backup-awoooi-frequent.sh"),
        ("offsite_status", "/backup/scripts/sync-offsite-backups.sh --mode status"),
        ("offsite_sync_gated", "/backup/offsite/enable-rclone-sync"),
        ("offsite_escrow_evidence_report", "/backup/scripts/offsite-escrow-evidence-report.sh --no-color"),
        ("offsite_full_sync_verify", "/backup/scripts/verify-offsite-full-sync.sh --write-textfile"),
        ("backup_integrity_check", "/backup/scripts/check-backup-integrity.sh --mode check"),
        ("backup_restore_drill", "/backup/scripts/check-backup-integrity.sh --mode restore-drill"),
    )
    for name, marker in cron_markers:
        selector = f'host="{host_label}",job="{_escape_label(name)}"'
        configured = int(marker in crontab)
        out.append(f"awoooi_backup_job_configured{{{selector}}} {configured}")

    # Scripts are only checked for existence under /backup/scripts.
    script_names = (
        "backup-all.sh",
        "backup-awoooi.sh",
        "backup-awoooi-frequent.sh",
        "backup-configs.sh",
        "backup-sentry.sh",
        "backup-ai-artifacts.sh",
        "backup-public-routes.sh",
        "configure-offsite-rclone.sh",
        "configure-offsite-b2.sh",
        "sync-offsite-backups.sh",
        "backup-offsite-readiness-gate.sh",
        "offsite-escrow-evidence-report.sh",
        "verify-offsite-full-sync.sh",
        "mark-credential-escrow-verified.sh",
        "check-backup-integrity.sh",
        "backup-gitea.sh",
        "backup-harbor.sh",
        "backup-momo.sh",
        "backup-langfuse.sh",
        "backup-monitoring.sh",
        "backup-signoz.sh",
        "backup-open-webui.sh",
        "backup-clawbot.sh",
    )
    for script_name in script_names:
        selector = f'host="{host_label}",script="{_escape_label(script_name)}"'
        present = int(Path("/backup/scripts", script_name).exists())
        out.append(f"awoooi_backup_script_present{{{selector}}} {present}")

    # (job, restic repository path, freshness window in hours)
    restic_jobs = (
        ("awoooi_db", "/backup/awoooi", 7),
        ("configs", "/backup/configs", 48),
        ("sentry", "/backup/sentry", 48),
        ("gitea", "/backup/gitea", 48),
        ("harbor", "/backup/harbor", 48),
        ("momo", "/backup/momo", 48),
        ("langfuse", "/backup/langfuse", 48),
        ("monitoring", "/backup/monitoring", 48),
        ("signoz", "/backup/signoz", 48),
        ("open_webui", "/backup/open-webui", 48),
        ("clawbot", "/backup/clawbot", 48),
        ("ai_artifacts", "/backup/ai-artifacts", 48),
        ("public_routes", "/backup/public-routes", 168),
    )
    freshness: dict[str, int] = {}
    for name, repo_path, window_hours in restic_jobs:
        snapshot_ts, snapshot_total = _latest_restic_snapshot(repo_path)
        if snapshot_ts:
            elapsed = int(time.time()) - snapshot_ts
            freshness[name] = 1 if elapsed <= window_hours * 3600 else 0
        else:
            freshness[name] = 0
        out.extend(
            _metric_lines_for_job(
                host=host,
                job=name,
                source="110-restic",
                target=repo_path,
                backup_type="restic",
                last_success=snapshot_ts,
                max_age_hours=window_hours,
                sample_count=snapshot_total,
            )
        )

    # A domain is fresh only when every job it lists is fresh.
    domain_members = (
        ("host", ["configs"]),
        ("database", ["awoooi_db"]),
        ("website", ["public_routes", "momo"]),
        ("service", ["gitea", "harbor", "sentry", "monitoring", "signoz"]),
        ("package", ["configs", "ai_artifacts"]),
        ("tool", ["ai_artifacts", "open_webui", "clawbot", "langfuse"]),
        ("log", ["monitoring", "signoz", "sentry"]),
    )
    for domain_name, members in domain_members:
        joined = ",".join(members)
        domain_ok = int(all(freshness.get(member, 0) == 1 for member in members))
        selector = (
            f'host="{host_label}",'
            f'domain="{_escape_label(domain_name)}",'
            f'required_jobs="{_escape_label(joined)}"'
        )
        out.append(f"awoooi_backup_coverage_domain_expected_info{{{selector}}} 1")
        out.append(f"awoooi_backup_coverage_domain_fresh{{{selector}}} {domain_ok}")

    aggregate_ts, failures = _latest_backup_all_failed_count()
    aggregate_selector = f'host="{host_label}",job="backup_all"'
    out.append(f"awoooi_backup_last_run_failed_count{{{aggregate_selector}}} {failures}")
    # A run with failed components is reported as having no success timestamp.
    reported_ts = aggregate_ts if failures == 0 else 0
    out.append(
        "awoooi_backup_job_last_success_timestamp"
        f'{{{aggregate_selector},type="aggregate",source="110-cron-log",'
        f'target="/backup/logs/cron.log",max_age_hours="48"}} {reported_ts}'
    )

    for build_section in (
        _integrity_metric_lines,
        _config_capture_metric_lines,
        _offsite_and_escrow_metric_lines,
        _retention_metric_lines,
    ):
        out.extend(build_section(host))
    out.extend(_cron_duplicate_metric_lines(host, crontab))
    out.extend(_velero_metric_lines(host))
    return out
|
|
|
|
|
|
def _collect_188(host: str) -> list[str]:
    """Assemble every metric line emitted when the host label is ``"188"``.

    In order: the shared preamble, cron-entry and script presence flags, the
    ``backup_from_110`` rsync job, the Gitea mirror subtree job, the Gitea
    bundle section, the ``service`` coverage-domain roll-up, and the momo
    PostgreSQL dump job.

    The coverage-domain sample names three required jobs in its
    ``required_jobs`` label; it is reported fresh only when all three are
    fresh, including ``backup_from_110``.

    Args:
        host: Value for the ``host`` label on every sample.

    Returns:
        The complete list of exposition lines for this host.
    """
    cron = _cron_text()
    lines = _base_lines(host)
    for job, pattern in {
        "backup_from_110": "/home/ollama/bin/backup-from-110.sh",
        "momo_pg_daily": "/home/ollama/bin/momo-pg-backup.sh",
    }.items():
        labels = f'host="{_escape_label(host)}",job="{_escape_label(job)}"'
        lines.append(f"awoooi_backup_job_configured{{{labels}}} {int(pattern in cron)}")

    # On this host a script only counts as present when it is also executable.
    for script in [
        "/home/ollama/bin/backup-from-110.sh",
        "/home/ollama/bin/momo-pg-backup.sh",
        "/home/ollama/awoooi-ops/pg-backup.sh",
    ]:
        labels = f'host="{_escape_label(host)}",script="{_escape_label(Path(script).name)}"'
        lines.append(f"awoooi_backup_script_present{{{labels}}} {int(Path(script).exists() and os.access(script, os.X_OK))}")

    # Keep the rsync timestamp so its freshness can feed the coverage roll-up
    # below, using the same 25 h window that is passed to the job samples.
    backup_110_ts = _read_backup_110_timestamp()
    backup_110_fresh = 1 if backup_110_ts > 0 and int(time.time()) - backup_110_ts <= 25 * 3600 else 0
    lines.extend(
        _metric_lines_for_job(
            host=host,
            job="backup_from_110",
            source="188-rsync",
            target="/home/ollama/backup/110",
            backup_type="rsync",
            last_success=backup_110_ts,
            max_age_hours=25,
            sample_count=1,
        )
    )

    gitea_mirror_ts, gitea_mirror_count = _newest_tree_timestamp(Path("/home/ollama/backup/110/gitea"))
    gitea_mirror_fresh = 1 if gitea_mirror_ts and int(time.time()) - gitea_mirror_ts <= 25 * 3600 else 0
    lines.extend(
        _metric_lines_for_job(
            host=host,
            job="gitea_repo_mirror_from_110",
            source="188-rsync-subtree",
            target="/home/ollama/backup/110/gitea",
            backup_type="rsync_subtree",
            last_success=gitea_mirror_ts,
            max_age_hours=25,
            sample_count=gitea_mirror_count,
        )
    )

    gitea_bundle_lines, gitea_bundle_ok = _gitea_bundle_metric_lines(host)
    lines.extend(gitea_bundle_lines)

    coverage_labels = (
        f'host="{_escape_label(host)}",'
        'domain="service",'
        'required_jobs="backup_from_110,gitea_repo_mirror_from_110,gitea_private_bundle_completeness"'
    )
    lines.append(f"awoooi_backup_coverage_domain_expected_info{{{coverage_labels}}} 1")
    # Every job named in required_jobs must be fresh; previously the
    # backup_from_110 job was listed in the label but ignored here.
    domain_fresh = 1 if backup_110_fresh and gitea_mirror_fresh and gitea_bundle_ok else 0
    lines.append(f"awoooi_backup_coverage_domain_fresh{{{coverage_labels}}} {domain_fresh}")

    momo_ts = _newest_file_timestamp([
        "/home/ollama/momo_backups/*.sql.gz",
        "/home/ollama/momo-pro/backups/*.sql.gz",
        "/home/ollama/backups/momo_analytics_*.sql.gz",
    ])
    lines.extend(
        _metric_lines_for_job(
            host=host,
            job="momo_pg_daily",
            source="188-pg-dump",
            target="/home/ollama/momo_backups",
            backup_type="pg_dump",
            last_success=momo_ts,
            max_age_hours=30,
            sample_count=1 if momo_ts else 0,
        )
    )
    return lines
|
|
|
|
|
|
def collect() -> str:
    """Render the exporter payload for the configured host label.

    Host labels ``"110"`` and ``"188"`` have dedicated collectors; any other
    label gets only the shared preamble and heartbeat samples.

    Returns:
        All lines joined with newlines, with a trailing newline.
    """
    collectors = {
        "110": _collect_110,
        "188": _collect_188,
    }
    build = collectors.get(HOST_LABEL, _base_lines)
    return "\n".join(build(HOST_LABEL)) + "\n"
|
|
|
|
|
|
def main() -> None:
    """Collect the metrics and publish them atomically into the textfile dir.

    The payload is written to a temporary file in the same directory, made
    world-readable, and then renamed over the final ``.prom`` file. Readers
    therefore see either the previous complete file or the new one.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written, re-moded or renamed. The temporary file is removed
            before the error propagates.
    """
    TEXTFILE_DIR.mkdir(parents=True, exist_ok=True)
    payload = collect()
    output_path = TEXTFILE_DIR / OUTPUT_NAME

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=TEXTFILE_DIR, delete=False, encoding="utf-8") as tmp:
            tmp_path = Path(tmp.name)
            tmp.write(payload)
        # Temp files are created owner-only; widen the mode BEFORE the rename
        # so the published file is never visible with restrictive permissions.
        tmp_path.chmod(0o644)
        tmp_path.replace(output_path)
    except BaseException:
        # Do not leave stray temp files behind in the collector directory.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass
        raise
|
|
|
|
|
|
# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|