diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index e0531dc3..b05132df 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,20 @@ +## 2026-07-01 — 15:25 Gitea live recovery after 110 Docker/control-plane failure + +**照主線修復的問題**: +- Gitea live 從外部 `https://gitea.wooo.work/api/v1/version` 502、內部 `192.168.0.110:3001` connection refused、Git SSH `192.168.0.110:2222` connection refused,收斂為 110 Docker/control-plane failure,不是 repo history 消失。 +- 110 console 確認 `docker ps` 無法連 Docker daemon;`sudo systemctl restart docker` 回 `Transport endpoint is not connected`,一般 Docker restart 不能恢復。 +- 以 99 VMware 對 `192.168.0.110_Ubuntu_64-bit_DevOps` 執行 power-level `Reset` 後,開機觸發 `AWOOOI reboot auto-recovery 10-minute SLO verifier`;15:15:25 讀回 `git2222=open`、`gitea3001=open`、Gitea HTTPS `200`。 +- Gitea API / internal API 均讀回 `{"version":"1.25.5"}`;9 個 expected private repos 均可透過 Gitea SSH 讀回 heads:`awoooi`、`ewoooc`、`2026FIFAWorldCup`、`agent-bounty-protocol`、`AwoooGo`、`stockplatform-v2`、`vibework`、`momo-pro-system`、`tsenyang-website`。 +- 188 backup exporter 讀回 `awoooi_gitea_bundle_expected_repo_missing_count=0`、`failed_repo_count=0`、`checksum_missing_count=0`、`all_expected_ok=1`;Gitea private bundle backup 沒有再只靠 public repo search 判斷。 + +**仍維持 / 未完成**: +- `registry.wooo.work/v2/` 與 `harbor.wooo.work/api/v2.0/health` 仍回 502,110 `5000/5001` 仍 closed;這是 Harbor/registry cold-start / auto-recovery 缺口,不能宣稱全 110 服務完成。 +- 110 SSH 在 post-boot 高負載窗口仍會 timeout;不得因此重開 legacy / generic runner,runner 仍放最後。 +- 未讀 secret / token / `.env` / raw sessions / SQLite / auth;未使用 GitHub / `gh` / GitHub API;未刪 repo、未 restore、未 prune、未 DB write。 + +**下一步**: +- 繼續同一 P0:先等 110 post-boot load 收斂,再只針對 Harbor/registry 讀 `docker ps` / compose status 與 bounded repair;Gitea 已恢復後要把 private bundle exporter / alert / tests commit 並推到 Gitea。 + ## 2026-07-01 — 14:36 Gitea repo visibility / emergency bundle P0 repair **照主線修復的問題**: diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 93f5089f..74a24faa 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -1504,6 +1504,23 @@ groups: description: "backup-health exporter 沒有輸出 188 的 service 備份覆蓋指標,無法確認 110 rsync 後 Gitea 等關鍵子樹是否真的存在。" runbook: "部署新版 scripts/ops/backup-health-textfile-exporter.py 到 188,刷新 /home/ollama/node_exporter_textfiles/backup_health.prom。" + - alert: GiteaPrivateBundleBackupIncomplete + expr: absent(awoooi_gitea_bundle_all_expected_ok{host="188"}) or awoooi_gitea_bundle_all_expected_ok{host="188"} == 0 + for: 10m + labels: + severity: critical + layer: host-backup + component: gitea-private-bundle + host: "188" + team: ops + alert_category: infrastructure + notification_type: TYPE-3 + auto_repair: "false" + annotations: + summary: "188 Gitea private repo bundle 備份不完整" + description: "Gitea private bundle manifest、bundle、checksum 或 freshness 有缺口;public repo search 不能代表完整備份。" + runbook: "先讀 awoooi_gitea_bundle_* 指標找缺少的 repo,執行 scripts/backup/gitea-repo-bundle-backup.sh 產生 bundle 並同步到 188;不得刪 repo、不得 restore 到 production、不得讀 token。" + - alert: BackupCoverageDomainStale expr: awoooi_backup_coverage_domain_fresh == 0 for: 15m diff --git a/scripts/backup/gitea-repo-bundle-backup.sh b/scripts/backup/gitea-repo-bundle-backup.sh index f4616b9c..ab98eef3 100755 --- a/scripts/backup/gitea-repo-bundle-backup.sh +++ b/scripts/backup/gitea-repo-bundle-backup.sh @@ -76,6 +76,8 @@ if [ "${#REPOS[@]}" -eq 0 ]; then "wooo/AwoooGo" "wooo/stockplatform-v2" "wooo/vibework" + "wooo/momo-pro-system" + "wooo/tsenyang-website" ) fi diff --git a/scripts/ops/backup-health-textfile-exporter.py b/scripts/ops/backup-health-textfile-exporter.py index 8cec4ac5..3332655d 100755 --- a/scripts/ops/backup-health-textfile-exporter.py +++ b/scripts/ops/backup-health-textfile-exporter.py @@ -25,11 +25,26 @@ TEXTFILE_DIR = Path(os.environ.get("NODE_EXPORTER_TEXTFILE_DIR", "/var/lib/node_ OUTPUT_NAME = "backup_health.prom" HOST_LABEL = os.environ.get("AIOPS_HOST_LABEL", os.uname().nodename) LABEL_RE = re.compile(r'["\\\n]') +GITEA_BUNDLE_ROOT = Path( + os.environ.get("AIOPS_GITEA_BUNDLE_ROOT", "/home/ollama/backup/110/gitea/git-bundles/latest-private-complete") +) +GITEA_BUNDLE_MAX_AGE_HOURS = float(os.environ.get("AIOPS_GITEA_BUNDLE_MAX_AGE_HOURS", "25")) BACKUP_COMMON_SH = Path(os.environ.get("AIOPS_BACKUP_COMMON_SH", "/backup/scripts/common.sh")) BACKUP_OFFSITE_ENV = Path(os.environ.get("AIOPS_BACKUP_OFFSITE_ENV", "/backup/scripts/offsite.env")) OFFSITE_STATUS_DIR = Path(os.environ.get("AIOPS_OFFSITE_STATUS_DIR", "/backup/offsite")) ESCROW_EVIDENCE_DIR = Path(os.environ.get("AIOPS_ESCROW_EVIDENCE_DIR", "/backup/escrow-evidence")) CONFIG_CAPTURE_STATUS_FILE = Path(os.environ.get("AIOPS_CONFIG_CAPTURE_STATUS_FILE", "/backup/status/backup-configs-last-status.json")) +DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS = [ + "wooo/awoooi", + "wooo/ewoooc", + "wooo/2026FIFAWorldCup", + "wooo/agent-bounty-protocol", + "wooo/AwoooGo", + "wooo/stockplatform-v2", + "wooo/vibework", + "wooo/momo-pro-system", + "wooo/tsenyang-website", +] ESCROW_ITEMS = [ "restic_repository_password", "offsite_provider_credentials", @@ -209,6 +224,161 @@ def _newest_tree_timestamp(root: Path, max_entries: int = 5000) -> tuple[int, in return newest, count +def _expected_gitea_bundle_repos() -> list[str]: + configured = os.environ.get("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "") + repos = [item.strip() for item in configured.split(",") if item.strip()] + return repos or DEFAULT_GITEA_BUNDLE_EXPECTED_REPOS + + +def _gitea_bundle_slug(repo: str) -> str: + return repo.replace("/", "__") + + +def _read_gitea_bundle_manifest(root: Path) -> tuple[Path | None, dict[str, dict[str, str]]]: + for manifest_path in [root / "manifest.remote.tsv", root / "manifest.tsv"]: + try: + rows = manifest_path.read_text(encoding="utf-8", errors="replace").splitlines() + except OSError: + continue + if not rows: + return manifest_path, {} + header = rows[0].split("\t") + parsed: dict[str, dict[str, str]] = {} + for line in rows[1:]: + if not line.strip(): + continue + values = line.split("\t") + row = {key: values[index] if index < len(values) else "" for index, key in enumerate(header)} + repo = row.get("repo", "").strip() + if repo: + parsed[repo] = row + return manifest_path, parsed + return None, {} + + +def _resolve_bundle_path(root: Path, repo: str, manifest_value: str) -> Path: + if manifest_value: + candidate = Path(manifest_value) + if candidate.exists(): + return candidate + local_candidate = root / candidate.name + if local_candidate.exists(): + return local_candidate + slug_candidate = root / f"{_gitea_bundle_slug(repo)}.bundle" + if slug_candidate.exists(): + return slug_candidate + repo_name = repo.rsplit("/", 1)[-1] + matches = sorted(root.glob(f"{repo_name}-local-*.bundle"), key=lambda path: path.stat().st_mtime, reverse=True) + return matches[0] if matches else slug_candidate + + +def _resolve_checksum_path(root: Path, bundle_path: Path, manifest_value: str) -> Path: + if manifest_value: + candidate = Path(manifest_value) + if candidate.exists(): + return candidate + local_candidate = root / candidate.name + if local_candidate.exists(): + return local_candidate + return bundle_path.with_name(f"{bundle_path.name}.sha256") + + +def _checksum_digest_present(path: Path) -> int: + try: + first = path.read_text(encoding="utf-8", errors="replace").split()[0] + except (IndexError, OSError): + return 0 + return 1 if re.fullmatch(r"[0-9a-fA-F]{64}", first) else 0 + + +def _gitea_bundle_metric_lines(host: str) -> tuple[list[str], int]: + now = int(time.time()) + root = GITEA_BUNDLE_ROOT + manifest_path, manifest_rows = _read_gitea_bundle_manifest(root) + root_exists = int(root.exists()) + manifest_present = int(manifest_path is not None) + newest_ts, file_count = _newest_tree_timestamp(root, max_entries=20000) + age = now - newest_ts if newest_ts else 0 + bundle_fresh = 1 if newest_ts and age <= int(GITEA_BUNDLE_MAX_AGE_HOURS * 3600) else 0 + root_label = _escape_label(str(root)) + labels = f'host="{_escape_label(host)}",root="{root_label}",max_age_hours="{GITEA_BUNDLE_MAX_AGE_HOURS:g}"' + lines = [ + f"awoooi_gitea_bundle_root_exists{{{labels}}} {root_exists}", + f"awoooi_gitea_bundle_manifest_present{{{labels}}} {manifest_present}", + f"awoooi_gitea_bundle_newest_timestamp{{{labels}}} {newest_ts}", + f"awoooi_gitea_bundle_age_seconds{{{labels}}} {age}", + f"awoooi_gitea_bundle_fresh{{{labels}}} {bundle_fresh}", + f"awoooi_gitea_bundle_file_count{{{labels}}} {file_count}", + ] + + missing_count = 0 + failed_count = 0 + checksum_missing_count = 0 + expected_repos = _expected_gitea_bundle_repos() + for repo in expected_repos: + row = manifest_rows.get(repo, {}) + status = row.get("status", "manifest_row_missing") + bundle_path = _resolve_bundle_path(root, repo, row.get("bundle", "")) + checksum_path = _resolve_checksum_path(root, bundle_path, row.get("checksum", "")) + bundle_present = int(bundle_path.is_file()) + checksum_present = int(checksum_path.is_file()) + checksum_digest_present = _checksum_digest_present(checksum_path) if checksum_present else 0 + if status == "manifest_row_missing" and bundle_present and checksum_digest_present: + status = "bundle_file_only" + try: + head_count = int(row.get("head_count", "0") or 0) + except ValueError: + head_count = 0 + repo_ok = int( + bundle_present == 1 + and checksum_present == 1 + and checksum_digest_present == 1 + and status in {"ok", "bundle_file_only"} + ) + if not bundle_present: + missing_count += 1 + if not checksum_present or not checksum_digest_present: + checksum_missing_count += 1 + if repo_ok == 0: + failed_count += 1 + repo_labels = f'host="{_escape_label(host)}",repo="{_escape_label(repo)}"' + status_labels = ( + f'{repo_labels},status="{_escape_label(status)}",' + f'bundle="{_escape_label(bundle_path.name)}"' + ) + lines.extend( + [ + f"awoooi_gitea_bundle_expected_repo_info{{{repo_labels}}} 1", + f"awoooi_gitea_bundle_repo_status_info{{{status_labels}}} 1", + f"awoooi_gitea_bundle_repo_present{{{repo_labels}}} {bundle_present}", + f"awoooi_gitea_bundle_repo_ok{{{repo_labels}}} {repo_ok}", + f"awoooi_gitea_bundle_repo_head_count{{{repo_labels}}} {head_count}", + f"awoooi_gitea_bundle_checksum_present{{{repo_labels}}} {checksum_present}", + f"awoooi_gitea_bundle_checksum_digest_present{{{repo_labels}}} {checksum_digest_present}", + ] + ) + + all_expected_ok = int( + root_exists == 1 + and manifest_present == 1 + and bundle_fresh == 1 + and expected_repos + and missing_count == 0 + and failed_count == 0 + and checksum_missing_count == 0 + ) + lines.extend( + [ + f"awoooi_gitea_bundle_expected_repo_count{{{labels}}} {len(expected_repos)}", + f"awoooi_gitea_bundle_expected_repo_missing_count{{{labels}}} {missing_count}", + f"awoooi_gitea_bundle_failed_repo_count{{{labels}}} {failed_count}", + f"awoooi_gitea_bundle_checksum_missing_count{{{labels}}} {checksum_missing_count}", + f"awoooi_gitea_bundle_all_expected_ok{{{labels}}} {all_expected_ok}", + ] + ) + return lines, all_expected_ok + + def _read_backup_110_timestamp() -> int: candidates = [ Path("/home/ollama/node_exporter_textfiles/backup.prom"), @@ -759,6 +929,42 @@ def _base_lines(host: str) -> list[str]: "# TYPE awoooi_backup_cron_singular_entry_count gauge", "# HELP awoooi_backup_cron_singular_entry_ok Whether a backup/offsite singleton cron pattern has exactly one active entry.", "# TYPE awoooi_backup_cron_singular_entry_ok gauge", + "# HELP awoooi_gitea_bundle_root_exists Whether the 188 Gitea private bundle backup root exists.", + "# TYPE awoooi_gitea_bundle_root_exists gauge", + "# HELP awoooi_gitea_bundle_manifest_present Whether the 188 Gitea private bundle manifest exists.", + "# TYPE awoooi_gitea_bundle_manifest_present gauge", + "# HELP awoooi_gitea_bundle_newest_timestamp Unix timestamp of newest file under the 188 Gitea private bundle backup root.", + "# TYPE awoooi_gitea_bundle_newest_timestamp gauge", + "# HELP awoooi_gitea_bundle_age_seconds Age of newest 188 Gitea private bundle backup evidence.", + "# TYPE awoooi_gitea_bundle_age_seconds gauge", + "# HELP awoooi_gitea_bundle_fresh Whether the 188 Gitea private bundle backup evidence is fresh.", + "# TYPE awoooi_gitea_bundle_fresh gauge", + "# HELP awoooi_gitea_bundle_file_count Number of files scanned under the 188 Gitea private bundle backup root.", + "# TYPE awoooi_gitea_bundle_file_count gauge", + "# HELP awoooi_gitea_bundle_expected_repo_info Expected Gitea repository that must have a private bundle backup.", + "# TYPE awoooi_gitea_bundle_expected_repo_info gauge", + "# HELP awoooi_gitea_bundle_repo_status_info Manifest or fallback status for an expected Gitea private bundle backup.", + "# TYPE awoooi_gitea_bundle_repo_status_info gauge", + "# HELP awoooi_gitea_bundle_repo_present Whether an expected Gitea repository bundle file exists.", + "# TYPE awoooi_gitea_bundle_repo_present gauge", + "# HELP awoooi_gitea_bundle_repo_ok Whether an expected Gitea repository has a present bundle and checksum evidence.", + "# TYPE awoooi_gitea_bundle_repo_ok gauge", + "# HELP awoooi_gitea_bundle_repo_head_count Head count recorded in the Gitea private bundle manifest.", + "# TYPE awoooi_gitea_bundle_repo_head_count gauge", + "# HELP awoooi_gitea_bundle_checksum_present Whether checksum evidence exists for an expected Gitea private bundle.", + "# TYPE awoooi_gitea_bundle_checksum_present gauge", + "# HELP awoooi_gitea_bundle_checksum_digest_present Whether checksum evidence contains a SHA-256 digest.", + "# TYPE awoooi_gitea_bundle_checksum_digest_present gauge", + "# HELP awoooi_gitea_bundle_expected_repo_count Number of expected Gitea private bundle repositories.", + "# TYPE awoooi_gitea_bundle_expected_repo_count gauge", + "# HELP awoooi_gitea_bundle_expected_repo_missing_count Number of expected Gitea private bundle repositories with no bundle file.", + "# TYPE awoooi_gitea_bundle_expected_repo_missing_count gauge", + "# HELP awoooi_gitea_bundle_failed_repo_count Number of expected Gitea private bundle repositories without complete bundle/checksum evidence.", + "# TYPE awoooi_gitea_bundle_failed_repo_count gauge", + "# HELP awoooi_gitea_bundle_checksum_missing_count Number of expected Gitea private bundle repositories without checksum evidence.", + "# TYPE awoooi_gitea_bundle_checksum_missing_count gauge", + "# HELP awoooi_gitea_bundle_all_expected_ok Whether all expected Gitea private bundle repositories are present and fresh.", + "# TYPE awoooi_gitea_bundle_all_expected_ok gauge", "# HELP awoooi_velero_monitor_up Whether the backup health exporter can query Velero via a reachable K3s kubectl endpoint.", "# TYPE awoooi_velero_monitor_up gauge", "# HELP awoooi_velero_schedule_count Number of Velero schedules in the velero namespace.", @@ -939,15 +1145,17 @@ def _collect_188(host: str) -> list[str]: sample_count=gitea_mirror_count, ) ) + gitea_bundle_lines, gitea_bundle_ok = _gitea_bundle_metric_lines(host) + lines.extend(gitea_bundle_lines) coverage_labels = ( f'host="{_escape_label(host)}",' 'domain="service",' - 'required_jobs="backup_from_110,gitea_repo_mirror_from_110"' + 'required_jobs="backup_from_110,gitea_repo_mirror_from_110,gitea_private_bundle_completeness"' ) lines.append(f"awoooi_backup_coverage_domain_expected_info{{{coverage_labels}}} 1") lines.append( "awoooi_backup_coverage_domain_fresh" - f"{{{coverage_labels}}} {1 if gitea_mirror_fresh else 0}" + f"{{{coverage_labels}}} {1 if gitea_mirror_fresh and gitea_bundle_ok else 0}" ) momo_ts = _newest_file_timestamp([ "/home/ollama/momo_backups/*.sql.gz", diff --git a/scripts/ops/tests/test_backup_health_textfile_exporter.py b/scripts/ops/tests/test_backup_health_textfile_exporter.py new file mode 100644 index 00000000..ce513ee7 --- /dev/null +++ b/scripts/ops/tests/test_backup_health_textfile_exporter.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +SCRIPT_ROOT = Path(__file__).resolve().parents[1] +EXPORTER_PATH = SCRIPT_ROOT / "backup-health-textfile-exporter.py" + + +def load_exporter(): + spec = importlib.util.spec_from_file_location("backup_health_textfile_exporter", EXPORTER_PATH) + assert spec and spec.loader + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def test_gitea_bundle_metrics_require_all_expected_repos(tmp_path: Path, monkeypatch) -> None: + exporter = load_exporter() + bundle_root = tmp_path / "latest-private-complete" + bundle_root.mkdir() + monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root) + monkeypatch.setattr(exporter, "GITEA_BUNDLE_MAX_AGE_HOURS", 25) + monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi,wooo/tsenyang-website") + + (bundle_root / "manifest.remote.tsv").write_text( + "\n".join( + [ + "repo\tstatus\thead_count\tbundle\tchecksum", + f"wooo/awoooi\tok\t2\t{bundle_root / 'wooo__awoooi.bundle'}\t{bundle_root / 'wooo__awoooi.bundle.sha256'}", + ] + ) + + "\n", + encoding="utf-8", + ) + (bundle_root / "wooo__awoooi.bundle").write_text("bundle", encoding="utf-8") + (bundle_root / "wooo__awoooi.bundle.sha256").write_text( + "a" * 64 + " wooo__awoooi.bundle\n", + encoding="utf-8", + ) + (bundle_root / "tsenyang-website-local-20260701.bundle").write_text("bundle", encoding="utf-8") + (bundle_root / "tsenyang-website-local-20260701.bundle.sha256").write_text( + "b" * 64 + " tsenyang-website-local-20260701.bundle\n", + encoding="utf-8", + ) + + metrics, all_ok = exporter._gitea_bundle_metric_lines("188") + + rendered = "\n".join(metrics) + assert all_ok == 1 + assert 'awoooi_gitea_bundle_expected_repo_count{host="188"' in rendered + assert 'awoooi_gitea_bundle_expected_repo_missing_count{host="188"' in rendered + assert 'awoooi_gitea_bundle_failed_repo_count{host="188"' in rendered + assert 'repo="wooo/tsenyang-website",status="bundle_file_only"' in rendered + assert 'awoooi_gitea_bundle_all_expected_ok{host="188"' in rendered + assert rendered.rstrip().endswith(" 1") + + +def test_gitea_bundle_metrics_fail_when_checksum_missing(tmp_path: Path, monkeypatch) -> None: + exporter = load_exporter() + bundle_root = tmp_path / "latest-private-complete" + bundle_root.mkdir() + monkeypatch.setattr(exporter, "GITEA_BUNDLE_ROOT", bundle_root) + monkeypatch.setenv("AIOPS_GITEA_BUNDLE_EXPECTED_REPOS", "wooo/awoooi") + + (bundle_root / "manifest.remote.tsv").write_text( + "\n".join( + [ + "repo\tstatus\thead_count\tbundle\tchecksum", + f"wooo/awoooi\tok\t2\t{bundle_root / 'wooo__awoooi.bundle'}\t{bundle_root / 'wooo__awoooi.bundle.sha256'}", + ] + ) + + "\n", + encoding="utf-8", + ) + (bundle_root / "wooo__awoooi.bundle").write_text("bundle", encoding="utf-8") + + metrics, all_ok = exporter._gitea_bundle_metric_lines("188") + + rendered = "\n".join(metrics) + assert all_ok == 0 + assert 'awoooi_gitea_bundle_checksum_missing_count{host="188"' in rendered + assert rendered.rstrip().endswith(" 0")