fix(ops): recover backup core after reboot [skip ci]

This commit is contained in:
ogt
2026-06-27 03:06:42 +08:00
parent 6e6e9fa746
commit 8fdcc0194f
7 changed files with 184 additions and 13 deletions

View File

@@ -5,13 +5,20 @@ from __future__ import annotations
import argparse
import json
import re
import sys
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any
import yaml
try:
import yaml
except ModuleNotFoundError: # pragma: no cover - exercised on lean operator hosts
yaml = None
YAML_ERROR_TYPES: tuple[type[BaseException], ...] = ()
else:
YAML_ERROR_TYPES = (yaml.YAMLError,)
DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml")
@@ -24,7 +31,99 @@ class ContractError(RuntimeError):
pass
# Exception types that main() reports as RECOVERY_SCORECARD_CONTRACT_FAILED
# (exit status 1). YAML_ERROR_TYPES is an empty tuple when PyYAML could not be
# imported, so this tuple stays valid on hosts without the yaml package.
RECOVERABLE_ERRORS = (ContractError, OSError, json.JSONDecodeError) + YAML_ERROR_TYPES
# Line-oriented patterns used by the fallback parsers that run when PyYAML is
# unavailable. Each one captures the line's leading whitespace as "indent".
# A "- record: <name>" list entry; "record" is the rule name text.
_RECORD_RE = re.compile(r"^(?P<indent>\s*)-\s+record:\s*(?P<record>.+?)\s*$")
# The first line of any rule entry, recording ("- record:") or alerting ("- alert:").
_RULE_START_RE = re.compile(r"^(?P<indent>\s*)-\s+(?:record|alert):\s*.+$")
# An "expr:" key; "tail" is whatever follows the colon (possibly a block marker).
_EXPR_RE = re.compile(r"^(?P<indent>\s*)expr:\s*(?P<tail>.*)$")
# The "prometheus_recording_rules:" key with nothing after the colon.
_PROM_RULES_RE = re.compile(r"^(?P<indent>\s*)prometheus_recording_rules:\s*$")
# A "- <value>" sequence entry.
_LIST_ITEM_RE = re.compile(r"^(?P<indent>\s*)-\s+(?P<value>.+?)\s*$")
def _strip_yaml_scalar(value: str) -> str:
return value.strip().strip('"').strip("'")
def _indent_width(line: str) -> int:
return len(line) - len(line.lstrip(" "))
def _fallback_rules(path: Path) -> list[dict[str, Any]]:
    """Collect recording rules from *path* with a line scan instead of a YAML parser.

    Every ``- record: <name>`` line starts a rule. The lines after it, up to
    the next rule entry at the same or a shallower indent, are searched for
    ``expr:`` keys; a later ``expr:`` inside the same rule overwrites an
    earlier one. An inline expression is unquoted with ``_strip_yaml_scalar``.
    A literal block (``|``, ``|-``, ``|+``) is stored as its raw lines joined
    with newlines, indentation included.

    Args:
        path: Rules file to read as UTF-8 text.

    Returns:
        One dict per rule with a ``"record"`` key and, when an expression was
        found, an ``"expr"`` key.

    Raises:
        ContractError: If the file contains no ``- record:`` entries.
    """
    text_lines = path.read_text(encoding="utf-8").splitlines()
    total = len(text_lines)
    block_markers = ("|", "|-", "|+")
    found: list[dict[str, Any]] = []

    def starts_sibling_rule(line: str, limit: int) -> bool:
        # True when the line opens a rule entry indented no deeper than limit.
        hit = _RULE_START_RE.match(line)
        return hit is not None and len(hit.group("indent")) <= limit

    cursor = 0
    while cursor < total:
        header = _RECORD_RE.match(text_lines[cursor])
        cursor += 1
        if header is None:
            continue

        rule_indent = len(header.group("indent"))
        entry: dict[str, Any] = {"record": _strip_yaml_scalar(header.group("record"))}

        # Walk the body of this rule until the next sibling rule begins.
        while cursor < total and not starts_sibling_rule(text_lines[cursor], rule_indent):
            expr_line = _EXPR_RE.match(text_lines[cursor])
            cursor += 1
            if expr_line is None:
                continue

            value = expr_line.group("tail").strip()
            if value not in block_markers:
                entry["expr"] = _strip_yaml_scalar(value)
                continue

            # Literal block: gather lines until a sibling rule or a non-blank
            # line indented no deeper than the "expr:" key itself.
            key_indent = len(expr_line.group("indent"))
            body: list[str] = []
            while cursor < total:
                candidate = text_lines[cursor]
                if starts_sibling_rule(candidate, rule_indent):
                    break
                if candidate.strip() and _indent_width(candidate) <= key_indent:
                    break
                body.append(candidate)
                cursor += 1
            entry["expr"] = "\n".join(body)

        found.append(entry)

    if not found:
        raise ContractError(f"missing recording rules in {path}")
    return found
def _fallback_expected_recording_rules(path: Path) -> list[str]:
    """Read the ``prometheus_recording_rules`` list from *path* without a YAML parser.

    The file is scanned for a ``prometheus_recording_rules:`` key. The
    ``- item`` lines that follow it are collected until the enclosing block
    ends. Items may be indented deeper than the key or sit at the key's own
    column, since YAML allows a block sequence to share its parent key's
    indentation. Blank lines and comment-only lines are skipped wherever
    they appear. The first key occurrence that yields at least one item wins.

    Args:
        path: Contract file to read as UTF-8 text.

    Returns:
        The unquoted list item values, in file order.

    Raises:
        ContractError: If no key occurrence has any list items.
    """
    lines = path.read_text(encoding="utf-8").splitlines()
    for index, line in enumerate(lines):
        key_match = _PROM_RULES_RE.match(line)
        if not key_match:
            continue
        key_indent = len(key_match.group("indent"))
        rules: list[str] = []
        for child in lines[index + 1 :]:
            stripped = child.strip()
            # Blank and comment-only lines never terminate a YAML block.
            if not stripped or stripped.startswith("#"):
                continue
            child_indent = _indent_width(child)
            item_match = _LIST_ITEM_RE.match(child)
            if child_indent < key_indent:
                break
            # At the key's own column only "- item" lines belong to the key;
            # anything else there is a sibling key that ends the list.
            if child_indent == key_indent and not item_match:
                break
            if item_match:
                rules.append(_strip_yaml_scalar(item_match.group("value")))
        if rules:
            return rules
    raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}")
def _rules(path: Path) -> list[dict[str, Any]]:
if yaml is None:
return _fallback_rules(path)
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
rules: list[dict[str, Any]] = []
for group in data.get("groups") or []:
@@ -33,6 +132,8 @@ def _rules(path: Path) -> list[dict[str, Any]]:
def _expected_recording_rules(path: Path) -> list[str]:
if yaml is None:
return _fallback_expected_recording_rules(path)
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
rules = data.get("monitoring_contract", {}).get("prometheus_recording_rules") or []
if not rules:
@@ -136,7 +237,7 @@ def main() -> int:
args.expect_dr_ready,
):
print(line)
except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc:
except RECOVERABLE_ERRORS as exc:
print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr)
return 1