fix(ops): recover backup core after reboot [skip ci]
This commit is contained in:
@@ -5,13 +5,20 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
try:
|
||||
import yaml
|
||||
except ModuleNotFoundError: # pragma: no cover - exercised on lean operator hosts
|
||||
yaml = None
|
||||
YAML_ERROR_TYPES: tuple[type[BaseException], ...] = ()
|
||||
else:
|
||||
YAML_ERROR_TYPES = (yaml.YAMLError,)
|
||||
|
||||
|
||||
DEFAULT_RULES = Path("ops/monitoring/alerts-unified.yml")
|
||||
@@ -24,7 +31,99 @@ class ContractError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
# Exception types reported as a contract failure instead of propagating.
# YAML_ERROR_TYPES is empty when PyYAML could not be imported, so it adds
# nothing to the tuple on hosts without the library.
RECOVERABLE_ERRORS: tuple[type[BaseException], ...] = (
    ContractError,
    OSError,
    json.JSONDecodeError,
    *YAML_ERROR_TYPES,
)
|
||||
_RECORD_RE = re.compile(r"^(?P<indent>\s*)-\s+record:\s*(?P<record>.+?)\s*$")
|
||||
_RULE_START_RE = re.compile(r"^(?P<indent>\s*)-\s+(?:record|alert):\s*.+$")
|
||||
_EXPR_RE = re.compile(r"^(?P<indent>\s*)expr:\s*(?P<tail>.*)$")
|
||||
_PROM_RULES_RE = re.compile(r"^(?P<indent>\s*)prometheus_recording_rules:\s*$")
|
||||
_LIST_ITEM_RE = re.compile(r"^(?P<indent>\s*)-\s+(?P<value>.+?)\s*$")
|
||||
|
||||
|
||||
def _strip_yaml_scalar(value: str) -> str:
|
||||
return value.strip().strip('"').strip("'")
|
||||
|
||||
|
||||
def _indent_width(line: str) -> int:
|
||||
return len(line) - len(line.lstrip(" "))
|
||||
|
||||
|
||||
def _fallback_rules(path: Path) -> list[dict[str, Any]]:
    """Extract recording rules from *path* with a line scanner (no PyYAML).

    Every ``- record:`` entry becomes a dict with a ``"record"`` key and,
    when an ``expr:`` line is found inside the entry, an ``"expr"`` key.
    ``- alert:`` entries are not returned; they only mark where a record
    entry ends.

    A single-line ``expr`` is unquoted with ``_strip_yaml_scalar``.  For the
    block markers ``|``, ``|-`` and ``|+`` the following lines are collected
    verbatim (indentation kept) and joined with newlines.

    Raises:
        ContractError: if the file contains no ``- record:`` entry.
        OSError: if the file cannot be read.
    """
    text_lines = path.read_text(encoding="utf-8").splitlines()
    total = len(text_lines)
    block_markers = ("|", "|-", "|+")
    found: list[dict[str, Any]] = []

    def starts_sibling_rule(line: str, limit: int) -> bool:
        """Tell whether *line* opens a rule indented no deeper than *limit*."""
        opener = _RULE_START_RE.match(line)
        return opener is not None and len(opener.group("indent")) <= limit

    cursor = 0
    while cursor < total:
        header = _RECORD_RE.match(text_lines[cursor])
        cursor += 1
        if header is None:
            continue

        base_indent = len(header.group("indent"))
        entry: dict[str, Any] = {
            "record": _strip_yaml_scalar(header.group("record")),
        }

        # Walk the body of this entry until the next rule that sits at the
        # same or a shallower indent (or the end of the file).
        while cursor < total and not starts_sibling_rule(text_lines[cursor], base_indent):
            expr_line = _EXPR_RE.match(text_lines[cursor])
            cursor += 1
            if expr_line is None:
                continue

            remainder = expr_line.group("tail").strip()
            if remainder not in block_markers:
                # Expression written on the same line as the key.
                entry["expr"] = _strip_yaml_scalar(remainder)
                continue

            # Block scalar: gather lines until a rule boundary or a non-blank
            # line indented no deeper than the "expr:" key itself.
            expr_depth = len(expr_line.group("indent"))
            collected: list[str] = []
            while cursor < total:
                candidate = text_lines[cursor]
                if starts_sibling_rule(candidate, base_indent):
                    break
                if candidate.strip() and _indent_width(candidate) <= expr_depth:
                    break
                collected.append(candidate)
                cursor += 1
            entry["expr"] = "\n".join(collected)

        found.append(entry)

    if not found:
        raise ContractError(f"missing recording rules in {path}")
    return found
|
||||
|
||||
|
||||
def _fallback_expected_recording_rules(path: Path) -> list[str]:
    """Read the ``prometheus_recording_rules`` list from *path* without PyYAML.

    The file is scanned for a ``prometheus_recording_rules:`` key.  The
    non-blank lines after it are examined until one is indented no deeper
    than the key; each ``- value`` item among them that is indented deeper
    than the key is collected, unquoted with ``_strip_yaml_scalar``.  The
    first key occurrence that yields at least one item wins.

    Raises:
        ContractError: if no occurrence of the key yields any item.
        OSError: if the file cannot be read.
    """
    text_lines = path.read_text(encoding="utf-8").splitlines()

    for position, candidate in enumerate(text_lines):
        header = _PROM_RULES_RE.match(candidate)
        if header is None:
            continue

        parent_indent = len(header.group("indent"))
        names: list[str] = []
        for follower in text_lines[position + 1 :]:
            if not follower.strip():
                # Blank lines never terminate the list.
                continue
            if _indent_width(follower) <= parent_indent:
                # Back at (or above) the key's level: the list is over.
                break
            item = _LIST_ITEM_RE.match(follower)
            if item is not None and len(item.group("indent")) > parent_indent:
                names.append(_strip_yaml_scalar(item.group("value")))

        if names:
            return names

    raise ContractError(f"missing monitoring_contract.prometheus_recording_rules in {path}")
|
||||
|
||||
|
||||
def _rules(path: Path) -> list[dict[str, Any]]:
|
||||
if yaml is None:
|
||||
return _fallback_rules(path)
|
||||
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
|
||||
rules: list[dict[str, Any]] = []
|
||||
for group in data.get("groups") or []:
|
||||
@@ -33,6 +132,8 @@ def _rules(path: Path) -> list[dict[str, Any]]:
|
||||
|
||||
|
||||
def _expected_recording_rules(path: Path) -> list[str]:
|
||||
if yaml is None:
|
||||
return _fallback_expected_recording_rules(path)
|
||||
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
|
||||
rules = data.get("monitoring_contract", {}).get("prometheus_recording_rules") or []
|
||||
if not rules:
|
||||
@@ -136,7 +237,7 @@ def main() -> int:
|
||||
args.expect_dr_ready,
|
||||
):
|
||||
print(line)
|
||||
except (ContractError, OSError, yaml.YAMLError, json.JSONDecodeError) as exc:
|
||||
except RECOVERABLE_ERRORS as exc:
|
||||
print(f"RECOVERY_SCORECARD_CONTRACT_FAILED {exc}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user