fix(backup): restore reboot freshness readback
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m1s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 2m1s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
This commit is contained in:
@@ -15,37 +15,223 @@ source "$(dirname "$0")/common.sh"
|
||||
# ---------------------------------------------------------------------------
# AWOOOI frequent-backup configuration.
# ---------------------------------------------------------------------------

# Service label passed to notify_clawbot when reporting backup status.
SERVICE="awoooi-frequent"
# Host reached over SSH (as user "ollama") to run psql / pg_dump.
AWOOOI_HOST="192.168.0.188"

# Connection defaults. load_database_config overwrites all four values from
# the resolved DATABASE_URL before any dump is attempted.
AWOOOI_DB_USER="awoooi"
# The password is never hard-coded here: it is taken from the environment if
# present, otherwise left empty until load_database_config fills it in.
AWOOOI_DB_PASS="${AWOOOI_DB_PASS:-}"
AWOOOI_DB_HOST="localhost"
AWOOOI_DB_PORT="5432"

# Restic repository and scratch directory for the SQL dump.
# BACKUP_BASE is not defined in this file (presumably provided by common.sh).
LOCAL_REPO="${BACKUP_BASE}/awoooi"
DUMP_DIR="/tmp/awoooi-freq-backup-$$"

# Kubernetes hosts / secret used by resolve_database_url to look up the
# database URL. Every value can be overridden from the environment.
AWOOOI_K8S_HOST="${AWOOOI_K8S_HOST:-192.168.0.120}"
AWOOOI_K8S_HOSTS="${AWOOOI_K8S_HOSTS:-${AWOOOI_K8S_HOST} 192.168.0.121 192.168.0.125}"
AWOOOI_K8S_SECRET_NAME="${AWOOOI_K8S_SECRET_NAME:-awoooi-secrets}"
AWOOOI_K8S_NAMESPACE="${AWOOOI_K8S_NAMESPACE:-awoooi-prod}"
# Secret keys tried in order; the first one that decodes to a value wins.
AWOOOI_K8S_DATABASE_URL_KEYS="${AWOOOI_K8S_DATABASE_URL_KEYS:-AWOOOI_BACKUP_DATABASE_URL BACKUP_DATABASE_URL DATABASE_URL}"

# Pending "re-enable FORCE ROW LEVEL SECURITY" work, consumed by
# restore_force_rls. Both are empty when nothing needs restoring.
FORCE_RLS_RESTORE_SQL=""
FORCE_RLS_RESTORE_DB=""

# High-frequency backup retention policy.
KEEP_HOURLY=28 # keep 7 days of 6-hourly snapshots (7*4=28)
KEEP_DAILY=30
KEEP_WEEKLY=12
KEEP_MONTHLY=24
# 2026-05-19 ogt + Codex: retention policy is delegated to common.sh.
# The default there is latest-only keep-last=1, so that high-frequency DB
# snapshots do not pile up.
|
||||
|
||||
#######################################
# Print the database URL to use for the backup.
# Lookup order:
#   1. $AWOOOI_DATABASE_URL
#   2. $DATABASE_URL
#   3. each key of AWOOOI_K8S_DATABASE_URL_KEYS in the Kubernetes secret
#      AWOOOI_K8S_SECRET_NAME / AWOOOI_K8S_NAMESPACE, queried over SSH on
#      each host of AWOOOI_K8S_HOSTS (first non-empty decoded value wins).
# Outputs: the URL on stdout (this function never logs it).
# Returns: 0 when a URL was found, 1 otherwise.
#######################################
resolve_database_url() {
    local candidate
    for candidate in "${AWOOOI_DATABASE_URL:-}" "${DATABASE_URL:-}"; do
        if [ -n "${candidate}" ]; then
            printf '%s\n' "${candidate}"
            return 0
        fi
    done

    # 2026-07-01 ogt + Codex: prefer the dedicated backup DB URL and fall back
    # to the runtime DATABASE_URL only when it is absent. The secret value is
    # only handed to the caller on stdout; it is never written to the log.
    local k8s_host key encoded decoded remote_cmd
    # Both lists are intentionally word-split: they are space-separated.
    for k8s_host in ${AWOOOI_K8S_HOSTS}; do
        for key in ${AWOOOI_K8S_DATABASE_URL_KEYS}; do
            # Try passwordless sudo first, then plain kubectl.
            remote_cmd="sudo -n kubectl get secret ${AWOOOI_K8S_SECRET_NAME} -n ${AWOOOI_K8S_NAMESPACE} -o jsonpath='{.data.${key}}' 2>/dev/null || kubectl get secret ${AWOOOI_K8S_SECRET_NAME} -n ${AWOOOI_K8S_NAMESPACE} -o jsonpath='{.data.${key}}'"
            # -n keeps ssh from draining the caller's stdin inside this loop.
            # Lookup failures are deliberately ignored: the next key/host is tried.
            encoded="$(ssh -n -o BatchMode=yes -o StrictHostKeyChecking=accept-new -o ConnectTimeout=8 \
                "wooo@${k8s_host}" "${remote_cmd}" 2>/dev/null || true)"
            [ -n "${encoded}" ] || continue

            decoded="$(printf '%s' "${encoded}" | base64 -d 2>/dev/null || true)"
            if [ -n "${decoded}" ]; then
                printf '%s\n' "${decoded}"
                return 0
            fi
        done
    done
    return 1
}
|
||||
|
||||
#######################################
# Resolve the database URL and split it into connection settings.
# Globals written: AWOOOI_DB_USER, AWOOOI_DB_PASS, AWOOOI_DB_HOST,
#                  AWOOOI_DB_PORT
# Returns: 0 on success; 1 when no URL could be resolved or the URL could
#          not be parsed (the globals are left untouched in that case).
#######################################
load_database_config() {
    local database_url assignments
    database_url="$(resolve_database_url || true)"
    if [ -z "${database_url}" ]; then
        log_error "無法解析 AWOOOI DATABASE_URL;拒絕使用舊硬編密碼"
        return 1
    fi

    # The Python script arrives on stdin (heredoc) and the URL on fd 3
    # (here-string), so the URL is never passed as a command-line argument.
    # The output is captured first so that a python3 failure (for example an
    # invalid port in the URL) is detected instead of being hidden by eval.
    assignments="$(
python3 - 3<<<"${database_url}" <<'PY'
import shlex
from urllib.parse import unquote, urlparse

with open(3) as source:
    url = source.read().strip()
parsed = urlparse(url)

values = {
    "AWOOOI_DB_USER": unquote(parsed.username or "awoooi"),
    "AWOOOI_DB_PASS": unquote(parsed.password or ""),
    "AWOOOI_DB_HOST": parsed.hostname or "localhost",
    "AWOOOI_DB_PORT": str(parsed.port or 5432),
}
for key, value in values.items():
    print(f"{key}={shlex.quote(value)}")
PY
)" || {
        log_error "AWOOOI DATABASE_URL 解析失敗"
        return 1
    }
    if [ -z "${assignments}" ]; then
        log_error "AWOOOI DATABASE_URL 解析失敗"
        return 1
    fi

    # Safe to eval: every value was quoted with shlex.quote and the keys are
    # the four fixed names above.
    eval "${assignments}"
}
|
||||
|
||||
#######################################
# Quote one value so it can be embedded in a command line that a remote
# shell will parse.
# Uses plain single-quote escaping ('...' with embedded ' written as '\'')
# rather than printf %q: %q renders newlines and control characters as
# $'...', which only bash-compatible shells understand, and callers pass
# multi-line SQL through this function.
# Arguments: $1 - value to quote
# Outputs:   the quoted value on stdout (no trailing newline)
#######################################
quote_remote() {
    local value="${1-}"
    local sq="'"
    local esc="'\\''"
    printf "'%s'" "${value//${sq}/${esc}}"
}
|
||||
|
||||
#######################################
# Escape one field for a PostgreSQL password file (.pgpass) line.
# Backslashes are doubled first, then colons are prefixed with a backslash;
# the order matters so the backslashes added for colons are not doubled.
# Arguments: $1 - raw field value
# Outputs:   escaped value on stdout (no trailing newline)
#######################################
pgpass_escape() {
    local value="${1-}"
    value="${value//\\/\\\\}"
    value="${value//:/\\:}"
    printf '%s' "${value}"
}
|
||||
|
||||
#######################################
# Print one .pgpass line (host:port:database:user:password) for the given
# database, with every field escaped by pgpass_escape.
# Globals read: AWOOOI_DB_HOST, AWOOOI_DB_PORT, AWOOOI_DB_USER,
#               AWOOOI_DB_PASS
# Arguments:    $1 - database name
# Outputs:      the line on stdout, newline-terminated
#######################################
pgpass_line() {
    local database="$1"
    local field
    local -a fields=()

    for field in \
        "${AWOOOI_DB_HOST}" \
        "${AWOOOI_DB_PORT}" \
        "${database}" \
        "${AWOOOI_DB_USER}" \
        "${AWOOOI_DB_PASS}"; do
        fields+=("$(pgpass_escape "${field}")")
    done

    printf '%s:%s:%s:%s:%s\n' "${fields[@]}"
}
|
||||
|
||||
#######################################
# Print the psql command prefix used on the remote host for one database.
# --no-password keeps psql from prompting, and ON_ERROR_STOP=1 makes psql
# stop at the first failing statement.
# Globals read: AWOOOI_DB_USER, AWOOOI_DB_HOST, AWOOOI_DB_PORT
# Arguments:    $1 - database name
# Outputs:      the command string on stdout (no trailing newline); every
#               value is passed through quote_remote
#######################################
remote_psql_command() {
    local database="$1"
    local user_q host_q port_q db_q

    user_q="$(quote_remote "${AWOOOI_DB_USER}")"
    host_q="$(quote_remote "${AWOOOI_DB_HOST}")"
    port_q="$(quote_remote "${AWOOOI_DB_PORT}")"
    db_q="$(quote_remote "${database}")"

    printf 'psql --no-password -U %s -h %s -p %s -d %s -v ON_ERROR_STOP=1' \
        "${user_q}" "${host_q}" "${port_q}" "${db_q}"
}
|
||||
|
||||
#######################################
# Wrap a remote command so that it runs with a temporary PGPASSFILE.
# The emitted script, executed by the remote shell:
#   - sets umask 077 and creates a temp file with mktemp,
#   - removes that file on EXIT/HUP/INT/TERM,
#   - fills it from its own stdin (the .pgpass line),
#   - runs the command with PGPASSFILE pointing at it and PGOPTIONS
#     disabling statement_timeout and parallel gather workers.
# Arguments: $1 - command to run remotely (already shell-quoted)
# Outputs:   the wrapper script on stdout (no trailing newline)
#######################################
remote_pgpass_wrapper() {
    local command="${1-}"
    # Single-quoted on purpose: every $... below must be expanded by the
    # remote shell, not by this one.
    local prelude='umask 077; pgpass=$(mktemp "${TMPDIR:-/tmp}/awoooi-pgpass.XXXXXX") || exit 1; cleanup() { rm -f "$pgpass"; }; trap cleanup EXIT HUP INT TERM; cat > "$pgpass"; PGOPTIONS="-c statement_timeout=0 -c max_parallel_workers_per_gather=0" PGPASSFILE="$pgpass" '

    printf '%s%s' "${prelude}" "${command}"
}
|
||||
|
||||
#######################################
# Run a command on the database host with a temporary .pgpass file.
# The .pgpass line is sent over ssh's stdin, so the password never appears
# in a command line.
# Globals read: AWOOOI_HOST
# Arguments:    $1 - database name (used for the .pgpass line)
#               $2 - remote command, already shell-quoted
# Outputs:      whatever the remote command writes to stdout/stderr
# Returns:      exit status of the pipeline (ssh / remote command)
#######################################
run_remote_pgpass_command() {
    local database="$1"
    local command="$2"
    local wrapped

    wrapped="$(remote_pgpass_wrapper "${command}")"
    # BatchMode matches the other ssh call in this script: fail instead of
    # blocking on an interactive prompt when key authentication is unavailable.
    pgpass_line "${database}" \
        | ssh -o BatchMode=yes "ollama@${AWOOOI_HOST}" "${wrapped}"
}
|
||||
|
||||
#######################################
# Print the short id of the most recent restic snapshot in LOCAL_REPO.
# `snapshots --latest 1 --json` may list more than one row, so the row with
# the greatest "time" value is selected.
# Globals read: LOCAL_REPO, RESTIC_PASSWORD_FILE
# Outputs:      the short id, or "unknown" when the list is empty or the
#               lookup/parsing fails
#######################################
latest_restic_snapshot_id() {
    local pick_latest='import json,sys; rows=json.load(sys.stdin); row=max(rows,key=lambda r: r.get("time","")) if rows else {}; print(row.get("short_id","unknown"))'

    # Errors are silenced on purpose: the id is informational only and any
    # failure degrades to "unknown".
    restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
        --password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null \
        | python3 -c "${pick_latest}" 2>/dev/null \
        || echo "unknown"
}
|
||||
|
||||
#######################################
# Print one ALTER TABLE statement per table that currently has FORCE ROW
# LEVEL SECURITY set and is owned by the connecting role.
# Arguments: $1 - database name
#            $2 - "NO FORCE" or "FORCE": the clause written into each
#                 generated statement
# Outputs:   statements on stdout, one per line (psql -At), sorted
# Returns:   2 for an unsupported mode, otherwise the status of the remote
#            psql call
#######################################
collect_force_rls_sql() {
    local database="$1"
    local mode="$2"
    local query

    # mode is interpolated into SQL text, so only the two known clauses are
    # accepted.
    case "${mode}" in
        "FORCE" | "NO FORCE") ;;
        *)
            printf 'collect_force_rls_sql: unsupported mode: %s\n' "${mode}" >&2
            return 2
            ;;
    esac

    # relkind 'r' = ordinary table, 'p' = partitioned table.
    query="
select format('ALTER TABLE %I.%I ${mode} ROW LEVEL SECURITY;', n.nspname, c.relname)
from pg_class c
join pg_namespace n on n.oid = c.relnamespace
where c.relkind in ('r', 'p')
  and c.relforcerowsecurity
  and pg_get_userbyid(c.relowner) = current_user
order by 1;
"
    run_remote_pgpass_command "${database}" \
        "$(remote_psql_command "${database}") -At -c $(quote_remote "${query}")"
}
|
||||
|
||||
#######################################
# Execute SQL on the remote database through psql -c, discarding stdout.
# Arguments: $1 - database name
#            $2 - SQL text; when empty nothing is executed
# Returns:   0 when there is nothing to run, otherwise the status of the
#            remote psql call
#######################################
apply_remote_sql() {
    local database="$1"
    local sql="$2"

    if [ -z "${sql}" ]; then
        return 0
    fi

    run_remote_pgpass_command "${database}" \
        "$(remote_psql_command "${database}") -c $(quote_remote "${sql}") >/dev/null"
}
|
||||
|
||||
#######################################
# Re-apply the pending FORCE ROW LEVEL SECURITY statements, if any.
# Also installed as the script's EXIT trap.
# Globals: FORCE_RLS_RESTORE_DB, FORCE_RLS_RESTORE_SQL (read; cleared only
#          after a successful restore, so a failed attempt stays pending
#          for the next call)
# Returns: 0 when nothing is pending or the restore succeeded, 1 when the
#          restore failed
#######################################
restore_force_rls() {
    # Nothing to do unless both the target database and the SQL are recorded.
    if [ -z "${FORCE_RLS_RESTORE_DB}" ] || [ -z "${FORCE_RLS_RESTORE_SQL}" ]; then
        return 0
    fi

    if ! apply_remote_sql "${FORCE_RLS_RESTORE_DB}" "${FORCE_RLS_RESTORE_SQL}"; then
        log_error "FORCE ROW LEVEL SECURITY 恢復失敗 (${FORCE_RLS_RESTORE_DB})"
        return 1
    fi

    log_info "FORCE ROW LEVEL SECURITY 已恢復 (${FORCE_RLS_RESTORE_DB})"
    FORCE_RLS_RESTORE_DB=""
    FORCE_RLS_RESTORE_SQL=""
}
|
||||
|
||||
trap restore_force_rls EXIT
|
||||
|
||||
#######################################
# Dump one database with pg_dump while FORCE ROW LEVEL SECURITY is
# temporarily switched off on the affected tables.
# Steps: collect the tables that have FORCE RLS, record the statements that
# re-enable it (so restore_force_rls / the EXIT trap can undo the change),
# switch it off, run pg_dump, then restore it.
# Globals written: FORCE_RLS_RESTORE_DB, FORCE_RLS_RESTORE_SQL
# Arguments: $1 - database name
#            $2 - output file for the dump; pg_dump stderr is written to
#                 "<output file>.stderr" and removed when the dump succeeds
# Returns:   pg_dump's status when the dump failed; otherwise the status of
#            restore_force_rls (non-zero means FORCE RLS is still off)
#######################################
dump_database_with_rls_guard() {
    local database="$1"
    local output_file="$2"
    local stderr_file="${output_file}.stderr"
    local noforce_sql force_sql table_count dump_cmd
    local dump_rc=0
    local restore_rc=0

    noforce_sql="$(collect_force_rls_sql "${database}" "NO FORCE")"
    # Anchor the rewrite to the end of each statement so an identifier that
    # happens to contain "NO FORCE" is left untouched.
    force_sql="$(printf '%s\n' "${noforce_sql}" \
        | sed 's/ NO FORCE ROW LEVEL SECURITY;$/ FORCE ROW LEVEL SECURITY;/')"

    if [ -n "${noforce_sql}" ]; then
        # Record the restore work before changing anything, so the EXIT trap
        # can re-enable FORCE RLS even if the script dies mid-dump.
        FORCE_RLS_RESTORE_DB="${database}"
        FORCE_RLS_RESTORE_SQL="${force_sql}"
        table_count="$(printf '%s\n' "${noforce_sql}" | awk 'NF {count++} END {print count+0}')"
        log_info "暫時解除 FORCE RLS 以完成完整 pg_dump (${database}, tables=${table_count})"
        apply_remote_sql "${database}" "${noforce_sql}"
    fi

    dump_cmd="$(printf 'pg_dump --no-password -U %s -h %s -p %s %s' \
        "$(quote_remote "${AWOOOI_DB_USER}")" \
        "$(quote_remote "${AWOOOI_DB_HOST}")" \
        "$(quote_remote "${AWOOOI_DB_PORT}")" \
        "$(quote_remote "${database}")")"

    # Capture the status with "||" instead of toggling "set +e/-e", which
    # would force errexit on for the caller afterwards.
    run_remote_pgpass_command "${database}" "${dump_cmd}" \
        >"${output_file}" 2>"${stderr_file}" || dump_rc=$?

    # Always try to restore, whether or not the dump succeeded.
    restore_force_rls || restore_rc=$?

    if [ "${dump_rc}" -ne 0 ]; then
        log_error "${database} dump 失敗,pg_dump stderr 尾端如下(已避免輸出 credential):"
        tail -40 "${stderr_file}" | sed -E 's/(password=)[^ ]+/\1REDACTED/g' || true
        return "${dump_rc}"
    fi

    rm -f "${stderr_file}"
    # A valid dump with FORCE RLS still disabled must not be reported as a
    # clean success; restore_force_rls has already logged the failure.
    return "${restore_rc}"
}
|
||||
|
||||
main() {
|
||||
local start_time=$(date +%s)
|
||||
|
||||
log_info "========== AWOOOI 高頻備份 ($(date '+%H:%M')) =========="
|
||||
mkdir -p "${DUMP_DIR}"
|
||||
load_database_config || {
|
||||
notify_clawbot "failed" "${SERVICE}" "AWOOOI 高頻備份失敗:DATABASE_URL 不可用"
|
||||
rm -rf "${DUMP_DIR}"
|
||||
exit 1
|
||||
}
|
||||
|
||||
local timestamp=$(date "+%Y%m%d_%H%M%S")
|
||||
|
||||
# 只備份 awoooi_prod(高頻核心)
|
||||
if ssh ollama@${AWOOOI_HOST} "PGPASSWORD='${AWOOOI_DB_PASS}' pg_dump \
|
||||
-U ${AWOOOI_DB_USER} -h ${AWOOOI_DB_HOST} -p ${AWOOOI_DB_PORT} \
|
||||
awoooi_prod" > "${DUMP_DIR}/awoooi_prod_${timestamp}.sql" 2>&1; then
|
||||
if dump_database_with_rls_guard "awoooi_prod" "${DUMP_DIR}/awoooi_prod_${timestamp}.sql"; then
|
||||
local size=$(du -h "${DUMP_DIR}/awoooi_prod_${timestamp}.sql" | cut -f1)
|
||||
log_success "awoooi_prod dump 完成 (${size})"
|
||||
else
|
||||
local status=$?
|
||||
log_error "awoooi_prod dump 失敗"
|
||||
notify_clawbot "failed" "${SERVICE}" "AWOOOI 高頻備份失敗"
|
||||
rm -rf "${DUMP_DIR}"
|
||||
exit 1
|
||||
exit "${status}"
|
||||
fi
|
||||
|
||||
# Restic 備份(同一倉庫,頻率不同)
|
||||
@@ -54,18 +240,11 @@ main() {
|
||||
--tag "service:awoooi" --tag "freq:6h" \
|
||||
--tag "timestamp:${timestamp}" 2>&1
|
||||
|
||||
local snapshot_id=$(restic -r "${LOCAL_REPO}" snapshots --latest 1 --json \
|
||||
--password-file "${RESTIC_PASSWORD_FILE}" 2>/dev/null | \
|
||||
grep -oP '"short_id":"\K[^"]+' | head -1)
|
||||
local snapshot_id
|
||||
snapshot_id="$(latest_restic_snapshot_id)"
|
||||
log_success "快照: ${snapshot_id}"
|
||||
|
||||
# GFS 清理(加入 hourly 保留)
|
||||
restic -r "${LOCAL_REPO}" forget --prune \
|
||||
--password-file "${RESTIC_PASSWORD_FILE}" \
|
||||
--keep-hourly ${KEEP_HOURLY} \
|
||||
--keep-daily ${KEEP_DAILY} \
|
||||
--keep-weekly ${KEEP_WEEKLY} \
|
||||
--keep-monthly ${KEEP_MONTHLY} 2>&1
|
||||
cleanup_old_backups "${LOCAL_REPO}"
|
||||
|
||||
rm -rf "${DUMP_DIR}"
|
||||
|
||||
|
||||
@@ -643,6 +643,9 @@ def _offsite_and_escrow_metric_lines(host: str) -> list[str]:
|
||||
if not offsite_configured:
|
||||
next_step = "configure_google_drive_rclone_on_110_tty"
|
||||
phase = 1
|
||||
elif escrow_missing_count > 0 and full_fresh:
|
||||
next_step = "complete_credential_escrow_review"
|
||||
phase = 3
|
||||
elif not any_partial_fresh:
|
||||
next_step = "run_small_dry_run_then_partial_sync"
|
||||
phase = 2
|
||||
|
||||
@@ -84,3 +84,33 @@ def test_gitea_bundle_metrics_fail_when_checksum_missing(tmp_path: Path, monkeyp
|
||||
assert all_ok == 0
|
||||
assert 'awoooi_gitea_bundle_checksum_missing_count{host="188"' in rendered
|
||||
assert rendered.rstrip().endswith(" 0")
|
||||
|
||||
|
||||
def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_stale(
    tmp_path: Path, monkeypatch
) -> None:
    """DR phase stays at 3 when the full offsite sync is fresh but the partial one is stale.

    Setup: rclone is configured and B2 is not; the last full rclone success is
    1 hour old and the last partial success is 72 hours old (the metrics use
    max_age_hours="48"); the escrow evidence directory is empty.

    Expected: the full sync is reported fresh, the partial sync stale, five
    escrow items missing, and the DR phase metric is 3 with
    next_step="complete_credential_escrow_review".
    """
    exporter = load_exporter()
    offsite_dir = tmp_path / "offsite"
    escrow_dir = tmp_path / "escrow"
    offsite_dir.mkdir()
    escrow_dir.mkdir()
    # Fixed "current time" so the ages below are deterministic.
    now = 1_782_900_000

    monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", offsite_dir)
    monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", escrow_dir)
    monkeypatch.setattr(exporter.time, "time", lambda: now)
    monkeypatch.setattr(exporter, "_b2_configured", lambda: False)
    monkeypatch.setattr(exporter, "_rclone_configured", lambda: True)
    # Full sync: 1 hour ago. Partial sync: 72 hours ago.
    (offsite_dir / "rclone-last-success").write_text(str(now - 3600), encoding="utf-8")
    (offsite_dir / "rclone-partial-last-success").write_text(str(now - 72 * 3600), encoding="utf-8")

    metrics = exporter._offsite_and_escrow_metric_lines("110")
    rendered = "\n".join(metrics)

    assert 'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1' in rendered
    assert (
        'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0'
        in rendered
    )
    assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered
    assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered
|
||||
|
||||
Reference in New Issue
Block a user