fix(ops): harden 188 backup exporter recovery [skip ci]

This commit is contained in:
Your Name
2026-06-24 06:37:44 +08:00
parent 2b12f44547
commit 95f442adab
7 changed files with 315 additions and 20 deletions

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env bash
# Bring the PostgreSQL and Redis exporters back up on 192.168.0.188 without
# relying on host networking.
#
# File that must exist on the host:
#   /home/ollama/monitoring/.env.exporters
#
# Variables that must be set:
#   POSTGRES_EXPORTER_DATA_SOURCE_NAME=postgresql://...@192.168.0.188:5432/...?...sslmode=disable
#
# Variables that may be set:
#   REDIS_EXPORTER_ADDR=192.168.0.188:6380
#   REDIS_PASSWORD=
set -euo pipefail

# Paths and image tags are resolved BEFORE the env file is sourced, so only the
# caller's environment (not the env file) can override these four values.
ENV_FILE="${EXPORTER_ENV_FILE:-/home/ollama/monitoring/.env.exporters}"
QUERIES_FILE="${POSTGRES_EXPORTER_QUERIES_FILE:-/home/ollama/monitoring/postgres-exporter-queries.yaml}"
POSTGRES_IMAGE="${POSTGRES_EXPORTER_IMAGE:-prometheuscommunity/postgres-exporter:v0.15.0}"
REDIS_IMAGE="${REDIS_EXPORTER_IMAGE:-oliver006/redis_exporter:v1.58.0}"

# Print a marker line on stderr and stop with exit status 2 (precondition
# not met).
abort_precondition() {
  echo "$*" >&2
  exit 2
}

[[ -f "$ENV_FILE" ]] || abort_precondition "EXPORTER_ENV_FILE_MISSING $ENV_FILE"

# Auto-export everything the env file assigns while it is being sourced.
set -a
# shellcheck disable=SC1090
source "$ENV_FILE"
set +a

[[ -n "${POSTGRES_EXPORTER_DATA_SOURCE_NAME:-}" ]] \
  || abort_precondition "POSTGRES_EXPORTER_DATA_SOURCE_NAME_MISSING"
[[ -f "$QUERIES_FILE" ]] \
  || abort_precondition "POSTGRES_EXPORTER_QUERIES_FILE_MISSING $QUERIES_FILE"

# Fill in the optional Redis settings when neither the caller nor the env file
# supplied a non-empty value.
: "${REDIS_EXPORTER_ADDR:=192.168.0.188:6380}"
: "${REDIS_PASSWORD:=}"
# Recreate both exporter containers from scratch.
#
# Credentials are handed to `docker run` through the docker CLI's own process
# environment (`-e NAME` with no value) rather than as `-e NAME=value`, so the
# PostgreSQL DSN and the Redis password never show up in the command line that
# `ps` exposes on the host while the CLI is running.

# Best-effort removal: either container may legitimately not exist yet.
docker rm -f postgres-exporter redis-exporter >/dev/null 2>&1 || true

# The prefix assignment exports DATA_SOURCE_NAME only for this one docker
# invocation; `-e DATA_SOURCE_NAME` then copies it into the container.
DATA_SOURCE_NAME="$POSTGRES_EXPORTER_DATA_SOURCE_NAME" docker run -d \
  --name postgres-exporter \
  --restart unless-stopped \
  -p 9187:9187 \
  -e DATA_SOURCE_NAME \
  -e PG_EXPORTER_EXTEND_QUERY_PATH=/etc/postgres_exporter/queries.yaml \
  -e PG_EXPORTER_LOG_LEVEL=info \
  -v "$QUERIES_FILE:/etc/postgres_exporter/queries.yaml:ro" \
  "$POSTGRES_IMAGE" >/dev/null

# The Redis command line is assembled as an array so the password flag can be
# added conditionally without any string splicing.
redis_args=(
  docker run -d
  --name redis-exporter
  --restart unless-stopped
  -p 9121:9121
  -e "REDIS_ADDR=$REDIS_EXPORTER_ADDR"
  -e "REDIS_EXPORTER_CHECK_KEYS=awoooi:*"
  -e REDIS_EXPORTER_INCL_SYSTEM_METRICS=true
)
if [[ -n "$REDIS_PASSWORD" ]]; then
  # Name only: the value is taken from the environment supplied on the
  # invocation line below.
  redis_args+=(-e REDIS_PASSWORD)
fi
redis_args+=("$REDIS_IMAGE")
REDIS_PASSWORD="$REDIS_PASSWORD" "${redis_args[@]}" >/dev/null
# Confirm that each exporter serves metrics and reports its backend as up.
#
# The containers were started detached moments ago, so an immediate scrape can
# race the exporter's listener. Each endpoint is therefore polled until the
# metric reads 1 or the attempt budget is spent. The response body is captured
# before awk parses it: piping curl straight into an awk that exits on the
# first match can make curl fail with a write error, and with `pipefail` + `-e`
# any curl failure would abort the script before the *_UP lines are printed.
#
# Optional variables:
#   EXPORTER_READY_ATTEMPTS  polls per exporter (default 15)
#   EXPORTER_READY_INTERVAL  seconds between polls (default 2)
EXPORTER_READY_ATTEMPTS="${EXPORTER_READY_ATTEMPTS:-15}"
EXPORTER_READY_INTERVAL="${EXPORTER_READY_INTERVAL:-2}"

#######################################
# Poll a metrics endpoint for an unlabelled metric until it reads 1.
# Globals:   EXPORTER_READY_ATTEMPTS, EXPORTER_READY_INTERVAL (read)
# Arguments: $1 - metrics URL
#            $2 - metric name (first field of the sample line)
# Outputs:   value seen on the last poll, on stdout (empty if not seen)
# Returns:   always 0; the caller decides whether the value is acceptable
#######################################
read_up_metric() {
  local url=$1 metric=$2
  local body value="" attempt
  for ((attempt = 1; attempt <= EXPORTER_READY_ATTEMPTS; attempt++)); do
    value=""
    # curl's stderr is silenced here because connection refusals are expected
    # while the exporter is still starting.
    if body="$(curl -fsS --max-time 5 "$url" 2>/dev/null)"; then
      value="$(awk -v metric="$metric" '$1 == metric {print $2; exit}' <<<"$body")"
    fi
    if [[ "$value" == "1" ]]; then
      break
    fi
    if ((attempt < EXPORTER_READY_ATTEMPTS)); then
      sleep "$EXPORTER_READY_INTERVAL"
    fi
  done
  printf '%s\n' "$value"
}

pg_up="$(read_up_metric http://127.0.0.1:9187/metrics pg_up)"
redis_up="$(read_up_metric http://127.0.0.1:9121/metrics redis_up)"

echo "POSTGRES_EXPORTER_UP ${pg_up:-missing}"
echo "REDIS_EXPORTER_UP ${redis_up:-missing}"

# On failure, surface the tail of the container log before exiting non-zero.
verify_status=0
if [[ "${pg_up:-0}" != "1" ]]; then
  docker logs --tail=40 postgres-exporter >&2 || true
  verify_status=1
fi
if [[ "${redis_up:-0}" != "1" ]]; then
  docker logs --tail=40 redis-exporter >&2 || true
  verify_status=1
fi
if ((verify_status != 0)); then
  exit "$verify_status"
fi

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Bring back the MinIO endpoint on 188 that Velero uses. On request, also take
# a one-off Velero backup so that backup freshness evidence is re-established.
#
# Every setting below falls back to its default only when the environment
# does not already provide a non-empty value.
set -euo pipefail

# ssh targets.
: "${MINIO_HOST:=ollama@192.168.0.188}"
: "${K3S_HOST:=wooo@192.168.0.120}"
: "${BACKUP_HEALTH_HOST:=wooo@192.168.0.110}"

# Compose files on the MinIO host.
: "${MINIO_COMPOSE_FILE:=/home/ollama/minio/docker-compose.yml}"
: "${MINIO_OVERRIDE_FILE:=/home/ollama/minio/docker-compose.override.yml}"

# Velero namespaces: where Velero's own objects live, and what gets backed up.
: "${VELERO_NAMESPACE:=velero}"
: "${VELERO_TARGET_NAMESPACE:=awoooi-prod}"

# Opt-in switches; a step runs only when its switch is exactly "true".
: "${CREATE_VELERO_BACKUP:=false}"
: "${REFRESH_BACKUP_HEALTH:=false}"

# Default backup name carries a UTC timestamp with minute resolution.
BACKUP_NAME="${VELERO_BACKUP_NAME:-reboot-recovery-$(date -u +%Y%m%d%H%M)}"
# Restore MinIO on the 188 host, then confirm that Velero's default
# BackupStorageLocation reports Available.
#
# Optional variables:
#   VELERO_BSL_WAIT_ATTEMPTS  polls of the storage-location phase (default 24)
#   VELERO_BSL_WAIT_INTERVAL  seconds between polls (default 5)
BSL_WAIT_ATTEMPTS="${VELERO_BSL_WAIT_ATTEMPTS:-24}"
BSL_WAIT_INTERVAL="${VELERO_BSL_WAIT_INTERVAL:-5}"

# Stop early if the base compose file is not where it is expected.
ssh "$MINIO_HOST" "test -f '$MINIO_COMPOSE_FILE'"

# Write the compose override that sets userns_mode: host for the minio
# service. The YAML is produced locally and streamed over ssh stdin, which
# keeps its indentation out of the nested remote quoting.
ssh "$MINIO_HOST" "cat > '$MINIO_OVERRIDE_FILE'" <<'EOF'
services:
  minio:
    userns_mode: host
EOF

ssh "$MINIO_HOST" "docker compose -f '$MINIO_COMPOSE_FILE' -f '$MINIO_OVERRIDE_FILE' up -d"

# Runs on the MinIO host: probe the liveness endpoint up to 30 times, 2 s
# apart; on exhaustion dump the container log tail to stderr and fail.
ssh "$MINIO_HOST" "for i in \$(seq 1 30); do curl -fsS --max-time 3 http://127.0.0.1:9000/minio/health/live >/dev/null && exit 0; sleep 2; done; docker logs --tail=80 minio >&2; exit 1"
echo "MINIO_188_HEALTHY endpoint=192.168.0.188:9000"

# Velero re-validates storage locations on its own schedule, so the phase can
# lag behind MinIO coming back. Poll instead of checking once, and emit a
# marker on failure rather than exiting silently.
bsl_phase=""
for ((bsl_attempt = 1; bsl_attempt <= BSL_WAIT_ATTEMPTS; bsl_attempt++)); do
  # A transient ssh/kubectl failure counts as "not available yet".
  bsl_phase="$(ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backupstoragelocations.velero.io default -o jsonpath='{.status.phase}'" || true)"
  if [[ "$bsl_phase" == "Available" ]]; then
    break
  fi
  if ((bsl_attempt < BSL_WAIT_ATTEMPTS)); then
    sleep "$BSL_WAIT_INTERVAL"
  fi
done
if [[ "$bsl_phase" != "Available" ]]; then
  echo "VELERO_BACKUP_STORAGE_LOCATION_UNAVAILABLE namespace=$VELERO_NAMESPACE phase=${bsl_phase:-unknown}" >&2
  exit 1
fi
echo "VELERO_BACKUP_STORAGE_LOCATION_AVAILABLE namespace=$VELERO_NAMESPACE"
# Optionally create a one-off Velero backup of $VELERO_TARGET_NAMESPACE and
# wait for it to reach a terminal phase.
#
# Optional variables (read only when CREATE_VELERO_BACKUP=true):
#   VELERO_BACKUP_WAIT_ATTEMPTS  sleeps between polls before giving up (default 60)
#   VELERO_BACKUP_WAIT_INTERVAL  seconds per sleep (default 5)
if [[ "$CREATE_VELERO_BACKUP" == "true" ]]; then
  backup_wait_attempts="${VELERO_BACKUP_WAIT_ATTEMPTS:-60}"
  backup_wait_interval="${VELERO_BACKUP_WAIT_INTERVAL:-5}"

  # The manifest is rendered locally and streamed to kubectl over ssh stdin,
  # so its YAML nesting does not depend on nested remote quoting.
  ssh "$K3S_HOST" "sudo -n k3s kubectl apply -f -" <<EOF
apiVersion: velero.io/v1
kind: Backup
metadata:
  name: $BACKUP_NAME
  namespace: $VELERO_NAMESPACE
  labels:
    awoooi.wooo.work/source: reboot-recovery
spec:
  includedNamespaces:
    - $VELERO_TARGET_NAMESPACE
  storageLocation: default
  ttl: 720h0m0s
EOF

  # Poll immediately, then once after every sleep; with the defaults the last
  # poll happens 300 s after the first.
  backup_completed=false
  phase=""
  for ((backup_attempt = 0; backup_attempt <= backup_wait_attempts; backup_attempt++)); do
    if ((backup_attempt > 0)); then
      sleep "$backup_wait_interval"
    fi
    # A transient ssh/kubectl failure is treated as "phase not known yet".
    phase="$(ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o jsonpath='{.status.phase}'" || true)"
    case "$phase" in
      Completed)
        echo "VELERO_BACKUP_COMPLETED name=$BACKUP_NAME"
        backup_completed=true
        break
        ;;
      # FailedValidation is terminal as well; fail fast instead of waiting
      # out the whole budget and reporting a timeout.
      Failed | PartiallyFailed | FailedValidation)
        ssh "$K3S_HOST" "sudo -n k3s kubectl -n '$VELERO_NAMESPACE' get backup '$BACKUP_NAME' -o yaml" >&2 || true
        echo "VELERO_BACKUP_FAILED name=$BACKUP_NAME phase=$phase" >&2
        exit 1
        ;;
    esac
  done

  if [[ "$backup_completed" != "true" ]]; then
    echo "VELERO_BACKUP_TIMEOUT name=$BACKUP_NAME phase=${phase:-unknown}" >&2
    exit 1
  fi
fi
# When REFRESH_BACKUP_HEALTH is exactly "true", run the backup-health textfile
# exporter script on the backup-health host over ssh.
if [[ "$REFRESH_BACKUP_HEALTH" == "true" ]]; then
  # Remote command line: an explicit PATH and two settings are supplied as
  # environment assignments in front of the script path.
  refresh_remote_cmd="PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin AIOPS_HOST_LABEL=110 NODE_EXPORTER_TEXTFILE_DIR=/home/wooo/node_exporter_textfiles /home/wooo/scripts/backup-health-textfile-exporter.py"
  ssh "$BACKUP_HEALTH_HOST" "$refresh_remote_cmd"
  echo "BACKUP_HEALTH_TEXTFILE_REFRESHED host=110"
fi