Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 1m44s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
755 lines
29 KiB
Bash
Executable File
755 lines
29 KiB
Bash
Executable File
#!/usr/bin/env bash
set -uo pipefail

# Single-entry, read-only check meant to be run after a reboot. Deep checks are
# delegated to the existing recovery scripts; this wrapper does not restart,
# patch, delete, import, reload, or write runtime state.

# Directory holding this script, and the directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Tunables: an environment value wins, otherwise the default is assigned.
: "${SSH_CONNECT_TIMEOUT:=6}"
: "${SSH_COMMAND_TIMEOUT_SECONDS:=25}"
: "${ROUTE_RETRY_ATTEMPTS:=3}"
: "${ROUTE_RETRY_DELAY_SECONDS:=2}"
: "${STOCK_FRESHNESS_RETRY_ATTEMPTS:=6}"
: "${STOCK_FRESHNESS_RETRY_DELAY_SECONDS:=5}"

# Section toggles (1 = run); the --skip-* flags set them to 0.
RUN_COLD_START=1 RUN_MOMO=1 RUN_STOCK=1
RUN_BACKUP=1 RUN_ROUTES=1 RUN_CPU=1
NO_COLOR_FLAG=0

# Cold-start blockers are parked here until the public route smoke has run.
COLD_START_PENDING_BLOCKERS=0
COLD_START_BLOCKED_SUMMARY=""
COLD_START_BLOCKED_LINES=""

# Route smoke outcome, consulted by service_route_recovered.
ROUTE_SMOKE_BLOCKED=0
AWOOOI_API_ROUTE_OK=0

# StockPlatform end-of-day window classification.
STOCK_EOD_WINDOW_PENDING=0
STOCK_EOD_CLASSIFICATION="not_evaluated"
STOCK_EOD_NEXT_ACTION="not_evaluated"
STOCK_EOD_FIRST_FULL_WINDOW_END_LOCAL="19:15"

# Result counters; the three *_WARN_COUNT values split WARN_COUNT by category.
PASS_COUNT=0 WARN_COUNT=0 BLOCKED_COUNT=0
SERVICE_WARN_COUNT=0 BOUNDARY_WARN_COUNT=0 EVIDENCE_WARN_COUNT=0
|
|
|
|
# LAN hosts probed with ping and a TCP/22 check.
HOSTS=()
HOSTS+=("192.168.0.110")
HOSTS+=("192.168.0.120")
HOSTS+=("192.168.0.121")
HOSTS+=("192.168.0.188")
|
|
|
|
# Public URLs curled by the route smoke, in probe order.
ROUTES=()
ROUTES+=(
  "https://awoooi.wooo.work/"
  "https://awoooi.wooo.work/api/v1/health"
  "https://awoooi.wooo.work/zh-TW/iwooos"
)
ROUTES+=(
  "https://vibework.wooo.work/"
  "https://awooogo.wooo.work/"
  "https://2026fifa.wooo.work/"
  "https://agent.wooo.work/"
)
ROUTES+=(
  "https://mo.wooo.work/"
  "https://mo.wooo.work/health"
)
ROUTES+=(
  "https://stock.wooo.work/"
  "https://stock.wooo.work/healthz"
  "https://stock.wooo.work/api/healthz"
)
ROUTES+=(
  "https://bitan.wooo.work/"
  "https://tsenyang.com/"
  "https://www.tsenyang.com/"
  "https://vtuber.wooo.work/"
)
ROUTES+=(
  "https://gitea.wooo.work/"
  "https://harbor.wooo.work/"
  "https://registry.wooo.work/"
  "https://sentry.wooo.work/"
  "https://signoz.wooo.work/"
  "https://langfuse.wooo.work/"
  "https://aiops.wooo.work/"
)
|
|
|
|
# Print the command-line help to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally.
# SSH_CONNECT_TIMEOUT is listed because the script reads it from the
# environment (default 6) alongside the other tunables.
usage() {
  cat <<'USAGE'
Usage: post-start-quick-check.sh [options]

Read-only post-reboot quick check for 110 / 120 / 121 / 188.

Options:
--skip-cold-start Do not run full-stack-cold-start-check.sh.
--skip-momo Do not run momo-drive-token-source-recovery-preflight.sh.
--skip-stock Do not query StockPlatform data freshness.
--skip-backup Do not run /backup/scripts/backup-status.sh on 110.
--skip-routes Do not curl public route smoke targets.
--skip-cpu Do not read 110 CPU / process summary.
--no-color Disable ANSI color.
-h, --help Show this help.

Environment:
SSH_CONNECT_TIMEOUT SSH connect timeout in seconds. Default: 6.
SSH_COMMAND_TIMEOUT_SECONDS Per remote SSH command timeout. Default: 25.
ROUTE_RETRY_ATTEMPTS Public route attempts before blocking. Default: 3.
ROUTE_RETRY_DELAY_SECONDS Delay between failed public route attempts. Default: 2.
STOCK_FRESHNESS_RETRY_ATTEMPTS Stock freshness attempts before blocking. Default: 6.
STOCK_FRESHNESS_RETRY_DELAY_SECONDS Delay between failed Stock freshness attempts. Default: 5.

Exit codes:
0 = no service blockers. Boundary / evidence warnings may still be present.
1 = service warnings only.
2 = service blockers observed.

This script never reads token content and never writes runtime state.
USAGE
}
|
|
|
|
# Consume the command-line flags one at a time. Each --skip-* flag clears the
# matching RUN_* toggle; an unrecognized argument prints the help to stderr
# and exits with status 2.
until [[ $# -eq 0 ]]; do
  case "$1" in
    --skip-cold-start) RUN_COLD_START=0 ;;
    --skip-momo) RUN_MOMO=0 ;;
    --skip-stock) RUN_STOCK=0 ;;
    --skip-backup) RUN_BACKUP=0 ;;
    --skip-routes) RUN_ROUTES=0 ;;
    --skip-cpu) RUN_CPU=0 ;;
    --no-color) NO_COLOR_FLAG=1 ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      printf 'Unknown argument: %s\n' "$1" >&2
      usage >&2
      exit 2
      ;;
  esac
  shift
done
|
|
|
|
# Start with colors disabled, then switch the ANSI escapes on only when
# neither the NO_COLOR environment variable nor --no-color asks for plain text.
RED=""
GREEN=""
YELLOW=""
BLUE=""
NC=""
if [[ -z "${NO_COLOR:-}" && "$NO_COLOR_FLAG" -ne 1 ]]; then
  RED=$'\033[0;31m'
  GREEN=$'\033[0;32m'
  YELLOW=$'\033[1;33m'
  BLUE=$'\033[0;34m'
  NC=$'\033[0m'
fi
|
|
|
|
# Print a section banner for title $1, preceded by a blank line.
section() {
  local title="$1"
  printf '\n%s=== %s ===%s\n' "$BLUE" "$title" "$NC"
}
|
|
|
|
# Record one passing check and print it; all arguments form the message.
ok() {
  local message="$*"
  ((PASS_COUNT += 1))
  printf '%sOK%s %s\n' "$GREEN" "$NC" "$message"
}
|
|
|
|
# Record one warning and print it; all arguments form the message.
warn() {
  local message="$*"
  ((WARN_COUNT += 1))
  printf '%sWARN%s %s\n' "$YELLOW" "$NC" "$message"
}
|
|
|
|
# Count a service-category warning, then hand the message to warn.
service_warn() {
  ((SERVICE_WARN_COUNT += 1))
  warn "$@"
}
|
|
|
|
# Count a boundary-category warning, then hand the message to warn.
boundary_warn() {
  ((BOUNDARY_WARN_COUNT += 1))
  warn "$@"
}
|
|
|
|
# Count an evidence-category warning, then hand the message to warn.
evidence_warn() {
  ((EVIDENCE_WARN_COUNT += 1))
  warn "$@"
}
|
|
|
|
# Record one blocker and print it; all arguments form the message.
blocked() {
  local message="$*"
  ((BLOCKED_COUNT += 1))
  printf '%sBLOCKED%s %s\n' "$RED" "$NC" "$message"
}
|
|
|
|
# Print the IPv4 addresses of this machine, one per line, gathered from
# hostname -I, ip and ifconfig. Tools that are missing or fail contribute
# nothing; blank lines are dropped and duplicates are not removed.
local_ip_list() {
  {
    hostname -I 2>/dev/null \
      | tr ' ' '\n' \
      || true
    ip -o -4 addr show 2>/dev/null \
      | awk '{split($4,a,"/"); print a[1]}' \
      || true
    ifconfig 2>/dev/null \
      | awk '$1 == "inet" {print $2}' \
      || true
  } \
    | awk 'NF'
}
|
|
|
|
# Succeed when target $1 ("user@host" or a bare host) names this machine:
# either a loopback name, or an address printed by local_ip_list.
is_local_target() {
  local target="$1"
  local host="${target##*@}"
  local ips

  case "$host" in
    127.0.0.1 | localhost) return 0 ;;
  esac

  ips="$(local_ip_list)"
  # Fixed-string, whole-line match so 10.0.0.1 never matches 10.0.0.10.
  grep -Fxq "$host" <<<"$ips"
}
|
|
|
|
# Run command string $2 on host $1 ("user@host") and pass its output through.
# When the target is this machine the command runs in a local login shell
# without ssh. Otherwise it runs over non-interactive, public-key-only ssh and
# is wrapped in `timeout` on the remote side when that tool exists there.
# Returns the exit status of the command (or of ssh).
ssh_read() {
  local user_host="$1"
  local command="$2"
  local quoted_command=""
  local remote_line
  local -a ssh_opts

  if is_local_target "$user_host"; then
    bash -lc "$command"
    return
  fi

  ssh_opts=(
    -o BatchMode=yes
    -o "ConnectTimeout=$SSH_CONNECT_TIMEOUT"
    -o ConnectionAttempts=1
    -o ServerAliveInterval=5
    -o ServerAliveCountMax=1
    -o PreferredAuthentications=publickey
    -o NumberOfPasswordPrompts=0
  )

  # %q makes the command survive the extra word-splitting done by the remote shell.
  printf -v quoted_command '%q' "$command"
  remote_line="if command -v timeout >/dev/null 2>&1; then timeout ${SSH_COMMAND_TIMEOUT_SECONDS}s bash -lc ${quoted_command}; else bash -lc ${quoted_command}; fi"

  ssh "${ssh_opts[@]}" "$user_host" "$remote_line"
}
|
|
|
|
# Succeed only when the route smoke ran, no route ended blocked, and the
# AWOOOI API health route answered successfully.
service_route_recovered() {
  [[ "$RUN_ROUTES" -eq 1 ]] || return 1
  [[ "$ROUTE_SMOKE_BLOCKED" -eq 0 ]] || return 1
  [[ "$AWOOOI_API_ROUTE_OK" -eq 1 ]] || return 1
  return 0
}
|
|
|
|
# Report a capacity/runner finding. It is a blocker unless the public routes
# have recovered, in which case it is downgraded to an evidence warning.
capacity_or_runner_issue() {
  if ! service_route_recovered; then
    blocked "$@"
    return
  fi
  evidence_warn "$@"
}
|
|
|
|
# Run a command with stdout and stderr captured, then replay the capture.
# Arguments: $1 - label reported through ok on success
#            $2.. - the command and its arguments
# Outputs:   the ok line (success only) followed by the captured output
# Returns:   the command's own exit status
run_and_capture() {
  local label="$1"
  shift
  local tmp
  local rc=0
  tmp="$(mktemp -t post-start-quick-check.XXXXXX)" || return
  # Capture the status right at the command: reading $? after a finished
  # `if ... fi` yields 0 when no branch ran, which would hide every failure.
  "$@" >"$tmp" 2>&1 || rc=$?
  if [[ "$rc" -eq 0 ]]; then
    ok "$label"
  fi
  cat "$tmp"
  rm -f "$tmp"
  return "$rc"
}
|
|
|
|
# Reachability: one ICMP echo and one TCP/22 connect attempt per host.
# Each failure is reported as a blocker.
section "主機 / SSH"
for host in "${HOSTS[@]}"; do
  if ! ping -c 1 -W 1 "$host" >/dev/null 2>&1; then
    blocked "PING_FAIL $host"
  else
    ok "PING_OK $host"
  fi

  if ! nc -z -w 2 "$host" 22 >/dev/null 2>&1; then
    blocked "SSH_PORT_FAIL $host"
  else
    ok "SSH_PORT_OK $host"
  fi
done
|
|
|
|
# Cold-start scorecard: run the delegated script once, replay its output and
# read the last "PASS= WARN= BLOCKED=" line. Blockers are not counted here;
# they are parked in the COLD_START_* variables for classification after the
# public route smoke.
if [[ "$RUN_COLD_START" -eq 1 ]]; then
  section "Cold-start scorecard"
  cold_tmp="$(mktemp -t post-start-cold-start.XXXXXX)"
  cold_rc=0
  bash "$ROOT_DIR/scripts/reboot-recovery/full-stack-cold-start-check.sh" --monitor-read-only --no-color --watch --interval 1 --max-attempts 1 >"$cold_tmp" 2>&1 || cold_rc=$?
  cat "$cold_tmp"
  cold_summary="$(grep -E 'PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$cold_tmp" | tail -n 1 || true)"

  if [[ -z "$cold_summary" ]]; then
    # No summary line at all: judge by the exit status alone.
    if [[ "$cold_rc" -ne 0 ]]; then
      blocked "cold-start command returned $cold_rc without summary"
    else
      service_warn "cold-start summary not found"
    fi
  else
    ok "cold-start summary: $cold_summary"
    cold_warn=0
    cold_blocked=0
    [[ "$cold_summary" =~ WARN=([0-9]+) ]] && cold_warn="${BASH_REMATCH[1]}"
    [[ "$cold_summary" =~ BLOCKED=([0-9]+) ]] && cold_blocked="${BASH_REMATCH[1]}"

    if [[ "$cold_blocked" -gt 0 ]]; then
      COLD_START_PENDING_BLOCKERS="$cold_blocked"
      COLD_START_BLOCKED_SUMMARY="$cold_summary"
      COLD_START_BLOCKED_LINES="$(grep -E '^BLOCKED ' "$cold_tmp" || true)"
      evidence_warn "cold-start blockers pending wrapper retry classification: $cold_summary"
    elif [[ "$cold_warn" -gt 0 ]]; then
      evidence_warn "cold-start is warning-only; dedicated gates below classify service impact: $cold_summary"
    elif [[ "$cold_rc" -ne 0 ]]; then
      evidence_warn "cold-start exited $cold_rc but summary has no blockers: $cold_summary"
    else
      ok "cold-start command exited 0"
    fi
  fi
  rm -f "$cold_tmp"
fi
|
|
|
|
# MOMO freshness: run the delegated preflight, classify its exit status, then
# feed the captured log to the source-arrival gate and classify the status
# token the gate prints.
if [[ "$RUN_MOMO" -eq 1 ]]; then
  section "MOMO freshness"
  momo_tmp="$(mktemp -t post-start-momo.XXXXXX)"
  momo_rc=0
  bash "$ROOT_DIR/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh" >"$momo_tmp" 2>&1 || momo_rc=$?
  cat "$momo_tmp"
  momo_summary="$(grep -E 'MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT PASS=[0-9]+ WARN=[0-9]+ BLOCKED=[0-9]+' "$momo_tmp" | tail -n 1 || true)"

  if [[ "$momo_rc" -eq 0 ]]; then
    ok "MOMO preflight clean"
  elif [[ "$momo_rc" -eq 1 ]]; then
    # Status 1 is warning-only when the summary line shows zero blockers.
    if [[ "$momo_summary" == *BLOCKED=0* ]]; then
      evidence_warn "MOMO preflight has non-service warnings"
    else
      service_warn "MOMO preflight has warnings and no clean summary"
    fi
  else
    blocked "MOMO preflight has blockers"
  fi

  grep -E 'MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT|MOMO_HEALTH_VERSION|DB_MONTHLY_SYNC|DB_DAILY_FRESHNESS|DB_LATEST_DAILY_IMPORT_JOB' "$momo_tmp" || true

  source_gate_output="$("$ROOT_DIR/scripts/reboot-recovery/momo-source-arrival-gate.py" --preflight-log "$momo_tmp" 2>&1)"
  source_gate_rc=$?
  printf '%s\n' "$source_gate_output"

  # The first status token found, in this order, decides the classification.
  case "$source_gate_output" in
    *status=source_absent_no_newer_drive_waiting*)
      evidence_warn "MOMO source-arrival gate confirms no newer Drive source; waiting for legitimate source"
      ;;
    *status=blocked_source_absent_fail_closed*)
      evidence_warn "MOMO source-arrival gate confirms source absent fail-closed"
      ;;
    *status=source_arrived_ready_for_safe_import_preflight*)
      boundary_warn "MOMO source arrived; safe import preflight only, DB/Drive/runtime writes remain unauthorized"
      ;;
    *status=freshness_already_green_recheck_cold_start*)
      ok "MOMO source-arrival gate reports freshness green; rerun post-reboot summary before updating declaration"
      ;;
    *)
      if [[ "$source_gate_rc" -ne 0 ]]; then
        service_warn "MOMO source-arrival gate did not produce a known state rc=$source_gate_rc"
      else
        evidence_warn "MOMO source-arrival gate produced a non-terminal state"
      fi
      ;;
  esac
  rm -f "$momo_tmp"
fi
|
|
|
|
# StockPlatform freshness: poll the freshness endpoint until it answers 2xx or
# the attempts run out, print the payload fields, and for a non-ok status
# classify it against the end-of-day ingestion windows (Asia/Taipei time).
if [[ "$RUN_STOCK" -eq 1 ]]; then
  section "StockPlatform freshness"
  stock_tmp="$(mktemp -t post-start-stock.XXXXXX)"
  stock_code=""

  for ((stock_attempt = 1; stock_attempt <= STOCK_FRESHNESS_RETRY_ATTEMPTS; stock_attempt++)); do
    stock_code="$(curl -k -sS -o "$stock_tmp" -w '%{http_code}' --max-time 12 "https://stock.wooo.work/api/v1/system/freshness" 2>/dev/null || true)"
    if [[ "$stock_code" == 2* ]]; then
      if ((stock_attempt > 1)); then
        evidence_warn "StockPlatform freshness recovered after attempt=$stock_attempt"
      fi
      break
    fi
    # Pause between attempts, but not after the last one.
    if ((stock_attempt < STOCK_FRESHNESS_RETRY_ATTEMPTS)); then
      sleep "$STOCK_FRESHNESS_RETRY_DELAY_SECONDS"
    fi
  done

  if [[ "$stock_code" == 2* ]]; then
    # Human-readable dump of the payload.
    python3 - "$stock_tmp" <<'PY'
import json
import sys

path = sys.argv[1]
with open(path, "r", encoding="utf-8") as fh:
    payload = json.load(fh)

print(f"STOCK_FRESHNESS_STATUS {payload.get('status')}")
print(f"STOCK_LATEST_TRADING_DATE {payload.get('latest_trading_date')}")
print("STOCK_BLOCKERS " + ",".join(payload.get("blockers") or []))
for source in payload.get("sources") or []:
    print(
        "STOCK_SOURCE "
        f"{source.get('source')}|{source.get('status')}|"
        f"{source.get('latest_date')}|{source.get('row_count')}"
    )
PY
    stock_status="$(python3 - "$stock_tmp" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    print(json.load(fh).get("status") or "")
PY
)"
    if [[ "$stock_status" != "ok" ]]; then
      stock_blockers="$(python3 - "$stock_tmp" <<'PY'
import json
import sys

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    print(",".join(json.load(fh).get("blockers") or []))
PY
)"
      stock_eod_context="$(python3 - "$stock_tmp" <<'PY'
import json
import sys
from datetime import datetime, time, timezone, timedelta

try:
    from zoneinfo import ZoneInfo
except Exception:  # pragma: no cover - old Python fallback
    ZoneInfo = None

with open(sys.argv[1], "r", encoding="utf-8") as fh:
    payload = json.load(fh)

tz = ZoneInfo("Asia/Taipei") if ZoneInfo else timezone(timedelta(hours=8))
now = datetime.now(tz)
today = now.date().isoformat()
latest = payload.get("latest_trading_date") or ""
status = payload.get("status") or "unknown"
blockers = payload.get("blockers") or []
first_full_window_end = time(19, 15)
final_retry_window_end = time(23, 35)

pending = 0
classification = "blocked_unknown"
next_action = "investigate_stockplatform_freshness"

if status == "ok":
    classification = "ok"
    next_action = "none"
elif latest == today and blockers and now.time() < first_full_window_end:
    pending = 1
    classification = "pending_first_eod_window"
    next_action = "wait_for_18_20_19_10_cron_then_recheck"
elif latest == today and blockers and now.time() < final_retry_window_end:
    classification = "after_first_eod_window_blocked"
    next_action = "inspect_ingestion_logs_and_wait_retry_windows"
elif latest == today and blockers:
    classification = "after_final_eod_window_blocked"
    next_action = "open_stockplatform_data_recovery_gate"
elif blockers:
    classification = "blocked_non_current_trading_day"
    next_action = "inspect_trading_calendar_and_ingestion_logs"

print(f"STOCK_EOD_WINDOW_PENDING {pending}")
print(f"STOCK_EOD_CLASSIFICATION {classification}")
print(f"STOCK_EOD_NEXT_ACTION {next_action}")
print(f"STOCK_EOD_FIRST_FULL_WINDOW_END_LOCAL {first_full_window_end.strftime('%H:%M')}")
print(f"STOCK_EOD_FINAL_RETRY_WINDOW_END_LOCAL {final_retry_window_end.strftime('%H:%M')}")
print(f"STOCK_EOD_OBSERVED_AT_LOCAL {now.isoformat(timespec='seconds')}")
PY
)"
      printf '%s\n' "$stock_eod_context"

      # Pull the three fields back out of the "KEY value" lines; the last
      # occurrence of a key wins and a missing key leaves the value empty.
      STOCK_EOD_WINDOW_PENDING=""
      STOCK_EOD_CLASSIFICATION=""
      STOCK_EOD_NEXT_ACTION=""
      while read -r eod_key eod_value _; do
        case "$eod_key" in
          STOCK_EOD_WINDOW_PENDING) STOCK_EOD_WINDOW_PENDING="$eod_value" ;;
          STOCK_EOD_CLASSIFICATION) STOCK_EOD_CLASSIFICATION="$eod_value" ;;
          STOCK_EOD_NEXT_ACTION) STOCK_EOD_NEXT_ACTION="$eod_value" ;;
        esac
      done <<<"$stock_eod_context"

      case "$STOCK_EOD_WINDOW_PENDING" in
        1)
          evidence_warn "StockPlatform freshness pending scheduled EOD window: ${STOCK_EOD_CLASSIFICATION:-unknown}; next=${STOCK_EOD_NEXT_ACTION:-unknown}; blockers=${stock_blockers:-no_blocker_list}"
          ;;
        *)
          blocked "StockPlatform freshness is ${stock_status:-unknown}: ${stock_blockers:-no_blocker_list}; classification=${STOCK_EOD_CLASSIFICATION:-unknown}"
          ;;
      esac
    else
      printf '%s\n' \
        'STOCK_EOD_WINDOW_PENDING 0' \
        'STOCK_EOD_CLASSIFICATION ok' \
        'STOCK_EOD_NEXT_ACTION none'
      ok "StockPlatform freshness is ok"
    fi
  else
    blocked "StockPlatform freshness endpoint returned ${stock_code:-curl_failed} attempts=$STOCK_FRESHNESS_RETRY_ATTEMPTS"
    cat "$stock_tmp" || true
  fi
  rm -f "$stock_tmp"
fi
|
|
|
|
# Backup / escrow: read backup-status on 110, replay it, then look for the
# core-blocker and escrow-missing counters in the captured text.
if [[ "$RUN_BACKUP" -eq 1 ]]; then
  section "Backup / offsite / escrow"
  backup_tmp="$(mktemp -t post-start-backup.XXXXXX)"
  if ! ssh_read "wooo@192.168.0.110" '/backup/scripts/backup-status.sh --no-notify --no-refresh' >"$backup_tmp" 2>&1; then
    blocked "backup-status readback failed"
  else
    ok "backup-status readback succeeded"
  fi
  cat "$backup_tmp"

  if grep -Eq 'core_blockers=0|CORE_BLOCKERS[ =]0' "$backup_tmp"; then
    ok "backup core blockers are 0"
  elif ! grep -Eq 'core_blockers=[1-9]|CORE_BLOCKERS[ =][1-9]' "$backup_tmp"; then
    service_warn "backup core blocker summary not confirmed"
  else
    blocked "backup core blockers are non-zero"
  fi

  if grep -Eq 'escrow_missing=0|ESCROW_MISSING_COUNT[ =]0' "$backup_tmp"; then
    ok "credential escrow missing is 0"
  elif ! grep -Eq 'escrow_missing=[1-9]|ESCROW_MISSING_COUNT[ =][1-9]' "$backup_tmp"; then
    evidence_warn "credential escrow count not found"
  else
    boundary_warn "credential escrow still missing; DR_COMPLETE is forbidden"
    # List the missing escrow items; a failed listing is only evidence.
    escrow_status_tmp="$(mktemp -t post-start-escrow-status.XXXXXX)"
    if ! ssh_read "wooo@192.168.0.110" '/backup/scripts/mark-credential-escrow-verified.sh --status' >"$escrow_status_tmp" 2>&1; then
      evidence_warn "credential escrow missing item readback failed"
    else
      evidence_warn "credential escrow missing items follow"
    fi
    cat "$escrow_status_tmp"
    rm -f "$escrow_status_tmp"
  fi
  rm -f "$backup_tmp"
fi
|
|
|
|
# Public route smoke: curl every URL, retrying until a 2xx/3xx status or the
# attempt budget is spent. A route that only answers on a later attempt is an
# evidence warning; one that never answers is a blocker.
if [[ "$RUN_ROUTES" -eq 1 ]]; then
  section "Public routes"
  for url in "${ROUTES[@]}"; do
    code=""
    for ((attempt = 1; attempt <= ROUTE_RETRY_ATTEMPTS; attempt++)); do
      code="$(curl -k -sS -o /dev/null -w '%{http_code}' --max-time 12 "$url" 2>/dev/null || true)"
      [[ "$code" == [23]* ]] && break
      # Pause between attempts, but not after the last one.
      if ((attempt < ROUTE_RETRY_ATTEMPTS)); then
        sleep "$ROUTE_RETRY_DELAY_SECONDS"
      fi
    done

    if [[ "$code" != [23]* ]]; then
      ROUTE_SMOKE_BLOCKED=$((ROUTE_SMOKE_BLOCKED + 1))
      blocked "${code:-curl_failed} $url attempts=$ROUTE_RETRY_ATTEMPTS"
      continue
    fi

    # Only a 2xx from the API health route marks the AWOOOI API as reachable.
    if [[ "$url" == "https://awoooi.wooo.work/api/v1/health" && "$code" == 2* ]]; then
      AWOOOI_API_ROUTE_OK=1
    fi

    if ((attempt > 1)); then
      evidence_warn "$code $url recovered_after_attempt=$attempt"
    else
      ok "$code $url"
    fi
  done
fi
|
|
|
|
# Classify the cold-start blockers parked earlier, now that the route smoke
# result is known. They stay a blocker unless the routes recovered; when they
# did, the message depends on whether anything other than route/API lines
# remains. The original BLOCKED lines are replayed in every case.
if [[ "$COLD_START_PENDING_BLOCKERS" -gt 0 ]]; then
  non_route_cold_blockers="$(grep -Ev '^BLOCKED public route ' <<<"$COLD_START_BLOCKED_LINES" || true)"

  if ! service_route_recovered; then
    blocked "cold-start has blockers: $COLD_START_BLOCKED_SUMMARY"
  else
    # With the API route healthy, its "not reachable" line is no longer counted.
    non_route_cold_blockers="$(grep -Ev '^BLOCKED AWOOOI API not reachable$|^BLOCKED AWOOI API not reachable$' <<<"$non_route_cold_blockers" || true)"
    if [[ -n "$non_route_cold_blockers" ]]; then
      evidence_warn "cold-start non-route blockers retained as capacity/freshness evidence after public routes and AWOOOI API recovered: $COLD_START_BLOCKED_SUMMARY"
    else
      evidence_warn "cold-start route/API warmup blockers recovered under wrapper route retry: $COLD_START_BLOCKED_SUMMARY"
    fi
  fi
  printf '%s\n' "$COLD_START_BLOCKED_LINES"
fi
|
|
|
|
# 110 CPU / process attribution: capture uptime, vmstat and the top CPU
# consumers, replay them, and flag smoke-process candidates found by the awk
# classifier that sits next to this script.
if [[ "$RUN_CPU" -eq 1 ]]; then
  section "110 CPU / process attribution"
  cpu_tmp="$(mktemp -t post-start-cpu.XXXXXX)"
  if ! ssh_read "wooo@192.168.0.110" 'uptime; vmstat 1 5; ps -eo pid,ppid,pgid,stat,pcpu,pmem,comm,args --sort=-pcpu | head -25' >"$cpu_tmp" 2>&1; then
    evidence_warn "110 CPU/process readback failed"
  else
    ok "110 CPU/process readback succeeded"
  fi
  cat "$cpu_tmp"

  smoke_candidates="$(awk -f "$SCRIPT_DIR/post-start-smoke-process-classifier.awk" "$cpu_tmp" || true)"
  if [[ -n "$smoke_candidates" ]]; then
    printf '%s\n' "$smoke_candidates"
    evidence_warn "browser/smoke process is visible; classify orphan vs active parent before action"
  fi

  # CI/build/test tooling in the process list is reported as a passing note.
  if grep -Eiq 'gitea|actions|runner|npm|pnpm|pytest|pip-audit' "$cpu_tmp"; then
    ok "active CI/build/test load is visible"
  fi
  rm -f "$cpu_tmp"
fi
|
|
|
|
# 110 runner fail-closed guard, readback half. The probe below runs on 110 and
# prints marker lines (RUNNER_FAILCLOSED_UNIT, CD_LANE_CONTROLLED,
# CD_LANE_DRAIN_CONTROLLED, CD_LANE_ROOT_RESTORE_SOURCES, CD_LANE_GUARDRAILS_OK,
# RUNNER_DIRECT_PROCESS_COUNT, RUNNER_FAILCLOSED_BINARY[_ELF],
# RUNNER_PRESSURE_GATE_RC) that are evaluated after the capture.
section "110 runner fail-closed guard"
runner_tmp="$(mktemp -t post-start-runner.XXXXXX)"
runner_probe_script='
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
  load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
  unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
  active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
  mainpid=$(systemctl show "$u" -p MainPID --value 2>/dev/null || true)
  unit_ok=0
  if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
    unit_ok=1
  fi
  echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
done
cd_lane_load=$(systemctl show awoooi-cd-lane.service -p LoadState --value 2>/dev/null || true)
cd_lane_unitfile=$(systemctl show awoooi-cd-lane.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_active=$(systemctl show awoooi-cd-lane.service -p ActiveState --value 2>/dev/null || true)
cd_lane_mainpid=$(systemctl show awoooi-cd-lane.service -p MainPID --value 2>/dev/null || true)
cd_lane_execstart=$(systemctl show awoooi-cd-lane.service -p ExecStart --value 2>/dev/null || true)
cd_lane_sentinel=missing
[ -e /run/awoooi-cd-lane-enabled ] && cd_lane_sentinel=present
cd_lane_capacity_ok=0
cd_lane_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
  cd_lane_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
  && grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null \
  && ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane/config.yaml 2>/dev/null; then
  cd_lane_labels_ok=1
fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] \
  && [ "$cd_lane_sentinel" = "missing" ] \
  && [ "$cd_lane_binary_elf" = "0" ] \
  && [ "$cd_lane_process_count" = "0" ] \
  && { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
  cd_lane_ok=1
  cd_lane_mode=failclosed
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
cd_lane_drain_mainpid=$(systemctl show awoooi-cd-lane-drain.service -p MainPID --value 2>/dev/null || true)
cd_lane_drain_cpu_accounting=$(systemctl show awoooi-cd-lane-drain.service -p CPUAccounting --value 2>/dev/null || true)
cd_lane_drain_cpu_quota=$(systemctl show awoooi-cd-lane-drain.service -p CPUQuotaPerSecUSec --value 2>/dev/null || true)
cd_lane_drain_memory_accounting=$(systemctl show awoooi-cd-lane-drain.service -p MemoryAccounting --value 2>/dev/null || true)
cd_lane_drain_memory_max=$(systemctl show awoooi-cd-lane-drain.service -p MemoryMax --value 2>/dev/null || true)
cd_lane_drain_tasks_accounting=$(systemctl show awoooi-cd-lane-drain.service -p TasksAccounting --value 2>/dev/null || true)
cd_lane_drain_tasks_max=$(systemctl show awoooi-cd-lane-drain.service -p TasksMax --value 2>/dev/null || true)
cd_lane_drain_limits_ok=0
if [ "$cd_lane_drain_cpu_accounting" = "yes" ] \
  && [ -n "$cd_lane_drain_cpu_quota" ] && [ "$cd_lane_drain_cpu_quota" != "infinity" ] \
  && [ "$cd_lane_drain_memory_accounting" = "yes" ] \
  && [ -n "$cd_lane_drain_memory_max" ] && [ "$cd_lane_drain_memory_max" != "infinity" ] \
  && [ "$cd_lane_drain_tasks_accounting" = "yes" ] \
  && [ -n "$cd_lane_drain_tasks_max" ] && [ "$cd_lane_drain_tasks_max" != "infinity" ]; then
  cd_lane_drain_limits_ok=1
fi
cd_lane_drain_capacity_ok=0
cd_lane_drain_labels_ok=0
if grep -Eq "^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
  cd_lane_drain_capacity_ok=1
fi
if grep -q "awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
  && grep -q "awoooi-host:host" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null \
  && ! grep -Eq "^[[:space:]]+- \".*(ubuntu-latest|stockplatform|headless|playwright)" /home/wooo/awoooi-cd-lane-drain/config.yaml 2>/dev/null; then
  cd_lane_drain_labels_ok=1
fi
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_drain_ok=0
cd_lane_drain_mode=blocked
if [ "$cd_lane_drain_active" != "active" ] \
  && [ "$cd_lane_drain_binary_elf" = "0" ] \
  && [ "$cd_lane_drain_process_count" = "0" ] \
  && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
  cd_lane_drain_ok=1
  cd_lane_drain_mode=failclosed
elif [ "$cd_lane_drain_active" = "active" ] \
  && [ "$cd_lane_drain_capacity_ok" = "1" ] \
  && [ "$cd_lane_drain_labels_ok" = "1" ] \
  && [ "$cd_lane_drain_binary_elf" = "1" ] \
  && [ "$cd_lane_drain_limits_ok" = "1" ]; then
  cd_lane_drain_ok=1
  cd_lane_drain_mode=controlled_open
fi
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf limits=$cd_lane_drain_limits_ok process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
cd_lane_root_restore_left=unknown
if sudo -n true >/dev/null 2>&1; then
  cd_lane_root_restore_left=$(sudo -n find /root -maxdepth 1 -type d \( -name "awoooi-cd-lane-disabled-*" -o -name "awoooi-cd-lane-drain-disabled-*" \) -print 2>/dev/null | wc -l | tr -d " ")
fi
echo "CD_LANE_ROOT_RESTORE_SOURCES left=$cd_lane_root_restore_left"
cd_lane_guard_ok=0
if { [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; } && [ "$cd_lane_root_restore_left" = "0" ]; then
  cd_lane_guard_ok=1
fi
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
  kind=$(file -b "$p" 2>/dev/null || echo missing)
  echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
  echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
done
HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1 HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=0 /usr/local/bin/awoooi-wait-host-web-build-pressure.sh
echo "RUNNER_PRESSURE_GATE_RC $?"
'
if ! ssh_read "wooo@192.168.0.110" "$runner_probe_script" >"$runner_tmp" 2>&1; then
  capacity_or_runner_issue "110 controlled runner readback failed"
else
  ok "110 controlled runner readback succeeded"
fi
|
|
# 110 runner fail-closed guard, evaluation half: replay the captured probe
# output and turn each marker line into a pass or a capacity/runner issue.
# Every check needs positive evidence in the capture. An empty or truncated
# readback therefore counts as an issue and never as a pass.
cat "$runner_tmp"

# Legacy units: at least one unit line must exist and all must end in ok=1.
if awk '$1 == "RUNNER_FAILCLOSED_UNIT" { seen = 1; if ($NF != "ok=1") bad = 1 } END { exit (bad || !seen) }' "$runner_tmp"; then
  ok "110 legacy direct/Gitea runner units are fail-closed"
else
  capacity_or_runner_issue "110 legacy direct/Gitea runner units are not fail-closed"
fi

if grep -q "CD_LANE_GUARDRAILS_OK 1" "$runner_tmp"; then
  ok "110 controlled cd-lane is safe-open/drained or fail-closed"
else
  capacity_or_runner_issue "110 controlled cd-lane guardrails incomplete"
fi

if grep -q "RUNNER_DIRECT_PROCESS_COUNT 0" "$runner_tmp"; then
  ok "110 legacy direct runner process count is zero"
else
  capacity_or_runner_issue "110 legacy direct runner process detected"
fi

# Binary paths: an ELF marker is an issue; a pass additionally requires that
# the per-binary lines were actually captured.
if grep -q "RUNNER_FAILCLOSED_BINARY_ELF" "$runner_tmp"; then
  capacity_or_runner_issue "110 runner fail-closed binary path restored to ELF"
elif grep -q "^RUNNER_FAILCLOSED_BINARY " "$runner_tmp"; then
  ok "110 runner binary paths are fail-closed stubs or missing"
else
  capacity_or_runner_issue "110 runner binary path readback missing"
fi

if grep -q "RUNNER_PRESSURE_GATE_RC 0" "$runner_tmp"; then
  ok "110 host pressure gate returned 0"
else
  capacity_or_runner_issue "110 host pressure gate is blocking"
fi
rm -f "$runner_tmp"
|
|
|
|
# Summary: print the counters, then choose the RESULT label and exit status
# from the first condition that applies, in priority order. Blockers exit 2,
# service warnings exit 1, everything else exits 0.
section "總結"
printf 'POST_START_QUICK_CHECK PASS=%s WARN=%s BLOCKED=%s\n' "$PASS_COUNT" "$WARN_COUNT" "$BLOCKED_COUNT"
printf 'POST_START_QUICK_CHECK_WARNINGS SERVICE=%s BOUNDARY=%s EVIDENCE=%s\n' "$SERVICE_WARN_COUNT" "$BOUNDARY_WARN_COUNT" "$EVIDENCE_WARN_COUNT"

final_rc=0
if [[ "$BLOCKED_COUNT" -gt 0 ]]; then
  final_result="BLOCKED"
  final_rc=2
elif [[ "$SERVICE_WARN_COUNT" -gt 0 ]]; then
  final_result="DEGRADED"
  final_rc=1
elif [[ "$STOCK_EOD_WINDOW_PENDING" == "1" ]]; then
  final_result="PRODUCT_DATA_PENDING_EOD_WINDOW"
elif [[ "$BOUNDARY_WARN_COUNT" -gt 0 ]]; then
  final_result="FULL_STACK_GREEN_DR_ESCROW_BLOCKED"
elif [[ "$EVIDENCE_WARN_COUNT" -gt 0 ]]; then
  final_result="GREEN_WITH_EVIDENCE_WARNINGS"
else
  final_result="GREEN"
fi

printf 'RESULT=%s\n' "$final_result"
exit "$final_rc"
|