diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index b5ebf1deb..6660bbfd8 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,32 @@ +## 2026-06-30 — 17:57 P0-006 全主機重啟實測與受控恢復 + +**實測結論**: +- 2026-06-30 17:16:40 CST 起觀測 110 / 120 / 121 / 188 全主機重啟,10 分鐘期限 17:26:40;期限後 Gitea / Stock edge / AWOOI VIP / public AWOOI / public Stock / 188 Ollama 外部讀回仍未全部恢復,`10_minute_auto_recovery=false`。 +- 主阻塞不是單一頁面噪音:110 Docker 因多筆 corrupt container metadata 卡在 startup,導致 registry `192.168.0.110:5000`、Harbor / registry public route、AWOOI image pull、Stock edge 與 110 SSH readback 連鎖失敗。 + +**已執行的 controlled recovery**: +- 110 Docker corrupt metadata 只做 quarantine,不刪除:receipt `/tmp/awoooi-docker-metadata-quarantine-20260630-173012.txt`,9 個壞 metadata container dir 移至 `/var/lib/docker/containers/.awoooi-corrupt-metadata-quarantine-20260630-173012`;Docker 於 17:37:12 回 `active`,Gitea / Prometheus / Alertmanager 回通。 +- AWOOI `awoooi-prod` 因 registry `:5000` refused 進入 `ImagePullBackOff`;已驗證 120 有 `7890778b830bd0813a465931bba54feec799eeda` API/Web image cache、121 無 cache,受控 patch API/Web/Worker/Canary:`imagePullPolicy=IfNotPresent`、`nodeSelector=kubernetes.io/hostname=mon`,並暫停 API/Web/Worker template topology spread,讓 120 cache fallback 先恢復服務。 +- AWOOI runtime readback:`awoooi-api=2/2`、`awoooi-web=2/2`、`awoooi-worker=1/1`、`awoooi-auto-repair-canary=1/1`,`bad_pods=0`;VIP 內部讀回 API 200 / Web 307,public `https://awoooi.wooo.work/api/v1/health` 200、首頁 200。 +- StockPlatform public edge / API 已恢復:`https://stock.wooo.work/healthz=200`、`/api/healthz=200`;但 freshness / ingestion 仍 `status=not_configured`,blocker `postgres_not_ready`。 + +**source 修正**: +- `full-stack-cold-start-check.sh` 新增 registry 外部 `/v2/` gate 與 K3s `IMAGE_PULL_BLOCKED` / `REGISTRY_PULL_REFUSED_EVENTS` blocker。 +- `full-stack-recovery-scorecard.sh` 新增 `CORE_REGISTRY_HTTPS_CODE`、`CORE_REGISTRY_HTTP_CODE`、`CORE_REGISTRY_READY`、`CORE_REGISTRY_BLOCKER`,registry `:5000` 未就緒時不允許 `CORE_READY`。 + +**目前 blocker / rollback**: +- 
`192.168.0.110:5000` registry 仍 refused,Harbor / registry / SigNoz public route 仍 502;110 SSH readback 仍 timeout,offsite evidence readback 仍 `ssh_control_channel_timeout`。 +- 120 keepalived 上有 VIP,但外部打 120/VIP NodePort refused;121 NodePort 對外 200。嘗試受控 failover 被 `sudo: a password is required` 擋住,未變更 keepalived。rollback / 正規化路徑:registry 恢復後移除 AWOOI cache fallback nodeSelector、恢復 topology spread、imagePullPolicy 回 `Always`。 +- 188 核心服務已活:Docker / Nginx / Ollama / PostgreSQL / Redis active,Ollama 本機 `/api/tags` 200;外部 `188:11434` refused 是 `127.0.0.1` 綁定行為,需依既有暴露策略判定。 + +**驗證**: +- `pytest -q scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py`:`4 passed`。 +- `bash -n scripts/reboot-recovery/full-stack-cold-start-check.sh scripts/reboot-recovery/full-stack-recovery-scorecard.sh`:通過。 +- `python3 ops/runner/guard-gitea-runner-pressure.py --root .`:`workflow_files=11 scheduled_workflows=3 auto_branch_events_on_110=0 generic_runner_labels=0`。 +- `SSH_COMMAND_TIMEOUT_SECONDS=8 bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color`:`PASS=64 WARN=6 BLOCKED=6`,核心 AWOOI workload / public routes 已綠,registry / Harbor / SigNoz / 110 readback 仍紅。 + +**邊界**:未使用 GitHub / `gh` / GitHub API,未讀 secret / token / `.env` / raw sessions / SQLite / auth,未重啟主機,未 node drain,未 firewall cutover,未 DB destructive operation。 + ## 2026-06-30 — 16:48 P0-006 110 control-channel / Stock API runtime blocker 收斂 **照主線處理的問題**: diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 98b972275..55da6b624 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -5,6 +5,8 @@ set -uo pipefail SSH_COMMAND_TIMEOUT_SECONDS="${SSH_COMMAND_TIMEOUT_SECONDS:-45}" +REGISTRY_HTTPS_URL="${REGISTRY_HTTPS_URL:-https://192.168.0.110:5000/v2/}" +REGISTRY_HTTP_URL="${REGISTRY_HTTP_URL:-http://192.168.0.110:5000/v2/}"
SSH_OPTS=( -o BatchMode=yes -o ConnectTimeout=6 @@ -198,6 +200,25 @@ probe_http_code() { echo "${code:-000}" } +probe_http_code_insecure() { + local url="$1" + local attempt code + for attempt in 1 2; do + code=$(curl -ks -o /dev/null -w "%{http_code}" --max-time 12 "$url" 2>/dev/null || true) + if [[ "$code" =~ ^[0-9]{3}$ ]] && [ "$code" != "000" ]; then + echo "$code" + return + fi + sleep 1 + done + echo "${code:-000}" +} + +registry_code_ready() { + local code="$1" + [ "$code" = "200" ] || [ "$code" = "401" ] +} + probe_tcp() { local host="$1" local port="$2" @@ -279,7 +300,17 @@ docker ps --format "DOCKER {{.Names}}\t{{.Status}}" | head -80 check_110() { log_section "P0-110-REGISTRY-OBSERVABILITY" - local out + local out registry_https_code registry_http_code + registry_https_code="$(probe_http_code_insecure "$REGISTRY_HTTPS_URL")" + registry_http_code="$(probe_http_code "$REGISTRY_HTTP_URL")" + echo "REGISTRY_EXTERNAL_HTTPS_CODE $registry_https_code $REGISTRY_HTTPS_URL" + echo "REGISTRY_EXTERNAL_HTTP_CODE $registry_http_code $REGISTRY_HTTP_URL" + if registry_code_ready "$registry_https_code" || registry_code_ready "$registry_http_code"; then + ok "110 registry external /v2 reachable" + else + fail "110 registry external /v2 not reachable" + fi + if ! 
out=$(host_cmd "wooo@192.168.0.110" ' echo "HOST $(hostname) $(uptime)" echo "MEM $(free -h | awk "/Mem:/ {print \$2,\$3,\$7}")" @@ -480,6 +511,27 @@ node_fs_events=$(kcmd get events -A --field-selector involvedObject.kind=Node -- | grep -Eiv "InvalidDiskCapacity|image filesystem" \ | grep -Eic "fsck|I/O error|read-only file system|Structure needs cleaning|orphan linked list|EXT4-fs.*error|XFS.*(corruption|metadata)|Remounting filesystem read-only" || true) echo "NODE_FS_ERROR_EVENTS ${node_fs_events:-0}" +image_pull_blocked=$(kcmd get pods -n awoooi-prod -o json 2>/dev/null | python3 -c "import json,sys +try: + d=json.load(sys.stdin) +except Exception: + d={\"items\": []} +blocked=0 +reasons={} +for pod in d.get(\"items\", []): + # Init containers pull images too; a pod stuck there reports its wait only under initContainerStatuses. + for status in (pod.get(\"status\", {}).get(\"initContainerStatuses\") or [])+(pod.get(\"status\", {}).get(\"containerStatuses\") or []): + waiting=(status.get(\"state\") or {}).get(\"waiting\") or {} + reason=waiting.get(\"reason\") or \"\" + if reason in {\"ImagePullBackOff\", \"ErrImagePull\"}: + blocked += 1 + reasons[reason]=reasons.get(reason, 0) + 1 +print(\"IMAGE_PULL_BLOCKED\", blocked) +print(\"IMAGE_PULL_REASONS\", \",\".join(f\"{k}:{v}\" for k,v in sorted(reasons.items())) or \"none\")" || true) +printf "%s\n" "$image_pull_blocked" +registry_pull_refused_events=$(kcmd get events -n awoooi-prod --sort-by=.lastTimestamp 2>/dev/null \ + | grep -Ec "Failed to pull image|ImagePullBackOff|ErrImagePull|192\\.168\\.0\\.110:5000.*connect: connection refused" || true) +echo "REGISTRY_PULL_REFUSED_EVENTS ${registry_pull_refused_events:-0}" ip addr show | grep 192.168.0.125 || true ' 2>&1); then fail "ssh 120 k3s read-only check" @@ -508,6 +560,8 @@ ip addr show | grep 192.168.0.125 || true else fail "K3s node storage condition or severe filesystem event present" fi + grep -q "IMAGE_PULL_BLOCKED 0" <<<"$out" && ok "K3s AWOOOI image pull is not blocked" || fail "K3s AWOOOI image pull blocked" + grep -q "REGISTRY_PULL_REFUSED_EVENTS 0" <<<"$out" && ok "K3s registry pull has no refused events" || fail
"K3s registry pull refused by 110:5000" grep -q "192.168.0.125" <<<"$out" && ok "VIP 192.168.0.125 present on 120" || warn "VIP not confirmed on 120" } diff --git a/scripts/reboot-recovery/full-stack-recovery-scorecard.sh b/scripts/reboot-recovery/full-stack-recovery-scorecard.sh index ebfb7b415..223548373 100755 --- a/scripts/reboot-recovery/full-stack-recovery-scorecard.sh +++ b/scripts/reboot-recovery/full-stack-recovery-scorecard.sh @@ -7,6 +7,8 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" REMOTE_110="${REMOTE_110:-wooo@192.168.0.110}" PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.0.110:9090}" ALERTMANAGER_URL="${ALERTMANAGER_URL:-http://192.168.0.110:9093}" +REGISTRY_HTTPS_URL="${REGISTRY_HTTPS_URL:-https://192.168.0.110:5000/v2/}" +REGISTRY_HTTP_URL="${REGISTRY_HTTP_URL:-http://192.168.0.110:5000/v2/}" SSH_BATCH_MODE="${SSH_BATCH_MODE:-yes}" SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" REMOTE_COMMAND_TIMEOUT_SECONDS="${REMOTE_COMMAND_TIMEOUT_SECONDS:-45}" @@ -106,6 +108,21 @@ except Exception: PY } +http_code() { + local url="$1" + curl -s -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true +} + +http_code_insecure() { + local url="$1" + curl -ks -o /dev/null -w "%{http_code}" --max-time 8 "$url" 2>/dev/null || true +} + +ready_http_code() { + local code="$1" + [ "$code" = "200" ] || [ "$code" = "401" ] +} + classify_readback_blocker() { local path="$1" if [ ! 
-s "$path" ]; then @@ -144,6 +161,18 @@ status_value CORE_COLD_START_WARN_GATES "$cold_warn" status_value CORE_COLD_START_BLOCKED_GATES "$cold_blocked" status_value CORE_COLD_START_FIRING_ALERTS "$cold_alerts" +registry_https_code="$(http_code_insecure "$REGISTRY_HTTPS_URL")" +registry_http_code="$(http_code "$REGISTRY_HTTP_URL")" +status_value CORE_REGISTRY_HTTPS_CODE "${registry_https_code:-000}" +status_value CORE_REGISTRY_HTTP_CODE "${registry_http_code:-000}" +if ready_http_code "${registry_https_code:-000}" || ready_http_code "${registry_http_code:-000}"; then + status_value CORE_REGISTRY_READY 1 + status_value CORE_REGISTRY_BLOCKER none +else + status_value CORE_REGISTRY_READY 0 + status_value CORE_REGISTRY_BLOCKER registry_5000_unreachable_or_not_ready +fi + cold_start_parity_log="/tmp/awoooi-scorecard-cold-start-parity.log" if bash "$ROOT_DIR/scripts/reboot-recovery/verify-cold-start-monitor-deploy.sh" >"$cold_start_parity_log" 2>&1; then status_value CORE_COLD_START_DEPLOY_PARITY 1 @@ -196,7 +225,8 @@ status_value NEXT_STEP "${next_step:-unknown}" if [ "$cold_green" = "1" ] \ && [ "${cold_warn%.*}" = "0" ] \ && [ "${cold_blocked%.*}" = "0" ] \ - && [ "${cold_alerts%.*}" = "0" ]; then + && [ "${cold_alerts%.*}" = "0" ] \ + && { ready_http_code "${registry_https_code:-000}" || ready_http_code "${registry_http_code:-000}"; }; then core_state="CORE_READY" else core_state="CORE_NOT_READY" diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index d5a80cc74..73bd71c4c 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -34,11 +34,29 @@ def test_recovery_scorecard_bounds_offsite_evidence_ssh() -> None: assert "offsite-escrow-evidence-report.sh --no-color" in text assert "classify_readback_blocker()" in text assert 
"CORE_COLD_START_DEPLOY_PARITY_BLOCKER" in text + assert "CORE_REGISTRY_HTTPS_CODE" in text + assert "CORE_REGISTRY_HTTP_CODE" in text + assert "CORE_REGISTRY_READY" in text + assert "CORE_REGISTRY_BLOCKER" in text + assert "registry_5000_unreachable_or_not_ready" in text assert "DR_OFFSITE_EVIDENCE_READBACK" in text assert "DR_OFFSITE_EVIDENCE_BLOCKER" in text assert "ssh_control_channel_timeout" in text +def test_full_stack_cold_start_check_tracks_registry_and_image_pull_blockers() -> None: + text = COLD_START_CHECK.read_text(encoding="utf-8") + + assert "REGISTRY_EXTERNAL_HTTPS_CODE" in text + assert "REGISTRY_EXTERNAL_HTTP_CODE" in text + assert "110 registry external /v2 not reachable" in text + assert "IMAGE_PULL_BLOCKED" in text + assert "IMAGE_PULL_REASONS" in text + assert "REGISTRY_PULL_REFUSED_EVENTS" in text + assert "K3s AWOOOI image pull blocked" in text + assert "K3s registry pull refused by 110:5000" in text + + def test_cold_start_deploy_parity_verifier_bounds_ssh_readback() -> None: text = VERIFY_DEPLOY.read_text(encoding="utf-8")