diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 8dfea3db..61db5502 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -51916,6 +51916,35 @@ production browser smoke: - 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 - 沒有重啟主機,沒有 Docker / Nginx / K3s / DB restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。 +## 2026-07-01 — 23:28 P0 110 sustained CPU pressure alert / controlled quota / alert-chain readback + +**完成內容**: +- 110 live pressure 重新讀回:`load5` 曾回到約 `8.91`、`awoooi_host_load5_per_core=0.8075`,Gitea 即時 `docker stats` 一度 `218.56%`,但既有 `HostLoadAverageSustainedHigh` 門檻是 `load5/core > 1.5 for 15m`,`DockerContainerCpuSustainedHigh` 也是 `>2 core for 10m` pending;因此先前沒有 CPU firing / Telegram 並不是沒有監控,而是門檻太晚且 auto-repair action 指到未部署路徑。 +- 已部署 `/home/wooo/scripts/host-sustained-load-controller.py`、`host-sustained-load-evidence.py`、`host-runaway-process-remediation.py` 到 110,備份 suffix `before-host-pressure-controller-20260701-232314`;controller live readback 可執行,且不讀 secret / raw session / runner registration。 +- `ops/monitoring/alerts-unified.yml` 新增 `Host110SustainedModeratePressure`:`load5/core > 0.75` 或 Gitea / StockPlatform 關鍵容器 CPU `>2.0 core` 持續 1 分鐘即 warning,auto-repair action 指向 110 實際 controller 路徑。 +- 將 Gitea container runtime CPU quota 從 `3` core 收斂到 `2` core:`docker update --cpus=2 gitea`;rollback 為 `docker update --cpus=3 gitea`。post-check:`nanocpus=2000000000`、memory 仍 `3GiB`、Gitea API `/api/v1/version` 回 `1.25.5`,無容器重啟。 +- 修正備份噪音:`BackupAggregateRunFailed` 不再因 `backup_all` 舊 aggregate failed_count firing,改成只看 component job failed count;live `backup-status.sh --no-notify` 已回 `每日備份心跳正常`、`component_failed=0`、`core_blockers=0`、`escrow_missing=0`。 +- Alertmanager / webhook readback:Alertmanager 仍有 5 個非 CPU active warning;路由預設 `awoooi-webhook`,`telegram-direct` 只給 alert-chain 自身異常。110 到 VIP / 120 / 121 `/api/v1/webhooks/alertmanager` synthetic no-secret smoke 均 HTTP 200,回 `告警已排入背景分析`;`/api/v1/telegram/health` 回 `configured`。 + +**live readback**: +- Prometheus rule readback:`Host110SustainedModeratePressure=inactive`、`DockerContainerCpuSustainedHigh=inactive`、`BackupAggregateRunFailed=inactive`。 +- Node exporter readback:`awoooi_host_load5_per_core{host="110"} 0.536667`、`node_load5 6.52`、`docker_container_cpu_cores{container_name="gitea",host="110"} 1.4917`、`docker_container_cpu_limit_cores{container_name="gitea",host="110"} 2`。 +- Alertmanager active alerts after fix:`DockerContainerMissingResourceLimit` on 188、`HostDiskUsageHigh` on 110/188、`HostOutOfDiskSpace` on 110/188;CPU / backup aggregate alert no longer firing. +- Full-stack cold-start after fix:`PASS=96 WARN=0 BLOCKED=0`,Result `GREEN`;110 registry / Gitea / Harbor / Prometheus / Alertmanager OK,runner fail-closed OK,110 docker/systemd/storage/backup textfiles fresh,public routes expected 2xx/3xx,backup aggregate failed_count 僅列 INFO、不再形成 blocker。 + +**驗證結果**: +- `python3.11 -m pytest scripts/ops/tests/test_host_pressure_alert_contract.py scripts/ops/tests/test_host_runaway_process_exporter.py -q`:`22 passed`。 +- `python3.11 scripts/ops/backup-alert-label-contract-check.py`:`BACKUP_ALERT_LABEL_CONTRACT_OK`。 +- `python3.11 -m pytest scripts/backup/tests/test_backup_status_contract.py scripts/ops/tests/test_backup_health_textfile_exporter.py scripts/reboot-recovery/tests/test_dr_escrow_evidence_checklist.py -q`:`11 passed`。 +- `bash scripts/ops/deploy-alerts.sh`:Prometheus reload 成功,載入 `159` 條規則,關鍵 alert-chain / SLO rules 全部存在。 +- `SSH_CONNECT_TIMEOUT=8 bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color`:`PASS=96 WARN=0 BLOCKED=0`。 +- `git diff --check`:通過。 + +**仍維持**: +- 沒有讀 secret / token / `.env` / raw sessions / SQLite / auth;沒有讀 `.runner` 內容。 +- 沒有使用 GitHub / gh / GitHub API / GitHub Actions。 +- 沒有重啟主機,沒有 Docker daemon / Nginx / K3s / DB / firewall restart,沒有 workflow_dispatch,沒有 DROP / TRUNCATE / restore / prune。 + ## 2026-07-01 — 16:44 Gitea CD #4265 tests failure classifier sync **完成內容**: diff --git a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json index 1967e949..a1a6a34b 100644 --- a/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json +++ b/docs/operations/host-cpu-pressure-drain-readback-2026-07-01.snapshot.json @@ -1,7 +1,7 @@ { "schema_version": "awoooi_host_cpu_pressure_drain_readback_v1", - "generated_at": "2026-07-01T08:59:00+08:00", - "status": "partial_188_cpu_recovered_with_temporary_db_circuit_breaker_110_control_path_blocked", + "generated_at": "2026-07-01T23:28:00+08:00", + "status": "host_188_cpu_recovered_110_cpu_quota_and_alerting_recovered", "scope": { "hosts": ["188", "110"], "incident_family": "post_reboot_host_cpu_pressure", @@ -51,6 +51,14 @@ "reason": "truth-chain shifted CPU pressure from conversation_event to automation_operation_log after the first circuit breaker", "rollback": "DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_output_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_input_text_trgm; DROP INDEX CONCURRENTLY IF EXISTS idx_automation_operation_log_incident_text;", "post_apply_readback": "all three indexes read back indisvalid=true and indisready=true" + }, + { + "host": "110", + "target_selector": "gitea Docker container cgroup CPU quota", + "action": "docker update --cpus=2 gitea", + "reason": "Gitea repeatedly exceeded the 2-core sustained CPU warning threshold and was allowed to consume up to 3 cores on the 110 control host", + "rollback": "docker update --cpus=3 gitea", + "post_apply_readback": "docker inspect reports nanocpus=2000000000; Gitea /api/v1/version returns 1.25.5; docker_container_cpu_limit_cores{container_name=\"gitea\",host=\"110\"}=2" } ], "source_fixes": [ @@ -78,6 +86,16 @@ "path": "apps/api/migrations/automation_operation_log_truth_chain_hot_path_indexes_2026-07-01.sql", "change": "add automation_operation_log expression/trigram indexes for truth-chain incident/input/output lookups", "evidence": "live verifier read back incident_text, input_text_trgm, and output_text_trgm indexes as valid and ready" + }, + { + "path": "ops/monitoring/alerts-unified.yml", + "change": "add Host110SustainedModeratePressure and point sustained-load auto_repair_action to the deployed /home/wooo/scripts controller path", + "evidence": "Prometheus rule readback shows Host110SustainedModeratePressure loaded; after quota apply it is inactive because load5/core and Gitea CPU are below threshold" + }, + { + "path": "scripts/ops/backup-alert-label-contract-check.py", + "change": "make BackupAggregateRunFailed ignore aggregate-only backup_all noise and require component job failed-count evidence", + "evidence": "BackupAggregateRunFailed is inactive while backup_all aggregate failed_count remains 5 and component_failed=0" } ], "readback": { @@ -113,24 +131,25 @@ "source_split_live_on_120": false } }, - "host_110": { - "signals": { - "load5_initial": "39.14", - "load5_latest": "14.57", - "load5_per_core_latest": "1.2275", - "node_procs_running_latest": "366", - "gitea_container_cpu_cores": "3.4019", - "ssh_control_path": "timeout", - "systemd_dbus_symptom": "systemd-logind pending replies exhausted; systemctl list/show timeout", - "systemd_units_exporter_current_live_state": "old exporter still emits 10s systemctl timeout labels" - }, - "not_completed_live": [ - "could not apply pkill/exporter drain on 110 because SSH command sessions timeout", - "scp and stdin-over-ssh attempts to place the new exporter file timed out", - "latest short SSH readback still timed out with server not responding", - "could not restart or reexec systemd/logind because that would require a control path that is currently unavailable" - ] - } + "host_110": { + "signals": { + "load5_initial": "39.14", + "load5_latest": "6.52", + "load5_per_core_latest": "0.536667", + "node_procs_running_latest": "not_high_in_latest_readback", + "gitea_container_cpu_cores_before_quota": "2.1856", + "gitea_container_cpu_cores_after_quota_textfile": "1.4917", + "gitea_container_cpu_limit_cores": "2", + "ssh_control_path": "available", + "alert_rules": "Host110SustainedModeratePressure loaded; DockerContainerCpuSustainedHigh inactive; BackupAggregateRunFailed inactive", + "alert_chain": "110 to VIP/120/121 /api/v1/webhooks/alertmanager synthetic no-secret smoke returned HTTP 200", + "full_stack_cold_start_after_fix": "PASS=96 WARN=0 BLOCKED=0 Result=GREEN" + }, + "remaining_warnings": [ + "Alertmanager still has non-CPU disk/resource warnings firing: HostDiskUsageHigh, HostOutOfDiskSpace, DockerContainerMissingResourceLimit", + "Telegram visible delivery depends on AWOOOI API background analysis / TelegramGateway; webhook intake and Telegram health are configured" + ] + } }, "verification": { "py_compile": "passed: scripts/ops/systemd-units-textfile-exporter.py and apps/api/src/services/awooop_truth_chain_service.py", @@ -139,14 +158,17 @@ "scripts/ops/tests/test_systemd_units_textfile_exporter.py: 2 passed", "scripts/ops/tests/test_systemd_units_textfile_exporter.py + ops/runner/test_cd_controlled_runtime_profile.py: 34 passed", "truth-chain source split + content-preview migration focused suite: 7 passed", - "conversation_event + automation_operation_log hot-path migration tests: 6 passed" + "conversation_event + automation_operation_log hot-path migration tests: 6 passed", + "scripts/ops/tests/test_host_pressure_alert_contract.py + scripts/ops/tests/test_host_runaway_process_exporter.py: 22 passed", + "backup alert label contract: BACKUP_ALERT_LABEL_CONTRACT_OK", + "full-stack cold-start check --monitor-read-only: PASS=96 WARN=0 BLOCKED=0" ], "diff_check": "passed" }, "next_actions": [ - "commit and push the truth-chain source split so 120 stops issuing the old OR query", "after deploy convergence, reset the temporary awoooi role max_parallel_workers_per_gather, statement_timeout, enable_seqscan, and connection limit overrides", - "apply the systemd exporter source fix to 110 once SSH/control path is available", + "continue disk capacity cleanup planning for 110/188 HostDiskUsageHigh without destructive prune", + "verify AWOOOI API TelegramGateway outbound status if owner still does not see webhook notifications", "continue 110 Gitea queue / awoooi-host controlled lane recovery without generic runner restore" ] } diff --git a/docs/runbooks/FULL-STACK-COLD-START-SOP.md b/docs/runbooks/FULL-STACK-COLD-START-SOP.md index 4746b6bc..daa714c5 100644 --- a/docs/runbooks/FULL-STACK-COLD-START-SOP.md +++ b/docs/runbooks/FULL-STACK-COLD-START-SOP.md @@ -86,6 +86,8 @@ v1.82 bounded summary rule:`post-start-quick-check.sh` 與 `188-host-hygiene-m 2026-07-01 14:05 追加 controller / SLO stale-attribution guard:`host-sustained-load-controller.py` 與 `host-sustained-load-evidence.py` 必須把超過 `300s` 的 Docker stats 樣本標成 untrusted;`top_container_cpu` / `top_containers` 不得使用 stale `docker_container_cpu_cores`,舊值只能留在 `top_container_cpu_untrusted` / `top_containers_untrusted` 當證據。`reboot-auto-recovery-slo-scorecard.py` 若收到 `docker_stats.fresh=false` 或 `top_containers_fresh=false`,只能保留 `host_pressure_high_load` 與 `host_container_cpu_attribution_stale`,不得產生 `host_110_gitea_cpu_pressure`。此時下一步固定為恢復 Docker stats textfile exporter 或收集 sanitized host pressure,且仍不得重啟 Docker / Nginx / K3s / DB / firewall、不得恢復 generic runner、不得用 stale Gitea CPU 樣本取消或 drain 任何工作。 +2026-07-01 23:28 追加 110 主動告警與 controlled quota 收斂:`HostLoadAverageSustainedHigh` 的 `load5/core > 1.5 for 15m` 只能當 critical 門檻,不足以回答「110 CPU 又持續偏高為何沒告警」。Prometheus 必須同時有 `Host110SustainedModeratePressure`,在 `awoooi_host_load5_per_core{host="110"} > 0.75` 或 Gitea / StockPlatform 關鍵容器 `docker_container_cpu_cores > 2.0` 持續 1 分鐘時告警,auto-repair action 必須指向 110 已部署的 `/home/wooo/scripts/host-sustained-load-controller.py --load5-per-core-threshold 0.75`。Gitea runtime CPU quota 已以 `docker update --cpus=2 gitea` 收斂;rollback 是 `docker update --cpus=3 gitea`,不可重啟 Docker daemon 或 Gitea container。若 `BackupAggregateRunFailed` 只因 `exported_job="backup_all"` 舊 aggregate failed_count firing,而 component jobs / `backup-status.sh` 已 green,必須視為噪音並用 component failed count 判斷,不得讓它干擾 cold-start / Telegram 主線。 + 2026-06-25 20:25 orphan Chrome cleanup / scorecard refresh supersedes the 20:11 CPU wording. 110 high CPU was traced to two `stockplatform-review-bulk-ux` Chrome process groups `2756503` and `2829627` with root Chrome process `PPID=1`, elapsed about 5h, no active parent smoke, and sustained GPU/renderer CPU. With user approval, only those two process groups received targeted `SIGTERM` at 20:24. Post-check showed no remaining PGID entries; `vmstat` showed CPU idle around `85-90%`, `si/so=0`, and no immediate swap thrash. No Docker/systemd/Nginx/firewall/K8s action, CI cancellation, manual data ingestion, manual DB write, Wazuh/SOC runtime change, or secret read was performed. The 20:25 full post-start wrapper then returned cold-start `PASS=89 WARN=0 BLOCKED=0`, but overall `POST_START_QUICK_CHECK PASS=37 WARN=2 BLOCKED=1`, `RESULT=BLOCKED`, because StockPlatform data freshness was still blocked at that time and DR remained incomplete. 2026-06-25 20:11 StockPlatform cron-source recovery supersedes the 19:35 source-version wording. StockPlatform Gitea `main` and live `/home/wooo/stockplatform-v2` are now at `fb91aa4c6272469d1d26e0820169629eac17d28a fix(ops): restore production cron recovery entrypoints`; six missing production cron entrypoint scripts are restored, `run-intelligence-sync.sh` contains the Docker-backed `psql` shim, and live contract check confirms every `scripts/ops/*.sh` referenced by `install-production-cron.sh` exists. The only live write performed for StockPlatform recovery was a fast-forward `git pull --ff-only origin main` on 110; no Docker/systemd/Nginx/firewall/K8s restart, manual ingestion run, manual DB write, or secret read was performed. Natural cron evidence after the pull is now green for the repaired entrypoints: `source-remediation-queue` 19:56 and 20:00 succeeded, `market-index-ingestion` 20:00 succeeded, `price-ingestion` 20:02 succeeded, `margin-short-ingestion` 20:05 succeeded, `chips-ingestion` 20:06 succeeded, and `ai-recommendation-pipeline` 20:10 ran but correctly produced the internal blocker `core_margin_short_daily_incomplete,official_margin_short_daily_official_pending`. StockPlatform `/api/v1/system/freshness` therefore still returns `status=blocked` because the 2026-06-25 official margin-short source is pending and `ai.recommendations` must stay on 2026-06-24 until that gate clears. This is no longer a route, source-version, or missing-cron-script blocker; it is a product-data freshness blocker waiting on official source availability and the next valid AI pipeline run. @@ -896,7 +898,7 @@ Prometheus/Alertmanager on 110 -> Telegram ``` -Alertmanager health alone is not enough. Run E2E: +Alertmanager health alone is not enough. The test must be run from 110 or another host that can reach the K3s VIP / NodePort. A local workstation connection failure to `192.168.0.125:32334` does not by itself prove Alertmanager cannot deliver alerts, because Alertmanager also runs on 110. Run E2E: ```bash curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \ @@ -906,6 +908,8 @@ curl -s -X POST http://192.168.0.125:32334/api/v1/webhooks/alertmanager \ Expected: API returns success and Telegram receives the test alert. +2026-07-01 live readback:from 110, `/api/v1/webhooks/health` returns `200`, `/api/v1/telegram/health` returns `configured`, and synthetic no-secret `CodexAlertChainSmoke` POSTs to `192.168.0.125` / `192.168.0.120` / `192.168.0.121` all returned HTTP `200` with `告警已排入背景分析`. If Telegram is still not visible to the owner, the next check is AWOOOI API background analysis / TelegramGateway outbound status, not Prometheus alert existence. + --- ## 9. P2 Schedules And Delayed Work diff --git a/ops/monitoring/alerts-unified.yml b/ops/monitoring/alerts-unified.yml index 74a24faa..87f5380f 100644 --- a/ops/monitoring/alerts-unified.yml +++ b/ops/monitoring/alerts-unified.yml @@ -101,7 +101,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} '/home/wooo/scripts/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory @@ -304,6 +304,29 @@ groups: description: "host-runaway-process exporter 應永遠保持 read-only;若 remediation_authorized > 0,代表有人把監控器改成執行器或把 runtime gate 誤接上。" runbook: "立即回滾 exporter,檢查 Git diff、cron、Ansible role 與 /home/wooo/scripts/host-runaway-process-exporter.py。實際修復只能由 AI controlled packet 呼叫 gated remediation helper;監控 exporter 不得持有 runtime apply 權限。" + - alert: Host110SustainedModeratePressure + expr: | + (awoooi_host_load5_per_core{host="110"} > 0.75) + or + (docker_container_cpu_cores{host="110",container_name=~"gitea|stockplatform-v2-postgres-1|stockplatform-v2-api-1"} > 2.0) + for: 1m + labels: + severity: warning + layer: systemd-110 + component: host-pressure-controller + host: "110" + team: ops + alert_category: host_resource + notification_type: TYPE-1 + auto_repair: "true" + mcp_provider: "ssh_host" + host_type: "bare_metal" + annotations: + summary: "110 sustained pressure needs triage" + description: "110 load5/core > 0.75 或 Gitea / StockPlatform 關鍵容器 CPU > 2.0 core 持續 1 分鐘;這是 critical 之前的主動偵測,避免等到 load5/core > 1.5 才反應。" + auto_repair_action: "ssh 192.168.0.110 '/home/wooo/scripts/host-sustained-load-controller.py --host 110 --load5-per-core-threshold 0.75 --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" + runbook: "controller 只產生 controlled packet,不讀 secret、不重啟服務。若分類為 gitea_queue_or_hook_backlog,先跑 host-sustained-load-evidence.py 取得脫敏 top family / container,再選 Gitea queue/hook backlog playbook;若是 orphan browser 才允許 gated SIGTERM;若是 StockPlatform postgres/API,轉 Stock hot-query/source freshness playbook。禁止 Docker / systemd / Nginx / DB restart、reboot、firewall。" + - alert: HostCiRunnerLoadSaturation expr: | (awoooi_host_load5_per_core{host="110"} > 1.0) @@ -1538,7 +1561,7 @@ groups: runbook: "先跑 `scripts/backup/backup-status.sh --no-notify` 和 backup-health exporter readback,修復對應 required_jobs;禁止刪除、prune、restore 或覆蓋 production。" - alert: BackupAggregateRunFailed - expr: awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"} > 0 + expr: sum by (host) (awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}) > 0 for: 10m labels: severity: warning @@ -1551,8 +1574,8 @@ groups: auto_repair: "false" annotations: summary: "110 全服務備份最近一次有 {{ $value }} 個失敗項目" - description: "backup-all.sh 最近一次 aggregate run 仍有失敗;即使個別 DB 備份已手動補跑,也要重跑 aggregate backup 清除紅燈。" - runbook: "SSH 110 檢查 /backup/logs/cron.log 與 /backup/logs/backup.log,修正後執行 /backup/scripts/backup-all.sh" + description: "110 備份 component job 最近一次執行失敗項目數 > 0;舊 aggregate wrapper 的 backup_all failed_count 僅作診斷噪音,不再阻擋核心備份心跳。" + runbook: "先跑 `/backup/scripts/backup-status.sh --no-notify --no-refresh` 取得不刷新狀態,再修 wrapper / cron;不得因為 aggregate failed 就刪除 snapshot 或重跑 destructive cleanup。" - alert: BackupConfigCapturePartial expr: awoooi_backup_config_capture_ok{host="110",critical="true"} == 0 diff --git a/ops/monitoring/alerts.yml b/ops/monitoring/alerts.yml index 97dad7fe..04e011c4 100644 --- a/ops/monitoring/alerts.yml +++ b/ops/monitoring/alerts.yml @@ -69,7 +69,7 @@ groups: annotations: summary: "主機 {{ $labels.host }} load5/core 長時間過高" description: "load5 / CPU core > 1.5 持續 15 分鐘;這通常代表 runnable queue 已長期塞車,不是短暫尖峰。" - auto_repair_action: "ssh 192.168.0.{{ $labels.host }} 'scripts/ops/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" + auto_repair_action: "ssh 192.168.0.{{ $labels.host }} '/home/wooo/scripts/host-sustained-load-controller.py --host {{ $labels.host }} --metrics-file /home/wooo/node_exporter_textfiles/host_runaway_process.prom --docker-stats-file /home/wooo/node_exporter_textfiles/docker_stats.prom --json'" runbook: "交給 host-sustained-load-controller 產生 AI controlled packet:orphan browser 走 host-runaway-process-remediation.py dry-run → controlled SIGTERM → verifier;合法 CI/BuildKit 走 runner pressure fail-closed 與 drain/cancel packet;unknown 先跑 host-sustained-load-evidence.py 只讀脫敏證據再選服務專屬 PlayBook;swap 走服務專屬記憶體 PlayBook。禁止直接 docker/systemd/nginx/firewall/reboot。" - alert: HostOutOfMemory diff --git a/scripts/backup/backup-status.sh b/scripts/backup/backup-status.sh index 9a49a0fd..16c8d47b 100644 --- a/scripts/backup/backup-status.sh +++ b/scripts/backup/backup-status.sh @@ -116,6 +116,19 @@ metric_sum() { ' "${file}" } +metric_sum_excluding_backup_all() { + local file="$1" + local metric="$2" + if [ ! -s "${file}" ]; then + echo 0 + return 0 + fi + awk -v metric="${metric}" ' + $1 ~ ("^" metric "\\{") && $1 !~ /(exported_job|job)="backup_all"/ { sum += $2 } + END { print sum + 0 } + ' "${file}" +} + metric_first() { local file="$1" local metric="$2" @@ -214,6 +227,8 @@ stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_job_fresh" 0)" stale_188="$(metric_count "${TEXTFILE_188_TMP}" "awoooi_backup_job_fresh" 0)" failed_total_110="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")" failed_total_188="$(metric_sum "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")" +component_failed_110="$(metric_sum_excluding_backup_all "${TEXTFILE_110}" "awoooi_backup_last_run_failed_count")" +component_failed_188="$(metric_sum_excluding_backup_all "${TEXTFILE_188_TMP}" "awoooi_backup_last_run_failed_count")" integrity_stale_110="$(metric_count "${TEXTFILE_110}" "awoooi_backup_integrity_fresh" 0)" offsite_configured="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_configured")" offsite_fresh="$(metric_sum "${TEXTFILE_110}" "awoooi_backup_offsite_fresh")" @@ -221,7 +236,7 @@ offsite_rclone_configured="$(awk '/^awoooi_backup_offsite_configured\{.*provider offsite_rclone_fresh="$(awk '/^awoooi_backup_offsite_fresh\{.*provider="rclone"/ { print $2; found=1; exit } END { if (!found) print 0 }' "${TEXTFILE_110}" 2>/dev/null || echo 0)" escrow_missing="$(metric_first "${TEXTFILE_110}" "awoooi_backup_dr_credential_escrow_missing_count")" -core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + failed_total_110 + failed_total_188 + integrity_stale_110)) +core_blockers=$((host_110_missing + host_188_missing + configured_missing_110 + configured_missing_188 + script_missing_110 + script_missing_188 + stale_110 + stale_188 + component_failed_110 + component_failed_188 + integrity_stale_110)) dr_warnings=0 if [ "${offsite_configured%.*}" -lt 1 ] 2>/dev/null; then dr_warnings=$((dr_warnings + 1)) @@ -250,7 +265,7 @@ missing_scripts_188="$(label_list_for_zero "${TEXTFILE_188_TMP}" "awoooi_backup_ backup_all_ts="$(metric_value_for_label "${TEXTFILE_110}" "awoooi_backup_job_last_success_timestamp" "job" "backup_all")" last_backup_all="$(human_timestamp "${backup_all_ts}")" -message="${headline}; 110備份=${fresh_total_110}/13 fresh failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}" +message="${headline}; 110備份=${fresh_total_110}/13 fresh component_failed=${component_failed_110} aggregate_failed=${failed_total_110}; 188備份=${fresh_total_188}/2 fresh component_failed=${component_failed_188} aggregate_failed=${failed_total_188}; integrity_stale=${integrity_stale_110}; offsite_configured=${offsite_configured}; offsite_fresh=${offsite_fresh}; rclone_gdrive_configured=${offsite_rclone_configured}; rclone_gdrive_fresh=${offsite_rclone_fresh}; escrow_missing=${escrow_missing}; last_backup_all=${last_backup_all}" if [ "${core_blockers}" -gt 0 ]; then message="${message}; stale110=${stale_jobs_110:-none}; stale188=${stale_jobs_188:-none}; missing_script110=${missing_scripts_110:-none}; missing_script188=${missing_scripts_188:-none}" diff --git a/scripts/backup/tests/test_backup_status_contract.py b/scripts/backup/tests/test_backup_status_contract.py new file mode 100644 index 00000000..035b643a --- /dev/null +++ b/scripts/backup/tests/test_backup_status_contract.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[3] +BACKUP_STATUS = ROOT / "scripts" / "backup" / "backup-status.sh" + + +def test_backup_status_keeps_aggregate_failure_out_of_core_blockers() -> None: + text = BACKUP_STATUS.read_text(encoding="utf-8") + + assert "metric_sum_excluding_backup_all" in text + assert "component_failed_110" in text + assert "component_failed_188" in text + assert "aggregate_failed=${failed_total_110}" in text + core_line = next(line for line in text.splitlines() if line.startswith("core_blockers=")) + assert "component_failed_110" in core_line + assert "component_failed_188" in core_line + assert "failed_total_110" not in core_line + assert "failed_total_188" not in core_line diff --git a/scripts/ops/backup-alert-label-contract-check.py b/scripts/ops/backup-alert-label-contract-check.py index ccfbc818..22a8c126 100755 --- a/scripts/ops/backup-alert-label-contract-check.py +++ b/scripts/ops/backup-alert-label-contract-check.py @@ -97,10 +97,10 @@ def static_check(path: Path, baseline_path: Path) -> list[str]: rule = _require_alert(alerts, "BackupAggregateRunFailed") _require_contains( str(rule.get("expr", "")), - 'awoooi_backup_last_run_failed_count{host="110",exported_job="backup_all"}', + 'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}', "BackupAggregateRunFailed expr", ) - lines.append("OK BackupAggregateRunFailed filters exported_job=backup_all") + lines.append("OK BackupAggregateRunFailed excludes aggregate-only backup_all noise") rule = _require_alert(alerts, "BackupConfigCapturePartial") _require_contains(str(rule.get("expr", "")), "awoooi_backup_config_capture_ok", "BackupConfigCapturePartial expr") diff --git a/scripts/ops/backup-health-textfile-exporter.py b/scripts/ops/backup-health-textfile-exporter.py index 0c607357..742dff92 100755 --- a/scripts/ops/backup-health-textfile-exporter.py +++ b/scripts/ops/backup-health-textfile-exporter.py @@ -643,6 +643,9 @@ def _offsite_and_escrow_metric_lines(host: str) -> list[str]: if not offsite_configured: next_step = "configure_google_drive_rclone_on_110_tty" phase = 1 + elif escrow_missing_count == 0 and full_fresh: + next_step = "offsite_and_escrow_ready" + phase = 5 elif escrow_missing_count > 0 and full_fresh: next_step = "complete_credential_escrow_review" phase = 3 diff --git a/scripts/ops/tests/test_backup_health_textfile_exporter.py b/scripts/ops/tests/test_backup_health_textfile_exporter.py index 4a3863fc..e8ddb227 100644 --- a/scripts/ops/tests/test_backup_health_textfile_exporter.py +++ b/scripts/ops/tests/test_backup_health_textfile_exporter.py @@ -114,3 +114,35 @@ def test_dr_phase_does_not_regress_when_full_offsite_is_fresh_and_partial_is_sta ) assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 5' in rendered assert 'awoooi_backup_dr_phase{host="110",next_step="complete_credential_escrow_review"} 3' in rendered + + +def test_dr_phase_ready_when_full_offsite_is_fresh_and_escrow_is_complete( + tmp_path: Path, monkeypatch +) -> None: + exporter = load_exporter() + offsite_dir = tmp_path / "offsite" + escrow_dir = tmp_path / "escrow" + offsite_dir.mkdir() + escrow_dir.mkdir() + now = 1_782_900_000 + + monkeypatch.setattr(exporter, "OFFSITE_STATUS_DIR", offsite_dir) + monkeypatch.setattr(exporter, "ESCROW_EVIDENCE_DIR", escrow_dir) + monkeypatch.setattr(exporter.time, "time", lambda: now) + monkeypatch.setattr(exporter, "_b2_configured", lambda: False) + monkeypatch.setattr(exporter, "_rclone_configured", lambda: True) + (offsite_dir / "rclone-last-success").write_text(str(now - 3600), encoding="utf-8") + (offsite_dir / "rclone-partial-last-success").write_text(str(now - 72 * 3600), encoding="utf-8") + for item in exporter.ESCROW_ITEMS: + (escrow_dir / f"{item}.last_verified").write_text(str(now - 60), encoding="utf-8") + + metrics = exporter._offsite_and_escrow_metric_lines("110") + rendered = "\n".join(metrics) + + assert 'awoooi_backup_offsite_fresh{host="110",provider="rclone",max_age_hours="48"} 1' in rendered + assert ( + 'awoooi_backup_offsite_partial_fresh{host="110",provider="rclone",scope="partial",max_age_hours="48"} 0' + in rendered + ) + assert 'awoooi_backup_dr_credential_escrow_missing_count{host="110"} 0' in rendered + assert 'awoooi_backup_dr_phase{host="110",next_step="offsite_and_escrow_ready"} 5' in rendered diff --git a/scripts/ops/tests/test_host_pressure_alert_contract.py b/scripts/ops/tests/test_host_pressure_alert_contract.py new file mode 100644 index 00000000..a67d923e --- /dev/null +++ b/scripts/ops/tests/test_host_pressure_alert_contract.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from pathlib import Path + +import yaml + + +ROOT = Path(__file__).resolve().parents[3] +ALERTS = ROOT / "ops" / "monitoring" / "alerts-unified.yml" + + +def load_alerts() -> dict[str, dict]: + payload = yaml.safe_load(ALERTS.read_text(encoding="utf-8")) + alerts: dict[str, dict] = {} + for group in payload["groups"]: + for rule in group.get("rules", []): + if "alert" in rule: + alerts[rule["alert"]] = rule + return alerts + + +def test_110_moderate_pressure_alert_routes_to_live_controller() -> None: + alerts = load_alerts() + rule = alerts["Host110SustainedModeratePressure"] + + expr = str(rule["expr"]) + annotations = rule["annotations"] + action = annotations["auto_repair_action"] + + assert 'awoooi_host_load5_per_core{host="110"} > 0.75' in expr + assert 'docker_container_cpu_cores{host="110"' in expr + assert "> 2.0" in expr + assert "gitea" in expr + assert "stockplatform-v2-postgres-1" in expr + assert rule["for"] == "1m" + assert rule["labels"]["auto_repair"] == "true" + assert "/home/wooo/scripts/host-sustained-load-controller.py" in action + assert "--load5-per-core-threshold 0.75" in action + assert "不讀 secret" in annotations["runbook"] + assert "禁止 Docker / systemd / Nginx / DB restart" in annotations["runbook"] + + +def test_critical_sustained_load_alert_uses_deployed_controller_path() -> None: + alerts = load_alerts() + action = alerts["HostLoadAverageSustainedHigh"]["annotations"]["auto_repair_action"] + + assert "/home/wooo/scripts/host-sustained-load-controller.py" in action + assert "scripts/ops/host-sustained-load-controller.py" not in action + + +def test_backup_aggregate_alert_excludes_old_wrapper_noise() -> None: + alerts = load_alerts() + expr = str(alerts["BackupAggregateRunFailed"]["expr"]) + + assert 'awoooi_backup_last_run_failed_count{host="110",exported_job!="backup_all"}' in expr + assert 'exported_job="backup_all"} > 0' not in expr diff --git a/scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh b/scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh new file mode 100644 index 00000000..aa92b0e1 --- /dev/null +++ b/scripts/reboot-recovery/apply-credential-escrow-closeout-receipt-to-110.sh @@ -0,0 +1,202 @@ +#!/usr/bin/env bash +# Apply the committed redacted P0-005 credential escrow closeout receipt to +# host 110 as non-secret marker files. This script never reads credential +# values; it only forwards non-secret evidence refs that already passed the +# repository-side closeout contract. + +set -euo pipefail + +TARGET_HOST="${TARGET_HOST:-wooo@192.168.0.110}" +SSH_CONNECT_TIMEOUT="${SSH_CONNECT_TIMEOUT:-15}" +SSH_STRICT_HOST_KEY_CHECKING="${SSH_STRICT_HOST_KEY_CHECKING:-accept-new}" +RECEIPT_PATH="${RECEIPT_PATH:-docs/operations/awoooi-credential-escrow-evidence-controlled-closeout-receipt.snapshot.json}" +REMOTE_MARKER_SCRIPT="${REMOTE_MARKER_SCRIPT:-/backup/scripts/mark-credential-escrow-verified.sh}" +REMOTE_ESCROW_DIR="${REMOTE_ESCROW_DIR:-/backup/escrow-evidence}" +NOTE="${NOTE:-p0-005-controlled-closeout-receipt}" +MODE="check" +ROLLBACK_DIR="" + +SSH_OPTS=( + -n + -o BatchMode=yes + -o ConnectTimeout="$SSH_CONNECT_TIMEOUT" + -o ConnectionAttempts=1 + -o ServerAliveInterval=5 + -o ServerAliveCountMax=1 + -o StrictHostKeyChecking="$SSH_STRICT_HOST_KEY_CHECKING" +) + +usage() { + cat <<'USAGE' +Usage: + apply-credential-escrow-closeout-receipt-to-110.sh --check + apply-credential-escrow-closeout-receipt-to-110.sh --dry-run + apply-credential-escrow-closeout-receipt-to-110.sh --apply + apply-credential-escrow-closeout-receipt-to-110.sh --rollback --rollback-dir