diff --git a/.gitea/workflows/ansible-lint.yml b/.gitea/workflows/ansible-lint.yml index 8644e8ca..22814cd1 100644 --- a/.gitea/workflows/ansible-lint.yml +++ b/.gitea/workflows/ansible-lint.yml @@ -1,27 +1,8 @@ name: Ansible / Reboot Recovery Contract on: - push: - branches: [main] - paths: - - 'infra/ansible/**' - - 'ops/monitoring/**' - - 'ops/reboot-recovery/**' - - 'scripts/backup/**' - - 'scripts/ops/**' - - 'scripts/reboot-recovery/**' - - 'docs/**' - - '.gitea/workflows/**' - pull_request: - paths: - - 'infra/ansible/**' - - 'ops/monitoring/**' - - 'ops/reboot-recovery/**' - - 'scripts/backup/**' - - 'scripts/ops/**' - - 'scripts/reboot-recovery/**' - - 'docs/**' - - '.gitea/workflows/**' + # 2026-06-28 Codex: 110 host runner/CD lane pressure incident. + # Automatic push/PR triggers stay disabled until the runner is moved or rate-limited. workflow_dispatch: jobs: diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index 65068e82..913a42c6 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -10,23 +10,8 @@ name: CD Pipeline on: - push: - branches: [main] - paths: - # 只有實際影響部署的程式碼才觸發 CD - - 'apps/**' - - 'k8s/**' - - '.dockerignore' - # Dockerfile COPY scripts/ into the API image; keep production ops - # seed scripts deploy-coupled instead of repo-only. - - 'scripts/backup/backup-momo-188-pg.sh' - - 'scripts/ci/wait-host-web-build-pressure.sh' - - 'scripts/ops/notify-awoooi-ops.sh' - - 'scripts/ops/awooop-seed-auto-repair-canary-playbook.py' - # Workflow-only changes do not rebuild runtime images. Use workflow_dispatch - # when an operator explicitly wants to test the CD pipeline itself. - # docs/、memory/、ADR 等不觸發 - # ops/monitoring/alerts-unified.yml 由 deploy-alerts.yaml 獨立處理 (I3) + # 2026-06-28 Codex: 110 host runner/CD lane pressure incident. + # Production CD is manual-only until the runner is moved or rate-limited. 
workflow_dispatch: # 手動觸發永遠可用(用於補跑、緊急部署) diff --git a/.gitea/workflows/code-review.yaml b/.gitea/workflows/code-review.yaml index 7cd64ece..4d746d15 100644 --- a/.gitea/workflows/code-review.yaml +++ b/.gitea/workflows/code-review.yaml @@ -1,15 +1,8 @@ name: Code Review on: - push: - branches: [main] - paths: - - 'apps/**' - - 'k8s/**' - - '!k8s/awoooi-prod/kustomization.yaml' - - 'ops/**' - - 'scripts/**' - - '.gitea/workflows/**' + # 2026-06-28 Codex: 110 host runner/CD lane pressure incident. + # Keep code review manual until the runner is moved or rate-limited. workflow_dispatch: concurrency: diff --git a/AGENTS.md b/AGENTS.md index 0ca3f1a1..7d319b32 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,7 +46,7 @@ 正確動作是 AI 自動補齊 target selector、source-of-truth diff、check-mode / dry-run、rollback、post-apply verifier、KM / PlayBook trust writeback,然後推進可驗證、可回滾、低爆炸半徑的實作。 -**110 runner / controlled CD lane 壓力事故例外**:Gitea / act-runner / direct transient runner 對 110 造成 CPU / headless smoke / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 legacy runner、移除 legacy mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary,或把 host pressure gate 改成 warn-only。專用 `awoooi-cd-lane.service` 只能在獨立 sentinel、`capacity=1`、窄 label、rollback unit 與 post-apply verifier 成立時受控開啟;正確動作是分流 legacy runner 與 controlled cd-lane,不得一把梭恢復泛用 runner。 +**110 runner / direct CD lane 壓力事故例外**:Gitea / act-runner / direct transient runner / direct CD lane 對 110 造成 CPU / headless smoke / Docker build 壓力時,屬事故級容量保護,不得用「全面授權」直接重開 runner、移除 mask、還原 runner / cd-lane binary、用 `systemd-run` 直啟 `.real` binary,或把 host pressure gate 改成 warn-only。未完成 runner / CD lane 搬遷或硬限流前,相關 runner config / `.runner` registration 與 restore source artifacts 可不讀內容直接 quarantine,unit 應維持 `/dev/null` mask 或等價 fail-closed;會命中 110 runner 的 Gitea push / PR workflow 應維持 `workflow_dispatch` only。正確動作是先做 runner / CD lane 搬遷、限流、label isolation、smoke 排程,再以 check-mode、rollback 與 post-apply verifier 受控恢復。 --- diff --git a/docs/HARD_RULES.md 
b/docs/HARD_RULES.md index 5758f7dd..da1049b4 100644 --- a/docs/HARD_RULES.md +++ b/docs/HARD_RULES.md @@ -289,9 +289,9 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu ### 110 runner / direct CD lane 壓力事故例外 -2026-06-28 事故後,110 上的 Gitea / act-runner / direct transient runner、StockPlatform headless smoke、host-side Next build 與 Docker / BuildKit 壓力屬容量事故保護面。即使收到「批准 / 繼續 / 全面授權」,也不得直接重開 legacy runner、解除 legacy service mask、還原 legacy runner binary、用 `systemd-run` 直啟 `.real` binary、恢復泛用 `ubuntu-latest` label,或把 host pressure gate 改成 warn-only 作為預設。 +2026-06-28 事故後,110 上的 Gitea / act-runner / direct transient runner / direct CD lane、StockPlatform headless smoke、host-side Next build 與 Docker / BuildKit 壓力屬容量事故保護面。即使收到「批准 / 繼續 / 全面授權」,也不得直接重開 runner、解除 service mask、還原 live runner / cd-lane binary、用 `systemd-run` 直啟 `.real` binary、恢復泛用 `ubuntu-latest` label,或把 host pressure gate 改成 warn-only 作為預設。 -允許的 controlled apply 是降壓與防再發:停止 / disable / mask legacy runner、mask direct transient unit、quarantine legacy runner binary、收斂 labels、補 source fail-closed guard、搬遷 runner、限制 concurrency、把 smoke 改成排程 / 非 110 runner,以及執行只讀 pressure / cold-start verifier。專用 `awoooi-cd-lane.service` 可在獨立 sentinel、`capacity=1`、無 `ubuntu-latest` / StockPlatform / headless / Playwright label、可回滾 unit、post-apply verifier 都成立時受控開啟;verifier 必須把它與 legacy runner 分開判讀。 +允許的 controlled apply 是降壓與防再發:停止 / disable / mask runner、mask direct transient / direct CD lane unit、quarantine runner / cd-lane binary、在不讀 secret / token / registration payload 內容的前提下 quarantine runner config / `.runner` registration 與 restore source artifacts、清除 cd-lane controlled-open sentinel、收斂 labels、補 source / startup fail-closed guard、暫停會命中 110 runner 的 push / PR workflow automatic trigger、搬遷 runner / CD lane、限制 concurrency、把 smoke 改成排程 / 非 110 runner,以及執行只讀 pressure / cold-start verifier。 恢復 runner 必須同時具備: @@ -301,6 +301,14 @@ force push / 刪 repo / 刪 refs / 改 repo visibility / raw runtime secret volu 4. 
rollback:能回到 inactive / masked / fail-closed stub。 5. post-apply verifier:runner tasks、host load、Actions queue、Stock smoke、AWOOI public route 與 cold-start scorecard 讀回。 +在上述條件完成前,startup / recovery script 不得保留 `START_CONTROLLED_CD_LANE`、`ensure_controlled_cd_lane_open`、直接 restore quarantined config / `.runner` / ELF、或 touch `/run/awoooi-cd-lane-*` sentinel 後 `systemctl enable --now` 的路徑。 + +### Source freshness / provider proxy gate + +資料 freshness gate 必須分清「系統失效」與「上游沒有新來源」。若 source preflight 證明 auth 可用、intake 為空、failed folder 無候選、沒有比最後乾淨 import 更新的 source,且 DB snapshot / realtime sync 與 latest import job 乾淨,stale business data 應降為 warning / source freshness evidence,不得硬擋 cold-start 或 release。若 auth 失敗、failed folder 有候選、source metadata 不可讀、DB sync 不一致或有更新 source 未匯入,仍是 hard blocker。 + +AI provider / proxy health gate 也必須區分「已 provisioned 服務故障」與「未 provisioned / optional retired」。像 LiteLLM 這類會切 provider route 或成本邊界的 proxy,若 repo / inventory 已標成 optional retired、live 無 container 且無 provisioned dir,只能列 warning;不得為了讓 health gate 變綠而直接啟動或切換 provider / cost route。 + ### 跨專案繼承 本節是 AWOOOI / AwoooP / IwoooS / StockPlatform / VibeWork / MOMO / AwoooGo / 2026FIFA / Agent Bounty / Tsenyang 等所有專案的預設行為。若任一 repo 的 `AGENTS.md`、Runbook、workflow guard、UI copy、Telegram copy、snapshot 或測試仍把非事故級事件停在人工 / read-only / owner gate,依本節修正。 diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 0f93388f..0b2862fc 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,29 @@ +## 2026-06-28 — 10:34 110 cd-lane 外部 opener 止血與 188 source-aware gate 收斂 + +**背景**:09:44 後 `awoooi-cd-lane.service` 仍在 10:03、10:22 後多次被還原為 `enabled / active`,binary 又回到 ELF,並可接 Gitea Actions task;後續確認不是 Docker / Nginx / Harbor 事故,而是 110 runner / direct CD lane 壓力事故與外部 opener 反覆恢復。 + +**完成內容**: +- 110 live:停止 active Gitea Actions job container,停止 `awoooi-cd-lane.service`;多輪 regular fail-closed unit 仍被外部 opener 覆寫後,最終將 unit 改為 `/dev/null` mask symlink,將 `/home/wooo/awoooi-cd-lane/awoooi_cd_lane` 改為 shell stub,cd-lane 目錄與 
data 目錄加 immutable。 +- 110 live:不讀內容直接 quarantine `config.yaml`、`config.yaml.*`、`data/.runner` 與 restore source artifacts;讀回 `CONFIG_LEFT=0`、`RESTORE_SOURCE_LEFT=0`、`SENTINELS=0`、Actions `0`、cd-lane process `0`。 +- Root cause:最新 main 的 controlled CD lane restore/open 路徑與另一條 sudo opener 會 touch `/run/awoooi-cd-lane-*` sentinel、從 quarantine 安裝 config / `.runner` / ELF 並 `systemctl enable --now`。已將 source `scripts/reboot-recovery/awoooi-startup-110.sh` 改為只會 fail-close:清除 sentinel、quarantine config / `.runner`、unit 維持 `/dev/null` mask,並同步 live `/usr/local/bin/awoooi-startup-110.sh` 加 immutable。 +- 110 live:10:43 再清掉殘留 `/run/awoooi-runner-host-enabled` 與 `/home/wooo/act-runner/data/.runner.quarantined-20260628` restore source;10:46 停止新冒出的 `GITEA-ACTIONS-TASK-8430_WORKFLOW-Code-Review_JOB-ai-code-review` job container。 +- 110 live:最新 main 觸發的 `awoooi-cd-lane-drain.service` 於 10:54 被 P3 抓到 `active / enabled`、binary ELF、job container active;10:55 已停止 / disable / mask,並不讀內容 quarantine drain binary / config 與 job container。 +- Source:`.gitea/workflows/ansible-lint.yml`、`code-review.yaml`、`cd.yaml` 暫改 `workflow_dispatch` only,避免 main push 在 runner 搬遷 / 限流前自動觸發 110 runner / CD lane。 +- Source:`full-stack-cold-start-check.sh` 與 `p3-controlled-release-gate.sh` 修正 cd-lane / drain fail-closed verifier,將 `/dev/null` mask 或 not-found + inactive + sentinel missing + process `0` + binary 非 ELF / missing 視為 fail-closed;兩條 lane 都 fail-closed 才給 `CD_LANE_GUARDRAILS_OK 1`。 +- 188 backup:`scripts/backup/backup-momo-188-pg.sh` 改成 `127.0.0.1 --no-password`,不再要求 container env `POSTGRES_PASSWORD`;live backup 成功產生 `momo_analytics_20260628_095243.sql.gz (205M, 39s)`,backup exporter 顯示 `momo_pg_daily fresh=1`。 +- 188 MOMO:`momo-drive-token-source-recovery-preflight.sh` 與 cold-start 改成 source-aware;Drive intake `0`、failed `0`、global latest `2026-06-25`,latest daily import `57 completed` 且 `15383/15383/0`,因此 daily sales stale 降為 source freshness warning。 +- 188 LiteLLM:repo / inventory / P3 gate 改成 
optional retired / not-provisioned warning;未因 health gate 啟動 provider proxy,也未切 provider route / 成本路徑。 + +**驗證結果**: +- 10:32 延遲讀回:`/etc/systemd/system/awoooi-cd-lane.service -> /dev/null`、`0|masked|inactive|masked`、`PROC_COUNT=0`、`ACTIONS=0`、`CONFIG_LEFT=0`、`RESTORE_SOURCE_LEFT=0`、`SENTINELS=0`。 +- 10:47 最終讀回:`/etc/systemd/system/awoooi-cd-lane.service -> /dev/null`、`SYSTEMD=masked|inactive|masked`、`CD_LANE_PROC_COUNT=0`、`RUNNER_PROC_COUNT=0`、`ACTIVE_JOB_CONTAINERS=0`、`CONFIG_LEFT=0`、`RESTORE_SOURCE_LEFT=0`、`SENTINELS=0`。 +- 10:56 最終讀回:regular cd-lane 與 drain lane 都 `/dev/null` / masked 或 not-found / inactive;`CD_LANE_PROC_COUNT=0`、`RUNNER_PROC_COUNT=0`、`ACTIVE_JOB_CONTAINERS=0`、`CONFIG_REG_BINARY_LEFT=0`、`RESTORE_SOURCE_LEFT=0`、`SENTINELS=0`。 +- host pressure gate:`/usr/local/bin/awoooi-wait-host-web-build-pressure.sh` 回 `GATE_RC=0` 與 `no host web/build/smoke pressure detected`。 +- cold-start:`PASS=91 WARN=2 BLOCKED=0`;warnings 是 Alertmanager webhook POST skipped 與 MOMO source freshness,source preflight summary `PASS=20 WARN=5 BLOCKED=0`。 +- P3 release gate:`PASS=38 WARN=3 BLOCKED=0`,runner/CD guardrails `BAD_RUNNER_GUARDRAILS 0`,`NO_ACTIVE_JOB_CONTAINERS`。 + +**邊界**:沒有重啟 Docker / Nginx / firewall / K3s / DB;沒有讀 raw sessions / SQLite / auth / `.env` / runner token;沒有做 DB restore / destructive migration;沒有啟動 LiteLLM 或切 provider route。runner / CD lane 搬遷與硬限流仍是 P0,未完成前不得恢復 automatic push workflow 或 110 runner。 + ## 2026-06-28 — 10:22 AI Agent market radar readback 合約重新對齊 **背景**:feature 合併最新 `gitea-ssh/main=93434b1f6` 後,main 已把 `apps/api/tests/test_ai_agent_market_radar_readback.py` 的 committed snapshot 合約重新對齊為 `8f402983e` 與 `Durable execution / persistence / controlled review loop`;feature 原先的 snapshot 修正仍停在前一輪 `61cf5024` / `human-in-the-loop`,造成 focused pytest 2 項失敗。 @@ -19,7 +45,7 @@ - `full-stack-cold-start-check.sh`、`post-start-quick-check.sh`、`p3-controlled-release-gate.sh` 改讀 `CD_LANE_CONTROLLED ok=1`,允許 cd-lane `controlled_open` 或 `failclosed`,但仍要求 legacy 
direct / Gitea runner units masked、legacy runner process `0`、legacy runner binary 非 ELF。 - `AGENTS.md` 與 `docs/HARD_RULES.md` 同步新邊界:全面授權不是恢復泛用 runner,而是分流 legacy runner 與 controlled cd-lane。 -**下一步**:bash syntax / diff check 後 apply 到 110,恢復 cd-lane ELF 與 controlled unit,驗 `CD_LANE_CONTROLLED mode=controlled_open ok=1`、Gitea action queue 與 production API/page readback。 +**10:34 狀態更新**:後續 live incident 已證明 controlled-open 邏輯會被外部 opener 利用而反覆還原 cd-lane。此段保留為歷史紀錄;實際下一步以 10:34 fail-closed 結論為準,未完成搬遷或硬限流前不得恢復 cd-lane ELF、unit 或 automatic push workflow。 ## 2026-06-28 — 09:35 Agent Market discovery guard 轉 AI controlled queue diff --git a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md index c96f1a0d..3aedc4f1 100644 --- a/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md +++ b/docs/superpowers/specs/2026-04-15-MASTER-ai-autonomous-flywheel-v2.md @@ -153,6 +153,9 @@ AWOOOI / AwoooP / IwoooS 不是單純監控頁、告警轉發器或資安清冊 3. 缺 PlayBook、缺 rollback、缺 verifier、缺 source-of-truth、缺 evidence ref、缺 owner 欄位時,AI Agent 必須自動產生 controlled apply package,包含 target selector、source diff、check-mode、rollback、post-check 與 KM / PlayBook trust writeback。 4. Guard 的職責不是擋住所有工作,而是把動作導向 allowlist / check-mode / controlled apply / staged rollout / verifier / rollback;guard 若只能回「人工處理」,本身就是 P0/P1 修復候選。 5. 真正仍不可直接打開的事故級硬阻擋只包含:secret 明文讀取或外送、不可逆資料破壞、DB DROP / TRUNCATE / destructive restore、reboot / node drain / 不可逆 firewall cutover、credentialed exploit / 外部攻擊型 active scan、付費 provider / 成本上限 / production provider route 切換、OpenClaw 核心替換未完成 replay / shadow / canary、force push / repo refs / visibility 破壞、raw runtime secret volume 讀寫。 +6. 110 runner / direct CD lane 容量事故屬硬保護例外:搬遷或硬限流前,不得重開 runner、解除 fail-closed、還原 ELF、恢復泛用 label 或把 host pressure gate warn-only;允許 quarantine config / `.runner` registration、immutable fail-closed unit / binary、workflow manual-only、source guard 與只讀 verifier。 +7. 
資料 freshness gate 必須 source-aware:若 Drive / provider source preflight 證明沒有比最後乾淨 import 更新的來源,且 DB sync / import job 乾淨,stale business data 是 source freshness warning;auth/source/failed-folder/DB sync 有異常才是 hard blocker。 +8. Provider proxy gate 必須避免成本 / route 誤開:未 provisioned 且 repo 已標 optional retired 的 LiteLLM 等 proxy,只能列 warning;不得為了過 health gate 自動啟動或切 production provider route。 本節是全產品 override。§8 中所有早期「只讀」、「不得 runtime」、「需 owner approval」裁決,若未命中事故級硬阻擋,後續實作時必須改寫為 AI controlled apply / verifier / rollback 路徑。 diff --git a/infra/ansible/inventory/group_vars/host_188.yml b/infra/ansible/inventory/group_vars/host_188.yml index e8c56631..0bc802d5 100644 --- a/infra/ansible/inventory/group_vars/host_188.yml +++ b/infra/ansible/inventory/group_vars/host_188.yml @@ -18,9 +18,6 @@ docker_compose_services: minio: dir: /opt/minio expected_port: 9000 - litellm: - dir: /opt/litellm - expected_port: 4000 n8n: dir: /opt/n8n expected_port: 5678 diff --git a/infra/ansible/playbooks/188-ai-web-readonly.yml b/infra/ansible/playbooks/188-ai-web-readonly.yml index 19151299..19688d9f 100644 --- a/infra/ansible/playbooks/188-ai-web-readonly.yml +++ b/infra/ansible/playbooks/188-ai-web-readonly.yml @@ -15,7 +15,6 @@ - momo-db - signoz - minio - - litellm - n8n - open-webui - docker-registry diff --git a/infra/ansible/playbooks/188-ai-web.yml b/infra/ansible/playbooks/188-ai-web.yml index ca73d355..53f71202 100644 --- a/infra/ansible/playbooks/188-ai-web.yml +++ b/infra/ansible/playbooks/188-ai-web.yml @@ -9,7 +9,7 @@ # momo: running (port 5003) # signoz: running (port 3301) # minio: running (port 9000) -# litellm: running (port 4000) +# litellm: optional/retired unless provider route is explicitly approved # n8n: running (port 5678) # open-webui: running (port 3010) # docker-registry: running (port 5001) @@ -51,7 +51,6 @@ - momo - signoz - minio - - litellm tags: docker - name: "OpenClaw | 確認 systemd drop-in 目錄存在" diff --git a/k8s/monitoring/prometheus.yml 
b/k8s/monitoring/prometheus.yml index 14ad7764..23db89f8 100644 --- a/k8s/monitoring/prometheus.yml +++ b/k8s/monitoring/prometheus.yml @@ -55,7 +55,6 @@ scrape_configs: - targets: - https://aiops.wooo.work - https://mo.wooo.work - - http://192.168.0.188:4000/health/liveliness - http://192.168.0.110:3001 - http://192.168.0.125:32334/api/v1/health - http://192.168.0.125:32335 diff --git a/ops/reboot-recovery/full-stack-cold-start-baseline.yml b/ops/reboot-recovery/full-stack-cold-start-baseline.yml index 899d0375..c824273a 100644 --- a/ops/reboot-recovery/full-stack-cold-start-baseline.yml +++ b/ops/reboot-recovery/full-stack-cold-start-baseline.yml @@ -305,9 +305,8 @@ resource_guardrails: num_parallel: 1 note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint;不得維持 disabled/inactive,也不得保留 700%/45G 無節制 guardrail。" litellm: - cpus: 1.0 - memory: 1G - mode: stateless + mode: optional_retired + note: "188 currently has no litellm container, unit, port 4000, or /opt/litellm tree. Do not hard-start a provider proxy without provider route/cost approval; P3 gate treats absent litellm as warning evidence." 
momo_scheduler: cpus: 2.0 memory: 2G diff --git a/ops/runner/README.md b/ops/runner/README.md index 18a569b5..66cd7cd0 100644 --- a/ops/runner/README.md +++ b/ops/runner/README.md @@ -416,6 +416,27 @@ post-apply verifier 可讀回 `CD_LANE_CONTROLLED ok=1` 時,才可受控恢復 未完成 runner 搬遷、限流、smoke 排程前,不得解除 legacy mask、恢復泛用 runner label, 或把 host pressure gate 預設改成 warn-only。 +2026-06-28 10:10 live update:`awoooi-cd-lane.service` 曾再次被還原為 +`enabled / active` 並把 binary 還原成 ELF;本次改成更強 fail-closed: + +- `awoooi-cd-lane.service` 是 `/dev/null` mask symlink;先前 regular fail-closed + unit 會被外部 opener 用 `tee` 覆寫,mask symlink 讓相同寫入落到 `/dev/null`。 +- `/home/wooo/awoooi-cd-lane/awoooi_cd_lane` 是 immutable shell stub。 +- `config.yaml`、`config.yaml.*`、`data/.runner` 與 restore artifacts 已不讀內容 + 直接 quarantine 到 root 隔離區,cd-lane 目錄 / data 目錄維持 immutable。 +- act-runner-controlled 的 rollback binary / `.runner` restore source 與 `/tmp/awoooi-startup-110.sh.*` + opener 腳本也必須維持 root 隔離;否則外部 opener 可繞過 cd-lane 目錄隔離重新 + 生成 config / registration / ELF。 +- `.gitea/workflows/ansible-lint.yml`、`code-review.yaml`、`cd.yaml` 暫時改成 + `workflow_dispatch` only,避免 main push 自動把 110 direct lane 再拉起。 +- `/usr/local/bin/awoooi-startup-110.sh` 已同步 source fail-closed 版本:移除 + controlled-open path,清除 `/run/awoooi-cd-lane-*` sentinel,並會 quarantine + cd-lane `config.yaml` / `.runner` registration。 + +Rollback 必須先完成 runner / CD lane 搬遷或硬限流,確認新的 target selector、 +source diff、rollback 與 post-apply verifier 後,才能 `chattr -i`、恢復 unit / +registration,並重新啟用必要 workflow trigger。 + --- 版本: v2.0 | 更新: 2026-03-29 | 作者: Claude Code 變更: v1.0→v2.0 序列建構取代 Job Concurrency Groups diff --git a/scripts/backup/backup-momo-188-pg.sh b/scripts/backup/backup-momo-188-pg.sh index 9321d886..c4c04d3b 100755 --- a/scripts/backup/backup-momo-188-pg.sh +++ b/scripts/backup/backup-momo-188-pg.sh @@ -104,8 +104,8 @@ container_running() { run_pg_dump() { docker exec "${DB_CONTAINER}" sh -eu -c ' - : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container 
env}" - PGPASSWORD="${POSTGRES_PASSWORD}" exec pg_dump \ + exec pg_dump \ + -h 127.0.0.1 \ -U "${POSTGRES_USER:-momo}" \ -d "${POSTGRES_DB:-momo_analytics}" \ --no-password \ @@ -124,8 +124,8 @@ insert_backup_log() { -e BACKUP_HOST="$(hostname)" \ -e BACKUP_STORAGE_PATH="${FILEPATH}" \ "${DB_CONTAINER}" sh -eu -c ' - : "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}" - PGPASSWORD="${POSTGRES_PASSWORD}" psql \ + psql \ + -h 127.0.0.1 \ -U "${POSTGRES_USER:-momo}" \ -d "${POSTGRES_DB:-momo_analytics}" \ --no-password \ diff --git a/scripts/reboot-recovery/awoooi-startup-110.sh b/scripts/reboot-recovery/awoooi-startup-110.sh index 13bb5966..468ea6d4 100644 --- a/scripts/reboot-recovery/awoooi-startup-110.sh +++ b/scripts/reboot-recovery/awoooi-startup-110.sh @@ -192,20 +192,13 @@ log "[6/6] 檢查 Gitea Act Runner(預設不自動啟動)..." RUNNER_DIR="/home/wooo/act-runner" RUNNER_SERVICE="gitea-act-runner-host.service" RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled" +CD_LANE_DIR="/home/wooo/awoooi-cd-lane" +CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain" START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}" START_GITEA_RUNNER_ALLOWED=0 -CD_LANE_DIR="/home/wooo/awoooi-cd-lane" -CD_LANE_SERVICE="awoooi-cd-lane.service" -CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane" -CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml" -CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain" -CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service" -CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled" -CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml" -CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled" -START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}" -START_CD_LANE_ALLOWED=0 RUNNER_FAIL_CLOSED_SERVICES=( + "awoooi-cd-lane.service" + "awoooi-cd-lane-drain.service" "awoooi-direct-runner-open.service" "awoooi-direct-runner.service" "gitea-act-runner-host.service" @@ -214,19 +207,18 @@ RUNNER_FAIL_CLOSED_SERVICES=( 
"gitea-act-runner-awoooi-open.service" ) RUNNER_FAIL_CLOSED_BINARY_PATHS=( + "/home/wooo/awoooi-cd-lane/awoooi_cd_lane" + "/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" "/home/wooo/act-runner/act_runner" "/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard" "/home/wooo/act-runner-controlled/act_runner" "/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" ) -# Legacy host runner still needs both keys. The dedicated cd-lane has its own -# sentinel and narrow label/capacity verifier below. +# Host runner still needs both keys. The direct cd-lane stays fail-closed until +# it is migrated or hard-limited outside this production host pressure lane. if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then START_GITEA_RUNNER_ALLOWED=1 fi -if [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; then - START_CD_LANE_ALLOWED=1 -fi mask_runner_unit_file() { local unit="$1" @@ -279,143 +271,71 @@ EOF install_cd_lane_fail_closed_unit() { local unit_file="/etc/systemd/system/awoooi-cd-lane.service" - local tmp local quarantine_stamp quarantine_stamp="$(date +%Y%m%d%H%M%S)" + systemctl mask awoooi-cd-lane.service >/dev/null 2>&1 || true if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then chattr -i "$unit_file" >/dev/null 2>&1 || true - if ! grep -q "AWOOOI direct CD lane fail-closed" "$unit_file" 2>/dev/null; then + if ! 
{ [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; }; then mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true fi fi - tmp="$(mktemp)" - cat >"$tmp" <<'EOF' -[Unit] -Description=AWOOOI direct CD lane fail-closed after 2026-06-28 pressure incident -ConditionPathExists=/run/awoooi-cd-lane-enabled - -[Service] -Type=oneshot -ExecStart=/bin/false -EOF - install -o root -g root -m 0444 "$tmp" "$unit_file" >/dev/null 2>&1 || true - rm -f "$tmp" - chattr +i "$unit_file" >/dev/null 2>&1 || true + ln -sfn /dev/null "$unit_file" >/dev/null 2>&1 || true } -cd_lane_config_path_is_controlled() { - local config_path="$1" - [ -f "$config_path" ] || return 1 - grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1 - grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1 - grep -q 'awoooi-host:host' "$config_path" || return 1 - if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then - return 1 - fi - return 0 +quarantine_cd_lane_registration_fail_closed() { + local quarantine_dir + local lane_dir + local path + local target + + rm -f /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open >/dev/null 2>&1 || true + + for lane_dir in "$CD_LANE_DIR" "$CD_LANE_DRAIN_DIR"; do + [ -d "$lane_dir" ] || continue + quarantine_dir="$lane_dir/quarantine-startup-$(date +%Y%m%d%H%M%S)" + chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true + mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true + while IFS= read -r -d '' path; do + [ -e "$path" ] || continue + chattr -i "$path" >/dev/null 2>&1 || true + target="$quarantine_dir/$(printf '%s' "${path#"$lane_dir/"}" | tr '/' '_')" # lane-relative name with '/' flattened (data/.runner -> data_.runner) so it cannot collide with a top-level .runner that was already quarantined and made immutable + mv "$path" "$target" >/dev/null 2>&1 || true + chmod 0400 "$target" >/dev/null 2>&1 || true + chattr +i "$target" >/dev/null 2>&1 || true + done < <( + { + find "$lane_dir" -maxdepth 1 \( -name 'config.yaml' -o -name
'config.yaml.*' -o -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null + find "$lane_dir/data" -maxdepth 1 \( -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null + } || true + ) + chattr +i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true + done } -cd_lane_config_is_controlled() { - cd_lane_config_path_is_controlled "$CD_LANE_CONFIG" -} - -cd_lane_drain_config_is_controlled() { - cd_lane_config_path_is_controlled "$CD_LANE_DRAIN_CONFIG" -} - -cd_lane_drain_is_controlled_open() { - local active - active="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)" - [ "$active" = "active" ] || return 1 - cd_lane_drain_config_is_controlled || return 1 - file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1 - return 0 -} - -ensure_cd_lane_fail_closed() { - if cd_lane_drain_is_controlled_open; then - log "✅ controlled cd-lane drain verifier passed; preserving drain lane and fail-closing regular lane only" - systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - install_cd_lane_fail_closed_unit - pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true - systemctl daemon-reload >/dev/null 2>&1 || true - return 0 - fi - if { [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ -e "/run/awoooi-cd-lane-controlled-open" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; } \ - && cd_lane_config_is_controlled \ - && file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then - log "✅ controlled cd-lane verifier passed; keeping dedicated lane open" - install_controlled_cd_lane_unit - systemctl daemon-reload >/dev/null 2>&1 || true - systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - return 0 - fi - systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true - systemctl disable "$CD_LANE_SERVICE" 
>/dev/null 2>&1 || true +apply_cd_lane_fail_closed_guard() { + local unit + for unit in awoooi-cd-lane.service awoooi-cd-lane-drain.service; do + systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true + systemctl stop "$unit" >/dev/null 2>&1 || true + systemctl disable "$unit" >/dev/null 2>&1 || true + if [ "$unit" = "awoooi-cd-lane.service" ]; then + install_cd_lane_fail_closed_unit + else + systemctl mask "$unit" >/dev/null 2>&1 || mask_runner_unit_file "$unit" "/etc/systemd/system" + mask_runner_unit_file "$unit" "/etc/systemd/system" + fi + done install_cd_lane_fail_closed_unit - pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true - guard_runner_binary_fail_closed "$CD_LANE_BINARY" + pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true + pkill -KILL -f "^${CD_LANE_DRAIN_DIR}/awoooi_cd_lane_controlled daemon" >/dev/null 2>&1 || true + quarantine_cd_lane_registration_fail_closed + guard_runner_binary_fail_closed "$CD_LANE_DIR/awoooi_cd_lane" + guard_runner_binary_fail_closed "$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled" systemctl daemon-reload >/dev/null 2>&1 || true } -install_controlled_cd_lane_unit() { - local unit_file="/etc/systemd/system/$CD_LANE_SERVICE" - local tmp - chattr -i "$unit_file" "$CD_LANE_BINARY" >/dev/null 2>&1 || true - tmp="$(mktemp)" - cat >"$tmp" </dev/null 2>&1 || true - rm -f "$tmp" -} - -ensure_controlled_cd_lane_open() { - if ! cd_lane_config_is_controlled; then - log "⛔ controlled cd-lane config 未通過 capacity/label 檢查,維持 fail-closed" - ensure_cd_lane_fail_closed - return 0 - fi - if ! 
file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then - log "⛔ controlled cd-lane binary 不是可執行 ELF,維持 fail-closed" - ensure_cd_lane_fail_closed - return 0 - fi - install_controlled_cd_lane_unit - systemctl daemon-reload >/dev/null 2>&1 || true - systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true -} - ensure_host_runner_fail_closed() { local unit local binary @@ -445,6 +365,8 @@ ensure_host_runner_fail_closed() { fi pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true + pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true + quarantine_cd_lane_registration_fail_closed for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do guard_runner_binary_fail_closed "$binary" done @@ -550,13 +472,8 @@ else log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR" fi -if [ "$START_CD_LANE_ALLOWED" = "1" ]; then - log "✅ controlled cd-lane sentinel present; opening dedicated rate-limited CD lane" - ensure_controlled_cd_lane_open -else - log "⏸️ controlled cd-lane 維持 fail-closed;需 $CD_LANE_ENABLE_SENTINEL 或 AWOOOI_START_CONTROLLED_CD_LANE=1" - ensure_cd_lane_fail_closed -fi +log "⏸️ direct cd-lane 維持 fail-closed;需完成搬遷或硬限流後才可用獨立變更恢復" +apply_cd_lane_fail_closed_guard # ────────────────────────────────────────────── # STEP 7: Sentry(Error Tracking) diff --git a/scripts/reboot-recovery/full-stack-cold-start-check.sh b/scripts/reboot-recovery/full-stack-cold-start-check.sh index 1237a516..7955a0c2 100755 --- a/scripts/reboot-recovery/full-stack-cold-start-check.sh +++ b/scripts/reboot-recovery/full-stack-cold-start-check.sh @@ -286,7 +286,7 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*" for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /" done -for u in 
awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do +for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true) unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true) active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true) @@ -294,6 +294,8 @@ for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-ac unit_ok=0 if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then unit_ok=1 + elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then + unit_ok=1 fi echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok" done @@ -317,16 +319,21 @@ fi cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing) cd_lane_binary_elf=0 echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1 +cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ") cd_lane_ok=0 cd_lane_mode=blocked -if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then +if [ "$cd_lane_active" = "inactive" ] \ + && [ "$cd_lane_sentinel" = "missing" ] \ + && [ "$cd_lane_binary_elf" = "0" ] \ + && [ "$cd_lane_process_count" = "0" ] \ + && { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; 
then cd_lane_ok=1 cd_lane_mode=failclosed elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then cd_lane_ok=1 cd_lane_mode=controlled_open fi -echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok" +echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok" cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) @@ -344,24 +351,25 @@ fi cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing) cd_lane_drain_binary_elf=0 echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1 +cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ") cd_lane_drain_ok=0 -cd_lane_drain_mode=absent -if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then - cd_lane_drain_mode=blocked -fi -if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then +cd_lane_drain_mode=blocked +if [ 
"$cd_lane_drain_active" != "active" ] \ + && [ "$cd_lane_drain_binary_elf" = "0" ] \ + && [ "$cd_lane_drain_process_count" = "0" ] \ + && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then cd_lane_drain_ok=1 - cd_lane_drain_mode=controlled_open + cd_lane_drain_mode=failclosed fi -echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok" +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" cd_lane_guard_ok=0 -if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then +if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then cd_lane_guard_ok=1 fi echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" -for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do +for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; 
do kind=$(file -b "$p" 2>/dev/null || echo missing) echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind" echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p" @@ -573,22 +581,69 @@ scheduler_uid=$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk "NR echo "MOMO_GDRIVE_TOKEN_STAT ${token_stat:-missing} scheduler_uid=${scheduler_uid:-unknown}" db_user=$(docker exec momo-pro-system printenv POSTGRES_USER 2>/dev/null || true) db_name=$(docker exec momo-pro-system printenv POSTGRES_DB 2>/dev/null || true) -db_pass=$(docker exec momo-pro-system printenv POSTGRES_PASSWORD 2>/dev/null || true) -if [ -n "$db_user" ] && [ -n "$db_name" ] && [ -n "$db_pass" ]; then - momo_sync=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;" 2>/dev/null || true) - momo_freshness=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;" 2>/dev/null || true) - momo_import_config=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 
127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT config_key || chr(61) || config_value FROM import_config;" 2>/dev/null | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true) - momo_latest_import_job=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" 2>/dev/null | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true) +if [ -n "$db_user" ] && [ -n "$db_name" ]; then + psql_no_secret() { + docker exec -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" --no-password -Atc "$1" 2>/dev/null || true + } + momo_sync=$(psql_no_secret "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || 
coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;") + momo_freshness=$(psql_no_secret "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;") + momo_import_config=$(psql_no_secret "SELECT config_key || chr(61) || config_value FROM import_config;" | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true) + momo_latest_import_job=$(psql_no_secret "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true) + tmp_drive_probe="/tmp/awoooi-momo-drive-source-probe.$$" + cat > "$tmp_drive_probe" </dev/null | awk "/^MOMO_DRIVE_/ {print}" || true) + rm -f "$tmp_drive_probe" else momo_sync="" momo_freshness="" momo_import_config="" momo_latest_import_job="" + momo_drive_source_probe="" fi echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}" echo "MOMO_DAILY_FRESHNESS ${momo_freshness:-unavailable}" echo "MOMO_IMPORT_CONFIG ${momo_import_config:-unavailable}" echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}" +printf "%s\n" "$momo_drive_source_probe" ' 2>&1); then echo "$out" grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not 
confirmed" @@ -611,10 +666,36 @@ echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}" grep -Fq "MOMO_IMPORT_CONFIG 當日業績匯入|即時業績_當日" <<<"$out" && ok "188 momo Drive import config points to expected daily-sales intake" || fail "188 momo Drive import config drifted from expected daily-sales intake" awk '/MOMO_LATEST_IMPORT_JOB / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[2] == "completed" && a[6] == a[7] && a[8] == 0)}' <<<"$out" && ok "188 momo latest daily import job completed cleanly" || warn "188 momo latest daily import job not confirmed clean" awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed" + momo_source_stale_only=$(awk ' + $1 == "MOMO_DRIVE_INTAKE_COUNT" {intake=$2+0} + $1 == "MOMO_DRIVE_FAILED_COUNT" {failed=$2+0} + $1 == "MOMO_DRIVE_GLOBAL_LATEST_DATE" {global=$2} + $1 == "MOMO_LATEST_IMPORT_JOB" {split($2,a,"|"); completed=substr(a[5],1,10)} + END { + if (intake == 0 && failed == 0 && global ~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ && completed ~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ && global <= completed) print 1; + else print 0; + }' <<<"$out") if awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 0 && a[1] <= 2)}' <<<"$out"; then ok "188 momo daily sales data fresh enough" elif awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 3)}' <<<"$out"; then - if awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then + if [ "$momo_source_stale_only" = "1" ]; then + warn "188 momo daily sales stale but Drive has no newer source candidate" + elif [ -x scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh ]; then + momo_source_preflight_summary="$( + scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh \ + --host ollama@192.168.0.188 \ + 
--freshness-max-days 2 2>/dev/null \ + | awk '/^MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT / {line=$0} END {print line}' || true + )" + [ -n "$momo_source_preflight_summary" ] && echo "$momo_source_preflight_summary" + if grep -q "BLOCKED=0" <<<"$momo_source_preflight_summary"; then + warn "188 momo daily sales stale but source preflight has no hard blocker" + elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then + fail "188 momo source file absent while daily sales data stale" + else + fail "188 momo daily sales data stale beyond 3 days" + fi + elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then fail "188 momo source file absent while daily sales data stale" else fail "188 momo daily sales data stale beyond 3 days" diff --git a/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh b/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh index 55086914..e4856200 100755 --- a/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh +++ b/scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh @@ -306,6 +306,7 @@ drive_archive_count="$(num_for DRIVE_ARCHIVE_COUNT)" drive_failed_count="$(num_for DRIVE_FAILED_COUNT)" drive_archive_latest="$(value_for DRIVE_ARCHIVE_LATEST_MODIFIED)" drive_global_latest="$(value_for DRIVE_GLOBAL_LATEST_MODIFIED)" +drive_global_latest_date="${drive_global_latest:0:10}" if [[ "$drive_intake_count" -gt 0 ]]; then ok "Drive daily-sales intake has pending source files: count=$drive_intake_count" else @@ -338,13 +339,22 @@ IFS='|' read -r freshness_days latest_daily_date <<<"$freshness" if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -le "$FRESHNESS_MAX_DAYS" ]]; then ok "daily sales data freshness is within ${FRESHNESS_MAX_DAYS} days: $freshness" elif [[ "$freshness_days" =~ ^[0-9]+$ ]]; then - blocked "daily sales data is stale: $freshness" + warn "daily sales data is stale: $freshness" else blocked "daily sales freshness is 
unavailable: ${freshness:-missing}" fi latest_job="$(value_for DB_LATEST_DAILY_IMPORT_JOB)" IFS='|' read -r job_id job_status job_file job_created job_completed job_total job_success job_errors <<<"$latest_job" +job_completed_date="${job_completed:0:10}" +source_absent_without_newer_drive=0 +if [[ "$drive_intake_count" -eq 0 \ + && "$drive_failed_count" -eq 0 \ + && "$drive_global_latest_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ \ + && "$job_completed_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]] \ + && [[ "$drive_global_latest_date" < "$job_completed_date" || "$drive_global_latest_date" == "$job_completed_date" ]]; then + source_absent_without_newer_drive=1 +fi if [[ "$job_id" =~ ^[0-9]+$ && "$job_status" == "completed" && "$job_total" == "$job_success" && "$job_errors" == "0" ]]; then ok "latest daily import job completed cleanly: id=$job_id file=$job_file" else @@ -354,6 +364,8 @@ fi if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -gt "$FRESHNESS_MAX_DAYS" ]]; then if [[ "$auth_failures" -gt 0 ]]; then blocked "release blocker is stale business data with active Drive auth/source evidence gate" + elif [[ "$source_absent_without_newer_drive" -eq 1 ]]; then + warn "daily sales data is stale, but Drive has no newer source candidate than the last clean import" else blocked "release blocker is stale business data; source evidence must be refreshed" fi diff --git a/scripts/reboot-recovery/p3-controlled-release-gate.sh b/scripts/reboot-recovery/p3-controlled-release-gate.sh index 68a25911..a36b4e1b 100755 --- a/scripts/reboot-recovery/p3-controlled-release-gate.sh +++ b/scripts/reboot-recovery/p3-controlled-release-gate.sh @@ -306,13 +306,15 @@ check_runner_guardrails() { local out bad if ! 
out=$(ssh_cmd "wooo@192.168.0.110" ' bad=0 -for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do +for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true) unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true) active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true) unit_ok=0 if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then unit_ok=1 + elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then + unit_ok=1 fi echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active ok=$unit_ok" [ "$unit_ok" = "1" ] || bad=1 @@ -336,16 +338,21 @@ fi cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing) cd_lane_binary_elf=0 echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1 +cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ") cd_lane_ok=0 cd_lane_mode=blocked -if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then +if [ "$cd_lane_active" = "inactive" ] \ + && [ "$cd_lane_sentinel" = "missing" ] \ + && [ "$cd_lane_binary_elf" = "0" ] \ + && [ "$cd_lane_process_count" = "0" ] \ + && { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then cd_lane_ok=1 cd_lane_mode=failclosed 
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then cd_lane_ok=1 cd_lane_mode=controlled_open fi -echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok" +echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok" cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true) cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true) cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true) @@ -362,18 +369,19 @@ fi cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing) cd_lane_drain_binary_elf=0 echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1 +cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ") cd_lane_drain_ok=0 -cd_lane_drain_mode=absent -if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then - cd_lane_drain_mode=blocked -fi -if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then +cd_lane_drain_mode=blocked +if [ "$cd_lane_drain_active" != "active" ] \ + && [ "$cd_lane_drain_binary_elf" = "0" ] \ + && [ 
"$cd_lane_drain_process_count" = "0" ] \ + && { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then cd_lane_drain_ok=1 - cd_lane_drain_mode=controlled_open + cd_lane_drain_mode=failclosed fi -echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok" +echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok" cd_lane_guard_ok=0 -if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then +if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then cd_lane_guard_ok=1 fi echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" @@ -381,7 +389,7 @@ echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok" direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ") echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count" [ "$direct_runner_count" = "0" ] || bad=1 -for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do +for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do kind=$(file -b "$p" 2>/dev/null || echo 
missing) echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind" echo "$kind" | grep -qi "ELF" && bad=1 @@ -446,13 +454,22 @@ echo "ollama-systemd $(systemctl is-active ollama 2>/dev/null || true)" echo "ollama-api $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:11434/api/tags || true)" docker inspect -f "momo-scheduler {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true docker inspect -f "litellm {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" litellm 2>/dev/null || true +if ! docker inspect litellm >/dev/null 2>&1 && [ ! -d /opt/litellm ]; then + echo "litellm not-provisioned" +fi docker inspect -f "signoz-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" signoz-clickhouse 2>/dev/null || true ' 2>&1); then echo "$out" grep -q "ollama-systemd active" <<<"$out" && ok "188 Ollama systemd active" || blocked "188 Ollama systemd inactive" grep -q "ollama-api 200" <<<"$out" && ok "188 Ollama API reachable" || blocked "188 Ollama API not reachable" grep -q "momo-scheduler running healthy" <<<"$out" && ok "188 momo-scheduler healthy" || blocked "188 momo-scheduler not healthy" - grep -Eq "litellm running( |$)" <<<"$out" && ok "188 litellm running" || blocked "188 litellm not running" + if grep -Eq "litellm running( |$)" <<<"$out"; then + ok "188 litellm running" + elif grep -q "litellm not-provisioned" <<<"$out"; then + warn "188 litellm not provisioned; provider route/cost switch requires separate approval" + else + blocked "188 litellm not running" + fi grep -q "signoz-clickhouse running healthy" <<<"$out" && ok "188 SignOz ClickHouse healthy" || warn "188 SignOz ClickHouse health not confirmed" else blocked "188 high-load service check unavailable"