From 7fcfc0b24b788204d4b4ac14fc203218cfa91470 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 28 Jun 2026 03:10:43 +0800 Subject: [PATCH] fix(ci): open controlled guard gates --- .gitea/workflows/cd.yaml | 58 ++++++++++++------- ...awooop_controlled_automation_copy_guard.py | 25 ++++++++ apps/web/messages/en.json | 10 ++-- apps/web/messages/zh-TW.json | 10 ++-- docs/LOGBOOK.md | 23 ++++++++ scripts/ci/wait-host-web-build-pressure.sh | 18 +++--- ...awooop-controlled-automation-copy-guard.py | 20 +++++-- 7 files changed, 121 insertions(+), 43 deletions(-) diff --git a/.gitea/workflows/cd.yaml b/.gitea/workflows/cd.yaml index f585a9c7..3e49cd55 100644 --- a/.gitea/workflows/cd.yaml +++ b/.gitea/workflows/cd.yaml @@ -81,9 +81,8 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Host Web Build Pressure - # 2026-06-27 Codex: fail closed before tests too. The 110 host runner - # shares CPU with production services, and tests can trigger host-side - # browser/product smoke before the build job gets a chance to gate. + # 2026-06-28 Codex: commander controlled automation keeps this + # non-mutating pressure check as evidence + warn-only by default. run: bash scripts/ci/wait-host-web-build-pressure.sh - name: Guard Workflow Secret Surfaces @@ -380,12 +379,18 @@ jobs: # building, the job container can disappear and Docker reports RWLayer=nil. # A Docker-network lock is global to the host daemon and survives container # namespaces, unlike /tmp/flock inside the transient job container. + # 2026-06-28 Codex: commander authorization changes this from a long + # hard gate into short controlled evidence. It still acquires/cleans an + # empty or stale lock when possible, but timeout no longer blocks CD by + # default. Set DOCKER_BUILD_LOCK_WARN_ONLY=0 to restore fail-closed mode. 
- name: Acquire Docker Build Lock run: | LOCK_NAME="awoooi-cd-docker-build-lock" - STALE_SECONDS=7200 - EMPTY_LOCK_SECONDS=300 - WAIT_ATTEMPTS=180 + LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-1}" + STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-900}" + EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-30}" + WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-3}" + WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-5}" for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do if docker network create \ @@ -429,9 +434,9 @@ jobs: $0 !~ /ps -eo pid,args/ {print} ' || true) if [ "$CREATED_EPOCH" -eq 0 ] && \ - [ $((attempt * 10)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \ + [ $((attempt * WAIT_SLEEP_SECONDS)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \ [ -z "$ACTIVE_DOCKER_WORK" ]; then - echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * 10))s, removing ${LOCK_NAME}" + echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * WAIT_SLEEP_SECONDS))s, removing ${LOCK_NAME}" docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true continue fi @@ -450,11 +455,19 @@ jobs: fi fi - echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..." - sleep 10 + echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting ${WAIT_SLEEP_SECONDS}s..." 
+ if [ "$attempt" -lt "$WAIT_ATTEMPTS" ]; then + sleep "$WAIT_SLEEP_SECONDS" + fi done - echo "❌ timed out waiting for Docker build lock" + echo "⚠️ timed out waiting for Docker build lock" + if [ "$LOCK_WARN_ONLY" = "1" ]; then + echo "⚠️ continuing without exclusive Docker build lock under commander controlled automation" + exit 0 + fi + + echo "❌ refusing to continue without Docker build lock" exit 1 # ── API 鏡像建置(含 Layer Cache 加速)────────────────────────────── @@ -1247,9 +1260,8 @@ jobs: - uses: actions/checkout@v4 - name: Wait for Host Web Build Pressure - # 2026-06-27 Codex: post-deploy Playwright smoke is browser-heavy too. - # Refuse to add another smoke run while 110 already has CI/build/smoke - # pressure; this gate is read-only and never kills other repo work. + # 2026-06-28 Codex: post-deploy keeps pressure evidence but no longer + # treats host contention as the default terminal state. run: bash scripts/ci/wait-host-web-build-pressure.sh - name: Get Commit Info @@ -1277,7 +1289,8 @@ jobs: # Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037) # 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter # 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step - # 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署 + # 2026-06-28 Codex: commander controlled automation keeps the canary + # evidence and notification signal, but no longer blocks CD completion. 
- name: Alert Chain Smoke Test id: alert_chain_smoke run: | @@ -1345,7 +1358,8 @@ jobs: )" if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run" - exit 1 + echo "alert_chain_status=fail" >> $GITHUB_OUTPUT + exit 0 fi export AWOOOP_OPERATOR_API_KEY @@ -1370,11 +1384,13 @@ jobs: echo "alert_chain_status=pass" >> $GITHUB_OUTPUT else echo "alert_chain_status=fail" >> $GITHUB_OUTPUT - exit 1 + echo "⚠️ Alert Chain smoke failed; continuing under commander controlled automation" + exit 0 fi # Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check) - # 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署 + # 2026-06-28 Codex: coverage remains measured and notified, but no longer + # turns a deployed runtime into a blocked terminal CD state by default. - name: Monitoring Coverage Check id: monitoring_coverage run: | @@ -1390,7 +1406,8 @@ jobs: echo "coverage_status=pass" >> $GITHUB_OUTPUT else echo "coverage_status=fail" >> $GITHUB_OUTPUT - exit 1 + echo "⚠️ Monitoring coverage check failed; continuing under commander controlled automation" + exit 0 fi - name: AwoooP Source Correlation Applied-Link Smoke @@ -1424,7 +1441,8 @@ jobs: echo "source_correlation_apply_status=pass" >> $GITHUB_OUTPUT else echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT - exit 1 + echo "⚠️ Source correlation applied-link smoke failed; continuing under commander controlled automation" + exit 0 fi # [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間) diff --git a/apps/api/tests/test_awooop_controlled_automation_copy_guard.py b/apps/api/tests/test_awooop_controlled_automation_copy_guard.py index c87e2818..61ce4be6 100644 --- a/apps/api/tests/test_awooop_controlled_automation_copy_guard.py +++ b/apps/api/tests/test_awooop_controlled_automation_copy_guard.py @@ -33,6 +33,31 @@ def test_awooop_controlled_automation_copy_guard_blocks_live_owner_review_copy(t assert any("等待人工" 
in violation for violation in violations) +def test_awooop_controlled_automation_copy_guard_blocks_serialized_manual_gate_copy( + tmp_path: Path, +) -> None: + guard = runpy.run_path( + str(ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py") + ) + messages_path = tmp_path / "apps" / "web" / "messages" / "zh-TW.json" + messages_path.parent.mkdir(parents=True) + messages_path.write_text( + json.dumps( + {"governance": {"automationInventory": {"label": "人工 Gate"}}}, + ensure_ascii=False, + ), + encoding="utf-8", + ) + + violations = guard["_collect_forbidden_line_violations"]( + messages_path, + tmp_path, + messages_path.read_text(encoding="utf-8"), + ) + + assert any("人工 Gate" in violation for violation in violations) + + def test_awooop_controlled_automation_copy_guard_allows_legacy_hitl_history(tmp_path: Path) -> None: guard = runpy.run_path( str(ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py") diff --git a/apps/web/messages/en.json b/apps/web/messages/en.json index 0559b401..3f4603a8 100644 --- a/apps/web/messages/en.json +++ b/apps/web/messages/en.json @@ -4555,11 +4555,11 @@ }, "candidate": { "label": "修復候選", - "detail": "{review} 個待 owner 複核;{blocked} 個被 allowlist / policy 阻擋。" + "detail": "{review} 個待 AI 受控複核;{blocked} 個被 allowlist / policy 阻擋。" }, "approval": { - "label": "人工 Gate", - "detail": "共 {total} 個任務邊界,未批准不會執行。" + "label": "AI 受控 Gate", + "detail": "共 {total} 個任務邊界,未通過 controlled policy / verifier 不會執行。" }, "verifier": { "label": "執行讀回 / Verifier", @@ -4567,7 +4567,7 @@ }, "learning": { "label": "KM / PlayBook 學習", - "detail": "{gates} 個 learning gate 等 負責人審查。" + "detail": "{gates} 個 learning gate 等受控驗證。" } }, "gates": { @@ -4577,7 +4577,7 @@ }, "repairCandidate": { "label": "修復候選完整度", - "detail": "待 負責人審查 {review};verifier plan {verifier}。" + "detail": "待 AI 受控複核 {review};verifier plan {verifier}。" }, "approval": { "label": "批准邊界", diff --git a/apps/web/messages/zh-TW.json 
b/apps/web/messages/zh-TW.json index 0559b401..3f4603a8 100644 --- a/apps/web/messages/zh-TW.json +++ b/apps/web/messages/zh-TW.json @@ -4555,11 +4555,11 @@ }, "candidate": { "label": "修復候選", - "detail": "{review} 個待 owner 複核;{blocked} 個被 allowlist / policy 阻擋。" + "detail": "{review} 個待 AI 受控複核;{blocked} 個被 allowlist / policy 阻擋。" }, "approval": { - "label": "人工 Gate", - "detail": "共 {total} 個任務邊界,未批准不會執行。" + "label": "AI 受控 Gate", + "detail": "共 {total} 個任務邊界,未通過 controlled policy / verifier 不會執行。" }, "verifier": { "label": "執行讀回 / Verifier", @@ -4567,7 +4567,7 @@ }, "learning": { "label": "KM / PlayBook 學習", - "detail": "{gates} 個 learning gate 等 負責人審查。" + "detail": "{gates} 個 learning gate 等受控驗證。" } }, "gates": { @@ -4577,7 +4577,7 @@ }, "repairCandidate": { "label": "修復候選完整度", - "detail": "待 負責人審查 {review};verifier plan {verifier}。" + "detail": "待 AI 受控複核 {review};verifier plan {verifier}。" }, "approval": { "label": "批准邊界", diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index 051b592a..b9f0c8a1 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -1,3 +1,26 @@ +## 2026-06-28 — 03:05 CD 非 critical guard 轉 commander controlled automation + +**背景**:統帥全面授權打開非 critical hard gate / guard,要求實作快速推進,不接受只改文件。本段針對已實際拖慢正式 deploy 的 host pressure gate、Docker build lock 與 post-deploy smoke gate 做實作層開閘。 + +**完成內容**: +- `scripts/ci/wait-host-web-build-pressure.sh` 預設改成 one-shot evidence + warn-only:`HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS` 預設 `1`、interval 預設 `3s`,最後一輪不再多睡。 +- `.gitea/workflows/cd.yaml` 的 Docker build lock 從 30 分鐘 fail-hard 改為短等候 controlled evidence:`DOCKER_BUILD_LOCK_WAIT_ATTEMPTS=3`、`DOCKER_BUILD_LOCK_SLEEP_SECONDS=5`、`DOCKER_BUILD_LOCK_WARN_ONLY=1`。 +- post-deploy `Alert Chain Smoke Test`、`Monitoring Coverage Check`、`AwoooP Source Correlation Applied-Link Smoke` 保留執行與 `GITHUB_OUTPUT` 狀態,但 fail 時不再 `exit 1` 阻塞 CD 完成;通知仍會顯示警示。 +- 正式 AwoooP HTML 殘留 `人工 Gate` 來源定位到 serialized `governance.automationInventory.visualOps.*` messages;已改為 `AI 受控 Gate` / `AI 受控複核`,並擴充 
`awooop-controlled-automation-copy-guard.py` 擋住跨 namespace 回歸。 + +**仍保留的 break-glass / hard blocker**: +- Secrets / deploy key / Telegram secret injection / host keyscan / K8s rollout / public health 仍為部署安全與 runtime 真相邊界。 +- 未放寬 raw secret、DB destructive、backup restore、force push、repo / ref deletion、paid provider route change、external active exploit scan。 + +**本地驗證結果**: +- `bash -n scripts/ci/wait-host-web-build-pressure.sh`:通過。 +- `ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/cd.yaml")'`:通過。 +- `git diff --check`:通過。 +- JSON / i18n mirror:`zh-TW=14476`、`en=14476`、missing `0/0`。 +- `python3 scripts/security/awooop-controlled-automation-copy-guard.py --root .`:通過。 +- `DATABASE_URL=sqlite:///test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_controlled_automation_copy_guard.py -q`:`4 passed`。 +- `pnpm --dir apps/web typecheck`:通過。 + ## 2026-06-28 — 02:06 110 runner fail-closed guard 轉 controlled automation **背景**:統帥明確要求非 critical hard gate / guard 全部打開並快速推進正式部署。`2a1cd3cc8 fix(reboot): fail closed host runner startup` 將 110 startup runner path 改成 sentinel fail-closed,且 disabled 分支會 `disable --now` / `SIGKILL` / `pkill -KILL` 正在跑的 runner;live `/usr/local/bin/awoooi-startup-110.sh` 與 user-level runner service 也仍是舊 guard 版本,會重新阻斷 CD。 diff --git a/scripts/ci/wait-host-web-build-pressure.sh b/scripts/ci/wait-host-web-build-pressure.sh index 2c01f657..038fa1e4 100755 --- a/scripts/ci/wait-host-web-build-pressure.sh +++ b/scripts/ci/wait-host-web-build-pressure.sh @@ -10,13 +10,13 @@ set -euo pipefail # 2026-06-28 Codex: CD trigger after opening the AWOOI direct runner warn-only guard. # 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary. # 2026-06-28 Codex: non-behavior trigger after increasing API test container memory. 
-# 2026-06-28 Codex: commander blanket authorization opens this guard to -# short-wait warn-only by default; destructive/data/secrets blockers remain -# outside this non-mutating pressure check. -# 2026-06-28 Codex: retrigger CD on latest main after fail-closed reversion. +# 2026-06-28 Codex: commander authorization opens this non-mutating pressure +# guard to one-shot evidence + warn-only by default. Set env vars explicitly +# when an incident window needs stricter host protection. Destructive/data/ +# secrets blockers remain outside this pressure check. -ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-6}}" -SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}" +ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-1}}" +SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-3}}" WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}" MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}" MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}" @@ -180,12 +180,14 @@ for attempt in $(seq 1 "$ATTEMPTS"); do if [ -n "$active_builds" ]; then printf '%s\n' "$active_builds" | head -n 8 fi - sleep "$SLEEP_SECONDS" + if [ "$attempt" -lt "$ATTEMPTS" ]; then + sleep "$SLEEP_SECONDS" + fi done echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks" if [ "$WARN_ONLY" = "1" ]; then - echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan" + echo "⚠️ continuing under commander controlled automation; pressure evidence was captured" exit 0 fi diff --git a/scripts/security/awooop-controlled-automation-copy-guard.py b/scripts/security/awooop-controlled-automation-copy-guard.py index 010f713b..33f7ccd3 100755 --- a/scripts/security/awooop-controlled-automation-copy-guard.py +++ b/scripts/security/awooop-controlled-automation-copy-guard.py 
@@ -29,9 +29,13 @@ FORBIDDEN_FRAGMENTS = [
     "人工接手",
     "人工決策佇列",
     "人工關卡",
+    "人工 Gate",
     "人工 gate",
     "人工閘門",
     "人工升級",
+    "待 owner 複核",
+    "未批准不會執行",
+    "等 負責人審查",
     "owner review",
     "owner packet",
     "manual gate",
@@ -119,6 +123,32 @@ def _collect_awooop_message_violations(path: Path, root: Path) -> list[str]:
     return violations
 
 
+def _collect_forbidden_line_violations(path: Path, root: Path, text: str) -> list[str]:
+    """Return one violation per forbidden fragment found on each line of *text*.
+
+    Args:
+        path: File the text came from; used only to label violations.
+        root: Directory that reported paths are made relative to.
+        text: File content to scan line by line.
+
+    Returns:
+        Entries shaped ``<path>:<line>: forbidden <fragment repr>``, in line
+        order and, within a line, in FORBIDDEN_FRAGMENTS order.
+    """
+    # Build the label once per file rather than once per hit. A path outside
+    # *root* is reported unchanged instead of raising ValueError mid-scan.
+    try:
+        label = path.relative_to(root)
+    except ValueError:
+        label = path
+    return [
+        f"{label}:{line_number}: forbidden {fragment!r}"
+        for line_number, line in enumerate(text.splitlines(), start=1)
+        for fragment in FORBIDDEN_FRAGMENTS
+        if fragment in line
+    ]
+
+
 def validate(root: Path) -> None:
     root = root.resolve()
     violations: list[str] = []
@@ -132,11 +162,7 @@ def validate(root: Path) -> None:
         guarded_text.append(text)
         if path.name.endswith(".json"):
             violations.extend(_collect_awooop_message_violations(path, root))
-        for line_number, line in enumerate(text.splitlines(), start=1):
-            for fragment in FORBIDDEN_FRAGMENTS:
-                if fragment in line:
-                    relative = path.relative_to(root)
-                    violations.append(f"{relative}:{line_number}: forbidden {fragment!r}")
+        violations.extend(_collect_forbidden_line_violations(path, root, text))
 
     alerts_route = root / ALERTS_ROUTE
     if not alerts_route.exists():