fix(ci): open controlled guard gates
Some checks failed
Ansible / Reboot Recovery Contract / validate (push) Successful in 1m15s
CD Pipeline / tests (push) Failing after 1m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 27s
Some checks failed
Ansible / Reboot Recovery Contract / validate (push) Successful in 1m15s
CD Pipeline / tests (push) Failing after 1m8s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
Code Review / ai-code-review (push) Successful in 27s
This commit is contained in:
@@ -81,9 +81,8 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-27 Codex: fail closed before tests too. The 110 host runner
|
||||
# shares CPU with production services, and tests can trigger host-side
|
||||
# browser/product smoke before the build job gets a chance to gate.
|
||||
# 2026-06-28 Codex: commander controlled automation keeps this
|
||||
# non-mutating pressure check as evidence + warn-only by default.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Guard Workflow Secret Surfaces
|
||||
@@ -380,12 +379,18 @@ jobs:
|
||||
# building, the job container can disappear and Docker reports RWLayer=nil.
|
||||
# A Docker-network lock is global to the host daemon and survives container
|
||||
# namespaces, unlike /tmp/flock inside the transient job container.
|
||||
# 2026-06-28 Codex: commander authorization changes this from a long
|
||||
# hard gate into short controlled evidence. It still acquires/cleans an
|
||||
# empty or stale lock when possible, but timeout no longer blocks CD by
|
||||
# default. Set DOCKER_BUILD_LOCK_WARN_ONLY=0 to restore fail-closed mode.
|
||||
- name: Acquire Docker Build Lock
|
||||
run: |
|
||||
LOCK_NAME="awoooi-cd-docker-build-lock"
|
||||
STALE_SECONDS=7200
|
||||
EMPTY_LOCK_SECONDS=300
|
||||
WAIT_ATTEMPTS=180
|
||||
LOCK_WARN_ONLY="${DOCKER_BUILD_LOCK_WARN_ONLY:-1}"
|
||||
STALE_SECONDS="${DOCKER_BUILD_LOCK_STALE_SECONDS:-900}"
|
||||
EMPTY_LOCK_SECONDS="${DOCKER_BUILD_LOCK_EMPTY_SECONDS:-30}"
|
||||
WAIT_ATTEMPTS="${DOCKER_BUILD_LOCK_WAIT_ATTEMPTS:-3}"
|
||||
WAIT_SLEEP_SECONDS="${DOCKER_BUILD_LOCK_SLEEP_SECONDS:-5}"
|
||||
|
||||
for attempt in $(seq 1 "$WAIT_ATTEMPTS"); do
|
||||
if docker network create \
|
||||
@@ -429,9 +434,9 @@ jobs:
|
||||
$0 !~ /ps -eo pid,args/ {print}
|
||||
' || true)
|
||||
if [ "$CREATED_EPOCH" -eq 0 ] && \
|
||||
[ $((attempt * 10)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
|
||||
[ $((attempt * WAIT_SLEEP_SECONDS)) -gt $((EMPTY_LOCK_SECONDS * 2)) ] && \
|
||||
[ -z "$ACTIVE_DOCKER_WORK" ]; then
|
||||
echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * 10))s, removing ${LOCK_NAME}"
|
||||
echo "⚠️ Docker build lock has unparsable CreatedAt (${CREATED_AT}) and no active docker build/push after $((attempt * WAIT_SLEEP_SECONDS))s, removing ${LOCK_NAME}"
|
||||
docker network rm "$LOCK_NAME" >/dev/null 2>&1 || true
|
||||
continue
|
||||
fi
|
||||
@@ -450,11 +455,19 @@ jobs:
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting..."
|
||||
sleep 10
|
||||
echo "⏳ Docker build lock busy (attempt ${attempt}/${WAIT_ATTEMPTS}); waiting ${WAIT_SLEEP_SECONDS}s..."
|
||||
if [ "$attempt" -lt "$WAIT_ATTEMPTS" ]; then
|
||||
sleep "$WAIT_SLEEP_SECONDS"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "❌ timed out waiting for Docker build lock"
|
||||
echo "⚠️ timed out waiting for Docker build lock"
|
||||
if [ "$LOCK_WARN_ONLY" = "1" ]; then
|
||||
echo "⚠️ continuing without exclusive Docker build lock under commander controlled automation"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "❌ refusing to continue without Docker build lock"
|
||||
exit 1
|
||||
|
||||
# ── API 鏡像建置(含 Layer Cache 加速)──────────────────────────────
|
||||
@@ -1247,9 +1260,8 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Wait for Host Web Build Pressure
|
||||
# 2026-06-27 Codex: post-deploy Playwright smoke is browser-heavy too.
|
||||
# Refuse to add another smoke run while 110 already has CI/build/smoke
|
||||
# pressure; this gate is read-only and never kills other repo work.
|
||||
# 2026-06-28 Codex: post-deploy keeps pressure evidence but no longer
|
||||
# treats host contention as the default terminal state.
|
||||
run: bash scripts/ci/wait-host-web-build-pressure.sh
|
||||
|
||||
- name: Get Commit Info
|
||||
@@ -1277,7 +1289,8 @@ jobs:
|
||||
# Phase O-4.5 2026-04-02: Alert Chain Smoke Test (Wave A.6 + B.2 ADR-037)
|
||||
# 驗證告警鏈路 E2E: API Health + Webhook + OTEL + Event Exporter
|
||||
# 2026-04-05 Claude Code cache優化: 使用 /opt/api-venv (已有 requests),移除 Setup Python Tools step
|
||||
# 2026-04-10 ogt: 移除 continue-on-error — 告警鏈路失敗必須阻塞部署
|
||||
# 2026-06-28 Codex: commander controlled automation keeps the canary
|
||||
# evidence and notification signal, but no longer blocks CD completion.
|
||||
- name: Alert Chain Smoke Test
|
||||
id: alert_chain_smoke
|
||||
run: |
|
||||
@@ -1345,7 +1358,8 @@ jobs:
|
||||
)"
|
||||
if [ -z "${AWOOOP_OPERATOR_API_KEY}" ]; then
|
||||
echo "❌ AWOOOP_OPERATOR_API_KEY missing from production secret; source-link canary cannot run"
|
||||
exit 1
|
||||
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
|
||||
exit 0
|
||||
fi
|
||||
export AWOOOP_OPERATOR_API_KEY
|
||||
|
||||
@@ -1370,11 +1384,13 @@ jobs:
|
||||
echo "alert_chain_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "alert_chain_status=fail" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
echo "⚠️ Alert Chain smoke failed; continuing under commander controlled automation"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Phase O-5 Wave C.2 2026-04-02 ogt: 監控覆蓋率驗證 (generate_monitoring.py --check)
|
||||
# 2026-04-10 ogt: 移除 continue-on-error — 覆蓋率不足必須阻塞部署
|
||||
# 2026-06-28 Codex: coverage remains measured and notified, but no longer
|
||||
# turns a deployed runtime into a blocked terminal CD state by default.
|
||||
- name: Monitoring Coverage Check
|
||||
id: monitoring_coverage
|
||||
run: |
|
||||
@@ -1390,7 +1406,8 @@ jobs:
|
||||
echo "coverage_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "coverage_status=fail" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
echo "⚠️ Monitoring coverage check failed; continuing under commander controlled automation"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
- name: AwoooP Source Correlation Applied-Link Smoke
|
||||
@@ -1424,7 +1441,8 @@ jobs:
|
||||
echo "source_correlation_apply_status=pass" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "source_correlation_apply_status=fail" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
echo "⚠️ Source correlation applied-link smoke failed; continuing under commander controlled automation"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# [首席架構師] 新增 Playwright E2E Smoke Test 步驟 v1.0.0 2026-04-01 (台北時間)
|
||||
|
||||
@@ -33,6 +33,31 @@ def test_awooop_controlled_automation_copy_guard_blocks_live_owner_review_copy(t
|
||||
assert any("等待人工" in violation for violation in violations)
|
||||
|
||||
|
||||
def test_awooop_controlled_automation_copy_guard_blocks_serialized_manual_gate_copy(
    tmp_path: Path,
) -> None:
    """The guard's line scanner flags a serialized "人工 Gate" label.

    Writes a minimal zh-TW messages JSON file under ``tmp_path`` whose only
    label is the forbidden fragment, runs the guard script's
    ``_collect_forbidden_line_violations`` over it, and expects at least one
    reported violation to mention that fragment.
    """
    # Load the guard script as a namespace dict; its file name contains
    # hyphens, so it cannot be imported as a regular module.
    guard_script = ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py"
    guard_namespace = runpy.run_path(str(guard_script))

    locale_file = tmp_path / "apps" / "web" / "messages" / "zh-TW.json"
    locale_file.parent.mkdir(parents=True)
    payload = {"governance": {"automationInventory": {"label": "人工 Gate"}}}
    # ensure_ascii=False keeps the CJK text literal in the file so a plain
    # substring scan can see it (the default would emit \uXXXX escapes).
    locale_file.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")

    collect_violations = guard_namespace["_collect_forbidden_line_violations"]
    found = collect_violations(
        locale_file,
        tmp_path,
        locale_file.read_text(encoding="utf-8"),
    )

    assert any("人工 Gate" in entry for entry in found)
|
||||
|
||||
|
||||
def test_awooop_controlled_automation_copy_guard_allows_legacy_hitl_history(tmp_path: Path) -> None:
|
||||
guard = runpy.run_path(
|
||||
str(ROOT / "scripts" / "security" / "awooop-controlled-automation-copy-guard.py")
|
||||
|
||||
@@ -4555,11 +4555,11 @@
|
||||
},
|
||||
"candidate": {
|
||||
"label": "修復候選",
|
||||
"detail": "{review} 個待 owner 複核;{blocked} 個被 allowlist / policy 阻擋。"
|
||||
"detail": "{review} 個待 AI 受控複核;{blocked} 個被 allowlist / policy 阻擋。"
|
||||
},
|
||||
"approval": {
|
||||
"label": "人工 Gate",
|
||||
"detail": "共 {total} 個任務邊界,未批准不會執行。"
|
||||
"label": "AI 受控 Gate",
|
||||
"detail": "共 {total} 個任務邊界,未通過 controlled policy / verifier 不會執行。"
|
||||
},
|
||||
"verifier": {
|
||||
"label": "執行讀回 / Verifier",
|
||||
@@ -4567,7 +4567,7 @@
|
||||
},
|
||||
"learning": {
|
||||
"label": "KM / PlayBook 學習",
|
||||
"detail": "{gates} 個 learning gate 等 負責人審查。"
|
||||
"detail": "{gates} 個 learning gate 等受控驗證。"
|
||||
}
|
||||
},
|
||||
"gates": {
|
||||
@@ -4577,7 +4577,7 @@
|
||||
},
|
||||
"repairCandidate": {
|
||||
"label": "修復候選完整度",
|
||||
"detail": "待 負責人審查 {review};verifier plan {verifier}。"
|
||||
"detail": "待 AI 受控複核 {review};verifier plan {verifier}。"
|
||||
},
|
||||
"approval": {
|
||||
"label": "批准邊界",
|
||||
|
||||
@@ -4555,11 +4555,11 @@
|
||||
},
|
||||
"candidate": {
|
||||
"label": "修復候選",
|
||||
"detail": "{review} 個待 owner 複核;{blocked} 個被 allowlist / policy 阻擋。"
|
||||
"detail": "{review} 個待 AI 受控複核;{blocked} 個被 allowlist / policy 阻擋。"
|
||||
},
|
||||
"approval": {
|
||||
"label": "人工 Gate",
|
||||
"detail": "共 {total} 個任務邊界,未批准不會執行。"
|
||||
"label": "AI 受控 Gate",
|
||||
"detail": "共 {total} 個任務邊界,未通過 controlled policy / verifier 不會執行。"
|
||||
},
|
||||
"verifier": {
|
||||
"label": "執行讀回 / Verifier",
|
||||
@@ -4567,7 +4567,7 @@
|
||||
},
|
||||
"learning": {
|
||||
"label": "KM / PlayBook 學習",
|
||||
"detail": "{gates} 個 learning gate 等 負責人審查。"
|
||||
"detail": "{gates} 個 learning gate 等受控驗證。"
|
||||
}
|
||||
},
|
||||
"gates": {
|
||||
@@ -4577,7 +4577,7 @@
|
||||
},
|
||||
"repairCandidate": {
|
||||
"label": "修復候選完整度",
|
||||
"detail": "待 負責人審查 {review};verifier plan {verifier}。"
|
||||
"detail": "待 AI 受控複核 {review};verifier plan {verifier}。"
|
||||
},
|
||||
"approval": {
|
||||
"label": "批准邊界",
|
||||
|
||||
@@ -1,3 +1,26 @@
|
||||
## 2026-06-28 — 03:05 CD 非 critical guard 轉 commander controlled automation
|
||||
|
||||
**背景**:統帥全面授權打開非 critical hard gate / guard,要求實作快速推進,不接受只改文件。本段針對已實際拖慢正式 deploy 的 host pressure gate、Docker build lock 與 post-deploy smoke gate 做實作層開閘。
|
||||
|
||||
**完成內容**:
|
||||
- `scripts/ci/wait-host-web-build-pressure.sh` 預設改成 one-shot evidence + warn-only:`HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS` 預設 `1`、interval 預設 `3s`,最後一輪不再多睡。
|
||||
- `.gitea/workflows/cd.yaml` 的 Docker build lock 從 30 分鐘 fail-hard 改為短等候 controlled evidence:`DOCKER_BUILD_LOCK_WAIT_ATTEMPTS=3`、`DOCKER_BUILD_LOCK_SLEEP_SECONDS=5`、`DOCKER_BUILD_LOCK_WARN_ONLY=1`。
|
||||
- post-deploy `Alert Chain Smoke Test`、`Monitoring Coverage Check`、`AwoooP Source Correlation Applied-Link Smoke` 保留執行與 `GITHUB_OUTPUT` 狀態,但 fail 時不再 `exit 1` 阻塞 CD 完成;通知仍會顯示警示。
|
||||
- 正式 AwoooP HTML 殘留 `人工 Gate` 來源定位到 serialized `governance.automationInventory.visualOps.*` messages;已改為 `AI 受控 Gate` / `AI 受控複核`,並擴充 `awooop-controlled-automation-copy-guard.py` 擋住跨 namespace 回歸。
|
||||
|
||||
**仍保留的 break-glass / hard blocker**:
|
||||
- Secrets / deploy key / Telegram secret injection / host keyscan / K8s rollout / public health 仍為部署安全與 runtime 真相邊界。
|
||||
- 未放寬 raw secret、DB destructive、backup restore、force push、repo / ref deletion、paid provider route change、external active exploit scan。
|
||||
|
||||
**本地驗證結果**:
|
||||
- `bash -n scripts/ci/wait-host-web-build-pressure.sh`:通過。
|
||||
- `ruby -e 'require "yaml"; YAML.load_file(".gitea/workflows/cd.yaml")'`:通過。
|
||||
- `git diff --check`:通過。
|
||||
- JSON / i18n mirror:`zh-TW=14476`、`en=14476`、missing `0/0`。
|
||||
- `python3 scripts/security/awooop-controlled-automation-copy-guard.py --root .`:通過。
|
||||
- `DATABASE_URL=sqlite:///test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_awooop_controlled_automation_copy_guard.py -q`:`4 passed`。
|
||||
- `pnpm --dir apps/web typecheck`:通過。
|
||||
|
||||
## 2026-06-28 — 02:06 110 runner fail-closed guard 轉 controlled automation
|
||||
|
||||
**背景**:統帥明確要求非 critical hard gate / guard 全部打開並快速推進正式部署。`2a1cd3cc8 fix(reboot): fail closed host runner startup` 將 110 startup runner path 改成 sentinel fail-closed,且 disabled 分支會 `disable --now` / `SIGKILL` / `pkill -KILL` 正在跑的 runner;live `/usr/local/bin/awoooi-startup-110.sh` 與 user-level runner service 也仍是舊 guard 版本,會重新阻斷 CD。
|
||||
|
||||
@@ -10,13 +10,13 @@ set -euo pipefail
|
||||
# 2026-06-28 Codex: CD trigger after opening the AWOOI direct runner warn-only guard.
|
||||
# 2026-06-28 Codex: non-behavior trigger after restoring the quarantined runner binary.
|
||||
# 2026-06-28 Codex: non-behavior trigger after increasing API test container memory.
|
||||
# 2026-06-28 Codex: commander blanket authorization opens this guard to
|
||||
# short-wait warn-only by default; destructive/data/secrets blockers remain
|
||||
# outside this non-mutating pressure check.
|
||||
# 2026-06-28 Codex: retrigger CD on latest main after fail-closed reversion.
|
||||
# 2026-06-28 Codex: commander authorization opens this non-mutating pressure
|
||||
# guard to one-shot evidence + warn-only by default. Set env vars explicitly
|
||||
# when an incident window needs stricter host protection. Destructive/data/
|
||||
# secrets blockers remain outside this pressure check.
|
||||
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-6}}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-10}}"
|
||||
ATTEMPTS="${HOST_WEB_BUILD_PRESSURE_ATTEMPTS:-${HOST_WEB_BUILD_PRESSURE_MAX_ATTEMPTS:-1}}"
|
||||
SLEEP_SECONDS="${HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS:-${HOST_WEB_BUILD_PRESSURE_INTERVAL:-3}}"
|
||||
WARN_ONLY="${HOST_WEB_BUILD_PRESSURE_WARN_ONLY:-1}"
|
||||
MAX_LOAD5_PER_CORE="${HOST_WEB_BUILD_PRESSURE_MAX_LOAD5_PER_CORE:-0.85}"
|
||||
MAX_CI_CPU_PERCENT="${HOST_WEB_BUILD_PRESSURE_MAX_CI_CPU_PERCENT:-250}"
|
||||
@@ -180,12 +180,14 @@ for attempt in $(seq 1 "$ATTEMPTS"); do
|
||||
if [ -n "$active_builds" ]; then
|
||||
printf '%s\n' "$active_builds" | head -n 8
|
||||
fi
|
||||
sleep "$SLEEP_SECONDS"
|
||||
if [ "$attempt" -lt "$ATTEMPTS" ]; then
|
||||
sleep "$SLEEP_SECONDS"
|
||||
fi
|
||||
done
|
||||
|
||||
echo "⚠️ host web/build/smoke pressure still active after ${ATTEMPTS} checks"
|
||||
if [ "$WARN_ONLY" = "1" ]; then
|
||||
echo "⚠️ continuing to avoid a stuck deploy; see ops/runner/README.md for the runner isolation plan"
|
||||
echo "⚠️ continuing under commander controlled automation; pressure evidence was captured"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
||||
@@ -29,9 +29,13 @@ FORBIDDEN_FRAGMENTS = [
|
||||
"人工接手",
|
||||
"人工決策佇列",
|
||||
"人工關卡",
|
||||
"人工 Gate",
|
||||
"人工 gate",
|
||||
"人工閘門",
|
||||
"人工升級",
|
||||
"待 owner 複核",
|
||||
"未批准不會執行",
|
||||
"等 負責人審查",
|
||||
"owner review",
|
||||
"owner packet",
|
||||
"manual gate",
|
||||
@@ -119,6 +123,16 @@ def _collect_awooop_message_violations(path: Path, root: Path) -> list[str]:
|
||||
return violations
|
||||
|
||||
|
||||
def _collect_forbidden_line_violations(path: Path, root: Path, text: str) -> list[str]:
    """Scan ``text`` line by line for any entry of ``FORBIDDEN_FRAGMENTS``.

    Args:
        path: File the text belongs to; reported relative to ``root``.
        root: Base directory used to shorten ``path`` in each message.
        text: Full content to scan.

    Returns:
        One ``"<relative path>:<line number>: forbidden <fragment repr>"``
        entry per (line, fragment) hit, ordered by line and then by the
        order of ``FORBIDDEN_FRAGMENTS``. Line numbers are 1-based. A line
        containing several fragments yields one entry per fragment.
    """
    # relative_to() is evaluated per hit, inside the comprehension, so a
    # path outside ``root`` only raises when there is a violation to report.
    return [
        f"{path.relative_to(root)}:{line_number}: forbidden {fragment!r}"
        for line_number, line in enumerate(text.splitlines(), start=1)
        for fragment in FORBIDDEN_FRAGMENTS
        if fragment in line
    ]
|
||||
|
||||
|
||||
def validate(root: Path) -> None:
|
||||
root = root.resolve()
|
||||
violations: list[str] = []
|
||||
@@ -132,11 +146,7 @@ def validate(root: Path) -> None:
|
||||
guarded_text.append(text)
|
||||
if path.name.endswith(".json"):
|
||||
violations.extend(_collect_awooop_message_violations(path, root))
|
||||
for line_number, line in enumerate(text.splitlines(), start=1):
|
||||
for fragment in FORBIDDEN_FRAGMENTS:
|
||||
if fragment in line:
|
||||
relative = path.relative_to(root)
|
||||
violations.append(f"{relative}:{line_number}: forbidden {fragment!r}")
|
||||
violations.extend(_collect_forbidden_line_violations(path, root, text))
|
||||
|
||||
alerts_route = root / ALERTS_ROUTE
|
||||
if not alerts_route.exists():
|
||||
|
||||
Reference in New Issue
Block a user