fix(recovery): freeze 110 cd lane and source-aware 188 gates [skip ci]
This commit is contained in:
@@ -104,8 +104,8 @@ container_running() {
|
||||
|
||||
run_pg_dump() {
|
||||
docker exec "${DB_CONTAINER}" sh -eu -c '
|
||||
: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
|
||||
PGPASSWORD="${POSTGRES_PASSWORD}" exec pg_dump \
|
||||
exec pg_dump \
|
||||
-h 127.0.0.1 \
|
||||
-U "${POSTGRES_USER:-momo}" \
|
||||
-d "${POSTGRES_DB:-momo_analytics}" \
|
||||
--no-password \
|
||||
@@ -124,8 +124,8 @@ insert_backup_log() {
|
||||
-e BACKUP_HOST="$(hostname)" \
|
||||
-e BACKUP_STORAGE_PATH="${FILEPATH}" \
|
||||
"${DB_CONTAINER}" sh -eu -c '
|
||||
: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
|
||||
PGPASSWORD="${POSTGRES_PASSWORD}" psql \
|
||||
psql \
|
||||
-h 127.0.0.1 \
|
||||
-U "${POSTGRES_USER:-momo}" \
|
||||
-d "${POSTGRES_DB:-momo_analytics}" \
|
||||
--no-password \
|
||||
|
||||
@@ -192,20 +192,13 @@ log "[6/6] 檢查 Gitea Act Runner(預設不自動啟動)..."
|
||||
RUNNER_DIR="/home/wooo/act-runner"
|
||||
RUNNER_SERVICE="gitea-act-runner-host.service"
|
||||
RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
|
||||
CD_LANE_DIR="/home/wooo/awoooi-cd-lane"
|
||||
CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain"
|
||||
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
|
||||
START_GITEA_RUNNER_ALLOWED=0
|
||||
CD_LANE_DIR="/home/wooo/awoooi-cd-lane"
|
||||
CD_LANE_SERVICE="awoooi-cd-lane.service"
|
||||
CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane"
|
||||
CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml"
|
||||
CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain"
|
||||
CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service"
|
||||
CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
|
||||
CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml"
|
||||
CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled"
|
||||
START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}"
|
||||
START_CD_LANE_ALLOWED=0
|
||||
RUNNER_FAIL_CLOSED_SERVICES=(
|
||||
"awoooi-cd-lane.service"
|
||||
"awoooi-cd-lane-drain.service"
|
||||
"awoooi-direct-runner-open.service"
|
||||
"awoooi-direct-runner.service"
|
||||
"gitea-act-runner-host.service"
|
||||
@@ -214,19 +207,18 @@ RUNNER_FAIL_CLOSED_SERVICES=(
|
||||
"gitea-act-runner-awoooi-open.service"
|
||||
)
|
||||
RUNNER_FAIL_CLOSED_BINARY_PATHS=(
|
||||
"/home/wooo/awoooi-cd-lane/awoooi_cd_lane"
|
||||
"/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled"
|
||||
"/home/wooo/act-runner/act_runner"
|
||||
"/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
|
||||
"/home/wooo/act-runner-controlled/act_runner"
|
||||
"/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
|
||||
)
|
||||
# Legacy host runner still needs both keys. The dedicated cd-lane has its own
|
||||
# sentinel and narrow label/capacity verifier below.
|
||||
# Host runner still needs both keys. The direct cd-lane stays fail-closed until
|
||||
# it is migrated or hard-limited outside this production host pressure lane.
|
||||
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then
|
||||
START_GITEA_RUNNER_ALLOWED=1
|
||||
fi
|
||||
if [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; then
|
||||
START_CD_LANE_ALLOWED=1
|
||||
fi
|
||||
|
||||
mask_runner_unit_file() {
|
||||
local unit="$1"
|
||||
@@ -279,143 +271,71 @@ EOF
|
||||
|
||||
install_cd_lane_fail_closed_unit() {
|
||||
local unit_file="/etc/systemd/system/awoooi-cd-lane.service"
|
||||
local tmp
|
||||
local quarantine_stamp
|
||||
quarantine_stamp="$(date +%Y%m%d%H%M%S)"
|
||||
|
||||
systemctl mask awoooi-cd-lane.service >/dev/null 2>&1 || true
|
||||
if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
|
||||
chattr -i "$unit_file" >/dev/null 2>&1 || true
|
||||
if ! grep -q "AWOOOI direct CD lane fail-closed" "$unit_file" 2>/dev/null; then
|
||||
if ! { [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; }; then
|
||||
mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
fi
|
||||
tmp="$(mktemp)"
|
||||
cat >"$tmp" <<'EOF'
|
||||
[Unit]
|
||||
Description=AWOOOI direct CD lane fail-closed after 2026-06-28 pressure incident
|
||||
ConditionPathExists=/run/awoooi-cd-lane-enabled
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/bin/false
|
||||
EOF
|
||||
install -o root -g root -m 0444 "$tmp" "$unit_file" >/dev/null 2>&1 || true
|
||||
rm -f "$tmp"
|
||||
chattr +i "$unit_file" >/dev/null 2>&1 || true
|
||||
ln -sfn /dev/null "$unit_file" >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
#######################################
# Check that a runner config file matches the controlled cd-lane profile.
# Arguments: $1 - path to the runner config.yaml
# Returns:   0 when the file exists, declares capacity 1, lists both required
#            labels, and lists none of the forbidden labels; 1 otherwise.
#######################################
cd_lane_config_path_is_controlled() {
  local config_path="${1:-}"

  [ -f "$config_path" ] || return 1

  # Capacity must be exactly 1 (an indented "capacity: 1" line).
  grep -Eq -- '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1

  # Required labels are literals: match them as fixed strings so the dots in
  # the registry address cannot match arbitrary characters.
  grep -Fq -- 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1
  grep -Fq -- 'awoooi-host:host' "$config_path" || return 1

  # Any quoted list entry mentioning one of these names disqualifies the file.
  if grep -Eq -- '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then
    return 1
  fi

  return 0
}
|
||||
#######################################
# Move cd-lane runner registration and config files into a per-run quarantine
# directory so the lanes cannot re-register. Every step is best-effort.
# Globals:   CD_LANE_DIR (read), CD_LANE_DRAIN_DIR (read)
# Arguments: none
# Returns:   status of the last best-effort step; failures are suppressed
#######################################
quarantine_cd_lane_registration_fail_closed() {
  local quarantine_dir
  local lane_dir
  local path
  local target

  # Drop both enable sentinels first.
  rm -f /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open >/dev/null 2>&1 || true

  for lane_dir in "$CD_LANE_DIR" "$CD_LANE_DRAIN_DIR"; do
    [ -d "$lane_dir" ] || continue

    quarantine_dir="$lane_dir/quarantine-startup-$(date +%Y%m%d%H%M%S)"

    # Lift the immutable flag so entries can be moved out of these directories.
    chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
    mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true

    while IFS= read -r -d '' path; do
      [ -e "$path" ] || continue
      chattr -i "$path" >/dev/null 2>&1 || true

      # Files from data/ get a distinct name. With a bare basename,
      # <lane>/.runner and <lane>/data/.runner share one target, and the
      # second move either overwrites the first quarantined file or fails
      # against the immutable target and leaves the registration live.
      case "$path" in
        "$lane_dir/data/"*) target="$quarantine_dir/data__$(basename "$path")" ;;
        *) target="$quarantine_dir/$(basename "$path")" ;;
      esac

      mv "$path" "$target" >/dev/null 2>&1 || true
      chmod 0400 "$target" >/dev/null 2>&1 || true
      chattr +i "$target" >/dev/null 2>&1 || true
    done < <(
      {
        find "$lane_dir" -maxdepth 1 \( -name 'config.yaml' -o -name 'config.yaml.*' -o -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
        find "$lane_dir/data" -maxdepth 1 \( -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
      } || true
    )

    # Re-apply the immutable flag on the lane directories.
    chattr +i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
  done
}
|
||||
|
||||
cd_lane_config_is_controlled() {
|
||||
cd_lane_config_path_is_controlled "$CD_LANE_CONFIG"
|
||||
}
|
||||
|
||||
cd_lane_drain_config_is_controlled() {
|
||||
cd_lane_config_path_is_controlled "$CD_LANE_DRAIN_CONFIG"
|
||||
}
|
||||
|
||||
cd_lane_drain_is_controlled_open() {
|
||||
local active
|
||||
active="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)"
|
||||
[ "$active" = "active" ] || return 1
|
||||
cd_lane_drain_config_is_controlled || return 1
|
||||
file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF" || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
ensure_cd_lane_fail_closed() {
|
||||
if cd_lane_drain_is_controlled_open; then
|
||||
log "✅ controlled cd-lane drain verifier passed; preserving drain lane and fail-closing regular lane only"
|
||||
systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
install_cd_lane_fail_closed_unit
|
||||
pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true
|
||||
systemctl daemon-reload >/dev/null 2>&1 || true
|
||||
return 0
|
||||
fi
|
||||
if { [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ -e "/run/awoooi-cd-lane-controlled-open" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; } \
|
||||
&& cd_lane_config_is_controlled \
|
||||
&& file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then
|
||||
log "✅ controlled cd-lane verifier passed; keeping dedicated lane open"
|
||||
install_controlled_cd_lane_unit
|
||||
systemctl daemon-reload >/dev/null 2>&1 || true
|
||||
systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
return 0
|
||||
fi
|
||||
systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
apply_cd_lane_fail_closed_guard() {
|
||||
local unit
|
||||
for unit in awoooi-cd-lane.service awoooi-cd-lane-drain.service; do
|
||||
systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
|
||||
systemctl stop "$unit" >/dev/null 2>&1 || true
|
||||
systemctl disable "$unit" >/dev/null 2>&1 || true
|
||||
if [ "$unit" = "awoooi-cd-lane.service" ]; then
|
||||
install_cd_lane_fail_closed_unit
|
||||
else
|
||||
systemctl mask "$unit" >/dev/null 2>&1 || mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
mask_runner_unit_file "$unit" "/etc/systemd/system"
|
||||
fi
|
||||
done
|
||||
install_cd_lane_fail_closed_unit
|
||||
pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true
|
||||
guard_runner_binary_fail_closed "$CD_LANE_BINARY"
|
||||
pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true
|
||||
pkill -KILL -f "^${CD_LANE_DRAIN_DIR}/awoooi_cd_lane_controlled daemon" >/dev/null 2>&1 || true
|
||||
quarantine_cd_lane_registration_fail_closed
|
||||
guard_runner_binary_fail_closed "$CD_LANE_DIR/awoooi_cd_lane"
|
||||
guard_runner_binary_fail_closed "$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
|
||||
systemctl daemon-reload >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
install_controlled_cd_lane_unit() {
|
||||
local unit_file="/etc/systemd/system/$CD_LANE_SERVICE"
|
||||
local tmp
|
||||
chattr -i "$unit_file" "$CD_LANE_BINARY" >/dev/null 2>&1 || true
|
||||
tmp="$(mktemp)"
|
||||
cat >"$tmp" <<EOF
|
||||
[Unit]
|
||||
Description=AWOOOI controlled CD lane
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=wooo
|
||||
WorkingDirectory=${CD_LANE_DIR}/data
|
||||
Environment=HOME=/home/wooo
|
||||
Environment=AWOOOI_CONTROLLED_RUNNER_OPEN=1
|
||||
Environment=HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1
|
||||
Environment=HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=1
|
||||
ExecStart=${CD_LANE_BINARY} daemon --config ${CD_LANE_CONFIG}
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
KillSignal=SIGINT
|
||||
TimeoutStopSec=3700
|
||||
SuccessExitStatus=0 130 143
|
||||
CPUQuota=250%
|
||||
MemoryHigh=8G
|
||||
MemoryMax=12G
|
||||
TasksMax=512
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOF
|
||||
install -o root -g root -m 0644 "$tmp" "$unit_file" >/dev/null 2>&1 || true
|
||||
rm -f "$tmp"
|
||||
}
|
||||
|
||||
ensure_controlled_cd_lane_open() {
|
||||
if ! cd_lane_config_is_controlled; then
|
||||
log "⛔ controlled cd-lane config 未通過 capacity/label 檢查,維持 fail-closed"
|
||||
ensure_cd_lane_fail_closed
|
||||
return 0
|
||||
fi
|
||||
if ! file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then
|
||||
log "⛔ controlled cd-lane binary 不是可執行 ELF,維持 fail-closed"
|
||||
ensure_cd_lane_fail_closed
|
||||
return 0
|
||||
fi
|
||||
install_controlled_cd_lane_unit
|
||||
systemctl daemon-reload >/dev/null 2>&1 || true
|
||||
systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
ensure_host_runner_fail_closed() {
|
||||
local unit
|
||||
local binary
|
||||
@@ -445,6 +365,8 @@ ensure_host_runner_fail_closed() {
|
||||
fi
|
||||
|
||||
pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
|
||||
pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true
|
||||
quarantine_cd_lane_registration_fail_closed
|
||||
for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
|
||||
guard_runner_binary_fail_closed "$binary"
|
||||
done
|
||||
@@ -550,13 +472,8 @@ else
|
||||
log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
|
||||
fi
|
||||
|
||||
if [ "$START_CD_LANE_ALLOWED" = "1" ]; then
|
||||
log "✅ controlled cd-lane sentinel present; opening dedicated rate-limited CD lane"
|
||||
ensure_controlled_cd_lane_open
|
||||
else
|
||||
log "⏸️ controlled cd-lane 維持 fail-closed;需 $CD_LANE_ENABLE_SENTINEL 或 AWOOOI_START_CONTROLLED_CD_LANE=1"
|
||||
ensure_cd_lane_fail_closed
|
||||
fi
|
||||
log "⏸️ direct cd-lane 維持 fail-closed;需完成搬遷或硬限流後才可用獨立變更恢復"
|
||||
apply_cd_lane_fail_closed_guard
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# STEP 7: Sentry(Error Tracking)
|
||||
|
||||
@@ -286,7 +286,7 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
|
||||
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
|
||||
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
|
||||
done
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
@@ -294,6 +294,8 @@ for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-ac
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
|
||||
done
|
||||
@@ -317,16 +319,21 @@ fi
|
||||
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
|
||||
cd_lane_binary_elf=0
|
||||
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
|
||||
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
|
||||
cd_lane_ok=0
|
||||
cd_lane_mode=blocked
|
||||
if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then
|
||||
if [ "$cd_lane_active" = "inactive" ] \
|
||||
&& [ "$cd_lane_sentinel" = "missing" ] \
|
||||
&& [ "$cd_lane_binary_elf" = "0" ] \
|
||||
&& [ "$cd_lane_process_count" = "0" ] \
|
||||
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
|
||||
cd_lane_ok=1
|
||||
cd_lane_mode=failclosed
|
||||
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
|
||||
cd_lane_ok=1
|
||||
cd_lane_mode=controlled_open
|
||||
fi
|
||||
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok"
|
||||
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
|
||||
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
|
||||
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
|
||||
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
|
||||
@@ -344,24 +351,25 @@ fi
|
||||
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
|
||||
cd_lane_drain_binary_elf=0
|
||||
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
|
||||
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
|
||||
cd_lane_drain_ok=0
|
||||
cd_lane_drain_mode=absent
|
||||
if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then
|
||||
cd_lane_drain_mode=blocked
|
||||
fi
|
||||
if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then
|
||||
cd_lane_drain_mode=blocked
|
||||
if [ "$cd_lane_drain_active" != "active" ] \
|
||||
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
|
||||
&& [ "$cd_lane_drain_process_count" = "0" ] \
|
||||
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
|
||||
cd_lane_drain_ok=1
|
||||
cd_lane_drain_mode=controlled_open
|
||||
cd_lane_drain_mode=failclosed
|
||||
fi
|
||||
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok"
|
||||
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
|
||||
cd_lane_guard_ok=0
|
||||
if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then
|
||||
if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then
|
||||
cd_lane_guard_ok=1
|
||||
fi
|
||||
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
|
||||
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
kind=$(file -b "$p" 2>/dev/null || echo missing)
|
||||
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
|
||||
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
|
||||
@@ -573,22 +581,69 @@ scheduler_uid=$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk "NR
|
||||
echo "MOMO_GDRIVE_TOKEN_STAT ${token_stat:-missing} scheduler_uid=${scheduler_uid:-unknown}"
|
||||
db_user=$(docker exec momo-pro-system printenv POSTGRES_USER 2>/dev/null || true)
|
||||
db_name=$(docker exec momo-pro-system printenv POSTGRES_DB 2>/dev/null || true)
|
||||
db_pass=$(docker exec momo-pro-system printenv POSTGRES_PASSWORD 2>/dev/null || true)
|
||||
if [ -n "$db_user" ] && [ -n "$db_name" ] && [ -n "$db_pass" ]; then
|
||||
momo_sync=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;" 2>/dev/null || true)
|
||||
momo_freshness=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;" 2>/dev/null || true)
|
||||
momo_import_config=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT config_key || chr(61) || config_value FROM import_config;" 2>/dev/null | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
|
||||
momo_latest_import_job=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" 2>/dev/null | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
|
||||
if [ -n "$db_user" ] && [ -n "$db_name" ]; then
|
||||
psql_no_secret() {
|
||||
docker exec -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" --no-password -Atc "$1" 2>/dev/null || true
|
||||
}
|
||||
momo_sync=$(psql_no_secret "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;")
|
||||
momo_freshness=$(psql_no_secret "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;")
|
||||
momo_import_config=$(psql_no_secret "SELECT config_key || chr(61) || config_value FROM import_config;" | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
|
||||
momo_latest_import_job=$(psql_no_secret "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
|
||||
# Probe the Drive source folders from inside the momo-scheduler container and
# keep only the MOMO_DRIVE_* lines the probe prints.
# NOTE(review): this section appears to live inside one single-quoted remote
# script, so nothing here may contain a single-quote character. The Python
# below builds the quotes needed by the Drive query with chr(39). Confirm
# against the opening quote of the enclosing command.
# The probe file gets an unpredictable name from mktemp rather than a
# PID-based path in /tmp.
tmp_drive_probe=$(mktemp /tmp/awoooi-momo-drive-source-probe.XXXXXX 2>/dev/null || true)
if [ -n "$tmp_drive_probe" ]; then
cat > "$tmp_drive_probe" <<"PYDRIVE"
from services.google_drive_service import drive_service
from services.import_service import import_service

SQ = chr(39)  # single-quote character for Drive query literals


def emit(key, value):
    # Print one KEY VALUE line, with "-" standing in for None or empty.
    if value in (None, ""):
        value = "-"
    print(f"{key} {value}")


folder = import_service.get_config("gdrive_folder_path", "當日業績匯入")
pattern = import_service.get_config("gdrive_file_pattern", "即時業績_當日")
archive = import_service.get_config("gdrive_archive_folder", "當日業績匯入/已匯入")
failed = import_service.get_config("gdrive_failed_folder", "匯入失敗")
intake_files = drive_service.list_files_in_folder(folder, pattern)
archive_files = drive_service.list_files_in_folder(archive, pattern)
failed_files = drive_service.list_files_in_folder(failed, pattern)
emit("MOMO_DRIVE_INTAKE_COUNT", len(intake_files))
emit("MOMO_DRIVE_ARCHIVE_COUNT", len(archive_files))
emit("MOMO_DRIVE_FAILED_COUNT", len(failed_files))
emit("MOMO_DRIVE_ARCHIVE_LATEST_DATE", (archive_files[0].get("modifiedTime", "")[:10] if archive_files else "-"))
if not drive_service.service:
    drive_service.authenticate()
global_date = "-"
if drive_service.service:
    safe_pattern = drive_service._escape_query_value(pattern)
    query = (
        f"name contains {SQ}{safe_pattern}{SQ} and trashed=false and "
        f"(mimeType={SQ}application/vnd.openxmlformats-officedocument.spreadsheetml.sheet{SQ} "
        f"or mimeType={SQ}application/vnd.ms-excel{SQ})"
    )
    results = drive_service.service.files().list(
        q=query,
        spaces="drive",
        fields="files(modifiedTime)",
        orderBy="modifiedTime desc",
        pageSize=1,
    ).execute()
    global_files = results.get("files", [])
    if global_files:
        global_date = global_files[0].get("modifiedTime", "")[:10] or "-"
emit("MOMO_DRIVE_GLOBAL_LATEST_DATE", global_date)
PYDRIVE
momo_drive_source_probe=$(docker exec -i momo-scheduler python - < "$tmp_drive_probe" 2>/dev/null | awk "/^MOMO_DRIVE_/ {print}" || true)
rm -f "$tmp_drive_probe"
else
# Without a temp file the probe cannot run; report no Drive evidence.
momo_drive_source_probe=""
fi
|
||||
else
|
||||
momo_sync=""
|
||||
momo_freshness=""
|
||||
momo_import_config=""
|
||||
momo_latest_import_job=""
|
||||
momo_drive_source_probe=""
|
||||
fi
|
||||
echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
|
||||
echo "MOMO_DAILY_FRESHNESS ${momo_freshness:-unavailable}"
|
||||
echo "MOMO_IMPORT_CONFIG ${momo_import_config:-unavailable}"
|
||||
echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
|
||||
printf "%s\n" "$momo_drive_source_probe"
|
||||
' 2>&1); then
|
||||
echo "$out"
|
||||
grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
|
||||
@@ -611,10 +666,36 @@ echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
|
||||
grep -Fq "MOMO_IMPORT_CONFIG 當日業績匯入|即時業績_當日" <<<"$out" && ok "188 momo Drive import config points to expected daily-sales intake" || fail "188 momo Drive import config drifted from expected daily-sales intake"
|
||||
awk '/MOMO_LATEST_IMPORT_JOB / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[2] == "completed" && a[6] == a[7] && a[8] == 0)}' <<<"$out" && ok "188 momo latest daily import job completed cleanly" || warn "188 momo latest daily import job not confirmed clean"
|
||||
awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
|
||||
# Classify the collected output: prints 1 when Drive shows no pending or failed
# source files and the newest matching Drive file is not newer than the last
# import job's completion date; prints 0 otherwise.
# The job record is taken from the whole line rather than $2, so a Drive file
# name containing spaces does not truncate it. Dates are matched with explicit
# character classes because some awk builds lack {n} interval support.
momo_source_stale_only=$(awk '
function is_iso_date(s) {
  return s ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$/
}
$1 == "MOMO_DRIVE_INTAKE_COUNT" {intake = $2 + 0}
$1 == "MOMO_DRIVE_FAILED_COUNT" {failed = $2 + 0}
$1 == "MOMO_DRIVE_GLOBAL_LATEST_DATE" {latest = $2}
$1 == "MOMO_LATEST_IMPORT_JOB" {
  job = $0
  sub(/^[ \t]*[^ \t]+[ \t]+/, "", job)
  split(job, a, "|")
  completed = substr(a[5], 1, 10)
}
END {
  if (intake == 0 && failed == 0 && is_iso_date(latest) && is_iso_date(completed) && latest <= completed) print 1;
  else print 0;
}' <<<"$out")
|
||||
if awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 0 && a[1] <= 2)}' <<<"$out"; then
|
||||
ok "188 momo daily sales data fresh enough"
|
||||
elif awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 3)}' <<<"$out"; then
|
||||
if awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
|
||||
if [ "$momo_source_stale_only" = "1" ]; then
|
||||
warn "188 momo daily sales stale but Drive has no newer source candidate"
|
||||
elif [ -x scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh ]; then
|
||||
momo_source_preflight_summary="$(
|
||||
scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh \
|
||||
--host ollama@192.168.0.188 \
|
||||
--freshness-max-days 2 2>/dev/null \
|
||||
| awk '/^MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT / {line=$0} END {print line}' || true
|
||||
)"
|
||||
[ -n "$momo_source_preflight_summary" ] && echo "$momo_source_preflight_summary"
|
||||
if grep -q "BLOCKED=0" <<<"$momo_source_preflight_summary"; then
|
||||
warn "188 momo daily sales stale but source preflight has no hard blocker"
|
||||
elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
|
||||
fail "188 momo source file absent while daily sales data stale"
|
||||
else
|
||||
fail "188 momo daily sales data stale beyond 3 days"
|
||||
fi
|
||||
elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
|
||||
fail "188 momo source file absent while daily sales data stale"
|
||||
else
|
||||
fail "188 momo daily sales data stale beyond 3 days"
|
||||
|
||||
@@ -306,6 +306,7 @@ drive_archive_count="$(num_for DRIVE_ARCHIVE_COUNT)"
|
||||
drive_failed_count="$(num_for DRIVE_FAILED_COUNT)"
|
||||
drive_archive_latest="$(value_for DRIVE_ARCHIVE_LATEST_MODIFIED)"
|
||||
drive_global_latest="$(value_for DRIVE_GLOBAL_LATEST_MODIFIED)"
|
||||
drive_global_latest_date="${drive_global_latest:0:10}"
|
||||
if [[ "$drive_intake_count" -gt 0 ]]; then
|
||||
ok "Drive daily-sales intake has pending source files: count=$drive_intake_count"
|
||||
else
|
||||
@@ -338,13 +339,22 @@ IFS='|' read -r freshness_days latest_daily_date <<<"$freshness"
|
||||
if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -le "$FRESHNESS_MAX_DAYS" ]]; then
|
||||
ok "daily sales data freshness is within ${FRESHNESS_MAX_DAYS} days: $freshness"
|
||||
elif [[ "$freshness_days" =~ ^[0-9]+$ ]]; then
|
||||
blocked "daily sales data is stale: $freshness"
|
||||
warn "daily sales data is stale: $freshness"
|
||||
else
|
||||
blocked "daily sales freshness is unavailable: ${freshness:-missing}"
|
||||
fi
|
||||
|
||||
latest_job="$(value_for DB_LATEST_DAILY_IMPORT_JOB)"
|
||||
IFS='|' read -r job_id job_status job_file job_created job_completed job_total job_success job_errors <<<"$latest_job"
|
||||
job_completed_date="${job_completed:0:10}"
|
||||
source_absent_without_newer_drive=0
|
||||
if [[ "$drive_intake_count" -eq 0 \
|
||||
&& "$drive_failed_count" -eq 0 \
|
||||
&& "$drive_global_latest_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ \
|
||||
&& "$job_completed_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]] \
|
||||
&& [[ "$drive_global_latest_date" < "$job_completed_date" || "$drive_global_latest_date" == "$job_completed_date" ]]; then
|
||||
source_absent_without_newer_drive=1
|
||||
fi
|
||||
if [[ "$job_id" =~ ^[0-9]+$ && "$job_status" == "completed" && "$job_total" == "$job_success" && "$job_errors" == "0" ]]; then
|
||||
ok "latest daily import job completed cleanly: id=$job_id file=$job_file"
|
||||
else
|
||||
@@ -354,6 +364,8 @@ fi
|
||||
if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -gt "$FRESHNESS_MAX_DAYS" ]]; then
|
||||
if [[ "$auth_failures" -gt 0 ]]; then
|
||||
blocked "release blocker is stale business data with active Drive auth/source evidence gate"
|
||||
elif [[ "$source_absent_without_newer_drive" -eq 1 ]]; then
|
||||
warn "daily sales data is stale, but Drive has no newer source candidate than the last clean import"
|
||||
else
|
||||
blocked "release blocker is stale business data; source evidence must be refreshed"
|
||||
fi
|
||||
|
||||
@@ -306,13 +306,15 @@ check_runner_guardrails() {
|
||||
local out bad
|
||||
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
|
||||
bad=0
|
||||
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
|
||||
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
|
||||
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
|
||||
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
|
||||
unit_ok=0
|
||||
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
|
||||
unit_ok=1
|
||||
elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then
|
||||
unit_ok=1
|
||||
fi
|
||||
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active ok=$unit_ok"
|
||||
[ "$unit_ok" = "1" ] || bad=1
|
||||
@@ -336,16 +338,21 @@ fi
|
||||
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
|
||||
cd_lane_binary_elf=0
|
||||
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
|
||||
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
|
||||
cd_lane_ok=0
|
||||
cd_lane_mode=blocked
|
||||
if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then
|
||||
if [ "$cd_lane_active" = "inactive" ] \
|
||||
&& [ "$cd_lane_sentinel" = "missing" ] \
|
||||
&& [ "$cd_lane_binary_elf" = "0" ] \
|
||||
&& [ "$cd_lane_process_count" = "0" ] \
|
||||
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
|
||||
cd_lane_ok=1
|
||||
cd_lane_mode=failclosed
|
||||
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
|
||||
cd_lane_ok=1
|
||||
cd_lane_mode=controlled_open
|
||||
fi
|
||||
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok"
|
||||
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
|
||||
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
|
||||
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
|
||||
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
|
||||
@@ -362,18 +369,19 @@ fi
|
||||
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
|
||||
cd_lane_drain_binary_elf=0
|
||||
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
|
||||
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
|
||||
cd_lane_drain_ok=0
|
||||
cd_lane_drain_mode=absent
|
||||
if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then
|
||||
cd_lane_drain_mode=blocked
|
||||
fi
|
||||
if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then
|
||||
cd_lane_drain_mode=blocked
|
||||
if [ "$cd_lane_drain_active" != "active" ] \
|
||||
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
|
||||
&& [ "$cd_lane_drain_process_count" = "0" ] \
|
||||
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
|
||||
cd_lane_drain_ok=1
|
||||
cd_lane_drain_mode=controlled_open
|
||||
cd_lane_drain_mode=failclosed
|
||||
fi
|
||||
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok"
|
||||
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
|
||||
cd_lane_guard_ok=0
|
||||
if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then
|
||||
if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then
|
||||
cd_lane_guard_ok=1
|
||||
fi
|
||||
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
|
||||
@@ -381,7 +389,7 @@ echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
|
||||
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
|
||||
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
|
||||
[ "$direct_runner_count" = "0" ] || bad=1
|
||||
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
|
||||
kind=$(file -b "$p" 2>/dev/null || echo missing)
|
||||
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
|
||||
echo "$kind" | grep -qi "ELF" && bad=1
|
||||
@@ -446,13 +454,22 @@ echo "ollama-systemd $(systemctl is-active ollama 2>/dev/null || true)"
|
||||
echo "ollama-api $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:11434/api/tags || true)"
|
||||
docker inspect -f "momo-scheduler {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true
|
||||
docker inspect -f "litellm {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" litellm 2>/dev/null || true
|
||||
if ! docker inspect litellm >/dev/null 2>&1 && [ ! -d /opt/litellm ]; then
|
||||
echo "litellm not-provisioned"
|
||||
fi
|
||||
docker inspect -f "signoz-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" signoz-clickhouse 2>/dev/null || true
|
||||
' 2>&1); then
|
||||
echo "$out"
|
||||
grep -q "ollama-systemd active" <<<"$out" && ok "188 Ollama systemd active" || blocked "188 Ollama systemd inactive"
|
||||
grep -q "ollama-api 200" <<<"$out" && ok "188 Ollama API reachable" || blocked "188 Ollama API not reachable"
|
||||
grep -q "momo-scheduler running healthy" <<<"$out" && ok "188 momo-scheduler healthy" || blocked "188 momo-scheduler not healthy"
|
||||
grep -Eq "litellm running( |$)" <<<"$out" && ok "188 litellm running" || blocked "188 litellm not running"
|
||||
if grep -Eq "litellm running( |$)" <<<"$out"; then
|
||||
ok "188 litellm running"
|
||||
elif grep -q "litellm not-provisioned" <<<"$out"; then
|
||||
warn "188 litellm not provisioned; provider route/cost switch requires separate approval"
|
||||
else
|
||||
blocked "188 litellm not running"
|
||||
fi
|
||||
grep -q "signoz-clickhouse running healthy" <<<"$out" && ok "188 SignOz ClickHouse healthy" || warn "188 SignOz ClickHouse health not confirmed"
|
||||
else
|
||||
blocked "188 high-load service check unavailable"
|
||||
|
||||
Reference in New Issue
Block a user