fix(recovery): freeze 110 cd lane and source-aware 188 gates [skip ci]

This commit is contained in:
Your Name
2026-06-28 10:35:14 +08:00
parent 392c1741ca
commit 241cbe067e
18 changed files with 279 additions and 242 deletions

View File

@@ -104,8 +104,8 @@ container_running() {
run_pg_dump() {
docker exec "${DB_CONTAINER}" sh -eu -c '
: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
PGPASSWORD="${POSTGRES_PASSWORD}" exec pg_dump \
exec pg_dump \
-h 127.0.0.1 \
-U "${POSTGRES_USER:-momo}" \
-d "${POSTGRES_DB:-momo_analytics}" \
--no-password \
@@ -124,8 +124,8 @@ insert_backup_log() {
-e BACKUP_HOST="$(hostname)" \
-e BACKUP_STORAGE_PATH="${FILEPATH}" \
"${DB_CONTAINER}" sh -eu -c '
: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD missing in container env}"
PGPASSWORD="${POSTGRES_PASSWORD}" psql \
psql \
-h 127.0.0.1 \
-U "${POSTGRES_USER:-momo}" \
-d "${POSTGRES_DB:-momo_analytics}" \
--no-password \

View File

@@ -192,20 +192,13 @@ log "[6/6] 檢查 Gitea Act Runner預設不自動啟動..."
RUNNER_DIR="/home/wooo/act-runner"
RUNNER_SERVICE="gitea-act-runner-host.service"
RUNNER_ENABLE_SENTINEL="/run/awoooi-runner-host-enabled"
CD_LANE_DIR="/home/wooo/awoooi-cd-lane"
CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain"
START_GITEA_RUNNER_ON_BOOT="${AWOOOI_START_GITEA_RUNNER_ON_BOOT:-0}"
START_GITEA_RUNNER_ALLOWED=0
CD_LANE_DIR="/home/wooo/awoooi-cd-lane"
CD_LANE_SERVICE="awoooi-cd-lane.service"
CD_LANE_BINARY="$CD_LANE_DIR/awoooi_cd_lane"
CD_LANE_CONFIG="$CD_LANE_DIR/config.yaml"
CD_LANE_DRAIN_DIR="/home/wooo/awoooi-cd-lane-drain"
CD_LANE_DRAIN_SERVICE="awoooi-cd-lane-drain.service"
CD_LANE_DRAIN_BINARY="$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
CD_LANE_DRAIN_CONFIG="$CD_LANE_DRAIN_DIR/config.yaml"
CD_LANE_ENABLE_SENTINEL="/run/awoooi-cd-lane-enabled"
START_CONTROLLED_CD_LANE="${AWOOOI_START_CONTROLLED_CD_LANE:-0}"
START_CD_LANE_ALLOWED=0
RUNNER_FAIL_CLOSED_SERVICES=(
"awoooi-cd-lane.service"
"awoooi-cd-lane-drain.service"
"awoooi-direct-runner-open.service"
"awoooi-direct-runner.service"
"gitea-act-runner-host.service"
@@ -214,19 +207,18 @@ RUNNER_FAIL_CLOSED_SERVICES=(
"gitea-act-runner-awoooi-open.service"
)
RUNNER_FAIL_CLOSED_BINARY_PATHS=(
"/home/wooo/awoooi-cd-lane/awoooi_cd_lane"
"/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled"
"/home/wooo/act-runner/act_runner"
"/home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard"
"/home/wooo/act-runner-controlled/act_runner"
"/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner"
)
# Legacy host runner still needs both keys. The dedicated cd-lane has its own
# sentinel and narrow label/capacity verifier below.
# Host runner still needs both keys. The direct cd-lane stays fail-closed until
# it is migrated or hard-limited outside this production host pressure lane.
if [ "$START_GITEA_RUNNER_ON_BOOT" = "1" ] && [ -e "$RUNNER_ENABLE_SENTINEL" ]; then
START_GITEA_RUNNER_ALLOWED=1
fi
if [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; then
START_CD_LANE_ALLOWED=1
fi
mask_runner_unit_file() {
local unit="$1"
@@ -279,143 +271,71 @@ EOF
install_cd_lane_fail_closed_unit() {
local unit_file="/etc/systemd/system/awoooi-cd-lane.service"
local tmp
local quarantine_stamp
quarantine_stamp="$(date +%Y%m%d%H%M%S)"
systemctl mask awoooi-cd-lane.service >/dev/null 2>&1 || true
if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
chattr -i "$unit_file" >/dev/null 2>&1 || true
if ! grep -q "AWOOOI direct CD lane fail-closed" "$unit_file" 2>/dev/null; then
if ! { [ -L "$unit_file" ] && [ "$(readlink "$unit_file" 2>/dev/null || true)" = "/dev/null" ]; }; then
mv "$unit_file" "${unit_file}.quarantined-runner-incident-${quarantine_stamp}" >/dev/null 2>&1 || true
fi
fi
tmp="$(mktemp)"
cat >"$tmp" <<'EOF'
[Unit]
Description=AWOOOI direct CD lane fail-closed after 2026-06-28 pressure incident
ConditionPathExists=/run/awoooi-cd-lane-enabled
[Service]
Type=oneshot
ExecStart=/bin/false
EOF
install -o root -g root -m 0444 "$tmp" "$unit_file" >/dev/null 2>&1 || true
rm -f "$tmp"
chattr +i "$unit_file" >/dev/null 2>&1 || true
ln -sfn /dev/null "$unit_file" >/dev/null 2>&1 || true
}
cd_lane_config_path_is_controlled() {
local config_path="$1"
[ -f "$config_path" ] || return 1
grep -Eq '^[[:space:]]+capacity:[[:space:]]*1[[:space:]]*$' "$config_path" || return 1
grep -q 'awoooi-ubuntu:docker://192.168.0.110:5000/awoooi/ci-runner:act-22.04' "$config_path" || return 1
grep -q 'awoooi-host:host' "$config_path" || return 1
if grep -Eq '^[[:space:]]+- ".*(ubuntu-latest|stockplatform|headless|playwright)' "$config_path"; then
return 1
fi
return 0
#######################################
# Move every cd-lane runner config/registration file into a timestamped
# quarantine directory inside its lane directory.
#
# Files are collected from the lane directory itself (config.yaml*, .runner*)
# and from its data/ subdirectory (.runner*). Files coming from data/ are
# stored with a "data-" prefix: without it, <lane>/.runner and
# <lane>/data/.runner would map to the same quarantine target, and the second
# move would either clobber the first copy or fail against the already
# immutable target and leave the data registration in place.
#
# Globals:   CD_LANE_DIR, CD_LANE_DRAIN_DIR (read)
# Arguments: none
# Returns:   0; every individual step is best-effort and errors are ignored.
#######################################
quarantine_cd_lane_registration_fail_closed() {
  local quarantine_dir
  local lane_dir
  local path
  local target
  local name
  # Remove both enable sentinels before touching the lane directories.
  rm -f /run/awoooi-cd-lane-enabled /run/awoooi-cd-lane-controlled-open >/dev/null 2>&1 || true
  for lane_dir in "$CD_LANE_DIR" "$CD_LANE_DRAIN_DIR"; do
    [ -d "$lane_dir" ] || continue
    quarantine_dir="$lane_dir/quarantine-startup-$(date +%Y%m%d%H%M%S)"
    # Clear the immutable attribute so files can be moved out of the directories.
    chattr -i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
    mkdir -p "$quarantine_dir" >/dev/null 2>&1 || true
    while IFS= read -r -d '' path; do
      [ -e "$path" ] || continue
      chattr -i "$path" >/dev/null 2>&1 || true
      name="$(basename "$path")"
      # Keep files from data/ distinct from same-named files in the lane root.
      case "$path" in
        "$lane_dir/data/"*) name="data-$name" ;;
      esac
      target="$quarantine_dir/$name"
      mv "$path" "$target" >/dev/null 2>&1 || true
      chmod 0400 "$target" >/dev/null 2>&1 || true
      chattr +i "$target" >/dev/null 2>&1 || true
    done < <(
      {
        find "$lane_dir" -maxdepth 1 \( -name 'config.yaml' -o -name 'config.yaml.*' -o -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
        find "$lane_dir/data" -maxdepth 1 \( -name '.runner' -o -name '.runner.*' \) -print0 2>/dev/null
      } || true
    )
    # Re-apply the immutable attribute on the lane directories.
    chattr +i "$lane_dir" "$lane_dir/data" >/dev/null 2>&1 || true
  done
}
# Run the shared controlled-config check against the regular cd-lane config
# file (CD_LANE_CONFIG). Returns whatever that check returns.
cd_lane_config_is_controlled() {
  local lane_config="$CD_LANE_CONFIG"
  cd_lane_config_path_is_controlled "$lane_config"
}
# Run the shared controlled-config check against the drain lane config file
# (CD_LANE_DRAIN_CONFIG). Returns whatever that check returns.
cd_lane_drain_config_is_controlled() {
  local drain_config="$CD_LANE_DRAIN_CONFIG"
  cd_lane_config_path_is_controlled "$drain_config"
}
# Succeed only when the drain lane is verifiably running in controlled mode:
# its systemd unit reports ActiveState=active, its config passes the
# controlled-config check, and `file` describes its binary as ELF.
# Globals: CD_LANE_DRAIN_SERVICE, CD_LANE_DRAIN_BINARY (read)
# Returns: 0 when all three checks pass, 1 otherwise.
cd_lane_drain_is_controlled_open() {
  local unit_state
  unit_state="$(systemctl show "$CD_LANE_DRAIN_SERVICE" -p ActiveState --value 2>/dev/null || true)"
  if [ "$unit_state" != "active" ]; then
    return 1
  fi
  if ! cd_lane_drain_config_is_controlled; then
    return 1
  fi
  if file "$CD_LANE_DRAIN_BINARY" 2>/dev/null | grep -qi "ELF"; then
    return 0
  fi
  return 1
}
ensure_cd_lane_fail_closed() {
if cd_lane_drain_is_controlled_open; then
log "✅ controlled cd-lane drain verifier passed; preserving drain lane and fail-closing regular lane only"
systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
install_cd_lane_fail_closed_unit
pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true
systemctl daemon-reload >/dev/null 2>&1 || true
return 0
fi
if { [ -e "$CD_LANE_ENABLE_SENTINEL" ] || [ -e "/run/awoooi-cd-lane-controlled-open" ] || [ "$START_CONTROLLED_CD_LANE" = "1" ]; } \
&& cd_lane_config_is_controlled \
&& file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then
log "✅ controlled cd-lane verifier passed; keeping dedicated lane open"
install_controlled_cd_lane_unit
systemctl daemon-reload >/dev/null 2>&1 || true
systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
return 0
fi
systemctl kill --signal=SIGKILL "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
systemctl stop "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
systemctl disable "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
apply_cd_lane_fail_closed_guard() {
local unit
for unit in awoooi-cd-lane.service awoooi-cd-lane-drain.service; do
systemctl kill --signal=SIGKILL "$unit" >/dev/null 2>&1 || true
systemctl stop "$unit" >/dev/null 2>&1 || true
systemctl disable "$unit" >/dev/null 2>&1 || true
if [ "$unit" = "awoooi-cd-lane.service" ]; then
install_cd_lane_fail_closed_unit
else
systemctl mask "$unit" >/dev/null 2>&1 || mask_runner_unit_file "$unit" "/etc/systemd/system"
mask_runner_unit_file "$unit" "/etc/systemd/system"
fi
done
install_cd_lane_fail_closed_unit
pkill -KILL -f "^${CD_LANE_BINARY} daemon" >/dev/null 2>&1 || true
guard_runner_binary_fail_closed "$CD_LANE_BINARY"
pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true
pkill -KILL -f "^${CD_LANE_DRAIN_DIR}/awoooi_cd_lane_controlled daemon" >/dev/null 2>&1 || true
quarantine_cd_lane_registration_fail_closed
guard_runner_binary_fail_closed "$CD_LANE_DIR/awoooi_cd_lane"
guard_runner_binary_fail_closed "$CD_LANE_DRAIN_DIR/awoooi_cd_lane_controlled"
systemctl daemon-reload >/dev/null 2>&1 || true
}
# Render the systemd unit for the controlled CD lane into a temp file and
# install it as /etc/systemd/system/$CD_LANE_SERVICE (root:root, mode 0644).
# Globals: CD_LANE_SERVICE, CD_LANE_BINARY, CD_LANE_DIR, CD_LANE_CONFIG (read)
# Returns: status of the final temp-file removal; chattr/install failures are
#          deliberately ignored.
install_controlled_cd_lane_unit() {
  local staged_unit
  local unit_path="/etc/systemd/system/$CD_LANE_SERVICE"
  # Clear the immutable attribute on the unit file and the lane binary.
  chattr -i "$unit_path" "$CD_LANE_BINARY" >/dev/null 2>&1 || true
  staged_unit="$(mktemp)"
  # Unquoted heredoc: the CD_LANE_* variables are expanded into the unit text.
  cat <<EOF >"$staged_unit"
[Unit]
Description=AWOOOI controlled CD lane
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
User=wooo
WorkingDirectory=${CD_LANE_DIR}/data
Environment=HOME=/home/wooo
Environment=AWOOOI_CONTROLLED_RUNNER_OPEN=1
Environment=HOST_WEB_BUILD_PRESSURE_ATTEMPTS=1
Environment=HOST_WEB_BUILD_PRESSURE_SLEEP_SECONDS=1
ExecStart=${CD_LANE_BINARY} daemon --config ${CD_LANE_CONFIG}
Restart=always
RestartSec=10
KillSignal=SIGINT
TimeoutStopSec=3700
SuccessExitStatus=0 130 143
CPUQuota=250%
MemoryHigh=8G
MemoryMax=12G
TasksMax=512
[Install]
WantedBy=multi-user.target
EOF
  install -o root -g root -m 0644 "$staged_unit" "$unit_path" >/dev/null 2>&1 || true
  rm -f "$staged_unit"
}
# Open the controlled CD lane only after its config and binary pass
# verification; otherwise log the reason and fall back to the fail-closed path.
# Checks, in order: the controlled-config check, then that `file` reports the
# lane binary as ELF. On success the unit is (re)installed, systemd is
# reloaded and the service is enabled and started (best-effort).
# Globals: CD_LANE_BINARY, CD_LANE_SERVICE (read)
# Returns: 0 in every path.
ensure_controlled_cd_lane_open() {
  local refusal=""
  if ! cd_lane_config_is_controlled; then
    refusal="⛔ controlled cd-lane config 未通過 capacity/label 檢查,維持 fail-closed"
  elif ! file "$CD_LANE_BINARY" 2>/dev/null | grep -qi "ELF"; then
    refusal="⛔ controlled cd-lane binary 不是可執行 ELF維持 fail-closed"
  fi
  if [ -n "$refusal" ]; then
    log "$refusal"
    ensure_cd_lane_fail_closed
    return 0
  fi
  install_controlled_cd_lane_unit
  systemctl daemon-reload >/dev/null 2>&1 || true
  systemctl enable --now "$CD_LANE_SERVICE" >/dev/null 2>&1 || true
}
ensure_host_runner_fail_closed() {
local unit
local binary
@@ -445,6 +365,8 @@ ensure_host_runner_fail_closed() {
fi
pkill -KILL -f "^${RUNNER_DIR}/act_runner(\\.real-[^ ]*)? daemon" >/dev/null 2>&1 || true
pkill -KILL -f "^${CD_LANE_DIR}/awoooi_cd_lane daemon" >/dev/null 2>&1 || true
quarantine_cd_lane_registration_fail_closed
for binary in "${RUNNER_FAIL_CLOSED_BINARY_PATHS[@]}"; do
guard_runner_binary_fail_closed "$binary"
done
@@ -550,13 +472,8 @@ else
log "⚠️ 找不到 act-runner binary/config: $RUNNER_DIR"
fi
if [ "$START_CD_LANE_ALLOWED" = "1" ]; then
log "✅ controlled cd-lane sentinel present; opening dedicated rate-limited CD lane"
ensure_controlled_cd_lane_open
else
log "⏸️ controlled cd-lane 維持 fail-closed$CD_LANE_ENABLE_SENTINEL 或 AWOOOI_START_CONTROLLED_CD_LANE=1"
ensure_cd_lane_fail_closed
fi
log "⏸️ direct cd-lane 維持 fail-closed需完成搬遷或硬限流後才可用獨立變更恢復"
apply_cd_lane_fail_closed_guard
# ──────────────────────────────────────────────
# STEP 7: Sentry Error Tracking

View File

@@ -286,7 +286,7 @@ echo "ACTION_RUNNER_ENABLED_COUNT $(systemctl list-unit-files "actions.runner.*"
for u in $(systemctl list-units "actions.runner.*" --all --no-legend --plain 2>/dev/null | awk "{print \$1}"); do
systemctl show "$u" -p ActiveState -p SubState -p CPUQuotaPerSecUSec -p MemoryMax -p WatchdogUSec -p NRestarts | sed "s/^/RUNNER $u /"
done
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
@@ -294,6 +294,8 @@ for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-ac
unit_ok=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active mainpid=$mainpid ok=$unit_ok"
done
@@ -317,16 +319,21 @@ fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then
if [ "$cd_lane_active" = "inactive" ] \
&& [ "$cd_lane_sentinel" = "missing" ] \
&& [ "$cd_lane_binary_elf" = "0" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
cd_lane_ok=1
cd_lane_mode=failclosed
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
cd_lane_ok=1
cd_lane_mode=controlled_open
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok"
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active mainpid=$cd_lane_mainpid sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
@@ -344,24 +351,25 @@ fi
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_drain_ok=0
cd_lane_drain_mode=absent
if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then
cd_lane_drain_mode=blocked
fi
if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then
cd_lane_drain_mode=blocked
if [ "$cd_lane_drain_active" != "active" ] \
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
&& [ "$cd_lane_drain_process_count" = "0" ] \
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
cd_lane_drain_ok=1
cd_lane_drain_mode=controlled_open
cd_lane_drain_mode=failclosed
fi
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok"
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active mainpid=$cd_lane_drain_mainpid capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
cd_lane_guard_ok=0
if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then
if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then
cd_lane_guard_ok=1
fi
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && echo "RUNNER_FAILCLOSED_BINARY_ELF $p"
@@ -573,22 +581,69 @@ scheduler_uid=$(docker top momo-scheduler -eo pid,user,uid 2>/dev/null | awk "NR
echo "MOMO_GDRIVE_TOKEN_STAT ${token_stat:-missing} scheduler_uid=${scheduler_uid:-unknown}"
db_user=$(docker exec momo-pro-system printenv POSTGRES_USER 2>/dev/null || true)
db_name=$(docker exec momo-pro-system printenv POSTGRES_DB 2>/dev/null || true)
db_pass=$(docker exec momo-pro-system printenv POSTGRES_PASSWORD 2>/dev/null || true)
if [ -n "$db_user" ] && [ -n "$db_name" ] && [ -n "$db_pass" ]; then
momo_sync=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;" 2>/dev/null || true)
momo_freshness=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;" 2>/dev/null || true)
momo_import_config=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT config_key || chr(61) || config_value FROM import_config;" 2>/dev/null | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
momo_latest_import_job=$(docker exec -e PGPASSWORD="$db_pass" -e PGCONNECT_TIMEOUT=5 momo-db psql -h 127.0.0.1 -U "$db_user" -d "$db_name" -Atc "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" 2>/dev/null | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
if [ -n "$db_user" ] && [ -n "$db_name" ]; then
psql_no_secret() {
  # Run one SQL statement (first argument) with psql inside the momo-db
  # container, never supplying a password (--no-password). stderr is dropped
  # and the function always returns 0, so a failed query just yields empty
  # output for the caller.
  # Reads db_user and db_name from the enclosing script.
  # No single-quote characters here: this function is embedded in a
  # single-quoted remote script.
  if docker exec -e PGCONNECT_TIMEOUT=5 momo-db \
    psql -h 127.0.0.1 -U "$db_user" -d "$db_name" --no-password -Atc "$1" 2>/dev/null; then
    return 0
  fi
  return 0
}
momo_sync=$(psql_no_secret "WITH scope AS (SELECT min(snapshot_date::date) dmin, max(snapshot_date::date) dmax, count(*) sc FROM daily_sales_snapshot WHERE snapshot_date::date >= make_date(extract(year from current_date)::int, extract(month from current_date)::int, 1)), monthly AS (SELECT count(*) mc, min(\"日期\"::date) mmin, max(\"日期\"::date) mmax FROM realtime_sales_monthly, scope WHERE scope.sc > 0 AND \"日期\"::date BETWEEN scope.dmin AND scope.dmax) SELECT coalesce(scope.sc,0)::text || chr(124) || coalesce(monthly.mc,0)::text || chr(124) || coalesce(scope.dmin::text,chr(45)) || chr(124) || coalesce(scope.dmax::text,chr(45)) || chr(124) || coalesce(monthly.mmin::text,chr(45)) || chr(124) || coalesce(monthly.mmax::text,chr(45)) FROM scope, monthly;")
momo_freshness=$(psql_no_secret "SELECT coalesce((current_date - max(snapshot_date::date))::text, chr(45)) || chr(124) || coalesce(max(snapshot_date::date)::text, chr(45)) FROM daily_sales_snapshot;")
momo_import_config=$(psql_no_secret "SELECT config_key || chr(61) || config_value FROM import_config;" | awk -F= "\$1 == \"gdrive_folder_path\" {folder=\$2} \$1 == \"gdrive_file_pattern\" {pattern=\$2} END {if (folder || pattern) print folder \"|\" pattern}" || true)
momo_latest_import_job=$(psql_no_secret "SELECT coalesce(id::text, chr(45)) || chr(124) || coalesce(job_type, chr(45)) || chr(124) || coalesce(status, chr(45)) || chr(124) || coalesce(drive_file_name, chr(45)) || chr(124) || coalesce(replace(created_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(replace(completed_at::text, chr(32), chr(84)), chr(45)) || chr(124) || coalesce(total_rows::text, chr(45)) || chr(124) || coalesce(success_rows::text, chr(45)) || chr(124) || coalesce(error_rows::text, chr(45)) FROM import_jobs ORDER BY created_at DESC LIMIT 20;" | awk "BEGIN {FS=sprintf(\"%c\",124)} \$2 == \"daily_sales\" {print \$1 \"|\" \$3 \"|\" \$4 \"|\" \$5 \"|\" \$6 \"|\" \$7 \"|\" \$8 \"|\" \$9; exit}" || true)
# Probe Google Drive from inside the momo-scheduler container and collect
# MOMO_DRIVE_* evidence lines (intake/archive/failed counts and latest dates).
#
# IMPORTANT: this block is embedded in a single-quoted remote script, so
# neither the shell lines nor the embedded Python may contain a single-quote
# character. The outer shell would strip such quotes, which turns the Python
# into a syntax error and removes the quoting the Drive query needs; the probe
# would then fail silently. Python string literals therefore use double quotes
# only, and the quote characters required by the Drive query are built with
# chr(39).
#
# The script is staged in a mktemp file (falling back to the PID-based name
# when mktemp is unavailable) instead of a predictable path in /tmp.
tmp_drive_probe=$(mktemp "/tmp/awoooi-momo-drive-source-probe.XXXXXX" 2>/dev/null || echo "/tmp/awoooi-momo-drive-source-probe.$$")
cat > "$tmp_drive_probe" <<PYDRIVE
from services.google_drive_service import drive_service
from services.import_service import import_service

# chr(39) is the single-quote character used to quote values in Drive queries.
SQ = chr(39)


def emit(key, value):
    # Print one "KEY value" evidence line; None and empty string become "-".
    if value is None or value == "":
        value = "-"
    print(f"{key} {value}")


folder = import_service.get_config("gdrive_folder_path", "當日業績匯入")
pattern = import_service.get_config("gdrive_file_pattern", "即時業績_當日")
archive = import_service.get_config("gdrive_archive_folder", "當日業績匯入/已匯入")
failed = import_service.get_config("gdrive_failed_folder", "匯入失敗")
intake_files = drive_service.list_files_in_folder(folder, pattern)
archive_files = drive_service.list_files_in_folder(archive, pattern)
failed_files = drive_service.list_files_in_folder(failed, pattern)
emit("MOMO_DRIVE_INTAKE_COUNT", len(intake_files))
emit("MOMO_DRIVE_ARCHIVE_COUNT", len(archive_files))
emit("MOMO_DRIVE_FAILED_COUNT", len(failed_files))
# NOTE(review): takes the first archive entry as the latest one; this assumes
# list_files_in_folder returns newest-first -- confirm against the service.
emit("MOMO_DRIVE_ARCHIVE_LATEST_DATE", (archive_files[0].get("modifiedTime", "")[:10] if archive_files else "-"))
if not drive_service.service:
    drive_service.authenticate()
global_date = "-"
if drive_service.service:
    safe_pattern = drive_service._escape_query_value(pattern)
    # Drive-wide search for the newest spreadsheet whose name matches pattern.
    query = (
        f"name contains {SQ}{safe_pattern}{SQ} and trashed=false and "
        f"(mimeType={SQ}application/vnd.openxmlformats-officedocument.spreadsheetml.sheet{SQ} "
        f"or mimeType={SQ}application/vnd.ms-excel{SQ})"
    )
    results = drive_service.service.files().list(
        q=query,
        spaces="drive",
        fields="files(modifiedTime)",
        orderBy="modifiedTime desc",
        pageSize=1,
    ).execute()
    global_files = results.get("files", [])
    if global_files:
        global_date = global_files[0].get("modifiedTime", "")[:10] or "-"
emit("MOMO_DRIVE_GLOBAL_LATEST_DATE", global_date)
PYDRIVE
# Only MOMO_DRIVE_* lines are kept; any probe failure results in empty output.
momo_drive_source_probe=$(docker exec -i momo-scheduler python - < "$tmp_drive_probe" 2>/dev/null | awk "/^MOMO_DRIVE_/ {print}" || true)
rm -f "$tmp_drive_probe"
else
momo_sync=""
momo_freshness=""
momo_import_config=""
momo_latest_import_job=""
momo_drive_source_probe=""
fi
echo "MOMO_MONTHLY_SYNC ${momo_sync:-unavailable}"
echo "MOMO_DAILY_FRESHNESS ${momo_freshness:-unavailable}"
echo "MOMO_IMPORT_CONFIG ${momo_import_config:-unavailable}"
echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
printf "%s\n" "$momo_drive_source_probe"
' 2>&1); then
echo "$out"
grep -q "CRON_188 active" <<<"$out" && ok "188 cron active" || warn "188 cron not confirmed"
@@ -611,10 +666,36 @@ echo "MOMO_LATEST_IMPORT_JOB ${momo_latest_import_job:-unavailable}"
grep -Fq "MOMO_IMPORT_CONFIG 當日業績匯入|即時業績_當日" <<<"$out" && ok "188 momo Drive import config points to expected daily-sales intake" || fail "188 momo Drive import config drifted from expected daily-sales intake"
awk '/MOMO_LATEST_IMPORT_JOB / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[2] == "completed" && a[6] == a[7] && a[8] == 0)}' <<<"$out" && ok "188 momo latest daily import job completed cleanly" || warn "188 momo latest daily import job not confirmed clean"
awk '/MOMO_MONTHLY_SYNC / {split($2,a,"|"); exit !(a[1] > 0 && a[1] == a[2] && a[3] == a[5] && a[4] == a[6])}' <<<"$out" && ok "188 momo current-month snapshot and realtime tables match" || warn "188 momo current-month snapshot/realtime sync not confirmed"
#######################################
# Decide whether stale daily-sales data is explained by Drive simply having no
# newer source file than the last completed import.
# Reads the collected probe output on stdin and prints 1 when all hold:
#   - MOMO_DRIVE_INTAKE_COUNT and MOMO_DRIVE_FAILED_COUNT are 0,
#   - MOMO_DRIVE_GLOBAL_LATEST_DATE and the completion date of
#     MOMO_LATEST_IMPORT_JOB (5th "|" field, first 10 chars) are YYYY-MM-DD,
#   - the Drive date is not later than the completion date.
# Prints 0 otherwise (including when any of the lines is missing).
#
# Dates are matched with explicit [0-9] runs rather than {n} intervals because
# some awk builds (older mawk) do not support interval expressions. The job
# record is taken from the whole line so a file name containing spaces does
# not truncate the "|"-separated fields.
#######################################
momo_source_stale_only_from_probe() {
  awk '
    function is_date(s) {
      return s ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$/
    }
    $1 == "MOMO_DRIVE_INTAKE_COUNT" { intake = $2 + 0 }
    $1 == "MOMO_DRIVE_FAILED_COUNT" { failed = $2 + 0 }
    $1 == "MOMO_DRIVE_GLOBAL_LATEST_DATE" { global_date = $2 }
    $1 == "MOMO_LATEST_IMPORT_JOB" {
      job = $0
      sub(/^MOMO_LATEST_IMPORT_JOB[ \t]+/, "", job)
      split(job, a, "|")
      completed = substr(a[5], 1, 10)
    }
    END {
      # Appending "" forces a string (lexicographic) comparison of the dates.
      if (intake == 0 && failed == 0 && is_date(global_date) && is_date(completed) && (global_date "") <= (completed "")) print 1;
      else print 0;
    }'
}
momo_source_stale_only=$(momo_source_stale_only_from_probe <<<"$out")
if awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 0 && a[1] <= 2)}' <<<"$out"; then
ok "188 momo daily sales data fresh enough"
elif awk '/MOMO_DAILY_FRESHNESS / {split($2,a,"|"); exit !(a[1] ~ /^[0-9]+$/ && a[1] >= 3)}' <<<"$out"; then
if awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
if [ "$momo_source_stale_only" = "1" ]; then
warn "188 momo daily sales stale but Drive has no newer source candidate"
elif [ -x scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh ]; then
momo_source_preflight_summary="$(
scripts/reboot-recovery/momo-drive-token-source-recovery-preflight.sh \
--host ollama@192.168.0.188 \
--freshness-max-days 2 2>/dev/null \
| awk '/^MOMO_DRIVE_TOKEN_SOURCE_PREFLIGHT / {line=$0} END {print line}' || true
)"
[ -n "$momo_source_preflight_summary" ] && echo "$momo_source_preflight_summary"
if grep -q "BLOCKED=0" <<<"$momo_source_preflight_summary"; then
warn "188 momo daily sales stale but source preflight has no hard blocker"
elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
fail "188 momo source file absent while daily sales data stale"
else
fail "188 momo daily sales data stale beyond 3 days"
fi
elif awk '/MOMO_SOURCE_EMPTY_EVIDENCE_LINES / {exit !($2 > 0)}' <<<"$out"; then
fail "188 momo source file absent while daily sales data stale"
else
fail "188 momo daily sales data stale beyond 3 days"

View File

@@ -306,6 +306,7 @@ drive_archive_count="$(num_for DRIVE_ARCHIVE_COUNT)"
drive_failed_count="$(num_for DRIVE_FAILED_COUNT)"
drive_archive_latest="$(value_for DRIVE_ARCHIVE_LATEST_MODIFIED)"
drive_global_latest="$(value_for DRIVE_GLOBAL_LATEST_MODIFIED)"
drive_global_latest_date="${drive_global_latest:0:10}"
if [[ "$drive_intake_count" -gt 0 ]]; then
ok "Drive daily-sales intake has pending source files: count=$drive_intake_count"
else
@@ -338,13 +339,22 @@ IFS='|' read -r freshness_days latest_daily_date <<<"$freshness"
if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -le "$FRESHNESS_MAX_DAYS" ]]; then
ok "daily sales data freshness is within ${FRESHNESS_MAX_DAYS} days: $freshness"
elif [[ "$freshness_days" =~ ^[0-9]+$ ]]; then
blocked "daily sales data is stale: $freshness"
warn "daily sales data is stale: $freshness"
else
blocked "daily sales freshness is unavailable: ${freshness:-missing}"
fi
latest_job="$(value_for DB_LATEST_DAILY_IMPORT_JOB)"
IFS='|' read -r job_id job_status job_file job_created job_completed job_total job_success job_errors <<<"$latest_job"
# Date part (first 10 characters) of the latest import job completion time.
job_completed_date="${job_completed:0:10}"
# Becomes 1 only when Drive has nothing pending or failed AND the newest
# matching Drive file is not newer than the last import completion date.
source_absent_without_newer_drive=0
if [[ "$drive_intake_count" -eq 0 && "$drive_failed_count" -eq 0 ]]; then
  # Both values must be well-formed YYYY-MM-DD before they are compared.
  if [[ "$drive_global_latest_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ \
    && "$job_completed_date" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
    # "Not later than": string order equals date order for this format.
    if [[ ! "$drive_global_latest_date" > "$job_completed_date" ]]; then
      source_absent_without_newer_drive=1
    fi
  fi
fi
if [[ "$job_id" =~ ^[0-9]+$ && "$job_status" == "completed" && "$job_total" == "$job_success" && "$job_errors" == "0" ]]; then
ok "latest daily import job completed cleanly: id=$job_id file=$job_file"
else
@@ -354,6 +364,8 @@ fi
if [[ "$freshness_days" =~ ^[0-9]+$ && "$freshness_days" -gt "$FRESHNESS_MAX_DAYS" ]]; then
if [[ "$auth_failures" -gt 0 ]]; then
blocked "release blocker is stale business data with active Drive auth/source evidence gate"
elif [[ "$source_absent_without_newer_drive" -eq 1 ]]; then
warn "daily sales data is stale, but Drive has no newer source candidate than the last clean import"
else
blocked "release blocker is stale business data; source evidence must be refreshed"
fi

View File

@@ -306,13 +306,15 @@ check_runner_guardrails() {
local out bad
if ! out=$(ssh_cmd "wooo@192.168.0.110" '
bad=0
for u in awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
for u in awoooi-cd-lane-drain.service awoooi-direct-runner-open.service awoooi-direct-runner.service gitea-act-runner-host.service gitea-act-runner-awoooi-controlled.service gitea-awoooi-controlled-runner.service gitea-act-runner-awoooi-open.service; do
load=$(systemctl show "$u" -p LoadState --value 2>/dev/null || true)
unitfile=$(systemctl show "$u" -p UnitFileState --value 2>/dev/null || true)
active=$(systemctl show "$u" -p ActiveState --value 2>/dev/null || true)
unit_ok=0
if [ "$load" = "masked" ] && [ "$unitfile" = "masked" ] && [ "$active" = "inactive" ]; then
unit_ok=1
elif [ "$u" = "awoooi-cd-lane-drain.service" ] && [ "$load" = "not-found" ] && [ "$active" != "active" ]; then
unit_ok=1
fi
echo "RUNNER_FAILCLOSED_UNIT $u load=$load unitfile=$unitfile active=$active ok=$unit_ok"
[ "$unit_ok" = "1" ] || bad=1
@@ -336,16 +338,21 @@ fi
cd_lane_binary_kind=$(file -b /home/wooo/awoooi-cd-lane/awoooi_cd_lane 2>/dev/null || echo missing)
cd_lane_binary_elf=0
echo "$cd_lane_binary_kind" | grep -qi "ELF" && cd_lane_binary_elf=1
cd_lane_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane/awoooi_cd_lane|^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
cd_lane_ok=0
cd_lane_mode=blocked
if [ "$cd_lane_active" = "inactive" ] && echo "$cd_lane_execstart" | grep -q "/bin/false" && [ "$cd_lane_binary_elf" = "0" ]; then
if [ "$cd_lane_active" = "inactive" ] \
&& [ "$cd_lane_sentinel" = "missing" ] \
&& [ "$cd_lane_binary_elf" = "0" ] \
&& [ "$cd_lane_process_count" = "0" ] \
&& { { [ "$cd_lane_load" = "masked" ] && [ "$cd_lane_unitfile" = "masked" ]; } || echo "$cd_lane_execstart" | grep -q "/bin/false"; }; then
cd_lane_ok=1
cd_lane_mode=failclosed
elif [ "$cd_lane_sentinel" = "present" ] && [ "$cd_lane_active" = "active" ] && [ "$cd_lane_capacity_ok" = "1" ] && [ "$cd_lane_labels_ok" = "1" ] && [ "$cd_lane_binary_elf" = "1" ]; then
cd_lane_ok=1
cd_lane_mode=controlled_open
fi
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf ok=$cd_lane_ok"
echo "CD_LANE_CONTROLLED mode=$cd_lane_mode load=$cd_lane_load unitfile=$cd_lane_unitfile active=$cd_lane_active sentinel=$cd_lane_sentinel capacity=$cd_lane_capacity_ok labels=$cd_lane_labels_ok binary_elf=$cd_lane_binary_elf process_count=$cd_lane_process_count ok=$cd_lane_ok"
cd_lane_drain_load=$(systemctl show awoooi-cd-lane-drain.service -p LoadState --value 2>/dev/null || true)
cd_lane_drain_unitfile=$(systemctl show awoooi-cd-lane-drain.service -p UnitFileState --value 2>/dev/null || true)
cd_lane_drain_active=$(systemctl show awoooi-cd-lane-drain.service -p ActiveState --value 2>/dev/null || true)
@@ -362,18 +369,19 @@ fi
# Drain lane verdict. cd_lane_drain_capacity_ok and cd_lane_drain_labels_ok are assigned before this excerpt.
# Describe the drain binary with file -b; the word missing is substituted only if file exits non-zero.
cd_lane_drain_binary_kind=$(file -b /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled 2>/dev/null || echo missing)
# Flag becomes 1 only when that description mentions ELF (case-insensitive match).
cd_lane_drain_binary_elf=0
echo "$cd_lane_drain_binary_kind" | grep -qi "ELF" && cd_lane_drain_binary_elf=1
# Count processes whose full command line starts with the drain binary path.
cd_lane_drain_process_count=$(pgrep -f "^/home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled" 2>/dev/null | wc -l | tr -d " ")
# Defaults before the guards run.
cd_lane_drain_ok=0
cd_lane_drain_mode=absent
# Any sign of the unit (loaded, enabled or active) moves the mode from absent to blocked.
if [ "$cd_lane_drain_load" = "loaded" ] || [ "$cd_lane_drain_unitfile" = "enabled" ] || [ "$cd_lane_drain_active" = "active" ]; then
cd_lane_drain_mode=blocked
fi
# NOTE(review): the next two if openers share the single fi below, and the mode is
# assigned twice inside the body - presumably a superseded guard and its replacement
# side by side; confirm which one is live.
# Guard A: unit active, capacity and labels flags at 1, ELF binary.
if [ "$cd_lane_drain_active" = "active" ] && [ "$cd_lane_drain_capacity_ok" = "1" ] && [ "$cd_lane_drain_labels_ok" = "1" ] && [ "$cd_lane_drain_binary_elf" = "1" ]; then
cd_lane_drain_mode=blocked
# Guard B: unit not active, binary not ELF, zero matching processes, and the unit is
# either not-found or masked in both LoadState and UnitFileState.
if [ "$cd_lane_drain_active" != "active" ] \
&& [ "$cd_lane_drain_binary_elf" = "0" ] \
&& [ "$cd_lane_drain_process_count" = "0" ] \
&& { [ "$cd_lane_drain_load" = "not-found" ] || { [ "$cd_lane_drain_load" = "masked" ] && [ "$cd_lane_drain_unitfile" = "masked" ]; }; }; then
cd_lane_drain_ok=1
# As written the second assignment wins, so the mode ends up as failclosed.
cd_lane_drain_mode=controlled_open
cd_lane_drain_mode=failclosed
fi
# Emit the verdict as one key=value line.
# NOTE(review): two echo lines follow; the second also reports process_count.
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf ok=$cd_lane_drain_ok"
echo "CD_LANE_DRAIN_CONTROLLED mode=$cd_lane_drain_mode load=$cd_lane_drain_load unitfile=$cd_lane_drain_unitfile active=$cd_lane_drain_active capacity=$cd_lane_drain_capacity_ok labels=$cd_lane_drain_labels_ok binary_elf=$cd_lane_drain_binary_elf process_count=$cd_lane_drain_process_count ok=$cd_lane_drain_ok"
# Combine the primary and drain lane verdicts into one guardrail flag and print it.
cd_lane_guard_ok=0
# NOTE(review): an OR form and an AND form of the condition follow with a single fi
# visible. The AND form passes only when both verdicts are 1; the OR form passes when
# either is. Presumably one supersedes the other; confirm which one is live.
if [ "$cd_lane_ok" = "1" ] || [ "$cd_lane_drain_ok" = "1" ]; then
if [ "$cd_lane_ok" = "1" ] && [ "$cd_lane_drain_ok" = "1" ]; then
cd_lane_guard_ok=1
fi
echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
@@ -381,7 +389,7 @@ echo "CD_LANE_GUARDRAILS_OK $cd_lane_guard_ok"
# Count processes whose full command line starts with one of the three listed runner binary paths.
direct_runner_count=$(pgrep -f "^/home/wooo/act-runner/act_runner|^/home/wooo/act-runner-controlled/act_runner|^/home/wooo/awoooi-controlled-runner/awoooi_controlled_runner" 2>/dev/null | wc -l | tr -d " ")
echo "RUNNER_DIRECT_PROCESS_COUNT $direct_runner_count"
# Any such process marks the overall check as bad.
[ "$direct_runner_count" = "0" ] || bad=1
# NOTE(review): two for headers follow; the second list additionally starts with the
# cd-lane-drain binary path. The end of the loop is outside this excerpt.
for p in /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
for p in /home/wooo/awoooi-cd-lane-drain/awoooi_cd_lane_controlled /home/wooo/act-runner/act_runner /home/wooo/act-runner/act_runner.real-20260628-runner-pressure-guard /home/wooo/act-runner-controlled/act_runner /home/wooo/awoooi-controlled-runner/awoooi_controlled_runner; do
# Report what file -b says about each path; a description mentioning ELF sets bad=1.
kind=$(file -b "$p" 2>/dev/null || echo missing)
echo "RUNNER_FAILCLOSED_BINARY $p kind=$kind"
echo "$kind" | grep -qi "ELF" && bad=1
@@ -446,13 +454,22 @@ echo "ollama-systemd $(systemctl is-active ollama 2>/dev/null || true)"
# Print the HTTP status code of the local Ollama tags endpoint (5 second cap); a curl failure is tolerated.
echo "ollama-api $(curl -s -o /dev/null -w "%{http_code}" --max-time 5 http://127.0.0.1:11434/api/tags || true)"
# For each container print its state and, when a health check is defined, the health status.
# Inspect errors are silenced and tolerated.
docker inspect -f "momo-scheduler {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" momo-scheduler 2>/dev/null || true
docker inspect -f "litellm {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" litellm 2>/dev/null || true
# Emit a not-provisioned marker when docker inspect finds no litellm object and /opt/litellm is not a directory.
if ! docker inspect litellm >/dev/null 2>&1 && [ ! -d /opt/litellm ]; then
echo "litellm not-provisioned"
fi
docker inspect -f "signoz-clickhouse {{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{end}}" signoz-clickhouse 2>/dev/null || true
' 2>&1); then
# Replay the captured probe output so it is visible in this run.
echo "$out"
# Gate each required service on its marker line in the probe output.
# Written as if/else rather than A && B || C so that blocked runs only when the
# marker is absent, never as a side effect of ok returning non-zero.
if grep -q "ollama-systemd active" <<<"$out"; then
  ok "188 Ollama systemd active"
else
  blocked "188 Ollama systemd inactive"
fi
if grep -q "ollama-api 200" <<<"$out"; then
  ok "188 Ollama API reachable"
else
  blocked "188 Ollama API not reachable"
fi
if grep -q "momo-scheduler running healthy" <<<"$out"; then
  ok "188 momo-scheduler healthy"
else
  blocked "188 momo-scheduler not healthy"
fi
# litellm gate, one-line form: ok when a litellm running line is present, otherwise blocked.
grep -Eq "litellm running( |$)" <<<"$out" && ok "188 litellm running" || blocked "188 litellm not running"
# litellm gate, three-way form: running is ok, the not-provisioned marker only warns,
# anything else is blocked.
# NOTE(review): both forms are present back to back, so litellm is reported twice as
# written - presumably the one-line form is superseded; confirm.
if grep -Eq "litellm running( |$)" <<<"$out"; then
ok "188 litellm running"
elif grep -q "litellm not-provisioned" <<<"$out"; then
warn "188 litellm not provisioned; provider route/cost switch requires separate approval"
else
blocked "188 litellm not running"
fi
# SignOz ClickHouse health is advisory: a missing healthy marker only warns.
# Written as if/else so warn cannot fire as a side effect of ok returning non-zero.
if grep -q "signoz-clickhouse running healthy" <<<"$out"; then
  ok "188 SignOz ClickHouse healthy"
else
  warn "188 SignOz ClickHouse health not confirmed"
fi
else
blocked "188 high-load service check unavailable"