Files
awoooi/.gitea/workflows/harbor-110-local-repair.yaml
ogt 8af9789dec
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 20s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
fix(ci): harden harbor repair ssh timeouts
2026-07-01 16:32:08 +08:00

248 lines
11 KiB
YAML

# AWOOOI Harbor 110 Local Repair
#
# Controlled runtime:
# - workflow_dispatch + hourly low-frequency schedule only
# - no push / pull_request / pull_request_target trigger
# - runs on the non-110 controlled host lane, then reaches 110 only through a
# bounded SSH control channel
# - no secret read, no Docker daemon restart, no reboot, no node drain
# - runs the existing bounded recovery script, then verifies local and public /v2/
# Workflow display name.
name: 'AWOOOI Harbor 110 Local Repair'
# Triggers: manual dispatch, plus a cron schedule that fires at minute 17 of
# every hour. No push or pull_request triggers are declared.
on:
  workflow_dispatch:
  schedule:
    - cron: '17 * * * *'
# Workflow-level environment shared by every job and step below.
env:
  # Switch compared against '1' by the harbor-110-local-repair job's `if:`.
  AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED: '1'
  # Address the remote host must list in `hostname -I` before repair proceeds.
  AWOOOI_110_EXPECTED_HOST_IP: '192.168.0.110'
  # user@host passed to ssh for the control channel.
  AWOOOI_110_SSH_TARGET: 'wooo@192.168.0.110'
  # ssh ConnectTimeout, in seconds.
  AWOOOI_110_SSH_CONNECT_TIMEOUT_SECONDS: '3'
  # Duration handed to `timeout` for each probe command run over ssh.
  AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS: '12'
  # Event that started the run; the repair script uses it as a fallback when
  # GITHUB_EVENT_NAME is not set.
  AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER: '${{ github.event_name }}'
jobs:
  workflow-shape:
    # Unconditional no-op job on the non-110 runner, so the workflow still has
    # a job to run when the 110-local repair job is switched off via env.
    runs-on: awoooi-non110-host
    timeout-minutes: 1
    steps:
      - name: Confirm controlled Harbor repair workflow shape
        # The message names both declared triggers (manual dispatch and the
        # hourly cron); the earlier wording claimed workflow_dispatch only.
        run: echo "harbor 110 local repair workflow is workflow_dispatch + hourly schedule only."
harbor-110-local-repair:
  # Runs only while the workflow-level enable switch is '1'.
  # NOTE(review): GitHub Actions does not expose the `env` context to a
  # job-level `if:`; this relies on the Gitea runner evaluating it - confirm.
  if: ${{ env.AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED == '1' }}
  runs-on: awoooi-non110-host
  timeout-minutes: 15
  steps:
    - uses: actions/checkout@v4
    - name: Validate repair workflow boundaries
      run: |
        set -euo pipefail
        # Repository guard script, then a whitespace/conflict-marker check.
        python3 ops/runner/guard-gitea-runner-pressure.py --root .
        git diff --check
        # Declare the operation boundaries as key=value log lines.
        printf '%s\n' \
          "operation_boundary_secret_value_read=false" \
          "operation_boundary_docker_daemon_restart_performed=false" \
          "operation_boundary_host_reboot_performed=false" \
          "operation_boundary_node_drain_performed=false" \
          "operation_boundary_remote_ssh_bounded=true"
- name: Run 110 remote control path and Harbor repair
  run: |
    set -euo pipefail
    # The whole step drives 110 over ssh; without a client, report BLOCKED
    # and stop with exit code 65.
    command -v ssh >/dev/null 2>&1 || {
      echo "BLOCKED ssh_command_missing_for_110_control_channel"
      exit 65
    }
# Client options for the control channel: non-interactive, public-key only,
# a single connection attempt, and short connect/keep-alive limits.
ssh_client_options=(
  -n
  -T
  -o BatchMode=yes
  -o StdinNull=yes
  -o RequestTTY=no
  -o PreferredAuthentications=publickey
  -o PasswordAuthentication=no
  -o KbdInteractiveAuthentication=no
  -o GSSAPIAuthentication=no
  -o NumberOfPasswordPrompts=0
  -o ConnectionAttempts=1
  -o ConnectTimeout="${AWOOOI_110_SSH_CONNECT_TIMEOUT_SECONDS}"
  -o ServerAliveInterval=3
  -o ServerAliveCountMax=1
)
# Full argv prefix (ssh, options, target); callers append the remote command.
ssh_base=(ssh "${ssh_client_options[@]}" "${AWOOOI_110_SSH_TARGET}")

# Retry policy for probe commands; both values can be overridden from the
# environment and default to 6 attempts with 10 seconds between them.
SSH_PROBE_ATTEMPTS="${AWOOOI_110_SSH_PROBE_ATTEMPTS:-6}"
SSH_PROBE_SLEEP_SECONDS="${AWOOOI_110_SSH_PROBE_SLEEP_SECONDS:-10}"
# run_ssh REMOTE_COMMAND...
#
# Runs REMOTE_COMMAND over the control channel (ssh_base) under `timeout`,
# retrying up to SSH_PROBE_ATTEMPTS times and sleeping SSH_PROBE_SLEEP_SECONDS
# between attempts.
#
# Output:  stdout carries only what the remote command printed, so callers can
#          capture it with $(run_ssh ...). The per-attempt
#          harbor_110_remote_ssh_probe_attempt markers go to stderr; they used
#          to share stdout and ended up inside captured values.
# Returns: 0 on the first successful attempt, otherwise the exit status of the
#          last failed attempt (1 if no attempt was made).
run_ssh() {
  local attempt=1
  local rc=1
  while [ "${attempt}" -le "${SSH_PROBE_ATTEMPTS}" ]; do
    if timeout --foreground --kill-after=3s "${AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS}" "${ssh_base[@]}" "$@"; then
      echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=success" >&2
      return 0
    else
      # In the else branch $? still holds the status of the timeout/ssh command.
      rc=$?
    fi
    echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=failure rc=${rc}" >&2
    # No sleep after the final attempt.
    if [ "${attempt}" -lt "${SSH_PROBE_ATTEMPTS}" ]; then
      sleep "${SSH_PROBE_SLEEP_SECONDS}"
    fi
    attempt=$((attempt + 1))
  done
  return "${rc}"
}
# _harbor_diag_flag KEY PATTERN TEXT
# Prints "KEY=true" when extended-regex PATTERN occurs in TEXT, else
# "KEY=false". TEXT is fed through a here-string rather than a pipe so that
# `pipefail` cannot turn an early `grep -q` exit into a false negative.
_harbor_diag_flag() {
  local key="$1" pattern="$2" text="$3"
  if grep -Eq -- "${pattern}" <<<"${text}"; then
    echo "${key}=true"
  else
    echo "${key}=false"
  fi
}

# diagnose_ssh_control_channel
#
# Makes one verbose (-vvv, IPv4-only) ssh attempt that runs `true` on the
# target, bounded to 20 seconds, and prints key=value flags derived from the
# debug log: TCP connect, banner, userauth service accept, key offered, key
# accepted followed by a timeout, stalled public-key reply, permission denied.
# The raw debug log is never printed.
#
# The attempt reuses ssh_base (client path, options and target) so the
# diagnosis always exercises the same options as the real control channel.
# The function reports only; it always returns 0.
diagnose_ssh_control_channel() {
  local diag_log
  local diag_rc=0
  local accepts_key=false
  local reply_pending=false
  local timed_out=false

  # `|| diag_rc=$?` records a failure without toggling errexit for the caller.
  diag_log="$(
    timeout --foreground --kill-after=3s 20 \
      "${ssh_base[0]}" -vvv -4 "${ssh_base[@]:1}" 'true' 2>&1
  )" || diag_rc=$?
  echo "harbor_110_remote_ssh_diag_rc=${diag_rc}"

  _harbor_diag_flag harbor_110_remote_ssh_tcp_connected 'Connection established.' "${diag_log}"
  _harbor_diag_flag harbor_110_remote_ssh_banner_seen 'Remote protocol version' "${diag_log}"
  _harbor_diag_flag harbor_110_remote_ssh_userauth_service_accept_seen 'SSH2_MSG_SERVICE_ACCEPT received' "${diag_log}"
  _harbor_diag_flag harbor_110_remote_ssh_publickey_offered 'Offering public key:' "${diag_log}"

  # Facts shared by the two composite classifications below.
  if grep -q 'Server accepts key' <<<"${diag_log}"; then
    accepts_key=true
  fi
  if grep -q 'we sent a publickey packet, wait for reply' <<<"${diag_log}"; then
    reply_pending=true
  fi
  if grep -Eq 'timed out|Timeout, server' <<<"${diag_log}"; then
    timed_out=true
  fi

  # The server accepted the key, yet the session still timed out.
  if [ "${accepts_key}" = true ] && [ "${timed_out}" = true ]; then
    echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true"
    echo "SSH_AUTH user=wooo mode=publickey rc=${diag_rc} classification=server_accepts_key_then_timeout"
  else
    echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=false"
  fi

  # A public-key packet was sent (or accepted) and a timeout followed.
  if { [ "${reply_pending}" = true ] || [ "${accepts_key}" = true ]; } \
    && [ "${timed_out}" = true ]; then
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=true"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=true"
    echo "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=${AWOOOI_110_SSH_TARGET}"
  else
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=false"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=false"
  fi

  _harbor_diag_flag harbor_110_remote_ssh_auth_permission_denied 'Permission denied' "${diag_log}"
  echo "harbor_110_remote_ssh_diag_raw_log_printed=false"
}
# Probe 1: identity. The remote side prints its hostname and user, then
# succeeds only if `hostname -I` lists the expected address. The match uses
# grep -F so the dots in the address are literal rather than regex wildcards.
# The backslash-newlines inside the double quotes are line continuations, so
# the remote shell still receives a single-line command.
identity_probe="expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; \
printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; \
printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; \
hostname -I 2>/dev/null | tr ' ' '\n' | grep -qxF \"\${expected_host_ip}\""
if ! run_ssh "${identity_probe}"; then
  # Diagnosis is best-effort; its own failure must not mask the BLOCKED exit.
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
  echo "harbor_110_remote_ssh_reachable=false"
  exit 65
fi
echo "harbor_110_remote_ssh_reachable=true"

# Probe 2: the recovery script must exist on 110 and be executable.
recovery_script=/usr/local/bin/recover-110-control-path-and-harbor-local.sh
if ! run_ssh "test -x ${recovery_script}"; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_local_recovery_script_missing path=${recovery_script}"
  exit 65
fi
# run_recovery ARGS...
# Invokes the recovery script on 110 through the control channel, via
# non-interactive sudo, with TARGET_USER, RELOAD_SSH and the expected host IP
# passed in its environment. ARGS are appended to the script's command line.
# The whole call is bounded to 240 seconds.
run_recovery() {
  local -a remote_argv=(
    sudo -n env
    TARGET_USER=wooo
    RELOAD_SSH=0
    "AWOOOI_110_EXPECTED_HOST_IP=${AWOOOI_110_EXPECTED_HOST_IP}"
    /usr/local/bin/recover-110-control-path-and-harbor-local.sh
    "$@"
  )
  timeout --foreground --kill-after=5s 240 "${ssh_base[@]}" "${remote_argv[@]}"
}
# Phase 1: read-only check. Its combined stdout/stderr is echoed to the log
# and kept for the readiness test below. `|| check_rc=$?` records a failure
# without switching errexit off and on around the call.
echo "harbor_110_remote_repair_check_start=1"
check_rc=0
check_output="$(run_recovery --check 2>&1)" || check_rc=$?
printf '%s\n' "${check_output}"
echo "harbor_110_remote_repair_check_rc=${check_rc}"
if [ "${check_rc}" -ne 0 ]; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_repair_check_unavailable rc=${check_rc}"
  exit 65
fi

# Non-manual runs stop here when the check already reports Harbor as ready;
# a manual workflow_dispatch always goes on to apply.
# The here-string replaces `printf | grep -q`: under pipefail an early grep
# exit can fail the pipeline and make a ready registry look not ready.
trigger="${GITHUB_EVENT_NAME:-${AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER:-unknown}}"
echo "harbor_110_remote_repair_trigger=${trigger}"
if [ "${trigger}" != "workflow_dispatch" ] \
  && grep -qF "harbor_ready=true" <<<"${check_output}"; then
  echo "harbor_110_remote_repair_skipped=already_ready"
  exit 0
fi

# Phase 2: apply. A failure is diagnosed and the script's own exit status is
# propagated as the step's exit status.
echo "harbor_110_remote_repair_apply_all_start=1"
apply_rc=0
apply_output="$(run_recovery --apply-all 2>&1)" || apply_rc=$?
printf '%s\n' "${apply_output}"
echo "harbor_110_remote_repair_apply_all_rc=${apply_rc}"
if [ "${apply_rc}" -ne 0 ]; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_repair_apply_all_failed rc=${apply_rc}"
  exit "${apply_rc}"
fi
# Post-repair verification: /v2/ must answer with HTTP 200 or 401 both on 110
# itself (127.0.0.1:5000) and on the public registry endpoint.
#
# The remote curl prints the status code on a line of its own and the code is
# then picked out of the captured text. Taking the capture verbatim is unsafe:
# anything else run_ssh writes to stdout (its probe markers) gets glued onto
# the code, so it never matches 200|401.
# `|| true` on the capture lets an unreachable channel fall through to the
# explicit BLOCKED status=000 report instead of aborting under errexit.
local_probe_output="$(
  run_ssh 'curl --silent --show-error --output /dev/null --write-out "%{http_code}\n" --max-time 10 http://127.0.0.1:5000/v2/ || true'
)" || true
status_line_re='^[0-9]{3}$'
local_status=""
while IFS= read -r probe_line; do
  # Keep the last line that is exactly a three-digit HTTP status.
  if [[ "${probe_line}" =~ ${status_line_re} ]]; then
    local_status="${probe_line}"
  fi
done <<<"${local_probe_output}"

public_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 https://registry.wooo.work/v2/ || true)"

# An empty result is reported as 000.
[ -n "${local_status}" ] || local_status="000"
[ -n "${public_status}" ] || public_status="000"
echo "harbor_110_remote_local_v2_http_status=${local_status}"
echo "harbor_public_registry_v2_http_status=${public_status}"

case "${local_status}" in
  200|401) ;;
  *)
    echo "BLOCKED harbor_110_remote_local_registry_v2_unavailable status=${local_status}"
    exit 1
    ;;
esac
case "${public_status}" in
  200|401) ;;
  *)
    echo "BLOCKED harbor_public_registry_v2_unavailable status=${public_status}"
    exit 1
    ;;
esac
echo "harbor_110_remote_repair_verified=true"