Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Failing after 20s
CD Pipeline / build-and-deploy (push) Has been skipped
CD Pipeline / post-deploy-checks (push) Has been skipped
248 lines
11 KiB
YAML
248 lines
11 KiB
YAML
# AWOOOI Harbor 110 Local Repair
#
# Controlled runtime:
# - workflow_dispatch + hourly low-frequency schedule only
# - no push / pull_request / pull_request_target trigger
# - runs on the non-110 controlled host lane, then reaches 110 only through a
#   bounded SSH control channel
# - no secret read, no Docker daemon restart, no reboot, no node drain
# - runs the existing bounded recovery script, then verifies local and public /v2/
|
|
|
|
# Display name of this workflow.
name: AWOOOI Harbor 110 Local Repair

# Triggers: manual dispatch, plus a scheduled run at minute 17 of every hour.
# No push or pull-request events are listed.
on:
  workflow_dispatch:
  schedule:
    - cron: '17 * * * *'
|
|
|
|
# Workflow-level environment; values are strings.
env:
  # Gate compared against '1' by the harbor-110-local-repair job's `if:`.
  AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED: '1'
  # Address the SSH target must list in `hostname -I`; also forwarded to the
  # recovery script.
  AWOOOI_110_EXPECTED_HOST_IP: '192.168.0.110'
  # user@host destination used by every ssh invocation in the repair step.
  AWOOOI_110_SSH_TARGET: 'wooo@192.168.0.110'
  # Seconds passed to ssh as ConnectTimeout.
  AWOOOI_110_SSH_CONNECT_TIMEOUT_SECONDS: '3'
  # Seconds given to `timeout` around each retried ssh command.
  AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS: '12'
  # Event name captured at workflow level; the repair step falls back to it
  # when GITHUB_EVENT_NAME is unset.
  AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER: ${{ github.event_name }}
|
|
|
|
jobs:
|
|
workflow-shape:
  # Keep one no-op non-110 root job so the workflow is always parseable even
  # if the 110-local repair lane is disabled by env.
  runs-on: awoooi-non110-host
  timeout-minutes: 1
  steps:
    - name: Confirm controlled Harbor repair workflow shape
      # The message names both triggers declared under `on:` (manual dispatch
      # and the hourly cron); it previously claimed workflow_dispatch only.
      run: echo "harbor 110 local repair workflow is workflow_dispatch + hourly schedule only."
|
|
|
|
# Repair lane. Runs on the same non-110 runner label as the no-op job and is
# capped at 15 minutes.
harbor-110-local-repair:
  # Enabled only while AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED is "1".
  # NOTE(review): GitHub Actions does not expose the `env` context to a
  # job-level `if:`; confirm the runner used here evaluates this expression
  # as intended.
  if: ${{ env.AWOOOI_HARBOR_110_LOCAL_REPAIR_ENABLED == '1' }}
  runs-on: awoooi-non110-host
  timeout-minutes: 15
  steps:
    # The next step runs a guard script from the repository, so check it out.
    - uses: actions/checkout@v4
|
|
|
|
# Pre-flight on the runner: run the repository guard script and a whitespace
# check, then log the operation-boundary markers.
- name: Validate repair workflow boundaries
  run: |
    set -euo pipefail
    python3 ops/runner/guard-gitea-runner-pressure.py --root .
    git diff --check
    # Declarative log markers only: nothing in this step verifies them.
    printf '%s\n' \
      "operation_boundary_secret_value_read=false" \
      "operation_boundary_docker_daemon_restart_performed=false" \
      "operation_boundary_host_reboot_performed=false" \
      "operation_boundary_node_drain_performed=false" \
      "operation_boundary_remote_ssh_bounded=true"
|
|
|
|
# Reaches host 110 over SSH from the runner: probes the control channel, runs
# the recovery script in --check and (when needed) --apply-all mode, then
# verifies the local and public registry /v2/ endpoints.
- name: Run 110 remote control path and Harbor repair
  run: |
|
|
# Abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

# The control channel needs an ssh client on the runner; exit 65 without one.
command -v ssh >/dev/null 2>&1 || {
  echo "BLOCKED ssh_command_missing_for_110_control_channel"
  exit 65
}
|
|
|
|
# Base argv for every non-interactive ssh call to host 110: no stdin, no TTY,
# public-key authentication only, no prompts, a single connection attempt.
# Each option becomes its own "-o <option>" pair, in the order listed.
# NOTE(review): StdinNull is not recognised by older OpenSSH clients; confirm
# the runner's ssh supports it.
ssh_base=(ssh -n -T)
for ssh_option in \
  BatchMode=yes \
  StdinNull=yes \
  RequestTTY=no \
  PreferredAuthentications=publickey \
  PasswordAuthentication=no \
  KbdInteractiveAuthentication=no \
  GSSAPIAuthentication=no \
  NumberOfPasswordPrompts=0 \
  ConnectionAttempts=1 \
  "ConnectTimeout=${AWOOOI_110_SSH_CONNECT_TIMEOUT_SECONDS}" \
  ServerAliveInterval=3 \
  ServerAliveCountMax=1
do
  ssh_base+=(-o "${ssh_option}")
done
unset ssh_option
# The destination is the last element, so callers append the remote command.
ssh_base+=("${AWOOOI_110_SSH_TARGET}")

# Retry budget for run_ssh; both values can be overridden through the
# environment and default to 6 attempts spaced 10 seconds apart.
SSH_PROBE_SLEEP_SECONDS="${AWOOOI_110_SSH_PROBE_SLEEP_SECONDS:-10}"
SSH_PROBE_ATTEMPTS="${AWOOOI_110_SSH_PROBE_ATTEMPTS:-6}"
|
|
|
|
# Runs a remote command over the base ssh invocation, retrying on any failure.
#
# Arguments: the remote command, appended after the ssh destination.
# Reads:     ssh_base, SSH_PROBE_ATTEMPTS, SSH_PROBE_SLEEP_SECONDS,
#            AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS.
# Returns:   0 on the first successful attempt, otherwise the exit status of
#            the last failed attempt (1 if no attempt was made).
# Output:    one "harbor_110_remote_ssh_probe_attempt=..." line per attempt,
#            written to stdout - a caller that captures this function with
#            $(...) receives those lines together with the remote output.
run_ssh() {
  local rc=1
  local attempt=1
  while [ "${attempt}" -le "${SSH_PROBE_ATTEMPTS}" ]; do
    rc=0
    # Each attempt is bounded by `timeout`; a hung ssh is killed 3s after TERM.
    timeout --foreground --kill-after=3s "${AWOOOI_110_SSH_COMMAND_TIMEOUT_SECONDS}" "${ssh_base[@]}" "$@" || rc=$?
    if [ "${rc}" -eq 0 ]; then
      echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=success"
      return 0
    fi
    echo "harbor_110_remote_ssh_probe_attempt=${attempt} result=failure rc=${rc}"
    # No pause after the final attempt.
    if [ "${attempt}" -lt "${SSH_PROBE_ATTEMPTS}" ]; then
      sleep "${SSH_PROBE_SLEEP_SECONDS}"
    fi
    attempt=$((attempt + 1))
  done
  return "${rc}"
}
|
|
|
|
# Succeeds when the captured ssh debug log (diag_output) matches the
# basic-regex pattern in $1. A here-string is used instead of
# `printf ... | grep -q`: under `set -o pipefail`, grep -q exiting on its first
# match can kill the writer with SIGPIPE and turn a match into a failure.
_ssh_diag_has() {
  grep -q -- "$1" <<<"${diag_output}"
}

# Succeeds when the captured ssh debug log mentions a client or server timeout.
_ssh_diag_timed_out() {
  grep -Eq -- "timed out|Timeout, server" <<<"${diag_output}"
}

# Prints "<key>=true" or "<key>=false" depending on whether the captured log
# matches a pattern.
#   $1  log key
#   $2  basic-regex pattern
_ssh_diag_report() {
  if _ssh_diag_has "$2"; then
    echo "$1=true"
  else
    echo "$1=false"
  fi
}

# Runs one verbose (-vvv), IPv4-only ssh probe (`true`) against the target and
# prints a classification of how far the connection got. The raw debug log is
# never printed; only the derived key=value lines are.
#
# Reads:   ssh_base (everything after its leading "ssh" element is reused, so
#          the probe uses the same options and destination as real commands).
# Sets:    diag_output (captured stdout+stderr) and diag_rc (probe exit
#          status) as globals.
# Returns: the status of the final echo; the probe's own failure never aborts
#          the caller, and shell options are not toggled.
diagnose_ssh_control_channel() {
  diag_rc=0
  diag_output="$(
    timeout --foreground --kill-after=3s 20 \
      ssh -vvv -4 "${ssh_base[@]:1}" 'true' 2>&1
  )" || diag_rc=$?

  echo "harbor_110_remote_ssh_diag_rc=${diag_rc}"
  _ssh_diag_report "harbor_110_remote_ssh_tcp_connected" "Connection established."
  _ssh_diag_report "harbor_110_remote_ssh_banner_seen" "Remote protocol version"
  _ssh_diag_report "harbor_110_remote_ssh_userauth_service_accept_seen" "SSH2_MSG_SERVICE_ACCEPT received"
  _ssh_diag_report "harbor_110_remote_ssh_publickey_offered" "Offering public key:"

  # Key accepted by the server, but the session then timed out.
  if _ssh_diag_has "Server accepts key" && _ssh_diag_timed_out; then
    echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true"
    echo "SSH_AUTH user=wooo mode=publickey rc=${diag_rc} classification=server_accepts_key_then_timeout"
  else
    echo "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=false"
  fi

  # Public-key exchange started (packet sent or key accepted) and then stalled.
  if { _ssh_diag_has "we sent a publickey packet, wait for reply" \
      || _ssh_diag_has "Server accepts key"; } \
    && _ssh_diag_timed_out; then
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=true"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=true"
    echo "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled target=${AWOOOI_110_SSH_TARGET}"
  else
    echo "harbor_110_remote_ssh_publickey_reply_timeout_seen=false"
    echo "harbor_110_remote_ssh_publickey_auth_stalled=false"
  fi

  _ssh_diag_report "harbor_110_remote_ssh_auth_permission_denied" "Permission denied"
  echo "harbor_110_remote_ssh_diag_raw_log_printed=false"
}
|
|
|
|
# Preflight 1: the target must answer over ssh and list the expected address
# in `hostname -I`. The remote command also prints the remote host and user
# names; its exit status is that of the final grep.
remote_identity_probe="expected_host_ip=${AWOOOI_110_EXPECTED_HOST_IP}; printf 'remote_host=%s\n' \"\$(hostname 2>/dev/null || echo unknown)\"; printf 'remote_user=%s\n' \"\$(id -un 2>/dev/null || echo unknown)\"; hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx \"\${expected_host_ip}\""
if run_ssh "${remote_identity_probe}"; then
  echo "harbor_110_remote_ssh_reachable=true"
else
  # Classify the failure before giving up; diagnostics are best-effort.
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_control_channel_unavailable target=${AWOOOI_110_SSH_TARGET}"
  echo "harbor_110_remote_ssh_reachable=false"
  exit 65
fi

# Preflight 2: the recovery script must exist on the target and be executable.
recovery_script_path=/usr/local/bin/recover-110-control-path-and-harbor-local.sh
if ! run_ssh "test -x ${recovery_script_path}"; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_local_recovery_script_missing path=${recovery_script_path}"
  exit 65
fi
|
|
|
|
# Invokes the recovery script on the target through `sudo -n env ...`.
#
# Arguments: flags forwarded to the recovery script (the callers in this step
#            pass --check and --apply-all).
# Reads:     ssh_base, AWOOOI_110_EXPECTED_HOST_IP.
# Returns:   the exit status of `timeout`/ssh. The call is bounded to 240
#            seconds (KILL 5s after TERM) and, unlike run_ssh, is not retried.
run_recovery() {
  local -a remote_cmd=(
    sudo -n env
    TARGET_USER=wooo
    RELOAD_SSH=0
    "AWOOOI_110_EXPECTED_HOST_IP=${AWOOOI_110_EXPECTED_HOST_IP}"
    /usr/local/bin/recover-110-control-path-and-harbor-local.sh
    "$@"
  )
  timeout --foreground --kill-after=5s 240 "${ssh_base[@]}" "${remote_cmd[@]}"
}
|
|
|
|
# Phase 1: run the recovery script in --check mode. Its combined output is kept
# in check_output (the skip decision further down reads it) and replayed to
# the log; a non-zero status ends the step with exit 65.
echo "harbor_110_remote_repair_check_start=1"
check_rc=0
# `|| check_rc=$?` records the failure without tripping errexit.
check_output="$(run_recovery --check 2>&1)" || check_rc=$?
printf '%s\n' "${check_output}"
echo "harbor_110_remote_repair_check_rc=${check_rc}"
if [ "${check_rc}" -ne 0 ]; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_repair_check_unavailable rc=${check_rc}"
  exit 65
fi
|
|
|
|
# Decide whether a repair is needed. Prefer the runner-provided event name and
# fall back to the workflow-level copy, then to "unknown".
trigger="${GITHUB_EVENT_NAME:-${AWOOOI_HARBOR_110_LOCAL_REPAIR_TRIGGER:-unknown}}"
echo "harbor_110_remote_repair_trigger=${trigger}"

# Any trigger other than a manual dispatch skips the repair when the --check
# output already reports harbor_ready=true; a manual dispatch always proceeds.
# The here-string (rather than `printf ... | grep -q`) keeps `pipefail` from
# turning a match into a failure when grep -q exits before the writer finishes,
# which would push a scheduled run into an unnecessary --apply-all.
if [ "${trigger}" != "workflow_dispatch" ] \
  && grep -q "harbor_ready=true" <<<"${check_output}"; then
  echo "harbor_110_remote_repair_skipped=already_ready"
  exit 0
fi
|
|
|
|
# Phase 2: run the recovery script in --apply-all mode and replay its combined
# output to the log. On failure the step exits with the script's own status
# (the --check phase, by contrast, exits 65).
echo "harbor_110_remote_repair_apply_all_start=1"
apply_rc=0
# `|| apply_rc=$?` records the failure without tripping errexit.
apply_output="$(run_recovery --apply-all 2>&1)" || apply_rc=$?
printf '%s\n' "${apply_output}"
echo "harbor_110_remote_repair_apply_all_rc=${apply_rc}"
if [ "${apply_rc}" -ne 0 ]; then
  diagnose_ssh_control_channel || true
  echo "BLOCKED harbor_110_remote_repair_apply_all_failed rc=${apply_rc}"
  exit "${apply_rc}"
fi
|
|
|
|
# Phase 3: verify the registry API answers both on the target's loopback
# (through ssh) and on the public hostname (directly from the runner).
# 200 and 401 both count as "up".

# run_ssh writes its own per-attempt progress lines to stdout, so its captured
# output is never just the HTTP code. curl therefore emits the code on a tagged
# line of its own, and the last such line is extracted. `|| true` keeps a
# failed ssh from aborting the script via errexit before the BLOCKED line
# below can be printed.
local_probe_output="$(
  run_ssh 'curl --silent --show-error --output /dev/null --write-out "local_v2_http_status=%{http_code}\n" --max-time 10 http://127.0.0.1:5000/v2/ || true'
)" || true
local_status="$(
  sed -n 's/^local_v2_http_status=\([0-9][0-9][0-9]\)$/\1/p' <<<"${local_probe_output}" | tail -n 1
)"
public_status="$(curl --silent --show-error --output /dev/null --write-out "%{http_code}" --max-time 10 https://registry.wooo.work/v2/ || true)"

# An empty capture is reported as 000, the code curl uses when no response
# was received.
[ -n "${local_status}" ] || local_status="000"
[ -n "${public_status}" ] || public_status="000"
echo "harbor_110_remote_local_v2_http_status=${local_status}"
echo "harbor_public_registry_v2_http_status=${public_status}"

case "${local_status}" in
  200|401) ;;
  *) echo "BLOCKED harbor_110_remote_local_registry_v2_unavailable status=${local_status}"; exit 1 ;;
esac
case "${public_status}" in
  200|401) ;;
  *) echo "BLOCKED harbor_public_registry_v2_unavailable status=${public_status}"; exit 1 ;;
esac
echo "harbor_110_remote_repair_verified=true"
|