fix(recovery): honor 110 ssh command-path readiness
Some checks failed
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 36s
CD Pipeline / post-deploy-checks (push) Has been cancelled
CD Pipeline / build-and-deploy (push) Has been cancelled

This commit is contained in:
ogt
2026-07-01 23:12:33 +08:00
parent 63506834fe
commit c90f1ced2f
10 changed files with 463 additions and 20 deletions

View File

@@ -11,8 +11,10 @@ PORT="${PORT:-22}"
# Probe tunables. Every value can be overridden from the environment; the
# *_SECONDS values are passed on as timeouts by the probes.
NODE_EXPORTER_PORT="${NODE_EXPORTER_PORT:-9100}"
CONNECT_TIMEOUT_SECONDS="${CONNECT_TIMEOUT_SECONDS:-4}"
SSH_ATTEMPT_TIMEOUT_SECONDS="${SSH_ATTEMPT_TIMEOUT_SECONDS:-8}"
SSH_COMMAND_PATH_TIMEOUT_SECONDS="${SSH_COMMAND_PATH_TIMEOUT_SECONDS:-20}"
NODE_EXPORTER_TIMEOUT_SECONDS="${NODE_EXPORTER_TIMEOUT_SECONDS:-8}"
# Split the whitespace-separated USERS list into an array with `read -a`
# instead of an unquoted expansion inside (...): the unquoted form also
# performs pathname expansion, so a value such as "*" would silently turn into
# the file names of the current directory. `-d ''` reads the whole value
# (including embedded newlines); read then reports EOF with a non-zero status
# even though the array was filled, hence `|| true`.
read -r -d '' -a USERS <<<"${USERS:-wooo root git ollama}" || true
# User whose real command execution path is probed (see probe_command_path_user).
COMMAND_PATH_USER="${COMMAND_PATH_USER:-wooo}"
# Private scratch directory for captured probe output; removed on any exit.
tmp_dir="$(mktemp -d "${TMPDIR:-/tmp}/awoooi-110-ssh-auth.XXXXXX")"
trap 'rm -rf "$tmp_dir"' EXIT
@@ -28,6 +30,18 @@ ssh_base_opts=(
-o ServerAliveCountMax=1
)
# ssh options used by probe_command_path_user when it runs a real remote
# command. The connection is bounded (single attempt, connect timeout, server
# keep-alive) and non-interactive: BatchMode is on and authentication is
# limited to public keys with password authentication turned off.
ssh_command_opts=(
-p "$PORT"
-o ConnectTimeout="$CONNECT_TIMEOUT_SECONDS"
-o ConnectionAttempts=1
-o BatchMode=yes
-o StrictHostKeyChecking=accept-new
-o ServerAliveInterval=2
-o ServerAliveCountMax=1
-o PasswordAuthentication=no
-o PreferredAuthentications=publickey
)
run_timeout() {
local seconds="$1"
shift
@@ -157,6 +171,43 @@ probe_user() {
printf 'SSH_AUTH user=%s mode=%s rc=%s classification=%s\n' "$user" "$mode" "$rc" "$classification"
}
#######################################
# Run a trivial remote command over SSH as the given user and classify whether
# the full command path (auth + session + shell) works.
# Globals:   tmp_dir, HOST, SSH_COMMAND_PATH_TIMEOUT_SECONDS,
#            ssh_command_opts (all read)
# Arguments: $1 - remote user name
# Outputs:   one "SSH_COMMAND_PATH user=... rc=... classification=..." line on
#            stdout; classification is one of command_path_ready,
#            command_path_timeout, command_path_permission_denied,
#            command_path_unavailable
# Returns:   status of the final printf (the probe result is reported in the
#            output line, not in the return code)
#######################################
probe_command_path_user() {
  local user="$1"
  local stdout_path="$tmp_dir/${user}-command-path.out"
  local stderr_path="$tmp_dir/${user}-command-path.err"
  local classification remote_user
  local rc=0
  local marker_seen=false
  local remote_user_match=false

  # The remote command is single-quoted so that $(id -un) is evaluated on the
  # remote side. The status is captured with `|| rc=$?` so that a failing
  # probe is still classified and reported when the script runs with errexit.
  run_timeout "$SSH_COMMAND_PATH_TIMEOUT_SECONDS" \
    ssh "${ssh_command_opts[@]}" \
    "${user}@${HOST}" \
    'printf "AWOOOI_110_COMMAND_PATH_READY=1\n"; printf "remote_user=%s\n" "$(id -un)"' \
    >"$stdout_path" 2>"$stderr_path" || rc=$?

  # Extract the value of the first "remote_user=<name>" line, if any.
  remote_user="$(awk -F= '$1 == "remote_user" {print $2; exit}' "$stdout_path" 2>/dev/null || true)"
  if grep -qx 'AWOOOI_110_COMMAND_PATH_READY=1' "$stdout_path" 2>/dev/null; then
    marker_seen=true
  fi
  if [[ "$remote_user" == "$user" ]]; then
    remote_user_match=true
  fi

  # Ready requires both the marker line and a matching remote identity;
  # otherwise the captured stderr text decides the failure class.
  if [[ "$marker_seen" == "true" && "$remote_user_match" == "true" ]]; then
    classification="command_path_ready"
  elif grep -Eiq 'timed out|not responding|Timeout' "$stderr_path"; then
    classification="command_path_timeout"
  elif grep -q 'Permission denied' "$stderr_path"; then
    classification="command_path_permission_denied"
  else
    classification="command_path_unavailable"
  fi

  printf 'SSH_COMMAND_PATH user=%s rc=%s classification=%s marker_seen=%s remote_user_match=%s\n' \
    "$user" "$rc" "$classification" "$marker_seen" "$remote_user_match"
}
# Report header: name the diagnosis and the probed host:port, then run the
# node exporter probe (probe_node_exporter is defined outside this excerpt).
echo "AWOOOI_110_SSH_PUBLICKEY_AUTH_DIAGNOSIS"
echo "TARGET=${HOST}:${PORT}"
probe_node_exporter
@@ -166,8 +217,10 @@ for user in "${USERS[@]}"; do
probe_user "$user" "publickey"
done
# Run the real command-path probe for the single designated user.
probe_command_path_user "$COMMAND_PATH_USER"
# Probe every configured user again, this time in "password_disabled" mode.
for user in "${USERS[@]}"; do
probe_user "$user" "password_disabled"
done
# Emit the hint that tells the reader how to interpret the classifications.
# NOTE(review): both INTERPRETATION lines are emitted as written here; they
# look like the old and the new wording of the same statement (the second adds
# the command_path_ready override) — confirm only one is meant to remain.
echo "INTERPRETATION=server_accepts_key_then_timeout_means_check_110_session_pam_account_or_shell_path;publickey_offer_timeout_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path"
echo "INTERPRETATION=command_path_ready_overrides_stale_verbose_true_timeout;server_accepts_key_then_timeout_without_command_path_means_check_110_session_pam_account_or_shell_path;publickey_offer_timeout_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path"

View File

@@ -122,6 +122,29 @@ check_sshd_effective_config() {
echo "SSHD_EFFECTIVE_CONFIG available=true pubkeyauthentication=${pubkey:-unknown} passwordauthentication=${password:-unknown} kbdinteractiveauthentication=${kbdinteractive:-unknown} usepam=${usepam:-unknown} maxstartups=${maxstartups:-unknown} authorized_keys_file_default=${authorized_keys_file_default}"
}
#######################################
# Validate the sshd configuration with `sshd -t` and report the outcome as a
# single KEY=value status line.
# Globals:   APPLY (read) - 0 selects check-only mode, anything else is
#            treated as apply mode
# Arguments: $1 - label used as the key of the status line
# Outputs:   "<label>=ok" or "<label>=unverified_requires_root rc=... reason=..."
#            on stdout; "<label>=failed rc=... reason=..." on stderr
# Returns:   0 when validation passes or when running in check-only mode;
#            the exit status of `sshd -t` when validation fails in apply mode
#######################################
check_sshd_config_syntax() {
  local label="$1"
  local output reason
  local rc=0

  # Capture the status in the same command list as `sshd -t`. Reading $? after
  # a finished `if ...; fi` does not work: when the condition fails and there
  # is no else branch, the if statement itself exits with 0.
  output="$(sshd -t 2>&1)" || rc=$?
  if [ "$rc" -eq 0 ]; then
    echo "${label}=ok"
    return 0
  fi

  # Distinguish host key / permission problems from other validation failures
  # based on the sshd output.
  reason="sshd_t_failed"
  if printf '%s\n' "$output" | grep -Eiq 'hostkey|host key|permission|no hostkeys'; then
    reason="hostkeys_unavailable_or_permission"
  fi

  # Check-only mode reports the failure as "unverified" and does not fail.
  if [ "$APPLY" -eq 0 ]; then
    echo "${label}=unverified_requires_root rc=$rc reason=$reason"
    return 0
  fi

  echo "${label}=failed rc=$rc reason=$reason" >&2
  return "$rc"
}
apply_user_permissions() {
local user="$1"
local home_dir
@@ -149,15 +172,13 @@ require_110_or_explicit
# Announce the run: "apply" when APPLY is 1, otherwise "check", plus the target user.
echo "AWOOOI_110_SSH_PUBLICKEY_AUTH_LOCAL_REPAIR mode=$([ "$APPLY" -eq 1 ] && echo apply || echo check) target_user=$TARGET_USER"
# Print the ssh unit state prefixed with SSH_SERVICE_ACTIVE=; systemctl's
# stderr is discarded and a failing pipeline is ignored via `|| true`.
systemctl is-active ssh 2>/dev/null | sed 's/^/SSH_SERVICE_ACTIVE=/' || true
# NOTE(review): the bare `sshd -t` followed by an unconditional "=ok" echo
# looks like the pre-change form of the check_sshd_config_syntax call right
# after it — confirm only the function call is meant to remain.
sshd -t
echo "SSHD_CONFIG_SYNTAX=ok"
check_sshd_config_syntax "SSHD_CONFIG_SYNTAX"
# Per-user checks for the target account (both helpers are defined outside this excerpt).
check_user "$TARGET_USER"
check_sshd_effective_config "$TARGET_USER"
if [ "$APPLY" -eq 1 ]; then
apply_user_permissions "$TARGET_USER"
sshd -t
echo "SSHD_CONFIG_SYNTAX_AFTER_APPLY=ok"
check_sshd_config_syntax "SSHD_CONFIG_SYNTAX_AFTER_APPLY"
if [ "$RELOAD_SSH" = "1" ]; then
systemctl reload ssh
echo "SSH_RELOAD=done"

View File

@@ -212,6 +212,11 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None:
assert "PasswordAuthentication=no" in text
assert "PubkeyAuthentication=no" in text
assert "NumberOfPasswordPrompts=0" in text
assert "SSH_COMMAND_PATH_TIMEOUT_SECONDS" in text
assert "COMMAND_PATH_USER" in text
assert "SSH_COMMAND_PATH user=%s rc=%s classification=%s" in text
assert "command_path_ready" in text
assert "AWOOOI_110_COMMAND_PATH_READY=1" in text
assert "server_accepts_key_then_timeout" in text
assert "publickey_offer_timeout" in text
assert "NODE_EXPORTER=ok" in text
@@ -245,6 +250,8 @@ def test_110_ssh_publickey_auth_repair_is_local_and_does_not_print_keys() -> Non
assert "account_locked=" in text
assert "shell_executable=" in text
assert "SSHD_EFFECTIVE_CONFIG available=true" in text
assert "unverified_requires_root" in text
assert "hostkeys_unavailable_or_permission" in text
assert "sshd -T -C" in text
assert "pubkeyauthentication=" in text
assert "authorized_keys_file_default=" in text