diff --git a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py index 1dcf59ec..3d20f66c 100644 --- a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py @@ -186,15 +186,32 @@ def validate_harbor_registry_controlled_recovery_receipt( "ssh_publickey_wooo_publickey_classification": ssh_diagnosis[ "wooo_publickey_classification" ], + "ssh_publickey_wooo_command_path_classification": ssh_diagnosis[ + "wooo_command_path_classification" + ], + "ssh_publickey_wooo_command_path_ready": ssh_diagnosis[ + "wooo_command_path_ready" + ], "ssh_publickey_offer_timeout_seen": ssh_diagnosis[ "publickey_offer_timeout_seen" ], + "ssh_publickey_offer_timeout_effective_seen": ssh_diagnosis[ + "publickey_offer_timeout_effective_seen" + ], "ssh_publickey_server_accepts_key_then_timeout_seen": ( ssh_diagnosis["server_accepts_key_then_timeout_seen"] ), + "ssh_publickey_server_accepts_key_then_timeout_effective_seen": ( + ssh_diagnosis[ + "server_accepts_key_then_timeout_effective_seen" + ] + ), "ssh_publickey_runner_systemctl_show_timeout_seen": ( ssh_diagnosis["runner_systemctl_show_timeout_seen"] ), + "ssh_publickey_runner_systemctl_show_timeout_effective_seen": ( + ssh_diagnosis["runner_systemctl_show_timeout_effective_seen"] + ), "ssh_publickey_node_high_load_seen": ssh_diagnosis[ "node_high_load_seen" ], @@ -452,11 +469,16 @@ def _control_path_readiness( non110_runner_not_ready = bool( non110_runner["receipt_seen"] and not non110_runner["non110_runner_ready"] ) - runner_timeout = bool(ssh_diagnosis["runner_systemctl_show_timeout_seen"]) + command_path_ready = bool(ssh_diagnosis["wooo_command_path_ready"]) + runner_timeout = bool( + ssh_diagnosis["runner_systemctl_show_timeout_effective_seen"] + ) node_high_load = bool(ssh_diagnosis["node_high_load_seen"]) - publickey_offer_timeout = 
bool(ssh_diagnosis["publickey_offer_timeout_seen"]) + publickey_offer_timeout = bool( + ssh_diagnosis["publickey_offer_timeout_effective_seen"] + ) server_accepts_key_then_timeout = bool( - ssh_diagnosis["server_accepts_key_then_timeout_seen"] + ssh_diagnosis["server_accepts_key_then_timeout_effective_seen"] ) awoooi_host_unavailable = ( queue_no_matching_runner @@ -523,9 +545,22 @@ def _control_path_readiness( "node_load_classifier": ssh_diagnosis["node_load_classifier"], "node_load1_per_cpu": ssh_diagnosis["node_load1_per_cpu"], "ssh_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"], + "ssh_command_path_ready": command_path_ready, + "ssh_command_path_classification": ssh_diagnosis[ + "wooo_command_path_classification" + ], "ssh_publickey_offer_timeout": publickey_offer_timeout, "ssh_server_accepts_key_then_timeout": server_accepts_key_then_timeout, + "ssh_publickey_offer_timeout_raw": ssh_diagnosis[ + "publickey_offer_timeout_seen" + ], + "ssh_server_accepts_key_then_timeout_raw": ssh_diagnosis[ + "server_accepts_key_then_timeout_seen" + ], "runner_systemctl_show_timeout": runner_timeout, + "runner_systemctl_show_timeout_raw": ssh_diagnosis[ + "runner_systemctl_show_timeout_seen" + ], "awoooi_host_runner_unavailable": awoooi_host_unavailable, "non110_runner_unavailable": non110_runner_unavailable, "non110_runner_readiness_receipt_seen": non110_runner["receipt_seen"], @@ -821,6 +856,8 @@ def _ssh_metadata_phase_status( if ssh_local["control_channel_metadata_ready"]: return "ready" return "blocked_ssh_metadata_repair_receipt_not_ready" + if ssh_diagnosis["wooo_command_path_ready"]: + return "skipped_not_required" if ssh_diagnosis["publickey_offer_timeout_seen"]: return "blocked_waiting_ssh_metadata_repair_receipt_after_publickey_timeout" if ssh_diagnosis["server_accepts_key_then_timeout_seen"]: @@ -886,12 +923,18 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: fields = _parse_key_values(output) marker_seen = 
"AWOOOI_110_SSH_PUBLICKEY_AUTH_DIAGNOSIS" in output auth_attempts = _ssh_auth_attempts(output) + command_path_attempts = _ssh_command_path_attempts(output) systemd_units = _systemd_unit_signals(output) wooo_publickey = _auth_classification( auth_attempts, user="wooo", mode="publickey", ) + wooo_command_path = _command_path_classification( + command_path_attempts, + user="wooo", + ) + wooo_command_path_ready = wooo_command_path == "command_path_ready" publickey_offer_timeout_seen = any( item["mode"] == "publickey" and item["classification"] == "publickey_offer_timeout" @@ -913,6 +956,15 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: and item["classifier"] == "systemctl_show_timeout" for item in systemd_units ) + publickey_offer_timeout_effective_seen = bool( + publickey_offer_timeout_seen and not wooo_command_path_ready + ) + server_accepts_key_then_timeout_effective_seen = bool( + server_accepts_key_then_timeout_seen and not wooo_command_path_ready + ) + runner_systemctl_show_timeout_effective_seen = bool( + runner_systemctl_show_timeout_seen and not wooo_command_path_ready + ) node_load_classifier = str(fields.get("NODE_LOAD_CLASSIFIER") or "") node_high_load_seen = node_load_classifier == "high_load" ssh_port_tcp_open = str(fields.get("SSH_PORT") or "") == "tcp_open" @@ -928,17 +980,34 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: "ssh_banner_seen": "SSH_BANNER=SSH-" in output, "auth_attempt_count": len(auth_attempts), "auth_classifications": auth_attempts, + "command_path_attempt_count": len(command_path_attempts), + "command_path_classifications": command_path_attempts, "systemd_unit_signal_count": len(systemd_units), "systemd_unit_signals": systemd_units, "runner_systemctl_show_timeout_seen": runner_systemctl_show_timeout_seen, + "runner_systemctl_show_timeout_effective_seen": ( + runner_systemctl_show_timeout_effective_seen + ), "wooo_publickey_classification": wooo_publickey, + 
"wooo_command_path_classification": wooo_command_path, + "wooo_command_path_ready": wooo_command_path_ready, "publickey_offer_timeout_seen": publickey_offer_timeout_seen, + "publickey_offer_timeout_effective_seen": ( + publickey_offer_timeout_effective_seen + ), "server_accepts_key_then_timeout_seen": ( server_accepts_key_then_timeout_seen ), + "server_accepts_key_then_timeout_effective_seen": ( + server_accepts_key_then_timeout_effective_seen + ), "preauth_timeout_count": preauth_timeout_count, "permission_denied_count": permission_denied_count, - "diagnosis_ready": bool(marker_seen and ssh_port_tcp_open and auth_attempts), + "diagnosis_ready": bool( + marker_seen + and (ssh_port_tcp_open or wooo_command_path_ready) + and (auth_attempts or command_path_attempts) + ), "metadata_only": True, "raw_output_returned": False, } @@ -961,6 +1030,26 @@ def _ssh_auth_attempts(output: str) -> list[dict[str, Any]]: return attempts +def _ssh_command_path_attempts(output: str) -> list[dict[str, Any]]: + attempts: list[dict[str, Any]] = [] + for line in output.splitlines(): + if not line.startswith("SSH_COMMAND_PATH "): + continue + fields = _parse_key_values(line) + attempts.append( + { + "user": str(fields.get("user") or ""), + "rc": _int_or_none(fields.get("rc")), + "classification": str(fields.get("classification") or ""), + "marker_seen": _bool_from_field(fields.get("marker_seen")), + "remote_user_match": _bool_from_field( + fields.get("remote_user_match") + ), + } + ) + return attempts + + def _systemd_unit_signals(output: str) -> list[dict[str, str]]: signals: list[dict[str, str]] = [] for line in output.splitlines(): @@ -989,12 +1078,28 @@ def _auth_classification( return "" +def _command_path_classification( + attempts: list[dict[str, Any]], + *, + user: str, +) -> str: + for item in attempts: + if item["user"] == user: + return str(item["classification"] or "") + return "" + + def _parse_ssh_local_repair_output(output: str) -> dict[str, Any]: fields = 
_parse_key_values(output) marker_seen = "AWOOOI_110_SSH_PUBLICKEY_AUTH_LOCAL_REPAIR" in output - sshd_ok = "SSHD_CONFIG_SYNTAX=ok" in output + sshd_unverified = "SSHD_CONFIG_SYNTAX=unverified_requires_root" in output + sshd_after_unverified = ( + "SSHD_CONFIG_SYNTAX_AFTER_APPLY=unverified_requires_root" in output + ) + sshd_ok = "SSHD_CONFIG_SYNTAX=ok" in output or sshd_unverified sshd_after_ok = ( "SSHD_CONFIG_SYNTAX_AFTER_APPLY=ok" in output + or sshd_after_unverified or not _bool_from_field(fields.get("APPLY")) ) user_status = _first_key_value_line(output, prefix="USER_STATUS ") @@ -1031,7 +1136,11 @@ def _parse_ssh_local_repair_output(output: str) -> dict[str, Any]: "receipt_seen": marker_seen, "mode": _mode_from_marker_line(output), "sshd_config_syntax_ok": sshd_ok, + "sshd_config_syntax_unverified_requires_root": sshd_unverified, "sshd_config_syntax_after_apply_ok": sshd_after_ok, + "sshd_config_syntax_after_apply_unverified_requires_root": ( + sshd_after_unverified + ), "target_user_exists": user_exists, "target_user_account_locked": account_locked, "target_user_shell_executable": shell_executable, @@ -1937,23 +2046,28 @@ def _active_blockers( deploy_marker: dict[str, Any], ) -> list[str]: blockers: list[str] = [] + command_path_ready = bool(ssh_diagnosis["wooo_command_path_ready"]) if ssh_diagnosis["receipt_seen"] and not ssh_diagnosis["diagnosis_ready"]: blockers.append("ssh_publickey_diagnosis_receipt_not_ready") if ( - ssh_diagnosis["publickey_offer_timeout_seen"] + ssh_diagnosis["publickey_offer_timeout_effective_seen"] and not ssh_local["control_channel_metadata_ready"] ): blockers.append("ssh_publickey_offer_timeout_on_wooo") if ( - ssh_diagnosis["server_accepts_key_then_timeout_seen"] + ssh_diagnosis["server_accepts_key_then_timeout_effective_seen"] and not ssh_local["control_channel_metadata_ready"] ): blockers.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo") - if ssh_diagnosis["runner_systemctl_show_timeout_seen"]: + if 
ssh_diagnosis["runner_systemctl_show_timeout_effective_seen"]: blockers.append("runner_systemctl_show_timeout_on_110") if ssh_diagnosis["node_high_load_seen"]: blockers.append("ssh_publickey_node_high_load_on_110") - if ssh_local["receipt_seen"] and not ssh_local["control_channel_metadata_ready"]: + if ( + ssh_local["receipt_seen"] + and not ssh_local["control_channel_metadata_ready"] + and not command_path_ready + ): blockers.extend(_ssh_local_metadata_blockers(ssh_local)) blockers.append("ssh_local_repair_receipt_metadata_not_ready") if not watchdog_check["receipt_seen"]: diff --git a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py index f3d86119..2d2510cc 100644 --- a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py @@ -147,6 +147,28 @@ def test_harbor_recovery_receipt_surfaces_ssh_local_metadata_blockers() -> None: assert "secret-token-like-content" not in str(payload) +def test_harbor_recovery_receipt_accepts_nonroot_sshd_syntax_unverified_check() -> None: + ssh_local_output = _ssh_local_apply_output().replace( + "SSHD_CONFIG_SYNTAX=ok", + ( + "SSHD_CONFIG_SYNTAX=unverified_requires_root rc=1 " + "reason=hostkeys_unavailable_or_permission" + ), + ) + + payload = validate_harbor_registry_controlled_recovery_receipt( + {"ssh_local_repair_output": ssh_local_output} + ) + + ssh_local = payload["readback"]["ssh_local_repair"] + assert ssh_local["sshd_config_syntax_ok"] is True + assert ssh_local["sshd_config_syntax_unverified_requires_root"] is True + assert ssh_local["control_channel_metadata_ready"] is True + assert "ssh_local_repair_sshd_config_syntax_not_ok" not in payload[ + "active_blockers" + ] + + def test_harbor_recovery_receipt_accepts_combined_local_console_output() -> None: payload = validate_harbor_registry_controlled_recovery_receipt( { @@ -274,6 +296,58 @@ def 
test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: ] > 0 +def test_harbor_recovery_receipt_command_path_ready_overrides_stale_timeout() -> None: + diagnosis_output = _ssh_publickey_diagnosis_output().replace( + "SSH_AUTH user=wooo mode=password_disabled rc=255 classification=permission_denied", + ( + "SSH_COMMAND_PATH user=wooo rc=0 classification=command_path_ready " + "marker_seen=true remote_user_match=true\n" + "SSH_AUTH user=wooo mode=password_disabled rc=255 " + "classification=permission_denied" + ), + ) + + payload = validate_harbor_registry_controlled_recovery_receipt( + { + "ssh_publickey_diagnosis_output": diagnosis_output, + "watchdog_check_output": _watchdog_check_output(ready=True, status=401), + "public_registry_v2_http_status": 401, + "internal_registry_v2_http_status": 401, + } + ) + + assert payload["status"] == "harbor_registry_recovery_receipt_verified" + assert payload["active_blockers"] == [] + diagnosis = payload["readback"]["ssh_publickey_diagnosis"] + assert diagnosis["wooo_command_path_ready"] is True + assert diagnosis["wooo_command_path_classification"] == "command_path_ready" + assert diagnosis["server_accepts_key_then_timeout_seen"] is True + assert diagnosis["server_accepts_key_then_timeout_effective_seen"] is False + assert diagnosis["runner_systemctl_show_timeout_seen"] is True + assert diagnosis["runner_systemctl_show_timeout_effective_seen"] is False + readiness = payload["readback"]["control_path_readiness"] + assert readiness["status"] == "ready" + assert readiness["ssh_command_path_ready"] is True + assert readiness["ssh_server_accepts_key_then_timeout_raw"] is True + assert readiness["ssh_server_accepts_key_then_timeout"] is False + assert readiness["runner_systemctl_show_timeout_raw"] is True + assert readiness["runner_systemctl_show_timeout"] is False + phases = { + phase["phase_id"]: phase + for phase in payload["local_console_phase_readback"]["phases"] + } + assert 
phases["repair_ssh_metadata_if_check_confirms_metadata_drift"][ + "status" + ] == "skipped_not_required" + assert payload["rollups"]["ssh_publickey_wooo_command_path_ready"] is True + assert ( + payload["rollups"][ + "ssh_publickey_server_accepts_key_then_timeout_effective_seen" + ] + is False + ) + + def test_harbor_recovery_receipt_classifies_publickey_offer_timeout() -> None: diagnosis_output = _ssh_publickey_diagnosis_output().replace( "rc=124 classification=server_accepts_key_then_timeout", diff --git a/docs/LOGBOOK.md b/docs/LOGBOOK.md index d64cfd63..8dfea3db 100644 --- a/docs/LOGBOOK.md +++ b/docs/LOGBOOK.md @@ -22,7 +22,26 @@ **邊界**:未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未刪 pod,未讀 secret value / token / `.env` / raw sessions / SQLite / auth,未寫 credential escrow marker,未使用 GitHub / `gh` / GitHub API,未恢復 generic runner。 **下一步**:core cold-start 已 GREEN;主線下一步改為 DR offsite credential escrow non-secret evidence review,並保留 MOMO product-data source-arrival gate 監控,等正式 source 到達後由原匯入 pipeline 更新,不做手動 DB 偽更新。 +## 2026-07-01 — 23:42 110 SSH command-path effective readiness 修正 +**照主線修正的問題**: +- 110 實測 `wooo@192.168.0.110` command path 可執行,Harbor `/v2/` 回 `401` ready;但舊 `diagnose-110-ssh-publickey-auth.sh` 只看 `ssh true` verbose timeout,API receipt / Gitea queue parser 仍會把 `classification=server_accepts_key_then_timeout` 判成 active `harbor_110_remote_ssh_publickey_auth_stalled`。 +- `diagnose-110-ssh-publickey-auth.sh` 新增 `SSH_COMMAND_PATH user=wooo ... 
classification=command_path_ready` 非敏感機器可讀 probe;若 command path 成功,舊 verbose timeout 只保留 raw evidence,不再當 effective blocker。 +- `repair-110-ssh-publickey-auth-local.sh --check` 現在遇到非 root `sshd -t` hostkey / 權限限制時輸出 `SSHD_CONFIG_SYNTAX=unverified_requires_root` 並繼續讀非敏感 account / shell / authorized_keys metadata;`--apply` 仍保持嚴格,不會在 syntax check 失敗時默默套用。 +- `harbor_registry_controlled_recovery_receipt.py`、`read-public-gitea-actions-queue.py` 與 `verify-awoooi-non110-cd-closure.py` 已加入 command-path effective readiness,讓 production readback 能從假 SSH stalled 轉到真正的 controlled CD lane / runner readiness blocker。 + +**驗證**: +- `python3.11 -m py_compile apps/api/src/services/harbor_registry_controlled_recovery_receipt.py ops/runner/read-public-gitea-actions-queue.py ops/runner/verify-awoooi-non110-cd-closure.py`:通過。 +- `bash -n scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh scripts/reboot-recovery/repair-110-ssh-publickey-auth-local.sh`:通過。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py ops/runner/test_read_public_gitea_actions_queue.py ops/runner/test_verify_awoooi_non110_cd_closure.py -q`:`87 passed`。 +- `DATABASE_URL=sqlite+aiosqlite:////tmp/awoooi-codex-api-test.db PYTHONPATH=apps/api python3.11 -m pytest ops/runner/test_cd_controlled_runtime_profile.py -q`:`41 passed`。 +- live read-only diagnosis:`SSH_AUTH user=wooo mode=publickey ... classification=server_accepts_key_then_timeout` 與 `SSH_COMMAND_PATH user=wooo rc=0 classification=command_path_ready marker_seen=true remote_user_match=true` 同輪出現;node load classifier `load_not_high`。 +- live patched local check:`SSHD_CONFIG_SYNTAX=unverified_requires_root`、`USER_STATUS user=wooo exists=1`、`ACCOUNT_METADATA ... account_locked=false shell=/bin/bash shell_executable=true`、`AUTHORIZED_KEYS_STATUS ... 
exists=1`;未印出 authorized_keys 內容。 +- `git diff --check`:通過。 + +**邊界**:未使用 GitHub / `gh` / GitHub API;未讀 secret / token / `.env` / raw sessions / SQLite / auth;未重啟主機,未 restart Docker / Nginx / K3s / DB / firewall,未 workflow_dispatch,未 force push,未恢復 generic runner。 + +**下一步**:commit / push Gitea `main`,等 CD 成功後讀回 production priority / Harbor recovery receipt;預期 P0-006 不再停在假 SSH stalled,而會顯示 controlled CD lane config / registration / service readiness 的真 blocker。 ## 2026-07-01 — 21:32 cold-start 假 WARN 收斂與 live monitor 同步 **照主線修正的問題**: diff --git a/ops/runner/read-public-gitea-actions-queue.py b/ops/runner/read-public-gitea-actions-queue.py index e1f3f061..4d7f6c6e 100644 --- a/ops/runner/read-public-gitea-actions-queue.py +++ b/ops/runner/read-public-gitea-actions-queue.py @@ -700,6 +700,9 @@ def build_readback( "latest_visible_harbor_110_repair_remote_ssh_reachable": ( harbor_110_repair_log_classifier["remote_ssh_reachable"] ), + "latest_visible_harbor_110_repair_remote_ssh_command_path_ready": ( + harbor_110_repair_log_classifier["remote_ssh_command_path_ready"] + ), "latest_visible_harbor_110_repair_bounded_ssh_timeout_seen": ( harbor_110_repair_log_classifier["bounded_ssh_timeout_seen"] ), @@ -727,14 +730,29 @@ def build_readback( "remote_ssh_publickey_offer_timeout" ] ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_offer_timeout_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_offer_timeout_raw" + ] + ), "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), + "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_auth_stalled_raw" + ] + ), "latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": ( harbor_110_repair_log_classifier[ "remote_ssh_server_accepts_key_then_session_timeout" ] ), + 
"latest_visible_harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_server_accepts_key_then_session_timeout_raw" + ] + ), "latest_visible_harbor_110_repair_remote_ssh_auth_permission_denied": ( harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] ), @@ -1018,6 +1036,9 @@ def build_readback( "harbor_110_repair_remote_ssh_banner_seen": ( harbor_110_repair_log_classifier["remote_ssh_banner_seen"] ), + "harbor_110_repair_remote_ssh_command_path_ready": ( + harbor_110_repair_log_classifier["remote_ssh_command_path_ready"] + ), "harbor_110_repair_remote_ssh_userauth_service_accept_seen": ( harbor_110_repair_log_classifier[ "remote_ssh_userauth_service_accept_seen" @@ -1036,14 +1057,29 @@ def build_readback( "remote_ssh_publickey_offer_timeout" ] ), + "harbor_110_repair_remote_ssh_publickey_offer_timeout_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_offer_timeout_raw" + ] + ), "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_repair_log_classifier["remote_ssh_publickey_auth_stalled"] ), + "harbor_110_repair_remote_ssh_publickey_auth_stalled_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_publickey_auth_stalled_raw" + ] + ), "harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout": ( harbor_110_repair_log_classifier[ "remote_ssh_server_accepts_key_then_session_timeout" ] ), + "harbor_110_repair_remote_ssh_server_accepts_key_then_session_timeout_raw": ( + harbor_110_repair_log_classifier[ + "remote_ssh_server_accepts_key_then_session_timeout_raw" + ] + ), "harbor_110_repair_remote_ssh_auth_permission_denied": ( harbor_110_repair_log_classifier["remote_ssh_auth_permission_denied"] ), @@ -1471,14 +1507,27 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: "harbor_110_remote_ssh_publickey_reply_timeout_seen", text, ) - remote_ssh_publickey_offer_timeout = ( + remote_ssh_command_path_ready = ( + 
_last_bool_marker("harbor_110_remote_ssh_command_path_ready", text) is True + or re.search( + r"^SSH_COMMAND_PATH\s+user=wooo\b.*\bclassification=command_path_ready\b" + r".*\bmarker_seen=true\b.*\bremote_user_match=true\b", + text, + re.MULTILINE, + ) + is not None + ) + remote_ssh_publickey_offer_timeout_raw = ( "classification=publickey_offer_timeout" in text or ( "we sent a publickey packet, wait for reply" in text and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None ) ) - remote_ssh_publickey_auth_stalled = ( + remote_ssh_publickey_offer_timeout = bool( + remote_ssh_publickey_offer_timeout_raw and not remote_ssh_command_path_ready + ) + remote_ssh_publickey_auth_stalled_raw = ( "harbor_110_remote_ssh_publickey_auth_stalled=true" in text or "BLOCKED harbor_110_remote_ssh_publickey_auth_stalled" in text or ( @@ -1486,9 +1535,12 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: and remote_ssh_publickey_offered is True and remote_ssh_publickey_reply_timeout_seen is True ) - or remote_ssh_publickey_offer_timeout + or remote_ssh_publickey_offer_timeout_raw ) - remote_ssh_server_accepts_key_then_session_timeout = ( + remote_ssh_publickey_auth_stalled = bool( + remote_ssh_publickey_auth_stalled_raw and not remote_ssh_command_path_ready + ) + remote_ssh_server_accepts_key_then_session_timeout_raw = ( "harbor_110_remote_ssh_server_accepts_key_then_session_timeout=true" in text or "classification=server_accepts_key_then_timeout" in text or ( @@ -1496,14 +1548,21 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: and _HARBOR_110_REMOTE_SSH_TIMEOUT_RE.search(text) is not None ) ) + remote_ssh_server_accepts_key_then_session_timeout = bool( + remote_ssh_server_accepts_key_then_session_timeout_raw + and not remote_ssh_command_path_ready + ) remote_ssh_auth_permission_denied = _last_bool_marker( "harbor_110_remote_ssh_auth_permission_denied", text, ) - remote_control_channel_unavailable = ( + remote_control_channel_unavailable_raw 
= ( "harbor_110_remote_control_channel_unavailable" in text or (bounded_ssh_timeout_seen and remote_ssh_reachable is False) - or remote_ssh_publickey_auth_stalled + or remote_ssh_publickey_auth_stalled_raw + ) + remote_control_channel_unavailable = ( + remote_control_channel_unavailable_raw and not remote_ssh_command_path_ready ) local_registry_v2_unavailable = ( _HARBOR_110_REMOTE_LOCAL_V2_BLOCKER_RE.search(text) is not None @@ -1545,6 +1604,7 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: ), "remote_control_channel_unavailable": remote_control_channel_unavailable, "remote_ssh_reachable": remote_ssh_reachable, + "remote_ssh_command_path_ready": remote_ssh_command_path_ready, "bounded_ssh_timeout_seen": bounded_ssh_timeout_seen, "remote_ssh_tcp_connected": remote_ssh_tcp_connected, "remote_ssh_banner_seen": remote_ssh_banner_seen, @@ -1556,10 +1616,19 @@ def classify_harbor_110_repair_log(text: str) -> dict[str, Any]: remote_ssh_publickey_reply_timeout_seen ), "remote_ssh_publickey_offer_timeout": remote_ssh_publickey_offer_timeout, + "remote_ssh_publickey_offer_timeout_raw": ( + remote_ssh_publickey_offer_timeout_raw + ), "remote_ssh_publickey_auth_stalled": remote_ssh_publickey_auth_stalled, + "remote_ssh_publickey_auth_stalled_raw": ( + remote_ssh_publickey_auth_stalled_raw + ), "remote_ssh_server_accepts_key_then_session_timeout": ( remote_ssh_server_accepts_key_then_session_timeout ), + "remote_ssh_server_accepts_key_then_session_timeout_raw": ( + remote_ssh_server_accepts_key_then_session_timeout_raw + ), "remote_ssh_auth_permission_denied": remote_ssh_auth_permission_denied, "local_registry_v2_status": local_status, "public_registry_v2_status": public_status, @@ -1730,10 +1799,18 @@ def _human_summary(payload: dict[str, Any]) -> str: "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_REACHABLE=" f"{readback['latest_visible_harbor_110_repair_remote_ssh_reachable']}" ), + ( + "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_COMMAND_PATH_READY=" + 
f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_command_path_ready'])}" + ), ( "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_AUTH_STALLED=" f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled'])}" ), + ( + "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_AUTH_STALLED_RAW=" + f"{int(readback['latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled_raw'])}" + ), ( "LATEST_VISIBLE_HARBOR_110_REPAIR_REMOTE_SSH_PUBLICKEY_REPLY_TIMEOUT_SEEN=" f"{readback['latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen']}" diff --git a/ops/runner/test_read_public_gitea_actions_queue.py b/ops/runner/test_read_public_gitea_actions_queue.py index 867adba1..0c95f454 100644 --- a/ops/runner/test_read_public_gitea_actions_queue.py +++ b/ops/runner/test_read_public_gitea_actions_queue.py @@ -671,6 +671,31 @@ def test_harbor_ssh_blocker_takes_precedence_over_current_cd_waiting() -> None: assert payload["rollups"]["safe_next_action_requires_local_console"] is True +def test_harbor_ssh_command_path_ready_overrides_raw_publickey_stall() -> None: + module = _load_module() + classifier = module.classify_harbor_110_repair_log( + _harbor_110_repair_publickey_auth_stalled_log() + + "\nSSH_COMMAND_PATH user=wooo rc=0 classification=command_path_ready " + "marker_seen=true remote_user_match=true\n" + ) + + assert classifier["remote_ssh_command_path_ready"] is True + assert classifier["remote_ssh_publickey_auth_stalled_raw"] is True + assert classifier["remote_ssh_publickey_offer_timeout_raw"] is True + assert ( + classifier["remote_ssh_server_accepts_key_then_session_timeout_raw"] + is True + ) + assert classifier["remote_ssh_publickey_auth_stalled"] is False + assert classifier["remote_ssh_publickey_offer_timeout"] is False + assert ( + classifier["remote_ssh_server_accepts_key_then_session_timeout"] + is False + ) + assert classifier["remote_control_channel_unavailable"] is False + assert 
classifier["failure_classifier"] == "" + + def test_build_readback_classifies_harbor_502_after_110_repair_jobs_success() -> None: module = _load_module() payload = module.build_readback( diff --git a/ops/runner/test_verify_awoooi_non110_cd_closure.py b/ops/runner/test_verify_awoooi_non110_cd_closure.py index 30b1340d..af5a8ef3 100644 --- a/ops/runner/test_verify_awoooi_non110_cd_closure.py +++ b/ops/runner/test_verify_awoooi_non110_cd_closure.py @@ -32,6 +32,7 @@ def _queue( harbor_110_publickey_auth_stalled: bool = False, harbor_110_publickey_offer_timeout: bool = False, harbor_110_session_timeout: bool = False, + harbor_110_command_path_ready: bool = False, harbor_110_running_jobs_api_stale: bool = False, current_cd_waiting_behind_harbor_110_running: bool = False, ) -> dict: @@ -72,6 +73,9 @@ def _queue( "latest_visible_harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_publickey_auth_stalled ), + "latest_visible_harbor_110_repair_remote_ssh_command_path_ready": ( + harbor_110_command_path_ready + ), "latest_visible_harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( harbor_110_publickey_auth_stalled ), @@ -97,6 +101,9 @@ def _queue( "harbor_110_repair_remote_ssh_publickey_auth_stalled": ( harbor_110_publickey_auth_stalled ), + "harbor_110_repair_remote_ssh_command_path_ready": ( + harbor_110_command_path_ready + ), "harbor_110_repair_remote_ssh_publickey_reply_timeout_seen": ( harbor_110_publickey_auth_stalled ), @@ -321,6 +328,36 @@ def test_closure_verifier_blocks_harbor_110_publickey_auth_stalled() -> None: ] +def test_closure_verifier_command_path_ready_overrides_publickey_auth_stalled() -> None: + module = _load_module() + payload = module.build_closure_verifier( + readiness_text=_readiness(ready=True), + queue=_queue( + no_matching=False, + harbor_110_publickey_auth_stalled=True, + harbor_110_publickey_offer_timeout=True, + harbor_110_session_timeout=True, + harbor_110_command_path_ready=True, + ), + 
production_workbench=_workbench(image_current=True, governance_ready=True), + ) + + assert payload["status"] == "closure_verified" + assert payload["readback"]["harbor_110_remote_ssh_command_path_ready"] is True + assert payload["readback"]["harbor_110_remote_control_channel_unavailable"] is False + assert payload["readback"]["harbor_110_remote_ssh_publickey_auth_stalled"] is False + assert payload["readback"]["harbor_110_remote_ssh_publickey_offer_timeout"] is False + assert ( + payload["readback"][ + "harbor_110_remote_ssh_server_accepts_key_then_session_timeout" + ] + is False + ) + assert "harbor_110_remote_ssh_publickey_auth_stalled" not in payload[ + "blockers" + ] + + def test_closure_verifier_blocks_stale_harbor_running_readback() -> None: module = _load_module() payload = module.build_closure_verifier( diff --git a/ops/runner/verify-awoooi-non110-cd-closure.py b/ops/runner/verify-awoooi-non110-cd-closure.py index f5c98d15..d3a4c488 100755 --- a/ops/runner/verify-awoooi-non110-cd-closure.py +++ b/ops/runner/verify-awoooi-non110-cd-closure.py @@ -330,6 +330,14 @@ def build_closure_verifier( or queue_rollups.get("harbor_110_repair_remote_control_channel_unavailable") is True ) + harbor_110_remote_ssh_command_path_ready = ( + queue_readback.get( + "latest_visible_harbor_110_repair_remote_ssh_command_path_ready" + ) + is True + or queue_rollups.get("harbor_110_repair_remote_ssh_command_path_ready") + is True + ) harbor_110_remote_ssh_publickey_auth_stalled = ( queue_status == "blocked_harbor_110_remote_ssh_publickey_auth_stalled" or queue_readback.get( @@ -367,6 +375,11 @@ def build_closure_verifier( ) is True ) + if harbor_110_remote_ssh_command_path_ready: + harbor_110_remote_control_channel_unavailable = False + harbor_110_remote_ssh_publickey_auth_stalled = False + harbor_110_remote_ssh_publickey_offer_timeout = False + harbor_110_remote_ssh_server_accepts_key_then_session_timeout = False harbor_110_repair_visible_running_jobs_api_stale = ( queue_status == 
"blocked_current_cd_waiting_behind_stale_harbor_110_repair_readback" @@ -525,6 +538,9 @@ def build_closure_verifier( "harbor_110_remote_control_channel_unavailable": ( harbor_110_remote_control_channel_unavailable ), + "harbor_110_remote_ssh_command_path_ready": ( + harbor_110_remote_ssh_command_path_ready + ), "harbor_110_remote_ssh_publickey_auth_stalled": ( harbor_110_remote_ssh_publickey_auth_stalled ), diff --git a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh index 3d46284a..c80ffef4 100755 --- a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh +++ b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh @@ -11,8 +11,10 @@ PORT="${PORT:-22}" NODE_EXPORTER_PORT="${NODE_EXPORTER_PORT:-9100}" CONNECT_TIMEOUT_SECONDS="${CONNECT_TIMEOUT_SECONDS:-4}" SSH_ATTEMPT_TIMEOUT_SECONDS="${SSH_ATTEMPT_TIMEOUT_SECONDS:-8}" +SSH_COMMAND_PATH_TIMEOUT_SECONDS="${SSH_COMMAND_PATH_TIMEOUT_SECONDS:-20}" NODE_EXPORTER_TIMEOUT_SECONDS="${NODE_EXPORTER_TIMEOUT_SECONDS:-8}" USERS=(${USERS:-wooo root git ollama}) +COMMAND_PATH_USER="${COMMAND_PATH_USER:-wooo}" tmp_dir="$(mktemp -d "${TMPDIR:-/tmp}/awoooi-110-ssh-auth.XXXXXX")" trap 'rm -rf "$tmp_dir"' EXIT @@ -28,6 +30,18 @@ ssh_base_opts=( -o ServerAliveCountMax=1 ) +ssh_command_opts=( + -p "$PORT" + -o ConnectTimeout="$CONNECT_TIMEOUT_SECONDS" + -o ConnectionAttempts=1 + -o BatchMode=yes + -o StrictHostKeyChecking=accept-new + -o ServerAliveInterval=2 + -o ServerAliveCountMax=1 + -o PasswordAuthentication=no + -o PreferredAuthentications=publickey +) + run_timeout() { local seconds="$1" shift @@ -157,6 +171,43 @@ probe_user() { printf 'SSH_AUTH user=%s mode=%s rc=%s classification=%s\n' "$user" "$mode" "$rc" "$classification" } +probe_command_path_user() { + local user="$1" + local stdout_path="$tmp_dir/${user}-command-path.out" + local stderr_path="$tmp_dir/${user}-command-path.err" + local rc classification marker_seen remote_user 
remote_user_match + + run_timeout "$SSH_COMMAND_PATH_TIMEOUT_SECONDS" \ + ssh "${ssh_command_opts[@]}" \ + "${user}@${HOST}" \ + 'printf "AWOOOI_110_COMMAND_PATH_READY=1\n"; printf "remote_user=%s\n" "$(id -un)"' \ + >"$stdout_path" 2>"$stderr_path" + rc=$? + + marker_seen=false + remote_user_match=false + remote_user="$(awk -F= '$1 == "remote_user" {print $2; exit}' "$stdout_path" 2>/dev/null || true)" + if grep -qx 'AWOOOI_110_COMMAND_PATH_READY=1' "$stdout_path" 2>/dev/null; then + marker_seen=true + fi + if [[ "$remote_user" == "$user" ]]; then + remote_user_match=true + fi + + if [[ "$marker_seen" == "true" && "$remote_user_match" == "true" ]]; then + classification="command_path_ready" + elif grep -Eiq 'timed out|not responding|Timeout' "$stderr_path"; then + classification="command_path_timeout" + elif grep -q 'Permission denied' "$stderr_path"; then + classification="command_path_permission_denied" + else + classification="command_path_unavailable" + fi + + printf 'SSH_COMMAND_PATH user=%s rc=%s classification=%s marker_seen=%s remote_user_match=%s\n' \ + "$user" "$rc" "$classification" "$marker_seen" "$remote_user_match" +} + echo "AWOOOI_110_SSH_PUBLICKEY_AUTH_DIAGNOSIS" echo "TARGET=${HOST}:${PORT}" probe_node_exporter @@ -166,8 +217,10 @@ for user in "${USERS[@]}"; do probe_user "$user" "publickey" done +probe_command_path_user "$COMMAND_PATH_USER" + for user in "${USERS[@]}"; do probe_user "$user" "password_disabled" done -echo "INTERPRETATION=server_accepts_key_then_timeout_means_check_110_session_pam_account_or_shell_path;publickey_offer_timeout_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path" +echo "INTERPRETATION=command_path_ready_overrides_stale_verbose_true_timeout;server_accepts_key_then_timeout_without_command_path_means_check_110_session_pam_account_or_shell_path;publickey_offer_timeout_means_check_110_authorized_keys_permissions_pam_or_account_lookup_path" diff --git 
# Validate the sshd configuration with `sshd -t` and report the outcome as a
# single `<label>=<status>` line.
#
# Arguments:
#   $1  label used as the key of the emitted status line
#       (e.g. SSHD_CONFIG_SYNTAX).
# Globals:
#   APPLY  read only; 0 selects check mode, any other value apply mode.
# Outputs:
#   stdout  `<label>=ok` when `sshd -t` succeeds, or
#           `<label>=unverified_requires_root rc=<rc> reason=<reason>` when it
#           fails in check mode.
#   stderr  `<label>=failed rc=<rc> reason=<reason>` when it fails in apply
#           mode.
# Returns:
#   0 on success and on a check-mode failure; the exit status of `sshd -t`
#   on an apply-mode failure.
check_sshd_config_syntax() {
  local label="$1"
  local output rc reason

  if output="$(sshd -t 2>&1)"; then
    echo "${label}=ok"
    return 0
  else
    # Capture the status inside the else branch. Reading $? after `fi` would
    # always yield 0, because an `if` compound whose condition fails and that
    # has no else branch reports success, which hides the sshd failure code.
    rc=$?
  fi

  reason="sshd_t_failed"
  # Host-key / permission wording in the output gets a more specific reason.
  # NOTE(review): presumably this is what an unprivileged `sshd -t` prints;
  # confirm against the target host.
  if printf '%s\n' "$output" | grep -Eiq 'hostkey|host key|permission|no hostkeys'; then
    reason="hostkeys_unavailable_or_permission"
  fi

  # Check mode reports the failure as unverified and does not fail the caller.
  if [ "$APPLY" -eq 0 ]; then
    echo "${label}=unverified_requires_root rc=$rc reason=$reason"
    return 0
  fi

  # Apply mode: propagate the real sshd exit status so the caller can stop
  # before acting on a configuration that did not validate.
  echo "${label}=failed rc=$rc reason=$reason" >&2
  return "$rc"
}
b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 4cef428c..a8bff783 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -212,6 +212,11 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None: assert "PasswordAuthentication=no" in text assert "PubkeyAuthentication=no" in text assert "NumberOfPasswordPrompts=0" in text + assert "SSH_COMMAND_PATH_TIMEOUT_SECONDS" in text + assert "COMMAND_PATH_USER" in text + assert "SSH_COMMAND_PATH user=%s rc=%s classification=%s" in text + assert "command_path_ready" in text + assert "AWOOOI_110_COMMAND_PATH_READY=1" in text assert "server_accepts_key_then_timeout" in text assert "publickey_offer_timeout" in text assert "NODE_EXPORTER=ok" in text @@ -245,6 +250,8 @@ def test_110_ssh_publickey_auth_repair_is_local_and_does_not_print_keys() -> Non assert "account_locked=" in text assert "shell_executable=" in text assert "SSHD_EFFECTIVE_CONFIG available=true" in text + assert "unverified_requires_root" in text + assert "hostkeys_unavailable_or_permission" in text assert "sshd -T -C" in text assert "pubkeyauthentication=" in text assert "authorized_keys_file_default=" in text