diff --git a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py index 85499f80..fc4ac5de 100644 --- a/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/src/services/harbor_registry_controlled_recovery_receipt.py @@ -148,6 +148,9 @@ def validate_harbor_registry_controlled_recovery_receipt( "ssh_publickey_server_accepts_key_then_timeout_seen": ( ssh_diagnosis["server_accepts_key_then_timeout_seen"] ), + "ssh_publickey_runner_systemctl_show_timeout_seen": ( + ssh_diagnosis["runner_systemctl_show_timeout_seen"] + ), "ssh_publickey_node_exporter_ok": ssh_diagnosis["node_exporter_ok"], "ssh_publickey_port_tcp_open": ssh_diagnosis["ssh_port_tcp_open"], "ssh_local_repair_receipt_seen": ssh_local["receipt_seen"], @@ -337,6 +340,7 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: fields = _parse_key_values(output) marker_seen = "AWOOOI_110_SSH_PUBLICKEY_AUTH_DIAGNOSIS" in output auth_attempts = _ssh_auth_attempts(output) + systemd_units = _systemd_unit_signals(output) wooo_publickey = _auth_classification( auth_attempts, user="wooo", @@ -358,6 +362,11 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: permission_denied_count = sum( 1 for item in auth_attempts if item["classification"] == "permission_denied" ) + runner_systemctl_show_timeout_seen = any( + item["unit"].startswith("actions.runner") + and item["classifier"] == "systemctl_show_timeout" + for item in systemd_units + ) ssh_port_tcp_open = str(fields.get("SSH_PORT") or "") == "tcp_open" node_exporter_ok = str(fields.get("NODE_EXPORTER") or "") == "ok" return { @@ -370,6 +379,9 @@ def _parse_ssh_publickey_diagnosis_output(output: str) -> dict[str, Any]: "ssh_banner_seen": "SSH_BANNER=SSH-" in output, "auth_attempt_count": len(auth_attempts), "auth_classifications": auth_attempts, + "systemd_unit_signal_count": len(systemd_units), + "systemd_unit_signals": systemd_units, + "runner_systemctl_show_timeout_seen": runner_systemctl_show_timeout_seen, "wooo_publickey_classification": wooo_publickey, "publickey_offer_timeout_seen": publickey_offer_timeout_seen, "server_accepts_key_then_timeout_seen": ( @@ -400,6 +412,22 @@ def _ssh_auth_attempts(output: str) -> list[dict[str, Any]]: return attempts +def _systemd_unit_signals(output: str) -> list[dict[str, str]]: + signals: list[dict[str, str]] = [] + for line in output.splitlines(): + if not line.startswith("SYSTEMD_UNIT "): + continue + fields = _parse_key_values(line) + signals.append( + { + "unit": str(fields.get("unit") or ""), + "active_state": str(fields.get("active_state") or ""), + "classifier": str(fields.get("classifier") or ""), + } + ) + return signals + + def _auth_classification( attempts: list[dict[str, Any]], *, @@ -792,6 +820,8 @@ def _active_blockers( and not ssh_local["control_channel_metadata_ready"] ): blockers.append("ssh_publickey_server_accepts_key_then_timeout_on_wooo") + if ssh_diagnosis["runner_systemctl_show_timeout_seen"]: + blockers.append("runner_systemctl_show_timeout_on_110") if ssh_local["receipt_seen"] and not ssh_local["control_channel_metadata_ready"]: blockers.append("ssh_local_repair_receipt_metadata_not_ready") if not watchdog_check["receipt_seen"]: diff --git a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py index 1af68bd6..96422b6d 100644 --- a/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py +++ b/apps/api/tests/test_harbor_registry_controlled_recovery_receipt.py @@ -119,6 +119,7 @@ def test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: assert "ssh_publickey_server_accepts_key_then_timeout_on_wooo" in payload[ "active_blockers" ] + assert "runner_systemctl_show_timeout_on_110" in payload["active_blockers"] diagnosis = payload["readback"]["ssh_publickey_diagnosis"] assert diagnosis["diagnosis_ready"] is True assert diagnosis["node_exporter_ok"] is True @@ -128,6 +129,10 @@ def test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: "server_accepts_key_then_timeout" ) assert diagnosis["server_accepts_key_then_timeout_seen"] is True + assert diagnosis["runner_systemctl_show_timeout_seen"] is True + assert diagnosis["systemd_unit_signals"][0]["classifier"] == ( + "systemctl_show_timeout" + ) assert diagnosis["raw_output_returned"] is False phases = { phase["phase_id"]: phase @@ -141,6 +146,10 @@ def test_harbor_recovery_receipt_accepts_ssh_publickey_diagnosis() -> None: assert payload["rollups"]["ssh_publickey_wooo_publickey_classification"] == ( "server_accepts_key_then_timeout" ) + assert ( + payload["rollups"]["ssh_publickey_runner_systemctl_show_timeout_seen"] + is True + ) assert payload["input_redaction"]["ssh_publickey_diagnosis_output"][ "line_count" ] > 0 @@ -385,6 +394,7 @@ SSH_AUTH user=root mode=publickey rc=255 classification=permission_denied SSH_AUTH user=git mode=publickey rc=255 classification=preauth_timeout SSH_AUTH user=ollama mode=publickey rc=255 classification=preauth_timeout SSH_AUTH user=wooo mode=password_disabled rc=255 classification=permission_denied +SYSTEMD_UNIT unit=actions.runner.owenhytsai-awoooi.awoooi-110.service active_state=scrape_error classifier=systemctl_show_timeout INTERPRETATION=server_accepts_key_then_timeout_on_wooo_means_check_110_session_pam_account_or_shell_path """ diff --git a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh index 0e014a00..70b4b927 100755 --- a/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh +++ b/scripts/reboot-recovery/diagnose-110-ssh-publickey-auth.sh @@ -83,6 +83,25 @@ probe_node_exporter() { echo "NODE_LOAD1_PER_CPU=$load1_per_cpu" awk -v ratio="$load1_per_cpu" 'BEGIN {print "NODE_LOAD_CLASSIFIER=" (ratio > 1.5 ? "high_load" : "load_not_high")}' fi + awk ' + /^systemd_unit_info/ && /unit="(actions\.runner|awoooi-cd-lane)/ { + line=$0 + unit=line + active=line + substate=line + sub(/^.*unit="/, "", unit) + sub(/".*$/, "", unit) + sub(/^.*active_state="/, "", active) + sub(/".*$/, "", active) + sub(/^.*sub_state="/, "", substate) + sub(/".*$/, "", substate) + classifier=active + if (active == "scrape_error" && substate ~ /timed out/) { + classifier="systemctl_show_timeout" + } + printf "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s\n", unit, active, classifier + } + ' <<<"$metrics" } probe_tcp_banner() { diff --git a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py index 619c23b9..d98ad3aa 100644 --- a/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py +++ b/scripts/reboot-recovery/tests/test_cold_start_monitor_bounded_probes.py @@ -118,10 +118,13 @@ def test_110_ssh_publickey_auth_diagnosis_is_bounded_and_read_only() -> None: assert "NODE_EXPORTER=ok" in text assert "NODE_LOAD1_PER_CPU" in text assert "NODE_LOAD_CLASSIFIER" in text + assert "SYSTEMD_UNIT unit=%s active_state=%s classifier=%s" in text + assert "systemctl_show_timeout" in text assert "cat /home" not in text assert "cat ~/.ssh/authorized_keys" not in text assert "cat \"$home_dir/.ssh/authorized_keys\"" not in text - assert "systemctl" not in text + assert "systemctl show" not in text + assert "systemctl is-active" not in text assert "chmod" not in text assert "chown" not in text