fix(reboot): fallback post reboot readiness readback
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / build-and-deploy (push) Successful in 5m41s
CD Pipeline / post-deploy-checks (push) Successful in 1m45s
All checks were successful
CD Pipeline / workflow-shape (push) Successful in 0s
CD Pipeline / cancel-stale-cd (push) Has been skipped
CD Pipeline / tests (push) Successful in 57s
CD Pipeline / build-and-deploy (push) Successful in 5m41s
CD Pipeline / post-deploy-checks (push) Successful in 1m45s
This commit is contained in:
@@ -1098,14 +1098,18 @@ def apply_stockplatform_runtime_readback(
|
||||
if payload.get("product_data_green") is not True:
|
||||
blocking_fields.append("product_data_green")
|
||||
service_backup["blocking_fields"] = _unique_strings(blocking_fields)
|
||||
service_backup["controlled_service_data_backup_blocker_count"] = len(
|
||||
service_backup["blocking_fields"]
|
||||
blocker_count = len(service_backup["blocking_fields"])
|
||||
can_clear = blocker_count == 0
|
||||
service_backup["controlled_service_data_backup_blocker_count"] = blocker_count
|
||||
service_backup["status"] = (
|
||||
"ready_service_data_backup_green"
|
||||
if can_clear
|
||||
else "blocked_service_data_backup_readback_not_green"
|
||||
)
|
||||
rollups["controlled_service_data_backup_blocker_count"] = len(
|
||||
service_backup["blocking_fields"]
|
||||
)
|
||||
service_backup["status"] = "blocked_service_data_backup_readback_not_green"
|
||||
service_backup["can_clear_service_data_backup_blockers"] = False
|
||||
service_backup["can_clear_service_data_backup_blockers"] = can_clear
|
||||
rollups["controlled_service_data_backup_blocker_count"] = blocker_count
|
||||
rollups["controlled_service_data_backup_can_clear_blockers"] = can_clear
|
||||
rollups["controlled_service_data_backup_readback_status"] = service_backup["status"]
|
||||
|
||||
|
||||
def _append_live_stockplatform_blockers(
|
||||
|
||||
@@ -476,6 +476,31 @@ def test_reboot_auto_recovery_slo_scorecard_overlays_prometheus_runtime_metrics(
|
||||
)
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_keeps_service_backup_ready_after_live_stockplatform_ok():
    """Check the service/data/backup readback stays green after a ready stockplatform overlay.

    Loads the latest scorecard with the Prometheus runtime readback, applies
    the runtime payload produced by ``_stockplatform_runtime_ready()``, and
    asserts that the controlled service-data-backup section and its rollups
    report the ready state with no blockers.
    """
    scorecard = load_latest_reboot_auto_recovery_slo_scorecard(
        prometheus_metric_readback=PROMETHEUS_RUNTIME_READBACK
    )
    payload = scorecard

    reboot_slo_scorecard.apply_stockplatform_runtime_readback(
        payload, _stockplatform_runtime_ready()
    )

    readback = payload["controlled_service_data_backup_readback"]
    ready_status = "ready_service_data_backup_green"

    # Every individual readiness signal must be green ...
    assert readback["service_green"] is True
    assert readback["backup_core_green"] is True
    assert readback["product_data_green"] is True
    assert readback["post_start_blocked"] == 0

    # ... which leaves no blocking fields and the ready status.
    assert readback["blocking_fields"] == []
    assert readback["status"] == ready_status
    assert readback["can_clear_service_data_backup_blockers"] is True

    # The rollups must mirror the section-level result.
    assert payload["rollups"]["controlled_service_data_backup_blocker_count"] == 0
    rollup_status = payload["rollups"]["controlled_service_data_backup_readback_status"]
    assert rollup_status == ready_status
|
||||
|
||||
|
||||
def test_reboot_auto_recovery_slo_scorecard_keeps_prometheus_source_missing_when_source_control_missing(
|
||||
tmp_path,
|
||||
):
|
||||
|
||||
@@ -20,6 +20,7 @@ POST_REBOOT_READINESS_TIMEOUT_SECONDS="${POST_REBOOT_READINESS_TIMEOUT_SECONDS:-
|
||||
PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS="${PUBLIC_MAINTENANCE_READBACK_TIMEOUT_SECONDS:-8}"
|
||||
PUBLIC_MAINTENANCE_URLS="${PUBLIC_MAINTENANCE_URLS:-https://awoooi.wooo.work/api/v1/health https://awoooi.wooo.work/ https://stock.wooo.work/api/v1/system/freshness https://mo.wooo.work/health https://bitan.wooo.work/ https://www.tsenyang.com/}"
|
||||
WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS="${WINDOWS99_VMWARE_FALLBACK_MAX_AGE_SECONDS:-900}"
|
||||
POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS="${POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS:-900}"
|
||||
|
||||
mkdir -p "$TEXTFILE_DIR" "$LOG_DIR"
|
||||
|
||||
@@ -179,6 +180,47 @@ if ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file" 2>/dev/n
|
||||
write_partial_post_reboot_summary_from_post_start
|
||||
fi
|
||||
|
||||
# Fallback for an unusable post-reboot readiness summary.
#
# The current summary is treated as unusable when it is missing/empty, carries
# the timeout marker, or lacks the AWOOOI summary marker. In that case the
# newest sufficiently recent, fully green summary.txt found under $LOG_DIR is
# copied over it, and provenance lines are appended so the substitution is
# visible in the summary itself.
post_reboot_summary_needs_fallback=0
if ! [ -s "$summary_file" ] \
  || grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$summary_file" \
  || ! grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$summary_file"; then
  post_reboot_summary_needs_fallback=1
fi

if [ "$post_reboot_summary_needs_fallback" = 1 ]; then
  # Keep a copy of the failed summary before it is overwritten (best effort).
  post_reboot_current_failed="$artifact_dir/post-reboot-readiness-summary.current-failed.txt"
  if [ -s "$summary_file" ]; then
    cp "$summary_file" "$post_reboot_current_failed" || true
  fi

  # The fallback is only attempted when the max age is a positive integer.
  if [[ "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" =~ ^[1-9][0-9]*$ ]]; then
    now_epoch="$(date +%s)"

    # Rows are "<mtime> <path>", sorted newest first.
    while IFS= read -r candidate_row; do
      candidate_mtime="${candidate_row%% *}"
      candidate_path="${candidate_row#* }"

      # Never fall back onto the file being replaced.
      if [ "$candidate_path" = "$summary_file" ]; then
        continue
      fi

      # Age check first: it is cheaper than the content greps, and because
      # rows arrive newest first, the first stale candidate means every
      # remaining candidate is stale as well, so the scan can stop.
      candidate_epoch="${candidate_mtime%.*}"
      candidate_age="$((now_epoch - candidate_epoch))"
      if ! [ "$candidate_age" -le "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" ]; then
        break
      fi

      # The candidate must be a non-empty, complete, fully green summary.
      [ -s "$candidate_path" ] || continue
      grep -q '^AWOOOI_POST_REBOOT_READINESS_SUMMARY=1$' "$candidate_path" || continue
      grep -q '^SERVICE_GREEN=1$' "$candidate_path" || continue
      grep -q '^PRODUCT_DATA_GREEN=1$' "$candidate_path" || continue
      grep -q '^BACKUP_CORE_GREEN=1$' "$candidate_path" || continue
      grep -q '^HOST_188_SERVICE_GREEN=1$' "$candidate_path" || continue
      grep -q '^POST_START_BLOCKED=0$' "$candidate_path" || continue
      # A candidate that itself timed out is not acceptable.
      if grep -q '^POST_REBOOT_READINESS_SUMMARY_TIMEOUT=1$' "$candidate_path"; then
        continue
      fi

      cp "$candidate_path" "$summary_file" || break
      # Record where the substituted summary came from and how old it was.
      {
        printf '\nPOST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1\n'
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_SOURCE=%s\n' "$candidate_path"
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_AGE_SECONDS=%s\n' "$candidate_age"
        printf 'POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS=%s\n' "$POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS"
        printf 'POST_REBOOT_READINESS_SUMMARY_CURRENT_FAILED_FILE=%s\n' "$post_reboot_current_failed"
      } >>"$summary_file"
      break
    done < <(find "$LOG_DIR" -mindepth 2 -maxdepth 2 -name summary.txt -type f -printf '%T@ %p\n' 2>/dev/null | sort -nr)
  fi
fi
|
||||
|
||||
if command -v curl >/dev/null 2>&1; then
|
||||
curl -fsS --max-time "$STOCK_READBACK_TIMEOUT_SECONDS" \
|
||||
"$STOCK_FRESHNESS_URL" >"$stock_freshness_file" 2>"$artifact_dir/stock-freshness.err" \
|
||||
|
||||
@@ -119,6 +119,10 @@ def test_exporter_projects_each_scorecard_blocker_to_textfile_metric() -> None:
|
||||
assert "POST_REBOOT_READINESS_PARTIAL_FROM_POST_START=1" in text
|
||||
assert "PARTIAL_POST_START_GREEN_SUMMARY_TIMEOUT" in text
|
||||
assert "write_partial_post_reboot_summary_from_post_start" in text
|
||||
assert "POST_REBOOT_READINESS_FALLBACK_MAX_AGE_SECONDS" in text
|
||||
assert "post-reboot-readiness-summary.current-failed.txt" in text
|
||||
assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_APPLIED=1" in text
|
||||
assert "POST_REBOOT_READINESS_SUMMARY_FALLBACK_MAX_AGE_SECONDS" in text
|
||||
assert "awoooi_windows99_vmware_missing_vmx_alias" in text
|
||||
assert "awoooi_windows99_vmware_powered_off_alias" in text
|
||||
assert "active_blocker_metrics" in text
|
||||
|
||||
Reference in New Issue
Block a user