# Files
# awoooi/ops/reboot-recovery/full-stack-cold-start-baseline.yml
#
# 353 lines
# 13 KiB
# YAML
# Raw Permalink Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Baseline revision identifier. Quoted so every loader reads it as a string.
version: "2026-05-06.v1"

# Hosts this baseline covers. Keys are quoted so they stay strings rather
# than being loaded as integers; each value is a free-text role description.
scope:
  included_hosts:
    "110": "DevOps, registry, observability, Sentry, runners"
    "120": "K3s control plane and VIP"
    "121": "K3s peer node and DR drill cron"
    "188": "Data, AI, web, momo, SignOz, public nginx gateway"
  # Listed for the record only; the entry text states why it is excluded.
  excluded_hosts:
    "112": "Kali security host; recorded but not part of cold-start release gate"
# Guiding rules for the whole recovery, expressed as identifier-style strings.
principles:
  - recover_dependency_chain_before_workloads
  - keep_ai_auto_repair_observe_only_until_green
  - never_generic_restart_stateful_services
  - preserve_corrupt_parts_in_quarantine_not_delete
  - release_runners_and_crawlers_last
# Recovery phases, listed in ascending `order`. Each entry is one mapping with
# an `id`, a numeric `order`, and one or more check lists (`gates`,
# `public_https_routes`, `release_conditions`). `required_before` and
# `release_after` reference other phase ids from this same list.
phases:
  - id: P0-NETWORK
    order: 0
    gates:
      - ping_110_120_121_188
      - ssh_port_110_120_121_188
      - arp_evidence_or_monitor_mode_fallback

  - id: P0-188-DATA
    order: 10
    required_before:
      - P1-K3S
      - P2-WORKLOAD-ALERTCHAIN
    gates:
      - containerd_docker_postgresql_redis_ollama_nginx_active
      - postgresql_5432_accepting_connections
      - redis_pong
      - momo_db_not_restarting
      - signoz_http_reachable
      - momo_health_200

  - id: P0-110-REGISTRY-OBSERVABILITY
    order: 20
    required_before:
      - P1-K3S
      - P3-RUNNER-CD
    gates:
      - docker_active
      - harbor_v2_200_or_401
      - gitea_200_or_302
      - prometheus_ready
      - alertmanager_healthy
      - sentry_http_reachable
      - docker_containers_all_up
      - runner_watchdog_disabled
      - sentry_clickhouse_not_restarting
      - cadvisor_image_v0_47_0
      - cadvisor_cpu_cap_0_3

  - id: P1-K3S
    order: 30
    gates:
      - 120_can_reach_188_postgres
      - mon_and_mon1_ready
      - no_non_running_non_succeeded_pods
      - awoooi_dev_api_nodeport_200
      - vip_192_168_0_125_present

  - id: P2-WORKLOAD-ALERTCHAIN
    order: 40
    gates:
      - awoooi_api_vip_health_2xx_or_3xx
      - awoooi_web_vip_2xx_or_3xx
      - alertmanager_webhook_e2e_2xx_when_release_gate

  # This phase lists URLs to probe instead of named gates.
  - id: P2-PUBLIC-ROUTES
    order: 50
    public_https_routes:
      - https://awoooi.wooo.work/api/v1/health
      - https://awoooi.wooo.work/
      - https://awoooi.wooo.work/zh-TW/iwooos
      - https://vibework.wooo.work/
      - https://awooogo.wooo.work/
      - https://2026fifa.wooo.work/
      - https://agent.wooo.work/
      - https://mo.wooo.work/
      - https://mo.wooo.work/health
      - https://stock.wooo.work/
      - https://stock.wooo.work/healthz
      - https://stock.wooo.work/api/healthz
      - https://bitan.wooo.work/
      - https://tsenyang.com/
      - https://www.tsenyang.com/
      - https://vtuber.wooo.work/
      - https://gitea.wooo.work/
      - https://harbor.wooo.work/
      - https://registry.wooo.work/
      - https://sentry.wooo.work/
      - https://signoz.wooo.work/
      - https://langfuse.wooo.work/
      - https://aiops.wooo.work/

  - id: P2-SCHEDULES
    order: 60
    gates:
      - cron_active_188_110_120_121
      - docker_restart_textfile_fresh_188
      - docker_stats_textfile_fresh_188_110
      - systemd_units_textfile_fresh_110
      - backup_health_textfile_fresh_188_110
      - backup_from_110_success_under_25h
      - expected_backup_jobs_fresh_188_110
      - host_service_config_backup_success_under_48h
      - sentry_dedicated_backup_success_under_48h
      - backup_integrity_check_success_under_8d
      - backup_restore_drill_success_under_31d
      - velero_schedule_present_and_latest_completed_under_25h
      - velero_restore_test_cron_present
      - momo_scheduler_registered_jobs
      - momo_import_config_daily_sales_intake
      - momo_source_absence_evidence_when_freshness_blocked
      - stockplatform_system_freshness_ok
      - stockplatform_latest_trading_day_sources_current
      - k8s_cronjobs_unsuspended
      - k8s_failed_jobs_zero
      - dr_drill_cron_present_121

  # `release_after` names every P0-P2 phase defined above.
  - id: P3-HIGH-LOAD-WORK
    order: 70
    release_after:
      - P0-NETWORK
      - P0-188-DATA
      - P0-110-REGISTRY-OBSERVABILITY
      - P1-K3S
      - P2-WORKLOAD-ALERTCHAIN
      - P2-PUBLIC-ROUTES
      - P2-SCHEDULES
    release_conditions:
      - host_load_per_core_below_1_0_for_15m
      - no_restart_storm
      - clickhouse_merge_or_kafka_lag_not_increasing_two_checks
    examples:
      - sentry_snuba_consumers
      - momo_scheduler_chrome_crawlers
      - gitea_actions_jobs

  # Highest `order` in the list, so this phase comes last.
  - id: P3-RUNNER-CD
    order: 80
    release_conditions:
      - all_previous_gates_green
      - runner_cpuquota_200_percent
      - runner_memorymax_2g
      - watchdogusec_0
      - active_awoooi_cd_or_gitea_actions_task_containers_cpu_capped_during_cold_start
# Automation policy, split into a `before_green` and an `after_green` mapping.
# The two mappings share some key names, so they must stay nested; flattened,
# those keys would collide as duplicates.
automation_policy:
  before_green:
    ai_auto_repair: observe_only
    alertmanager_smoke_test: manual_or_release_gate_only
    stateful_service_actions: human_approval_required
    generic_restart: forbidden
  after_green:
    ai_auto_repair: limited_execution_for_stateless_exporters_only
    stateful_service_actions: human_in_the_loop
    runner_cd: controlled_release
# Plan B: the degraded-operation and recovery path.
# NOTE(review): the nesting of red_lines, triggers, host_paths,
# service_levels, timeline and closeout_states under `plan_b` was
# reconstructed from their content (B0-B5 levels, `return_to_plan_a`);
# confirm against the consumer of this file.
plan_b:
  purpose: degraded_operation_and_recovery_path

  # Action categories listed under `not_authorization`.
  not_authorization:
    - docker_daemon_restart
    - nginx_reload
    - firewall_or_iptables_change
    - kubectl_patch_live
    - secret_read
    - destructive_recovery
    - production_write

  red_lines:
    - do_not_call_route_200_full_stack_green
    - do_not_silence_correct_red_lights
    - do_not_run_unapproved_live_writes
    - do_not_release_high_risk_automation_before_dependency_chain_green

  # Each trigger pairs a condition with an immediate action and a
  # `max_declaration` whose value is one of the `service_levels` keys below.
  triggers:
    - id: backup_or_offsite_running
      condition: "02:00 backup, 03:00 offsite sync, or full verifier is still running"
      immediate_action: abort_or_delay_reboot_read_only_wait
      max_declaration: B0_ABORTED_BEFORE_REBOOT
    - id: p0_host_unreachable_after_15m
      condition: any_p0_host_ping_or_ssh_unreachable_after_reboot_15m
      immediate_action: stop_next_phase_and_enter_host_plan_b
      max_declaration: B1_HOST_RECOVERY_ONLY
    - id: data_188_unhealthy
      condition: postgresql_redis_momo_signoz_or_ai_provider_route_unhealthy
      immediate_action: freeze_k3s_deploy_runner_ai_auto_remediation
      max_declaration: B1_HOST_RECOVERY_ONLY
    - id: registry_observability_110_unhealthy
      condition: harbor_gitea_prometheus_or_alertmanager_unhealthy
      immediate_action: freeze_cd_deploy_image_pull_and_alert_outbound
      max_declaration: B2_CORE_SERVICE_READY
    - id: single_k3s_control_plane_degraded
      condition: one_of_120_or_121_unhealthy_while_peer_can_carry_control_plane
      immediate_action: keep_single_node_k3s_service_mode_and_preserve_ha_red_light
      max_declaration: B2_CORE_SERVICE_READY
    - id: route_green_only
      condition: public_route_green_but_db_backup_alert_or_scheduler_not_green
      immediate_action: record_route_green_only_and_continue_cross_surface_checks
      max_declaration: B2_CORE_SERVICE_READY
    - id: cold_start_warn_no_blocked
      condition: cold_start_warn_greater_than_zero_and_blocked_zero
      immediate_action: declare_available_degraded_with_explicit_warns
      max_declaration: B3_SERVICE_AVAILABLE_DEGRADED
    - id: credential_escrow_missing
      condition: credential_escrow_missing_count_greater_than_zero
      immediate_action: keep_dr_red_light_and_forbid_dr_complete_claim
      max_declaration: B4_FULL_STACK_GREEN

  # Per-surface degraded path plus the checks required to return to Plan A.
  # Besides the four hosts, `k3s` and `public_gateway` use the same shape.
  host_paths:
    "110":
      degraded_path: preserve_k3s_and_188_data_freeze_cd_runner_harbor_push_alert_outbound
      return_to_plan_a:
        - host_ready
        - harbor_gitea_prometheus_alertmanager_healthy
        - backup_status_no_110_core_blocker
        - cold_start_110_checks_green
    "120":
      degraded_path: let_121_carry_k3s_preserve_120_degraded_red_light_forbid_k3s_aa_claim
      return_to_plan_a:
        - ping_ok
        - ssh_ok
        - root_filesystem_rw
        - k3s_active
        - node_mon_ready
        - backup_configs_backup_all_offsite_cold_start_chain_green
    "121":
      degraded_path: let_120_carry_k3s_preserve_121_degraded_red_light_forbid_workload_balanced_claim
      return_to_plan_a:
        - ping_ok
        - ssh_ok
        - k3s_active
        - node_mon1_ready
        - api_web_placement_max_skew_less_or_equal_1
    "188":
      degraded_path: preserve_data_layer_freeze_batch_crawler_ai_flows
      return_to_plan_a:
        - host_ready
        - postgresql_healthy
        - redis_healthy
        - momo_parity_green
        - signoz_healthy
        - ai_provider_route_healthy
        - backup_status_no_188_core_blocker
    k3s:
      degraded_path: keep_healthy_pods_read_nodes_pods_events_vip_nodeport_no_blind_restart
      return_to_plan_a:
        - mon_ready
        - mon1_ready
        - api_web_worker_rollout_healthy
        - public_api_web_alert_webhook_scorecard_green
    public_gateway:
      degraded_path: preserve_internal_api_vip_data_no_unapproved_nginx_dns_tls_certbot_firewall_change
      return_to_plan_a:
        - nginx_config_owner_evidence
        - route_smoke
        - tls_acme_check
        - rollback_owner
        - post_check_plan

  # Declaration levels referenced by `triggers[].max_declaration`.
  service_levels:
    B0_ABORTED_BEFORE_REBOOT: preflight_no_go_no_runtime_write
    B1_HOST_RECOVERY_ONLY: host_layer_recovered_service_not_fully_verified
    B2_CORE_SERVICE_READY: core_service_available_cross_surface_checks_incomplete
    B3_SERVICE_AVAILABLE_DEGRADED: cold_start_blocked_zero_warn_greater_than_zero
    B4_FULL_STACK_GREEN: cold_start_warn_zero_blocked_zero_backup_offsite_db_alert_scheduler_green
    B5_DR_COMPLETE: b4_plus_credential_escrow_missing_zero_restore_and_escrow_evidence_complete

  # Checkpoints keyed by elapsed time (unit presumably minutes - confirm).
  timeline:
    T+0: freeze_cd_runner_ai_auto_remediation_heavy_batch_preserve_evidence
    T+5: decide_host_powered_booted_ready_enter_host_plan_b_if_needed
    T+15: stop_if_188_data_or_110_registry_observability_unhealthy
    T+30: route_green_only_guard_b2_ceiling
    T+60: run_cold_start_scorecard_and_record_warn_blocked
    T+120: open_incident_or_followup_no_unapproved_runtime_write

  closeout_states:
    RETURNED_TO_PLAN_A: blocker_cleared_and_full_plan_a_chain_verified
    SERVICE_AVAILABLE_DEGRADED: service_available_but_warn_dr_escrow_or_governance_gate_open
    OPEN_INCIDENT_REQUIRED: p0_host_data_k3s_gateway_backup_or_alert_hard_blocked
# Resource limits per host, then per service. Host keys are quoted so they
# stay strings.
# NOTE(review): the clause separators inside the Chinese `note` strings were
# missing (e.g. "512m/1grelay", "pressure110"), which made the notes run
# together. They are restored here as full-width "；" / "，"; confirm the
# exact characters against the authoritative copy of this file.
resource_guardrails:
  "110":
    cadvisor:
      image: gcr.io/cadvisor/cadvisor:v0.47.0
      cpus: 0.3
      mem_limit: 512m
    sentry_snuba_cold_start_consumers:
      cpus: 0.5
      persist_in: /opt/sentry/docker-compose.override.yml
    sentry_self_hosted_memory_limits:
      taskscheduler_mem_limit: 1g
      relay_mem_limit: 3g
      relay_memswap_limit: 4g
      persist_in: /opt/sentry/docker-compose.override.yml
      note: "taskscheduler 不得回退到 512m/1g；relay 不得回退到 2g 或以下造成長期 >85% memory-limit pressure；110 主機仍以 ClickHouse/Kafka/Snuba CPU caps 防止冷啟動過載。"
    actions_runner_systemd:
      cpu_quota: 200%
      memory_max: 2G
      watchdog: disabled
    gitea:
      cpus: 1.5
      memory: 3G
      restart_policy: always
      note: "2026-07-02 live recovery 將 gitea 從 2 CPU cap 收斂到 1.5 CPU；不得回退到無限額或 3 CPU；rollback 只允許暫回 2 CPU 並需 Prometheus readback。"
    stockplatform_v2_postgres:
      cpus: 1.5
      memory: 4G
      memswap: 6G
      restart_policy: unless-stopped
      note: "2026-07-02 live recovery 將 stockplatform-v2-postgres-1 補上 Docker resource guardrail；不得回退到 nanocpus=0 / memory=0 / restart=no 造成 110 host pressure。"
  "188":
    ollama_systemd:
      cpu_quota: 300%
      memory_high: 20G
      memory_max: 24G
      max_loaded_models: 1
      num_parallel: 1
      note: "188 本機 Ollama 是 cold-start 依賴與 Open-WebUI local endpoint；不得維持 disabled/inactive，也不得保留 700%/45G 無節制 guardrail。"
    litellm:
      mode: optional_retired
      note: "188 currently has no litellm container, unit, port 4000, or /opt/litellm tree. Do not hard-start a provider proxy without provider route/cost approval; P3 gate treats absent litellm as warning evidence."
    momo_scheduler:
      cpus: 2.0
      memory: 2G
    momo_db:
      cpus: 2.0
      memory: 4G
      memswap: 6G
      restart_policy: unless-stopped
      note: "2026-07-01 live recovery 將 momo-db 補上 Docker resource guardrail；不得回退到 nanocpus=0 / memory=0 / memswap=0。"
    signoz_clickhouse:
      memory: 24G
      note: do_not_lower_during_merge_backlog
# Commands and artifacts that define the pass/fail result for this baseline.
# NOTE(review): `backup_baseline` is nested here by inference from ordering;
# confirm it is not meant to be a top-level key.
authoritative_checks:
  read_only_monitor:
    command: bash scripts/reboot-recovery/full-stack-cold-start-check.sh --monitor-read-only --no-color
    expected_for_cron: PASS>0 WARN=0 BLOCKED=0
  release_gate:
    command: SSH_BATCH_MODE=yes bash scripts/reboot-recovery/full-stack-cold-start-check.sh --send-alert-test
    expected: PASS=64 WARN=0 BLOCKED=0
  textfile_metric:
    path: /home/wooo/node_exporter_textfiles/cold_start_recovery.prom
    # Single-quoted because the value contains braces, commas and double
    # quotes; the loaded string is unchanged.
    green_metric: 'awoooi_cold_start_last_result{host="110",scope="110_120_121_188",result="green"} 1'
  backup_baseline:
    path: ops/reboot-recovery/full-stack-backup-baseline.yml
    required_metrics:
      - awoooi_backup_health_monitor_up
      - awoooi_backup_job_fresh
      - awoooi_backup_integrity_fresh
      - awoooi_velero_restore_test_cron_present
      - awoooi_velero_restore_test_last_success_fresh