ubicloud/prog/postgres/converge_postgres_resource.rb
shikharbhardwaj 615050d2f8 Add Upgrade steps to Prog::Postgres::ConvergePostgresResource
The Converge prog is now also responsible for matching the current
Postgres version to the desired version. If there is a mismatch (current
< desired), the Converge prog is launched.

Roughly, the Converge prog does the following:
1. Provisions new servers. In case of upgrades, it provisions at most
   one new standby, and only if no existing standby is suitable for
   the upgrade.
2. Waits for the required servers to be ready.
3. Waits for the maintenance window to start.
4. Fences the primary server and launches pg_upgrade.
5. If the upgrade succeeds, replaces the current primary with the
   candidate standby.

In case the upgrade fails, we delete the candidate standby and unfence the
primary to bring the database back. During the upgrade, health checking
is effectively disabled, as auto-recovery would conflict with the several
restarts of different Postgres versions on the candidate.
2025-09-24 01:45:32 +02:00

143 lines
5.1 KiB
Ruby

# frozen_string_literal: true
require_relative "../../lib/util"
# Converges a Postgres resource toward its desired state. The prog
# provisions fresh servers, waits for them to become ready, and -- inside
# the maintenance window -- either performs a major version upgrade
# (fence the primary, run pg_upgrade on the candidate standby, take over)
# or recycles the representative server, before pruning surplus servers.
class Prog::Postgres::ConvergePostgresResource < Prog::Base
  subject_is :postgres_resource

  label def start
    # Allow up to 2 hours for the whole convergence before the deadline trips.
    register_deadline("prune_servers", 2 * 60 * 60)
    hop_provision_servers
  end

  label def provision_servers
    hop_wait_servers_to_be_ready if postgres_resource.has_enough_fresh_servers?

    # Only assemble a new server once every existing server has a VM host
    # assigned (or we are on AWS, where placement is not host-based), so the
    # anti-affinity exclusions below are computed from complete information.
    if postgres_resource.servers.all? { it.vm.vm_host } || postgres_resource.location.aws?
      exclude_host_ids = []
      exclude_availability_zones = []
      availability_zone = nil

      # On Hetzner (outside dev/e2e) spread servers across data centers by
      # excluding every host in a data center the resource already occupies.
      if !(Config.development? || Config.is_e2e) && postgres_resource.location.provider == HostProvider::HETZNER_PROVIDER_NAME
        used_data_centers = postgres_resource.servers.map { it.vm.vm_host.data_center }.uniq
        exclude_host_ids = VmHost.where(data_center: used_data_centers).map(&:id)
      end

      # On AWS, either spread across a fresh set of AZs or pin the new server
      # to the representative server's AZ, depending on the resource setting.
      if postgres_resource.location.provider == HostProvider::AWS_PROVIDER_NAME
        if postgres_resource.use_different_az_set?
          exclude_availability_zones = postgres_resource.servers.map { it.vm.nic.nic_aws_resource.subnet_az }.uniq
        else
          availability_zone = postgres_resource.representative_server.vm.nic.nic_aws_resource.subnet_az
        end
      end

      Prog::Postgres::PostgresServerNexus.assemble(resource_id: postgres_resource.id, timeline_id: postgres_resource.timeline.id, timeline_access: "fetch", exclude_host_ids: exclude_host_ids, exclude_availability_zones: exclude_availability_zones, availability_zone: availability_zone)
    end

    nap 5
  end

  label def wait_servers_to_be_ready
    # A server may have been destroyed since the last check; go back to
    # provisioning if the fresh-server count dropped below the target.
    hop_provision_servers unless postgres_resource.has_enough_fresh_servers?
    hop_wait_for_maintenance_window if postgres_resource.has_enough_ready_servers?
    nap 60
  end

  label def wait_for_maintenance_window
    nap 10 * 60 unless postgres_resource.in_maintenance_window?

    # Upgrades take priority over recycling: fence the primary so the
    # candidate standby can run pg_upgrade against a quiesced timeline.
    if postgres_resource.needs_upgrade?
      postgres_resource.representative_server.incr_fence
      hop_wait_fence_primary
    end
    hop_recycle_representative_server
  end

  label def wait_fence_primary
    # The primary's strand reaches "wait_fence" once it is fully fenced.
    return hop_upgrade_standby if postgres_resource.representative_server.strand.label == "wait_fence"
    nap 5
  end

  label def upgrade_standby
    # Drive the pg_upgrade systemd unit on the candidate standby to
    # completion: start it if needed, then poll until it succeeds or fails.
    case upgrade_candidate.vm.sshable.d_check("upgrade_postgres")
    when "Succeeded"
      upgrade_candidate.vm.sshable.d_clean("upgrade_postgres")
      hop_update_metadata
    when "Failed"
      hop_upgrade_failed
    when "NotStarted"
      upgrade_candidate.vm.sshable.d_run("upgrade_postgres", "sudo", "postgres/bin/upgrade", postgres_resource.version)
    end
    nap 5
  end

  label def update_metadata
    # The upgraded server starts a brand new timeline; record the new
    # version and timeline atomically.
    DB.transaction do
      new_timeline_id = Prog::Postgres::PostgresTimelineNexus.assemble(
        location_id: postgres_resource.location_id
      ).id
      upgrade_candidate.update(version: postgres_resource.version, timeline_id: new_timeline_id)
    end
    upgrade_candidate.incr_refresh_walg_credentials
    upgrade_candidate.incr_configure
    upgrade_candidate.incr_restart
    # We do an unplanned take over here because the primary is already fenced
    # above. With a planned take over, the candidate server would be stuck
    # waiting for the fence semaphore to be unset.
    upgrade_candidate.incr_unplanned_take_over
    hop_wait_takeover
  end

  label def wait_takeover
    nap 5 unless postgres_resource.representative_server&.strand&.label == "wait"
    hop_prune_servers
  end

  label def upgrade_failed
    # Capture the upgrade unit's journal for debugging, then destroy the
    # failed candidate and unfence the primary to bring the database back.
    if upgrade_candidate && !upgrade_candidate.destroy_set?
      logs = upgrade_candidate.vm.sshable.cmd("sudo journalctl -u upgrade_postgres")
      logs.split("\n").each { |line| Clog.emit("Postgres resource upgrade failed") { {resource_id: postgres_resource.id, log: line} } }
      upgrade_candidate.incr_destroy
    end
    postgres_resource.representative_server.incr_unfence if postgres_resource.representative_server.strand.label == "wait_fence"
    nap 6 * 60 * 60
  end

  label def recycle_representative_server
    if (rs = postgres_resource.representative_server) && !postgres_resource.ongoing_failover?
      hop_prune_servers unless rs.needs_recycling?
      hop_provision_servers unless postgres_resource.has_enough_ready_servers?
      # Tighten the deadline: a planned failover should complete quickly.
      register_deadline(nil, 10 * 60)
      rs.trigger_failover(mode: "planned")
    end
    nap 60
  end

  label def prune_servers
    # Below we only keep servers that do not need recycling and are of the
    # current version. If there are more such servers than required, we prefer
    # ready and recent servers (in that order).
    servers_to_keep = postgres_resource.servers
      .reject { it.representative_at || it.needs_recycling? || it.version != postgres_resource.version }
      .sort_by { [(it.strand.label == "wait") ? 0 : 1, Time.now - it.created_at] }
      .take(postgres_resource.target_standby_count) + [postgres_resource.representative_server]

    # Fix: a doubled `.each.each` here only worked because blockless `each`
    # returns an Enumerator; a single `each` is the intended call.
    (postgres_resource.servers - servers_to_keep).each(&:incr_destroy)
    servers_to_keep.each(&:incr_configure)

    postgres_resource.incr_update_billing_records
    pop "postgres resource is converged"
  end

  # Memoized lookup of the standby chosen to run pg_upgrade.
  def upgrade_candidate
    @upgrade_candidate ||= postgres_resource.upgrade_candidate_server
  end
end