ubicloud/prog/postgres/postgres_timeline_nexus.rb
shikharbhardwaj 56bf890ae2
Implement Prog::Postgres::UpgradePostgresResource
The Upgrade prog is responsible for matching the current Postgres
version to the desired version. If there is a mismatch (current <
desired), the Upgrade prog is launched and takes precedence over
Convergence.

Roughly, the Upgrade prog does the following (see the sketch at the end
of this message):
1. Create a new "candidate standby" with the same version as the current
   one and wait for it to catch up.
2. Fence the current primary.
3. Upgrade the candidate standby to the desired version.
4. Switch the candidate to use a new timeline.
5. Take over from the current primary.
6. Prune any older version servers and exit. The convergence prog will
   take care of starting any new standbys needed.

In case anything fails, we delete the candidate standby and unfence the
primary to bring the database back. While the Upgrade prog runs, health
checking is effectively disabled, since auto-recovery would conflict
with the candidate being restarted several times across different
versions.
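
Expressed with the existing Prog::Base label/hop machinery, the flow
maps onto roughly the following skeleton (label names and step grouping
here are illustrative, not the prog's actual ones):

    class Prog::Postgres::UpgradePostgresResource < Prog::Base
      subject_is :postgres_resource

      label def start
        # 1. Bring up a candidate standby on the current version.
        hop_wait_catch_up
      end

      label def wait_catch_up
        # 2, 3. Once replication has caught up, fence the primary and
        # upgrade the candidate to the desired version.
        hop_switch_timeline
      end

      label def switch_timeline
        # 4, 5. Move the candidate onto a fresh timeline and have it
        # take over from the fenced primary.
        hop_take_over
      end

      label def take_over
        # 6. Prune old-version servers; Convergence starts new standbys.
        pop "upgrade complete"
      end
    end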
2025-09-15 19:41:58 +02:00

164 lines
5.9 KiB
Ruby

# frozen_string_literal: true

require "forwardable"
require "aws-sdk-iam"

class Prog::Postgres::PostgresTimelineNexus < Prog::Base
  subject_is :postgres_timeline

  extend Forwardable
  def_delegators :postgres_timeline, :blob_storage_client

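  # Creates the timeline record together with its controlling Strand. A
  # typical call (illustrative) looks like:
  #
  #   Prog::Postgres::PostgresTimelineNexus.assemble(location_id: location.id)
  #
  # parent_id, when given, must reference an existing timeline that the
  # new one descends from.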
  def self.assemble(location_id:, parent_id: nil)
    if parent_id && PostgresTimeline[parent_id].nil?
      fail "No existing parent"
    end

    unless (location = Location[location_id])
      fail "No existing location"
    end

    DB.transaction do
      postgres_timeline = PostgresTimeline.create(
        parent_id: parent_id,
        access_key: SecureRandom.hex(16),
        secret_key: SecureRandom.hex(32),
        blob_storage_id: MinioCluster.first(project_id: Config.postgres_service_project_id, location_id: location.id)&.id,
        location_id: location.id
      )
      Strand.create_with_id(postgres_timeline.id, prog: "Postgres::PostgresTimelineNexus", label: "start")
    end
  end

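  # Checked before every label: once the destroy semaphore is set, hop
  # to destroy from wherever we are (unless we are already there).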
  def before_run
    when_destroy_set? do
      if strand.label != "destroy"
        hop_destroy
      end
    end
  end

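  # Provision blob storage credentials first if the timeline has blob
  # storage attached; otherwise go straight to waiting for a leader.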
  label def start
    if postgres_timeline.blob_storage
      setup_blob_storage
      hop_setup_bucket
    end

    hop_wait_leader
  end

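  # Freshly created IAM access keys are not usable immediately, so on
  # AWS nap until the key shows up before touching the bucket.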
  label def setup_bucket
    nap 1 if postgres_timeline.aws? && !aws_access_key_is_available?

    # Create bucket for the timeline
    postgres_timeline.create_bucket
    postgres_timeline.set_lifecycle_policy

    hop_wait_leader
  end

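  # Backups require a leader server that has settled into its wait label.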
  label def wait_leader
    nap 5 if postgres_timeline.leader.nil? || postgres_timeline.leader.strand.label != "wait"
    hop_wait
  end

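  # Main loop: self-destruct abandoned timelines, page when backups go
  # missing, and hop to take_backup when one is due.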
  label def wait
    leader = postgres_timeline.leader
    backups = leader ? postgres_timeline.backups : []

    if leader.nil? && backups.empty? && Time.now - postgres_timeline.created_at > 10 * 24 * 60 * 60 # 10 days
      Clog.emit("Self-destructing timeline as no leader or backups are present and it is older than 10 days") { postgres_timeline }
      hop_destroy
    end

    nap 20 * 60 if postgres_timeline.blob_storage.nil?

    # For the purpose of missing backup pages, we act like the very first backup
    # is taken at creation time, which ensures that we get a page if and only
    # if no backup is taken for 2 days.
    latest_backup_completed_at = backups.map(&:last_modified).max || postgres_timeline.created_at
    if leader && latest_backup_completed_at < Time.now - 2 * 24 * 60 * 60 # 2 days
      Prog::PageNexus.assemble("Missing backup at #{postgres_timeline}!", ["MissingBackup", postgres_timeline.id], postgres_timeline.ubid)
    else
      Page.from_tag_parts("MissingBackup", postgres_timeline.id)&.incr_resolve
    end

    if postgres_timeline.need_backup?
      hop_take_backup
    end

    nap 20 * 60
  end

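  # The backup itself runs under daemonizer on the leader's VM; this
  # label only kicks it off and records when it started.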
  label def take_backup
    # It is possible that we already started a backup but crashed before
    # saving the state to the database. Since taking a backup is an
    # expensive operation, we check if a backup is truly needed.
    if postgres_timeline.need_backup?
      postgres_timeline.leader.vm.sshable.cmd("common/bin/daemonizer 'sudo postgres/bin/take-backup #{postgres_timeline.leader.resource.version}' take_postgres_backup")
      postgres_timeline.latest_backup_started_at = Time.now
      postgres_timeline.save_changes
    end

    hop_wait
  end

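  # Tear down blob storage credentials (MinIO or AWS IAM) before
  # deleting the timeline record itself.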
  label def destroy
    decr_destroy

    destroy_blob_storage if postgres_timeline.blob_storage
    postgres_timeline.destroy

    pop "postgres timeline is deleted"
  end

  def destroy_blob_storage
    return destroy_aws_s3 if postgres_timeline.aws?

    admin_client.admin_remove_user(postgres_timeline.access_key)
    admin_client.admin_policy_remove(postgres_timeline.ubid)
  end

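  # IAM refuses to delete a user that still has attached policies or
  # access keys, so those have to be removed first.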
  def destroy_aws_s3
    iam_client.list_attached_user_policies(user_name: postgres_timeline.ubid).attached_policies.each do |it|
      iam_client.detach_user_policy(user_name: postgres_timeline.ubid, policy_arn: it.policy_arn)
      iam_client.delete_policy(policy_arn: it.policy_arn)
    end

    iam_client.list_access_keys(user_name: postgres_timeline.ubid).access_key_metadata.each do |it|
      iam_client.delete_access_key(user_name: postgres_timeline.ubid, access_key_id: it.access_key_id)
    end

    iam_client.delete_user(user_name: postgres_timeline.ubid)
  end

  def setup_blob_storage
    return setup_aws_s3 if postgres_timeline.aws?

    # Setup user keys and policy for the timeline
    admin_client.admin_add_user(postgres_timeline.access_key, postgres_timeline.secret_key)
    admin_client.admin_policy_add(postgres_timeline.ubid, postgres_timeline.blob_storage_policy)
    admin_client.admin_policy_set(postgres_timeline.ubid, postgres_timeline.access_key)
  end

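  # AWS path: create a per-timeline IAM user, attach the timeline's blob
  # storage policy to it, and persist the generated key pair on the record.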
  def setup_aws_s3
    iam_client.create_user(user_name: postgres_timeline.ubid)
    policy = iam_client.create_policy(policy_name: postgres_timeline.ubid, policy_document: postgres_timeline.blob_storage_policy.to_json)
    iam_client.attach_user_policy(user_name: postgres_timeline.ubid, policy_arn: policy.policy.arn)
    response = iam_client.create_access_key(user_name: postgres_timeline.ubid)
    postgres_timeline.update(access_key: response.access_key.access_key_id, secret_key: response.access_key.secret_access_key)

    # The leader may not exist yet when this runs from `start`, so guard
    # the credential refresh.
    postgres_timeline.leader&.incr_refresh_walg_credentials
  end

  def aws_access_key_is_available?
    iam_client.list_access_keys(user_name: postgres_timeline.ubid).access_key_metadata.any? { |it| it.access_key_id == postgres_timeline.access_key }
  end

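  # iam_client authenticates against AWS with the location's stored
  # credentials; admin_client talks to the MinIO cluster as its admin
  # user. Both are memoized for the duration of the prog run.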
  def iam_client
    @iam_client ||= Aws::IAM::Client.new(access_key_id: postgres_timeline.location.location_credential.access_key, secret_access_key: postgres_timeline.location.location_credential.secret_key, region: postgres_timeline.location.name)
  end

  def admin_client
    @admin_client ||= Minio::Client.new(
      endpoint: postgres_timeline.blob_storage_endpoint,
      access_key: postgres_timeline.blob_storage.admin_user,
      secret_key: postgres_timeline.blob_storage.admin_password,
      ssl_ca_data: postgres_timeline.blob_storage.root_certs
    )
  end
end