mirror of
https://github.com/ubicloud/ubicloud.git
synced 2025-10-05 22:31:57 +08:00
The Upgrade prog is responsible for matching the current Postgres version to the desired version. If there is a mismatch (current < desired), the Upgrade prog is launched and takes precedence over Convergence. Roughly, the Upgrade prog does the following: 1. Create a new "candidate standby" with the same version as the current one and wait for it to catch up. 2. Fence the current primary. 3. Upgrade the candidate standby to the desired version. 4. Switch the candidate to use a new timeline. 5. Take over from the current primary. 6. Prune any older-version servers and exit. The Convergence prog will take care of starting any new standbys needed. In case anything fails, we delete the candidate standby and unfence the primary to bring the database back. During the Upgrade, health checking is effectively disabled, as the auto-recovery would conflict with the several restarts of various versions on the candidate.
164 lines
5.9 KiB
Ruby
164 lines
5.9 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "forwardable"
|
|
require "aws-sdk-iam"
|
|
|
|
class Prog::Postgres::PostgresTimelineNexus < Prog::Base
|
|
subject_is :postgres_timeline
|
|
|
|
extend Forwardable
|
|
def_delegators :postgres_timeline, :blob_storage_client
|
|
|
|
def self.assemble(location_id:, parent_id: nil)
|
|
if parent_id && PostgresTimeline[parent_id].nil?
|
|
fail "No existing parent"
|
|
end
|
|
|
|
unless (location = Location[location_id])
|
|
fail "No existing location"
|
|
end
|
|
|
|
DB.transaction do
|
|
postgres_timeline = PostgresTimeline.create(
|
|
parent_id: parent_id,
|
|
access_key: SecureRandom.hex(16),
|
|
secret_key: SecureRandom.hex(32),
|
|
blob_storage_id: MinioCluster.first(project_id: Config.postgres_service_project_id, location_id: location.id)&.id,
|
|
location_id: location.id
|
|
)
|
|
Strand.create_with_id(postgres_timeline.id, prog: "Postgres::PostgresTimelineNexus", label: "start")
|
|
end
|
|
end
|
|
|
|
def before_run
|
|
when_destroy_set? do
|
|
if strand.label != "destroy"
|
|
hop_destroy
|
|
end
|
|
end
|
|
end
|
|
|
|
label def start
|
|
if postgres_timeline.blob_storage
|
|
setup_blob_storage
|
|
hop_setup_bucket
|
|
end
|
|
|
|
hop_wait_leader
|
|
end
|
|
|
|
label def setup_bucket
|
|
nap 1 if postgres_timeline.aws? && !aws_access_key_is_available?
|
|
|
|
# Create bucket for the timeline
|
|
postgres_timeline.create_bucket
|
|
postgres_timeline.set_lifecycle_policy
|
|
hop_wait_leader
|
|
end
|
|
|
|
label def wait_leader
|
|
nap 5 if postgres_timeline.leader.nil? || postgres_timeline.leader.strand.label != "wait"
|
|
hop_wait
|
|
end
|
|
|
|
label def wait
|
|
leader = postgres_timeline.leader
|
|
backups = leader ? postgres_timeline.backups : []
|
|
if leader.nil? && backups.empty? && Time.now - postgres_timeline.created_at > 10 * 24 * 60 * 60
|
|
Clog.emit("Self-destructing timeline as no leader or backups are present and it is older than 10 days") { postgres_timeline }
|
|
hop_destroy
|
|
end
|
|
|
|
nap 20 * 60 if postgres_timeline.blob_storage.nil?
|
|
|
|
# For the purpose of missing backup pages, we act like the very first backup
|
|
# is taken at the creation, which ensures that we would get a page if and only
|
|
# if no backup is taken for 2 days.
|
|
latest_backup_completed_at = backups.map(&:last_modified).max || postgres_timeline.created_at
|
|
if leader && latest_backup_completed_at < Time.now - 2 * 24 * 60 * 60 # 2 days
|
|
Prog::PageNexus.assemble("Missing backup at #{postgres_timeline}!", ["MissingBackup", postgres_timeline.id], postgres_timeline.ubid)
|
|
else
|
|
Page.from_tag_parts("MissingBackup", postgres_timeline.id)&.incr_resolve
|
|
end
|
|
|
|
if postgres_timeline.need_backup?
|
|
hop_take_backup
|
|
end
|
|
|
|
nap 20 * 60
|
|
end
|
|
|
|
label def take_backup
|
|
# It is possible that we already started backup but crashed before saving
|
|
# the state to database. Since backup taking is an expensive operation,
|
|
# we check if backup is truly needed.
|
|
if postgres_timeline.need_backup?
|
|
postgres_timeline.leader.vm.sshable.cmd("common/bin/daemonizer 'sudo postgres/bin/take-backup #{postgres_timeline.leader.resource.version}' take_postgres_backup")
|
|
postgres_timeline.latest_backup_started_at = Time.now
|
|
postgres_timeline.save_changes
|
|
end
|
|
|
|
hop_wait
|
|
end
|
|
|
|
label def destroy
|
|
decr_destroy
|
|
destroy_blob_storage if postgres_timeline.blob_storage
|
|
postgres_timeline.destroy
|
|
pop "postgres timeline is deleted"
|
|
end
|
|
|
|
def destroy_blob_storage
|
|
return destroy_aws_s3 if postgres_timeline.aws?
|
|
|
|
admin_client.admin_remove_user(postgres_timeline.access_key)
|
|
admin_client.admin_policy_remove(postgres_timeline.ubid)
|
|
end
|
|
|
|
def destroy_aws_s3
|
|
iam_client.list_attached_user_policies(user_name: postgres_timeline.ubid).attached_policies.each do |it|
|
|
iam_client.detach_user_policy(user_name: postgres_timeline.ubid, policy_arn: it.policy_arn)
|
|
iam_client.delete_policy(policy_arn: it.policy_arn)
|
|
end
|
|
|
|
iam_client.list_access_keys(user_name: postgres_timeline.ubid).access_key_metadata.each do |it|
|
|
iam_client.delete_access_key(user_name: postgres_timeline.ubid, access_key_id: it.access_key_id)
|
|
end
|
|
iam_client.delete_user(user_name: postgres_timeline.ubid)
|
|
end
|
|
|
|
def setup_blob_storage
|
|
return setup_aws_s3 if postgres_timeline.aws?
|
|
|
|
# Setup user keys and policy for the timeline
|
|
admin_client.admin_add_user(postgres_timeline.access_key, postgres_timeline.secret_key)
|
|
admin_client.admin_policy_add(postgres_timeline.ubid, postgres_timeline.blob_storage_policy)
|
|
admin_client.admin_policy_set(postgres_timeline.ubid, postgres_timeline.access_key)
|
|
end
|
|
|
|
def setup_aws_s3
|
|
iam_client.create_user(user_name: postgres_timeline.ubid)
|
|
policy = iam_client.create_policy(policy_name: postgres_timeline.ubid, policy_document: postgres_timeline.blob_storage_policy.to_json)
|
|
iam_client.attach_user_policy(user_name: postgres_timeline.ubid, policy_arn: policy.policy.arn)
|
|
response = iam_client.create_access_key(user_name: postgres_timeline.ubid)
|
|
postgres_timeline.update(access_key: response.access_key.access_key_id, secret_key: response.access_key.secret_access_key)
|
|
postgres_timeline.leader.incr_refresh_walg_credentials
|
|
end
|
|
|
|
def aws_access_key_is_available?
|
|
iam_client.list_access_keys(user_name: postgres_timeline.ubid).access_key_metadata.any? { |it| it.access_key_id == postgres_timeline.access_key }
|
|
end
|
|
|
|
def iam_client
|
|
@iam_client ||= Aws::IAM::Client.new(access_key_id: postgres_timeline.location.location_credential.access_key, secret_access_key: postgres_timeline.location.location_credential.secret_key, region: postgres_timeline.location.name)
|
|
end
|
|
|
|
def admin_client
|
|
@admin_client ||= Minio::Client.new(
|
|
endpoint: postgres_timeline.blob_storage_endpoint,
|
|
access_key: postgres_timeline.blob_storage.admin_user,
|
|
secret_key: postgres_timeline.blob_storage.admin_password,
|
|
ssl_ca_data: postgres_timeline.blob_storage.root_certs
|
|
)
|
|
end
|
|
end
|