ubicloud/spec/prog/postgres/postgres_timeline_nexus_spec.rb
shikharbhardwaj 56bf890ae2
Implement Prog::Postgres::UpgradePostgresResource
The Upgrade prog is responsible for matching the current Postgres
version to the desired version. If there is a mismatch (current <
desired), the Upgrade prog is launched and takes precedence over
Convergence.

Roughly, the Upgrade prog does the following:
1. Create a new "candidate standby" with the same version as the current
   one and wait for it to catch up.
2. Fence the current primary.
3. Upgrade the candidate standby to the desired version.
4. Switch the candidate to use a new timeline.
5. Take over from the current primary.
6. Prune any older version servers and exit. The convergence prog will
   take care of starting any new standbys needed.

In case anything fails, we delete the candidate standby and unfence the
primary to bring the database back. During the upgrade, health checking
is effectively disabled, because auto-recovery would conflict with the
several restarts of different versions on the candidate. A rough sketch
of this flow follows below.
2025-09-15 19:41:58 +02:00
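
For illustration, here is a minimal plain-Ruby sketch of that sequence, including the rollback path. It is not the actual Prog::Base state machine: the class name, the server methods (create_standby, caught_up?, fence, upgrade, switch_timeline, take_over, unfence), and the polling interval are hypothetical stand-ins for the real prog's labels and semaphores.

# Hypothetical illustration only (not the real Prog::Postgres::UpgradePostgresResource);
# it just mirrors the six steps and the failure handling described above.
class UpgradeFlowSketch
  def initialize(primary:, desired_version:)
    @primary = primary
    @desired_version = desired_version
    @candidate = nil
  end

  def run
    # 1. Bring up a standby on the current version and wait for it to catch up.
    @candidate = @primary.create_standby(version: @primary.version)
    sleep 1 until @candidate.caught_up?

    @primary.fence                       # 2. Stop accepting writes on the primary.
    @candidate.upgrade(@desired_version) # 3. Run pg_upgrade on the candidate.
    @candidate.switch_timeline           # 4. Archive WAL to a fresh timeline.
    @candidate.take_over                 # 5. Promote the candidate.

    # 6. Drop old-version servers; the convergence prog starts any new standbys.
    @primary.destroy
  rescue => e
    # On any failure, delete the candidate and unfence the primary to restore service.
    @candidate&.destroy
    @primary.unfence
    raise e
  end
end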

# frozen_string_literal: true

require_relative "../../model/spec_helper"

RSpec.describe Prog::Postgres::PostgresTimelineNexus do
  subject(:nx) { described_class.new(Strand.new(id: "8148ebdf-66b8-8ed0-9c2f-8cfe93f5aa77")) }

  let(:postgres_timeline) {
    instance_double(
      PostgresTimeline,
      id: "b253669e-1cf5-8ada-9337-5fc319690838",
      ubid: "ptp99pd7gwyp4jcvnzgrsd443g",
      blob_storage: instance_double(MinioCluster, url: "https://blob-endpoint", root_certs: "certs"),
      blob_storage_endpoint: "https://blob-endpoint",
      blob_storage_client: instance_double(Minio::Client),
      access_key: "dummy-access-key",
      secret_key: "dummy-secret-key",
      blob_storage_policy: {"Version" => "2012-10-17", "Statement" => [{"Action" => ["s3:GetBucketLocation"], "Effect" => "Allow", "Principal" => {"AWS" => ["*"]}, "Resource" => ["arn:aws:s3:::test"], "Sid" => ""}]},
      aws?: false
    )
  }

  before do
    allow(nx).to receive(:postgres_timeline).and_return(postgres_timeline)
  end
describe ".assemble" do
it "throws an exception if parent is not found" do
expect {
described_class.assemble(location_id: Location::HETZNER_FSN1_ID, parent_id: "69c0f4cd-99c1-8ed0-acfe-7b013ce2fa0b")
}.to raise_error RuntimeError, "No existing parent"
end
it "throws an exception if location is not found" do
expect {
described_class.assemble(location_id: nil)
}.to raise_error RuntimeError, "No existing location"
end
it "creates postgres timeline" do
st = described_class.assemble(location_id: Location::HETZNER_FSN1_ID)
postgres_timeline = PostgresTimeline[st.id]
expect(postgres_timeline).not_to be_nil
end
it "creates postgres timeline with blob storage when it exists" do
project = Project.create(name: "mc-project")
expect(Config).to receive(:minio_service_project_id).and_return(project.id).at_least(:once)
expect(Config).to receive(:postgres_service_project_id).and_return(project.id)
mc = Prog::Minio::MinioClusterNexus.assemble(project.id, "minio", Location::HETZNER_FSN1_ID, "minio-admin", 100, 1, 1, 1, "standard-2").subject
st = described_class.assemble(location_id: Location::HETZNER_FSN1_ID)
postgres_timeline = PostgresTimeline[st.id]
expect(postgres_timeline.blob_storage_id).to eq(mc.id)
end
end
describe "#before_run" do
it "hops to destroy when needed" do
expect(nx).to receive(:when_destroy_set?).and_yield
expect { nx.before_run }.to hop("destroy")
end
it "does not hop to destroy if already in the destroy state" do
expect(nx).to receive(:when_destroy_set?).and_yield
expect(nx.strand).to receive(:label).and_return("destroy")
expect { nx.before_run }.not_to hop("destroy")
end
end
describe "#start" do
let(:admin_blob_storage_client) { instance_double(Minio::Client) }
describe "when blob storage is minio" do
it "creates user and policies and hops" do
expect(postgres_timeline).to receive(:blob_storage).and_return(instance_double(MinioCluster, url: "https://blob-endpoint", root_certs: "certs", admin_user: "admin", admin_password: "secret")).at_least(:once)
expect(Minio::Client).to receive(:new).with(endpoint: "https://blob-endpoint", access_key: "admin", secret_key: "secret", ssl_ca_data: "certs").and_return(admin_blob_storage_client)
expect(admin_blob_storage_client).to receive(:admin_add_user).with(postgres_timeline.access_key, postgres_timeline.secret_key).and_return(200)
expect(admin_blob_storage_client).to receive(:admin_policy_add).with(postgres_timeline.ubid, postgres_timeline.blob_storage_policy).and_return(200)
expect(admin_blob_storage_client).to receive(:admin_policy_set).with(postgres_timeline.ubid, postgres_timeline.access_key).and_return(200)
expect { nx.start }.to hop("setup_bucket")
end
end
describe "when blob storage is aws s3" do
it "creates user and policies and hops" do
expect(postgres_timeline).to receive(:aws?).and_return(true)
expect(postgres_timeline).to receive(:location).and_return(instance_double(Location, name: "us-west-2", location_credential: instance_double(LocationCredential, access_key: "access-key", secret_key: "secret-key"))).at_least(:once)
client = Aws::IAM::Client.new(stub_responses: true)
expect(Aws::IAM::Client).to receive(:new).and_return(client)
client.stub_responses(:create_user)
client.stub_responses(:create_policy)
client.stub_responses(:attach_user_policy)
client.stub_responses(:create_access_key, access_key: {access_key_id: "access-key", secret_access_key: "secret-key", user_name: "username", status: "Active"})
expect(postgres_timeline).to receive(:update).with(access_key: "access-key", secret_key: "secret-key").and_return(postgres_timeline)
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer, strand: instance_double(Strand, label: "wait"))).at_least(:once)
expect(postgres_timeline.leader).to receive(:incr_refresh_walg_credentials)
expect { nx.start }.to hop("setup_bucket")
end
end
it "hops without creating bucket if blob storage is not configured" do
expect(postgres_timeline).to receive(:blob_storage).and_return(nil)
expect(nx).not_to receive(:setup_blob_storage)
expect { nx.start }.to hop("wait_leader")
end
end
describe "#setup_bucket" do
it "hops to wait_leader if bucket is created" do
expect(postgres_timeline).to receive(:create_bucket).and_return(true)
expect(postgres_timeline).to receive(:set_lifecycle_policy).and_return(true)
expect { nx.setup_bucket }.to hop("wait_leader")
end
it "naps if aws and the key is not available" do
expect(postgres_timeline).to receive(:aws?).and_return(true)
expect(postgres_timeline).to receive(:location).and_return(instance_double(Location, name: "us-west-2", location_credential: instance_double(LocationCredential, access_key: "access-key", secret_key: "secret-key"))).at_least(:once)
iam_client = Aws::IAM::Client.new(stub_responses: true)
expect(Aws::IAM::Client).to receive(:new).and_return(iam_client)
iam_client.stub_responses(:list_access_keys, access_key_metadata: [{access_key_id: "access-key"}])
expect(postgres_timeline).to receive(:access_key).and_return("not-access-key")
expect { nx.setup_bucket }.to nap(1)
end
it "hops to wait_leader if aws and the key is available" do
expect(postgres_timeline).to receive(:aws?).and_return(true)
expect(nx).to receive(:aws_access_key_is_available?).and_return(true)
expect(postgres_timeline).to receive(:create_bucket).and_return(true)
expect(postgres_timeline).to receive(:set_lifecycle_policy).and_return(true)
expect { nx.setup_bucket }.to hop("wait_leader")
end
end
describe "#wait_leader" do
it "naps if leader not ready" do
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer, strand: instance_double(Strand, label: "start"))).twice
expect { nx.wait_leader }.to nap(5)
end
it "hops if leader is ready" do
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer, strand: instance_double(Strand, label: "wait"))).twice
expect { nx.wait_leader }.to hop("wait")
end
end
describe "#wait" do
it "naps if blob storage is not configured" do
expect(postgres_timeline).to receive(:leader).and_return("something")
expect(postgres_timeline).to receive(:backups).and_return([])
expect(postgres_timeline).to receive(:blob_storage).and_return(nil)
expect { nx.wait }.to nap(20 * 60)
end
it "self-destructs if there's no leader, no backups and the timeline is old enough" do
expect(postgres_timeline).to receive(:leader).and_return(nil)
expect(postgres_timeline).to receive(:created_at).and_return(Time.now - 11 * 24 * 60 * 60)
expect(Clog).to receive(:emit).with(/Self-destructing timeline/)
expect { nx.wait }.to hop("destroy")
end
it "avoids API calls backups if there is no leader" do
expect(postgres_timeline).to receive(:leader).and_return(nil)
expect(postgres_timeline).to receive(:created_at).and_return(Time.now - 6 * 24 * 60 * 60).twice
expect(postgres_timeline).not_to receive(:backups)
expect(postgres_timeline).to receive(:need_backup?).and_return(false)
expect { nx.wait }.to nap(20 * 60)
end
it "hops to take_backup if backup is needed" do
expect(postgres_timeline).to receive(:need_backup?).and_return(true)
backup = Struct.new(:last_modified)
expect(postgres_timeline).to receive(:backups).and_return([instance_double(backup, last_modified: Time.now - 3 * 24 * 60 * 60)])
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer))
expect { nx.wait }.to hop("take_backup")
end
it "creates a missing backup page if last completed backup is older than 2 days" do
expect(postgres_timeline).to receive(:need_backup?).and_return(false)
backup = Struct.new(:last_modified)
expect(postgres_timeline).to receive(:backups).and_return([instance_double(backup, last_modified: Time.now - 3 * 24 * 60 * 60)])
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer))
expect { nx.wait }.to nap(20 * 60)
expect(Page.active.count).to eq(1)
end
it "resolves the missing page if last completed backup is more recent than 2 days" do
expect(postgres_timeline).to receive(:need_backup?).and_return(false)
backup = Struct.new(:last_modified)
expect(postgres_timeline).to receive(:backups).and_return([instance_double(backup, last_modified: Time.now - 1 * 24 * 60 * 60)])
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer))
page = instance_double(Page)
expect(page).to receive(:incr_resolve)
expect(Page).to receive(:from_tag_parts).and_return(page)
expect { nx.wait }.to nap(20 * 60)
end
it "naps if there is nothing to do" do
expect(postgres_timeline).to receive(:need_backup?).and_return(false)
backup = Struct.new(:last_modified)
expect(postgres_timeline).to receive(:backups).and_return([instance_double(backup, last_modified: Time.now - 1 * 24 * 60 * 60)])
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer))
expect { nx.wait }.to nap(20 * 60)
end
end
describe "#take_backup" do
it "hops to wait if backup is not needed" do
expect(postgres_timeline).to receive(:need_backup?).and_return(false)
expect { nx.take_backup }.to hop("wait")
end
it "takes backup if it is needed" do
expect(postgres_timeline).to receive(:need_backup?).and_return(true)
sshable = instance_double(Sshable)
expect(sshable).to receive(:cmd).with("common/bin/daemonizer 'sudo postgres/bin/take-backup 16' take_postgres_backup")
expect(postgres_timeline).to receive(:leader).and_return(instance_double(PostgresServer, resource: instance_double(PostgresResource, version: "16"), vm: instance_double(Vm, sshable: sshable))).at_least(:once)
expect(postgres_timeline).to receive(:latest_backup_started_at=)
expect(postgres_timeline).to receive(:save_changes)
expect { nx.take_backup }.to hop("wait")
end
end
describe "#destroy" do
let(:admin_blob_storage_client) { instance_double(Minio::Client) }
it "completes destroy even if dns zone and blob_storage are not configured" do
expect(postgres_timeline).to receive(:blob_storage).and_return(nil)
expect(postgres_timeline).to receive(:destroy)
expect { nx.destroy }.to exit({"msg" => "postgres timeline is deleted"})
end
describe "when blob storage is minio" do
it "destroys blob storage and postgres timeline" do
expect(postgres_timeline).to receive(:blob_storage).and_return(instance_double(MinioCluster, url: "https://blob-endpoint", root_certs: "certs", admin_user: "admin", admin_password: "secret")).at_least(:once)
expect(postgres_timeline).to receive(:destroy)
expect(Minio::Client).to receive(:new).with(endpoint: postgres_timeline.blob_storage_endpoint, access_key: "admin", secret_key: "secret", ssl_ca_data: "certs").and_return(admin_blob_storage_client)
expect(admin_blob_storage_client).to receive(:admin_remove_user).with(postgres_timeline.access_key).and_return(200)
expect(admin_blob_storage_client).to receive(:admin_policy_remove).with(postgres_timeline.ubid).and_return(200)
expect { nx.destroy }.to exit({"msg" => "postgres timeline is deleted"})
end
end
describe "when blob storage is aws s3" do
before do
expect(postgres_timeline).to receive(:aws?).and_return(true)
expect(postgres_timeline).to receive(:location).and_return(instance_double(Location, name: "us-west-2", location_credential: instance_double(LocationCredential, access_key: "access-key", secret_key: "secret-key"))).at_least(:once)
end
it "destroys blob storage and postgres timeline" do
client = Aws::IAM::Client.new(stub_responses: true)
expect(Aws::IAM::Client).to receive(:new).and_return(client)
client.stub_responses(:delete_user)
client.stub_responses(:list_attached_user_policies, attached_policies: [{policy_arn: "arn:aws:iam::aws:policy/AmazonS3FullAccess"}])
client.stub_responses(:delete_policy)
client.stub_responses(:list_access_keys, access_key_metadata: [{access_key_id: "access-key"}])
client.stub_responses(:delete_access_key)
expect(postgres_timeline).to receive(:destroy)
expect { nx.destroy }.to exit({"msg" => "postgres timeline is deleted"})
end
end
end
end