ubicloud/spec/prog/postgres/upgrade_postgres_resource_spec.rb
shikharbhardwaj 56bf890ae2
Implement Prog::Postgres::UpgradePostgresResource
The Upgrade prog is responsible for matching the current Postgres
version to the desired version. If there is a mismatch (current <
desired), the Upgrade prog is launched and takes precedence over
Convergence.

Roughly, the Upgrade prog does the following:
1. Create a new "candidate standby" with the same version as the current
   one and wait for it to catch up.
2. Fence the current primary.
3. Upgrade the candidate standby to the desired version.
4. Switch the candidate to use a new timeline.
5. Take over from the current primary.
6. Prune any older version servers and exit. The convergence prog will
   take care of starting any new standbys needed.

In case anything fails, we delete the candidate standby and unfence the
primary to bring the database back. During the upgrade, health checking
is effectively disabled because auto-recovery conflicts with the
repeated restarts of different Postgres versions on the candidate.
2025-09-15 19:41:58 +02:00

197 lines
7.5 KiB
Ruby

# frozen_string_literal: true
require_relative "../../model/spec_helper"
RSpec.describe Prog::Postgres::UpgradePostgresResource do
# Prog under test, built from a strand whose stack frame names the candidate
# server and the new timeline by ID.
subject(:nx) { described_class.new(st) }

# IDs shared between the strand's stack frame and the doubles below.
let(:candidate_server_id) { "6aea22d4-09ea-4ef3-b2e9-2868cc54a2fe" }
let(:new_timeline_id) { "c3b94c9a-c664-4b94-96ad-0df6047df115" }

let(:st) do
  frame = {
    "candidate_server_id" => candidate_server_id,
    "new_timeline_id" => new_timeline_id
  }
  Strand.new(id: "329d16c7-670c-4f64-927f-67c3fef180b1", stack: [frame])
end

let(:sshable) { instance_double(Sshable) }

# Candidate standby: its strand reports "wait" and its VM exposes +sshable+.
let(:candidate_server) do
  candidate_vm = instance_double(
    Vm,
    id: "1c7d59ee-8d46-8374-9553-6144490ecec5",
    sshable: sshable,
    ephemeral_net4: "1.1.1.1",
    private_subnets: [instance_double(PrivateSubnet)]
  )
  instance_double(
    PostgresServer,
    id: candidate_server_id,
    strand: instance_double(Strand, label: "wait"),
    synchronization_status: "ready",
    vm: candidate_vm
  )
end

let(:new_timeline) { instance_double(PostgresTimeline, id: new_timeline_id) }

# Current primary; it is also the resource's representative server by default.
let(:primary_server) do
  instance_double(
    PostgresServer,
    id: "fe8ffa37-1f58-47e1-9f49-15af06cec185",
    strand: instance_double(Strand, label: "wait"),
    synchronization_status: "ready"
  )
end

let(:postgres_resource) do
  instance_double(
    PostgresResource,
    id: "postgres_resource_id",
    servers: [primary_server, candidate_server],
    timeline: instance_double(PostgresTimeline, id: "timeline-id"),
    location: instance_double(Location, aws?: false),
    representative_server: primary_server,
    desired_version: "17"
  )
end

before do
  allow(nx).to receive(:postgres_resource).and_return(postgres_resource)

  # Model lookups by ID resolve to the doubles defined above.
  [candidate_server, primary_server].each do |server|
    allow(PostgresServer).to receive(:[]).with(server.id).and_return(server)
  end
  allow(PostgresTimeline).to receive(:[]).with(new_timeline.id).and_return(new_timeline)
end
# #start registers the "finish_upgrade" deadline and hops to wait_for_standby.
describe "#start" do
  it "registers a deadline" do
    two_hours = 2 * 60 * 60
    expect(nx).to receive(:register_deadline).with("finish_upgrade", two_hours)

    expect { nx.start }.to hop("wait_for_standby")
  end
end
# #wait_for_standby is exercised against the candidate's strand label:
# "start" leads to a nap, "wait" leads to a hop.
describe "#wait_for_standby" do
  it "waits for the standby to be ready" do
    starting_strand = instance_double(Strand, label: "start")
    expect(candidate_server).to receive(:strand).and_return(starting_strand)

    expect { nx.wait_for_standby }.to nap(5)
  end

  it "hops to wait_for_maintenance_window when the standby is ready" do
    waiting_strand = instance_double(Strand, label: "wait")
    expect(candidate_server).to receive(:strand).and_return(waiting_strand)

    expect { nx.wait_for_standby }.to hop("wait_for_maintenance_window")
  end
end
# #wait_for_maintenance_window is exercised against
# postgres_resource.in_maintenance_window?: false leads to a 10-minute nap,
# true leads to a fence request on the primary and a hop.
describe "#wait_for_maintenance_window" do
  it "waits for the maintenance window to be open" do
    expect(postgres_resource).to receive(:in_maintenance_window?).and_return(false)
    expect { nx.wait_for_maintenance_window }.to nap(10 * 60)
  end

  # The primary is not fenced yet at this point; this step only requests the
  # fence (incr_fence). Waiting for it is covered by "#wait_fence_primary".
  it "fences the primary and hops to wait_fence_primary when the maintenance window is open" do
    expect(postgres_resource).to receive(:in_maintenance_window?).and_return(true)
    expect(primary_server).to receive(:incr_fence)
    expect { nx.wait_for_maintenance_window }.to hop("wait_fence_primary")
  end
end
# #wait_fence_primary is exercised against the primary's strand label:
# "wait_fence" leads to a hop, anything else tested here leads to a nap.
describe "#wait_fence_primary" do
  it "waits for the primary to be fenced" do
    unfenced_strand = instance_double(Strand, label: "start")
    expect(primary_server).to receive(:strand).and_return(unfenced_strand)

    expect { nx.wait_fence_primary }.to nap(5)
  end

  it "hops to upgrade_standby if primary is fenced" do
    fenced_strand = instance_double(Strand, label: "wait_fence")
    expect(primary_server).to receive(:strand).and_return(fenced_strand)

    expect { nx.wait_fence_primary }.to hop("upgrade_standby")
  end
end
# #upgrade_standby is exercised once per status reported by
# sshable.d_check("upgrade_postgres").
describe "#upgrade_standby" do
  # Expects a single d_check of "upgrade_postgres" on +sshable+ that
  # reports +status+.
  def expect_upgrade_status(status)
    expect(sshable).to receive(:d_check).with("upgrade_postgres").and_return(status)
  end

  it "hops to update_metadata if the upgrade succeeds" do
    expect_upgrade_status("Succeeded")
    expect(sshable).to receive(:d_clean).with("upgrade_postgres")

    expect { nx.upgrade_standby }.to hop("update_metadata")
  end

  it "hops to upgrade_failed if the upgrade fails" do
    expect_upgrade_status("Failed")

    expect { nx.upgrade_standby }.to hop("upgrade_failed")
  end

  it "starts the upgrade if it is not started" do
    expect_upgrade_status("NotStarted")
    # "17" matches desired_version on the postgres_resource double.
    expect(sshable).to receive(:d_run).with("upgrade_postgres", "sudo", "postgres/bin/upgrade", "17")

    expect { nx.upgrade_standby }.to nap(5)
  end

  it "naps for 5 seconds if the upgrade is unknown" do
    expect_upgrade_status("Unknown")

    expect { nx.upgrade_standby }.to nap(5)
  end
end
# #update_metadata: the candidate and the resource are both updated to the new
# timeline and version, then the candidate receives four incr_* calls.
describe "#update_metadata" do
  it "switches the candidate to a new timeline" do
    new_version = "17"

    expect(candidate_server).to receive(:update)
      .with(timeline_id: "c3b94c9a-c664-4b94-96ad-0df6047df115", version: new_version)
    expect(postgres_resource).to receive(:update)
      .with(timeline: new_timeline, version: new_version)

    # Each of these must be sent to the candidate; all return nil.
    %i[incr_refresh_walg_credentials incr_configure incr_restart incr_unplanned_take_over].each do |message|
      expect(candidate_server).to receive(message).and_return(nil)
    end

    expect { nx.update_metadata }.to hop("wait_takeover")
  end
end
# #wait_takeover is exercised against the candidate's strand label and the
# resource's representative server.
describe "#wait_takeover" do
  it "hops to prune_old_servers when the takeover is done" do
    settled_strand = instance_double(Strand, label: "wait")
    expect(candidate_server).to receive(:strand).and_return(settled_strand)
    # The candidate has become the representative server.
    expect(postgres_resource).to receive(:representative_server).and_return(candidate_server)

    expect { nx.wait_takeover }.to hop("prune_old_servers")
  end

  it "waits for the takeover to be done" do
    in_progress_strand = instance_double(Strand, label: "prepare_for_unplanned_take_over")
    expect(candidate_server).to receive(:strand).and_return(in_progress_strand)

    expect { nx.wait_takeover }.to nap(5)
  end
end
# #prune_old_servers: with the candidate on "17" and the primary on "16",
# only the primary is expected to receive incr_destroy.
describe "#prune_old_servers" do
  it "prunes old servers" do
    expect(candidate_server).to receive(:version).and_return("17")
    expect(primary_server).to receive_messages(version: "16", incr_destroy: nil)

    expect { nx.prune_old_servers }.to hop("finish_upgrade")
  end
end
# #finish_upgrade exits the prog with a fixed message.
describe "#finish_upgrade" do
  it "finishes the upgrade" do
    exit_payload = {"msg" => "upgrade prog finished"}

    expect { nx.finish_upgrade }.to exit(exit_payload)
  end
end
# #upgrade_failed is the failure path: every example ends in a 6-hour nap, and
# the cleanup performed depends on which of candidate server / new timeline
# are still available.
describe "#upgrade_failed" do
  # Full cleanup: the upgrade unit's journal is read from the candidate, the
  # candidate and the new timeline are destroyed, and the fenced primary
  # (strand label "wait_fence") is unfenced.
  it "cleans up the candidate and new timeline, unfences the primary, and naps for 6 hours" do
    expect(sshable).to receive(:cmd).with("sudo journalctl -u upgrade_postgres").and_return("log1\nlog2\nlog3")
    expect(candidate_server).to receive(:incr_destroy)
    expect(new_timeline).to receive(:incr_destroy)
    expect(primary_server).to receive(:strand).and_return(instance_double(Strand, label: "wait_fence"))
    expect(primary_server).to receive(:incr_unfence)
    expect { nx.upgrade_failed }.to nap(6 * 60 * 60)
  end

  it "naps for 6 hours when the candidate server is nil" do
    expect(nx).to receive(:candidate_server).and_return(nil)
    expect(new_timeline).to receive(:incr_destroy)
    expect { nx.upgrade_failed }.to nap(6 * 60 * 60)
  end

  it "naps for 6 hours when no candidate server or timeline is available" do
    expect(nx).to receive(:candidate_server).and_return(nil)
    expect(nx).to receive(:new_timeline).and_return(nil)
    expect { nx.upgrade_failed }.to nap(6 * 60 * 60)
  end
end
# #candidate_server: the expected double is the one the shared setup returns
# from PostgresServer[] for the candidate's ID.
describe "#candidate_server" do
  it "returns the candidate server" do
    looked_up = nx.candidate_server

    expect(looked_up).to eq(candidate_server)
  end
end
# #new_timeline: the expected double is the one the shared setup returns
# from PostgresTimeline[] for the new timeline's ID.
describe "#new_timeline" do
  it "returns the new timeline" do
    looked_up = nx.new_timeline

    expect(looked_up).to eq(new_timeline)
  end
end
end