ubicloud/spec/prog/test/vm_group_spec.rb
Jeremy Evans 7544717b96 Have last exiting child schedule parent strand
Previously, when reaping child strands, if there were no remaining
reapable children, the parent strand would only nap for 1 second,
which put unnecessary load on respirate unless at least one child
strand exited within that second.

Change this approach by having the exiting child strands, after they
release their lease, schedule their parent immediately if the parent
has no non-exited child strands.
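
The child-side logic is roughly the following sketch (the model,
column, and method names are assumptions for illustration, not the
actual Ubicloud code):

    # Run by an exiting child strand after its lease has been released.
    # Assumes a Sequel-backed Strand model with parent_id, exitval, and
    # schedule columns; names are illustrative.
    def schedule_parent_if_last_child(child)
      return unless child.parent_id

      # Any sibling strands that have not exited yet?
      siblings_remaining = Strand
        .where(parent_id: child.parent_id, exitval: nil)
        .exclude(id: child.id)
        .count.positive?

      # If this was the last child, wake the parent immediately instead
      # of waiting for it to poll on its own nap schedule.
      unless siblings_remaining
        Strand.where(id: child.parent_id).update(schedule: Sequel::CURRENT_TIMESTAMP)
      end
    end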

When doing this, you need to be careful to make sure there are no
race conditions that would delay the scheduling of the parent.
There are two potential situations you need to handle:

1. Multiple children exiting at the same time
2. Parent currently running while child is exiting

By waiting until after the child strand leases are released, there is
still a race condition for case 1, but it is a benign one: multiple
child strands exiting concurrently may each reschedule the parent
strand.  That isn't a problem.  The case to avoid is the one where
neither child strand schedules the parent, which rescheduling only
after releasing the lease should prevent.

To handle 2, inside reap, use Model#lock! to lock the parent strand.
This makes exiting child strands that UPDATE the parent strand with a
new schedule block until the parent strand's transaction commits.
However, it's possible that a child strand already UPDATED the parent.
To handle that situation, store the cached schedule value in a local
variable before calling lock!.  lock! implicitly does a reload, so
compare the schedule value after the reload.  If the schedule has
changed, a child likely scheduled the parent for immediate execution,
so nap 0 in that case.

Just in case there are unforeseen race conditions that are not
handled, nap for only 120 seconds when there are active children.  In
the worst case, this results in a 2 minute delay before the parent
runs.  However, compared to the previous 1 second polling, this can
potentially result in 120x less load from parent strands polling
children.
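
A minimal sketch of the reap-side handling described above (strand,
children_active?, and the surrounding structure are stand-ins, not the
real implementation):

    # Inside the parent's reap step (sketch).
    cached_schedule = strand.schedule

    # Sequel's Model#lock! takes a row lock and implicitly reloads the
    # row, so a schedule bump a child already made becomes visible here,
    # and any later bump blocks until this transaction commits.
    strand.lock!

    # ... reap exited child strands here ...

    if strand.schedule != cached_schedule
      # A child rescheduled the parent for immediate execution while we
      # were loading; run again right away.
      nap 0
    elsif children_active?
      # Fallback poll in case of an unforeseen race; worst case is a
      # two minute delay before the parent runs again.
      nap 120
    end
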
2025-06-28 03:30:43 +09:00


# frozen_string_literal: true

require_relative "../../model/spec_helper"

RSpec.describe Prog::Test::VmGroup do
  subject(:vg_test) { described_class.new(st) }

  let(:st) { described_class.assemble(boot_images: ["ubuntu-noble", "debian-12"]) }

  describe "#start" do
    it "hops to setup_vms" do
      expect { vg_test.start }.to hop("setup_vms")
    end
  end
describe "#setup_vms" do
it "hops to wait_children_ready" do
expect(vg_test).to receive(:update_stack).and_call_original
expect { vg_test.setup_vms }.to hop("wait_vms")
vm_images = vg_test.strand.stack.first["vms"].map { Vm[it].boot_image }
expect(vm_images).to eq(["ubuntu-noble", "debian-12", "ubuntu-noble"])
end
it "provisions at least one vm for each boot image" do
expect(vg_test).to receive(:update_stack).and_call_original
expect(vg_test).to receive(:frame).and_return({
"test_slices" => true,
"boot_images" => ["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"]
}).at_least(:once)
expect { vg_test.setup_vms }.to hop("wait_vms")
vm_images = vg_test.strand.stack.first["vms"].map { Vm[it].boot_image }
expect(vm_images).to eq(["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"])
end
it "hops to wait_children_ready if test_slices" do
expect(vg_test).to receive(:update_stack).and_call_original
expect(vg_test).to receive(:frame).and_return({
"storage_encrypted" => true,
"test_reboot" => true,
"test_slices" => true,
"vms" => [],
"boot_images" => ["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"]
}).at_least(:once)
expect { vg_test.setup_vms }.to hop("wait_vms")
end
end
describe "#wait_vms" do
it "hops to verify_vms if vms are ready" do
expect(vg_test).to receive(:frame).and_return({"vms" => ["111"]})
expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, display_state: "running"))
expect { vg_test.wait_vms }.to hop("verify_vms")
end
it "naps if vms are not running" do
expect(vg_test).to receive(:frame).and_return({"vms" => ["111"]})
expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, display_state: "creating"))
expect { vg_test.wait_vms }.to nap(10)
end
end
describe "#verify_vms" do
it "runs tests for the first vm" do
expect(vg_test).to receive(:frame).and_return({"vms" => ["111", "222"]})
expect(vg_test).to receive(:bud).with(Prog::Test::Vm, {subject_id: "111"})
expect(vg_test).to receive(:bud).with(Prog::Test::Vm, {subject_id: "222"})
expect { vg_test.verify_vms }.to hop("wait_verify_vms")
end
end
describe "#wait_verify_vms" do
it "hops to hop_wait_verify_vms" do
expect { vg_test.wait_verify_vms }.to hop("verify_host_capacity")
end
it "stays in wait_verify_vms" do
Strand.create(parent_id: st.id, prog: "Test::Vm", label: "start", stack: [{}], lease: Time.now + 10)
expect { vg_test.wait_verify_vms }.to nap(120)
expect(st).to receive(:lock!).and_wrap_original do |m|
# Pretend child strand updated schedule before lock.
# After the lock, shouldn't be possible as the child
# strand's update of the parent will block until
# parent strand commits.
st.this.update(schedule: Time.now - 1)
m.call
end
expect { vg_test.wait_verify_vms }.to nap(0)
end
end
describe "#verify_host_capacity" do
it "hops to verify_vm_host_slices" do
vm_host = instance_double(VmHost,
total_cpus: 16,
total_cores: 8,
used_cores: 3,
vms: [instance_double(Vm, cores: 2), instance_double(Vm, cores: 0)],
slices: [instance_double(VmHostSlice, cores: 1)],
cpus: [])
expect(vg_test).to receive_messages(vm_host: vm_host, frame: {"verify_host_capacity" => true})
expect { vg_test.verify_host_capacity }.to hop("verify_storage_backends")
end
it "skips if verify_host_capacity is not set" do
expect(vg_test).to receive(:frame).and_return({"verify_host_capacity" => false})
expect(vg_test).not_to receive(:vm_host)
expect { vg_test.verify_host_capacity }.to hop("verify_storage_backends")
end
it "fails if used cores do not match allocated VMs" do
vm_host = instance_double(VmHost,
total_cpus: 16,
total_cores: 8,
used_cores: 5,
vms: [instance_double(Vm, cores: 2), instance_double(Vm, cores: 0)],
slices: [instance_double(VmHostSlice, cores: 1)],
cpus: [])
expect(vg_test).to receive_messages(vm_host: vm_host, frame: {"verify_host_capacity" => true})
strand = instance_double(Strand)
allow(vg_test).to receive_messages(strand: strand)
expect(strand).to receive(:update).with(exitval: {msg: "Host used cores does not match the allocated VMs cores (vm_cores=2, slice_cores=1, spdk_cores=0, used_cores=5)"})
expect { vg_test.verify_host_capacity }.to hop("failed")
end
end
describe "#verify_storage_backends" do
it "fails if no vhost block backends" do
vm_host = instance_double(VmHost, vhost_block_backends: [])
expect(vg_test).to receive_messages(vm_host: vm_host)
expect { vg_test.verify_storage_backends }.to hop("failed")
end
it "checks that no SPDK volumes are present if vhost block backends exist" do
sshable = instance_double(Sshable)
vm_host = instance_double(VmHost,
vhost_block_backends: [instance_double(VhostBlockBackend)],
spdk_installations: [instance_double(SpdkInstallation, version: "23.09")],
sshable: sshable)
expect(vg_test).to receive_messages(vm_host: vm_host)
expect(sshable).to receive(:cmd).with("sudo /opt/spdk-23.09/scripts/rpc.py -s /home/spdk/spdk-23.09.sock bdev_get_bdevs").and_return("[]\n")
expect { vg_test.verify_storage_backends }.to hop("verify_vm_host_slices")
end
it "fails if SPDK volumes are present while vhost block backends exist" do
sshable = instance_double(Sshable)
vm_host = instance_double(VmHost,
vhost_block_backends: [instance_double(VhostBlockBackend)],
spdk_installations: [instance_double(SpdkInstallation, version: "23.09")],
sshable: sshable)
expect(vg_test).to receive_messages(vm_host: vm_host)
expect(sshable).to receive(:cmd).with("sudo /opt/spdk-23.09/scripts/rpc.py -s /home/spdk/spdk-23.09.sock bdev_get_bdevs").and_return('[{"name": "spdk_volume"}]')
expect { vg_test.verify_storage_backends }.to hop("failed")
end
end
describe "#verify_vm_host_slices" do
it "runs tests on vm host slices" do
expect(vg_test).to receive(:frame).and_return({"test_slices" => true, "vms" => ["111", "222", "333"]}).at_least(:once)
slice1 = instance_double(VmHostSlice, id: "456")
slice2 = instance_double(VmHostSlice, id: "789")
expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, vm_host_slice: slice1))
expect(Vm).to receive(:[]).with("222").and_return(instance_double(Vm, vm_host_slice: slice2))
expect(Vm).to receive(:[]).with("333").and_return(instance_double(Vm, vm_host_slice: nil))
expect { vg_test.verify_vm_host_slices }.to hop("start", "Test::VmHostSlices")
end
it "hops to verify_firewall_rules if tests are done" do
expect(vg_test).to receive(:frame).and_return({"test_slices" => true})
expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified VM Host Slices!"})
expect { vg_test.verify_vm_host_slices }.to hop("verify_firewall_rules")
end
end
describe "#verify_firewall_rules" do
it "hops to test_reboot if tests are done" do
expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Firewall Rules!"})
expect { vg_test.verify_firewall_rules }.to hop("verify_connected_subnets")
end
it "runs tests for the first firewall" do
subnet = instance_double(PrivateSubnet, firewalls: [instance_double(Firewall, id: "fw_id")])
expect(PrivateSubnet).to receive(:[]).and_return(subnet)
expect(vg_test).to receive(:frame).and_return({"subnets" => [subnet]})
expect { vg_test.verify_firewall_rules }.to hop("start", "Test::FirewallRules")
end
end
describe "#verify_connected_subnets" do
it "hops to test_reboot if tests are done" do
expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Connected Subnets!"})
expect { vg_test.verify_connected_subnets }.to hop("test_reboot")
end
it "runs tests for the first connected subnet" do
prj = Project.create_with_id(name: "project-1")
ps1 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps1", location_id: Location::HETZNER_FSN1_ID).subject
ps2 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps2", location_id: Location::HETZNER_FSN1_ID).subject
expect(vg_test).to receive(:frame).and_return({"subnets" => [ps1.id, ps2.id]}).at_least(:once)
expect { vg_test.verify_connected_subnets }.to hop("start", "Test::ConnectedSubnets")
end
it "runs tests for the second connected subnet" do
prj = Project.create_with_id(name: "project-1")
ps1 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps1", location_id: Location::HETZNER_FSN1_ID).subject
expect(ps1).to receive(:vms).and_return([instance_double(Vm, id: "vm1"), instance_double(Vm, id: "vm2")]).at_least(:once)
ps2 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps2", location_id: Location::HETZNER_FSN1_ID).subject
expect(PrivateSubnet).to receive(:[]).and_return(ps1, ps2)
expect(vg_test).to receive(:frame).and_return({"subnets" => [ps1.id, ps2.id]}).at_least(:once)
expect { vg_test.verify_connected_subnets }.to hop("start", "Test::ConnectedSubnets")
end
it "hops to destroy_resources if tests are done and reboot is not set" do
expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Connected Subnets!"})
expect(vg_test).to receive(:frame).and_return({"test_reboot" => false})
expect { vg_test.verify_connected_subnets }.to hop("destroy_resources")
end
end
describe "#test_reboot" do
it "hops to wait_reboot" do
expect(vg_test).to receive(:vm_host).and_return(instance_double(VmHost)).twice
expect(vg_test.vm_host).to receive(:incr_reboot).with(no_args)
expect { vg_test.test_reboot }.to hop("wait_reboot")
end
end
describe "#wait_reboot" do
before do
allow(vg_test).to receive(:vm_host).and_return(instance_double(VmHost))
allow(vg_test.vm_host).to receive(:strand).and_return(instance_double(Strand))
end
it "naps if strand is busy" do
expect(vg_test.vm_host.strand).to receive(:label).and_return("reboot")
expect { vg_test.wait_reboot }.to nap(20)
end
it "runs vm tests if reboot done" do
expect(vg_test.vm_host.strand).to receive(:label).and_return("wait")
expect(vg_test.vm_host.strand).to receive(:semaphores).and_return([])
expect { vg_test.wait_reboot }.to hop("verify_vms")
end
end
describe "#destroy_resources" do
it "hops to wait_resources_destroyed" do
allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
expect(Vm).to receive(:[]).with("vm_id").and_return(instance_double(Vm, incr_destroy: nil))
expect(PrivateSubnet).to receive(:[]).with("subnet_id").and_return(instance_double(PrivateSubnet, incr_destroy: nil, firewalls: []))
expect { vg_test.destroy_resources }.to hop("wait_resources_destroyed")
end
end
describe "#wait_resources_destroyed" do
it "hops to finish if all resources are destroyed" do
allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
expect(Vm).to receive(:[]).with("vm_id").and_return(nil)
expect(PrivateSubnet).to receive(:[]).with("subnet_id").and_return(nil)
expect { vg_test.wait_resources_destroyed }.to hop("finish")
end
it "naps if all resources are not destroyed yet" do
allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
expect(Vm).to receive(:[]).with("vm_id").and_return(instance_double(Vm))
expect { vg_test.wait_resources_destroyed }.to nap(5)
end
end
describe "#finish" do
it "exits" do
project = Project.create_with_id(name: "project-1")
allow(vg_test).to receive(:frame).and_return({"project_id" => project.id})
expect { vg_test.finish }.to exit({"msg" => "VmGroup tests finished!"})
end
end
describe "#failed" do
it "naps" do
expect { vg_test.failed }.to nap(15)
end
end
describe "#vm_host" do
it "returns first VM's host" do
vm_host = create_vm_host
vm = create_vm(vm_host_id: vm_host.id)
expect(vg_test).to receive(:frame).and_return({"vms" => [vm.id]})
expect(vg_test.vm_host).to eq(vm_host)
end
end
end