Previously, when reaping child processes, if there were no remaining reapable children, the parent strand would nap for only 1 second. That puts unnecessary load on respirate unless at least one child strand happens to exit within the next second.

Change this approach by having exiting child strands, after they release their lease, schedule the parent immediately if the parent has no non-exited child strands left. When doing this, be careful that no race condition can delay the scheduling of the parent. There are two situations to handle:

1. Multiple children exiting at the same time
2. The parent currently running while a child is exiting

By waiting until after the child strand's lease is released, situation 1 still has a race, but it is only that multiple child strands exiting concurrently could each reschedule the parent strand. That is not a problem. The case to avoid is neither child scheduling the parent, and rescheduling after the lease is released prevents it.

To handle situation 2, inside reap use Model#lock! to lock the parent strand. This makes exiting child strands block on their UPDATE of the parent strand's schedule until the parent strand's transaction commits. However, it is possible that a child strand already UPDATEd the parent. To handle that, store the cached schedule value in a local variable before calling lock!. Since lock! implicitly reloads the model, compare the schedule value after the reload; if it has changed, a child likely scheduled the parent for immediate execution, so nap 0 in that case.

Just in case there are unforeseen race conditions that are not handled, nap for only 120 seconds while there are active children. Worst case, that adds a 2 minute delay before the parent runs, but it can mean up to 120x less load from parent strands polling their children.
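The sketch below illustrates the intended ordering only; it is not the actual implementation. It assumes a Sequel-backed Strand model with parent_id, exitval, and schedule columns, a nap(seconds) helper provided by the surrounding prog framework, and made-up method names (schedule_parent_if_last_child, nap_after_reap).

# Sketch only: Strand is assumed to be a Sequel::Model with parent_id,
# exitval, and schedule columns; nap(seconds) is assumed to come from
# the surrounding prog framework. Method names here are illustrative.

# Child side: called after the exiting child's lease has been released.
def schedule_parent_if_last_child(child)
  return unless (parent_id = child.parent_id)

  # If no sibling is still running, wake the parent now. Two children
  # racing here may both issue this UPDATE, which is harmless; the case
  # to avoid is neither of them doing it.
  if Strand.where(parent_id: parent_id, exitval: nil).exclude(id: child.id).empty?
    Strand.where(id: parent_id).update(schedule: Sequel::CURRENT_TIMESTAMP)
  end
end

# Parent side, inside reap, once no reapable children remain.
def nap_after_reap(parent)
  cached_schedule = parent.schedule

  # lock! reloads the row under FOR UPDATE, so from this point a child's
  # UPDATE of the parent's schedule blocks until this transaction commits.
  parent.lock!

  if parent.schedule != cached_schedule
    nap 0 # a child already rescheduled the parent for immediate execution
  else
    # Children are still active; 120 seconds is only a safety net in
    # case an unforeseen race keeps every child from waking the parent.
    nap 120
  end
end

In this shape, the 120-second nap never determines how quickly the parent reacts in the normal case; it only bounds the delay if every child somehow fails to wake it. The spec below exercises this in #wait_verify_vms: nap(120) while a child strand holds a lease, and nap(0) when the schedule changes under lock!.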
# frozen_string_literal: true

require_relative "../../model/spec_helper"

RSpec.describe Prog::Test::VmGroup do
  subject(:vg_test) { described_class.new(st) }

  let(:st) { described_class.assemble(boot_images: ["ubuntu-noble", "debian-12"]) }

  describe "#start" do
    it "hops to setup_vms" do
      expect { vg_test.start }.to hop("setup_vms")
    end
  end

  describe "#setup_vms" do
    it "hops to wait_vms" do
      expect(vg_test).to receive(:update_stack).and_call_original
      expect { vg_test.setup_vms }.to hop("wait_vms")
      vm_images = vg_test.strand.stack.first["vms"].map { Vm[it].boot_image }
      expect(vm_images).to eq(["ubuntu-noble", "debian-12", "ubuntu-noble"])
    end

    it "provisions at least one vm for each boot image" do
      expect(vg_test).to receive(:update_stack).and_call_original
      expect(vg_test).to receive(:frame).and_return({
        "test_slices" => true,
        "boot_images" => ["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"]
      }).at_least(:once)
      expect { vg_test.setup_vms }.to hop("wait_vms")
      vm_images = vg_test.strand.stack.first["vms"].map { Vm[it].boot_image }
      expect(vm_images).to eq(["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"])
    end

    it "hops to wait_vms if test_slices" do
      expect(vg_test).to receive(:update_stack).and_call_original
      expect(vg_test).to receive(:frame).and_return({
        "storage_encrypted" => true,
        "test_reboot" => true,
        "test_slices" => true,
        "vms" => [],
        "boot_images" => ["ubuntu-noble", "ubuntu-jammy", "debian-12", "almalinux-9"]
      }).at_least(:once)
      expect { vg_test.setup_vms }.to hop("wait_vms")
    end
  end

  describe "#wait_vms" do
    it "hops to verify_vms if vms are ready" do
      expect(vg_test).to receive(:frame).and_return({"vms" => ["111"]})
      expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, display_state: "running"))
      expect { vg_test.wait_vms }.to hop("verify_vms")
    end

    it "naps if vms are not running" do
      expect(vg_test).to receive(:frame).and_return({"vms" => ["111"]})
      expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, display_state: "creating"))
      expect { vg_test.wait_vms }.to nap(10)
    end
  end

  describe "#verify_vms" do
    it "runs tests for the first vm" do
      expect(vg_test).to receive(:frame).and_return({"vms" => ["111", "222"]})
      expect(vg_test).to receive(:bud).with(Prog::Test::Vm, {subject_id: "111"})
      expect(vg_test).to receive(:bud).with(Prog::Test::Vm, {subject_id: "222"})
      expect { vg_test.verify_vms }.to hop("wait_verify_vms")
    end
  end

  describe "#wait_verify_vms" do
    it "hops to verify_host_capacity" do
      expect { vg_test.wait_verify_vms }.to hop("verify_host_capacity")
    end

    it "stays in wait_verify_vms" do
      Strand.create(parent_id: st.id, prog: "Test::Vm", label: "start", stack: [{}], lease: Time.now + 10)
      expect { vg_test.wait_verify_vms }.to nap(120)

      expect(st).to receive(:lock!).and_wrap_original do |m|
        # Pretend child strand updated schedule before lock.
        # After the lock, shouldn't be possible as the child
        # strand's update of the parent will block until
        # parent strand commits.
        st.this.update(schedule: Time.now - 1)
        m.call
      end
      expect { vg_test.wait_verify_vms }.to nap(0)
    end
  end

  describe "#verify_host_capacity" do
    it "hops to verify_vm_host_slices" do
      vm_host = instance_double(VmHost,
        total_cpus: 16,
        total_cores: 8,
        used_cores: 3,
        vms: [instance_double(Vm, cores: 2), instance_double(Vm, cores: 0)],
        slices: [instance_double(VmHostSlice, cores: 1)],
        cpus: [])
      expect(vg_test).to receive_messages(vm_host: vm_host, frame: {"verify_host_capacity" => true})
      expect { vg_test.verify_host_capacity }.to hop("verify_storage_backends")
    end

    it "skips if verify_host_capacity is not set" do
      expect(vg_test).to receive(:frame).and_return({"verify_host_capacity" => false})
      expect(vg_test).not_to receive(:vm_host)
      expect { vg_test.verify_host_capacity }.to hop("verify_storage_backends")
    end

    it "fails if used cores do not match allocated VMs" do
      vm_host = instance_double(VmHost,
        total_cpus: 16,
        total_cores: 8,
        used_cores: 5,
        vms: [instance_double(Vm, cores: 2), instance_double(Vm, cores: 0)],
        slices: [instance_double(VmHostSlice, cores: 1)],
        cpus: [])
      expect(vg_test).to receive_messages(vm_host: vm_host, frame: {"verify_host_capacity" => true})

      strand = instance_double(Strand)
      allow(vg_test).to receive_messages(strand: strand)
      expect(strand).to receive(:update).with(exitval: {msg: "Host used cores does not match the allocated VMs cores (vm_cores=2, slice_cores=1, spdk_cores=0, used_cores=5)"})

      expect { vg_test.verify_host_capacity }.to hop("failed")
    end
  end

  describe "#verify_storage_backends" do
    it "fails if no vhost block backends" do
      vm_host = instance_double(VmHost, vhost_block_backends: [])
      expect(vg_test).to receive_messages(vm_host: vm_host)
      expect { vg_test.verify_storage_backends }.to hop("failed")
    end

    it "checks that no SPDK volumes are present if vhost block backends exist" do
      sshable = instance_double(Sshable)
      vm_host = instance_double(VmHost,
        vhost_block_backends: [instance_double(VhostBlockBackend)],
        spdk_installations: [instance_double(SpdkInstallation, version: "23.09")],
        sshable: sshable)
      expect(vg_test).to receive_messages(vm_host: vm_host)
      expect(sshable).to receive(:cmd).with("sudo /opt/spdk-23.09/scripts/rpc.py -s /home/spdk/spdk-23.09.sock bdev_get_bdevs").and_return("[]\n")

      expect { vg_test.verify_storage_backends }.to hop("verify_vm_host_slices")
    end

    it "fails if SPDK volumes are present while vhost block backends exist" do
      sshable = instance_double(Sshable)
      vm_host = instance_double(VmHost,
        vhost_block_backends: [instance_double(VhostBlockBackend)],
        spdk_installations: [instance_double(SpdkInstallation, version: "23.09")],
        sshable: sshable)
      expect(vg_test).to receive_messages(vm_host: vm_host)
      expect(sshable).to receive(:cmd).with("sudo /opt/spdk-23.09/scripts/rpc.py -s /home/spdk/spdk-23.09.sock bdev_get_bdevs").and_return('[{"name": "spdk_volume"}]')

      expect { vg_test.verify_storage_backends }.to hop("failed")
    end
  end

  describe "#verify_vm_host_slices" do
    it "runs tests on vm host slices" do
      expect(vg_test).to receive(:frame).and_return({"test_slices" => true, "vms" => ["111", "222", "333"]}).at_least(:once)
      slice1 = instance_double(VmHostSlice, id: "456")
      slice2 = instance_double(VmHostSlice, id: "789")
      expect(Vm).to receive(:[]).with("111").and_return(instance_double(Vm, vm_host_slice: slice1))
      expect(Vm).to receive(:[]).with("222").and_return(instance_double(Vm, vm_host_slice: slice2))
      expect(Vm).to receive(:[]).with("333").and_return(instance_double(Vm, vm_host_slice: nil))

      expect { vg_test.verify_vm_host_slices }.to hop("start", "Test::VmHostSlices")
    end

    it "hops to verify_firewall_rules if tests are done" do
      expect(vg_test).to receive(:frame).and_return({"test_slices" => true})
      expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified VM Host Slices!"})
      expect { vg_test.verify_vm_host_slices }.to hop("verify_firewall_rules")
    end
  end

  describe "#verify_firewall_rules" do
    it "hops to verify_connected_subnets if tests are done" do
      expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Firewall Rules!"})
      expect { vg_test.verify_firewall_rules }.to hop("verify_connected_subnets")
    end

    it "runs tests for the first firewall" do
      subnet = instance_double(PrivateSubnet, firewalls: [instance_double(Firewall, id: "fw_id")])
      expect(PrivateSubnet).to receive(:[]).and_return(subnet)
      expect(vg_test).to receive(:frame).and_return({"subnets" => [subnet]})
      expect { vg_test.verify_firewall_rules }.to hop("start", "Test::FirewallRules")
    end
  end

  describe "#verify_connected_subnets" do
    it "hops to test_reboot if tests are done" do
      expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Connected Subnets!"})
      expect { vg_test.verify_connected_subnets }.to hop("test_reboot")
    end

    it "runs tests for the first connected subnet" do
      prj = Project.create_with_id(name: "project-1")
      ps1 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps1", location_id: Location::HETZNER_FSN1_ID).subject
      ps2 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps2", location_id: Location::HETZNER_FSN1_ID).subject
      expect(vg_test).to receive(:frame).and_return({"subnets" => [ps1.id, ps2.id]}).at_least(:once)
      expect { vg_test.verify_connected_subnets }.to hop("start", "Test::ConnectedSubnets")
    end

    it "runs tests for the second connected subnet" do
      prj = Project.create_with_id(name: "project-1")
      ps1 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps1", location_id: Location::HETZNER_FSN1_ID).subject
      expect(ps1).to receive(:vms).and_return([instance_double(Vm, id: "vm1"), instance_double(Vm, id: "vm2")]).at_least(:once)
      ps2 = Prog::Vnet::SubnetNexus.assemble(prj.id, name: "ps2", location_id: Location::HETZNER_FSN1_ID).subject
      expect(PrivateSubnet).to receive(:[]).and_return(ps1, ps2)
      expect(vg_test).to receive(:frame).and_return({"subnets" => [ps1.id, ps2.id]}).at_least(:once)
      expect { vg_test.verify_connected_subnets }.to hop("start", "Test::ConnectedSubnets")
    end

    it "hops to destroy_resources if tests are done and reboot is not set" do
      expect(vg_test.strand).to receive(:retval).and_return({"msg" => "Verified Connected Subnets!"})
      expect(vg_test).to receive(:frame).and_return({"test_reboot" => false})
      expect { vg_test.verify_connected_subnets }.to hop("destroy_resources")
    end
  end

  describe "#test_reboot" do
    it "hops to wait_reboot" do
      expect(vg_test).to receive(:vm_host).and_return(instance_double(VmHost)).twice
      expect(vg_test.vm_host).to receive(:incr_reboot).with(no_args)
      expect { vg_test.test_reboot }.to hop("wait_reboot")
    end
  end

  describe "#wait_reboot" do
    before do
      allow(vg_test).to receive(:vm_host).and_return(instance_double(VmHost))
      allow(vg_test.vm_host).to receive(:strand).and_return(instance_double(Strand))
    end

    it "naps if strand is busy" do
      expect(vg_test.vm_host.strand).to receive(:label).and_return("reboot")
      expect { vg_test.wait_reboot }.to nap(20)
    end

    it "runs vm tests if reboot done" do
      expect(vg_test.vm_host.strand).to receive(:label).and_return("wait")
      expect(vg_test.vm_host.strand).to receive(:semaphores).and_return([])
      expect { vg_test.wait_reboot }.to hop("verify_vms")
    end
  end

  describe "#destroy_resources" do
    it "hops to wait_resources_destroyed" do
      allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
      expect(Vm).to receive(:[]).with("vm_id").and_return(instance_double(Vm, incr_destroy: nil))
      expect(PrivateSubnet).to receive(:[]).with("subnet_id").and_return(instance_double(PrivateSubnet, incr_destroy: nil, firewalls: []))
      expect { vg_test.destroy_resources }.to hop("wait_resources_destroyed")
    end
  end

  describe "#wait_resources_destroyed" do
    it "hops to finish if all resources are destroyed" do
      allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
      expect(Vm).to receive(:[]).with("vm_id").and_return(nil)
      expect(PrivateSubnet).to receive(:[]).with("subnet_id").and_return(nil)

      expect { vg_test.wait_resources_destroyed }.to hop("finish")
    end

    it "naps if all resources are not destroyed yet" do
      allow(vg_test).to receive(:frame).and_return({"vms" => ["vm_id"], "subnets" => ["subnet_id"]}).twice
      expect(Vm).to receive(:[]).with("vm_id").and_return(instance_double(Vm))
      expect { vg_test.wait_resources_destroyed }.to nap(5)
    end
  end

  describe "#finish" do
    it "exits" do
      project = Project.create_with_id(name: "project-1")
      allow(vg_test).to receive(:frame).and_return({"project_id" => project.id})
      expect { vg_test.finish }.to exit({"msg" => "VmGroup tests finished!"})
    end
  end

  describe "#failed" do
    it "naps" do
      expect { vg_test.failed }.to nap(15)
    end
  end

  describe "#vm_host" do
    it "returns first VM's host" do
      vm_host = create_vm_host
      vm = create_vm(vm_host_id: vm_host.id)
      expect(vg_test).to receive(:frame).and_return({"vms" => [vm.id]})
      expect(vg_test.vm_host).to eq(vm_host)
    end
  end
end