The `unencrypted_vms` test runs concurrently with the github runner E2E tests. If the `verify_host_capacity` step overlaps with github runner VMs being destroyed, there is a race condition: the host's capacity has already been updated in `Nexus::destroy`, but the Vm's record hasn't been deleted yet, which happens in the following `Nexus::destroy_slice` label. An example of such a failure: https://github.com/ubicloud/ubicloud/actions/runs/13269484154/job/37045283146 In this failure, the sequence of events that caused the problem was: ``` {"runner_allocated":{... "vm_ubid":"vmm5zmzc4qst7qx1h9gjahnk9x" ...}} {"strand_hopped":{"from":"Vm::Nexus.wait","to":"Vm::Nexus.destroy"},"thread":"vmm5zmzc4qst7qx1h9gjahnk9x"} {"strand_hopped":{"from":"Test::VmGroup.wait_verify_vms","to":"Test::VmGroup.verify_host_capacity"}} {"strand_hopped":{"from":"Test::VmGroup.verify_host_capacity","to":"Test::VmGroup.failed"}} {"strand_hopped":{"from":"Vm::Nexus.destroy","to":"Vm::Nexus.destroy_slice"},"thread":"vmm5zmzc4qst7qx1h9gjahnk9x"} ``` To avoid sporadic failures, and given that `verify_host_capacity` is already exercised in a few relevant situations, we disable `verify_host_capacity` for `unencrypted_vms`. Also added more details to the failure's error message, so we have more data to look into if this fails again.
100 lines
3.7 KiB
Ruby
Executable File
100 lines
3.7 KiB
Ruby
Executable File
#!/usr/bin/env ruby
|
|
# frozen_string_literal: true
|
|
|
|
require_relative "../loader"
|
|
require "time"
|
|
require "optparse"
|
|
|
|
# Wall-clock time at script start; `log` subtracts it from the current time
# to print the elapsed minutes on every log line.
@started_at = Time.now
|
|
|
|
# Runs the selected E2E test cases against a Hetzner test host and blocks
# until every assembled test strand has finished.
#
# options - Hash with:
#           :test_cases - Array of test case names to run.
#           :vm_host_id - optional id of an existing vm host to use.
#
# Returns nothing meaningful. The process exits with status 1 (inside
# wait_until) as soon as any awaited strand reaches the "failed" label.
def main(options)
  all_test_cases = YAML.load_file("config/e2e_test_cases.yml").to_h { [_1["name"], _1] }
  boot_images = all_test_cases.values_at(*options[:test_cases]).flat_map { _1["images"] }.uniq

  hetzner_server_st = Prog::Test::HetznerServer.assemble(vm_host_id: options[:vm_host_id], default_boot_images: boot_images)
  wait_until(hetzner_server_st, "wait")

  tests_to_wait = []
  if options[:test_cases].include?("vm") && (test_case = all_test_cases["vm"])
    encrypted_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: true, test_reboot: true)
    log(encrypted_vms_st, "storage_encrypted: true")
    # Not running in parallel but waiting, since host is rebooted during test.
    # Rebooting makes the next test to test reboot practically as well depending
    # on when the host is rebooted and can cause flaky issues for github runner tests.
    wait_until(encrypted_vms_st)

    # Testing slices is not parallel with other tests as it requires a specific host state
    Semaphore.incr(hetzner_server_st.id, "allow_slices")
    wait_until(hetzner_server_st, "wait")

    sliced_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: true, test_reboot: false, test_slices: true)
    log(sliced_vms_st, "test_slices: true")
    wait_until(sliced_vms_st)

    Semaphore.incr(hetzner_server_st.id, "disallow_slices")
    wait_until(hetzner_server_st, "wait")

    # Host capacity verification is disabled here because this group runs
    # concurrently with the tests assembled below.
    unencrypted_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: false, test_reboot: false, verify_host_capacity: false)
    log(unencrypted_vms_st, "storage_encrypted: false")
    tests_to_wait << unencrypted_vms_st
  end

  # values_at always returns an Array, and an empty Array is still truthy, so
  # check for actual entries before assembling the github runner test.
  gh_test_cases = all_test_cases.values_at(*options[:test_cases].select { _1.include?("github_runner") })
  unless gh_test_cases.empty?
    tests_to_wait << Prog::Test::GithubRunner.assemble(gh_test_cases)
  end

  if options[:test_cases].include?("postgres_standard")
    tests_to_wait << Prog::Test::PostgresResource.assemble
  end
  if options[:test_cases].include?("postgres_ha")
    tests_to_wait << Prog::Test::HaPostgresResource.assemble
  end

  # Although wait_until will be blocked while checking the first one
  # it won't affect the total time as other strands will continue in parallel.
  # No need to make it parallel.
  tests_to_wait.each { |st| wait_until(st) }

  Semaphore.incr(hetzner_server_st.id, "destroy")
  wait_until(hetzner_server_st)
end
|
|
|
|
# Blocks until the given strand reaches +label+, re-checking every 10 seconds.
#
# st    - the strand to watch; it is reloaded in place on every iteration.
# label - label to wait for. With the default nil, the wait ends once
#         Strand[st.id] no longer returns a record (logged as "exit").
#
# If the strand is found at the "failed" label first, its exit message is
# logged, the strand is destroyed and the process exits with status 1.
def wait_until(st, label = nil)
  loop do
    current = Strand[st.id]
    break unless current
    break if current.label == label

    if current.label == "failed"
      log(st.reload, "FAILED: #{current.exitval.fetch("msg")}")
      st.destroy
      exit 1
    end

    log(st.reload, "waiting for #{label || "exit"}")
    sleep 10
  end

  log(st, "reached")
end
|
|
|
|
# Writes one status line to stdout for the given strand.
#
# The line holds the minutes elapsed since @started_at, the strand id, its
# "prog.label", the message, and a summary of related strand labels that
# depends on the strand's prog.
#
# st  - strand being reported on.
# msg - free-form message to include in the line.
def log(st, msg)
  prog_and_label = "#{st.prog}.#{st.label}"

  resources =
    case st.prog
    when "Test::HetznerServer"
      host_strand = Strand[st.stack.first["vm_host_id"]]
      "VmHost.#{host_strand&.label}"
    when "Test::VmGroup"
      vm_labels = st.stack.first["vms"].map { |vm_id| "Vm.#{Strand[vm_id]&.label}" }
      vm_labels.join(", ")
    when "Test::Vm"
      vm_strand = Strand[st.stack.first["subject_id"]]
      "Vm.#{vm_strand&.label}"
    else
      prog_and_label
    end

  elapsed_minutes = ((Time.now - @started_at) / 60).round(2)
  $stdout.write "#{elapsed_minutes}m | #{st.id} | #{prog_and_label} | #{msg} | #{resources}\n"
end
|
|
|
|
# Command-line entry point: parse the flags, freeze the app, then run the
# selected test cases. Only the "vm" test case runs unless --test-cases is given.
options = {test_cases: ["vm"]}

parser = OptionParser.new
parser.on("--vm-host-id VM_HOST_ID", "Use existing vm host") do |value|
  # A 26-character value is resolved through VmHost.from_ubid; any other
  # value is stored exactly as given.
  options[:vm_host_id] =
    if value.length == 26
      VmHost.from_ubid(value).id
    else
      value
    end
end
parser.on("--test-cases TEST_CASES", Array, "List of test cases to run separated by comma") do |value|
  options[:test_cases] = value
end
parser.parse!

clover_freeze

# Flush every log line immediately instead of buffering it.
$stdout.sync = true

main(options)
|