ubicloud/bin/ci
Hadi Moshayedi eb9aee9e3f Don't verify_host_capacity for unencrypted_vms E2E test.
The `unencrypted_vms` test runs concurrently with the github runner E2E tests.
If the `verify_host_capacity` step overlaps with github runner VMs being
destroyed, there is a race: the host's capacity has already been updated in
`Nexus::destroy`, but the Vm's record hasn't been deleted yet; that deletion
happens later, in the following `Nexus::destroy_slice` label.

An example of such a failure:
https://github.com/ubicloud/ubicloud/actions/runs/13269484154/job/37045283146

In that failure, the sequence of events that caused the problem was:

```
{"runner_allocated":{... "vm_ubid":"vmm5zmzc4qst7qx1h9gjahnk9x" ...}}
{"strand_hopped":{"from":"Vm::Nexus.wait","to":"Vm::Nexus.destroy"},"thread":"vmm5zmzc4qst7qx1h9gjahnk9x"}
{"strand_hopped":{"from":"Test::VmGroup.wait_verify_vms","to":"Test::VmGroup.verify_host_capacity"}}
{"strand_hopped":{"from":"Test::VmGroup.verify_host_capacity","to":"Test::VmGroup.failed"}}
{"strand_hopped":{"from":"Vm::Nexus.destroy","to":"Vm::Nexus.destroy_slice"},"thread":"vmm5zmzc4qst7qx1h9gjahnk9x"}
```
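
To make the window concrete, here is a minimal, self-contained Ruby sketch of why a capacity check that compares the host's bookkeeping against the remaining Vm records can fail in that window. The structs and the `capacity_consistent?` check are invented for illustration; this is not the actual `Prog::Test::VmGroup` or `Vm::Nexus` code.

```
# Illustrative only: plain structs standing in for the real VmHost/Vm models.
FakeHost = Struct.new(:used_cores, :vms)
FakeVm = Struct.new(:cores)

# A check of this shape compares the host's bookkeeping counter against the
# cores of the Vm records that still exist.
def capacity_consistent?(host)
  host.vms.sum(&:cores) == host.used_cores
end

host = FakeHost.new(4, [FakeVm.new(2), FakeVm.new(2)])

# Teardown of one VM happens across two labels: destroy releases the host
# capacity first, destroy_slice deletes the Vm record later.
host.used_cores -= 2          # after Nexus::destroy
capacity_consistent?(host)    # => false if checked inside the window

host.vms.pop                  # after Nexus::destroy_slice
capacity_consistent?(host)    # => true, consistent again
```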

To avoid this sporadic failure, and given that `verify_host_capacity` is
already exercised in a few relevant situations, we disable
`verify_host_capacity` for `unencrypted_vms`.

Also added more details to the failure's error message, so we have more data
to look into if this fails again.
2025-02-12 09:01:57 -08:00


#!/usr/bin/env ruby
# frozen_string_literal: true

require_relative "../loader"
require "time"
require "optparse"

@started_at = Time.now
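
# Runs the end-to-end suite: assembles the Hetzner test host (optionally reusing
# an existing vm host) with the required boot images, exercises the selected test
# cases (VM groups, github runners, postgres), and finally destroys the host.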
def main(options)
  all_test_cases = YAML.load_file("config/e2e_test_cases.yml").to_h { [_1["name"], _1] }
  boot_images = all_test_cases.values_at(*options[:test_cases]).flat_map { _1["images"] }.uniq

  hetzner_server_st = Prog::Test::HetznerServer.assemble(vm_host_id: options[:vm_host_id], default_boot_images: boot_images)
  wait_until(hetzner_server_st, "wait")

  tests_to_wait = []

  if options[:test_cases].include?("vm") && (test_case = all_test_cases["vm"])
    encrypted_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: true, test_reboot: true)
    log(encrypted_vms_st, "storage_encrypted: true")
    # Not running in parallel but waiting here, since the host is rebooted during this
    # test. Rebooting would effectively turn the next test into a reboot test as well,
    # depending on when the host reboots, and can cause flakiness in the github runner tests.
    wait_until(encrypted_vms_st)

    # Testing slices is not parallel with other tests as it requires a specific host state
    Semaphore.incr(hetzner_server_st.id, "allow_slices")
    wait_until(hetzner_server_st, "wait")

    sliced_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: true, test_reboot: false, test_slices: true)
    log(sliced_vms_st, "test_slices: true")
    wait_until(sliced_vms_st)

    Semaphore.incr(hetzner_server_st.id, "disallow_slices")
    wait_until(hetzner_server_st, "wait")

    unencrypted_vms_st = Prog::Test::VmGroup.assemble(boot_images: test_case["images"], storage_encrypted: false, test_reboot: false, verify_host_capacity: false)
    log(unencrypted_vms_st, "storage_encrypted: false")
    tests_to_wait << unencrypted_vms_st
  end
  if (gh_test_cases = all_test_cases.values_at(*options[:test_cases].select { _1.include?("github_runner") })).any?
    tests_to_wait << Prog::Test::GithubRunner.assemble(gh_test_cases)
  end
if options[:test_cases].include?("postgres_standard")
tests_to_wait << Prog::Test::PostgresResource.assemble
end
if options[:test_cases].include?("postgres_ha")
tests_to_wait << Prog::Test::HaPostgresResource.assemble
end
# Although wait_until will be blocked while checking the first one
# it won't affect the total time as other strands will continue in parallel.
# No need to make it parallel.
tests_to_wait.each { |st| wait_until(st) }
Semaphore.incr(hetzner_server_st.id, "destroy")
wait_until(hetzner_server_st)
end
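
# Polls the strand every 10 seconds until it reaches the given label (or exits,
# when no label is given), logging progress along the way. If the strand lands
# in the "failed" label, logs the failure message and aborts the run.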
def wait_until(st, label = nil)
  while (loaded_st = Strand[st.id]) && loaded_st.label != label
    if loaded_st.label == "failed"
      log(st.reload, "FAILED: #{loaded_st.exitval.fetch("msg")}")
      st.destroy
      exit 1
    end
    log(st.reload, "waiting for #{label || "exit"}")
    sleep 10
  end
  log(st, "reached")
end
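
# Emits a single progress line: elapsed minutes, strand id, the strand's current
# prog and label, the message, and the labels of the resources under test.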
def log(st, msg)
  resources = case st.prog
  when "Test::HetznerServer"
    "VmHost.#{Strand[st.stack.first["vm_host_id"]]&.label}"
  when "Test::VmGroup"
    st.stack.first["vms"].map { "Vm.#{Strand[_1]&.label}" }.join(", ")
  when "Test::Vm"
    "Vm.#{Strand[st.stack.first["subject_id"]]&.label}"
  else
    "#{st.prog}.#{st.label}"
  end

  $stdout.write "#{((Time.now - @started_at) / 60).round(2)}m | #{st.id} | #{st.prog}.#{st.label} | #{msg} | #{resources}\n"
end
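
# Parse command-line options, call clover_freeze, and run the suite.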
options = {test_cases: ["vm"]}

OptionParser.new do |opts|
  opts.on("--vm-host-id VM_HOST_ID", "Use existing vm host") { |v| options[:vm_host_id] = (v.length == 26) ? VmHost.from_ubid(v).id : v }
  opts.on("--test-cases TEST_CASES", Array, "List of test cases to run separated by comma") { |v| options[:test_cases] = v }
end.parse!

clover_freeze

$stdout.sync = true
main(options)