ubicloud/spec/prog/vm/github_runner_spec.rb
Burak Velioglu 1a4f9edb0c Publish unique cache proxy log line counts to mezmo
To be able to follow problematic operations on the cache proxy, publish
the count for each unique log line. Each log line shows either a
successful operation or an observed issue. Logging the counts together
with the line text in a map format will allow us to make analysis on
mezmo easier.
2025-08-08 18:08:35 +03:00
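
The counting itself is a one-liner with Ruby's Enumerable#tally. A minimal
sketch of the approach described above, assuming the proxy log is read as a
single newline-separated string (the sample lines mirror the specs below):

    log = "Received request - method: GET urlPath: foo\n" \
          "Received request - method: GET urlPath: foo\n" \
          "GetCacheEntry request failed with status code: 204\n"
    log.split("\n").tally
    # => {"Received request - method: GET urlPath: foo" => 2, "GetCacheEntry request failed with status code: 204" => 1}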


# frozen_string_literal: true

require_relative "../../model/spec_helper"
require "netaddr"
require "octokit"

RSpec.describe Prog::Vm::GithubRunner do
  subject(:nx) {
    described_class.new(Strand.new).tap {
      it.instance_variable_set(:@github_runner, runner)
    }
  }

  let(:runner) do
    customer_project = Project.create(name: "customer")
    runner_project = Project.create(name: "runner-service")
    installation_id = GithubInstallation.create(installation_id: 123, project_id: customer_project.id, name: "ubicloud", type: "Organization", created_at: now - 8 * 24 * 60 * 60).id
    vm_id = create_vm(location_id: Location::GITHUB_RUNNERS_ID, project_id: runner_project.id, boot_image: "github-ubuntu-2204").id
    Sshable.create_with_id(vm_id)
    GithubRunner.create(installation_id:, vm_id:, repository_name: "test-repo", label: "ubicloud-standard-4", created_at: now, allocated_at: now + 10, ready_at: now + 20, workflow_job: {"id" => 123})
  end

  let(:vm) { runner.vm }
  let(:installation) { runner.installation }
  let(:project) { installation.project }
  let(:client) { instance_double(Octokit::Client) }
  let(:now) { Time.utc(2025, 5, 19, 19, 0) }

  before do
    allow(Config).to receive(:github_runner_service_project_id).and_return(vm.project_id)
    allow(Github).to receive(:installation_client).and_return(client)
    allow(Time).to receive(:now).and_return(now)
  end
describe ".assemble" do
it "creates github runner and vm with sshable" do
runner = described_class.assemble(installation, repository_name: "test-repo", label: "ubicloud").subject
expect(runner).not_to be_nil
expect(runner.repository_name).to eq("test-repo")
expect(runner.label).to eq("ubicloud")
end
it "creates github runner with custom size" do
runner = described_class.assemble(installation, repository_name: "test-repo", label: "ubicloud-standard-8").subject
expect(runner).not_to be_nil
expect(runner.repository_name).to eq("test-repo")
expect(runner.label).to eq("ubicloud-standard-8")
end
it "fails if label is not valid" do
expect {
described_class.assemble(installation, repository_name: "test-repo", label: "ubicloud-standard-1")
}.to raise_error RuntimeError, "Invalid GitHub runner label: ubicloud-standard-1"
end
end
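
  # A minimal sketch of the validation the "fails if label is not valid"
  # example implies: assembly accepts only labels from a known catalog, so
  # "ubicloud-standard-1" (there is no 1-vCPU size) is rejected. The catalog
  # below is illustrative, not the real list the prog consults.
  def validate_label_sketch(label, catalog: ["ubicloud", "ubicloud-standard-4", "ubicloud-standard-8"])
    fail "Invalid GitHub runner label: #{label}" unless catalog.include?(label)
    label
  end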
describe ".pick_vm" do
it "provisions a VM if the pool is not existing" do
vm = nx.pick_vm
expect(vm.pool_id).to be_nil
expect(vm.sshable.unix_user).to eq("runneradmin")
expect(vm.unix_user).to eq("runneradmin")
expect(vm.family).to eq("standard")
expect(vm.vcpus).to eq(4)
expect(vm.project_id).to eq(Config.github_runner_service_project_id)
end
it "provisions a new vm if pool is valid but there is no vm" do
VmPool.create(size: 2, vm_size: "standard-4", boot_image: "github-ubuntu-2204", location_id: Location::GITHUB_RUNNERS_ID, storage_size_gib: 150, arch: "x64")
vm = nx.pick_vm
expect(vm.pool_id).to be_nil
expect(vm.sshable.unix_user).to eq("runneradmin")
expect(vm.family).to eq("standard")
expect(vm.vcpus).to eq(4)
end
it "uses the existing vm if pool can pick one" do
pool = VmPool.create(size: 2, vm_size: "standard-4", boot_image: "github-ubuntu-2204", location_id: Location::GITHUB_RUNNERS_ID, storage_size_gib: 150, arch: "x64", storage_skip_sync: true)
vm = create_vm(pool_id: pool.id, display_state: "running")
picked_vm = nx.pick_vm
expect(vm.id).to eq(picked_vm.id)
end
it "uses the premium vm pool if the installation prefers premium runners" do
pool = VmPool.create(size: 2, vm_size: "premium-4", boot_image: "github-ubuntu-2204", location_id: Location::GITHUB_RUNNERS_ID, storage_size_gib: 150, arch: "x64", storage_skip_sync: true)
vm = create_vm(pool_id: pool.id, display_state: "running", family: "premium")
expect(installation).to receive(:premium_runner_enabled?).and_return(true)
picked_vm = nx.pick_vm
expect(vm.id).to eq(picked_vm.id)
expect(picked_vm.family).to eq("premium")
end
it "uses the premium vm pool if a free premium upgrade is enabled" do
pool = VmPool.create(size: 2, vm_size: "premium-4", boot_image: "github-ubuntu-2204", location_id: Location::GITHUB_RUNNERS_ID, storage_size_gib: 150, arch: "x64", storage_skip_sync: true)
vm = create_vm(pool_id: pool.id, display_state: "running", family: "premium")
expect(installation).to receive(:premium_runner_enabled?).and_return(false)
expect(installation).to receive(:free_runner_upgrade?).and_return(true)
picked_vm = nx.pick_vm
expect(vm.id).to eq(picked_vm.id)
expect(picked_vm.family).to eq("premium")
end
end
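
  # The family preference the pool examples above encode, as a pure sketch:
  # premium pools are tried when the installation opts into premium runners or
  # a free upgrade window is active; otherwise only standard pools qualify.
  # The method name and signature are ours, not the prog's.
  def preferred_families_sketch(premium_enabled:, free_upgrade:)
    (premium_enabled || free_upgrade) ? ["premium", "standard"] : ["standard"]
  end
  # preferred_families_sketch(premium_enabled: false, free_upgrade: true)
  # # => ["premium", "standard"]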
describe ".update_billing_record" do
it "not updates billing record if the runner is destroyed before it's ready" do
runner.update(ready_at: nil)
expect(nx.update_billing_record).to be_nil
expect(BillingRecord.count).to eq(0)
end
it "not updates billing record if the runner does not pick a job" do
runner.update(ready_at: now, workflow_job: nil)
expect(nx.update_billing_record).to be_nil
expect(BillingRecord.count).to eq(0)
end
it "creates new billing record when no daily record" do
runner.update(ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_call_original
nx.update_billing_record
br = BillingRecord[resource_id: project.id]
expect(br.amount).to eq(5)
expect(br.duration(now, now)).to eq(1)
end
it "uses separate billing rate for arm64 runners" do
runner.update(label: "ubicloud-arm", ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_call_original
nx.update_billing_record
br = BillingRecord[resource_id: project.id]
expect(br.amount).to eq(5)
expect(br.duration(now, now)).to eq(1)
expect(br.billing_rate["resource_family"]).to eq("standard-2-arm")
expect(runner.billed_vm_size).to eq("standard-2-arm")
end
it "uses separate billing rate for gpu runners" do
vm.update(family: "standard-gpu", vcpus: 6)
runner.update(label: "ubicloud-gpu", ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_call_original
nx.update_billing_record
br = BillingRecord[resource_id: project.id]
expect(br.amount).to eq(5)
expect(br.duration(now, now)).to eq(1)
expect(br.billing_rate["resource_family"]).to eq("standard-gpu-6")
expect(runner.billed_vm_size).to eq("standard-gpu-6")
end
it "uses the premium billing rate for upgraded runners" do
vm.update(family: "premium")
runner.update(label: "ubicloud-standard-2", ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_call_original
nx.update_billing_record
br = BillingRecord[resource_id: project.id]
expect(br.amount).to eq(5)
expect(br.duration(now, now)).to eq(1)
expect(br.billing_rate["resource_family"]).to eq("premium-2")
expect(runner.billed_vm_size).to eq("premium-2")
end
it "uses the original billing rate for runners who were upgraded for free based on runner creation time" do
vm.update(family: "premium")
runner.update(label: "ubicloud-standard-2", ready_at: now - 5 * 60, created_at: now - 100)
expect(installation).to receive(:free_runner_upgrade_expires_at).and_return(now - 50)
expect(BillingRecord).to receive(:create).and_call_original
nx.update_billing_record
br = BillingRecord[resource_id: project.id]
expect(br.amount).to eq(5)
expect(br.duration(now, now)).to eq(1)
expect(br.billing_rate["resource_family"]).to eq("standard-2")
expect(runner.billed_vm_size).to eq("standard-2")
end
it "updates the amount of existing billing record" do
runner.update(ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_call_original
# Create a record
nx.update_billing_record
expect { nx.update_billing_record }
.to change { BillingRecord[resource_id: project.id].amount }.from(5).to(10)
end
it "create a new record for a new day" do
today = Time.now
tomorrow = today + 24 * 60 * 60
expect(Time).to receive(:now).and_return(today).exactly(6)
expect(runner).to receive(:ready_at).and_return(today - 5 * 60).twice
expect(BillingRecord).to receive(:create).and_call_original
# Create today record
nx.update_billing_record
expect(Time).to receive(:now).and_return(tomorrow).at_least(:once)
expect(runner).to receive(:ready_at).and_return(tomorrow - 5 * 60).at_least(:once)
expect(BillingRecord).to receive(:create).and_call_original
# Create tomorrow record
expect { nx.update_billing_record }
.to change { BillingRecord.where(resource_id: project.id).count }.from(1).to(2)
expect(BillingRecord.where(resource_id: project.id).map(&:amount)).to eq([5, 5])
end
it "tries 3 times and creates single billing record" do
runner.update(ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_raise(Sequel::Postgres::ExclusionConstraintViolation).exactly(3)
expect(BillingRecord).to receive(:create).and_call_original
expect {
3.times { nx.update_billing_record }
}.to change { BillingRecord.where(resource_id: project.id).count }.from(0).to(1)
end
it "tries 4 times and fails" do
runner.update(ready_at: now - 5 * 60)
expect(BillingRecord).to receive(:create).and_raise(Sequel::Postgres::ExclusionConstraintViolation).at_least(:once)
expect {
4.times { nx.update_billing_record }
}.to raise_error(Sequel::Postgres::ExclusionConstraintViolation)
end
end
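
  # One plausible shape of the retry behavior the last two examples pin down
  # (names and the retry bound are hypothetical): the exclusion-constraint
  # violation raised by an overlapping daily record is rescued and retried a
  # bounded number of times, and only a persistent failure propagates.
  def create_with_bounded_retry_sketch(max_retries: 3)
    attempts = 0
    begin
      yield
    rescue Sequel::Postgres::ExclusionConstraintViolation
      attempts += 1
      retry if attempts <= max_retries
      raise
    end
  end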
describe "#before_run" do
it "hops to destroy when needed" do
expect(nx).to receive(:when_destroy_set?).and_yield
expect(nx).to receive(:register_deadline)
expect(nx).to receive(:update_billing_record)
expect { nx.before_run }.to hop("destroy")
end
it "does not hop to destroy if already in the destroy state" do
expect(nx).to receive(:when_destroy_set?).and_yield
expect(nx.strand).to receive(:label).and_return("destroy")
expect { nx.before_run }.not_to hop("destroy")
end
it "does not hop to destroy if already in the wait_vm_destroy state" do
expect(nx).to receive(:when_destroy_set?).and_yield
expect(nx.strand).to receive(:label).and_return("wait_vm_destroy")
expect { nx.before_run }.not_to hop("destroy")
end
end
describe "#start" do
it "hops to wait_concurrency_limit if there is no capacity" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
expect(project).to receive(:active?).and_return(true)
expect { nx.start }.to hop("wait_concurrency_limit")
end
it "hops to allocate_vm if there is capacity" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(true)
expect(project).to receive(:active?).and_return(true)
expect { nx.start }.to hop("allocate_vm")
end
it "pops if the project is not active" do
expect(project).to receive(:active?).and_return(false)
expect { nx.start }.to exit({"msg" => "Could not provision a runner for inactive project"})
end
end
describe "#wait_concurrency_limit" do
before do
[
[Location::HETZNER_FSN1_ID, "x64", "standard"],
[Location::GITHUB_RUNNERS_ID, "x64", "standard"],
[Location::GITHUB_RUNNERS_ID, "x64", "premium"],
[Location::GITHUB_RUNNERS_ID, "arm64", "standard"]
].each do |location_id, arch, family|
create_vm_host(location_id:, arch:, family:, total_cores: 16, used_cores: 16)
end
end
it "hops to allocate_vm when customer concurrency limit frees up" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(true)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
context "when standard runner" do
it "waits if standard utilization is high" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
expect { nx.wait_concurrency_limit }.to nap
end
it "allocates if standard utilization is low" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost[arch: "x64", family: "standard"].update(used_cores: 8)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
end
context "when transparent premium runner" do
before { installation.update(allocator_preferences: {"family_filter" => ["premium", "standard"]}) }
it "waits if premium and standard utilizations are high" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
expect { nx.wait_concurrency_limit }.to nap
end
it "allocates if standard utilization is high but premium utilization is low" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost[arch: "x64", family: "premium"].update(used_cores: 8)
expect(runner).not_to receive(:incr_not_upgrade_premium)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
it "allocates without upgrade if premium utilization is high but standard utilization is low" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost[arch: "x64", family: "standard"].update(used_cores: 8)
expect(runner).to receive(:incr_not_upgrade_premium)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
it "allocates arm64 runners without checking premium utilization" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
runner.update(label: "ubicloud-standard-4-arm")
VmHost[arch: "arm64"].update(used_cores: 8)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
end
context "when explicit premium runner" do
before { runner.update(label: "ubicloud-premium-4") }
it "waits if premium and standard utilizations are high" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
expect { nx.wait_concurrency_limit }.to nap
end
it "allocates if premium utilization is low" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost[arch: "x64", family: "premium"].update(used_cores: 8)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
it "waits if premium utilization is high but standard utilization is low" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost[arch: "x64", family: "standard"].update(used_cores: 8)
expect { nx.wait_concurrency_limit }.to nap
end
end
context "when free premium runner" do
before { project.set_ff_free_runner_upgrade_until(Time.now + 100).reload }
it "waits if premium and standard utilizations are high" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
expect { nx.wait_concurrency_limit }.to nap
end
it "allocates if premium utilization is low than 50" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost.where(arch: "x64").update(used_cores: 4)
expect(runner).not_to receive(:incr_not_upgrade_premium)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
it "allocates without upgrade if premium utilization is higher than 50" do
expect(project).to receive(:quota_available?).with("GithubRunnerVCpu", 0).and_return(false)
VmHost.where(arch: "x64").update(used_cores: 10)
expect(runner).to receive(:incr_not_upgrade_premium)
expect { nx.wait_concurrency_limit }.to hop("allocate_vm")
end
end
end
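
  # A rough sketch of the utilization gate exercised above. The 50% threshold
  # and the per-family aggregation are inferred from these examples; the query
  # shape is hypothetical, not the prog's actual implementation.
  def family_utilization_sketch(family)
    hosts = VmHost.where(location_id: Location::GITHUB_RUNNERS_ID, arch: "x64", family:)
    total = hosts.sum(:total_cores).to_i
    total.zero? ? 100.0 : hosts.sum(:used_cores) * 100.0 / total
  end
  # With used_cores: 8 of total_cores: 16 this returns 50.0, the boundary the
  # free-upgrade examples probe from both sides (4/16 => 25.0, 10/16 => 62.5).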
describe "#allocate_vm" do
it "picks vm and hops" do
picked_vm = create_vm(name: "picked-vm")
expect(nx).to receive(:pick_vm).and_return(picked_vm)
expect(Clog).to receive(:emit).with("runner_allocated").and_call_original
expect { nx.allocate_vm }.to hop("wait_vm")
expect(runner.vm_id).to eq(picked_vm.id)
expect(runner.allocated_at).to eq(now)
expect(picked_vm.name).to eq(runner.ubid)
end
end
describe "#wait_vm" do
it "naps 11 seconds if vm is not allocated yet" do
vm.update(allocated_at: nil)
expect { nx.wait_vm }.to nap(11)
end
it "naps a second if vm is allocated but not provisioned yet" do
vm.update(allocated_at: now)
expect { nx.wait_vm }.to nap(1)
end
it "hops if vm is ready" do
vm.update(allocated_at: now, provisioned_at: now)
expect { nx.wait_vm }.to hop("setup_environment")
end
end
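
  # The nap cadence above at a glance (a sketch; the real label hops to
  # setup_environment rather than returning a value):
  def wait_vm_nap_sketch(vm)
    return 11 unless vm.allocated_at    # allocation is the slow step
    return 1 unless vm.provisioned_at   # provisioning completes shortly after
    nil                                 # ready: hop to setup_environment
  end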
describe ".setup_info" do
it "returns setup info with vm pool ubid" do
vm_host = create_vm_host(total_cores: 4, used_cores: 4, data_center: "FSN1-DC8")
pool = VmPool.create(size: 1, vm_size: "standard-2", location_id: Location::GITHUB_RUNNERS_ID, boot_image: "github-ubuntu-2204", storage_size_gib: 86)
vm.update(pool_id: pool.id, vm_host_id: vm_host.id)
expect(nx.setup_info[:detail]).to eq("Name: #{runner.ubid}\nLabel: ubicloud-standard-4\nVM Family: standard\nArch: x64\nImage: github-ubuntu-2204\nVM Host: #{vm_host.ubid}\nVM Pool: #{pool.ubid}\nLocation: hetzner-fsn1\nDatacenter: FSN1-DC8\nProject: #{project.ubid}\nConsole URL: http://localhost:9292/project/#{project.ubid}/github")
end
end
describe "#setup_environment" do
before do
vm.update(vm_host_id: create_vm_host(data_center: "FSN1-DC8").id)
end
it "hops to register_runner" do
expect(vm).to receive(:runtime_token).and_return("my_token")
installation.update(use_docker_mirror: false, cache_enabled: false)
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND)
set -ueo pipefail
echo "image version: $ImageVersion"
sudo usermod -a -G sudo,adm runneradmin
jq '. += [{"group":"Ubicloud Managed Runner","detail":"Name: #{runner.ubid}\\nLabel: ubicloud-standard-4\\nVM Family: standard\\nArch: x64\\nImage: github-ubuntu-2204\\nVM Host: #{vm.vm_host.ubid}\\nVM Pool: \\nLocation: hetzner-fsn1\\nDatacenter: FSN1-DC8\\nProject: #{project.ubid}\\nConsole URL: http://localhost:9292/project/#{project.ubid}/github"}]' /imagegeneration/imagedata.json | sudo -u runner tee /home/runner/actions-runner/.setup_info > /dev/null
echo "UBICLOUD_RUNTIME_TOKEN=my_token
UBICLOUD_CACHE_URL=http://localhost:9292/runtime/github/" | sudo tee -a /etc/environment > /dev/null
if [ ! -f /etc/systemd/system/runner-script.service ]; then
sudo tee /etc/systemd/system/runner-script.service > /dev/null <<'EOT'
[Unit]
Description=runner-script
[Service]
RemainAfterExit=yes
User=runner
Group=runner
WorkingDirectory=/home/runner
ExecStart=/home/runner/actions-runner/run-withenv.sh
EOT
sudo -u runner tee /home/runner/actions-runner/run-withenv.sh > /dev/null <<'EOT'
#!/bin/bash
mapfile -t env </etc/environment
JIT_CONFIG="$(cat ./actions-runner/.jit_token)"
exec env -- "${env[@]}" ./actions-runner/run.sh --jitconfig "$JIT_CONFIG"
EOT
sudo systemctl daemon-reload
fi
COMMAND
expect { nx.setup_environment }.to hop("register_runner")
end
it "hops to register_runner with after enabling transparent cache" do
expect(vm).to receive(:runtime_token).and_return("my_token")
installation.update(use_docker_mirror: false, cache_enabled: true)
expect(vm).to receive(:nics).and_return([instance_double(Nic, private_ipv4: NetAddr::IPv4Net.parse("10.0.0.1/32"))]).at_least(:once)
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND)
set -ueo pipefail
echo "image version: $ImageVersion"
sudo usermod -a -G sudo,adm runneradmin
jq '. += [{"group":"Ubicloud Managed Runner","detail":"Name: #{runner.ubid}\\nLabel: ubicloud-standard-4\\nVM Family: standard\\nArch: x64\\nImage: github-ubuntu-2204\\nVM Host: #{vm.vm_host.ubid}\\nVM Pool: \\nLocation: hetzner-fsn1\\nDatacenter: FSN1-DC8\\nProject: #{project.ubid}\\nConsole URL: http://localhost:9292/project/#{project.ubid}/github"}]' /imagegeneration/imagedata.json | sudo -u runner tee /home/runner/actions-runner/.setup_info > /dev/null
echo "UBICLOUD_RUNTIME_TOKEN=my_token
UBICLOUD_CACHE_URL=http://localhost:9292/runtime/github/" | sudo tee -a /etc/environment > /dev/null
echo "CUSTOM_ACTIONS_CACHE_URL=http://10.0.0.1:51123/random_token/" | sudo tee -a /etc/environment > /dev/null
if [ ! -f /etc/systemd/system/runner-script.service ]; then
sudo tee /etc/systemd/system/runner-script.service > /dev/null <<'EOT'
[Unit]
Description=runner-script
[Service]
RemainAfterExit=yes
User=runner
Group=runner
WorkingDirectory=/home/runner
ExecStart=/home/runner/actions-runner/run-withenv.sh
EOT
sudo -u runner tee /home/runner/actions-runner/run-withenv.sh > /dev/null <<'EOT'
#!/bin/bash
mapfile -t env </etc/environment
JIT_CONFIG="$(cat ./actions-runner/.jit_token)"
exec env -- "${env[@]}" ./actions-runner/run.sh --jitconfig "$JIT_CONFIG"
EOT
sudo systemctl daemon-reload
fi
COMMAND
expect { nx.setup_environment }.to hop("register_runner")
end
end
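
  # Why the long expected commands above can be written inline: `<<~` strips
  # the smallest common indentation from the heredoc body, so the string passed
  # to `with` is identical however deeply the spec itself is nested. A quick
  # self-contained illustration (the constant is ours, not the prog's):
  SQUIGGLY_HEREDOC_EXAMPLE = <<~CMD
    set -ueo pipefail
    echo "image version: $ImageVersion"
  CMD
  # SQUIGGLY_HEREDOC_EXAMPLE == %(set -ueo pipefail\necho "image version: $ImageVersion"\n) # => true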
describe "#register_runner" do
it "registers runner hops" do
expect(client).to receive(:post).with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label])).and_return({runner: {id: 123}, encoded_jit_config: "AABBCC$"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, stdin: "AABBCC$")
sudo -u runner tee /home/runner/actions-runner/.jit_token > /dev/null
sudo systemctl start runner-script.service
COMMAND
expect { nx.register_runner }.to hop("wait")
expect(runner.runner_id).to eq(123)
expect(runner.ready_at).to eq(now)
end
it "deletes the runner if the generate request fails due to 'already exists with the same name' error and the runner script does not start yet." do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
expect(client).to receive(:paginate)
.and_yield({runners: [{name: runner.ubid.to_s, id: 123}]}, instance_double(Sawyer::Response, data: {runners: []}))
.and_return({runners: [{name: runner.ubid.to_s, id: 123}]})
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("dead")
expect(client).to receive(:delete).with("/repos/#{runner.repository_name}/actions/runners/123")
expect(Clog).to receive(:emit).with("Deregistering runner because it already exists").and_call_original
expect { nx.register_runner }.to nap(5)
end
it "hops to wait if the generate request fails due to 'already exists with the same name' error and the runner script is running" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
expect(client).to receive(:paginate)
.and_yield({runners: [{name: runner.ubid.to_s, id: 123}]}, instance_double(Sawyer::Response, data: {runners: []}))
.and_return({runners: [{name: runner.ubid.to_s, id: 123}]})
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect { nx.register_runner }.to hop("wait")
expect(runner.runner_id).to eq(123)
expect(runner.ready_at).to eq(now)
end
it "fails if the generate request fails due to 'already exists with the same name' error but couldn't find the runner" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Already exists - A runner with the name *** already exists."}))
expect(client).to receive(:paginate).and_return({runners: []})
expect(client).not_to receive(:delete)
expect { nx.register_runner }.to raise_error RuntimeError, "BUG: Failed with runner already exists error but couldn't find it"
end
it "fails if the generate request fails due to 'Octokit::Conflict' but it's not already exists error" do
expect(client).to receive(:post)
.with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label]))
.and_raise(Octokit::Conflict.new({body: "409 - Another issue"}))
expect { nx.register_runner }.to raise_error Octokit::Conflict
end
it "logs if fails due to runner script failure" do
expect(client).to receive(:post).with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label])).and_return({runner: {id: 123}, encoded_jit_config: "AABBCC$"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, stdin: "AABBCC$").and_raise Sshable::SshError.new("command", "", "Job for runner-script.service failed.\n Check logs", 123, nil)
sudo -u runner tee /home/runner/actions-runner/.jit_token > /dev/null
sudo systemctl start runner-script.service
COMMAND
expect(Clog).to receive(:emit).with("Failed to start runner script").and_call_original
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND)
sudo journalctl -xeu runner-script.service
cat /run/systemd/transient/runner-script.service || true
COMMAND
expect { nx.register_runner }.to raise_error Sshable::SshError
end
it "fails without a log if the ssh error doesn't match" do
expect(client).to receive(:post).with(/.*generate-jitconfig/, hash_including(name: runner.ubid.to_s, labels: [runner.label])).and_return({runner: {id: 123}, encoded_jit_config: "AABBCC$"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, stdin: "AABBCC$").and_raise Sshable::SshError.new("command", "", "unknown command", 123, nil)
sudo -u runner tee /home/runner/actions-runner/.jit_token > /dev/null
sudo systemctl start runner-script.service
COMMAND
expect(Clog).not_to receive(:emit).with("Failed to start runner script").and_call_original
expect { nx.register_runner }.to raise_error Sshable::SshError
end
end
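
  # The conflict handling above, condensed into a sketch (method name and
  # return symbols are hypothetical): only a conflict whose message mentions
  # "already exists" triggers the lookup-by-name path; any other
  # Octokit::Conflict is re-raised, and the found runner is adopted or
  # deregistered depending on the script's SubState.
  def conflict_action_sketch(error, script_substate)
    raise error unless error.message.match?(/already exists/i)
    (script_substate == "running") ? :adopt_and_hop_to_wait : :deregister_and_nap
  end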
describe "#wait" do
it "does not destroy runner if it does not pick a job in five minutes, and busy" do
runner.update(ready_at: now - 6 * 60, workflow_job: nil)
expect(client).to receive(:get).and_return({busy: true})
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect(nx).not_to receive(:register_deadline).with(nil, 7200)
expect(runner).not_to receive(:incr_destroy)
expect { nx.wait }.to nap(60)
end
it "destroys runner if it does not pick a job in five minutes and not busy" do
runner.update(ready_at: now - 6 * 60, workflow_job: nil)
expect(client).to receive(:get).and_return({busy: false})
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect(runner).to receive(:incr_destroy)
expect(nx).to receive(:register_deadline).twice
expect(Clog).to receive(:emit).with("The runner did not pick a job").and_call_original
expect { nx.wait }.to nap(0)
end
it "destroys runner if it does not pick a job in five minutes and already deleted" do
runner.update(ready_at: now - 6 * 60, workflow_job: nil)
expect(client).to receive(:get).and_raise(Octokit::NotFound)
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect(runner).to receive(:incr_destroy)
expect(nx).to receive(:register_deadline).twice
expect(Clog).to receive(:emit).with("The runner did not pick a job").and_call_original
expect { nx.wait }.to nap(0)
end
it "does not destroy runner if it doesn not pick a job but two minutes not pass yet" do
runner.update(ready_at: now - 60, workflow_job: nil)
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect(runner).not_to receive(:incr_destroy)
expect { nx.wait }.to nap(60)
end
it "destroys the runner if the runner-script is succeeded" do
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("exited")
expect(runner).to receive(:incr_destroy)
expect { nx.wait }.to nap(15)
end
it "provisions a spare runner and destroys the current one if the runner-script is failed" do
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("failed")
expect(runner).to receive(:provision_spare_runner)
expect(runner).to receive(:incr_destroy)
expect { nx.wait }.to nap(0)
end
it "naps if the runner-script is running" do
expect(vm.sshable).to receive(:cmd).with("systemctl show -p SubState --value runner-script").and_return("running")
expect { nx.wait }.to nap(60)
end
end
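
  # The SubState dispatch these examples fix, as a pure sketch (the symbols
  # are illustrative): "exited" means the script finished cleanly, "failed"
  # means provision a spare before destroying, and a still-"running" script is
  # only torn down once the runner has sat idle without a job for too long.
  def wait_action_sketch(substate, job_picked:, idle_seconds:)
    case substate
    when "exited" then :destroy
    when "failed" then :provision_spare_and_destroy
    else
      (!job_picked && idle_seconds > 5 * 60) ? :destroy_unless_busy : :nap_60
    end
  end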
describe ".collect_final_telemetry" do
before do
vm.update(vm_host_id: create_vm_host(data_center: "FSN1-DC8").id)
end
it "Logs journalctl, docker limits, and cache proxy log if workflow_job is not successful" do
runner.update(workflow_job: {"conclusion" => "failure"})
expect(vm.vm_host.sshable).to receive(:cmd).with("sudo ln /vm/#{vm.inhost_name}/serial.log /var/log/ubicloud/serials/#{runner.ubid}_serial.log")
expect(vm.sshable).to receive(:cmd).with("journalctl -u runner-script -t 'run-withenv.sh' -t 'systemd' --no-pager | grep -Fv Started")
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, log: false)
TOKEN=$(curl -m 10 -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:ratelimitpreview/test:pull" | jq -r .token)
curl -m 10 -s --head -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/ratelimitpreview/test/manifests/latest | grep ratelimit
COMMAND
expect(vm.sshable).to receive(:cmd).with("sudo cat /var/log/cacheproxy.log", log: false).and_return("Received request - method: GET urlPath: foo\nReceived request - method: GET urlPath: foo\nGetCacheEntry request failed with status code: 204\n")
expect(Clog).to receive(:emit).with("Cache proxy log line counts") do |&blk|
expect(blk.call).to eq(cache_proxy_log_line_counts: {"Received request - method: GET urlPath: foo" => 2, "GetCacheEntry request failed with status code: 204" => 1})
end
nx.collect_final_telemetry
end
it "Logs journalctl, docker limits, and cache proxy log if workflow_job is nil" do
runner.update(workflow_job: nil)
expect(vm.vm_host.sshable).to receive(:cmd).with("sudo ln /vm/#{vm.inhost_name}/serial.log /var/log/ubicloud/serials/#{runner.ubid}_serial.log")
expect(vm.sshable).to receive(:cmd).with("journalctl -u runner-script -t 'run-withenv.sh' -t 'systemd' --no-pager | grep -Fv Started")
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, log: false)
TOKEN=$(curl -m 10 -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:ratelimitpreview/test:pull" | jq -r .token)
curl -m 10 -s --head -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/ratelimitpreview/test/manifests/latest | grep ratelimit
COMMAND
expect(vm.sshable).to receive(:cmd).with("sudo cat /var/log/cacheproxy.log", log: false).and_return("Received request - method: GET urlPath: foo\nReceived request - method: GET urlPath: foo\nGetCacheEntry request failed with status code: 204\n")
expect(Clog).to receive(:emit).with("Cache proxy log line counts") do |&blk|
expect(blk.call).to eq(cache_proxy_log_line_counts: {"Received request - method: GET urlPath: foo" => 2, "GetCacheEntry request failed with status code: 204" => 1})
end
nx.collect_final_telemetry
end
it "Logs docker limits and cache proxy log if workflow_job is successful" do
runner.update(workflow_job: {"conclusion" => "success"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, log: false).and_return("ratelimit-limit: 100;w=21600\nratelimit-remaining: 98;w=21600\ndocker-ratelimit-source: 192.168.1.1\n")
TOKEN=$(curl -m 10 -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:ratelimitpreview/test:pull" | jq -r .token)
curl -m 10 -s --head -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/ratelimitpreview/test/manifests/latest | grep ratelimit
COMMAND
expect(Clog).to receive(:emit).with("Remaining DockerHub rate limits") do |&blk|
expect(blk.call).to eq(dockerhub_rate_limits: {limit: 100, limit_window: 21600, remaining: 98, remaining_window: 21600, source: "192.168.1.1"})
end
expect(vm.sshable).to receive(:cmd).with("sudo cat /var/log/cacheproxy.log", log: false).and_return("Received request - method: GET urlPath: foo\nReceived request - method: GET urlPath: foo\nGetCacheEntry request failed with status code: 204\n")
expect(Clog).to receive(:emit).with("Cache proxy log line counts") do |&blk|
expect(blk.call).to eq(cache_proxy_log_line_counts: {"Received request - method: GET urlPath: foo" => 2, "GetCacheEntry request failed with status code: 204" => 1})
end
nx.collect_final_telemetry
end
it "Logs docker limits and empty cache proxy log if workflow_job is successful" do
runner.update(workflow_job: {"conclusion" => "success"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, log: false).and_return("ratelimit-limit: 100;w=21600\nratelimit-remaining: 98;w=21600\ndocker-ratelimit-source: 192.168.1.1\n")
TOKEN=$(curl -m 10 -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:ratelimitpreview/test:pull" | jq -r .token)
curl -m 10 -s --head -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/ratelimitpreview/test/manifests/latest | grep ratelimit
COMMAND
expect(Clog).to receive(:emit).with("Remaining DockerHub rate limits") do |&blk|
expect(blk.call).to eq(dockerhub_rate_limits: {limit: 100, limit_window: 21600, remaining: 98, remaining_window: 21600, source: "192.168.1.1"})
end
expect(vm.sshable).to receive(:cmd).with("sudo cat /var/log/cacheproxy.log", log: false).and_return("")
expect(Clog).to receive(:emit).with("Cache proxy log line counts") do |&blk|
expect(blk.call).to eq(cache_proxy_log_line_counts: {})
end
nx.collect_final_telemetry
end
it "Logs docker limits and nil cache proxy log if workflow_job is successful" do
runner.update(workflow_job: {"conclusion" => "success"})
expect(vm.sshable).to receive(:cmd).with(<<~COMMAND, log: false).and_return("ratelimit-limit: 100;w=21600\nratelimit-remaining: 98;w=21600\ndocker-ratelimit-source: 192.168.1.1\n")
TOKEN=$(curl -m 10 -s "https://auth.docker.io/token?service=registry.docker.io&scope=repository:ratelimitpreview/test:pull" | jq -r .token)
curl -m 10 -s --head -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/ratelimitpreview/test/manifests/latest | grep ratelimit
COMMAND
expect(Clog).to receive(:emit).with("Remaining DockerHub rate limits") do |&blk|
expect(blk.call).to eq(dockerhub_rate_limits: {limit: 100, limit_window: 21600, remaining: 98, remaining_window: 21600, source: "192.168.1.1"})
end
expect(vm.sshable).to receive(:cmd).with("sudo cat /var/log/cacheproxy.log", log: false).and_return(nil)
nx.collect_final_telemetry
end
it "doesn't fail if it failed due to Sshable::SshError" do
runner.update(workflow_job: {"conclusion" => "success"})
expect(vm.sshable).to receive(:cmd).and_raise Sshable::SshError.new("bogus", "", "", nil, nil)
expect(Clog).to receive(:emit).with("Failed to collect final telemetry").and_call_original
nx.collect_final_telemetry
end
it "doesn't fail if it failed due to Net::SSH::ConnectionTimeout" do
runner.update(workflow_job: {"conclusion" => "success"})
expect(vm.sshable).to receive(:cmd).and_raise Net::SSH::ConnectionTimeout
expect(Clog).to receive(:emit).with("Failed to collect final telemetry").and_call_original
nx.collect_final_telemetry
end
end
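
  # Turning the `grep ratelimit` output above into the emitted structure — a
  # small sketch (our helper, not the prog's) that works on the sample
  # asserted in these examples:
  def parse_dockerhub_limits_sketch(output)
    limit = output.match(/^ratelimit-limit: (\d+);w=(\d+)/)
    remaining = output.match(/^ratelimit-remaining: (\d+);w=(\d+)/)
    {limit: limit[1].to_i, limit_window: limit[2].to_i,
     remaining: remaining[1].to_i, remaining_window: remaining[2].to_i,
     source: output[/^docker-ratelimit-source: (\S+)/, 1]}
  end
  # parse_dockerhub_limits_sketch("ratelimit-limit: 100;w=21600\nratelimit-remaining: 98;w=21600\ndocker-ratelimit-source: 192.168.1.1\n")
  # # => {limit: 100, limit_window: 21600, remaining: 98, remaining_window: 21600, source: "192.168.1.1"}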
describe "#destroy" do
it "naps if runner not deregistered yet" do
expect(client).to receive(:get).and_return(busy: false)
expect(client).to receive(:delete)
expect { nx.destroy }.to nap(5)
end
it "naps if runner still running a job" do
expect(client).to receive(:get).and_return(busy: true)
expect { nx.destroy }.to nap(15)
end
it "destroys resources and hops if runner deregistered" do
vm.update(vm_host_id: create_vm_host.id)
expect(nx).to receive(:decr_destroy)
expect(client).to receive(:get).and_raise(Octokit::NotFound)
expect(client).not_to receive(:delete)
expect(nx).to receive(:collect_final_telemetry)
fw = instance_double(Firewall)
ps = instance_double(PrivateSubnet, firewalls: [fw])
expect(fw).to receive(:destroy)
expect(ps).to receive(:incr_destroy)
expect(vm).to receive(:private_subnets).and_return([ps])
expect(vm).to receive(:incr_destroy)
expect { nx.destroy }.to hop("wait_vm_destroy")
end
it "skip deregistration and destroy vm immediately" do
vm.update(vm_host_id: create_vm_host.id)
expect(nx).to receive(:decr_destroy)
expect(runner).to receive(:skip_deregistration_set?).and_return(true)
expect(nx).to receive(:collect_final_telemetry)
expect(vm).to receive(:incr_destroy)
expect { nx.destroy }.to hop("wait_vm_destroy")
end
it "does not collect telemetry if the vm not allocated" do
vm.update(vm_host_id: nil)
expect(nx).to receive(:decr_destroy)
expect(client).to receive(:get).and_raise(Octokit::NotFound)
expect(nx).not_to receive(:collect_final_telemetry)
expect(vm).to receive(:incr_destroy)
expect { nx.destroy }.to hop("wait_vm_destroy")
end
it "does not destroy vm if it's already destroyed" do
runner.update(vm_id: nil)
expect(nx).to receive(:vm).and_return(nil).at_least(:once)
expect(nx).to receive(:decr_destroy)
expect(client).to receive(:get).and_raise(Octokit::NotFound)
expect(client).not_to receive(:delete)
expect { nx.destroy }.to hop("wait_vm_destroy")
end
end
describe "#wait_vm_destroy" do
it "naps if vm not destroyed yet" do
expect { nx.wait_vm_destroy }.to nap(10)
end
it "extends deadline if vm prevents destroy" do
expect(runner.vm).to receive(:prevent_destroy_set?).and_return(true)
expect(nx).to receive(:register_deadline).with(nil, 15 * 60, allow_extension: true)
expect { nx.wait_vm_destroy }.to nap(10)
end
it "pops if vm destroyed" do
expect(nx).to receive(:vm).and_return(nil).twice
expect(runner).to receive(:destroy)
expect { nx.wait_vm_destroy }.to exit({"msg" => "github runner deleted"})
end
end
end