Previously, we passed the JIT config to the runner execution script via command-line arguments using xargs. This included the JIT token in the transient systemd unit file generated by systemd-run, which sometimes failed with "Failed to resolve unit specifiers" errors. These issues are hard to reproduce locally, but may be caused by template expansion limits or token size. We take direct control over the unit file instead of relying on systemd-run’s transient unit generation, to better understand and debug any related issues. We now pass the JIT config via a file. This is more reliable for large strings and avoids leaking sensitive tokens into the unit description. This change is currently behind a feature flag. Once verified in production, we’ll move the file creation to the image generation step.
181 lines
5.6 KiB
Ruby
181 lines
5.6 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "octokit"
|
|
require "yaml"
|
|
|
|
# End-to-end test prog for GitHub runners.
#
# The strand provisions a one-VM pool, dispatches every workflow listed in the
# test cases, polls the resulting workflow runs until they conclude, and then
# tears down everything it created. Any failure is recorded under the
# "fail_message" key of the stack and reported once cleanup has finished.
class Prog::Test::GithubRunner < Prog::Test::Base
  # Workflow run conclusions that make the test fail.
  FAIL_CONCLUSIONS = ["action_required", "cancelled", "failure", "skipped", "stale", "timed_out"].freeze
  # Conclusions (in addition to nil) for which check_test_runs keeps polling.
  IN_PROGRESS_CONCLUSIONS = ["in_progress", "queued", "requested", "waiting", "pending", "neutral"].freeze

  # Creates the projects, the GitHub installation and the strand that drives
  # the test.
  #
  # test_cases is stored in the stack as-is; each entry is later read through
  # its "details" hash ("repo_name", "workflow_name", "branch_name").
  # Returns the result of Strand.create_with_id.
  def self.assemble(test_cases)
    github_service_project = Project.create(name: "Github-Runner-Service-Project") { it.id = Config.github_runner_service_project_id }
    vm_pool_service_project = Project.create(name: "Vm-Pool-Service-Project") { it.id = Config.vm_pool_project_id }
    github_test_project = Project.create(name: "Github-Runner-Test-Project")
    # Exercise the runner_jit_file feature flag in the end-to-end test.
    github_test_project.set_ff_runner_jit_file(true)

    GithubInstallation.create(
      installation_id: Config.e2e_github_installation_id,
      name: "TestUser",
      type: "User",
      project_id: github_test_project.id,
      # Backdated by 8 days.
      # NOTE(review): presumably so the installation is not treated as a
      # freshly created one -- confirm against the runner code.
      created_at: Time.now - 8 * 24 * 60 * 60
    )

    Strand.create_with_id(
      prog: "Test::GithubRunner",
      label: "start",
      stack: [{
        "created_at" => Time.now.utc,
        "test_cases" => test_cases,
        "github_service_project_id" => github_service_project.id,
        "vm_pool_service_project" => vm_pool_service_project.id,
        "github_test_project_id" => github_test_project.id
      }]
    )
  end

  # Entry label; immediately moves on to pool creation.
  label def start
    hop_create_vm_pool
  end

  # Assembles a VM pool of size 1 that matches the "ubicloud" runner label and
  # remembers its id in the stack under "vm_pool_id".
  label def create_vm_pool
    label_data = Github.runner_labels["ubicloud"]
    pool = Prog::Vm::VmPool.assemble(
      size: 1,
      vm_size: label_data["vm_size"],
      boot_image: label_data["boot_image"],
      location_id: Location::GITHUB_RUNNERS_ID,
      storage_size_gib: label_data["storage_size_gib"],
      arch: label_data["arch"],
      storage_encrypted: true,
      storage_skip_sync: true
    ).subject
    update_stack({"vm_pool_id" => pool.id})

    hop_wait_vm_pool_to_be_ready
  end

  # Naps until the number of provisioned VMs in the pool equals the pool size.
  label def wait_vm_pool_to_be_ready
    pool = VmPool[frame["vm_pool_id"]]
    nap 10 unless pool.size == pool.vms_dataset.exclude(provisioned_at: nil).count

    # No need to provision a new VM to the pool when the first one is picked.
    # This simplifies the process of verifying at the end of the test that VMs
    # were correctly picked from the pool.
    pool.update(size: 0)
    hop_trigger_test_runs
  end

  # Dispatches every test workflow. A dispatch that returns a falsy value
  # records a failure message and jumps straight to cleanup.
  label def trigger_test_runs
    test_runs.each do |test_run|
      unless trigger_test_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])
        update_stack({"fail_message" => "Can not trigger workflow for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]}"})
        hop_clean_resources
      end
    end

    # To make sure that test runs are triggered
    # We will still check the runs in the next step in
    # case an incident happens on the github side
    # NOTE(review): a blocking sleep is presumably used instead of nap because
    # re-entering this label would dispatch the workflows again -- confirm.
    sleep 30

    hop_check_test_runs
  end

  # Polls the latest run of every test workflow.
  #
  # Naps while any run is still in progress. Stops at the first run that is
  # missing, older than this test, or concluded with a failure, recording the
  # reason in the stack. Always ends by hopping to cleanup.
  label def check_test_runs
    test_runs.each do |test_run|
      run = latest_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])

      # In case the run can not be triggered in the previous state: either the
      # workflow has no run at all, or its newest run predates this test.
      if run.nil? || run[:created_at] < Time.parse(frame["created_at"])
        update_stack({"fail_message" => "Can not trigger workflow for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]}"})
        break
      end

      conclusion = run[:conclusion]
      if FAIL_CONCLUSIONS.include?(conclusion)
        update_stack({"fail_message" => "Test run for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]} failed with conclusion #{conclusion}"})
        break
      elsif IN_PROGRESS_CONCLUSIONS.include?(conclusion) || conclusion.nil?
        nap 15
      end
    end

    hop_clean_resources
  end

  # Cancels outstanding runs and destroys the pool, repositories and projects
  # created for the test, napping between steps until each kind of resource is
  # gone. Finishes with fail_test when a failure message was recorded,
  # otherwise hops to finish.
  label def clean_resources
    cancel_test_runs

    if GithubRunner.any?
      Clog.emit("Waiting runners to finish their jobs")
      nap 15
    end

    if (pool = VmPool[frame["vm_pool_id"]])
      # The pool size was set to 0 after the first VM became ready, so a VM
      # still sitting in the pool means no runner picked it.
      unless pool.vms.count.zero?
        update_stack({"fail_message" => "The runner did not picked from the pool"})
      end
      pool.incr_destroy
    end
    GithubRepository.each(&:incr_destroy)

    if VmPool.any?
      Clog.emit("Waiting vm pools to be destroyed")
      nap 15
    end

    if GithubRepository.any?
      Clog.emit("Waiting repositories to be destroyed")
      nap 15
    end

    Project[frame["github_service_project_id"]]&.destroy
    Project[frame["vm_pool_service_project"]]&.destroy
    Project[frame["github_test_project_id"]]&.destroy

    frame["fail_message"] ? fail_test(frame["fail_message"]) : hop_finish
  end

  # Successful end of the test.
  label def finish
    pop "GithubRunner tests are finished!"
  end

  # Keeps napping in 15-second intervals.
  # NOTE(review): presumably reached through fail_test from Prog::Test::Base --
  # confirm.
  label def failed
    nap 15
  end

  # Dispatches workflow_name of repo_name on branch_name, passing the current
  # GITHUB_RUN_ID environment value as the "triggered_by" input. Returns the
  # client's result; the caller treats a falsy value as a failed dispatch.
  def trigger_test_run(repo_name, workflow_name, branch_name)
    client.workflow_dispatch(repo_name, workflow_name, branch_name, {inputs: {triggered_by: ENV["GITHUB_RUN_ID"]}})
  end

  # Returns the first entry of the workflow's run list for the given branch, or
  # nil when the list is empty.
  def latest_run(repo_name, workflow_name, branch_name)
    runs = client.workflow_runs(repo_name, workflow_name, {branch: branch_name})
    runs[:workflow_runs].first
  end

  # Cancels the latest run of every test workflow.
  def cancel_test_runs
    test_runs.each do |test_run|
      cancel_test_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])
    end
  end

  # Best-effort cancellation of the latest run of a single workflow. Does
  # nothing when the workflow has no run; errors raised by the cancel call
  # itself are logged and swallowed.
  def cancel_test_run(repo_name, workflow_name, branch_name)
    run = latest_run(repo_name, workflow_name, branch_name)
    # Nothing to cancel; without this guard cleanup would raise on nil and
    # never be able to complete.
    return unless run

    run_id = run[:id]
    begin
      client.cancel_workflow_run(repo_name, run_id)
    rescue
      Clog.emit("Workflow run #{run_id} for #{repo_name} has already been finished")
    end
  end

  # Memoized list of the "details" hashes of the test cases kept in the stack.
  def test_runs
    @test_runs ||= frame["test_cases"].map { it["details"] }
  end

  # Memoized GitHub client for the end-to-end test installation.
  def client
    @client ||= Github.installation_client(Config.e2e_github_installation_id)
  end
end
|