Files
ubicloud/prog/test/github_runner.rb
Enes Cakir e2627b5c96 Pass JIT config via file to runner script
Previously, we passed the JIT config to the runner execution script via
command-line arguments using xargs. This included the JIT token in the
transient systemd unit file generated by systemd-run, which sometimes
failed with "Failed to resolve unit specifiers" errors. These issues are
hard to reproduce locally, but may be caused by template expansion
limits or token size.

We take direct control over the unit file instead of relying on
systemd-run’s transient unit generation, to better understand and debug
any related issues.

We now pass the JIT config via a file. This is more reliable for large
strings and avoids leaking sensitive tokens into the unit description.

This change is currently behind a feature flag. Once verified in
production, we’ll move the file creation to the image generation step.
2025-07-08 14:06:34 +03:00

181 lines
5.6 KiB
Ruby

# frozen_string_literal: true
require "octokit"
require "yaml"
# End-to-end test prog for GitHub runners. It creates the projects and the
# GitHub installation the test needs, builds a VM pool, dispatches the
# configured workflows, polls their latest runs until they conclude, and then
# tears the created resources down.
#
# NOTE(review): the labels below rely on `nap`, `hop_*`, `pop` and `fail_test`
# (inherited, not defined in this file) transferring control instead of
# returning to the caller -- confirm against the base prog.
class Prog::Test::GithubRunner < Prog::Test::Base
  # Workflow run conclusions that are treated as a failed test.
  FAIL_CONCLUSIONS = ["action_required", "cancelled", "failure", "skipped", "stale", "timed_out"].freeze
  # Workflow run conclusions that mean "keep polling".
  IN_PROGRESS_CONCLUSIONS = ["in_progress", "queued", "requested", "waiting", "pending", "neutral"].freeze

  # Creates the service/test projects, the GitHub installation and the strand
  # that drives this prog.
  #
  # test_cases - collection whose elements expose a "details" entry holding the
  #              "repo_name", "workflow_name" and "branch_name" of a workflow.
  #
  # Returns whatever Strand.create_with_id returns.
  def self.assemble(test_cases)
    github_service_project = Project.create(name: "Github-Runner-Service-Project") { it.id = Config.github_runner_service_project_id }
    vm_pool_service_project = Project.create(name: "Vm-Pool-Service-Project") { it.id = Config.vm_pool_project_id }
    github_test_project = Project.create(name: "Github-Runner-Test-Project")
    github_test_project.set_ff_runner_jit_file(true)

    GithubInstallation.create(
      installation_id: Config.e2e_github_installation_id,
      name: "TestUser",
      type: "User",
      project_id: github_test_project.id,
      # Backdate the installation by eight days.
      created_at: Time.now - 8 * 24 * 60 * 60
    )

    Strand.create_with_id(
      prog: "Test::GithubRunner",
      label: "start",
      stack: [{
        "created_at" => Time.now.utc,
        "test_cases" => test_cases,
        "github_service_project_id" => github_service_project.id,
        "vm_pool_service_project" => vm_pool_service_project.id,
        "github_test_project_id" => github_test_project.id
      }]
    )
  end

  # Entry label; goes straight to pool creation.
  label def start
    hop_create_vm_pool
  end

  # Assembles a single-VM pool sized from the "ubicloud" runner label and
  # remembers its id on the stack.
  label def create_vm_pool
    label_data = Github.runner_labels["ubicloud"]
    pool = Prog::Vm::VmPool.assemble(
      size: 1,
      vm_size: label_data["vm_size"],
      boot_image: label_data["boot_image"],
      location_id: Location::GITHUB_RUNNERS_ID,
      storage_size_gib: label_data["storage_size_gib"],
      arch: label_data["arch"],
      storage_encrypted: true,
      storage_skip_sync: true
    ).subject
    update_stack({"vm_pool_id" => pool.id})
    hop_wait_vm_pool_to_be_ready
  end

  # Naps until the number of provisioned VMs in the pool matches its size.
  label def wait_vm_pool_to_be_ready
    pool = VmPool[frame["vm_pool_id"]]
    nap 10 unless pool.size == pool.vms_dataset.exclude(provisioned_at: nil).count

    # No need to provision a new VM to the pool when the first one is picked.
    # This simplifies the process of verifying at the end of the test that VMs
    # were correctly picked from the pool.
    pool.update(size: 0)
    hop_trigger_test_runs
  end

  # Dispatches every configured workflow; records a failure message and jumps
  # to cleanup as soon as one dispatch reports failure.
  label def trigger_test_runs
    test_runs.each do |test_run|
      unless trigger_test_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])
        update_stack({"fail_message" => "Can not trigger workflow for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]}"})
        hop_clean_resources
      end
    end

    # To make sure that test runs are triggered.
    # We will still check the runs in the next step in
    # case an incident happens on the GitHub side.
    sleep 30

    hop_check_test_runs
  end

  # Inspects the latest run of every configured workflow. Naps while a run is
  # still in progress, records a failure message for a failed or missing run,
  # and hops to cleanup once the loop is left.
  label def check_test_runs
    test_runs.each do |test_run|
      run = latest_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])

      # In case the run can not be triggered in the previous state: either the
      # workflow has no run at all, or its newest run predates this test.
      if run.nil? || run[:created_at] < Time.parse(frame["created_at"])
        update_stack({"fail_message" => "Can not trigger workflow for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]}"})
        break
      end

      conclusion = run[:conclusion]
      if FAIL_CONCLUSIONS.include?(conclusion)
        update_stack({"fail_message" => "Test run for #{test_run["repo_name"]}, #{test_run["workflow_name"]}, #{test_run["branch_name"]} failed with conclusion #{conclusion}"})
        break
      elsif IN_PROGRESS_CONCLUSIONS.include?(conclusion) || conclusion.nil?
        nap 15
      end
    end

    hop_clean_resources
  end

  # Cancels the workflow runs, waits for runners, pools and repositories to
  # disappear, destroys the projects created by assemble, and then either
  # fails the test with the recorded message or hops to finish.
  label def clean_resources
    cancel_test_runs

    if GithubRunner.any?
      Clog.emit("Waiting runners to finish their jobs")
      nap 15
    end

    if (pool = VmPool[frame["vm_pool_id"]])
      # A VM still attached to the pool is recorded as a failure.
      unless pool.vms.count.zero?
        update_stack({"fail_message" => "The runner did not picked from the pool"})
      end
      pool.incr_destroy
    end

    GithubRepository.each(&:incr_destroy)

    if VmPool.any?
      Clog.emit("Waiting vm pools to be destroyed")
      nap 15
    end

    if GithubRepository.any?
      Clog.emit("Waiting repositories to be destroyed")
      nap 15
    end

    Project[frame["github_service_project_id"]]&.destroy
    Project[frame["vm_pool_service_project"]]&.destroy
    Project[frame["github_test_project_id"]]&.destroy

    if frame["fail_message"]
      fail_test(frame["fail_message"])
    else
      hop_finish
    end
  end

  # Terminal label for a successful test.
  label def finish
    pop "GithubRunner tests are finished!"
  end

  # Label that only naps.
  label def failed
    nap 15
  end

  # Dispatches the given workflow on the given branch, passing the current
  # GITHUB_RUN_ID environment variable as the "triggered_by" input.
  # Returns the client's workflow_dispatch result (used as a truthy/falsy flag).
  def trigger_test_run(repo_name, workflow_name, branch_name)
    client.workflow_dispatch(repo_name, workflow_name, branch_name, {inputs: {triggered_by: ENV["GITHUB_RUN_ID"]}})
  end

  # Returns the first entry of the workflow's run list for the branch, or nil
  # when the list is empty.
  def latest_run(repo_name, workflow_name, branch_name)
    runs = client.workflow_runs(repo_name, workflow_name, {branch: branch_name})
    runs[:workflow_runs].first
  end

  # Cancels the latest run of every configured workflow.
  def cancel_test_runs
    test_runs.each do |test_run|
      cancel_test_run(test_run["repo_name"], test_run["workflow_name"], test_run["branch_name"])
    end
  end

  # Best-effort cancellation of the workflow's latest run. Does nothing when
  # the workflow has no runs; errors raised by the cancel call are logged and
  # swallowed so that cleanup can continue.
  def cancel_test_run(repo_name, workflow_name, branch_name)
    run = latest_run(repo_name, workflow_name, branch_name)
    # Nothing to cancel; a missing run must not abort the resource cleanup.
    return unless run

    run_id = run[:id]
    begin
      client.cancel_workflow_run(repo_name, run_id)
    rescue
      Clog.emit("Workflow run #{run_id} for #{repo_name} has already been finished")
    end
  end

  # Memoized "details" entries of the test cases stored on the stack.
  def test_runs
    @test_runs ||= frame["test_cases"].map { it["details"] }
  end

  # Memoized GitHub client for the e2e installation.
  def client
    @client ||= Github.installation_client(Config.e2e_github_installation_id)
  end
end