ubicloud/prog/aws/instance.rb
Burak Velioglu 56bd53f29f Use aws-assigned public IPs for runner instances
Instead of creating separate EIPs, runner instances now rely on AWS
to assign public IPs during instance creation, simplifying the
networking setup and reducing resource overhead.
2025-09-30 01:32:08 +03:00

316 lines
9.4 KiB
Ruby

# frozen_string_literal: true
class Prog::Aws::Instance < Prog::Base
subject_is :vm, :aws_instance
# Entry label. Runner VMs go straight to instance creation; every other VM
# first gets an IAM role whose trust policy lets the EC2 service assume it.
label def start
  # Cloudwatch is not needed for runner instances
  hop_create_instance if is_runner?

  ec2_trust_statement = {
    Effect: "Allow",
    Principal: {Service: "ec2.amazonaws.com"},
    Action: "sts:AssumeRole"
  }
  trust_policy = {Version: "2012-10-17", Statement: [ec2_trust_statement]}.to_json

  # Errors listed in ignore_invalid_entity (e.g. the role already existing)
  # are logged and skipped.
  ignore_invalid_entity do
    iam_client.create_role({role_name:, assume_role_policy_document: trust_policy})
  end

  hop_create_role_policy
end
# Creates the IAM policy that allows writing to, and describing the streams
# of, the "/<vm name>/auth" and "/<vm name>/postgresql" log groups.
label def create_role_policy
  log_group_arns = %w[auth postgresql].map do |group|
    "arn:aws:logs:*:*:log-group:/#{vm.name}/#{group}"
  end

  write_statement = {
    Effect: "Allow",
    Action: ["logs:CreateLogStream", "logs:PutLogEvents", "logs:CreateLogGroup"],
    Resource: log_group_arns.map { |arn| "#{arn}:log-stream:*" }
  }
  describe_statement = {
    Effect: "Allow",
    Action: "logs:DescribeLogStreams",
    Resource: log_group_arns.map { |arn| "#{arn}:*" }
  }
  policy_document = {
    Version: "2012-10-17",
    Statement: [write_statement, describe_statement]
  }.to_json

  ignore_invalid_entity do
    iam_client.create_policy({policy_name:, policy_document:})
  end

  hop_attach_role_policy
end
# Attaches the policy found by cloudwatch_policy to the VM's role, then
# continues with the instance profile.
label def attach_role_policy
  ignore_invalid_entity do
    attachment = {role_name:, policy_arn: cloudwatch_policy.arn}
    iam_client.attach_role_policy(attachment)
  end

  hop_create_instance_profile
end
# Creates the instance profile named after the VM; errors listed in
# ignore_invalid_entity are logged and skipped.
label def create_instance_profile
  ignore_invalid_entity do
    profile_request = {instance_profile_name:}
    iam_client.create_instance_profile(profile_request)
  end

  hop_add_role_to_instance_profile
end
# Puts the VM's role into its instance profile, then waits for the profile
# to become visible.
label def add_role_to_instance_profile
  ignore_invalid_entity do
    membership = {instance_profile_name:, role_name:}
    iam_client.add_role_to_instance_profile(membership)
  end

  hop_wait_instance_profile_created
end
# Naps for a second whenever IAM still reports the instance profile as
# missing; once the lookup succeeds, moves on to creating the instance.
label def wait_instance_profile_created
  profile_lookup = {instance_profile_name:}

  begin
    iam_client.get_instance_profile(profile_lookup)
  rescue Aws::IAM::Errors::NoSuchEntity
    nap 1
  end

  hop_create_instance
end
# Launches the EC2 instance for this VM and records it as an AwsInstance row
# (instance id, availability-zone id and public DNS name).
#
# The shared launch parameters are merged with instance_specific_params, and
# vm.id is sent as the client token.
label def create_instance
  image_id = vm.boot_image # AMI ID
  instance_type = Option.aws_instance_type_name(vm.family, vm.vcpus)

  boot_volume = {
    encrypted: true,
    delete_on_termination: true,
    iops: 3000,
    volume_size: vm.vm_storage_volumes_dataset.where(:boot).get(:size_gib),
    volume_type: "gp3",
    throughput: 125
  }
  dns_name_options = {
    hostname_type: "ip-name",
    enable_resource_name_dns_a_record: false,
    enable_resource_name_dns_aaaa_record: false
  }

  launch_request = {
    image_id:,
    instance_type:,
    block_device_mappings: [{device_name: "/dev/sda1", ebs: boot_volume}],
    private_dns_name_options: dns_name_options,
    min_count: 1,
    max_count: 1,
    tag_specifications: Util.aws_tag_specifications("instance", vm.name),
    client_token: vm.id
  }.merge(instance_specific_params)

  begin
    launched = client.run_instances(launch_request)
  rescue Aws::EC2::Errors::InvalidParameterValue => error
    # Nap and try again while EC2 still rejects the instance profile name;
    # any other invalid parameter is re-raised.
    nap 1 if error.message.include?("Invalid IAM Instance Profile name")
    raise
  rescue Aws::EC2::Errors::InsufficientInstanceCapacity => error
    if is_runner? && (runner = GithubRunner[vm_id: vm.id])
      Clog.emit("insufficient instance capacity") do
        {insufficient_instance_capacity: {vm:, message: error.message}}
      end
      runner.provision_spare_runner
      runner.incr_destroy
      nap 30 # It will be destroyed through runner -> vm destroy before next attempt
    end
    raise
  end

  instance = launched.instances.first
  subnet_id = instance.network_interfaces.first.subnet_id
  # The availability-zone id is read from the subnet the instance landed in.
  az_id = client.describe_subnets(subnet_ids: [subnet_id]).subnets.first.availability_zone_id

  AwsInstance.create_with_id(
    vm.id,
    instance_id: instance.instance_id,
    az_id:,
    ipv4_dns_name: instance.public_dns_name
  )

  hop_wait_instance_created
end
# Waits for the launched instance to reach the "running" state, then records
# its public IPv4/IPv6 addresses on the VM and finishes with "vm created".
label def wait_instance_created
  filters = [
    {name: "instance-id", values: [aws_instance.instance_id]},
    {name: "tag:Ubicloud", values: ["true"]}
  ]
  instance = client.describe_instances({filters:}).reservations.first&.instances&.first

  # EC2 reads are eventually consistent: right after launch a filtered
  # describe_instances can come back with no reservations at all. Treat a
  # missing instance like one that is not running yet, instead of failing
  # with NoMethodError on nil.
  nap 1 unless instance&.dig(:state, :name) == "running"

  public_ipv4 = instance.dig(:network_interfaces, 0, :association, :public_ip)
  public_ipv6 = instance.dig(:network_interfaces, 0, :ipv_6_addresses, 0, :ipv_6_address)

  AssignedVmAddress.create(dst_vm_id: vm.id, ip: public_ipv4)
  vm.sshable&.update(host: public_ipv4)
  vm.update(cores: vm.vcpus / 2, allocated_at: Time.now, ephemeral_net6: public_ipv6)

  pop "vm created"
end
# Terminates the EC2 instance (if one was recorded) and deletes its
# AwsInstance row. Runner VMs finish here because they skip the IAM setup in
# start; other VMs continue with IAM cleanup.
label def destroy
  if (instance_record = aws_instance)
    begin
      client.terminate_instances(instance_ids: [instance_record.instance_id])
    rescue Aws::EC2::Errors::InvalidInstanceIDNotFound
      # Deliberately ignored: EC2 no longer knows this instance id.
    end

    instance_record.destroy
  end

  pop "vm destroyed" if is_runner?

  hop_cleanup_roles
end
# Removes the IAM resources for this VM: the role's membership in the
# instance profile, the profile, the policy (when it can still be found) and
# finally the role. Each call goes through ignore_invalid_entity, so the
# errors listed there are logged and skipped.
label def cleanup_roles
  ignore_invalid_entity { iam_client.remove_role_from_instance_profile({instance_profile_name:, role_name:}) }
  ignore_invalid_entity { iam_client.delete_instance_profile({instance_profile_name:}) }

  if (policy = cloudwatch_policy)
    # Detach before deleting the policy itself.
    ignore_invalid_entity { iam_client.detach_role_policy({role_name:, policy_arn: policy.arn}) }
    ignore_invalid_entity { iam_client.delete_policy({policy_arn: policy.arn}) }
  end

  ignore_invalid_entity { iam_client.delete_role({role_name:}) }

  pop "vm destroyed"
end
# Memoized client taken from the VM location's credential; used in this
# class for the run/describe/terminate instance calls.
def client
  return @client if @client

  @client = vm.location.location_credential.client
end
# Memoized IAM client taken from the VM location's credential.
def iam_client
  return @iam_client if @iam_client

  @iam_client = vm.location.location_credential.iam_client
end
# Finds (and memoizes once found) the locally-scoped IAM policy whose name
# equals policy_name. Returns nil when no such policy exists.
#
# list_policies is paginated, so every page is searched rather than only the
# first one; otherwise the policy is missed as soon as the account holds more
# local policies than fit on a single page.
def cloudwatch_policy
  return @cloudwatch_policy if @cloudwatch_policy

  iam_client.list_policies(scope: "Local").each_page do |page|
    @cloudwatch_policy = page.policies.find { |policy| policy.policy_name == policy_name }
    break if @cloudwatch_policy
  end

  @cloudwatch_policy
end
# Name of the IAM policy belonging to this VM: "<vm name>-cw-agent-policy".
def policy_name
  vm_name = vm.name
  "#{vm_name}-cw-agent-policy"
end
# The IAM role is named exactly like the VM.
def role_name = vm.name
# Name of the IAM instance profile for this VM: "<vm name>-instance-profile".
def instance_profile_name
  vm_name = vm.name
  "#{vm_name}-instance-profile"
end
# True when the VM's unix user is "runneradmin".
#
# The answer is cached with an explicit nil check: `||=` would recompute on
# every call whenever the cached answer is false.
def is_runner?
  @is_runner = (vm.unix_user == "runneradmin") if @is_runner.nil?
  @is_runner
end
# Launch parameters that differ between runner and standard instances.
#
# Runners get the runner user data, the runner market options and an inline
# network interface definition. Standard instances get the standard user
# data, the IAM instance profile and the existing network interface.
def instance_specific_params
  # Strip comment-only and empty lines from the script, then Base64-encode it.
  encode_script = ->(script) { Base64.encode64(script.gsub(/^(\s*# .*)?\n/, "")) }

  if is_runner?
    user_data = encode_script.call(runner_user_data)
    {
      user_data:,
      instance_market_options: runner_market_options,
      network_interfaces: runner_network_interfaces
    }
  else
    user_data = encode_script.call(standard_user_data)
    {
      user_data:,
      iam_instance_profile: {name: instance_profile_name},
      network_interfaces: standard_network_interfaces
    }
  end
end
# Standard user data plus one extra command that appends an /etc/hosts entry
# pointing ubicloudhostplaceholder.blob.core.windows.net at the VM's private
# IPv4 address.
def runner_user_data
  "#{standard_user_data}\necho \"#{vm.private_ipv4} ubicloudhostplaceholder.blob.core.windows.net\" >> /etc/hosts"
end
# Builds the bash script passed to the instance as user data.
#
# The script creates the VM's unix user when its home directory is missing
# (passwordless sudo, SSH directory seeded from the ubuntu user), overwrites
# that user's authorized_keys with the sshable's public keys plus the
# project's get_ff_vm_public_ssh_keys (if any), and locks the ubuntu account.
def standard_user_data
  sshable_keys = vm.sshable.keys.map(&:public_key)
  project_keys = vm.project.get_ff_vm_public_ssh_keys || []
  # One key per line; shell-escaped below so it is safe as an echo argument.
  public_keys = (sshable_keys + project_keys).join("\n")

  <<~USER_DATA
    #!/bin/bash
    custom_user="#{vm.unix_user}"
    if [ ! -d /home/$custom_user ]; then
    # Create the custom user
    adduser $custom_user --disabled-password --gecos ""
    # Add the custom user to the sudo group
    usermod -aG sudo $custom_user
    # disable password for the custom user
    echo "$custom_user ALL=(ALL:ALL) NOPASSWD:ALL" | sudo tee /etc/sudoers.d/$custom_user
    # Set up SSH access for the custom user
    mkdir -p /home/$custom_user/.ssh
    cp /home/ubuntu/.ssh/authorized_keys /home/$custom_user/.ssh/
    chown -R $custom_user:$custom_user /home/$custom_user/.ssh
    chmod 700 /home/$custom_user/.ssh
    chmod 600 /home/$custom_user/.ssh/authorized_keys
    fi
    echo #{public_keys.shellescape} > /home/$custom_user/.ssh/authorized_keys
    usermod -L ubuntu
  USER_DATA
end
# Market options for runner instances. Returns nil unless spot instances are
# enabled in Config; otherwise requests a one-time spot instance that is
# terminated on interruption.
def runner_market_options
  return unless Config.github_runner_aws_spot_instance_enabled

  spot_options = {spot_instance_type: "one-time", instance_interruption_behavior: "terminate"}

  price_per_vcpu = Config.github_runner_aws_spot_instance_max_price_per_vcpu
  if price_per_vcpu > 0
    # Not setting max_price means you'll pay up to the on-demand price.
    # NOTE(review): the factor 60 presumably converts a per-minute price to
    # per-hour; confirm against the Config definition.
    spot_options[:max_price] = (vm.vcpus * price_per_vcpu * 60).to_s
  end

  {market_type: "spot", spot_options:}
end
# Standard instances attach the already-created network interface of the
# VM's first NIC as device 0.
def standard_network_interfaces
  primary_nic = vm.nics.first
  eni_id = primary_nic.nic_aws_resource.network_interface_id

  [{device_index: 0, network_interface_id: eni_id}]
end
# Runner instances do not get a NIC or EIP created up front, so the network
# interface is described inline and AWS is asked to assign a public IPv4
# address and one IPv6 address at launch.
def runner_network_interfaces
  primary_nic = vm.nics.first
  subnet_id = primary_nic.nic_aws_resource.subnet_id
  security_group_id = primary_nic.private_subnet.private_subnet_aws_resource.security_group_id

  interface = {
    device_index: 0,
    subnet_id:,
    groups: [security_group_id],
    associate_public_ip_address: true,
    ipv_6_address_count: 1
  }

  [interface]
end
# Runs the given block and returns its value. The four IAM errors listed
# below are logged through Clog and swallowed, which lets the IAM setup and
# cleanup steps be repeated; any other exception propagates.
def ignore_invalid_entity
  begin
    yield
  rescue Aws::IAM::Errors::InvalidInstanceProfileName,
    Aws::IAM::Errors::InvalidRoleName,
    Aws::IAM::Errors::NoSuchEntity,
    Aws::IAM::Errors::EntityAlreadyExists => ignored
    Clog.emit("ID not found or already exists for aws instance") do
      {ignored_aws_instance_failure: {exception: Util.exception_to_hash(ignored, backtrace: nil)}}
    end
  end
end
end