Additionally, allow it to choose whether to support hugepages and to choose the firmware version (though currently only one firmware version is supported). The default for hugepages is set to "on," pending additional work to avoid the reboot overheads of hugepages in development. In rhizome prep_host, download all supported cloud-hypervisor versions and firmware versions. Currently, only one firmware version is supported. On Ubuntu 24, both cloud-hypervisor 35.1 and 45.0 are installed, with 45.0 as the default if it is installed. On Ubuntu 22, only cloud-hypervisor 35.1 is installed.
749 lines
25 KiB
Ruby
749 lines
25 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require_relative "../../common/lib/util"
|
|
require_relative "../../common/lib/network"
|
|
|
|
require "fileutils"
|
|
require "netaddr"
|
|
require "json"
|
|
require "openssl"
|
|
require "base64"
|
|
require "uri"
|
|
require_relative "vm_path"
|
|
require_relative "cloud_hypervisor"
|
|
require_relative "storage_volume"
|
|
|
|
class VmSetup
|
|
Nic = Struct.new(:net6, :net4, :tap, :mac, :private_ipv4_gateway)
|
|
|
|
# Creates a setup helper bound to a single VM.
#
# vm_name           - name of the VM; used for paths, unit names and (shell
#                     escaped, see #q_vm) in commands.
# hugepages:        - when truthy, a hugetlbfs mount is managed for the VM and
#                     the VM is started with hugepage-backed memory; otherwise
#                     the hugepage steps are skipped and "shared=on" is used.
# ch_version:       - key looked up through CloudHypervisor::Version[].
# firmware_version: - key looked up through CloudHypervisor::Firmware[].
#
# Raises RuntimeError when either lookup yields nil/false.
def initialize(vm_name, hugepages: true, ch_version: nil, firmware_version: nil)
  @vm_name = vm_name
  @hugepages = hugepages
  @ch_version = CloudHypervisor::Version[ch_version] || no_valid_ch_version
  @firmware_version = CloudHypervisor::Firmware[firmware_version] || no_valid_firmware_version
end
|
|
|
|
# Aborts construction when no usable cloud-hypervisor version was resolved.
#
# Always raises RuntimeError; never returns.
private def no_valid_ch_version
  raise("no valid cloud hypervisor version")
end
|
|
|
|
# Aborts construction when no usable firmware version was resolved.
#
# Always raises RuntimeError; never returns.
private def no_valid_firmware_version
  raise("no valid cloud hypervisor firmware version")
end
|
|
|
|
# Returns the VM name escaped for safe interpolation into shell commands.
# The escaped value is computed once and memoized.
def q_vm
  @q_vm ||= @vm_name.shellescape
end
|
|
|
|
# YAML quoting: renders +s+ as a scalar that can be embedded in
# hand-written YAML text.
#
# I don't see a better way to quote a string meant for embedding
# in literal YAML other than to generate a full YAML document and
# then stripping out headers and footers. Consider the special
# string "NO" (parses as boolean, unless quoted):
#
#   > YAML.dump('NO')
#   => "--- 'NO'\n"
#
#   > YAML.dump('NO')[4..-2]
#   => "'NO'"
#
# s - value to quote (strings and integers are passed in this file).
#
# Returns the dumped document without its first four characters
# ("--- ") and without its final character (the trailing newline).
def yq(s)
  # Required here rather than at file level, so YAML is only loaded
  # when quoting is actually used.
  require "yaml"
  # line_width: -1 keeps long scalars on one line instead of folding them.
  YAML.dump(s, line_width: -1)[4..-2]
end
|
|
|
|
# Returns the memoized VmPath for this VM, through which the paths and
# per-VM files used by this class are read and written.
def vp
  @vp ||= VmPath.new(@vm_name)
end
|
|
|
|
# Prepares a VM from scratch and starts it.
#
# Steps, in order: write the cloud-init image, set up networking and
# storage concurrently (one thread each), mount hugepages, prepare PCI
# devices, install and start the systemd unit, update the per-NIC "via"
# routes, and finally enable CPU bursting unless
# cpu_burst_percent_limit is 0.
#
# The parameters are forwarded unchanged to the helpers named above; see
# those methods for how each one is used.
def prep(unix_user, public_keys, nics, gua, ip4, local_ip4, max_vcpus, cpu_topology,
  mem_gib, ndp_needed, storage_params, storage_secrets, swap_size_bytes, pci_devices,
  boot_image, dns_ipv4, slice_name, cpu_percent_limit, cpu_burst_percent_limit)

  cloudinit(unix_user, public_keys, gua, nics, swap_size_bytes, boot_image, dns_ipv4)
  # Networking and storage do not depend on each other, so they run in
  # parallel; Thread#join re-raises any exception raised inside a thread.
  network_thread = Thread.new do
    setup_networking(false, gua, ip4, local_ip4, nics, ndp_needed, dns_ipv4, multiqueue: max_vcpus > 1)
  end
  storage_thread = Thread.new do
    storage(storage_params, storage_secrets, true)
  end
  [network_thread, storage_thread].each(&:join)
  hugepages(mem_gib)
  prepare_pci_devices(pci_devices)
  install_systemd_unit(max_vcpus, cpu_topology, mem_gib, storage_params, nics, pci_devices, slice_name, cpu_percent_limit)
  start_systemd_unit
  update_via_routes(nics)

  enable_bursting(slice_name, cpu_burst_percent_limit) unless cpu_burst_percent_limit == 0
end
|
|
|
|
# Recreates the state of an already-prepared VM that is not kept on disk
# and starts it again.
#
# Networking is set up with skip_persisted=true (the address files and
# the public-IPv4 unblock rule are not rewritten), storage is started
# without the prep step, and the systemd unit is started but not
# reinstalled. CPU bursting is enabled unless cpu_burst_percent_limit
# is 0.
#
# multiqueue: - forwarded to #setup_networking.
def recreate_unpersisted(gua, ip4, local_ip4, nics, mem_gib, ndp_needed, storage_params,
  storage_secrets, dns_ipv4, pci_devices, slice_name, cpu_burst_percent_limit, multiqueue:)

  setup_networking(true, gua, ip4, local_ip4, nics, ndp_needed, dns_ipv4, multiqueue: multiqueue)
  hugepages(mem_gib)
  storage(storage_params, storage_secrets, false)
  prepare_pci_devices(pci_devices)
  start_systemd_unit
  update_via_routes(nics)

  enable_bursting(slice_name, cpu_burst_percent_limit) unless cpu_burst_percent_limit == 0
end
|
|
|
|
# Restarts the VM's systemd unit and then re-applies the CPU burst limit.
#
# slice_name              - cgroup slice the unit runs in.
# cpu_burst_percent_limit - burst allowance in percent; 0 skips the
#                           bursting step.
def restart(slice_name, cpu_burst_percent_limit)
  restart_systemd_unit
  enable_bursting(slice_name, cpu_burst_percent_limit) unless cpu_burst_percent_limit == 0
end
|
|
|
|
# Regenerates configuration for a VM whose IPv6 assignment changed.
#
# Rewrites the cloud-init image, redoes networking (including the
# persisted address files), mounts hugepages, starts storage without the
# prep step, reinstalls the systemd unit and updates the "via" routes.
# The unit is not started here and PCI devices are not re-prepared. CPU
# bursting is enabled unless cpu_burst_percent_limit is 0.
def reassign_ip6(unix_user, public_keys, nics, gua, ip4, local_ip4, max_vcpus, cpu_topology,
  mem_gib, ndp_needed, storage_params, storage_secrets, swap_size_bytes, pci_devices, boot_image,
  dns_ipv4, slice_name, cpu_percent_limit, cpu_burst_percent_limit)

  cloudinit(unix_user, public_keys, gua, nics, swap_size_bytes, boot_image, dns_ipv4)
  setup_networking(false, gua, ip4, local_ip4, nics, ndp_needed, dns_ipv4, multiqueue: max_vcpus > 1)
  hugepages(mem_gib)
  storage(storage_params, storage_secrets, false)
  install_systemd_unit(max_vcpus, cpu_topology, mem_gib, storage_params, nics, pci_devices, slice_name, cpu_percent_limit)
  update_via_routes(nics)

  enable_bursting(slice_name, cpu_burst_percent_limit) unless cpu_burst_percent_limit == 0
end
|
|
|
|
# Builds the VM's network namespace and everything inside it.
#
# skip_persisted - when true, the address files and the public IPv4
#                  unblock rule are left as they are.
# gua            - IPv6 network assigned to the VM; it is split in two
#                  halves (guest and clover) by #subdivide_network.
# ip4            - public IPv4 network as a string; an empty string means
#                  "no public IPv4".
# local_ip4      - IPv4 network used for the veth pair (see #routes4).
# nics           - NIC descriptions (see Nic).
# ndp_needed     - forwarded to #setup_veths_6.
# dns_ipv4       - forwarded to #setup_taps_6.
# multiqueue:    - forwarded to #interfaces.
def setup_networking(skip_persisted, gua, ip4, local_ip4, nics, ndp_needed, dns_ipv4, multiqueue:)
  # Normalize "no public IPv4" to nil so later steps can test truthiness.
  ip4 = nil if ip4.empty?
  guest_ephemeral, clover_ephemeral = subdivide_network(NetAddr.parse_net(gua))

  if !skip_persisted
    # Write out guest-delegated and clover infrastructure address
    # ranges, designed around non-floating IPv6 networks bound to the
    # host.
    vp.write_guest_ephemeral(guest_ephemeral.to_s)
    vp.write_clover_ephemeral(clover_ephemeral.to_s)

    if ip4
      vm_sub = NetAddr::IPv4Net.parse(ip4)
      vp.write_public_ipv4(vm_sub.to_s)
      unblock_ip4(ip4)
    end
  end

  interfaces(nics, multiqueue)
  setup_veths_6(guest_ephemeral, clover_ephemeral, gua, ndp_needed)
  setup_taps_6(gua, nics, dns_ipv4)
  routes4(ip4, local_ip4, nics)
  write_nftables_conf(ip4, gua, nics)
  forwarding
end
|
|
|
|
# Allows traffic for the VM's public IPv4 address on the host.
#
# Writes /etc/nftables.d/<vm>.conf, containing an "add element" statement
# for the network address of +ip4+, via a ".tmp" file that is renamed
# into place, then reloads nftables.
#
# ip4 - public IPv4 network as a string parseable by NetAddr::IPv4Net.
def unblock_ip4(ip4)
  ip_net = NetAddr::IPv4Net.parse(ip4).network.to_s
  filename = "/etc/nftables.d/#{q_vm}.conf"
  temp_filename = "#{filename}.tmp"
  File.open(temp_filename, File::RDWR | File::CREAT) do |f|
    # NOTE(review): with LOCK_NB, flock returns false instead of blocking
    # when the lock is held elsewhere; that result is not checked here.
    f.flock(File::LOCK_EX | File::LOCK_NB)
    f.puts(<<-NFTABLES)
#!/usr/sbin/nft -f
add element inet drop_unused_ip_packets allowed_ipv4_addresses { #{ip_net} }
    NFTABLES
    # NOTE(review): the rename happens before the block closes the file.
    File.rename(temp_filename, filename)
  end

  reload_nftables
end
|
|
|
|
# Reverts #unblock_ip4: removes the VM's file under /etc/nftables.d (a
# missing file is not an error) and reloads nftables.
def block_ip4
  FileUtils.rm_f("/etc/nftables.d/#{q_vm}.conf")
  reload_nftables
end
|
|
|
|
# Asks systemd to reload the host's nftables service.
def reload_nftables
  r "systemctl reload nftables"
end
|
|
|
|
# Delete all traces of the VM: network state first, then unit files,
# storage and hugepages, and finally the VM's unix user.
def purge
  purge_network
  purge_without_network
  purge_user
end
|
|
|
|
# Removes the VM's host-side network state: the public IPv4 unblock rule
# and the network namespace.
#
# A missing namespace is tolerated; any other CommandFail is re-raised.
def purge_network
  block_ip4

  begin
    r "ip netns del #{q_vm}"
  rescue CommandFail => ex
    # Only the "namespace does not exist" failure is ignored.
    raise unless /Cannot remove namespace file ".*": No such file or directory/.match?(ex.stderr)
  end
end
|
|
|
|
# Removes everything except network state and the unix user: the VM and
# dnsmasq systemd unit files (followed by a daemon-reload), the storage
# volumes, and the hugepages mount.
def purge_without_network
  FileUtils.rm_f(vp.systemd_service)
  FileUtils.rm_f(vp.dnsmasq_service)
  r "systemctl daemon-reload"

  purge_storage
  unmount_hugepages
end
|
|
|
|
# Deletes the VM's unix user together with its home directory.
#
# A user that does not exist is tolerated; any other CommandFail is
# re-raised.
def purge_user
  r "deluser --remove-home #{q_vm}"
rescue CommandFail => ex
  raise unless /The user `.*' does not exist./.match?(ex.stderr)
end
|
|
|
|
# Removes the VM's writable storage volumes.
#
# The volume list is read from prep.json. For every volume not marked
# "read_only" the SPDK artifacts are purged; once that has been done for
# all of them, each volume's storage root is removed.
def purge_storage
  # prep.json doesn't exist, nothing more to do
  return if !File.exist?(vp.prep_json)

  storage_roots = []

  params = JSON.parse(File.read(vp.prep_json))
  # The block parameter is named differently from the outer +params+ so
  # that it no longer shadows it.
  params["storage_volumes"].reject { _1["read_only"] }.each { |volume_params|
    volume = StorageVolume.new(@vm_name, volume_params)
    volume.purge_spdk_artifacts
    storage_roots.append(volume.storage_root)
  }

  storage_roots.each { |path|
    rm_if_exists(path)
  }
end
|
|
|
|
# Unmounts the VM's hugepages directory.
#
# Does nothing when hugepages are disabled for this VM. A mount point
# that is missing or not mounted is tolerated; any other CommandFail is
# re-raised.
def unmount_hugepages
  return unless @hugepages
  r "umount #{vp.q_hugepages}"
rescue CommandFail => ex
  raise unless /(no mount point specified)|(not mounted)|(No such file or directory)/.match?(ex.stderr)
end
|
|
|
|
# Creates and mounts the VM's private hugetlbfs directory.
#
# mem_gib - size of the mount in GiB (passed as "size=<mem_gib>G").
#
# Does nothing when hugepages are disabled for this VM. The directory is
# owned by the VM's user and group.
def hugepages(mem_gib)
  return unless @hugepages
  FileUtils.mkdir_p vp.hugepages
  FileUtils.chown @vm_name, @vm_name, vp.hugepages
  r "mount -t hugetlbfs -o uid=#{q_vm},size=#{mem_gib}G nodev #{vp.q_hugepages}"
end
|
|
|
|
# (Re)creates the VM's network namespace, the veth pair connecting it to
# the host, and one tap device per NIC.
#
# nics       - objects responding to #tap and #private_ipv4_gateway.
# multiqueue - when truthy, taps are created with "multi_queue vnet_hdr".
def interfaces(nics, multiqueue)
  # We first delete the network namespace for idempotency. Instead
  # we could catch various exceptions for each command run, and if
  # the error message matches certain text, we could resume. But
  # the "ip link add" step generates the MAC addresses randomly,
  # which makes it unsuitable for error message matching. Deleting
  # and recreating the network namespace seems easier and safer.
  begin
    r "ip netns del #{q_vm}"
  rescue CommandFail => ex
    raise unless /Cannot remove namespace file ".*": No such file or directory/.match?(ex.stderr)
  end

  # After the above deletion, the vetho interface may still exist because the
  # namespace deletion does not handle related interface deletion
  # in an atomic way. The command returns success and the cleanup of the
  # vetho* interface may be done a little bit later. Here, we wait for the
  # interface to disappear before going ahead because the ip link add command
  # is not idempotent, either.
  5.times do
    if File.exist?("/sys/class/net/vetho#{q_vm}")
      sleep 0.1
    else
      break
    end
  end

  r "ip netns add #{q_vm}"

  # Generate MAC addresses rather than letting Linux do it to avoid
  # a vexing bug whereby a freshly created link will, at least once,
  # spontaneously change its MAC address sometime soon after
  # creation, as caught by instrumenting reads of
  # /sys/class/net/vethi#{q_vm}/address at two points in time. The
  # result is a race condition that *sometimes* worked.
  r "ip link add vetho#{q_vm} addr #{gen_mac.shellescape} type veth peer name vethi#{q_vm} addr #{gen_mac.shellescape} netns #{q_vm}"
  multiqueue_fragment = multiqueue ? " multi_queue vnet_hdr " : " "
  nics.each do |nic|
    r "ip -n #{q_vm} tuntap add dev #{nic.tap} mode tap user #{q_vm} #{multiqueue_fragment}"
    r "ip -n #{q_vm} addr replace #{nic.private_ipv4_gateway} dev #{nic.tap}"
  end
end
|
|
|
|
# Splits a network into its two halves.
#
# net - network object responding to #netmask (with #prefix_len) and
#       #resize; the resized network must respond to #next_sib.
#
# Returns a two-element array: +net+ resized to a prefix one bit longer,
# followed by that network's next sibling.
def subdivide_network(net)
  prefix = net.netmask.prefix_len + 1
  halved = net.resize(prefix)
  [halved, halved.next_sib]
end
|
|
|
|
# Configures IPv6 on the veth pair between the host and the namespace.
#
# guest_ephemeral  - guest half of the GUA (see #subdivide_network).
# clover_ephemeral - clover half of the GUA.
# gua              - the VM's full IPv6 network, as a string.
# ndp_needed       - when truthy, NDP proxy entries are added on the
#                    device carrying the host's default route.
def setup_veths_6(guest_ephemeral, clover_ephemeral, gua, ndp_needed)
  # Routing: from host to subordinate.
  vethi_ll = mac_to_ipv6_link_local(r("ip netns exec #{q_vm} cat /sys/class/net/vethi#{q_vm}/address").chomp)
  r "ip link set dev vetho#{q_vm} up"
  r "ip route replace #{gua.shellescape} via #{vethi_ll.shellescape} dev vetho#{q_vm}"

  if ndp_needed
    routes = r "ip -j route"
    main_device = parse_routes(routes)
    r "ip -6 neigh add proxy #{guest_ephemeral.nth(2)} dev #{main_device}"
    r "ip -6 neigh add proxy #{clover_ephemeral.nth(0)} dev #{main_device}"
  end

  # Accept clover traffic within the namespace (don't just let it
  # enter a default routing loop via forwarding)
  r "ip -n #{q_vm} addr replace #{clover_ephemeral.to_s.shellescape} dev vethi#{q_vm}"

  # Routing: from subordinate to host.
  vetho_ll = mac_to_ipv6_link_local(File.read("/sys/class/net/vetho#{q_vm}/address").chomp)
  r "ip -n #{q_vm} link set dev vethi#{q_vm} up"
  r "ip -n #{q_vm} route replace 2000::/3 via #{vetho_ll.shellescape} dev vethi#{q_vm}"
end
|
|
|
|
# Assigns addresses and IPv6 routes to the tap devices in the namespace.
#
# gua      - the VM's full IPv6 network, as a string.
# nics     - objects responding to #tap, #mac and #net6; must be non-empty
#            because nics.first is used at the end.
# dns_ipv4 - address added to every tap (also used by the dnsmasq
#            configuration written in #cloudinit).
def setup_taps_6(gua, nics, dns_ipv4)
  # Only the guest half of the GUA is needed here; the clover half is
  # handled in #setup_veths_6.
  guest_ephemeral, _ = subdivide_network(NetAddr.parse_net(gua))

  # Allocate ::1 in the guest network for DHCPv6.
  guest_intrusion = guest_ephemeral.nth(1).to_s + "/" + guest_ephemeral.netmask.prefix_len.to_s
  nics.each do |nic|
    r "ip -n #{q_vm} addr replace #{guest_intrusion.shellescape} dev #{nic.tap}"

    r "ip -n #{q_vm} addr replace #{dns_ipv4} dev #{nic.tap}"

    # Route ephemeral address to tap.
    r "ip -n #{q_vm} link set dev #{nic.tap} up"
    r "ip -n #{q_vm} route replace #{guest_ephemeral.to_s.shellescape} via #{mac_to_ipv6_link_local(nic.mac)} dev #{nic.tap}"
    r "ip -n #{q_vm} route del #{guest_ephemeral.to_s.shellescape} dev #{nic.tap}"

    # Route private subnet addresses to tap.
    ip6 = NetAddr::IPv6Net.parse(nic.net6)

    # Allocate ::1 in the private network for DHCPv6.
    r "ip -n #{q_vm} addr replace #{ip6.nth(1)}/#{ip6.netmask.prefix_len} dev #{nic.tap} noprefixroute"
    r "ip -n #{q_vm} route replace #{ip6.to_s.shellescape} via #{mac_to_ipv6_link_local(nic.mac)} dev #{nic.tap}"
  end

  # Fixed addresses on the first tap: the first is the destination of the
  # metadata-endpoint DNAT rule in #build_nftables_config, the second is
  # the IPv6 address dnsmasq listens on (see #cloudinit).
  r "ip -n #{q_vm} addr replace fd00:0b1c:100d:5AFE:CE:: dev #{nics.first.tap}"
  r "ip -n #{q_vm} addr replace fd00:0b1c:100d:53:: dev #{nics.first.tap}"
end
|
|
|
|
# Finds the device that carries the default route.
#
# routes - JSON text holding an array of route objects (the output of
#          "ip -j route" is passed in by #setup_veths_6).
#
# Returns the "dev" value of the first entry whose "dst" is "default".
# Raises RuntimeError when no such entry exists.
def parse_routes(routes)
  routes_j = JSON.parse(routes)
  default_route = routes_j.find { |route| route["dst"] == "default" }
  return default_route["dev"] if default_route

  fail "No default route found in #{routes_j.inspect}"
end
|
|
|
|
# Configures IPv4 addressing, routes and proxy ARP for the veth pair and
# the tap devices.
#
# ip4       - public IPv4 network as a string, or nil when the VM has none;
#             the public-address routes are only added when it is set.
# ip4_local - IPv4 network string; its network address goes on the host
#             side of the veth pair and its next sibling's network address
#             on the namespace side.
# nics      - objects responding to #tap.
def routes4(ip4, ip4_local, nics)
  vm_sub = NetAddr::IPv4Net.parse(ip4) if ip4
  local_ip = NetAddr::IPv4Net.parse(ip4_local)
  vm = vm_sub.to_s if ip4
  vetho, vethi = [local_ip.network.to_s,
    local_ip.next_sib.network.to_s]

  r "ip addr replace #{vetho}/32 dev vetho#{q_vm}"
  r "ip route replace #{vm} dev vetho#{q_vm}" if ip4
  r "echo 1 > /proc/sys/net/ipv4/conf/vetho#{q_vm}/proxy_arp"

  r "ip -n #{q_vm} addr replace #{vethi}/32 dev vethi#{q_vm}"
  # default?
  r "ip -n #{q_vm} route replace #{vetho} dev vethi#{q_vm}"

  nics.each do |nic|
    r "ip -n #{q_vm} route replace #{vm} dev #{nic.tap}" if ip4
    r "ip -n #{q_vm} route replace default via #{vetho} dev vethi#{q_vm}"

    r "ip netns exec #{q_vm} bash -c 'echo 1 > /proc/sys/net/ipv4/conf/vethi#{q_vm}/proxy_arp'"
    r "ip netns exec #{q_vm} bash -c 'echo 1 > /proc/sys/net/ipv4/conf/#{nic.tap}/proxy_arp'"
  end
end
|
|
|
|
# Adds, for every NIC whose private IPv4 network is wider than /32, a
# route to that network via its first address on the NIC's tap device.
#
# nics - objects responding to #net4 and #tap. Nothing is done when the
#        first NIC's network is a /32.
#
# Raises RuntimeError when no "nc*" link in the namespace reports
# "state UP" within 5 attempts (0.5s apart).
def update_via_routes(nics)
  return if NetAddr::IPv4Net.parse(nics.first.net4).netmask.prefix_len == 32

  # we create tap devices in "interfaces" function in this file. but
  # code execution happens faster than linux taking care of the device creation.
  # that's why by the time we reach this function, we need to check whether the
  # device is created or not and then proceed to modify the routes.
  success = false
  5.times do
    if r("ip -n #{q_vm} link | grep -E '^[0-9]+: nc[^:]+:' | grep -q 'state UP' && echo UP || echo DOWN").chomp == "UP"
      success = true
      break
    end
    sleep 0.5
  end
  unless success
    raise "VM #{q_vm} tap device not ready after 5 retries."
  end

  nics.each do |nic|
    local_ip4 = NetAddr::IPv4Net.parse(nic.net4)
    r "ip -n #{q_vm} route replace #{local_ip4.to_s.shellescape} via #{local_ip4.nth(1).to_s.shellescape} dev #{nic.tap}" unless local_ip4.netmask.prefix_len == 32
  end
end
|
|
|
|
# Generates the VM's nftables configuration, writes it through VmPath and
# loads it into the VM's namespace.
#
# Note the argument order differs from #build_nftables_config.
def write_nftables_conf(ip4, gua, nics)
  config = build_nftables_config(gua, nics, ip4)
  vp.write_nftables_conf(config)
  apply_nftables
end
|
|
|
|
# Builds the nftables "ip nat" table translating between the VM's public
# and private IPv4 addresses.
#
# ip4        - public IPv4 network string, or nil.
# private_ip - private IPv4 network string. For a /32 its network address
#              is used; otherwise the network's first address (nth(1)).
#
# Returns the table definition as a String, or nil when ip4 is nil.
def generate_nat4_rules(ip4, private_ip)
  return unless ip4

  public_ipv4 = NetAddr::IPv4Net.parse(ip4).network.to_s
  private_ipv4_addr = NetAddr::IPv4Net.parse(private_ip)
  private_ipv4 = (private_ipv4_addr.netmask.prefix_len == 32) ? private_ipv4_addr.network.to_s : private_ipv4_addr.nth(1).to_s
  <<~NAT4_RULES
    table ip nat {
      chain prerouting {
        type nat hook prerouting priority dstnat; policy accept;
        ip daddr #{public_ipv4} dnat to #{private_ipv4}
      }

      chain postrouting {
        type nat hook postrouting priority srcnat; policy accept;
        ip saddr #{private_ipv4} ip daddr != { 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 } snat to #{public_ipv4}
        ip saddr #{private_ipv4} ip daddr #{private_ipv4} snat to #{public_ipv4}
      }
    }
  NAT4_RULES
end
|
|
|
|
# Builds the anti-spoofing rule that drops IPv4 packets sent from the
# VM's MAC addresses with a source address the VM does not own.
#
# nics - objects responding to #net4 and #mac.
# ip4  - public IPv4 network string, or nil when the VM has none.
#
# Returns a single nftables rule as a String.
def generate_ip4_filter_rules(nics, ip4)
  # ip4 is nil for VMs without a public address (see #setup_networking);
  # compact keeps that nil from producing a dangling ", " in the set.
  ips = [*nics.map(&:net4), ip4].compact.join(", ")
  macs = nics.map(&:mac).join(", ")
  "ether saddr {#{macs}} ip saddr != {#{ips}} drop"
end
|
|
|
|
# Builds the rule that drops UDP traffic between the DHCP ports (67/68)
# leaving through the namespace's vethi interface.
#
# Returns a single nftables rule as a String.
def generate_dhcp_filter_rule
  "oifname vethi#{q_vm} udp sport { 67, 68 } udp dport { 67, 68 } drop"
end
|
|
|
|
# Builds the IPv6 anti-spoofing rule for the first NIC: packets from its
# MAC are dropped unless their source is in the guest ephemeral network,
# the NIC's private IPv6 network, or the MAC's link-local address.
#
# nic_first       - object responding to #mac and #net6.
# guest_ephemeral - guest half of the GUA; interpolated via #to_s.
#
# Returns a single nftables rule as a String.
def generate_ip6_public_filter(nic_first, guest_ephemeral)
  "ether saddr #{nic_first.mac} ip6 saddr != {#{guest_ephemeral},#{nic_first.net6},#{mac_to_ipv6_link_local(nic_first.mac)}} drop"
end
|
|
|
|
# Builds one IPv6 anti-spoofing rule per NIC: packets from the NIC's MAC
# are dropped unless their source is within the NIC's private IPv6
# network.
#
# nics - objects responding to #mac and #net6 (may be empty).
#
# Returns the rules joined by newlines; an empty String for no NICs.
def generate_ip6_private_filter_rules(nics)
  nics.map { "ether saddr #{_1.mac} ip6 saddr != #{_1.net6} drop" }.join("\n")
end
|
|
|
|
# Renders the complete nftables ruleset loaded into the VM's namespace.
#
# The ruleset holds the IPv4/IPv6 anti-spoofing and DHCP filters, the
# DNAT rule that redirects port 80 of the metadata address to port 8080,
# the optional IPv4 NAT table (see #generate_nat4_rules) and a forward
# chain with policy drop that admits SSH (tcp/22) to and from the first
# NIC's private IPv4 network.
#
# gua  - the VM's full IPv6 network, as a string.
# nics - NIC descriptions; must be non-empty because nics.first is used.
# ip4  - public IPv4 network string, or nil.
#
# Returns the configuration as a String.
def build_nftables_config(gua, nics, ip4)
  guest_ephemeral = subdivide_network(NetAddr.parse_net(gua)).first
  <<~NFTABLES_CONF
    table ip raw {
      chain prerouting {
        type filter hook prerouting priority raw; policy accept;
        # allow dhcp
        udp sport 68 udp dport 67 accept
        udp sport 67 udp dport 68 accept

        # avoid ip4 spoofing
        #{generate_ip4_filter_rules(nics, ip4)}
      }
      chain postrouting {
        type filter hook postrouting priority raw; policy accept;
        # avoid dhcp ports to be used for spoofing
        #{generate_dhcp_filter_rule}
      }
    }
    table ip6 raw {
      chain prerouting {
        type filter hook prerouting priority raw; policy accept;
        # avoid ip6 spoofing
        #{generate_ip6_public_filter(nics.first, guest_ephemeral)}
        #{generate_ip6_private_filter_rules(nics[1..])}
      }
    }

    table ip6 nat_metadata_endpoint {
      chain prerouting {
        type nat hook prerouting priority dstnat; policy accept;
        ip6 daddr FD00:0B1C:100D:5AFE:CE:: tcp dport 80 dnat to [FD00:0B1C:100D:5AFE:CE::]:8080
      }
    }

    # NAT4 rules
    #{generate_nat4_rules(ip4, nics.first.net4)}
    table inet fw_table {
      chain forward_ingress {
        type filter hook forward priority filter; policy drop;
        ip saddr 0.0.0.0/0 tcp dport 22 ip daddr #{nics.first.net4} ct state established,related,new counter accept
        ip saddr #{nics.first.net4} tcp sport 22 ct state established,related counter accept
      }
    }
  NFTABLES_CONF
end
|
|
|
|
# Replaces the ruleset inside the VM's namespace: flushes whatever is
# loaded, then loads the VM's nftables configuration file.
def apply_nftables
  r "ip netns exec #{q_vm} nft flush ruleset"
  r "ip netns exec #{q_vm} nft -f #{vp.q_nftables_conf}"
end
|
|
|
|
# Writes the VM's cloud-init inputs and dnsmasq configuration, then
# packs the cloud-init files into a FAT image labelled CIDATA.
#
# unix_user       - forwarded to #write_user_data.
# public_keys     - forwarded to #write_user_data.
# gua             - the VM's full IPv6 network, as a string; its guest
#                   half supplies the public DHCPv6 address.
# nics            - objects responding to #tap, #mac, #net4 and #net6.
# swap_size_bytes - forwarded to #write_user_data.
# boot_image      - image name; names containing "github" get extra
#                   dnsmasq address overrides.
# dns_ipv4        - address dnsmasq listens on and advertises over DHCP.
def cloudinit(unix_user, public_keys, gua, nics, swap_size_bytes, boot_image, dns_ipv4)
  vp.write_meta_data(<<EOS)
instance-id: #{yq(@vm_name)}
local-hostname: #{yq(@vm_name)}
EOS

  guest_network = subdivide_network(NetAddr.parse_net(gua)).first
  # Each NIC gets a single-address DHCP range per address family, so the
  # guest is always handed the same addresses.
  private_ip_dhcp = nics.map do |nic|
    vm_sub_6 = NetAddr::IPv6Net.parse(nic.net6)
    vm_net4 = NetAddr::IPv4Net.parse(nic.net4)
    vm_sub_4 = (vm_net4.netmask.prefix_len == 32) ? vm_net4.nth(0) : vm_net4.nth(1)
    <<DHCP
dhcp-range=#{nic.tap},#{vm_sub_4},#{vm_sub_4},6h
dhcp-range=#{nic.tap},#{vm_sub_6.nth(2)},#{vm_sub_6.nth(2)},#{vm_sub_6.netmask.prefix_len}
DHCP
  end.join("\n")

  raparams = nics.map { "ra-param=#{_1.tap}" }.join("\n")
  interfaces = nics.map { "interface=#{_1.tap}" }.join("\n")
  dnsmasq_address_ip6 = NetAddr::IPv6.parse("fd00:0b1c:100d:53::")
  runner_config = if boot_image.include?("github")
    <<~ADDRESSES
      address=/ubicloudhostplaceholder.blob.core.windows.net/#{nics.first.net4.split("/").first}
      address=/.docker.io/::
    ADDRESSES
  else
    ""
  end
  vp.write_dnsmasq_conf(<<DNSMASQ_CONF)
pid-file=
leasefile-ro
enable-ra
dhcp-authoritative
domain-needed
bogus-priv
no-resolv
#{raparams}
#{interfaces}
dhcp-range=#{guest_network.nth(2)},#{guest_network.nth(2)},#{guest_network.netmask.prefix_len}
#{private_ip_dhcp}
server=2606:4700:4700::1111
server=2001:4860:4860::8888
dhcp-option=6,#{dns_ipv4}
listen-address=#{dns_ipv4}
dhcp-option=26,1400
bind-interfaces
#{runner_config}
dhcp-option=54,#{dns_ipv4}
dns-forward-max=10000
dhcp-option=option6:dns-server,#{dnsmasq_address_ip6}
listen-address=#{dnsmasq_address_ip6}
all-servers
DNSMASQ_CONF

  # One entry per NIC, matched by MAC address. The nesting matters: each
  # entry is a child of "ethernets" and "match" holds "macaddress".
  ethernets = nics.map do |nic|
    <<ETHERNETS
  #{yq("enx" + nic.mac.tr(":", "").downcase)}:
    match:
      macaddress: "#{nic.mac}"
    dhcp6: true
    dhcp4: true
ETHERNETS
  end.join("\n")

  vp.write_network_config(<<EOS)
version: 2
ethernets:
#{ethernets}
EOS

  write_user_data(unix_user, public_keys, swap_size_bytes, boot_image)

  # Rebuild the image from scratch so stale files never survive.
  FileUtils.rm_rf(vp.cloudinit_img)
  r "mkdosfs -n CIDATA -C #{vp.q_cloudinit_img} 8192"
  r "mcopy -oi #{vp.q_cloudinit_img} -s #{vp.q_user_data} ::"
  r "mcopy -oi #{vp.q_cloudinit_img} -s #{vp.q_meta_data} ::"
  r "mcopy -oi #{vp.q_cloudinit_img} -s #{vp.q_network_config} ::"
  FileUtils.chown @vm_name, @vm_name, vp.cloudinit_img
end
|
|
|
|
# Builds the cloud-config "swap" section.
#
# swap_size_bytes - swap file size in bytes (Integer), or nil/false for
#                   no swap.
#
# Returns the YAML fragment as a String, or nil when no swap is wanted.
# Raises RuntimeError when swap_size_bytes is set but not an Integer.
def generate_swap_config(swap_size_bytes)
  return unless swap_size_bytes
  fail "BUG: swap_size_bytes must be an integer" unless swap_size_bytes.instance_of?(Integer)

  # "filename" and "size" must be nested under "swap" to be read as its
  # settings rather than as unrelated top-level keys.
  <<~SWAP_CONFIG
    swap:
      filename: /swapfile
      size: #{yq(swap_size_bytes)}
  SWAP_CONFIG
end
|
|
|
|
# Writes the cloud-init user-data (cloud-config) document through VmPath.
#
# unix_user       - login name of the user to create (passwordless sudo,
#                   bash shell).
# public_keys     - SSH public keys authorized for that user.
# swap_size_bytes - forwarded to #generate_swap_config.
# boot_image      - image name; "almalinux" and "debian" images get a
#                   runcmd entry that installs nftables.
def write_user_data(unix_user, public_keys, swap_size_bytes, boot_image)
  # Every fragment below is a list item nested under a key of the main
  # document, so it carries the same two-space indent as its siblings.
  install_cmd = if boot_image.include?("almalinux")
    "  - [dnf, install, '-y', nftables]\n"
  elsif boot_image.include?("debian")
    <<YAML
  - [apt-get, update]
  - [apt-get, install, -y, nftables]
YAML
  else
    ""
  end
  # Rejects new TCP connections to fd00:0b1c:100d:5AFE::/64 from processes
  # whose uid is not 0.
  nft_safe_sudo_allow = <<NFT_ADD_COMMS
  - [nft, add, table, ip6, filter]
  - [nft, add, chain, ip6, filter, output, "{", type, filter, hook, output, priority, 0, ";", "}"]
  - [nft, add, rule, ip6, filter, output, ip6, daddr, 'fd00:0b1c:100d:5AFE::/64', meta, skuid, "!=", 0, tcp, flags, syn, reject, with, tcp, reset]
NFT_ADD_COMMS

  vp.write_user_data(<<EOS)
#cloud-config
users:
  - name: #{yq(unix_user)}
    sudo: ALL=(ALL) NOPASSWD:ALL
    shell: /bin/bash
    ssh_authorized_keys:
#{public_keys.map { "      - #{yq(_1)}" }.join("\n")}

ssh_pwauth: False

runcmd:
  - [systemctl, daemon-reload]
#{install_cmd}

bootcmd:
#{nft_safe_sudo_allow}

#{generate_swap_config(swap_size_bytes)}
EOS
end
|
|
|
|
# Starts (and optionally prepares) every writable storage volume.
#
# storage_params  - array of per-volume parameter hashes; entries with a
#                   truthy "read_only" are skipped.
# storage_secrets - hash mapping a volume's "device_id" to its key
#                   wrapping secrets.
# prep            - when truthy, each volume is prepared before it is
#                   started.
#
# Returns the array of StorageVolume#start results.
def storage(storage_params, storage_secrets, prep)
  storage_params.reject { _1["read_only"] }.map { |params|
    device_id = params["device_id"]
    key_wrapping_secrets = storage_secrets[device_id]
    storage_volume = StorageVolume.new(@vm_name, params)
    storage_volume.prep(key_wrapping_secrets) if prep
    storage_volume.start(key_wrapping_secrets)
  }
end
|
|
|
|
# Enables IPv6 and IPv4 forwarding inside the VM's namespace.
#
# Unnecessary if host has this set before creating the netns, but
# harmless and fast enough to double up.
def forwarding
  r("ip netns exec #{q_vm} sysctl -w net.ipv6.conf.all.forwarding=1")
  r("ip netns exec #{q_vm} sysctl -w net.ipv4.conf.all.forwarding=1")
  r("ip netns exec #{q_vm} sysctl -w net.ipv4.ip_forward=1")
end
|
|
|
|
# Resets passthrough PCI devices and hands their IOMMU group to the VM's
# user.
#
# pci_devices - array of two-element entries: the device address without
#               the "0000:" domain prefix, and the IOMMU group number.
#               Only entries whose address ends in ".0" are processed.
def prepare_pci_devices(pci_devices)
  pci_devices.select { _1[0].end_with? ".0" }.each do |pci_dev|
    r("echo 1 > /sys/bus/pci/devices/0000:#{pci_dev[0]}/reset")
    r("chown #{@vm_name}:#{@vm_name} /sys/kernel/iommu_groups/#{pci_dev[1]} /dev/vfio/#{pci_dev[1]}")
  end
end
|
|
|
|
# Writes the systemd units for the VM and for its dnsmasq instance, then
# reloads systemd.
#
# max_vcpus         - number of boot vCPUs; also sizes the NIC queues.
# cpu_topology      - value for cloud-hypervisor's "topology=" setting.
# mem_gib           - guest memory in GiB.
# storage_params    - array of per-volume parameter hashes.
# nics              - objects responding to #tap and #mac.
# pci_devices       - passthrough devices (entry[0] is the PCI address);
#                     a non-empty list also sets LimitMEMLOCK to the
#                     guest memory size in bytes.
# slice_name        - systemd slice both units are placed in.
# cpu_percent_limit - CPUQuota percentage; 0 means no quota line.
#
# Raises RuntimeError ("BUG") when the cpu setting contains quotes or
# whitespace.
def install_systemd_unit(max_vcpus, cpu_topology, mem_gib, storage_params, nics, pci_devices, slice_name, cpu_percent_limit)
  cpu_setting = "boot=#{max_vcpus},topology=#{cpu_topology}"

  tapnames = nics.map { "-i #{_1.tap}" }.join(" ")

  vp.write_dnsmasq_service <<DNSMASQ_SERVICE
[Unit]
Description=A lightweight DHCP and caching DNS server
After=network.target

[Service]
Slice=#{slice_name}
NetworkNamespacePath=/var/run/netns/#{@vm_name}
Type=simple
ExecStartPre=/usr/local/sbin/dnsmasq --test
ExecStart=/usr/local/sbin/dnsmasq -k -h -C /vm/#{@vm_name}/dnsmasq.conf --log-debug #{tapnames} --user=#{@vm_name} --group=#{@vm_name}
ExecReload=/bin/kill -HUP $MAINPID
ProtectSystem=strict
PrivateDevices=yes
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectControlGroups=yes
ProtectHome=yes
NoNewPrivileges=yes
ReadOnlyPaths=/
DNSMASQ_SERVICE

  storage_volumes = storage_params.map { |params| StorageVolume.new(@vm_name, params) }

  # Read-only volumes are attached as plain image files; writable ones go
  # through their vhost-user socket. The cloud-init image comes last.
  disk_params = storage_volumes.map { |volume|
    if volume.read_only
      "path=#{volume.image_path},readonly=on"
    else
      "vhost_user=true,socket=#{volume.vhost_sock},num_queues=1,queue_size=256"
    end
  }
  disk_params << "path=#{vp.cloudinit_img}"

  # From version 36 on, a single --disk flag followed by all disk
  # specifications is emitted; older versions get one --disk per disk.
  disk_args = if Gem::Version.new(@ch_version.version) >= Gem::Version.new("36")
    "--disk #{disk_params.join(" ")}"
  else
    disk_params.map { |x| "--disk #{x}" }.join(" ")
  end

  spdk_services = storage_volumes.map { |volume| volume.spdk_service }.uniq
  spdk_after = spdk_services.map { |s| "After=#{s}" }.join("\n")
  spdk_requires = spdk_services.map { |s| "Requires=#{s}" }.join("\n")

  net_params = nics.map { "--net mac=#{_1.mac},tap=#{_1.tap},ip=,mask=,num_queues=#{max_vcpus * 2 + 1}" }
  pci_device_params = pci_devices.map { " --device path=/sys/bus/pci/devices/0000:#{_1[0]}/" }.join
  limit_memlock = pci_devices.empty? ? "" : "LimitMEMLOCK=#{mem_gib * 1073741824}"
  cpu_quota = (cpu_percent_limit == 0) ? "" : "CPUQuota=#{cpu_percent_limit}%"

  # YYY: Do something about systemd escaping, i.e. research the
  # rules and write a routine for it. Banning suspicious strings
  # from VmPath is also a good idea.
  fail "BUG" if /["'\s]/.match?(cpu_setting)
  # A trailing backslash inside this heredoc joins the line with the next
  # one, so the ExecStart flags end up on a single line; only the
  # "\\\n" separator between --net arguments emits a literal
  # backslash-newline into the unit file.
  vp.write_systemd_service <<SERVICE
[Unit]
Description=#{@vm_name}
After=network.target
#{spdk_after}
After=#{@vm_name}-dnsmasq.service
#{spdk_requires}
Wants=#{@vm_name}-dnsmasq.service

[Service]
Slice=#{slice_name}
NetworkNamespacePath=/var/run/netns/#{@vm_name}
ExecStartPre=/usr/bin/rm -f #{vp.ch_api_sock}

ExecStart=#{@ch_version.bin} -v \
--api-socket path=#{vp.ch_api_sock} \
--kernel #{@firmware_version.path} \
#{disk_args} \
--console off --serial file=#{vp.serial_log} \
--cpus #{cpu_setting} \
--memory size=#{mem_gib}G,#{@hugepages ? "hugepages=on,hugepage_size=1G" : "shared=on"} \
#{pci_device_params} \
#{net_params.join(" \\\n")}

ExecStop=#{@ch_version.ch_remote_bin} --api-socket #{vp.ch_api_sock} shutdown-vmm
Restart=no
User=#{@vm_name}
Group=#{@vm_name}

LimitNOFILE=500000
#{limit_memlock}
#{cpu_quota}
SERVICE
  r "systemctl daemon-reload"
end
|
|
|
|
# Sets the CPU burst allowance of the VM's service cgroup.
#
# slice_name              - cgroup slice directory under /sys/fs/cgroup.
# cpu_burst_percent_limit - burst allowance in percent.
#
# Writes the value to <slice>/<vm>.service/cpu.max.burst.
def enable_bursting(slice_name, cpu_burst_percent_limit)
  # Convert cpu_burst_percent limit to a usec value. This is the additional quota every
  # 100,000 usec that the VM is occasionally allowed to burst to
  # (1% of 100,000 usec is 1,000 usec).
  cpu_burst_limit = cpu_burst_percent_limit * 1000

  cpu_max_burst_path = File.join("/sys/fs/cgroup", slice_name, "#{@vm_name}.service", "cpu.max.burst")

  File.write(cpu_max_burst_path, cpu_burst_limit.to_s)
end
|
|
|
|
# Starts the VM's systemd unit (named after the VM).
def start_systemd_unit
  r "systemctl start #{q_vm}"
end
|
|
|
|
# Restarts the VM's systemd unit (named after the VM).
def restart_systemd_unit
  r "systemctl restart #{q_vm}"
end
|
|
end
|