This was originally added in 1c75bfdb8e
(June 2023), and when originally added, it used HetznerHost, and it worked correctly. In b48756e163,
it was changed from using HetznerHost to using HostProvider, and at this point it broke, because host_provider.{server_identifier,provider_name} are primary key columns, which cannot be set via mass assignment by default. It would be possible to fix this issue, but considering it has been broken for over 6 months and is not used anywhere, it seems better to remove it. The only reason this wasn't discovered sooner is that the VmHost model specs are mostly mocked (yet another bug hidden by mocking).
467 lines
19 KiB
Ruby
467 lines
19 KiB
Ruby
# frozen_string_literal: true
|
||
|
||
require "shellwords"
|
||
require_relative "../model"
|
||
require_relative "../lib/hosting/apis"
|
||
require_relative "../lib/system_parser"
|
||
|
||
class VmHost < Sequel::Model
|
||
one_to_one :strand, key: :id
|
||
one_to_one :sshable, key: :id
|
||
one_to_many :vms
|
||
one_to_many :assigned_subnets, key: :routed_to_host_id, class: :Address
|
||
one_to_one :provider, key: :id, class: :HostProvider
|
||
one_to_many :assigned_host_addresses, key: :host_id, class: :AssignedHostAddress
|
||
one_to_many :spdk_installations, key: :vm_host_id
|
||
one_to_many :vhost_block_backends
|
||
one_to_many :storage_devices, key: :vm_host_id
|
||
one_to_many :pci_devices, key: :vm_host_id
|
||
one_to_many :boot_images, key: :vm_host_id
|
||
one_to_many :slices, class: :VmHostSlice, key: :vm_host_id
|
||
one_to_many :cpus, class: :VmHostCpu, key: :vm_host_id
|
||
many_to_one :location, key: :location_id, class: :Location
|
||
|
||
many_to_many :assigned_vm_addresses, join_table: :address, left_key: :routed_to_host_id, right_key: :id, right_primary_key: :address_id, read_only: true
|
||
|
||
plugin :association_dependencies, assigned_host_addresses: :destroy, assigned_subnets: :destroy, provider: :destroy, spdk_installations: :destroy, storage_devices: :destroy, pci_devices: :destroy, boot_images: :destroy, slices: :destroy, cpus: :destroy, vhost_block_backends: :destroy
|
||
|
||
plugin ResourceMethods
|
||
plugin SemaphoreMethods, :checkup, :reboot, :hardware_reset, :destroy, :graceful_reboot, :configure_metrics
|
||
include HealthMonitorMethods
|
||
include MetricsTargetMethods
|
||
|
||
def host_prefix
|
||
net6.netmask.prefix_len
|
||
end
|
||
|
||
def provider_name
|
||
provider&.provider_name
|
||
end
|
||
|
||
# Compute the IPv6 Subnet that can be used to address the host
|
||
# itself, and should not be delegated to any VMs.
|
||
#
|
||
# The default prefix length is 79, so that customers can be given a
|
||
# /80 for their own exclusive use, and paired is the adjacent /80
|
||
# for Clover's use on behalf of that VM. This leaves 15 bits of
|
||
# entropy relative to the customary /64 allocated to a real device.
|
||
#
|
||
# Offering a /80 to the VM renders nicely in the IPv6 format, as
|
||
# it's 5 * 16, and each delimited part of IPv6 is 16 bits.
|
||
#
|
||
# A /80 is the longest prefix that is divisible by 16 and contains
|
||
# multiple /96 subnets within it. /96 is of special significance
|
||
# because it contains enough space within it to hold the IPv4
|
||
# address space, i.e. leaving the door open for schemes relying on
|
||
# SIIT translation: https://datatracker.ietf.org/doc/html/rfc7915
|
||
def ip6_reserved_network(prefix = 79)
|
||
# Interpret a nil ip6 address and a not-nil network as there being
|
||
# no network reserved for the host in net6.
|
||
#
|
||
# The reason: some vendors prefer to delegate a network that
|
||
# bypasses neighbor discovery protocol (NDP) that is different
|
||
# than the network customarily assigned to the physical network
|
||
# interface.
|
||
#
|
||
# And on at least one, the network interface *must* respond to
|
||
# neighbor discovery for that customary network/IP in order to be
|
||
# routed the delegated /64. The two networks have no overlap with
|
||
# one another. So alas, our ability to auto-configure from the
|
||
# host in such a case is limited.
|
||
return nil if ip6.nil? && !net6.nil?
|
||
fail "BUG: host prefix must be is shorter than reserved prefix" unless host_prefix < prefix
|
||
|
||
NetAddr::IPv6Net.new(ip6, NetAddr::Mask128.new(prefix))
|
||
end
|
||
|
||
# Generate a random network that is a slice of the host's network
|
||
# for delegation to a VM.
|
||
def ip6_random_vm_network
|
||
prefix = host_prefix + 15
|
||
# We generate 2 bytes of entropy for the lower bits
|
||
# and append them to the host's network. This way,
|
||
# the first X bits are the same as the host's network.
|
||
# And the next 16 bits are random. To achieve that, we shift
|
||
# the random number to the left by (128 - prefix - 1) bits.
|
||
# This way, the random number is placed in the correct position.
|
||
# With a simple example:
|
||
# Let's say we have a /64 network: 0:0:0:0::/64
|
||
# This can be expanded to: 0:0:0:0:0:0:0:0/64
|
||
# With even further expansion: 0000:0000:0000:0000:0000:0000:0000:0000/64
|
||
# Now, we can generate a random number between 2 and 2**16 (65536). This
|
||
# will be our next 2 bytes. Let's say the random number is 5001. In base 16,
|
||
# this is 0x1389.
|
||
# Now, if we shift it by 48 bits (3 octets) as it is in ipv6 addresses:
|
||
# 1389:0:0:0
|
||
# now if we OR it with the host's network:
|
||
# 0:0:0:0:0:0:0:0 | 0:0:0:0:1389:0:0:0 = 0:0:0:0:1389:0:0:0/80
|
||
# We are not done yet, if you realized, we are masking it with 79 not 80.
|
||
# Because this /79 is later split into 2 /80s for internal and external use.
|
||
# Therefore, the network we return is:
|
||
# 2a01:4f9:2b:35a:1388:0:0:0/79
|
||
# and the two /80 networks are:
|
||
# 2a01:4f9:2b:35a:1388:0:0:0/80 and 2a01:4f9:2b:35a:1389:0:0:0/80
|
||
lower_bits = SecureRandom.random_number(2...2**16) << (128 - prefix - 1)
|
||
|
||
# Combine it with the higher bits for the host.
|
||
proposal = NetAddr::IPv6Net.new(
|
||
NetAddr::IPv6.new(net6.network.addr | lower_bits), NetAddr::Mask128.new(prefix)
|
||
)
|
||
|
||
# :nocov:
|
||
fail "BUG: host should be supernet of randomized subnet" unless net6.rel(proposal) == 1
|
||
# :nocov:
|
||
|
||
case (rn = ip6_reserved_network(prefix)) && proposal.network.cmp(rn.network)
|
||
when 0
|
||
# Guard against choosing the host-reserved network for a guest
|
||
# and try again. Recursion is used here because it's a likely
|
||
# code path, and if there's a bug, it's better to stack overflow
|
||
# rather than loop forever.
|
||
ip6_random_vm_network
|
||
else
|
||
proposal
|
||
end
|
||
end
|
||
|
||
def ip4_random_vm_network
|
||
res = DB[:ipv4_address]
|
||
.join(:address, [:cidr])
|
||
.where(cidr: assigned_subnets_dataset.select(:cidr))
|
||
.exclude(assigned_vm_addresses_dataset.where(ip: Sequel[:ipv4_address][:ip]).select(1).exists)
|
||
.order { random.function }
|
||
.first
|
||
|
||
res ? [res.delete(:ip), Address.call(res)] : [nil, nil]
|
||
end
|
||
|
||
# IPv4 range to use for setting local_vetho_ip for VM in hosts.
|
||
APIPA_RANGE = NetAddr::IPv4Net.parse("169.254.0.0/16").freeze
|
||
|
||
# Calculate number of usable addresses in above range. However, substract 1024.
|
||
# If the random assignment conflict with an already used address, we increment
|
||
# instead of looking for a different random assignment. Doing so can have problems
|
||
# when you are near the top of the range, so carve out 1024 spots at the top of
|
||
# the range. This should work correctly as long as a host does not run more than
|
||
# 1024 VMs.
|
||
#
|
||
# we get 1 address here and use the next address to assign
|
||
# route for vetho* and vethi* devices. So, we are splitting the local address
|
||
# space to two but only store 1 of them for the existence check.
|
||
# that's why the range is 2 * ((addr.len - 2) / 2)
|
||
APIPA_LEN = ((APIPA_RANGE.len - 2) / 2) - 1024
|
||
|
||
APIPA_MASK = NetAddr::Mask32.new(32).freeze
|
||
|
||
def veth_pair_random_ip4_addr
|
||
ips = vms_dataset.select_map(:local_vetho_ip)
|
||
ip = APIPA_RANGE.nth(2 * SecureRandom.random_number(APIPA_LEN))
|
||
ip = ip.next.next while ips.include?(ip.to_s)
|
||
NetAddr::IPv4Net.new(ip, APIPA_MASK)
|
||
end
|
||
|
||
def sshable_address
|
||
assigned_host_addresses_dataset.first { {family(ip) => 4} }
|
||
end
|
||
|
||
def spdk_cpu_count
|
||
if total_cpus <= 64
|
||
2
|
||
else
|
||
4
|
||
end
|
||
end
|
||
|
||
def create_addresses(ip_records: nil)
|
||
ip_records ||= Hosting::Apis.pull_ips(self)
|
||
return if ip_records.nil? || ip_records.empty?
|
||
|
||
DB.transaction do
|
||
ip_records.each do |ip_record|
|
||
ip_addr = ip_record.ip_address
|
||
source_host_ip = ip_record.source_host_ip
|
||
is_failover_ip = ip_record.is_failover
|
||
|
||
next if assigned_subnets.any? { |a| a.cidr.to_s == ip_addr }
|
||
|
||
# we need to find if it was previously created
|
||
# if it was, we need to update the routed_to_host_id but only if there is no VM that's using it
|
||
# if it wasn't, we need to create it
|
||
adr = Address.where(cidr: ip_addr).first
|
||
if adr && is_failover_ip
|
||
if adr.assigned_vm_addresses.count > 0
|
||
fail "BUG: failover ip #{ip_addr} is already assigned to a vm"
|
||
end
|
||
|
||
adr.update(routed_to_host_id: id)
|
||
else
|
||
if Sshable.where(host: source_host_ip).count == 0
|
||
fail "BUG: source host #{source_host_ip} isn't added to the database"
|
||
end
|
||
adr = Address.create(cidr: ip_addr, routed_to_host_id: id, is_failover_ip: is_failover_ip)
|
||
end
|
||
|
||
unless is_failover_ip
|
||
AssignedHostAddress.create(host_id: id, ip: ip_addr, address_id: adr.id)
|
||
end
|
||
end
|
||
end
|
||
|
||
Strand.create(prog: "SetupNftables", label: "start", stack: [{subject_id: id}])
|
||
end
|
||
|
||
# Operational Functions
|
||
|
||
# Introduced for refreshing rhizome programs via REPL.
|
||
def install_rhizome(install_specs: false)
|
||
Strand.create(prog: "InstallRhizome", label: "start", stack: [{subject_id: id, target_folder: "host", install_specs: install_specs}])
|
||
end
|
||
|
||
# Introduced for downloading a new boot image via REPL.
|
||
def download_boot_image(image_name, version:, custom_url: nil)
|
||
Strand.create(prog: "DownloadBootImage", label: "start", stack: [{subject_id: id, image_name: image_name, custom_url: custom_url, version: version}])
|
||
end
|
||
|
||
# Introduced for downloading firmware via REPL.
|
||
def download_firmware(version_x64: nil, version_arm64: nil, sha256_x64: nil, sha256_arm64: nil)
|
||
version, sha256 = (arch == "x64") ? [version_x64, sha256_x64] : [version_arm64, sha256_arm64]
|
||
fail ArgumentError, "No version provided" if version.nil?
|
||
fail ArgumentError, "No SHA-256 digest provided" if sha256.nil?
|
||
Strand.create(prog: "DownloadFirmware", label: "start", stack: [{subject_id: id, version: version, sha256: sha256}])
|
||
end
|
||
|
||
# Introduced for downloading cloud hypervisor via REPL.
|
||
def download_cloud_hypervisor(version_x64: nil, version_arm64: nil, sha256_ch_bin_x64: nil, sha256_ch_bin_arm64: nil, sha256_ch_remote_x64: nil, sha256_ch_remote_arm64: nil)
|
||
version, sha256_ch_bin, sha256_ch_remote = if arch == "x64"
|
||
[version_x64, sha256_ch_bin_x64, sha256_ch_remote_x64]
|
||
elsif arch == "arm64"
|
||
[version_arm64, sha256_ch_bin_arm64, sha256_ch_remote_arm64]
|
||
else
|
||
fail "BUG: unexpected architecture"
|
||
end
|
||
fail ArgumentError, "No version provided" if version.nil?
|
||
Strand.create(prog: "DownloadCloudHypervisor", label: "start", stack: [{subject_id: id, version: version, sha256_ch_bin: sha256_ch_bin, sha256_ch_remote: sha256_ch_remote}])
|
||
end
|
||
|
||
def set_data_center
|
||
update(data_center: Hosting::Apis.pull_data_center(self))
|
||
end
|
||
|
||
def allow_slices
|
||
update(accepts_slices: true)
|
||
end
|
||
|
||
def disallow_slices
|
||
update(accepts_slices: false)
|
||
end
|
||
|
||
def set_server_name
|
||
Hosting::Apis.set_server_name(self)
|
||
end
|
||
|
||
def reimage
|
||
unless Config.development?
|
||
fail "BUG: reimage is only allowed in development"
|
||
end
|
||
|
||
Hosting::Apis.reimage_server(self)
|
||
end
|
||
|
||
# Cuts power to a Server and starts it again. This forcefully stops it
|
||
# without giving the Server operating system time to gracefully stop. This
|
||
# may lead to data loss, it’s equivalent to pulling the power cord and
|
||
# plugging it in again. Reset should only be used when reboot does not work.
|
||
def hardware_reset
|
||
Hosting::Apis.hardware_reset_server(self)
|
||
end
|
||
|
||
def check_storage_smartctl(ssh_session, devices)
|
||
devices.map do |device_name|
|
||
command = "sudo smartctl -j -H /dev/#{device_name}"
|
||
command << " -d scsi" if device_name.start_with?("sd")
|
||
command << " | jq .smart_status.passed"
|
||
passed = ssh_session.exec!(command).strip == "true"
|
||
Clog.emit("Device #{device_name} failed smartctl check on VmHost #{ubid}") unless passed
|
||
passed
|
||
end.all?(true)
|
||
end
|
||
|
||
def check_storage_nvme(ssh_session, devices)
|
||
devices.reject { |device_name| !device_name.start_with?("nvme") }.map do |device_name|
|
||
passed = ssh_session.exec!("sudo nvme smart-log /dev/#{device_name} | grep \"critical_warning\" | awk '{print $3}'").strip == "0"
|
||
Clog.emit("Device #{device_name} failed nvme smart-log check on VmHost #{ubid}") unless passed
|
||
passed
|
||
end.all?(true)
|
||
end
|
||
|
||
def check_storage_read_write(ssh_session, devices)
|
||
lsblk_json_info = ssh_session.exec!("lsblk --json")
|
||
devices_with_mount_points = devices.map { |device| SystemParser.get_device_mount_points_from_lsblk_json(lsblk_json_info, device) }
|
||
|
||
all_mount_points = []
|
||
devices_with_mount_points.each do |device_mount_points|
|
||
device_mount_points.each_value do |mount_points|
|
||
all_mount_points.concat(mount_points)
|
||
end
|
||
end
|
||
|
||
all_mount_points.uniq.all? do |mount_point|
|
||
file_name = Shellwords.escape(File.join(mount_point, "test-file"))
|
||
|
||
write_result = ssh_session.exec!("sudo bash -c \"head -c 1M </dev/zero > #{file_name}\"")
|
||
write_status = write_result.exitstatus == 0
|
||
hash_result = ssh_session.exec!("sha256sum #{file_name}")
|
||
hash_status = hash_result.strip == "30e14955ebf1352266dc2ff8067e68104607e750abb9d3b36582b8af909fcb58 #{file_name}"
|
||
delete_result = ssh_session.exec!("sudo rm #{file_name}")
|
||
delete_status = delete_result.exitstatus == 0
|
||
|
||
unless write_status && hash_status && delete_status
|
||
failure_reasons = []
|
||
failure_reasons << "Write failed (exitstatus=#{write_result.exitstatus}, output=#{write_result.strip})" unless write_status
|
||
failure_reasons << "Hash check failed (expected hash mismatch, output=#{hash_result.strip})" unless hash_status
|
||
failure_reasons << "Delete failed (exitstatus=#{delete_result.exitstatus}, output=#{delete_result.strip})" unless delete_status
|
||
|
||
Clog.emit("Failed to perform write/read/delete on mountpoint #{mount_point} on VmHost #{ubid}: #{failure_reasons.join("; ")}")
|
||
end
|
||
|
||
write_status && hash_status && delete_status
|
||
end
|
||
end
|
||
|
||
def check_storage_kernel_logs(ssh_session, devices)
|
||
kernel_logs = ssh_session.exec!("journalctl -kS -1min --no-pager")
|
||
return false unless kernel_logs.exitstatus == 0
|
||
|
||
error_count = kernel_logs.scan(/Buffer I\/O error on dev (\w+)/).tally
|
||
Clog.emit("found error on kernel logs. devices with error_count: #{error_count} on VmHost #{ubid}") unless error_count.empty?
|
||
error_count.empty?
|
||
end
|
||
|
||
def init_health_monitor_session
|
||
{
|
||
ssh_session: sshable.start_fresh_session
|
||
}
|
||
end
|
||
|
||
def init_metrics_export_session
|
||
{
|
||
ssh_session: sshable.start_fresh_session
|
||
}
|
||
end
|
||
|
||
def disk_device_ids
|
||
# we use this next line to migrate data from the old formatting (storing device names) to the new (storing id) so we trigger the convert
|
||
# whenever an element inside unix_device_list is not a SSD or NVMe id.
|
||
# SSD and NVMe ids start with wwn or nvme-eui respectively.
|
||
# YYY: This next line can be removed in the future after the first run of the code.
|
||
storage_devices.each { |sd| sd.migrate_device_name_to_device_id if sd.unix_device_list.any? { |device_name| device_name !~ /\A(wwn|nvme-eui)/i } }
|
||
storage_devices.flat_map { |sd| sd.unix_device_list }
|
||
end
|
||
|
||
def disk_device_names(ssh_session)
|
||
disk_device_ids.map { |id| ssh_session.exec!("readlink -f /dev/disk/by-id/#{id}").delete_prefix("/dev/").strip }
|
||
end
|
||
|
||
def perform_health_checks(ssh_session)
|
||
device_names = disk_device_names(ssh_session)
|
||
check_storage_smartctl(ssh_session, device_names) &&
|
||
check_storage_nvme(ssh_session, device_names) &&
|
||
check_storage_read_write(ssh_session, device_names) &&
|
||
check_storage_kernel_logs(ssh_session, device_names)
|
||
end
|
||
|
||
def check_pulse(session:, previous_pulse:)
|
||
reading = begin
|
||
perform_health_checks(session[:ssh_session]) ? "up" : "down"
|
||
rescue
|
||
"down"
|
||
end
|
||
pulse = aggregate_readings(previous_pulse: previous_pulse, reading: reading)
|
||
|
||
if pulse[:reading] == "down" && pulse[:reading_rpt] > 5 && Time.now - pulse[:reading_chg] > 30 && !reload.checkup_set?
|
||
incr_checkup
|
||
end
|
||
|
||
pulse
|
||
end
|
||
|
||
def available_storage_gib
|
||
storage_devices.sum(&:available_storage_gib)
|
||
end
|
||
|
||
def total_storage_gib
|
||
storage_devices.sum(&:total_storage_gib)
|
||
end
|
||
|
||
def render_arch(arm64:, x64:)
|
||
case arch
|
||
when "arm64"
|
||
arm64
|
||
when "x64"
|
||
x64
|
||
else
|
||
fail "BUG: inexhaustive render code"
|
||
end
|
||
end
|
||
|
||
def metrics_config
|
||
{
|
||
endpoints: [
|
||
"http://localhost:9100/metrics"
|
||
],
|
||
max_file_retention: 120,
|
||
interval: "15s",
|
||
additional_labels: {ubicloud_resource_id: ubid},
|
||
metrics_dir: "/home/rhizome/host/metrics",
|
||
project_id: Config.monitoring_service_project_id
|
||
}
|
||
end
|
||
end
|
||
|
||
# Table: vm_host
|
||
# Columns:
|
||
# id | uuid | PRIMARY KEY
|
||
# allocation_state | allocation_state | NOT NULL DEFAULT 'unprepared'::allocation_state
|
||
# ip6 | inet |
|
||
# net6 | cidr |
|
||
# total_mem_gib | integer |
|
||
# total_sockets | integer |
|
||
# total_cores | integer |
|
||
# total_cpus | integer |
|
||
# used_cores | integer | NOT NULL DEFAULT 0
|
||
# ndp_needed | boolean | NOT NULL DEFAULT false
|
||
# total_hugepages_1g | integer | NOT NULL DEFAULT 0
|
||
# used_hugepages_1g | integer | NOT NULL DEFAULT 0
|
||
# last_boot_id | text |
|
||
# created_at | timestamp with time zone | NOT NULL DEFAULT now()
|
||
# data_center | text |
|
||
# arch | arch |
|
||
# total_dies | integer |
|
||
# os_version | text |
|
||
# accepts_slices | boolean | NOT NULL DEFAULT false
|
||
# location_id | uuid | NOT NULL
|
||
# family | text | NOT NULL
|
||
# Indexes:
|
||
# vm_host_pkey | PRIMARY KEY btree (id)
|
||
# vm_host_ip6_key | UNIQUE btree (ip6)
|
||
# vm_host_net6_key | UNIQUE btree (net6)
|
||
# Check constraints:
|
||
# core_allocation_limit | (used_cores <= total_cores)
|
||
# hugepages_allocation_limit | (used_hugepages_1g <= total_hugepages_1g)
|
||
# used_cores_above_zero | (used_cores >= 0)
|
||
# Foreign key constraints:
|
||
# vm_host_id_fkey | (id) REFERENCES sshable(id)
|
||
# vm_host_location_id_fkey | (location_id) REFERENCES location(id)
|
||
# Referenced By:
|
||
# address | address_routed_to_host_id_fkey | (routed_to_host_id) REFERENCES vm_host(id)
|
||
# assigned_host_address | assigned_host_address_host_id_fkey | (host_id) REFERENCES vm_host(id)
|
||
# boot_image | boot_image_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# host_provider | host_provider_id_fkey | (id) REFERENCES vm_host(id)
|
||
# pci_device | pci_device_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# spdk_installation | spdk_installation_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# storage_device | storage_device_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# vhost_block_backend | vhost_block_backend_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# vm | vm_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# vm_host_cpu | vm_host_cpu_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
||
# vm_host_slice | vm_host_slice_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|