This patch switches VM allocation from cores to vCPUs when selecting a host. Two use cases motivate the change:

- We have x64 hosts whose threads_per_core ratio is 1 (GEX44), which breaks the per-architecture assumption encoded in the VmSizes.
- We are going to introduce the Burstable family, where the relation between the number of CPUs allocated to a VM and the number of cores allocated to the slice hosting that VM may vary per VM instance, regardless of the architecture.

With this change, the number of cores is computed during allocation, based on the actual architecture of the candidate host, and then written back to the VM. When the VM is allocated in a slice, the number of cores is left as 0 on the VM; instead it is saved on the VmHostSlice, and that is what is subtracted from the host. At any point in time this invariant should hold (a sketch of checking it follows below):

vm_host.used_cores == SUM(vm_host_slice.cores) + SUM(vm.cores if vm.vm_host_slice_id.nil?)

This also makes explicit who really controls the cores: either the VmHostSlice, or a Vm running without a slice. Vms inside a slice do not control the cores and rely on the slice instead.

The special case for vcpus == 1 in cloud_hypervisor_cpu_topology is needed for Burstables, where we will have a Burstable-1 size. I wanted to include it in the review together with this patch for completeness.
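A minimal sketch of checking that invariant against the Sequel models (assuming VmHost has vms and vm_host_slices associations; the helper name is hypothetical and not part of this patch):

def verify_core_accounting(vm_host)
  # Cores claimed by slices on this host.
  slice_cores = vm_host.vm_host_slices_dataset.sum(:cores) || 0
  # Cores claimed directly by VMs running outside any slice.
  unsliced_vm_cores = vm_host.vms_dataset.where(vm_host_slice_id: nil).sum(:cores) || 0
  vm_host.used_cores == slice_cores + unsliced_vm_cores
end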
# frozen_string_literal: true

require "jwt"
require_relative "../model"

class Vm < Sequel::Model
  one_to_one :strand, key: :id
  many_to_one :vm_host
  many_to_one :project
  one_to_many :nics, key: :vm_id, class: :Nic
  many_to_many :private_subnets, join_table: :nic, left_key: :vm_id, right_key: :private_subnet_id
  one_to_one :sshable, key: :id
  one_to_one :assigned_vm_address, key: :dst_vm_id, class: :AssignedVmAddress
  one_to_many :vm_storage_volumes, key: :vm_id, order: Sequel.desc(:boot)
  one_to_many :active_billing_records, class: :BillingRecord, key: :resource_id do |ds| ds.active end
  one_to_many :pci_devices, key: :vm_id, class: :PciDevice
  one_through_one :load_balancer, left_key: :vm_id, right_key: :load_balancer_id, join_table: :load_balancers_vms
  one_to_one :load_balancers_vms, key: :vm_id, class: :LoadBalancersVms
  many_to_one :vm_host_slice

  plugin :association_dependencies, sshable: :destroy, assigned_vm_address: :destroy, vm_storage_volumes: :destroy, load_balancers_vms: :destroy

  dataset_module Pagination

  include ResourceMethods
  include SemaphoreMethods
  include HealthMonitorMethods
  semaphore :destroy, :start_after_host_reboot, :prevent_destroy, :update_firewall_rules, :checkup, :update_spdk_dependency, :waiting_for_capacity, :lb_expiry_started
  semaphore :restart, :stop

  include ObjectTag::Cleanup

  def firewalls
    private_subnets.flat_map(&:firewalls)
  end

  def display_location
    LocationNameConverter.to_display_name(location)
  end

  def path
    "/location/#{display_location}/vm/#{name}"
  end

  def ephemeral_net4
    assigned_vm_address&.ip&.network
  end

  def ip4
    assigned_vm_address&.ip
  end

  # For a /32 assignment the network address itself is the VM's address;
  # otherwise skip the network address and use the first host address.
  def private_ipv4
    (nics.first.private_ipv4.netmask.prefix_len == 32) ? nics.first.private_ipv4.network : nics.first.private_ipv4.nth(1)
  end

  def private_ipv6
    nics.first.private_ipv6.nth(2)
  end

  def runtime_token
    JWT.encode({sub: ubid, iat: Time.now.to_i}, Config.clover_runtime_token_secret, "HS256")
  end
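
  # Illustration only: the runtime token can be verified and decoded with
  # the same secret using the jwt gem, e.g.
  #   payload, _header = JWT.decode(vm.runtime_token,
  #     Config.clover_runtime_token_secret, true, {algorithm: "HS256"})
  #   payload["sub"] # => the VM's ubid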

  def display_state
    return "deleting" if destroy_set? || strand&.label == "destroy"
    return "restarting" if restart_set? || strand&.label == "restart"
    return "stopped" if stop_set? || strand&.label == "stopped"
    if waiting_for_capacity_set?
      return "no capacity available" if Time.now - created_at > 15 * 60
      return "waiting for capacity"
    end
    super
  end

  # cloud-hypervisor takes topology information in this format:
  #
  # topology=<threads_per_core>:<cores_per_die>:<dies_per_package>:<packages>
  #
  # And the result of the multiplication must equal the thread/vcpu count
  # we wish to allocate:
  #
  # let total = t.threads_per_core * t.cores_per_die * t.dies_per_package * t.packages;
  # if total != self.cpus.max_vcpus {
  #     return Err(ValidationError::CpuTopologyCount);
  # }
  CloudHypervisorCpuTopo = Struct.new(:threads_per_core, :cores_per_die, :dies_per_package, :packages) do
    def to_s
      to_a.map(&:to_s).join(":")
    end

    def max_vcpus
      @max_vcpus ||= to_a.reduce(&:*)
    end
  end
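
  # A worked example under a hypothetical host shape: with total_cpus = 96,
  # total_cores = 48, total_dies = 2, total_sockets = 2, allocating a
  # vcpus = 6 VM computes
  #   threads_per_core       = 96 / 48                # => 2
  #   total_dies_per_package = 2 / 2                  # => 1
  #   cores_from_cpus        = Rational(6) / 2        # => 3
  #   proportion             = Rational(3, 48)        # => 1/16
  #   packages               = (2 * proportion).ceil  # => 1
  #   dies_per_package       = (1 * proportion).ceil  # => 1
  #   cores_per_die          = 3 / (1 * 1)            # => 3
  # yielding CloudHypervisorCpuTopo.new(2, 3, 1, 1), i.e. "2:3:1:1", whose
  # product equals the 6 requested vcpus. A vcpus = 1 VM on the same host
  # takes the special case below and yields "1:1:1:1".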
  def cloud_hypervisor_cpu_topology
    threads_per_core, r = vm_host.total_cpus.divmod vm_host.total_cores
    fail "BUG" unless r.zero?

    total_dies_per_package, r = vm_host.total_dies.divmod vm_host.total_sockets
    fail "BUG" unless r.zero?

    total_packages = vm_host.total_sockets

    # Computed all-system statistics, now scale them down to meet the VM's needs.
    if vcpus == 1 && threads_per_core > 1
      # Special case for single-vcpu VMs (e.g. the planned Burstable-1 size):
      # run the VM's one thread on a dedicated core.
      cores_from_cpus = Rational(vcpus)
      threads_per_core = 1
    else
      cores_from_cpus = Rational(vcpus) / threads_per_core
    end
    proportion = cores_from_cpus / vm_host.total_cores
    packages = (total_packages * proportion).ceil
    dies_per_package = (total_dies_per_package * proportion).ceil
    cores_per_die = cores_from_cpus / (packages * dies_per_package)
    fail "BUG: need uniform number of cores allocated per die" unless cores_per_die.denominator == 1

    topo = [threads_per_core, cores_per_die, dies_per_package, packages].map { |num|
      # :nocov:
      fail "BUG: non-integer in topology array" unless num.denominator == 1
      # :nocov:
      Integer(num)
    }

    # :nocov:
    unless topo.reduce(&:*) == vcpus
      fail "BUG: arithmetic does not result in the correct number of vcpus"
    end
    # :nocov:

    CloudHypervisorCpuTopo.new(*topo)
  end

  # Reverse look-up the vm_size instance that was used to create this VM
  # and use its name as a display name.
  def display_size
    vm_size = Option::VmSizes.find {
      _1.family == family &&
        _1.arch == arch &&
        _1.vcpus == vcpus &&
        (cpu_percent_limit.nil? || _1.cpu_percent_limit == cpu_percent_limit)
    }
    vm_size.name
  end
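
  # Illustration (hypothetical values): a 2-vCPU x64 VM of the standard
  # family with no CPU percent limit would display as "standard-2",
  # matching the Option::VmSizes entry it was created from.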

  # Various names in Linux, like interface names, are obliged to be
  # short, so truncate the ubid. This does introduce the spectre of
  # collisions. When the time comes, we'll have to ensure it doesn't
  # happen on a single host, pushing into the allocation process.
  def self.ubid_to_name(id)
    id.to_s[0..7]
  end

  def inhost_name
    self.class.ubid_to_name(UBID.from_uuidish(id))
  end
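
  # Illustration with a hypothetical id: a VM whose ubid is
  # "vmqqs0g2xbd8kwmnvs7jyjv1pt" gets the inhost name "vmqqs0g2" (its
  # first 8 characters), used for the systemd units and interface names
  # on the host.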

  def storage_size_gib
    vm_storage_volumes.map { _1.size_gib }.sum
  end

  def init_health_monitor_session
    {
      ssh_session: vm_host.sshable.start_fresh_session
    }
  end

  def check_pulse(session:, previous_pulse:)
    reading = begin
      session[:ssh_session].exec!("systemctl is-active #{inhost_name} #{inhost_name}-dnsmasq").split("\n").all?("active") ? "up" : "down"
    rescue
      "down"
    end
    pulse = aggregate_readings(previous_pulse: previous_pulse, reading: reading)

    # Only trigger a checkup after more than 5 consecutive "down" readings
    # spanning more than 30 seconds, and only if one isn't already pending.
    if pulse[:reading] == "down" && pulse[:reading_rpt] > 5 && Time.now - pulse[:reading_chg] > 30 && !reload.checkup_set?
      incr_checkup
    end

    pulse
  end

  def update_spdk_version(version)
    spdk_installation = vm_host.spdk_installations_dataset[version: version]
    fail "SPDK version #{version} not found on host" unless spdk_installation
    vm_storage_volumes_dataset.update(spdk_installation_id: spdk_installation.id)
    incr_update_spdk_dependency
  end

  def self.redacted_columns
    super + [:public_key]
  end

  def params_json(swap_size_bytes)
    topo = cloud_hypervisor_cpu_topology

    project_public_keys = project.get_ff_vm_public_ssh_keys || []

    # We don't write secrets to params_json, because they shouldn't be
    # stored on the host for security reasons.
    JSON.pretty_generate({
      "vm_name" => name,
      "public_ipv6" => ephemeral_net6.to_s,
      "public_ipv4" => ip4.to_s || "",
      "local_ipv4" => local_vetho_ip.to_s.shellescape || "",
      "dns_ipv4" => nics.first.private_subnet.net4.nth(2).to_s,
      "unix_user" => unix_user,
      "ssh_public_keys" => [public_key] + project_public_keys,
      "nics" => nics.map { |nic| [nic.private_ipv6.to_s, nic.private_ipv4.to_s, nic.ubid_to_tap_name, nic.mac, nic.private_ipv4_gateway] },
      "boot_image" => boot_image,
      "max_vcpus" => topo.max_vcpus,
      "cpu_topology" => topo.to_s,
      "mem_gib" => memory_gib,
      "ndp_needed" => vm_host.ndp_needed,
      "storage_volumes" => storage_volumes,
      "swap_size_bytes" => swap_size_bytes,
      "pci_devices" => pci_devices.map { [_1.slot, _1.iommu_group] },
      "slice_name" => vm_host_slice&.inhost_name || "system.slice",
      "cpu_percent_limit" => cpu_percent_limit || 0,
      "cpu_burst_percent_limit" => cpu_burst_percent_limit || 0
    })
  end
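
  # Sketch of the slice-related fields in the generated params, with
  # hypothetical values. A VM inside a slice might produce:
  #   "slice_name" => "vmqqs0g2.slice", "cpu_percent_limit" => 50,
  #   "cpu_burst_percent_limit" => 0
  # while a VM outside any slice falls back to:
  #   "slice_name" => "system.slice", "cpu_percent_limit" => 0,
  #   "cpu_burst_percent_limit" => 0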

  def storage_volumes
    vm_storage_volumes.map { |s|
      {
        "boot" => s.boot,
        "image" => s.boot_image&.name,
        "image_version" => s.boot_image&.version,
        "size_gib" => s.size_gib,
        "device_id" => s.device_id,
        "disk_index" => s.disk_index,
        "encrypted" => !s.key_encryption_key_1.nil?,
        "spdk_version" => s.spdk_version,
        "use_bdev_ubi" => s.use_bdev_ubi,
        "skip_sync" => s.skip_sync,
        "storage_device" => s.storage_device.name,
        "read_only" => s.size_gib == 0,
        "max_ios_per_sec" => s.max_ios_per_sec,
        "max_read_mbytes_per_sec" => s.max_read_mbytes_per_sec,
        "max_write_mbytes_per_sec" => s.max_write_mbytes_per_sec
      }
    }
  end

  def storage_secrets
    vm_storage_volumes.filter_map { |s|
      if !s.key_encryption_key_1.nil?
        [s.device_id, s.key_encryption_key_1.secret_key_material_hash]
      end
    }.to_h
  end
end

# Table: vm
# Columns:
#  id                      | uuid                     | PRIMARY KEY
#  ephemeral_net6          | cidr                     |
#  vm_host_id              | uuid                     |
#  unix_user               | text                     | NOT NULL
#  public_key              | text                     | NOT NULL
#  display_state           | vm_display_state         | NOT NULL DEFAULT 'creating'::vm_display_state
#  name                    | text                     | NOT NULL
#  location                | text                     | NOT NULL
#  boot_image              | text                     | NOT NULL
#  local_vetho_ip          | text                     |
#  ip4_enabled             | boolean                  | NOT NULL DEFAULT false
#  family                  | text                     | NOT NULL
#  cores                   | integer                  | NOT NULL
#  pool_id                 | uuid                     |
#  created_at              | timestamp with time zone | NOT NULL DEFAULT now()
#  arch                    | arch                     | NOT NULL DEFAULT 'x64'::arch
#  allocated_at            | timestamp with time zone |
#  provisioned_at          | timestamp with time zone |
#  vcpus                   | integer                  | NOT NULL
#  memory_gib              | integer                  | NOT NULL
#  vm_host_slice_id        | uuid                     |
#  project_id              | uuid                     | NOT NULL
#  cpu_percent_limit       | integer                  |
#  cpu_burst_percent_limit | integer                  |
# Indexes:
#  vm_pkey                          | PRIMARY KEY btree (id)
#  vm_ephemeral_net6_key            | UNIQUE btree (ephemeral_net6)
#  vm_project_id_location_name_uidx | UNIQUE btree (project_id, location, name)
# Foreign key constraints:
#  vm_pool_id_fkey          | (pool_id) REFERENCES vm_pool(id)
#  vm_project_id_fkey       | (project_id) REFERENCES project(id)
#  vm_vm_host_id_fkey       | (vm_host_id) REFERENCES vm_host(id)
#  vm_vm_host_slice_id_fkey | (vm_host_slice_id) REFERENCES vm_host_slice(id)
# Referenced By:
#  assigned_vm_address        | assigned_vm_address_dst_vm_id_fkey        | (dst_vm_id) REFERENCES vm(id)
#  dns_servers_vms            | dns_servers_vms_vm_id_fkey                | (vm_id) REFERENCES vm(id)
#  inference_endpoint_replica | inference_endpoint_replica_vm_id_fkey     | (vm_id) REFERENCES vm(id)
#  kubernetes_clusters_cp_vms | kubernetes_clusters_cp_vms_cp_vm_id_fkey  | (cp_vm_id) REFERENCES vm(id)
#  kubernetes_nodepools_vms   | kubernetes_nodepools_vms_vm_id_fkey       | (vm_id) REFERENCES vm(id)
#  load_balancers_vms         | load_balancers_vms_vm_id_fkey             | (vm_id) REFERENCES vm(id)
#  minio_server               | minio_server_vm_id_fkey                   | (vm_id) REFERENCES vm(id)
#  nic                        | nic_vm_id_fkey                            | (vm_id) REFERENCES vm(id)
#  pci_device                 | pci_device_vm_id_fkey                     | (vm_id) REFERENCES vm(id)
#  postgres_server            | postgres_server_vm_id_fkey                | (vm_id) REFERENCES vm(id)
#  vm_storage_volume          | vm_storage_volume_vm_id_fkey              | (vm_id) REFERENCES vm(id)