This change enables setting limits on total IOPS, read bandwidth, and write bandwidth for each `VmStorageVolume`. These limits can be specified in `Nexus::assemble`:

```
Prog::Vm::Nexus.assemble(
  ...
  storage_volumes: [{
    size_gib: 40,
    encrypted: true,
    max_read_mbytes_per_sec: 200,
    max_write_mbytes_per_sec: 150,
    max_ios_per_sec: 25600
  }],
  ...)
```

The implementation uses SPDK's `bdev_set_qos_limit` RPC call during volume setup on the Rhizome side. This feature prepares for future support of burstable instances with limited IO capacity and allows proportional IO allocation based on VM size. These additional allocation strategies will be addressed in upcoming PRs.
301 lines
11 KiB
Ruby
301 lines
11 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "jwt"
|
|
require_relative "../model"
|
|
|
|
class Vm < Sequel::Model
|
|
# Associations. The strand and sshable rows are looked up with `key: :id`.
one_to_one(:strand, key: :id)
many_to_one(:vm_host)
one_to_many(:nics, key: :vm_id, class: :Nic)
# Subnets are reached through the `nic` join table.
many_to_many(:private_subnets, join_table: :nic, left_key: :vm_id, right_key: :private_subnet_id)
one_to_one(:sshable, key: :id)
one_to_one(:assigned_vm_address, key: :dst_vm_id, class: :AssignedVmAddress)
# Volumes are ordered by the `boot` column, descending.
one_to_many(:vm_storage_volumes, key: :vm_id, order: Sequel.desc(:boot))
# Billing records narrowed through the dataset's `active` method.
one_to_many(:active_billing_records, class: :BillingRecord, key: :resource_id) { |ds| ds.active }
one_to_many(:pci_devices, key: :vm_id, class: :PciDevice)
one_through_one(:load_balancer, left_key: :vm_id, right_key: :load_balancer_id, join_table: :load_balancers_vms)
one_to_one(:load_balancers_vms, key: :vm_id, class: :LoadBalancersVms)

# Associated rows listed here are destroyed together with the VM.
plugin(
  :association_dependencies,
  sshable: :destroy,
  assigned_vm_address: :destroy,
  vm_storage_volumes: :destroy,
  load_balancers_vms: :destroy
)
|
|
|
|
# Dataset-level extensions.
dataset_module(Pagination)
dataset_module(Authorization::Dataset)

# Instance-level mixins. Kept as separate `include` calls so the ancestor
# order stays exactly as declared.
include ResourceMethods
include SemaphoreMethods
include HealthMonitorMethods

# Semaphores declared for this model.
# NOTE(review): presumably this generates the `*_set?` / `incr_*` helpers
# used elsewhere in this class -- confirm in SemaphoreMethods.
semaphore(*%i[
  destroy
  start_after_host_reboot
  prevent_destroy
  update_firewall_rules
  checkup
  update_spdk_dependency
  waiting_for_capacity
  lb_expiry_started
])

include Authorization::HyperTagMethods
|
|
|
|
# Builds the hyper tag name of this VM within +project+, composed of the
# project's ubid, the VM's display location and the VM's name.
def hyper_tag_name(project)
  format("project/%s/location/%s/vm/%s", project.ubid, display_location, name)
end
|
|
|
|
include Authorization::TaggableMethods
|
|
|
|
# Returns the firewalls of every private subnet of this VM, flattened into
# a single list.
def firewalls
  private_subnets.flat_map { |subnet| subnet.firewalls }
end
|
|
|
|
# Display name for this VM's location, as produced by
# LocationNameConverter.to_display_name.
def display_location
  internal_location = location
  LocationNameConverter.to_display_name(internal_location)
end
|
|
|
|
# Path of this VM, built from its display location and its name.
def path
  format("/location/%s/vm/%s", display_location, name)
end
|
|
|
|
# Network of the assigned address' ip, or nil when the VM has no assigned
# address or that address has no ip.
def ephemeral_net4
  address_ip = assigned_vm_address&.ip
  address_ip&.network
end
|
|
|
|
# The ip of the assigned address, or nil when the VM has no assigned
# address.
def ip4
  address = assigned_vm_address
  address&.ip
end
|
|
|
|
# Issues an HS256-signed JWT whose subject is this VM's ubid and whose
# issued-at claim is the current time in whole seconds. It is signed with
# Config.clover_runtime_token_secret.
def runtime_token
  claims = {sub: ubid, iat: Time.now.to_i}
  secret = Config.clover_runtime_token_secret
  JWT.encode(claims, secret, "HS256")
end
|
|
|
|
# State string for this VM.
#
# * "deleting" when the destroy semaphore is set or the strand's label is
#   "destroy".
# * While the waiting_for_capacity semaphore is set: "waiting for capacity",
#   switching to "no capacity available" once more than 15 minutes have
#   passed since created_at.
# * Otherwise whatever the inherited display_state returns.
def display_state
  being_destroyed = destroy_set? || strand&.label == "destroy"
  return "deleting" if being_destroyed
  return super unless waiting_for_capacity_set?

  waited_seconds = Time.now - created_at
  (waited_seconds > 15 * 60) ? "no capacity available" : "waiting for capacity"
end
|
|
|
|
# GiB of memory per core: 3.2 on arm64, 10.68 for the "standard-gpu"
# family (special case for GPUs), 8 otherwise. The arch check takes
# precedence over the family check.
def mem_gib_ratio
  if arch == "arm64"
    3.2
  elsif family == "standard-gpu"
    10.68
  else
    8
  end
end
|
|
|
|
# Memory in whole GiB: cores times mem_gib_ratio, with any fractional part
# dropped by to_i.
def mem_gib
  exact_gib = cores * mem_gib_ratio
  exact_gib.to_i
end
|
|
|
|
# cloud-hypervisor takes topology information in this format:
#
# topology=<threads_per_core>:<cores_per_die>:<dies_per_package>:<packages>
#
# And the result of multiplication must equal the thread/vcpu count
# we wish to allocate:
#
#     let total = t.threads_per_core * t.cores_per_die * t.dies_per_package * t.packages;
#     if total != self.cpus.max_vcpus {
#         return Err(ValidationError::CpuTopologyCount);
#     }
CloudHypervisorCpuTopo = Struct.new(:threads_per_core, :cores_per_die, :dies_per_package, :packages) do
  # The four members joined with ":", in declaration order.
  def to_s
    to_a.map(&:to_s).join(":")
  end

  # Product of the four members.
  #
  # Computed on every call rather than cached: Struct members are
  # writable, so a cached product would go stale after a member is
  # reassigned, and caching in an instance variable would raise on a
  # frozen instance.
  def max_vcpus
    to_a.reduce(:*)
  end
end
|
|
|
|
# Derives a CloudHypervisorCpuTopo for this VM by scaling the host's
# topology (threads per core, dies per socket, sockets) down to the share
# of host cores given to the VM.
#
# Raises RuntimeError ("BUG...") when the host counts do not divide
# evenly, when the cores cannot be spread uniformly across dies, or when
# the resulting product does not match threads_per_core * cores.
def cloud_hypervisor_cpu_topology
  threads_per_core, leftover_threads = vm_host.total_cpus.divmod(vm_host.total_cores)
  raise "BUG" unless leftover_threads.zero?

  host_dies_per_package, leftover_dies = vm_host.total_dies.divmod(vm_host.total_sockets)
  raise "BUG" unless leftover_dies.zero?

  host_packages = vm_host.total_sockets

  # Computed all-system statistics, now scale it down to meet VM needs.
  share = Rational(cores) / vm_host.total_cores
  packages = (host_packages * share).ceil
  dies_per_package = (host_dies_per_package * share).ceil
  cores_per_die = Rational(cores) / (packages * dies_per_package)
  raise "BUG: need uniform number of cores allocated per die" unless cores_per_die.denominator == 1

  topo = [threads_per_core, cores_per_die, dies_per_package, packages].map do |num|
    # :nocov:
    raise "BUG: non-integer in topology array" unless num.denominator == 1
    # :nocov:
    Integer(num)
  end

  # :nocov:
  if topo.reduce(&:*) != threads_per_core * cores
    raise "BUG: arithmetic does not result in the correct number of vcpus"
  end
  # :nocov:

  CloudHypervisorCpuTopo.new(*topo)
end
|
|
|
|
# Product size label: the family followed by twice the core count,
# e.g. "standard-4" for a two-core "standard" VM.
def display_size
  # With additional product families, it is likely that we hit a
  # case where this conversion wouldn't work. We can use a map or a
  # case/when block at that time.

  # Define the suffix integer as 2 * numcores. This coincides with
  # SMT-enabled x86 processors, to give people the right idea if
  # they compare the product code integer to the preponderance of
  # spec sheets on the web.
  #
  # With non-SMT processors, maybe we'll keep it that way too,
  # even though it doesn't describe any attribute of the
  # processor. But it does allow "standard-2" to be compared to
  # another "standard-2" variant regardless of SMT,
  # e.g. "standard-2-arm", instead of making people who interpret
  # the code adjust the scale factor to do the comparison
  # themselves.
  #
  # Another weakness of this approach, besides it being indirect
  # in its description of non-SMT processors, is that having
  # "standard-2" be the smallest unit of product is also noisier
  # than "standard-1".
  size_suffix = cores * 2
  "#{family}-#{size_suffix}"
end
|
|
|
|
# Various names in linux, like interface names, are obliged to be
# short, so truncate the ubid. This does introduce the spectre of
# collisions. When the time comes, we'll have to ensure it doesn't
# happen on a single host, pushing into the allocation process.
#
# Returns the first eight characters of +id+'s string form.
def self.ubid_to_name(id)
  id.to_s.slice(0, 8)
end
|
|
|
|
# Short name for this VM: its id converted with UBID.from_uuidish and then
# truncated by the class-level ubid_to_name.
def inhost_name
  ubid_for_id = UBID.from_uuidish(id)
  self.class.ubid_to_name(ubid_for_id)
end
|
|
|
|
# Sum of size_gib over all storage volumes of this VM (0 when there are
# none).
def storage_size_gib
  vm_storage_volumes.sum(&:size_gib)
end
|
|
|
|
# Starts a fresh session on the VM host's sshable and returns it wrapped
# in a hash under the :ssh_session key.
def init_health_monitor_session
  fresh_session = vm_host.sshable.start_fresh_session
  {ssh_session: fresh_session}
end
|
|
|
|
# Probes the VM's systemd units over the given session and folds the
# result into the pulse history.
#
# session:        hash holding the probe connection under :ssh_session.
# previous_pulse: the prior pulse, passed on to aggregate_readings.
#
# The reading is "up" only when every line printed by `systemctl
# is-active` for the VM unit and its dnsmasq unit is "active". Any
# StandardError raised while probing counts as "down".
#
# When the aggregated pulse has been "down" for more than 5 repeated
# readings and more than 30 seconds, and the checkup semaphore is not
# already set on the reloaded record, the checkup semaphore is
# incremented. Returns the aggregated pulse.
def check_pulse(session:, previous_pulse:)
  reading = begin
    output = session[:ssh_session].exec!("systemctl is-active #{inhost_name} #{inhost_name}-dnsmasq")
    unit_states = output.split("\n")
    unit_states.all?("active") ? "up" : "down"
  rescue
    "down"
  end
  pulse = aggregate_readings(previous_pulse: previous_pulse, reading: reading)

  # Keep the original short-circuit order: the record is only reloaded
  # once the pulse itself qualifies.
  persistently_down = pulse[:reading] == "down" && pulse[:reading_rpt] > 5 && Time.now - pulse[:reading_chg] > 30
  incr_checkup if persistently_down && !reload.checkup_set?

  pulse
end
|
|
|
|
# Points every storage volume of this VM at the host's SPDK installation
# with the given version, then increments the update_spdk_dependency
# semaphore.
#
# Raises RuntimeError when the host has no installation with that version;
# in that case nothing is updated.
def update_spdk_version(version)
  installation = vm_host.spdk_installations_dataset[version: version] ||
    raise("SPDK version #{version} not found on host")

  vm_storage_volumes_dataset.update(spdk_installation_id: installation.id)
  incr_update_spdk_dependency
end
|
|
|
|
# The inherited redacted columns plus :public_key.
def self.redacted_columns
  inherited_columns = super
  inherited_columns + [:public_key]
end
|
|
|
|
# Serializes the parameters describing this VM as pretty-printed JSON.
#
# swap_size_bytes is passed through unchanged under "swap_size_bytes".
#
# The SSH key list is the VM's own public key followed by the keys
# returned by the first project's get_ff_vm_public_ssh_keys (none when
# that returns nil).
#
# "public_ipv4" is the empty string when the VM has no ip4. "local_ipv4"
# is shell-escaped; because escaping an empty string yields a pair of
# single quotes, a nil local_vetho_ip is serialized as `''`, not as an
# empty string. The former `|| ""` fallbacks on these two entries could
# never apply (to_s and shellescape always return a String) and have
# been removed without changing the output.
def params_json(swap_size_bytes)
  topo = cloud_hypervisor_cpu_topology

  project_public_keys = projects.first.get_ff_vm_public_ssh_keys || []

  # we don't write secrets to params_json, because it
  # shouldn't be stored in the host for security reasons.
  JSON.pretty_generate({
    "vm_name" => name,
    "public_ipv6" => ephemeral_net6.to_s,
    "public_ipv4" => ip4.to_s,
    "local_ipv4" => local_vetho_ip.to_s.shellescape,
    "dns_ipv4" => nics.first.private_subnet.net4.nth(2).to_s,
    "unix_user" => unix_user,
    "ssh_public_keys" => [public_key] + project_public_keys,
    "nics" => nics.map { |nic| [nic.private_ipv6.to_s, nic.private_ipv4.to_s, nic.ubid_to_tap_name, nic.mac, nic.private_ipv4_gateway] },
    "boot_image" => boot_image,
    "max_vcpus" => topo.max_vcpus,
    "cpu_topology" => topo.to_s,
    "mem_gib" => mem_gib,
    "ndp_needed" => vm_host.ndp_needed,
    "storage_volumes" => storage_volumes,
    "swap_size_bytes" => swap_size_bytes,
    "pci_devices" => pci_devices.map { [_1.slot, _1.iommu_group] }
  })
end
|
|
|
|
# Describes each storage volume of this VM as a string-keyed hash.
#
# "image" and "image_version" are nil for a volume without a boot image,
# "encrypted" is true when the volume has a key_encryption_key_1, and
# "read_only" is true when size_gib equals 0.
def storage_volumes
  vm_storage_volumes.map do |volume|
    image = volume.boot_image
    {
      "boot" => volume.boot,
      "image" => image&.name,
      "image_version" => image&.version,
      "size_gib" => volume.size_gib,
      "device_id" => volume.device_id,
      "disk_index" => volume.disk_index,
      "encrypted" => !volume.key_encryption_key_1.nil?,
      "spdk_version" => volume.spdk_version,
      "use_bdev_ubi" => volume.use_bdev_ubi,
      "skip_sync" => volume.skip_sync,
      "storage_device" => volume.storage_device.name,
      "read_only" => volume.size_gib == 0,
      "max_ios_per_sec" => volume.max_ios_per_sec,
      "max_read_mbytes_per_sec" => volume.max_read_mbytes_per_sec,
      "max_write_mbytes_per_sec" => volume.max_write_mbytes_per_sec
    }
  end
end
|
|
|
|
# Maps the device_id of every volume that has a key_encryption_key_1 to
# that key's secret_key_material_hash. Volumes without such a key are left
# out.
def storage_secrets
  vm_storage_volumes.each_with_object({}) do |volume, secrets|
    key_encryption_key = volume.key_encryption_key_1
    next if key_encryption_key.nil?

    secrets[volume.device_id] = key_encryption_key.secret_key_material_hash
  end
end
|
|
end
|
|
|
|
# Table: vm
|
|
# Columns:
|
|
# id | uuid | PRIMARY KEY
|
|
# ephemeral_net6 | cidr |
|
|
# vm_host_id | uuid |
|
|
# unix_user | text | NOT NULL
|
|
# public_key | text | NOT NULL
|
|
# display_state | vm_display_state | NOT NULL DEFAULT 'creating'::vm_display_state
|
|
# name | text | NOT NULL
|
|
# location | text | NOT NULL
|
|
# boot_image | text | NOT NULL
|
|
# local_vetho_ip | text |
|
|
# ip4_enabled | boolean | NOT NULL DEFAULT false
|
|
# family | text | NOT NULL
|
|
# cores | integer | NOT NULL
|
|
# pool_id | uuid |
|
|
# created_at | timestamp with time zone | NOT NULL DEFAULT now()
|
|
# arch | arch | NOT NULL DEFAULT 'x64'::arch
|
|
# allocated_at | timestamp with time zone |
|
|
# provisioned_at | timestamp with time zone |
|
|
# Indexes:
|
|
# vm_pkey | PRIMARY KEY btree (id)
|
|
# vm_ephemeral_net6_key | UNIQUE btree (ephemeral_net6)
|
|
# Foreign key constraints:
|
|
# vm_pool_id_fkey | (pool_id) REFERENCES vm_pool(id)
|
|
# vm_vm_host_id_fkey | (vm_host_id) REFERENCES vm_host(id)
|
|
# Referenced By:
|
|
# assigned_vm_address | assigned_vm_address_dst_vm_id_fkey | (dst_vm_id) REFERENCES vm(id)
|
|
# dns_servers_vms | dns_servers_vms_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# inference_endpoint_replica | inference_endpoint_replica_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# load_balancers_vms | load_balancers_vms_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# minio_server | minio_server_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# nic | nic_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# pci_device | pci_device_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# postgres_server | postgres_server_vm_id_fkey | (vm_id) REFERENCES vm(id)
|
|
# vm_storage_volume | vm_storage_volume_vm_id_fkey | (vm_id) REFERENCES vm(id)
|