Files
ubicloud/rhizome/host/lib/storage_volume.rb
Hadi Moshayedi a8a386862b Fix persistent_device_id for NVMe disks
NVMe disk IDs start with "nvme-eui.", not "nvme-eui-".

For example:

```
nvme-eui.3634473057c004720025384e00000001
```
2025-07-16 10:26:54 -07:00

595 lines
17 KiB
Ruby

# frozen_string_literal: true
require_relative "../../common/lib/util"
require "fileutils"
require "json"
require "openssl"
require "base64"
require "timeout"
require "yaml"
require_relative "boot_image"
require_relative "vm_path"
require_relative "spdk_path"
require_relative "spdk_rpc"
require_relative "spdk_setup"
require_relative "storage_key_encryption"
require_relative "storage_path"
require_relative "vhost_block_backend"
class StorageVolume
attr_reader :image_path, :read_only
def initialize(vm_name, params)
@vm_name = vm_name
@vhost_backend_version = params["vhost_block_backend_version"]
@disk_index = params["disk_index"]
@device_id = params["device_id"]
@encrypted = params["encrypted"]
@disk_size_gib = params["size_gib"]
@use_bdev_ubi = params["use_bdev_ubi"] || false
@skip_sync = params["skip_sync"] || false
@image_path = BootImage.new(params["image"], params["image_version"]).image_path if params["image"]
@device = params["storage_device"] || DEFAULT_STORAGE_DEVICE
@spdk_version = params["spdk_version"]
@read_only = params["read_only"] || false
@max_read_mbytes_per_sec = params["max_read_mbytes_per_sec"]
@max_write_mbytes_per_sec = params["max_write_mbytes_per_sec"]
@slice = params.fetch("slice_name", "system.slice")
@num_queues = params.fetch("num_queues", 1)
@queue_size = params.fetch("queue_size", 256)
@copy_on_read = params.fetch("copy_on_read", false)
@stripe_sector_count_shift = Integer(params.fetch("stripe_sector_count_shift", 11))
end
def vp
@vp ||= VmPath.new(@vm_name)
end
def rpc_client
@rpc_client ||= SpdkRpc.new(SpdkPath.rpc_sock(@spdk_version))
end
def prep(key_wrapping_secrets)
# Device path is intended to be created by system admin, so fail loudly if
# it doesn't exist
fail "Storage device directory doesn't exist: #{sp.device_path}" if !File.exist?(sp.device_path)
FileUtils.mkdir_p storage_dir
FileUtils.chown @vm_name, @vm_name, storage_dir
encryption_key = setup_data_encryption_key(key_wrapping_secrets) if @encrypted
if @vhost_backend_version
create_empty_disk_file
prep_vhost_backend(encryption_key, key_wrapping_secrets)
return
end
if @image_path.nil?
fail "bdev_ubi requires a base image" if @use_bdev_ubi
create_empty_disk_file
return
end
verify_imaged_disk_size
if @use_bdev_ubi
create_ubi_writespace(encryption_key)
elsif @encrypted
create_empty_disk_file
encrypted_image_copy(encryption_key, @image_path)
else
unencrypted_image_copy
end
end
def prep_vhost_backend(encryption_key, key_wrapping_secrets)
vhost_backend_create_config(encryption_key, key_wrapping_secrets)
vhost_backend_create_metadata(key_wrapping_secrets) if @image_path
vhost_backend_create_service_file
end
def write_new_file(path, user)
rm_if_exists(path)
File.open(path, "w", 0o600, flags: File::CREAT | File::EXCL) do |file|
FileUtils.chown user, user, path
yield file
end
end
def vhost_backend_create_config(encryption_key, key_wrapping_secrets)
config_path = sp.vhost_backend_config
config = vhost_backend_config(encryption_key, key_wrapping_secrets)
write_new_file(config_path, @vm_name) do |file|
file.write(config.to_yaml)
fsync_or_fail(file)
end
sync_parent_dir(config_path)
end
def vhost_backend_create_metadata(key_wrapping_secrets)
vhost_backend = VhostBlockBackend.new(@vhost_backend_version)
metadata_path = sp.vhost_backend_metadata
config_path = sp.vhost_backend_config
if @encrypted
kek_yaml = vhost_backend_kek(key_wrapping_secrets).to_yaml
kek_arg = "--kek /dev/stdin"
else
kek_yaml = ""
end
write_new_file(metadata_path, @vm_name) do |file|
file.truncate(8 * 1024 * 1024)
end
r "#{vhost_backend.init_metadata_path.shellescape} -s #{@stripe_sector_count_shift} --config #{config_path.shellescape} #{kek_arg}", stdin: kek_yaml
sync_parent_dir(metadata_path)
end
def vhost_backend_create_service_file
vhost_backend = VhostBlockBackend.new(@vhost_backend_version)
kek_arg = if @encrypted
"--kek #{sp.kek_pipe}"
end
# systemd-analyze security result:
# Overall exposure level for #{vhost_user_block_service}: 0.5 SAFE
service_file_path = "/etc/systemd/system/#{vhost_user_block_service}"
File.write(service_file_path, <<~SERVICE)
[Unit]
Description=Vhost Block Backend Service for #{@vm_name}
After=network.target
[Service]
Slice=#{@slice}
Environment=RUST_LOG=info
ExecStart=#{vhost_backend.bin_path} --config #{sp.vhost_backend_config} #{kek_arg}
Restart=always
User=#{@vm_name}
Group=#{@vm_name}
#{systemd_io_rate_limits}
RemoveIPC=true
NoNewPrivileges=true
CapabilityBoundingSet=
AmbientCapabilities=
PrivateDevices=true
DevicePolicy=closed
DeviceAllow=/dev/null rw
DeviceAllow=/dev/zero rw
DeviceAllow=/dev/urandom rw
DeviceAllow=/dev/random rw
ProtectSystem=full
ProtectHome=tmpfs
ReadWritePaths=#{storage_root}
PrivateTmp=true
PrivateMounts=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectControlGroups=true
ProtectClock=true
ProtectHostname=true
LockPersonality=true
ProtectKernelLogs=true
ProtectProc=invisible
RestrictAddressFamilies=AF_UNIX
RestrictNamespaces=true
SystemCallArchitectures=native
SystemCallFilter=@system-service
MemoryDenyWriteExecute=yes
RestrictSUIDSGID=yes
RestrictRealtime=yes
ProcSubset=pid
PrivateNetwork=yes
PrivateUsers=yes
IPAddressDeny=any
[Install]
WantedBy=multi-user.target
SERVICE
end
def systemd_io_rate_limits
limits = {IOReadBandwidthMax: @max_read_mbytes_per_sec,
IOWriteBandwidthMax: @max_write_mbytes_per_sec}.compact
return "" if limits.empty?
dev = persistent_device_id(storage_dir)
limits
.map { |(key, mb)| "#{key}=#{dev} #{mb * 1024 * 1024}" }
.join("\n")
end
def persistent_device_id(path)
path_stat = File.stat(path)
Dir["/dev/disk/by-id/*"].each do |id|
dev_path = File.realpath(id)
dev_stat = File.stat(dev_path)
next unless dev_stat.rdev_major == path_stat.dev_major && dev_stat.rdev_minor == path_stat.dev_minor
# Choose stable symlink types by subsystem:
# - SSDs: Use identifiers starting with 'wwn' (World Wide Name), globally unique.
# - NVMe: Use identifiers starting with 'nvme-eui', also globally unique.
# - MD devices: Use uuid identifiers.
dev = File.basename(dev_path)
return id if (dev.start_with?("nvme") && id.include?("nvme-eui.")) ||
(dev.start_with?("sd") && id.include?("wwn-")) ||
(dev.start_with?("md") && id.include?("md-uuid-"))
rescue SystemCallError
next
end
raise "No persistent device ID found for storage path: #{path}"
end
def wrap_key_b64(storage_key_encryption, key)
key_bytes = [key].pack("H*")
wrapped_key = storage_key_encryption.wrap_key(key_bytes).join
Base64.strict_encode64(wrapped_key).strip
end
def vhost_backend_config(encryption_key, key_wrapping_secrets)
config = {
"path" => disk_file,
"socket" => vhost_sock,
"num_queues" => @num_queues,
"queue_size" => @queue_size,
"seg_size_max" => 64 * 1024,
"seg_count_max" => 4,
"copy_on_read" => @copy_on_read,
"poll_queue_timeout_us" => 1000,
"device_id" => @device_id,
"skip_sync" => @skip_sync
}
if @image_path
config["image_path"] = @image_path
config["metadata_path"] = sp.vhost_backend_metadata
end
if @encrypted
key_encryption = StorageKeyEncryption.new(key_wrapping_secrets)
key1_wrapped_b64 = wrap_key_b64(key_encryption, encryption_key[:key])
key2_wrapped_b64 = wrap_key_b64(key_encryption, encryption_key[:key2])
config["encryption_key"] = [key1_wrapped_b64, key2_wrapped_b64]
end
config
end
def vhost_backend_kek(key_wrapping_secrets)
{
"method" => "aes256-gcm",
"key" => key_wrapping_secrets["key"].strip,
"init_vector" => key_wrapping_secrets["init_vector"].strip,
"auth_data" => Base64.strict_encode64(key_wrapping_secrets["auth_data"]).strip
}
end
def start(key_wrapping_secrets)
if @vhost_backend_version
vhost_backend_start(key_wrapping_secrets)
return
end
encryption_key = read_data_encryption_key(key_wrapping_secrets) if @encrypted
retries = 0
begin
setup_spdk_bdev(encryption_key)
set_qos_limits
setup_spdk_vhost
rescue SpdkExists
# If some of SPDK artifacts exist, purge and retry. But retry only once
# to prevent potential retry loops.
if retries == 0
retries += 1
purge_spdk_artifacts
retry
end
raise
end
end
def vhost_backend_start(key_wrapping_secrets)
# Stop the service in case this is a retry.
r "systemctl stop #{q_vhost_user_block_service}"
unless @encrypted
r "systemctl start #{q_vhost_user_block_service}"
return
end
begin
kek_pipe = sp.kek_pipe
rm_if_exists(kek_pipe)
File.mkfifo(kek_pipe, 0o600)
FileUtils.chown @vm_name, @vm_name, kek_pipe
r "systemctl start #{q_vhost_user_block_service}"
Timeout.timeout(5) do
kek_yaml = vhost_backend_kek(key_wrapping_secrets).to_yaml
File.write(kek_pipe, kek_yaml)
end
ensure
FileUtils.rm_f(kek_pipe)
end
end
def stop_service_if_loaded(name)
r "systemctl stop #{name.shellescape}"
rescue CommandFail => e
raise unless e.stderr.include?("not loaded")
end
def purge_spdk_artifacts
if @vhost_backend_version
service_file_path = "/etc/systemd/system/#{vhost_user_block_service}"
stop_service_if_loaded(vhost_user_block_service)
rm_if_exists(service_file_path)
rm_if_exists(vhost_sock)
return
end
vhost_controller = SpdkPath.vhost_controller(@vm_name, @disk_index)
rpc_client.vhost_delete_controller(vhost_controller)
non_ubi_bdev = @use_bdev_ubi ? "#{@device_id}_base" : @device_id
if @use_bdev_ubi
rpc_client.bdev_ubi_delete(@device_id)
end
if @encrypted
rpc_client.bdev_crypto_delete(non_ubi_bdev)
rpc_client.bdev_aio_delete("#{@device_id}_aio")
rpc_client.accel_crypto_key_destroy("#{@device_id}_key")
else
rpc_client.bdev_aio_delete(non_ubi_bdev)
end
rm_if_exists(SpdkPath.vhost_sock(vhost_controller))
end
def setup_data_encryption_key(key_wrapping_secrets)
data_encryption_key = OpenSSL::Cipher.new("aes-256-xts").random_key.unpack1("H*")
result = {
cipher: "AES_XTS",
key: data_encryption_key[..63],
key2: data_encryption_key[64..]
}
key_file = data_encryption_key_path
# save encrypted key
sek = StorageKeyEncryption.new(key_wrapping_secrets)
sek.write_encrypted_dek(key_file, result)
FileUtils.chown @vm_name, @vm_name, key_file
FileUtils.chmod "u=rw,g=,o=", key_file
sync_parent_dir(key_file)
result
end
def read_data_encryption_key(key_wrapping_secrets)
sek = StorageKeyEncryption.new(key_wrapping_secrets)
sek.read_encrypted_dek(data_encryption_key_path)
end
def unencrypted_image_copy
q_image_path = @image_path.shellescape
q_disk_file = disk_file.shellescape
r "cp --reflink=auto #{q_image_path} #{q_disk_file}"
r "truncate -s #{@disk_size_gib}G #{q_disk_file}"
set_disk_file_permissions
end
def verify_imaged_disk_size
size = File.size(@image_path)
fail "Image size greater than requested disk size" unless size <= @disk_size_gib * 2**30
end
def encrypted_image_copy(encryption_key, input_file, block_size: 2097152, count: nil)
# Note that spdk_dd doesn't interact with the main spdk process. It is a
# tool which starts the spdk infra as a separate process, creates bdevs
# from config, does the copy, and exits. Since it is a separate process
# for each image, although bdev names are same, they don't conflict.
# Goal is to copy the image into disk_file, which will be registered
# in the main spdk daemon after this function returns.
bdev_conf = [{
method: "bdev_aio_create",
params: {
name: "aio0",
block_size: 512,
filename: disk_file,
readonly: false
}
},
{
method: "bdev_crypto_create",
params: {
base_bdev_name: "aio0",
name: "crypt0",
key_name: "super_key"
}
}]
accel_conf = [
{
method: "accel_crypto_key_create",
params: {
name: "super_key",
cipher: encryption_key[:cipher],
key: encryption_key[:key],
key2: encryption_key[:key2]
}
}
]
spdk_config_json = {
subsystems: [
{
subsystem: "accel",
config: accel_conf
},
{
subsystem: "bdev",
config: bdev_conf
}
]
}.to_json
# spdk_dd uses the same spdk app infra, so it will bind to an rpc socket,
# which we won't use. But its path shouldn't conflict with other VM setups,
# so it doesn't error out in concurrent VM creations.
rpc_socket = "/var/tmp/spdk_dd.sock.#{@vm_name}"
count_param = count.nil? ? "" : "--count #{count}"
r("#{SpdkPath.bin(@spdk_version, "spdk_dd")} --config /dev/stdin " \
"--disable-cpumask-locks " \
"--rpc-socket #{rpc_socket.shellescape} " \
"--if #{input_file.shellescape} " \
"--ob crypt0 " \
"--bs=#{block_size} #{count_param}", stdin: spdk_config_json)
end
def create_ubi_writespace(encryption_key)
create_empty_disk_file(disk_size_mib: @disk_size_gib * 1024 + 16)
if @encrypted
# just clear the metadata section, i.e. first 8MB
encrypted_image_copy(encryption_key, "/dev/zero", block_size: 2097152, count: 4)
end
end
def create_empty_disk_file(disk_size_mib: @disk_size_gib * 1024)
FileUtils.touch(disk_file)
File.truncate(disk_file, disk_size_mib * 1024 * 1024)
set_disk_file_permissions
end
def set_disk_file_permissions
FileUtils.chown @vm_name, @vm_name, disk_file
# don't allow others to read user's disk
FileUtils.chmod "u=rw,g=r,o=", disk_file
# allow spdk to access the image
r "setfacl -m u:spdk:rw #{disk_file.shellescape}"
end
def setup_spdk_bdev(encryption_key)
non_ubi_bdev = @use_bdev_ubi ? "#{@device_id}_base" : @device_id
if encryption_key
key_name = "#{@device_id}_key"
aio_bdev = "#{@device_id}_aio"
rpc_client.accel_crypto_key_create(
key_name,
encryption_key[:cipher],
encryption_key[:key],
encryption_key[:key2]
)
rpc_client.bdev_aio_create(aio_bdev, disk_file, 512)
rpc_client.bdev_crypto_create(non_ubi_bdev, aio_bdev, key_name)
else
rpc_client.bdev_aio_create(non_ubi_bdev, disk_file, 512)
end
if @use_bdev_ubi
rpc_client.bdev_ubi_create(@device_id, non_ubi_bdev, @image_path, @skip_sync)
end
end
def set_qos_limits
return unless @max_read_mbytes_per_sec || @max_write_mbytes_per_sec
rpc_client.bdev_set_qos_limit(
@device_id,
r_mbytes_per_sec: @max_read_mbytes_per_sec,
w_mbytes_per_sec: @max_write_mbytes_per_sec
)
end
def setup_spdk_vhost
vhost_controller = SpdkPath.vhost_controller(@vm_name, @disk_index)
spdk_vhost_sock = SpdkPath.vhost_sock(vhost_controller)
rpc_client.vhost_create_blk_controller(vhost_controller, @device_id)
# don't allow others to access the vhost socket
FileUtils.chmod "u=rw,g=r,o=", spdk_vhost_sock
# allow vm user to access the vhost socket
r "setfacl -m u:#{@vm_name}:rw #{spdk_vhost_sock.shellescape}"
# create a symlink to the socket in the per vm storage dir
rm_if_exists(vhost_sock)
FileUtils.ln_s spdk_vhost_sock, vhost_sock
# Change ownership of the symlink. FileUtils.chown uses File.lchown for
# symlinks and doesn't follow links. We don't use File.lchown directly
# because it expects numeric uid & gid, which is less convenient.
FileUtils.chown @vm_name, @vm_name, vhost_sock
vhost_sock
end
def spdk_service
@spdk_service ||= SpdkSetup.new(@spdk_version).spdk_service if @spdk_version
end
def vhost_user_block_service
@vhost_user_block_service ||= "#{@vm_name}-#{@disk_index}-storage.service" if @vhost_backend_version
end
def q_vhost_user_block_service
@q_vhost_user_block_service ||= vhost_user_block_service.shellescape if vhost_user_block_service
end
def sp
@sp ||= StoragePath.new(@vm_name, @device, @disk_index)
end
def storage_root
@storage_root ||= sp.storage_root
end
def storage_dir
@storage_dir ||= sp.storage_dir
end
def disk_file
@disk_file ||= sp.disk_file
end
def data_encryption_key_path
@dek_path ||= sp.data_encryption_key
end
def vhost_sock
@vhost_sock ||= sp.vhost_sock
end
attr_reader :num_queues
attr_reader :queue_size
end