ubicloud/prog/minio/minio_server_nexus.rb
Jeremy Evans 15b91ff0c4 Absorb leaf into reap
Reviewing leaf usage in progs shows that it always occurs right
after reap.  Combining the leaf and reap methods avoids a redundant
query for the strand's children.

It's typical for nap or donate to be called after the leaf check
that follows reap.  Build this into reap as well: call donate by
default, or nap if a nap keyword argument is given.
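
So a prog that previously napped between reap and the leaf check
can collapse that into a single call; a hedged sketch, with an
illustrative label and interval:

```ruby
# Illustrative sketch: hop to wait_servers once all children are
# reaped; while children remain, nap 10 seconds instead of donating.
reap(:wait_servers, nap: 10)
```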

There are a few cases where reap was called without leaf/donate.
Add a fallthrough keyword argument to support this, so that when
there are no children, reap calls neither nap nor donate and
execution continues with the code that follows.
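
For example, the unavailable label in the file below reaps with
fallthrough and then continues with its own checks:

```ruby
reap(fallthrough: true)
nap 5 unless strand.children_dataset.where(prog: "Minio::MinioServerNexus", label: "minio_restart").empty?
```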

Vm::HostNexus#wait_prep and Kubernetes::UpgradeKubernetesNode#wait_new_node
both need the return value of the reapable child(ren). Add a reaper
keyword argument for this, which is called once for each child.
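
A minimal sketch of that shape, assuming the reaper is an arbitrary
callable invoked with each reaped child strand (the label and names
here are illustrative):

```ruby
# Illustrative sketch: collect each reaped child's exit value.
exitvals = []
reap(:next_step, reaper: ->(child) { exitvals << child.exitval })
```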

The most common pattern for using reap/leaf/donate was:

```ruby
reap
hop_download_lb_cert if leaf?
donate
```

This turns into:

```ruby
reap(:download_lb_cert)
```

The second most common pattern was:

```ruby
reap
donate unless leaf?
pop "upgrade cancelled" # or other code
```

This turns into:

```ruby
reap { pop "upgrade cancelled" }
```

In a few places, I changed operations on strand.children to
strand.children_dataset.  Now that we are no longer using
cached children by default, it's better to do these checks
in the database instead of in Ruby (see the sketch after the
list).  These places deserve careful review:

* Prog::Minio::MinioServerNexus#unavailable
* Prog::Postgres::PostgresResourceNexus#wait
* Prog::Postgres::PostgresServerNexus#unavailable
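
Using Prog::Minio::MinioServerNexus#unavailable as the example, the
change has roughly this shape (the "before" line is a hedged
reconstruction, not the exact original code):

```ruby
# Before (reconstruction): inspect the cached children array in Ruby.
nap 5 if strand.children.any? { it.prog == "Minio::MinioServerNexus" && it.label == "minio_restart" }

# After: run the same check in the database via children_dataset.
nap 5 unless strand.children_dataset.where(prog: "Minio::MinioServerNexus", label: "minio_restart").empty?
```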

For Prog::Vnet::LoadBalancerNexus#wait_update_vm_load_balancers,
I removed a check on the children completely. It was checking
for an exitval using children_dataset directly after reap,
which should only be true if there was still an active lease
for the child.  This also deserves careful review.

This change broke many mocked tests.  Those tests are updated to
use database-backed objects, ensuring that we are testing
observable behavior and not implementation details.
2025-06-26 03:49:53 +09:00

210 lines
6.2 KiB
Ruby

# frozen_string_literal: true

require "forwardable"

require_relative "../../lib/util"

class Prog::Minio::MinioServerNexus < Prog::Base
  subject_is :minio_server

  extend Forwardable
  def_delegators :minio_server, :vm

  def self.assemble(minio_pool_id, index)
    unless (minio_pool = MinioPool[minio_pool_id])
      fail "No existing pool"
    end

    DB.transaction do
      ubid = MinioServer.generate_ubid
      vm_st = Prog::Vm::Nexus.assemble_with_sshable(
        Config.minio_service_project_id,
        sshable_unix_user: "ubi",
        location_id: minio_pool.cluster.location_id,
        name: ubid.to_s,
        size: minio_pool.vm_size,
        storage_volumes: [
          {encrypted: true, size_gib: 30}
        ] + Array.new(minio_pool.per_server_drive_count) { {encrypted: true, size_gib: (minio_pool.per_server_storage_size / minio_pool.per_server_drive_count).floor} },
        boot_image: "ubuntu-jammy",
        enable_ip4: true,
        private_subnet_id: minio_pool.cluster.private_subnet.id,
        distinct_storage_devices: Config.production? && !Config.is_e2e
      )

      minio_server = MinioServer.create(minio_pool_id: minio_pool_id, vm_id: vm_st.id, index: index) { it.id = ubid.to_uuid }
      Strand.create(prog: "Minio::MinioServerNexus", label: "start") { it.id = minio_server.id }
    end
  end

  def before_run
    when_destroy_set? do
      if strand.label != "destroy"
        hop_destroy
      elsif strand.stack.count > 1
        pop "operation is cancelled due to the destruction of the minio server"
      end
    end
  end

  def cluster
    @cluster ||= minio_server.cluster
  end

  label def start
    nap 5 unless vm.strand.label == "wait"
    minio_server.incr_initial_provisioning
    register_deadline("wait", 10 * 60)
    minio_server.cluster.dns_zone&.insert_record(record_name: cluster.hostname, type: "A", ttl: 10, data: vm.ephemeral_net4.to_s)

    cert, cert_key = create_certificate
    minio_server.update(cert: cert, cert_key: cert_key)

    hop_bootstrap_rhizome
  end

  label def bootstrap_rhizome
    bud Prog::BootstrapRhizome, {"target_folder" => "minio", "subject_id" => vm.id, "user" => "ubi"}
    hop_wait_bootstrap_rhizome
  end

  label def wait_bootstrap_rhizome
    reap(:create_minio_user)
  end

  label def create_minio_user
    begin
      minio_server.vm.sshable.cmd("sudo groupadd -f --system minio-user")
      minio_server.vm.sshable.cmd("sudo useradd --no-create-home --system -g minio-user minio-user")
    rescue => ex
      raise unless ex.message.include?("already exists")
    end
    hop_setup
  end

  label def setup
    bud Prog::Minio::SetupMinio, {}, :mount_data_disks
    bud Prog::Minio::SetupMinio, {}, :install_minio
    bud Prog::Minio::SetupMinio, {}, :configure_minio
    hop_wait_setup
  end

  label def wait_setup
    reap(:wait)
  end

  label def wait
    when_checkup_set? do
      hop_unavailable if !available?
      decr_checkup
    end

    when_reconfigure_set? do
      bud Prog::Minio::SetupMinio, {}, :configure_minio
      hop_wait_reconfigure
    end

    when_restart_set? do
      decr_restart
      # We start the minio server only after the initial provisioning is done
      # for all of the servers in the pool.
      when_initial_provisioning_set? do
        decr_initial_provisioning
      end
      push self.class, frame, "minio_restart"
    end

    if minio_server.certificate_last_checked_at < Time.now - 60 * 60 * 24 * 30 # ~1 month
      hop_refresh_certificates
    end

    nap 10
  end

  label def refresh_certificates
    cert, cert_key = create_certificate
    minio_server.update(cert: cert, cert_key: cert_key, certificate_last_checked_at: Time.now)

    incr_reconfigure
    hop_wait
  end

  label def wait_reconfigure
    decr_reconfigure
    reap(:wait)
  end

  label def minio_restart
    case minio_server.vm.sshable.cmd("common/bin/daemonizer --check restart_minio")
    when "Succeeded"
      minio_server.vm.sshable.cmd("common/bin/daemonizer --clean restart_minio")
      pop "minio server is restarted"
    when "Failed", "NotStarted"
      minio_server.vm.sshable.cmd("common/bin/daemonizer 'systemctl restart minio' restart_minio")
    end

    nap 1
  end

  label def unavailable
    register_deadline("wait", 10 * 60)

    reap(fallthrough: true)
    nap 5 unless strand.children_dataset.where(prog: "Minio::MinioServerNexus", label: "minio_restart").empty?

    if available?
      decr_checkup
      hop_wait
    end

    bud self.class, frame, :minio_restart
    nap 5
  end

  label def destroy
    register_deadline(nil, 10 * 60)
    decr_destroy

    minio_server.cluster.dns_zone&.delete_record(record_name: cluster.hostname, type: "A", data: vm.ephemeral_net4&.to_s)
    minio_server.vm.sshable.destroy
    minio_server.vm.nics.each { it.incr_destroy }
    minio_server.vm.incr_destroy
    minio_server.destroy

    pop "minio server destroyed"
  end

  def available?
    return true if minio_server.initial_provisioning_set?

    server_data = JSON.parse(minio_server.client.admin_info.body)["servers"].find { it["endpoint"] == minio_server.endpoint }
    server_data["state"] == "online" && server_data["drives"].all? { it["state"] == "ok" }
  rescue => ex
    Clog.emit("Minio server is down") { {minio_server_down: {ubid: minio_server.ubid, exception: Util.exception_to_hash(ex)}} }
    false
  end

  def create_certificate
    root_cert = OpenSSL::X509::Certificate.new(minio_server.cluster.root_cert_1)
    root_cert_key = OpenSSL::PKey::EC.new(minio_server.cluster.root_cert_key_1)

    if root_cert.not_after < Time.now + 60 * 60 * 24 * 365 * 1
      root_cert = OpenSSL::X509::Certificate.new(minio_server.cluster.root_cert_2)
      root_cert_key = OpenSSL::PKey::EC.new(minio_server.cluster.root_cert_key_2)
    end

    ip_san = (Config.development? || Config.is_e2e) ? ",IP:#{minio_server.vm.ephemeral_net4}" : nil

    Util.create_certificate(
      subject: "/C=US/O=Ubicloud/CN=#{minio_server.cluster.ubid} Server Certificate",
      extensions: ["subjectAltName=DNS:#{minio_server.cluster.hostname},DNS:#{minio_server.hostname}#{ip_san}", "keyUsage=digitalSignature,keyEncipherment", "subjectKeyIdentifier=hash", "extendedKeyUsage=serverAuth"],
      duration: 60 * 60 * 24 * 30 * 6, # ~6 months
      issuer_cert: root_cert,
      issuer_key: root_cert_key
    ).map(&:to_pem)
  end
end