ubicloud/prog/kubernetes/upgrade_kubernetes_node.rb
mohi-kalantari 3eafd0e0ec Restart node drain on failure
kubectl drain node_name is an idempotent command, which allows us to
retry it easily.

During the upgrade, the etcd leader might change because a new node has just
joined the cluster, so the control plane nodes are not yet stable. That can
easily disrupt the drain command, but we keep retrying until the node is
drained.

Since a 1-hour deadline is set on the node drain, we know we will get paged
and the drain won't be restarted indefinitely.
2025-05-22 11:55:47 +02:00
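
As a rough illustration of the retry-until-deadline behavior described in the
commit message, here is a minimal standalone sketch (hypothetical helper and
constant names, not the prog's actual mechanism): it re-runs an idempotent
kubectl drain until it succeeds or an hour passes. In the prog below, the
drain instead runs as a daemonized drain_node unit on a control plane VM, and
the 1-hour limit comes from register_deadline, which pages the operators
rather than returning.

# Hypothetical sketch only; assumes kubectl is on PATH.
require "open3"

DRAIN_DEADLINE = 60 * 60 # one hour, mirroring the deadline in drain_old_node

def drain_with_retries(node_name, kubeconfig: "/etc/kubernetes/admin.conf")
  started_at = Time.now
  loop do
    # kubectl drain is idempotent, so re-running it after a failure is safe.
    _out, err, status = Open3.capture3(
      "kubectl", "--kubeconfig=#{kubeconfig}", "drain", node_name,
      "--ignore-daemonsets", "--delete-emptydir-data"
    )
    return true if status.success?
    # Past the one-hour deadline, give up; the real prog pages instead.
    return false if Time.now - started_at > DRAIN_DEADLINE
    warn "drain of #{node_name} failed (#{err.strip}); retrying in 10s"
    sleep 10
  end
end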


# frozen_string_literal: true

class Prog::Kubernetes::UpgradeKubernetesNode < Prog::Base
  subject_is :kubernetes_cluster

  def old_vm
    @old_vm ||= Vm[frame.fetch("old_vm_id")]
  end

  def new_vm
    @new_vm ||= Vm[frame.fetch("new_vm_id")]
  end

  def kubernetes_nodepool
    @kubernetes_nodepool ||= KubernetesNodepool[frame.fetch("nodepool_id", nil)]
  end
  # Cancel the upgrade if the cluster itself is being destroyed, letting any
  # child strands finish first.
  def before_run
    if kubernetes_cluster.strand.label == "destroy" && strand.label != "destroy"
      reap
      donate unless leaf?
      pop "upgrade cancelled"
    end
  end
  label def start
    new_frame = if kubernetes_nodepool
      {"nodepool_id" => kubernetes_nodepool.id}
    else
      {}
    end
    # Provision the replacement node before touching the old one.
    bud Prog::Kubernetes::ProvisionKubernetesNode, new_frame
    hop_wait_new_node
  end
  label def wait_new_node
    res = reap
    donate if res.empty?

    # Record the freshly provisioned VM's id in our frame so later labels can
    # look it up via new_vm.
    current_frame = strand.stack.first
    current_frame["new_vm_id"] = res.first.exitval.fetch("vm_id")
    strand.modified!(:stack)

    hop_drain_old_node
  end
  label def drain_old_node
    register_deadline("remove_old_node_from_cluster", 60 * 60)

    vm = kubernetes_cluster.cp_vms.last
    case vm.sshable.d_check("drain_node")
    when "Succeeded"
      hop_remove_old_node_from_cluster
    when "NotStarted"
      vm.sshable.d_run("drain_node", "sudo", "kubectl", "--kubeconfig=/etc/kubernetes/admin.conf",
        "drain", old_vm.name, "--ignore-daemonsets", "--delete-emptydir-data")
      nap 10
    when "InProgress"
      nap 10
    when "Failed"
      # kubectl drain is idempotent, so a drain disrupted by e.g. an etcd
      # leader change is simply restarted.
      vm.sshable.d_restart("drain_node")
      nap 10
    end

    nap 60 * 60
  end
  label def remove_old_node_from_cluster
    if kubernetes_nodepool
      kubernetes_nodepool.remove_vm(old_vm)
    else
      kubernetes_cluster.remove_cp_vm(old_vm)
      kubernetes_cluster.api_server_lb.detach_vm(old_vm)
    end

    # kubeadm reset is necessary for etcd member removal; deleting the node
    # object alone does not remove it from the etcd membership, which hurts
    # etcd cluster health.
    old_vm.sshable.cmd("sudo kubeadm reset --force")
    hop_delete_node_object
  end
  label def delete_node_object
    res = kubernetes_cluster.client.delete_node(old_vm.name)
    fail "delete node object failed: #{res}" unless res.exitstatus.zero?

    hop_destroy_node
  end
  label def destroy_node
    old_vm.incr_destroy
    pop "upgraded node"
  end
end