mirror of
https://github.com/ubicloud/ubicloud.git
synced 2025-10-04 05:42:15 +08:00
After going through multiple tests of data migration, we would want to test the normal pod restart to make sure garbage cleaning is done and pods can go on with their normal restarts.
294 lines
9.4 KiB
Ruby
294 lines
9.4 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require_relative "../../lib/util"
|
|
|
|
class Prog::Test::Kubernetes < Prog::Test::Base
|
|
semaphore :destroy
|
|
|
|
MIGRATION_TRIES = 3
|
|
|
|
# Creates the test project (with the "install_csi" feature flag enabled) and
# the Kubernetes service project, then creates the strand that drives this
# test, starting at the "start" label.
#
# The initial stack frame carries both project ids and a migration counter
# set to 0. Returns whatever Strand.create returns.
def self.assemble
  test_project = Project.create(name: "Kubernetes-Test-Project", feature_flags: {"install_csi" => true})
  service_project = Project.create_with_id(Config.kubernetes_service_project_id, name: "Ubicloud-Kubernetes-Resources")

  initial_frame = {
    "kubernetes_service_project_id" => service_project.id,
    "kubernetes_test_project_id" => test_project.id,
    "migration_number" => 0
  }

  Strand.create(prog: "Test::Kubernetes", label: "start", stack: [initial_frame])
end
|
|
|
|
# Assembles a cluster with one control plane node and a two-node
# "standard-2" nodepool in the test project, stores the cluster id in the
# stack frame, and hops to update_loadbalancer_hostname.
label def start
  assembled_cluster = Prog::Kubernetes::KubernetesClusterNexus.assemble(
    name: "kubernetes-test-standard",
    project_id: frame["kubernetes_test_project_id"],
    location_id: Location::HETZNER_FSN1_ID,
    version: Option.kubernetes_versions.first,
    cp_node_count: 1
  )
  cluster = assembled_cluster.subject

  Prog::Kubernetes::KubernetesNodepoolNexus.assemble(
    name: "kubernetes-test-standard-nodepool",
    node_count: 2,
    kubernetes_cluster_id: cluster.id,
    target_node_size: "standard-2"
  )

  update_stack({"kubernetes_cluster_id" => cluster.id})
  hop_update_loadbalancer_hostname
end
|
|
|
|
# Naps while the cluster has no API server load balancer yet; once it
# exists, sets the custom hostname used by this test and hops on.
#
# Relies on `nap` not returning to this method (presumably it unwinds the
# label so it is re-entered later).
label def update_loadbalancer_hostname
  load_balancer = kubernetes_cluster.api_server_lb
  nap 5 unless load_balancer

  load_balancer.update(custom_hostname: "k8s-e2e-test.ubicloud.test")
  hop_update_all_nodes_hosts_entries
end
|
|
|
|
# Makes sure every node that currently exists has an /etc/hosts entry for
# the API server hostname, remembering finished nodes in the stack frame so
# they are skipped on later passes.
#
# Hops to wait_for_kubernetes_bootstrap once the number of existing nodes
# equals the configured control plane + nodepool node count; otherwise naps
# and tries again.
label def update_all_nodes_hosts_entries
  expected_node_count = kubernetes_cluster.cp_node_count + nodepool.node_count
  current_nodes = kubernetes_cluster.nodes + nodepool.nodes

  current_nodes.each do |node|
    next if node_host_entries_set?(node.name)

    # The hosts entry is written over SSH, so the VM has to answer first.
    nap 5 unless vm_ready?(node.vm)
    ensure_hosts_entry(node.sshable, kubernetes_cluster.api_server_lb.hostname)
    set_node_entries_status(node.name)
  end

  hop_wait_for_kubernetes_bootstrap if current_nodes.count == expected_node_count
  nap 10
end
|
|
|
|
# Polls until the cluster's own strand has reached its "wait" label, then
# hops to test_nodes.
label def wait_for_kubernetes_bootstrap
  cluster_label = kubernetes_cluster.strand.label
  hop_test_nodes if cluster_label == "wait"

  nap 10
end
|
|
|
|
# Runs `kubectl get nodes` and checks that the name of every node of the
# cluster appears in the output.
#
# On a kubectl failure or a missing node, records a fail message in the
# stack frame and hops to destroy_kubernetes; otherwise hops to test_csi.
label def test_nodes
  begin
    node_listing = kubernetes_cluster.client.kubectl("get nodes")
  rescue RuntimeError => ex
    update_stack({"fail_message" => "Failed to run test kubectl command: #{ex.message}"})
    hop_destroy_kubernetes
  end

  absent_nodes = kubernetes_cluster.all_nodes.map(&:name).reject { |name| node_listing.include?(name) }
  unless absent_nodes.empty?
    update_stack({"fail_message" => "node #{absent_nodes.join(", ")} not found in cluster"})
    hop_destroy_kubernetes
  end

  hop_test_csi
end
|
|
|
|
# Applies a single-replica StatefulSet whose pod mounts a 1Gi volume from
# the "ubicloud-standard" storage class at /etc/data, then hops to
# wait_for_statefulset.
#
# The manifest is piped to kubectl over the cluster's sshable via stdin.
label def test_csi
  # Squiggly heredoc: the common leading indentation is stripped, the
  # relative YAML nesting (which is significant) is preserved.
  sts = <<~STS
    apiVersion: apps/v1
    kind: StatefulSet
    metadata:
      name: ubuntu-statefulset
    spec:
      serviceName: ubuntu
      replicas: 1
      selector:
        matchLabels: { app: ubuntu }
      template:
        metadata:
          labels: { app: ubuntu }
        spec:
          containers:
            - name: ubuntu
              image: ubuntu:24.04
              command: ["/bin/sh", "-c", "sleep infinity"]
              volumeMounts:
                - { name: data-volume, mountPath: /etc/data }
      volumeClaimTemplates:
        - metadata:
            name: data-volume
          spec:
            accessModes: [ "ReadWriteOnce" ]
            resources:
              requests: { storage: 1Gi }
            storageClassName: ubicloud-standard
  STS

  kubernetes_cluster.sshable.cmd("sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f -", stdin: sts)
  hop_wait_for_statefulset
end
|
|
|
|
# Polls the phase of pod ubuntu-statefulset-0 and hops to test_lsblk once
# it is "Running".
label def wait_for_statefulset
  # Named `phase` so the local does not shadow the pod_status helper.
  phase = kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.status.phase}").strip
  nap 5 if phase != "Running"

  hop_test_lsblk
end
|
|
|
|
# Verifies the /etc/data mount inside the pod. Any error raised by
# verify_mount becomes the fail message and the test hops to
# destroy_kubernetes; otherwise it hops to test_data_write.
label def test_lsblk
  # The rescue is kept scoped to verify_mount only, so it can never
  # intercept whatever the hop_* calls below use to leave the label.
  failure = begin
    verify_mount
    nil
  rescue => e
    e.message
  end

  if failure
    update_stack({"fail_message" => failure})
    hop_destroy_kubernetes
  end

  hop_test_data_write
end
|
|
|
|
# Writes 200M of random data to /etc/data/random-data inside the pod while
# hashing the stream, then hashes the file as read back from the volume.
#
# A mismatch is recorded as the fail message and the test hops to
# destroy_kubernetes. Otherwise the hash is saved under "read_hash" in the
# stack frame and the test hops to test_pod_data_migration.
label def test_data_write
  client = kubernetes_cluster.client
  write_hash = client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"head -c 200M /dev/urandom | tee /etc/data/random-data | sha256sum | awk '{print \\$1}'\"").strip
  read_hash = client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"sha256sum /etc/data/random-data | awk '{print \\$1}'\"").strip

  unless write_hash == read_hash
    update_stack({"fail_message" => "wrong read hash, expected: #{write_hash}, got: #{read_hash}"})
    hop_destroy_kubernetes
  end

  update_stack({"read_hash" => read_hash})
  hop_test_pod_data_migration
end
|
|
|
|
# Cordons the node currently running ubuntu-statefulset-0, uncordons every
# other nodepool node, and deletes the pod without waiting, then hops to
# verify_data_after_migration.
label def test_pod_data_migration
  client = kubernetes_cluster.client
  pod_node = client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip
  client.kubectl("cordon #{pod_node}")

  # The other nodes must be uncordoned on every round, otherwise we would
  # run out of nodes that still accept pods.
  nodepool.nodes.each do |node|
    next if node.name == pod_node

    client.kubectl("uncordon #{node.name}")
  end

  client.kubectl("delete pod ubuntu-statefulset-0 --wait=false")
  hop_verify_data_after_migration
end
|
|
|
|
# Once the pod reports "Running" again, re-hashes /etc/data/random-data and
# compares it with the "read_hash" stored in the stack frame.
#
# A mismatch is recorded as the fail message and the test hops to
# destroy_kubernetes. Otherwise the test either moves on to
# test_normal_pod_restart (counter equals MIGRATION_TRIES) or bumps the
# counter and starts another migration round.
#
# NOTE(review): the counter starts at 0 and is compared before it is
# incremented, so the migrate/verify cycle runs MIGRATION_TRIES + 1 times
# before moving on -- confirm that this is intended.
label def verify_data_after_migration
  nap 5 unless pod_status == "Running"

  actual_hash = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"sha256sum /etc/data/random-data | awk '{print \\$1}'\"").strip
  stored_hash = strand.stack.first["read_hash"]

  unless actual_hash == stored_hash
    update_stack({"fail_message" => "data hash changed after migration, expected: #{stored_hash}, got: #{actual_hash}"})
    hop_destroy_kubernetes
  end

  if migration_number == MIGRATION_TRIES
    hop_test_normal_pod_restart
  end

  increment_migration_number
  hop_test_pod_data_migration
end
|
|
|
|
# Remembers which node currently runs ubuntu-statefulset-0 (stack key
# "normal_pod_restart_test_node"), deletes the pod without waiting, and
# hops to verify_normal_pod_restart.
label def test_normal_pod_restart
  client = kubernetes_cluster.client
  node_before_restart = client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip

  update_stack({"normal_pod_restart_test_node" => node_before_restart})
  client.kubectl("delete pod ubuntu-statefulset-0 --wait=false")

  hop_verify_normal_pod_restart
end
|
|
|
|
# Once the pod reports "Running" again, checks that it came back on the
# node recorded before the restart and that /etc/data is still mounted
# correctly.
#
# Either failure is stored as the fail message; in every case the test
# ends by hopping to destroy_kubernetes.
label def verify_normal_pod_restart
  nap 5 unless pod_status == "Running"

  node_after_restart = kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip
  node_before_restart = strand.stack.first["normal_pod_restart_test_node"]

  unless node_after_restart == node_before_restart
    update_stack({"fail_message" => "unexpected pod node change after restart, expected: #{node_before_restart}, got: #{node_after_restart}"})
    hop_destroy_kubernetes
  end

  # Rescue only around verify_mount so the final hop is never intercepted.
  mount_error = begin
    verify_mount
    nil
  rescue => e
    e
  end
  update_stack({"fail_message" => mount_error.message}) if mount_error

  hop_destroy_kubernetes
end
|
|
|
|
# Signals the cluster's destroy semaphore and hops to destroy.
label def destroy_kubernetes
  cluster = kubernetes_cluster
  cluster.incr_destroy

  hop_destroy
end
|
|
|
|
# Naps while the cluster record can still be loaded, then destroys the test
# project and finishes the test: fails it when a fail message was recorded
# in the stack frame, otherwise pops with a success message.
label def destroy
  nap 5 if kubernetes_cluster

  kubernetes_test_project.destroy

  failure = frame["fail_message"]
  fail_test(failure) if failure

  pop "Kubernetes tests are finished!"
end
|
|
|
|
# Label for a failed test: does nothing except nap.
label def failed
  nap(15)
end
|
|
|
|
# Appends "<cluster sshable host> <api_hostname>" to /etc/hosts on the
# machine behind +sshable+ unless that exact line is already in the file.
#
# sshable      - object whose #cmd runs a shell command on the target
# api_hostname - hostname that should resolve to the cluster's sshable host
def ensure_hosts_entry(sshable, api_hostname)
  host_line = "#{kubernetes_cluster.sshable.host} #{api_hostname}"
  return if sshable.cmd("cat /etc/hosts").include?(host_line)

  sshable.cmd("echo #{host_line.shellescape} | sudo tee -a /etc/hosts > /dev/null")
end
|
|
|
|
# Returns true when +vm+ is present and running "uptime" through its
# sshable succeeds; returns false when +vm+ is nil/false or the command
# raises a StandardError.
def vm_ready?(vm)
  return false unless vm

  begin
    vm.sshable.cmd("uptime")
    true
  rescue
    # Deliberate best-effort probe: any failure just means "not ready yet".
    false
  end
end
|
|
|
|
# Memoized lookup of the test project by the id stored in the stack frame.
def kubernetes_test_project
  return @kubernetes_test_project if @kubernetes_test_project

  @kubernetes_test_project = Project.with_pk(frame["kubernetes_test_project_id"])
end
|
|
|
|
# Memoized lookup of the cluster under test by the id stored in the stack
# frame. A falsy lookup result is not cached, so it is retried on each call.
def kubernetes_cluster
  return @kubernetes_cluster if @kubernetes_cluster

  @kubernetes_cluster = KubernetesCluster.with_pk(frame["kubernetes_cluster_id"])
end
|
|
|
|
# The first nodepool of the cluster under test.
def nodepool
  pools = kubernetes_cluster.nodepools
  pools.first
end
|
|
|
|
# True only when the stack frame's "nodes_status" hash holds exactly `true`
# for +node_name+.
def node_host_entries_set?(node_name)
  recorded_status = strand.stack.first.dig("nodes_status", node_name)
  recorded_status == true
end
|
|
|
|
# Records in the stack frame's "nodes_status" hash that +node_name+ has had
# its hosts entry written.
#
# Only the changed key is handed to update_stack (as
# increment_migration_number does); the persisted frame is not mutated
# directly and no local shadows the `frame` accessor.
def set_node_entries_status(node_name)
  recorded = strand.stack.first["nodes_status"] || {}
  update_stack({"nodes_status" => recorded.merge(node_name => true)})
end
|
|
|
|
# Current value of the "migration_number" counter in the stack frame.
def migration_number
  current_frame = strand.stack.first
  current_frame["migration_number"]
end
|
|
|
|
# Stores migration_number + 1 back into the stack frame.
def increment_migration_number
  next_number = migration_number + 1
  update_stack({"migration_number" => next_number})
end
|
|
|
|
# Runs lsblk inside ubuntu-statefulset-0 and checks that /etc/data is backed
# by a 1G loop device.
#
# Returns nil on success.
# Raises RuntimeError when no lsblk row mentions /etc/data (including when
# the output is empty) or when that row does not describe a "loop*" device
# of size "1G" mounted at "/etc/data".
def verify_mount
  lsblk_output = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- lsblk")

  # drop(1) skips the header row; unlike [1..] it yields [] (not nil) when
  # the output is empty, so the "not found" error below is still raised.
  rows = lsblk_output.split("\n").drop(1)
  data_mount = rows.find { |row| row.include?("/etc/data") }
  raise "No /etc/data mount found in lsblk output" unless data_mount

  # Columns by position: 0 = device name (e.g. "loop3"), 3 = size
  # (e.g. "1G"), 6 = mountpoint (e.g. "/etc/data").
  device_name, size, mountpoint = data_mount.split.values_at(0, 3, 6)
  mounted_correctly = device_name.start_with?("loop") && size == "1G" && mountpoint == "/etc/data"
  raise "/etc/data is mounted incorrectly: #{data_mount}" unless mounted_correctly
end
|
|
|
|
# We deliberately do not extract the status with jsonpath: even while a pod
# is terminating, its phase from the API server's point of view is still
# Running. Detecting that with jsonpath would require checking the deletion
# timestamp, all conditions in the status, and .status.phase.
# To keep the query simple we let kubectl do the processing and observe the
# system through the eyes of a customer, which also keeps the logic simpler.
#
# Returns the stripped third column of the `kubectl get pods` row for
# ubuntu-statefulset-0.
def pod_status
  status_column = kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 | grep -v NAME | awk '{print $3}'")
  status_column.strip
end
|
|
end
|