# ubicloud/prog/test/kubernetes.rb
# frozen_string_literal: true

require_relative "../../lib/util"

class Prog::Test::Kubernetes < Prog::Test::Base
  semaphore :destroy

  MIGRATION_TRIES = 3

  def self.assemble
    kubernetes_test_project = Project.create(name: "Kubernetes-Test-Project", feature_flags: {"install_csi" => true})
    kubernetes_service_project = Project.create_with_id(Config.kubernetes_service_project_id, name: "Ubicloud-Kubernetes-Resources")

    Strand.create(
      prog: "Test::Kubernetes",
      label: "start",
      stack: [{
        "kubernetes_service_project_id" => kubernetes_service_project.id,
        "kubernetes_test_project_id" => kubernetes_test_project.id,
        "migration_number" => 0
      }]
    )
  end

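  # Provision a single control-plane test cluster with a two-node worker
  # nodepool, and record the cluster id on the stack for the later steps.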
  label def start
    kc = Prog::Kubernetes::KubernetesClusterNexus.assemble(
      name: "kubernetes-test-standard",
      project_id: frame["kubernetes_test_project_id"],
      location_id: Location::HETZNER_FSN1_ID,
      version: Option.kubernetes_versions.first,
      cp_node_count: 1
    ).subject

    Prog::Kubernetes::KubernetesNodepoolNexus.assemble(
      name: "kubernetes-test-standard-nodepool",
      node_count: 2,
      kubernetes_cluster_id: kc.id,
      target_node_size: "standard-2"
    )

    update_stack({"kubernetes_cluster_id" => kc.id})
    hop_update_loadbalancer_hostname
  end

  label def update_loadbalancer_hostname
    nap 5 unless kubernetes_cluster.api_server_lb
    kubernetes_cluster.api_server_lb.update(custom_hostname: "k8s-e2e-test.ubicloud.test")
    hop_update_all_nodes_hosts_entries
  end

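  # The API server load balancer gets a custom hostname, so each node needs a
  # static /etc/hosts entry pointing that hostname at the cluster's control
  # plane address.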
  label def update_all_nodes_hosts_entries
    expected_node_count = kubernetes_cluster.cp_node_count + nodepool.node_count
    current_nodes = kubernetes_cluster.nodes + nodepool.nodes
    current_node_count = current_nodes.count

    current_nodes.each { |node|
      unless node_host_entries_set?(node.name)
        nap 5 unless vm_ready?(node.vm)
        ensure_hosts_entry(node.sshable, kubernetes_cluster.api_server_lb.hostname)
        set_node_entries_status(node.name)
      end
    }

    hop_wait_for_kubernetes_bootstrap if current_node_count == expected_node_count
    nap 10
  end

  label def wait_for_kubernetes_bootstrap
    hop_test_nodes if kubernetes_cluster.strand.label == "wait"
    nap 10
  end

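  # Verify that every node known to the cluster shows up in "kubectl get nodes".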
  label def test_nodes
    begin
      nodes_output = kubernetes_cluster.client.kubectl("get nodes")
    rescue RuntimeError => ex
      update_stack({"fail_message" => "Failed to run test kubectl command: #{ex.message}"})
      hop_destroy_kubernetes
    end

    missing_nodes = []
    kubernetes_cluster.all_nodes.each { |node|
      missing_nodes.append(node.name) unless nodes_output.include?(node.name)
    }

    if missing_nodes.any?
      update_stack({"fail_message" => "node #{missing_nodes.join(", ")} not found in cluster"})
      hop_destroy_kubernetes
    end

    hop_test_csi
  end

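  # Exercise the CSI driver: deploy a single-replica StatefulSet whose
  # volumeClaimTemplate requests a 1Gi volume from the ubicloud-standard
  # storage class.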
  label def test_csi
    sts = <<STS
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ubuntu-statefulset
spec:
  serviceName: ubuntu
  replicas: 1
  selector:
    matchLabels: { app: ubuntu }
  template:
    metadata:
      labels: { app: ubuntu }
    spec:
      containers:
        - name: ubuntu
          image: ubuntu:24.04
          command: ["/bin/sh", "-c", "sleep infinity"]
          volumeMounts:
            - { name: data-volume, mountPath: /etc/data }
  volumeClaimTemplates:
    - metadata:
        name: data-volume
      spec:
        accessModes: [ "ReadWriteOnce" ]
        resources:
          requests: { storage: 1Gi }
        storageClassName: ubicloud-standard
STS

    kubernetes_cluster.sshable.cmd("sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f -", stdin: sts)
    hop_wait_for_statefulset
  end

  label def wait_for_statefulset
    pod_status = kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.status.phase}").strip
    nap 5 unless pod_status == "Running"
    hop_test_lsblk
  end

  label def test_lsblk
    begin
      verify_mount
    rescue => e
      update_stack({"fail_message" => e.message})
      hop_destroy_kubernetes
    end

    hop_test_data_write
  end

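  # Write 200M of random data to the CSI-backed volume and record its sha256,
  # then compare it against the hash of the data read back from the volume.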
  label def test_data_write
    write_hash = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"head -c 200M /dev/urandom | tee /etc/data/random-data | sha256sum | awk '{print \\$1}'\"").strip
    read_hash = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"sha256sum /etc/data/random-data | awk '{print \\$1}'\"").strip

    if write_hash != read_hash
      update_stack({"fail_message" => "wrong read hash, expected: #{write_hash}, got: #{read_hash}"})
      hop_destroy_kubernetes
    end

    update_stack({"read_hash" => read_hash})
    hop_test_pod_data_migration
  end

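  # Cordon the node the pod is currently on and delete the pod, forcing the
  # scheduler to place it on a different node; the CSI driver has to move the
  # volume data along with it.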
  label def test_pod_data_migration
    client = kubernetes_cluster.client
    pod_node = client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip
    client.kubectl("cordon #{pod_node}")

    # Uncordon the other nodes each time so we don't run out of nodes that accept pods.
    nodepool.nodes.reject { it.name == pod_node }.each { |node|
      client.kubectl("uncordon #{node.name}")
    }

    client.kubectl("delete pod ubuntu-statefulset-0 --wait=false")
    hop_verify_data_after_migration
  end

  label def verify_data_after_migration
    nap 5 unless pod_status == "Running"

    new_hash = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- sh -c \"sha256sum /etc/data/random-data | awk '{print \\$1}'\"").strip
    expected_hash = strand.stack.first["read_hash"]

    if new_hash != expected_hash
      update_stack({"fail_message" => "data hash changed after migration, expected: #{expected_hash}, got: #{new_hash}"})
      hop_destroy_kubernetes
    end

    hop_test_normal_pod_restart if migration_number == MIGRATION_TRIES
    increment_migration_number
    hop_test_pod_data_migration
  end

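  # Restart the pod without cordoning anything: it should come back on the
  # same node and its /etc/data mount should still be intact.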
  label def test_normal_pod_restart
    client = kubernetes_cluster.client
    pod_node = client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip
    update_stack({"normal_pod_restart_test_node" => pod_node})
    client.kubectl("delete pod ubuntu-statefulset-0 --wait=false")
    hop_verify_normal_pod_restart
  end

  label def verify_normal_pod_restart
    nap 5 unless pod_status == "Running"

    pod_node = kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 -ojsonpath={.spec.nodeName}").strip
    expected_pod_node = strand.stack.first["normal_pod_restart_test_node"]

    if pod_node != expected_pod_node
      update_stack({"fail_message" => "unexpected pod node change after restart, expected: #{expected_pod_node}, got: #{pod_node}"})
      hop_destroy_kubernetes
    end

    begin
      verify_mount
    rescue => e
      update_stack({"fail_message" => e.message})
    end

    hop_destroy_kubernetes
  end

  label def destroy_kubernetes
    kubernetes_cluster.incr_destroy
    hop_destroy
  end

  label def destroy
    nap 5 if kubernetes_cluster
    kubernetes_test_project.destroy
    fail_test(frame["fail_message"]) if frame["fail_message"]
    pop "Kubernetes tests are finished!"
  end

  label def failed
    nap 15
  end

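  # Append a hosts entry mapping the API server hostname to the cluster's host
  # address, unless an identical line is already present.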
  def ensure_hosts_entry(sshable, api_hostname)
    host_line = "#{kubernetes_cluster.sshable.host} #{api_hostname}"
    output = sshable.cmd("cat /etc/hosts")
    unless output.include?(host_line)
      sshable.cmd("echo #{host_line.shellescape} | sudo tee -a /etc/hosts > /dev/null")
    end
  end

  def vm_ready?(vm)
    return false unless vm
    vm.sshable.cmd("uptime")
    true
  rescue
    false
  end

  def kubernetes_test_project
    @kubernetes_test_project ||= Project.with_pk(frame["kubernetes_test_project_id"])
  end

  def kubernetes_cluster
    @kubernetes_cluster ||= KubernetesCluster.with_pk(frame["kubernetes_cluster_id"])
  end

  def nodepool
    kubernetes_cluster.nodepools.first
  end

  def node_host_entries_set?(node_name)
    strand.stack.first.dig("nodes_status", node_name) == true
  end

  def set_node_entries_status(node_name)
    frame = strand.stack.first
    frame["nodes_status"] ||= {}
    frame["nodes_status"][node_name] = true
    update_stack(frame)
  end

  def migration_number
    strand.stack.first["migration_number"]
  end

  def increment_migration_number
    update_stack({"migration_number" => migration_number + 1})
  end

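  # Check the lsblk output inside the pod: /etc/data must be mounted from a 1G
  # loop device (NAME, SIZE and MOUNTPOINTS columns of the default lsblk layout).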
  def verify_mount
    lsblk_output = kubernetes_cluster.client.kubectl("exec -t ubuntu-statefulset-0 -- lsblk")
    lines = lsblk_output.split("\n")[1..]
    data_mount = lines.find { |line| line.include?("/etc/data") }
    raise "No /etc/data mount found in lsblk output" unless data_mount

    cols = data_mount.split
    device_name = cols[0] # e.g. "loop3"
    size = cols[3]        # e.g. "1G"
    mountpoint = cols[6]  # e.g. "/etc/data"

    unless device_name.start_with?("loop") && size == "1G" && mountpoint == "/etc/data"
      raise "/etc/data is mounted incorrectly: #{data_mount}"
    end
  end

  # We don't extract the status with jsonpath because a terminating pod still
  # reports a phase of Running from the API server's point of view; detecting
  # that with jsonpath would require checking the deletion timestamp, all the
  # status conditions, and .status.phase. To keep the query simple, we let
  # kubectl do the processing and observe the system the way a customer would,
  # which also keeps the logic simpler.
  def pod_status
    kubernetes_cluster.client.kubectl("get pods ubuntu-statefulset-0 | grep -v NAME | awk '{print $3}'").strip
  end
end