Upgrades (including control plane VM replacements) caused disruptions in
pod-to-service communication. Symptoms included CoreDNS failing to reach the
API server ("no route to host"), DNS resolution failures in pods (e.g.,
connection refused to kube-dns at 10.96.0.10), REJECT rules appearing in
iptables, and broader service access issues. Pod-to-pod and pod-to-host
traffic were unaffected, pointing to a service endpoint problem.
Root cause: The kubeadm-config ConfigMap set apiServer.extraArgs.advertise-
address to a static IP (e.g., the initial control plane IP). During upgrades,
this IP became outdated as new VMs received new IPs, but the config wasn't
updated. This led to:
- kube-apiserver advertising the old IP
- The default/kubernetes service’s Endpoints/EndpointSlice being recreated with
the wrong backend IP
- kube-proxy DNAT rules routing traffic (e.g., to 10.96.0.1:443) to the
unreachable old IP (sketched after this list)
- Circular dependency: CoreDNS couldn’t sync with the API server, preventing
readiness and worsening DNS issues.
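Schematically, the broken forwarding path looked like this (addresses are
illustrative placeholders):

  pod -> 10.96.0.1:443 (ClusterIP of the default/kubernetes service)
      -> kube-proxy DNAT -> <old control plane IP>:6443 (unreachable after the VM replacement)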
Solution: Remove the advertise-address arg entirely from kubeadm-config. This
lets kube-apiserver auto-detect and advertise the node’s primary interface IP
(default behavior per Kubernetes docs). On upgrade:
- New control plane VMs advertise their current IP
- Endpoints/EndpointSlice update automatically during manifest regeneration or
upgrade apply
This fix applies universally:
- Single-node: Prevents total disruption from IP changes
- Multi-node (HA): Each control plane node advertises its own IP;
Endpoints include all nodes for failover
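For illustration, a minimal sketch of the change in the apiServer extraArgs
built by the provisioning script below (the removed entry is reconstructed
from the description above; the IP is a placeholder):

# Before (hypothetical): a static advertise-address baked into kubeadm-config
"extraArgs" => [
  {"name" => "bind-address", "value" => "::"},
  {"name" => "advertise-address", "value" => "203.0.113.10"} # stale once the VM is replaced
]

# After: only bind-address; kube-apiserver auto-detects the IP to advertise
"extraArgs" => [
  {"name" => "bind-address", "value" => "::"}
]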
#!/bin/env ruby
# frozen_string_literal: true

require "json"
require "yaml"
require_relative "../../common/lib/util"

params = JSON.parse($stdin.read)

begin
  cluster_name = params.fetch("cluster_name")
  lb_hostname = params.fetch("lb_hostname")
  port = params.fetch("port")
  private_subnet_cidr4 = params.fetch("private_subnet_cidr4")
  private_subnet_cidr6 = params.fetch("private_subnet_cidr6")
  node_name = params.fetch("node_name")
  node_ipv4 = params.fetch("node_ipv4")
  node_ipv6 = params.fetch("node_ipv6")
  service_subnet_cidr6 = params.fetch("service_subnet_cidr6")
rescue KeyError => e
  puts "Needed #{e.key} in parameters"
  exit 1
end
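
# A hypothetical example of the JSON document expected on stdin (all values
# are placeholders, not taken from a real cluster):
#   {
#     "cluster_name": "my-cluster",
#     "lb_hostname": "my-cluster.lb.example.com",
#     "port": 443,
#     "private_subnet_cidr4": "10.220.0.0/16",
#     "private_subnet_cidr6": "fd00:10::/64",
#     "node_name": "control-plane-1",
#     "node_ipv4": "10.220.0.4",
#     "node_ipv6": "fd00:10::4",
#     "service_subnet_cidr6": "fd00:20::/108"
#   }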

service_account_name = "k8s-access"
secret_name = service_account_name

# kubeadm InitConfiguration: register this node under its given name with a
# dual-stack (IPv4,IPv6) node IP.
init_config = {
  "apiVersion" => "kubeadm.k8s.io/v1beta4",
  "kind" => "InitConfiguration",
  "nodeRegistration" => {
    "name" => node_name,
    "kubeletExtraArgs" => [
      {
        "name" => "node-ip",
        "value" => "#{node_ipv4},#{node_ipv6}"
      }
    ]
  }
}

# kubeadm ClusterConfiguration. apiServer.extraArgs deliberately omits
# advertise-address (see the fix description above): kube-apiserver then
# auto-detects and advertises the node's primary interface IP, so a
# replacement control plane VM advertises its current IP after an upgrade.
cluster_config = {
  "apiVersion" => "kubeadm.k8s.io/v1beta4",
  "kind" => "ClusterConfiguration",
  "clusterName" => cluster_name,
  "controlPlaneEndpoint" => "#{lb_hostname}:#{port}",
  "apiServer" => {
    "certSANs" => [lb_hostname],
    "extraArgs" => [
      {
        "name" => "bind-address",
        "value" => "::"
      }
    ]
  },
  "networking" => {
    "podSubnet" => "#{private_subnet_cidr4},#{private_subnet_cidr6}",
    "serviceSubnet" => "10.96.0.0/12,#{service_subnet_cidr6}"
  },
  "controllerManager" => {
    "extraArgs" => [
      {
        "name" => "allocate-node-cidrs",
        "value" => "false"
      }
    ]
  },
  "etcd" => {
    "local" => {
      "dataDir" => "/var/lib/etcd"
    }
  }
}
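
# For reference, the apiServer section above renders to YAML roughly like
# this (a sketch, not output captured from a run):
#   apiServer:
#     certSANs:
#     - <lb_hostname>
#     extraArgs:
#     - name: bind-address
#       value: "::"
# Note the absence of an advertise-address entry.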

# KubeletConfiguration: have kubelets request serving certificates from the
# cluster CA (serverTLSBootstrap) instead of self-signing.
kubelet_config = {
  "apiVersion" => "kubelet.config.k8s.io/v1beta1",
  "kind" => "KubeletConfiguration",
  "serverTLSBootstrap" => true
}

# Write the three configs into one file as consecutive YAML documents.
config_path = "/tmp/kubeadm-config.yaml"
File.open(config_path, "w") do |file|
  file.write(init_config.to_yaml)
  file.write("---\n")
  file.write(cluster_config.to_yaml)
  file.write("---\n")
  file.write(kubelet_config.to_yaml)
end

r("sudo kubeadm init --config #{config_path} --node-name #{node_name}")
r("sudo /home/ubi/kubernetes/bin/setup-cni")

# Poll /healthz until the API server responds; r raises CommandFail on
# failure, so each failed probe sleeps and retries, up to 5 attempts.
api_server_up = false
5.times do
  r("kubectl --kubeconfig=/etc/kubernetes/admin.conf get --raw='/healthz'")
  api_server_up = true
  break
rescue CommandFail
  puts "API server is not up yet, retrying in 5 seconds..."
  sleep 5
end

unless api_server_up
  puts "API server is not healthy. Could not create customer credentials."
  exit 1
end

# Create the customer-facing service account with cluster-admin rights and a
# long-lived token Secret bound to it.
r "kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system create serviceaccount #{service_account_name}"
r "kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system create clusterrolebinding #{service_account_name}-binding --clusterrole=cluster-admin --serviceaccount=kube-system:#{service_account_name}"
r "kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f - <<EOF
apiVersion: v1
kind: Secret
metadata:
  name: #{secret_name}
  namespace: kube-system
  annotations:
    kubernetes.io/service-account.name: #{service_account_name}
type: kubernetes.io/service-account-token
EOF
"
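
# For reference (not part of this script), the token created above could be
# read back with standard kubectl, e.g.:
#   kubectl -n kube-system get secret k8s-access -o jsonpath='{.data.token}' | base64 -d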