This change ensures last_pulse is explicitly set after a resource is health-checked. Currently, only load balancers set last_pulse; all other resources leave it as nil. As a result, the retry logic in MonitorableResource#perform_checkup only applies to load balancers, because the condition:

```
@session[:last_pulse]&.<(Time.now - 8)
```

is always false when @session[:last_pulse] is nil.

Because COMMON_SSH_ARGS sets the SSH timeout to 10 seconds, resources that don't swallow exceptions like IOError or Errno::ECONNRESET (primarily VM hosts) produce logs that typically look like this:

```
Sep 24 13:59:29 ubicloud-console app[monitor] info Got new pulse.
Sep 24 14:00:01 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:00:19 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:00:57 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:01:23 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:02:00 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:02:28 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:03:01 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:03:30 ubicloud-console app[monitor] info Pulse checking has failed.
...
```

After restarting monitor, the first health check succeeds. However, the second health check, which runs 30 seconds later, fails because the SSH session has been idle for more than 10 seconds and is now broken. Subsequent health checks continue to fail because we neither retry on broken sessions nor reestablish them.
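For illustration, a minimal sketch of the intended flow, assuming a simplified `perform_checkup`: the `@session[:last_pulse]` condition is the one quoted above, but `@resource`, `@pulse`, and the overall shape are placeholders, not the actual MonitorableResource code.

```ruby
# Simplified sketch, not the real MonitorableResource#perform_checkup.
def perform_checkup
  pulse = @resource.check_pulse(session: @session, previous_pulse: @pulse)
  # The fix described above: record last_pulse for every resource, not just
  # load balancers, so the staleness guard below can ever be true.
  @session[:last_pulse] = Time.now
  pulse
rescue IOError, Errno::ECONNRESET
  # With a nil last_pulse this guard is always false, so the broken session
  # is never rebuilt and every later check fails the same way.
  raise unless @session[:last_pulse]&.<(Time.now - 8)

  # Reestablish the SSH session and try again (a real implementation would
  # bound the retries).
  @session = @resource.init_health_monitor_session
  retry
end
```

With last_pulse written after every successful check, the staleness guard can fire for VM hosts too, and a broken session gets replaced instead of failing every 30 seconds.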
93 lines
3.5 KiB
Ruby
# frozen_string_literal: true

require_relative "../model"

class LoadBalancerVmPort < Sequel::Model
  many_to_one :load_balancer_vm
  many_to_one :load_balancer_port
  plugin ResourceMethods
  include HealthMonitorMethods

  def load_balancer
    load_balancer_port.load_balancer
  end

  def vm
    load_balancer_vm.vm
  end

  # The monitor opens a single SSH session to the VM host and reuses it across checks.
  def init_health_monitor_session
    {
      ssh_session: vm.vm_host.sshable.start_fresh_session
    }
  end

  # One reading per address family, each either "up" or "down".
  def health_check(session:)
    [
      check_probe(session, :ipv4),
      check_probe(session, :ipv6)
    ]
  end

  def check_probe(session, type)
    if type == :ipv4
      return "up" unless load_balancer.ipv4_enabled?
    elsif type == :ipv6
      return "up" unless load_balancer.ipv6_enabled?
    else
      raise "Invalid type: #{type}"
    end

    begin
      (session[:ssh_session].exec!(health_check_cmd(type)).strip == "200") ? "up" : "down"
    rescue IOError, Errno::ECONNRESET
      # Connection-level errors propagate to the caller instead of being reported as "down".
      raise
    rescue => e
      Clog.emit("Exception in LoadBalancerVmPort #{ubid}") { Util.exception_to_hash(e) }
      "down"
    end
  end

  # Builds the probe command, run inside the VM's network namespace:
  # nc for TCP health checks, curl for HTTP(S) health checks.
  def health_check_cmd(type)
    address = (type == :ipv4) ? vm.private_ipv4 : vm.ephemeral_net6.nth(2)
    if load_balancer.health_check_protocol == "tcp"
      "sudo ip netns exec #{vm.inhost_name} nc -z -w #{load_balancer.health_check_timeout} #{address} #{load_balancer_port.dst_port} >/dev/null 2>&1 && echo 200 || echo 400"
    else
      "sudo ip netns exec #{vm.inhost_name} curl --insecure --resolve #{load_balancer.hostname}:#{load_balancer_port.dst_port}:#{(address.version == 6) ? "[#{address}]" : address} --max-time #{load_balancer.health_check_timeout} --silent --output /dev/null --write-out '%{http_code}' #{load_balancer.health_check_url(use_endpoint: true)}"
    end
  end

  # Aggregates the IPv4 and IPv6 probes into a single reading and flips the
  # port state once the configured threshold and interval have passed, unless
  # a load balancer update is already in flight.
  def check_pulse(session:, previous_pulse:)
    reading_ipv4, reading_ipv6 = health_check(session:)
    reading = (reading_ipv4 == "up" && reading_ipv6 == "up") ? "up" : "down"
    pulse = aggregate_readings(previous_pulse:, reading:, data: {ipv4: reading_ipv4, ipv6: reading_ipv6})

    time_passed_health_check_interval = Time.now - pulse[:reading_chg] > load_balancer.health_check_interval

    if state == "up" && pulse[:reading] == "down" && pulse[:reading_rpt] > load_balancer.health_check_down_threshold && time_passed_health_check_interval && !load_balancer.reload.update_load_balancer_set?
      update(state: "down")
      load_balancer.incr_update_load_balancer
    end

    if state == "down" && pulse[:reading] == "up" && pulse[:reading_rpt] > load_balancer.health_check_up_threshold && time_passed_health_check_interval && !load_balancer.reload.update_load_balancer_set?
      update(state: "up")
      load_balancer.incr_update_load_balancer
    end

    pulse
  end
end

# Table: load_balancer_vm_port
# Columns:
#  id                    | uuid                     | PRIMARY KEY
#  load_balancer_vm_id   | uuid                     | NOT NULL
#  load_balancer_port_id | uuid                     | NOT NULL
#  state                 | lb_node_state            | NOT NULL DEFAULT 'down'::lb_node_state
#  last_checked_at       | timestamp with time zone | NOT NULL DEFAULT CURRENT_TIMESTAMP
# Indexes:
#  load_balancer_vm_port_pkey | PRIMARY KEY btree (id)
#  lb_vm_port_unique_index    | UNIQUE btree (load_balancer_port_id, load_balancer_vm_id)
# Foreign key constraints:
#  load_balancer_vm_port_load_balancer_port_id_fkey | (load_balancer_port_id) REFERENCES load_balancer_port(id)
#  load_balancer_vm_port_load_balancer_vm_id_fkey   | (load_balancer_vm_id) REFERENCES load_balancers_vms(id) ON DELETE CASCADE
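
To tie the model back to the change described at the top, here is a hypothetical driver loop. LoadBalancerVmPort, init_health_monitor_session, and check_pulse are the interface defined above; the loop shape, the 30-second sleep, and the explicit last_pulse write are illustrative assumptions rather than the real monitor code.

```ruby
# Hypothetical monitor loop, for illustration only.
port = LoadBalancerVmPort.first              # any record, just for the example
session = port.init_health_monitor_session   # fresh SSH session to the VM host
pulse = nil

loop do
  pulse = port.check_pulse(session:, previous_pulse: pulse)
  session[:last_pulse] = Time.now             # the explicit write this change guarantees
  sleep 30                                    # checks run every 30 seconds per the description above
rescue IOError, Errno::ECONNRESET
  # The session idled past the 10-second SSH timeout; open a new one and continue.
  session = port.init_health_monitor_session
end
```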