ubicloud/model/load_balancer_vm_port.rb
Benjamin Satzger d261f3e5d2 Set last pulse of monitorable resources
This change ensures last_pulse is explicitly set after a resource is
health-checked. Currently, only load balancers set last_pulse; all other
resources leave it as nil. As a result, the retry logic in
MonitorableResource#perform_checkup only applies to load balancers,
because the condition:
```
@session[:last_pulse]&.<(Time.now - 8)
```
is always false when @session[:last_pulse] is nil.
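
For illustration, the safe-navigation operator returns nil when its receiver is nil, and nil is falsy, so the staleness check can never fire for a resource that never sets last_pulse (a minimal standalone sketch, not code from the repository):

```
last_pulse = nil
last_pulse&.<(Time.now - 8)   # => nil (falsy): the retry path is never taken

last_pulse = Time.now - 60
last_pulse&.<(Time.now - 8)   # => true: the pulse is stale, the retry path applies
```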

Because COMMON_SSH_ARGS sets the SSH timeout to 10 seconds, resources
that don't swallow exceptions such as IOError or Errno::ECONNRESET
(primarily VM hosts) produce logs that typically look like this:

```
Sep 24 13:59:29 ubicloud-console app[monitor] info Got new pulse.
Sep 24 14:00:01 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:00:19 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:00:57 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:01:23 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:02:00 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:02:28 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:03:01 ubicloud-console app[monitor] info Pulse checking has failed.
Sep 24 14:03:30 ubicloud-console app[monitor] info Pulse checking has failed.
...
```

After restarting the monitor, the first health check succeeds. However,
the second health check, which runs 30 seconds later, fails because the
SSH session has been idle for more than 10 seconds and is now broken.
Subsequent health checks keep failing because we neither retry on a
broken session nor reestablish it.
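
As a rough sketch of the intended behavior (hypothetical code, not the
actual MonitorableResource implementation; only last_pulse, the 8-second
threshold, and check_pulse/init_health_monitor_session come from this
message and the model below, and `resource` stands for the monitored
model instance):

```
# Hypothetical sketch, not the actual MonitorableResource code.
def perform_checkup
  @pulse = resource.check_pulse(session: @session, previous_pulse: @pulse)
  # The fix: record the pulse time for every monitorable resource, not only
  # for load balancers, so the staleness check below can ever be true.
  @session[:last_pulse] = Time.now
rescue
  # A stale pulse means the idle SSH session has likely hit the 10-second
  # timeout; reestablish it and retry once (the fresh session has no
  # last_pulse, so a second failure re-raises).
  if @session[:last_pulse]&.<(Time.now - 8)
    @session = resource.init_health_monitor_session
    retry
  end
  raise
end
```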
2025-09-26 09:04:02 +02:00


# frozen_string_literal: true

require_relative "../model"

# Join model between a load balancer port and a load balancer VM; it tracks
# the health state of that VM for that particular port.
class LoadBalancerVmPort < Sequel::Model
  many_to_one :load_balancer_vm
  many_to_one :load_balancer_port

  plugin ResourceMethods
  include HealthMonitorMethods

  def load_balancer
    load_balancer_port.load_balancer
  end

  def vm
    load_balancer_vm.vm
  end

  # Health checks run over a fresh SSH session to the host of the VM.
  def init_health_monitor_session
    {
      ssh_session: vm.vm_host.sshable.start_fresh_session
    }
  end

  # Probes both address families; each probe reports "up" or "down".
  def health_check(session:)
    [
      check_probe(session, :ipv4),
      check_probe(session, :ipv6)
    ]
  end

  # A disabled address family always counts as "up". IOError/ECONNRESET are
  # re-raised to the monitor; other errors are logged and reported as "down".
  def check_probe(session, type)
    if type == :ipv4
      return "up" unless load_balancer.ipv4_enabled?
    elsif type == :ipv6
      return "up" unless load_balancer.ipv6_enabled?
    else
      raise "Invalid type: #{type}"
    end

    begin
      (session[:ssh_session].exec!(health_check_cmd(type)).strip == "200") ? "up" : "down"
    rescue IOError, Errno::ECONNRESET
      raise
    rescue => e
      Clog.emit("Exception in LoadBalancerVmPort #{ubid}") { Util.exception_to_hash(e) }
      "down"
    end
  end

  # Builds the command run inside the VM's network namespace: a TCP connect
  # check via nc, or an HTTP(S) check via curl that prints the status code.
  def health_check_cmd(type)
    address = (type == :ipv4) ? vm.private_ipv4 : vm.ephemeral_net6.nth(2)
    if load_balancer.health_check_protocol == "tcp"
      "sudo ip netns exec #{vm.inhost_name} nc -z -w #{load_balancer.health_check_timeout} #{address} #{load_balancer_port.dst_port} >/dev/null 2>&1 && echo 200 || echo 400"
    else
      "sudo ip netns exec #{vm.inhost_name} curl --insecure --resolve #{load_balancer.hostname}:#{load_balancer_port.dst_port}:#{(address.version == 6) ? "[#{address}]" : address} --max-time #{load_balancer.health_check_timeout} --silent --output /dev/null --write-out '%{http_code}' #{load_balancer.health_check_url(use_endpoint: true)}"
    end
  end

  # Aggregates both probes into a single reading and flips the port state once
  # the reading has persisted past the configured threshold and interval,
  # unless a load balancer update is already queued.
  def check_pulse(session:, previous_pulse:)
    reading_ipv4, reading_ipv6 = health_check(session:)
    reading = (reading_ipv4 == "up" && reading_ipv6 == "up") ? "up" : "down"
    pulse = aggregate_readings(previous_pulse:, reading:, data: {ipv4: reading_ipv4, ipv6: reading_ipv6})

    time_passed_health_check_interval = Time.now - pulse[:reading_chg] > load_balancer.health_check_interval

    if state == "up" && pulse[:reading] == "down" && pulse[:reading_rpt] > load_balancer.health_check_down_threshold && time_passed_health_check_interval && !load_balancer.reload.update_load_balancer_set?
      update(state: "down")
      load_balancer.incr_update_load_balancer
    end

    if state == "down" && pulse[:reading] == "up" && pulse[:reading_rpt] > load_balancer.health_check_up_threshold && time_passed_health_check_interval && !load_balancer.reload.update_load_balancer_set?
      update(state: "up")
      load_balancer.incr_update_load_balancer
    end

    pulse
  end
end
# Table: load_balancer_vm_port
# Columns:
#  id                    | uuid                     | PRIMARY KEY
#  load_balancer_vm_id   | uuid                     | NOT NULL
#  load_balancer_port_id | uuid                     | NOT NULL
#  state                 | lb_node_state            | NOT NULL DEFAULT 'down'::lb_node_state
#  last_checked_at       | timestamp with time zone | NOT NULL DEFAULT CURRENT_TIMESTAMP
# Indexes:
#  load_balancer_vm_port_pkey | PRIMARY KEY btree (id)
#  lb_vm_port_unique_index    | UNIQUE btree (load_balancer_port_id, load_balancer_vm_id)
# Foreign key constraints:
#  load_balancer_vm_port_load_balancer_port_id_fkey | (load_balancer_port_id) REFERENCES load_balancer_port(id)
#  load_balancer_vm_port_load_balancer_vm_id_fkey   | (load_balancer_vm_id) REFERENCES load_balancers_vms(id) ON DELETE CASCADE