ubicloud/spec/lib/monitorable_resource_spec.rb
mohi-kalantari 8fda323c06 Retry once when resource healthcheck fails because of dropped ssh connections
There are many cases where the monitor is overloaded and the resource's
keep-alive settings drop the connection. With this commit, we add a
retry before considering a dead connection "down".
2025-09-08 11:47:10 +02:00

186 lines
8.2 KiB
Ruby

# frozen_string_literal: true
require_relative "../model/spec_helper"
# Specs for MonitorableResource: opening a monitoring session, checking the
# pulse (with and without an SSH event loop thread), retrying once on a stale
# connection, and closing the session.
RSpec.describe MonitorableResource do
  # Resource wrapper used by the examples that expect an event loop thread.
  let(:postgres_server) { PostgresServer.new { it.id = "c068cac7-ed45-82db-bf38-a003582b36ee" } }
  let(:r_w_event_loop) { described_class.new(postgres_server) }

  # Resource wrapper used by the examples that expect no event loop thread.
  let(:vm_host) { VmHost.new { it.id = "46683a25-acb1-4371-afe9-d39f303e44b4" } }
  let(:r_without_event_loop) { described_class.new(vm_host) }

  describe "#open_resource_session" do
    it "returns if session is not nil and pulse reading is up" do
      r_w_event_loop.instance_variable_set(:@session, "not nil")
      r_w_event_loop.instance_variable_set(:@pulse, {reading: "up"})

      # A healthy, already-open session must not trigger a reload of the resource.
      expect(postgres_server).not_to receive(:reload)
      r_w_event_loop.open_resource_session
    end

    it "sets session to resource's init_health_monitor_session" do
      expect(postgres_server).to receive(:reload).and_return(postgres_server)
      expect(postgres_server).to receive(:init_health_monitor_session).and_return("session")

      expect { r_w_event_loop.open_resource_session }
        .to change { r_w_event_loop.instance_variable_get(:@session) }.from(nil).to("session")
    end

    it "sets deleted to true if resource is deleted" do
      expect(postgres_server).to receive(:reload).and_raise(Sequel::NoExistingObject)

      expect { r_w_event_loop.open_resource_session }
        .to change(r_w_event_loop, :deleted).from(false).to(true)
    end

    it "ignores exception if it is not Sequel::NoExistingObject" do
      expect(postgres_server).to receive(:reload).and_raise(StandardError)

      expect { r_w_event_loop.open_resource_session }.not_to raise_error
    end
  end

  describe "#check_pulse" do
    it "does not create thread if session is nil or resource does not need event loop" do
      expect(Thread).not_to receive(:new)

      # session is nil
      r_w_event_loop.check_pulse

      # resource does not need event loop
      r_without_event_loop.instance_variable_set(:@session, "not nil")
      r_without_event_loop.check_pulse
    end

    it "swallows exception and logs it if event loop fails" do
      session = {ssh_session: instance_double(Net::SSH::Connection::Session)}
      r_w_event_loop.instance_variable_set(:@session, session)

      expect(Thread).to receive(:new).and_call_original
      expect(session[:ssh_session]).to receive(:loop).and_raise(StandardError)
      # Two log entries are expected; presumably one for the event loop failure
      # and one for the pulse check that fails on the unstubbed resource.
      expect(Clog).to receive(:emit).twice.and_call_original
      expect(r_w_event_loop).to receive(:close_resource_session)

      r_w_event_loop.check_pulse
    end

    it "swallows exception and logs it if check_pulse fails" do
      session = {ssh_session: instance_double(Net::SSH::Connection::Session)}
      r_without_event_loop.instance_variable_set(:@session, session)

      expect(vm_host).to receive(:check_pulse).and_raise(StandardError)
      expect(Clog).to receive(:emit).and_call_original

      expect { r_without_event_loop.check_pulse }.not_to raise_error
    end
  end

  describe "#check_pulse with session and event loop" do
    let(:session) { {ssh_session: instance_double(Net::SSH::Connection::Session)} }

    # Every example here has a session in place and expects the SSH event loop
    # to be driven exactly once.
    before do
      r_w_event_loop.instance_variable_set(:@session, session)
      expect(session[:ssh_session]).to receive(:loop)
    end

    it "creates a new thread and runs the event loop" do
      expect(Thread).to receive(:new).and_call_original

      r_w_event_loop.check_pulse
    end

    it "calls check_pulse on resource and sets pulse" do
      expect(postgres_server).to receive(:check_pulse).and_return({reading: "up"})

      expect { r_w_event_loop.check_pulse }
        .to change { r_w_event_loop.instance_variable_get(:@pulse) }.from({}).to({reading: "up"})
    end

    it "swallows exception and logs it if check_pulse fails" do
      expect(postgres_server).to receive(:check_pulse).and_raise(StandardError)
      expect(Clog).to receive(:emit).and_call_original

      expect { r_w_event_loop.check_pulse }.not_to raise_error
    end

    it "does not log the pulse if reading is up and reading_rpt is not every 5th and reading_rpt is large enough" do
      expect(postgres_server).to receive(:check_pulse).and_return({reading: "up", reading_rpt: 13})
      # A negative expectation needs no response configured: the example fails
      # as soon as emit is received at all.
      expect(Clog).not_to receive(:emit)

      r_w_event_loop.check_pulse
    end

    it "logs the pulse if reading is not up" do
      expect(postgres_server).to receive(:check_pulse).and_return({reading: "down", reading_rpt: 13})
      expect(Clog).to receive(:emit).and_call_original

      r_w_event_loop.check_pulse
    end

    it "logs the pulse if reading is up and reading_rpt is every 5th reading" do
      expect(postgres_server).to receive(:check_pulse).and_return({reading: "up", reading_rpt: 6})
      expect(Clog).to receive(:emit).and_call_original

      r_w_event_loop.check_pulse
    end

    it "logs the pulse if reading is up and reading_rpt is recent enough" do
      expect(postgres_server).to receive(:check_pulse).and_return({reading: "up", reading_rpt: 3})
      expect(Clog).to receive(:emit).and_call_original

      r_w_event_loop.check_pulse
    end
  end

  # The same retry scenarios are generated for each exception that is treated
  # as a dropped connection in these examples.
  [IOError.new("closed stream"), Errno::ECONNRESET.new("recvfrom(2)")].each do |ex|
    describe "#check_pulse", "stale connection retry behavior with #{ex.class}" do
      let(:session) { {ssh_session: instance_double(Net::SSH::Connection::Session)} }

      before do
        r_w_event_loop.instance_variable_set(:@session, session)
        expect(session[:ssh_session]).to receive(:loop)
      end

      it "does not retry if the last pulse is not set" do
        expect(postgres_server).to receive(:check_pulse).and_raise(ex)
        expect(Clog).to receive(:emit).and_call_original

        expect { r_w_event_loop.check_pulse }.not_to raise_error
      end

      it "does not retry if the connection is fresh" do
        session[:last_pulse] = Time.now - 1

        expect(postgres_server).to receive(:check_pulse).and_raise(ex)
        expect(Clog).to receive(:emit).and_call_original

        expect { r_w_event_loop.check_pulse }.not_to raise_error
      end

      it "is up on a retry on a stale connection that works the second time" do
        session[:last_pulse] = Time.now - 10

        # First call fails; the session is then re-initialized and merged into
        # the existing one before the second call succeeds.
        expect(postgres_server).to receive(:check_pulse).and_raise(ex)
        second_session = instance_double(Net::SSH::Connection::Session)
        expect(postgres_server).to receive(:init_health_monitor_session).and_return(second_session)
        expect(session).to receive(:merge!).with(second_session)
        expect(postgres_server).to receive(:check_pulse).and_return({reading: "up", reading_rpt: 1})

        r_w_event_loop.check_pulse
      end

      it "is down if consecutive errors are raised even on a stale connection" do
        session[:last_pulse] = Time.now - 10

        expect(postgres_server).to receive(:check_pulse).and_raise(ex)
        second_session = instance_double(Net::SSH::Connection::Session)
        expect(postgres_server).to receive(:init_health_monitor_session).and_return(second_session)
        expect(session).to receive(:merge!).with(second_session)
        # The retried call must raise too (rather than return the exception
        # object as a pulse) so that two consecutive failures are exercised.
        expect(postgres_server).to receive(:check_pulse).and_raise(ex)
        expect(Clog).to receive(:emit).and_call_original

        expect { r_w_event_loop.check_pulse }.not_to raise_error
      end

      it "is down for a matching exception without a matching message" do
        session[:last_pulse] = Time.now - 10

        # Same exception class, different message: no session re-initialization
        # is stubbed, so any retry attempt would hit the unstubbed resource.
        expect(postgres_server).to receive(:check_pulse).and_raise(ex.class.new("something else"))
        expect(Clog).to receive(:emit).and_call_original

        expect { r_w_event_loop.check_pulse }.not_to raise_error
      end
    end
  end

  describe "#close_resource_session" do
    it "returns if session is nil" do
      session = {ssh_session: instance_double(Net::SSH::Connection::Session)}

      # nil? is stubbed on the hash so the session looks nil to the code under
      # test while the double still lets us assert shutdown! is never called.
      expect(session[:ssh_session]).not_to receive(:shutdown!)
      expect(session).to receive(:nil?).and_return(true)

      r_w_event_loop.instance_variable_set(:@session, session)
      r_w_event_loop.close_resource_session
    end

    it "shuts down and closes the session" do
      session = {ssh_session: instance_double(Net::SSH::Connection::Session)}

      expect(session[:ssh_session]).to receive(:shutdown!)
      expect(session[:ssh_session]).to receive(:close)

      r_w_event_loop.instance_variable_set(:@session, session)
      r_w_event_loop.close_resource_session
    end
  end
end