Files
ubicloud/bin/monitor
Burak Yucesoy 4ac52f6fc2 Add more logging to monitor
We are investigating a problem where pulse checks frequently get stuck. This commit
will help us understand what is going on at the time a pulse check gets stuck.
2024-05-06 16:58:13 +02:00

59 lines
1.5 KiB
Ruby
Executable File

#!/usr/bin/env ruby
# frozen_string_literal: true
require_relative "../loader"
clover_freeze
# Registry of everything currently being monitored, keyed by resource id; each
# value is the MonitorableResource wrapper created by the resource scanner.
resources = {}
# Serializes access to `resources` between the resource-scanner and
# pulse-checker threads (both wrap their work in `mutex.synchronize`).
mutex = Mutex.new
# Background thread that refreshes the `resources` registry once a minute.
#
# Every pass iterates each monitorable source and registers a
# MonitorableResource for any record id that is not in the registry yet;
# entries that already exist are left untouched. Any StandardError raised
# here is logged, a thread dump is printed, and the whole process is
# terminated immediately via Kernel.exit!.
resource_scanner = Thread.new do
  begin
    # Sources whose records get monitored. Vm is restricted with a negated
    # EXISTS filter against Sshable rows matching the vm id.
    scan_sources = [
      VmHost,
      PostgresServer,
      Vm.where(~Sshable.where(id: Sequel[:vm][:id]).exists),
      MinioServer,
      GithubRunner
    ]

    loop do
      # The registry is shared with the pulse checker, so only touch it
      # while holding the mutex.
      mutex.synchronize do
        scan_sources.each do |source|
          source.each do |record|
            resources[record.id] ||= MonitorableResource.new(record)
          end
        end
      end

      sleep 60
    end
  rescue => error
    Clog.emit("Resource scanning has failed.") { {resource_scanning_failure: {exception: Util.exception_to_hash(error)}} }
    ThreadPrinter.run
    Kernel.exit!
  end
end
# Background thread that runs a pulse-check round every five seconds.
#
# A round holds the mutex for its whole duration: it logs the number of
# worker threads, starts one worker thread per registered resource, and then
# drops every registry entry whose `deleted` flag is set. Any StandardError
# raised here is logged, a thread dump is printed, and the whole process is
# terminated immediately via Kernel.exit!.
pulse_checker = Thread.new do
  begin
    loop do
      mutex.synchronize do
        # main, resource_scanner and pulse_checker are always alive, so they
        # are subtracted to report only the remaining threads.
        Clog.emit("Active threads count.") { {active_threads_count: Thread.list.count - 3} }

        resources.each_value do |monitored|
          # Throttle: poll once a second until the thread count is back under
          # the configured ceiling. Note the mutex stays held while waiting.
          sleep 1 until Thread.list.count + 2 <= Config.max_monitor_threads

          monitored.force_stop_if_stuck

          # Each resource is checked on its own thread; the work runs inside
          # the resource's lock_no_wait block (presumably skipped when the
          # lock is already taken -- confirm in MonitorableResource).
          Thread.new do
            monitored.lock_no_wait do
              monitored.open_resource_session
              monitored.process_event_loop
              monitored.check_pulse
            end
          end
        end

        # Forget resources that have been flagged as deleted.
        resources.delete_if { |_id, monitored| monitored.deleted }
      end

      sleep 5
    end
  rescue => error
    Clog.emit("Pulse checking has failed.") { {pulse_checking_failure: {exception: Util.exception_to_hash(error)}} }
    ThreadPrinter.run
    Kernel.exit!
  end
end
# Park the main thread on the two background threads, scanner first. Both
# run endless loops and terminate the process themselves on failure, so this
# keeps the script alive for as long as they are running.
[resource_scanner, pulse_checker].each(&:join)