Mirror of https://github.com/ubicloud/ubicloud.git
Merge the process_event_loop method into the check_pulse method. Have the pulse thread change a local variable if there was an error during the event loop. Have the monitor thread (not the pulse thread) clear the session, after the pulse thread has ended, if the event loop failed. Don't sleep until run_event_loop is true. With the merged code, run_event_loop would be set immediately after the start of the thread, so there is no reason to sleep. Even before this change, I don't think there was a reason to sleep previously.
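A rough sketch of the coordination this commit describes, using hypothetical names (the real logic lives in the monitored resource's check_pulse path, not in a class like this); it only illustrates the division of responsibility between the pulse thread and the monitor thread:

# Hypothetical illustration only; names and structure do not match the actual implementation.
class PulseCheckSketch
  def initialize(session)
    @session = session
    @event_loop_failed = false
  end

  # Stand-in for the merged event loop body.
  def run_event_loop
  end

  # Runs on the pulse thread. The former process_event_loop body is inlined here,
  # so the event loop starts as soon as the thread does and no initial sleep is needed.
  def check_pulse
    run_event_loop
  rescue
    # Record the failure only; the session is not cleared from this thread.
    @event_loop_failed = true
  end

  # Runs on the monitor thread, after the pulse thread has ended.
  def cleanup(pulse_thread)
    pulse_thread.join
    @session = nil if @event_loop_failed
  end
end

The script below wires these pulse checks up for each monitored resource type.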
112 lines · 3.4 KiB · Ruby · Executable file
#!/usr/bin/env ruby
# frozen_string_literal: true

ENV["MONITOR_PROCESS"] = "1"

partition_number = ARGV[0]
partition_number ||= if (match = /monitor\.(\d+)\z/.match(ENV["DYNO"] || ENV["PS"]))
  match[1] # Heroku/Foreman
end

# For simplicity, the monitor always runs partitioned, even when running only a
# single process, in which case the partition is the entire id space.
partition_number ||= "1"

max_partition = 8

partition_number = Integer(partition_number)
raise "invalid partition_number: #{partition_number}" if partition_number < 1 || partition_number > max_partition

require_relative "../loader"
clover_freeze

# Information (seconds, log message, log key) for stuck pulses for monitor jobs.
monitor_pulse_info = [120, "Pulse check has stuck.", :pulse_check_stuck].freeze

# Information (seconds, log message, log key) for stuck pulses for metric export jobs.
metric_export_pulse_info = [100, "Pulse check has stuck.", :pulse_check_stuck].freeze

if Config.test?
  # Run during monitor smoke tests
  $stdout.sync = $stderr.sync = true
  # Ensure clog output during smoke test
  Clog::Config = Struct.new(:test?).new(false)
  MonitorResourceStub.add(UBID.generate_vanity("et", "mr", "vp"))
  MonitorResourceStub.add(UBID.generate_vanity("et", "mr", "down"), pulse: "down")
  MonitorResourceStub.add(UBID.generate_vanity("et", "mr", "evloop"), need_event_loop: true)
  MonitorResourceStub.add(UBID.generate_vanity("et", "mr", "mc2"), metrics_count: 2)
  monitor_models = [MonitorResourceStub]
  metric_export_models = [MonitorResourceStub]
  runner_args = {scan_every: 1, report_every: 1, enqueue_every: 1, check_stuck_pulses_every: 1}
else
  runner_args = {ignore_threads: 2}
  monitor_models = [
    VmHost,
    PostgresServer,
    Vm.where(~Sshable.where(id: Sequel[:vm][:id]).exists),
    MinioServer,
    GithubRunner,
    VmHostSlice,
    LoadBalancerVmPort,
    KubernetesCluster,
    VictoriaMetricsServer
  ]

  metric_export_models = [
    PostgresServer,
    VmHost
  ]
end

# Handle both monitored resources and metric export resources.
monitor_resources = MonitorResourceType.create(
  MonitorableResource,
  monitor_pulse_info,
  Config.max_health_monitor_threads,
  monitor_models,
  &:check_pulse
)
metric_export_resources = MonitorResourceType.create(
  MetricsTargetResource,
  metric_export_pulse_info,
  Config.max_metrics_export_threads,
  metric_export_models,
  &:export_metrics
)

# Track which partitions of the id space this process is responsible for,
# coordinating with other monitor processes over the :monitor notification channel.
repartitioner = Repartitioner.new(partition_number:, channel: :monitor, listen_timeout: 1, recheck_seconds: 18, stale_seconds: 40, max_partition:)

runner = MonitorRunner.new(monitor_resources:, metric_export_resources:, repartitioner:, **runner_args)

# Listen for repartitioning notifications from other monitor processes in a background thread.
repartition_thread = Thread.new { repartitioner.listen }

# Only NOTIFY for the first 3-5 seconds, so that by the time we actually start monitoring,
# all monitor processes know the expected partitioning. The rand is to avoid thundering herd issues.
sleep(1 + rand)
3.times do
  repartitioner.notify
  sleep 1
end

do_shutdown = proc do
  repartitioner.shutdown!
  runner.shutdown!
end

# Shut down the repartitioner and the runner cleanly on INT/TERM.
Signal.trap("INT", &do_shutdown)
Signal.trap("TERM", &do_shutdown)

runner.run

# If not all threads exit within two seconds, exit 1 to indicate
# unclean shutdown.
exit_status = 1

Thread.new do
  repartition_thread.join
  [monitor_resources, metric_export_resources].each(&:wait_cleanup!)
  # If all threads exit within two seconds, exit 0 to indicate clean shutdown.
  exit_status = 0
end.join(2)

exit exit_status