Prometheus exposes CPU usage metrics for many modes, but we only show a few of them. We recently hit a case where softirq usage was significant (> 10%) but did not appear in the dashboard, because softirq was not one of the modes we were displaying. This commit adds softirq usage to the dashboard so that we can see it in the future; it is interesting enough to be shown there.
194 lines
7.7 KiB
Ruby
194 lines
7.7 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module Metrics
|
|
TimeSeries = Data.define(:labels, :query)
|
|
MetricDefinition = Data.define(:name, :description, :unit, :series)
|
|
|
|
POSTGRES_METRICS = {
|
|
cpu_usage:
|
|
MetricDefinition.new(
|
|
name: "CPU Usage",
|
|
description: "Percentage of CPU used by the system",
|
|
unit: "%",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {},
|
|
query: "avg(rate(node_cpu_seconds_total{mode=~\"(iowait|user|system|steal|softirq)\", ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m])) by (mode) * 100"
|
|
)
|
|
]
|
|
),
|
|
load_average:
|
|
MetricDefinition.new(
|
|
name: "Load Average",
|
|
description: "System load average over different time periods",
|
|
unit: nil,
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "1 minute"},
|
|
query: "sum(node_load1{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "5 minutes"},
|
|
query: "sum(node_load5{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "15 minutes"},
|
|
query: "sum(node_load15{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})"
|
|
)
|
|
]
|
|
),
|
|
memory_usage:
|
|
MetricDefinition.new(
|
|
name: "Memory Usage",
|
|
description: "Total memory usage vs cache & buffers",
|
|
unit: "%",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Used Memory"},
|
|
query: "sum((1 - (node_memory_MemAvailable_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"} / node_memory_MemTotal_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})) * 100)"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Cache & Buffers"},
|
|
query: "sum((node_memory_Cached_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"} + node_memory_Buffers_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}) / node_memory_MemTotal_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"} * 100)"
|
|
)
|
|
]
|
|
),
|
|
disk_usage:
|
|
MetricDefinition.new(
|
|
name: "Disk Usage",
|
|
description: "Disk space utilization",
|
|
unit: "%",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Used Space"},
|
|
query: "100 - (sum(node_filesystem_avail_bytes{mountpoint=\"/dat\", ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"} / node_filesystem_size_bytes{mountpoint=\"/dat\", ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}) * 100)"
|
|
)
|
|
]
|
|
),
|
|
disk_io:
|
|
MetricDefinition.new(
|
|
name: "Disk I/O",
|
|
description: "I/O operations per second",
|
|
unit: "IOPS",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Reads"},
|
|
query: "sum(rate(node_disk_reads_completed_total{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Writes"},
|
|
query: "sum(rate(node_disk_writes_completed_total{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
)
|
|
]
|
|
),
|
|
network_traffic:
|
|
MetricDefinition.new(
|
|
name: "Network Traffic",
|
|
description: "Incoming and outgoing network traffic",
|
|
unit: "bytes/s",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Received"},
|
|
query: "sum(rate(node_network_receive_bytes_total{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Transmitted"},
|
|
query: "sum(rate(node_network_transmit_bytes_total{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
)
|
|
]
|
|
),
|
|
connection_count:
|
|
MetricDefinition.new(
|
|
name: "Connection Count",
|
|
description: "Database activity metrics",
|
|
unit: "count",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Active"},
|
|
query: "sum(pg_stat_activity_count{state=\"active\", ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Total"},
|
|
query: "sum(pg_stat_activity_count{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"})"
|
|
)
|
|
]
|
|
),
|
|
cache_hit_ratio:
|
|
MetricDefinition.new(
|
|
name: "Cache Hit Ratio",
|
|
description: "Percentage of cache hits vs reads",
|
|
unit: "%",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {},
|
|
query: "sum(rate(pg_stat_database_blks_hit{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m])) / (sum(rate(pg_stat_database_blks_hit{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m])) + sum(rate(pg_stat_database_blks_read{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))) * 100"
|
|
)
|
|
]
|
|
),
|
|
operation_throughput:
|
|
MetricDefinition.new(
|
|
name: "Operation Throughput",
|
|
description: "Fetch, insert, update, delete operations per second",
|
|
unit: "ops/s",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Fetch"},
|
|
query: "sum(rate(pg_stat_database_tup_fetched{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Insert"},
|
|
query: "sum(rate(pg_stat_database_tup_inserted{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Update"},
|
|
query: "sum(rate(pg_stat_database_tup_updated{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Delete"},
|
|
query: "sum(rate(pg_stat_database_tup_deleted{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
)
|
|
]
|
|
),
|
|
deadlocks:
|
|
MetricDefinition.new(
|
|
name: "Deadlocks",
|
|
description: "Deadlocks per second",
|
|
unit: "deadlocks/s",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {},
|
|
query: "sum(rate(pg_stat_database_deadlocks{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
)
|
|
]
|
|
),
|
|
database_size:
|
|
MetricDefinition.new(
|
|
name: "Database Size",
|
|
description: "Top 5 databases by size",
|
|
unit: "bytes",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {},
|
|
query: "topk(5, sum(pg_database_size_bytes{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\", datname!~\"template0|template1\"}) by (datname))"
|
|
)
|
|
]
|
|
),
|
|
transactions:
|
|
MetricDefinition.new(
|
|
name: "Transactions",
|
|
description: "Committed vs rolled back transactions",
|
|
unit: "count/s",
|
|
series: [
|
|
TimeSeries.new(
|
|
labels: {name: "Commits"},
|
|
query: "sum(rate(pg_stat_database_xact_commit{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
),
|
|
TimeSeries.new(
|
|
labels: {name: "Rollbacks"},
|
|
query: "sum(rate(pg_stat_database_xact_rollback{ubicloud_resource_id=\"$ubicloud_resource_id\", ubicloud_resource_role=\"primary\"}[1m]))"
|
|
)
|
|
]
|
|
)
|
|
}
|
|
end
|