diff --git a/docs/node-mixin/.lint b/docs/node-mixin/.lint new file mode 100644 index 0000000000..c95289c4dc --- /dev/null +++ b/docs/node-mixin/.lint @@ -0,0 +1,58 @@ +--- +exclusions: + template-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + panel-datasource-rule: + reason: using the not yet implemented new convention for dashboards with Loki and Prometheus datasources. + target-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + template-job-rule: + reason: Job is hardcoded by the mixin. + entries: + - dashboard: Node Exporter / USE Method / Node + - dashboard: Node Exporter / Nodes + - dashboard: Node Exporter / MacOS + - dashboard: Node Exporter / USE Method / Multi-cluster + - dashboard: Node Exporter / USE Method / Cluster + target-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / USE Method / Node + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Dashboard only allows selecting a single instance at a time. 
+ template-instance-rule: + entries: + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Instances are aggregated for all clusters + - dashboard: Node Exporter / USE Method / Cluster + reason: Instances are aggregated for the whole cluster + - dashboard: Node Exporter / Nodes + reason: Dashboard only allows selecting a single instance at a time. + - dashboard: Node Exporter / MacOS + reason: Ignoring mislabeling of instance template + - dashboard: Node Exporter / USE Method / Node + reason: Ignoring mislabeling of instance template + panel-units-rule: + entries: + - dashboard: Node Exporter / Nodes + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / MacOS + reason: Units are indeed set for all but load average (which doesn't have a reasonable unit), but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Multi-cluster + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. + - dashboard: Node Exporter / USE Method / Node + reason: Units are indeed set, but in the yaxis "format" property rather than in field config. The dashboard linter needs to be patched accordingly. 
diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 1eaedd3d2e..67e71d140b 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: { groups+: [ { - name: 'node-exporter', + name: 'node-exporter-filesystem', rules: [ { alert: 'NodeFilesystemSpaceFillingUp', @@ -156,6 +156,11 @@ description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', }, }, + ], + }, + { + name: 'node-exporter', + rules: [ { alert: 'NodeNetworkReceiveErrs', expr: ||| diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 4427b59d14..86f874c257 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -82,9 +82,36 @@ rateInterval: '5m', // Opt-in for multi-cluster support. showMultiCluster: false, + clusterLabel: 'cluster', + // groupLabels is a string with comma-separated + // labels that are common labels of instances belonging to the + // same logical group. Include not only enough labels to + // identify cluster members, but also all common labels you want + // to keep for resulting cluster-level alerts. 
+ groupLabels: 'job', + // comma-separated list of labels identifying a single instance: + instanceLabels: 'instance', + dashboardNamePrefix: 'Node Exporter / ', dashboardTags: ['node-exporter-mixin'], + dashboardRefresh: '30s', + dashboardTimezone: 'utc', + dashboardInterval: 'now-2h', + + // Grafana dashboard IDs are necessary for stable links for dashboards + grafanaDashboardIDs: { + 'node-rsrc-use.json': 'node-rsrc-use', + 'node-cluster-rsrc-use.json': 'node-cluster-rsrc-use', + 'node-multicluster-rsrc-use.json': 'node-multicluster-rsrc-use', + 'nodes.json': 'nodes', + 'nodes-darwin.json': 'nodes-darwin', + 'nodes-system.json': 'node-system', + 'nodes-memory.json': 'node-memory', + 'nodes-network.json': 'node-network', + 'nodes-disk.json': 'node-disk', + 'nodes-fleet.json': 'node-fleet', + }, + }, } diff --git a/docs/node-mixin/dashboards/disk.libsonnet b/docs/node-mixin/dashboards/disk.libsonnet new file mode 100644 index 0000000000..2f78c4da3e --- /dev/null +++ b/docs/node-mixin/dashboards/disk.libsonnet @@ -0,0 +1,165 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; + +{ + + // https://www.robustperception.io/filesystem-metrics-from-the-node-exporter/ + new(config=null, platform=null):: { + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + local templates = c.templates, + local q = c.queries, + + local fsAvailable = + nodeTimeseries.new( + 'Filesystem Space Available', + description=||| + Filesystem space utilisation in bytes, by mountpoint. 
+ ||| + ) + .withUnits('decbytes') + .withFillOpacity(5) + .addTarget(commonPromTarget( + expr=q.node_filesystem_avail_bytes, + legendFormat='{{ mountpoint }}', + )), + + local fsInodes = + nodeTimeseries.new( + 'Free inodes', + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', + ) + .withUnits('short') + .addTarget(commonPromTarget( + expr=q.node_filesystem_files_free, + legendFormat='{{ mountpoint }}' + )) + .addTarget(commonPromTarget( + expr=q.node_filesystem_files, + legendFormat='{{ mountpoint }}' + )), + local fsInodesTotal = + nodeTimeseries.new( + 'Total inodes', + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', + ) + .withUnits('short') + .addTarget(commonPromTarget( + expr=q.node_filesystem_files, + legendFormat='{{ mountpoint }}' + )), + local fsErrorsandRO = + nodeTimeseries.new('Filesystems with errors / read-only') + .withMax(1) + .addTarget(commonPromTarget( + expr=q.node_filesystem_readonly, + legendFormat='{{ mountpoint }}' + )) + .addTarget(commonPromTarget( + expr=q.node_filesystem_device_error, + legendFormat='{{ mountpoint }}' + )), + local fileDescriptors = + nodeTimeseries.new( + 'File Descriptors', + description=||| + File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. + The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. 
+ ||| + ) + .addTarget(commonPromTarget( + expr=q.process_max_fds, + legendFormat='Maximum open file descriptors', + )) + .addTarget(commonPromTarget( + expr=q.process_open_fds, + legendFormat='Open file descriptors', + )), + + local diskIOcompleted = + nodeTimeseries.new( + title='Disk IOps completed', + description='The number (after merges) of I/O requests completed per second for the device' + ) + .withUnits('iops') + .withNegativeYByRegex('reads') + .withAxisLabel('read(-) | write(+)') + .addTarget(commonPromTarget( + expr=q.node_disk_reads_completed_total, + legendFormat='{{device}} reads completed', + )) + .addTarget(commonPromTarget( + expr=q.node_disk_writes_completed_total, + legendFormat='{{device}} writes completed', + )), + + local diskAvgWaitTime = + nodeTimeseries.new( + title='Disk Average Wait Time', + description='The average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.' + ) + .withUnits('s') + .withNegativeYByRegex('read') + .withAxisLabel('read(-) | write(+)') + .addTarget(commonPromTarget( + expr=q.diskWaitReadTime, + legendFormat='{{device}} read wait time avg', + )) + .addTarget(commonPromTarget( + expr=q.diskWaitWriteTime, + legendFormat='{{device}} write wait time avg', + )), + + local diskAvgQueueSize = + nodeTimeseries.new( + title='Average Queue Size (aqu-sz)', + description='The average queue length of the requests that were issued to the device.' 
+ ) + .addTarget(commonPromTarget( + expr=q.diskAvgQueueSize, + legendFormat='{{device}}', + )), + + local panelsGrid = + [ + { type: 'row', title: 'Filesystem', gridPos: { y: 0 } }, + fsAvailable { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, + c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, + fsInodes { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, + fsInodesTotal { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, + fsErrorsandRO { gridPos: { x: 0, w: 12, h: 8, y: 0 } }, + fileDescriptors { gridPos: { x: 12, w: 12, h: 8, y: 0 } }, + { type: 'row', title: 'Disk', gridPos: { y: 25 } }, + c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, + diskIOcompleted { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, + diskAvgWaitTime { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, + diskAvgQueueSize { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, + ], + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode Filesystem and Disk' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes-disk.json'] + ) + .addLink(c.links.fleetDash) + .addLink(c.links.nodeDash) + .addLink(c.links.otherDashes) + .addAnnotations(c.annotations) + .addTemplates(templates) + .addPanels(panelsGrid) + else if platform == 'Darwin' then {}, + }, +} diff --git a/docs/node-mixin/dashboards/fleet.libsonnet b/docs/node-mixin/dashboards/fleet.libsonnet new file mode 100644 index 0000000000..a9939e59e2 --- /dev/null +++ b/docs/node-mixin/dashboards/fleet.libsonnet @@ -0,0 +1,505 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = 
grafana.graphPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; + +{ + + new(config=null, platform=null):: { + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + + + local templates = [ + if std.member(std.split(config.instanceLabels, ','), template.name) + then + template + { + allValue: '.+', + includeAll: true, + multi: true, + } + else template + for template in c.templates + ], + + local q = c.queries, + + local fleetTable = + nodePanels.table.new( + title='Linux Nodes Overview' + ) + .addTarget(commonPromTarget(expr=q.osInfo, format='table', instant=true) { refId: 'INFO' }) + .addTarget(commonPromTarget(expr=q.nodeInfo, format='table', instant=true) { refId: 'OS' }) + .addTarget(commonPromTarget(expr=q.uptime, format='table', instant=true) { refId: 'UPTIME' }) + .addTarget(commonPromTarget(expr=q.systemLoad1, format='table', instant=true) { refId: 'LOAD1' }) + .addTarget(commonPromTarget(expr=q.systemLoad5, format='table', instant=true) { refId: 'LOAD5' }) + .addTarget(commonPromTarget(expr=q.systemLoad15, format='table', instant=true) { refId: 'LOAD15' }) + .addTarget(commonPromTarget( + expr=q.cpuCount, + format='table', + instant=true, + ) { refId: 'CPUCOUNT' }) + .addTarget(commonPromTarget( + expr=q.cpuUsage, format='table', instant=true, + ) { refId: 'CPUUSAGE' }) + .addTarget(commonPromTarget(expr=q.memoryTotal, format='table', instant=true) { refId: 'MEMTOTAL' }) + .addTarget(commonPromTarget(expr=q.memoryUsage, format='table', instant=true) { refId: 'MEMUSAGE' }) + .addTarget(commonPromTarget(expr=q.fsSizeTotalRoot, format='table', instant=true) { refId: 'FSTOTAL' }) + .addTarget(commonPromTarget( + expr= + ||| + 100-(max by (%(instanceLabels)s) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, fstype!="", 
mountpoint="/"}) + / + max by (%(instanceLabels)s) (node_filesystem_size_bytes{%(nodeQuerySelector)s, fstype!="", mountpoint="/"}) * 100) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + ) { refId: 'FSUSAGE' }) + .addTarget(commonPromTarget( + expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true + ) { refId: 'CRITICAL' }) + .addTarget(commonPromTarget( + expr='count by (%(instanceLabels)s) (max_over_time(ALERTS{%(nodeQuerySelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(nodeQuerySelector)s})' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true + ) { refId: 'WARNING' }) + .withTransform() + .joinByField(field=std.split(config.instanceLabels, ',')[0]) + .filterFieldsByName(std.split(config.instanceLabels, ',')[0] + '|nodename|Value.+') + .organize( + excludeByName={ + 'Value #OS': true, + 'Value #INFO': true, + 'Value #LOAD5': true, + 'Value #LOAD15': true, + }, + renameByName={ + instance: 'Instance', + pretty_name: 'OS', + nodename: 'Hostname', + release: 'Kernel version', + 'Value #LOAD1': 'Load 1m', + 'Value #LOAD5': 'Load 5m', + 'Value #LOAD15': 'Load 15m', + 'Value #CPUCOUNT': 'Cores', + 'Value #CPUUSAGE': 'CPU usage', + 'Value #MEMTOTAL': 'Memory total', + 'Value #MEMUSAGE': 'Memory usage', + 'Value #FSTOTAL': 'Root disk size', + 'Value #FSUSAGE': 'Root disk usage', + 'Value #UPTIME': 'Uptime', + 'Value #CRITICAL': 'Crit Alerts', + 'Value #WARNING': 'Warnings', + } + ) + .withFooter(reducer=['mean'], fields=[ + 'Value #LOAD1', + 'Value #MEMUSAGE', + 'Value #CPUUSAGE', + ]) + .addThresholdStep(color='light-blue', value=null) + .addThresholdStep(color='light-yellow', value=80) + 
.addThresholdStep(color='light-red', value=90) + .addOverride( + matcher={ + id: 'byName', + options: 'Instance', + }, + properties=[ + { + id: 'links', + value: [ + { + targetBlank: true, + title: c.links.instanceDataLinkForTable.title, + url: c.links.instanceDataLinkForTable.url, + }, + ], + }, + { + id: 'custom.filterable', + value: true, + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'OS|Kernel version|Hostname', + }, + properties=[ + { + id: 'custom.filterable', + value: true, + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'Memory total|Root disk size', + }, + properties=[ + { + id: 'unit', + value: 'bytes', + }, + { + id: 'decimals', + value: 0, + }, + ] + ) + .addOverride( + matcher={ + id: 'byName', + options: 'Cores', + }, + properties=[ + { + id: 'custom.width', + value: 60, + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'Load.+', + }, + properties=[ + { + id: 'custom.width', + value: 60, + }, + ] + ) + .addOverride( + matcher={ + id: 'byName', + options: 'Uptime', + }, + properties=[ + { + id: 'unit', + value: 'dtdurations', + }, + { + id: 'custom.displayMode', + value: 'color-text', + }, + { + id: 'thresholds', + value: { + mode: 'absolute', + steps: [ + { + color: 'light-orange', + value: null, + }, + { + color: 'text', + value: 300, + }, + ], + }, + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'CPU usage|Memory usage|Root disk usage', + }, + properties=[ + { + id: 'unit', + value: 'percent', + }, + // { + // id: 'custom.displayMode', + // value: 'gradient-gauge', + // }, + { + id: 'custom.displayMode', + value: 'basic', + }, + { + id: 'max', + value: 100, + }, + { + id: 'min', + value: 0, + }, + ] + ) + .sortBy('Instance') + , + + local memoryUsagePanel = + nodePanels.timeseries.new('Memory Usage', description='Top 25') + .withUnits('percent') + .withMin(0) + .withMax(100) + .withColor(mode='continuous-BlYlRd') + .withFillOpacity(1) + 
.withGradientMode('scheme') + .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') + .addDataLink( + title=c.links.instanceDataLink.title, + url=c.links.instanceDataLink.url, + ) + .addTarget(commonPromTarget( + expr='topk(25, ' + q.memoryUsage + ')', + legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + )) + .addTarget(commonPromTarget( + expr='avg(' + q.memoryUsage + ')', + legendFormat='Mean', + )) + .addOverride( + matcher={ + id: 'byName', + options: 'Mean', + + }, + properties=[ + { + id: 'custom.lineStyle', + value: { + fill: 'dash', + dash: [ + 10, + 10, + ], + }, + }, + { + id: 'custom.fillOpacity', + value: 0, + }, + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: 'light-purple', + }, + }, + { + id: 'custom.lineWidth', + value: 2, + }, + ] + ), + + local cpuUsagePanel = + nodePanels.timeseries.new('CPU Usage', description='Top 25') + .withUnits('percent') + .withMin(0) + .withMax(100) + .withFillOpacity(1) + .withColor(mode='continuous-BlYlRd') + .withGradientMode('scheme') + .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') + .addDataLink( + title=c.links.instanceDataLink.title, + url=c.links.instanceDataLink.url, + ) + .addTarget(commonPromTarget( + expr='topk(25, ' + q.cpuUsage + ')', + legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')), + )) + .addTarget(commonPromTarget( + expr='avg(' + q.cpuUsage + ')', + legendFormat='Mean', + )) + .addOverride( + matcher={ + id: 'byName', + options: 'Mean', + + }, + properties=[ + { + id: 'custom.lineStyle', + value: { + fill: 'dash', + dash: [ + 10, + 10, + ], + }, + }, + { + id: 'custom.fillOpacity', + value: 0, + }, + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: 'light-purple', + }, + }, + { + id: 'custom.lineWidth', + value: 2, + }, + ] + ), + + local diskIOPanel = + nodePanels.timeseries.new('Disks I/O', description='Top 25') + .withUnits('percentunit') + .withMin(0) + .withMax(1) + 
.withFillOpacity(1) + .withColor(mode='continuous-BlYlRd') + .withGradientMode('scheme') + .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') + .addDataLink( + title=c.links.instanceDataLink.title, + url=c.links.instanceDataLink.url, + ) + .addTarget(commonPromTarget( + expr='topk(25, ' + q.diskIoTime + ')', + legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', + )) + .addOverride( + matcher={ + id: 'byName', + options: 'Mean', + + }, + properties=[ + { + id: 'custom.lineStyle', + value: { + fill: 'dash', + dash: [ + 10, + 10, + ], + }, + }, + { + id: 'custom.fillOpacity', + value: 0, + }, + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: 'light-purple', + }, + }, + { + id: 'custom.lineWidth', + value: 2, + }, + ] + ), + local diskSpacePanel = + nodePanels.timeseries.new('Disks Space Usage', description='Top 25') + .withUnits('percentunit') + .withMin(0) + .withMax(1) + .withFillOpacity(1) + .withColor(mode='continuous-BlYlRd') + .withGradientMode('scheme') + .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') + .addDataLink( + title=c.links.instanceDataLink.title, + url=c.links.instanceDataLink.url, + ) + .addTarget(commonPromTarget( + expr='topk(25, ' + q.diskSpaceUsage + ')', + legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{mountpoint}}', + )) + .addOverride( + matcher={ + id: 'byName', + options: 'Mean', + + }, + properties=[ + { + id: 'custom.lineStyle', + value: { + fill: 'dash', + dash: [ + 10, + 10, + ], + }, + }, + { + id: 'custom.fillOpacity', + value: 0, + }, + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: 'light-purple', + }, + }, + { + id: 'custom.lineWidth', + value: 2, + }, + ] + ), + local networkErrorsDropsPanel = + nodePanels.timeseries.new('Network Errors and Dropped Packets', description='Top 25') + .withLegend(mode='table', calcs=['mean', 'max', 'lastNotNull'], placement='right') + 
.addTarget(commonPromTarget( + expr='topk(25, ' + q.networkReceiveErrorsPerSec + ' + ' + q.networkTransmitErrorsPerSec + ' + ' + q.networkReceiveDropsPerSec + ' + ' + q.networkTransmitDropsPerSec + ') > 0.5', + legendFormat=c.labelsToLegend(std.split(config.instanceLabels, ',')) + ': {{device}}', + )) + .withDecimals(1) + .withUnits('pps') + .withDrawStyle('points') + .withPointsSize(5) + .addDataLink( + title=c.links.instanceDataLink.title, + url=c.links.instanceDataLink.url, + ), + + local rows = + [ + row.new('Overview') + .addPanel(fleetTable { span: 12, height: '800px' }) + .addPanel(cpuUsagePanel { span: 12 }) + .addPanel(memoryUsagePanel { span: 12 }) + .addPanel(diskIOPanel { span: 6 }).addPanel(diskSpacePanel { span: 6 }) + .addPanel(networkErrorsDropsPanel { span: 12 }), + ], + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode Fleet Overview' % config.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes-fleet.json'], + ) + .addLink(c.links.otherDashes { includeVars: false }) + .addAnnotations(c.annotations) + .addTemplates(templates) + .addRows(rows) + else if platform == 'Darwin' then {}, + }, +} diff --git a/docs/node-mixin/dashboards/memory.libsonnet b/docs/node-mixin/dashboards/memory.libsonnet new file mode 100644 index 0000000000..5b6e613851 --- /dev/null +++ b/docs/node-mixin/dashboards/memory.libsonnet @@ -0,0 +1,406 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = 
nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; + +{ + + new(config=null, platform=null):: { + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + local templates = c.templates, + local q = c.queries, + + local memoryPagesInOut = + nodeTimeseries.new( + 'Memory Pages In / Out', + description=||| + Page-In - Return of pages to physical memory. This is a common and normal event. + + Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. + When the kernel detects low memory, it attempts to free memory by paging out. + While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. + Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. + ||| + ) + .withNegativeYByRegex('out') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_vmstat_pgpgin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Page-In' + )) + .addTarget(commonPromTarget( + expr='irate(node_vmstat_pgpgout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Page-Out' + )), + local memoryPagesSwapInOut = + nodeTimeseries.new( + 'Memory Pages Swapping In / Out', + description=||| + Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. + Nonetheless, it is a preferable option to crashing or killing off processes. + + The process of writing pages out to disk to free memory is known as swapping-out. + If a page fault occurs because the page is on disk, in the swap area, rather than in memory, + the kernel will read the page back in from the disk to satisfy the page fault. + This is known as swapping-in. 
+ ||| + ) + .withNegativeYByRegex('out') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_vmstat_pswpin{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Pages swapped in' + )) + .addTarget(commonPromTarget( + expr='irate(node_vmstat_pswpout{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Pages swapped out' + )), + + local memoryPagesFaults = + nodeTimeseries.new( + 'Memory Page Faults', + description=||| + A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, + requiring a mapping to be added to the process's virtual address space. The page contents may also need to be loaded from a backing store such as a disk. + While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. + Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. + ||| + ) + .addTarget(commonPromTarget( + expr='irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Major page fault operations' + )) + .addTarget(commonPromTarget( + expr= + ||| + irate(node_vmstat_pgfault{%(nodeQuerySelector)s}[$__rate_interval]) + - + irate(node_vmstat_pgmajfault{%(nodeQuerySelector)s}[$__rate_interval]) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Minor page fault operations' + )), + + local memoryOOMkiller = + nodeTimeseries.new( + 'OOM Killer', + description=||| + Out Of Memory Killer is a process used by the Linux kernel when the system is running critically low on memory. 
+ This can happen when the kernel has allocated more memory than is available for its processes. + ||| + ) + .addTarget(commonPromTarget( + expr='increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval] offset -$__interval)' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='OOM killer invocations' + )), + + local memoryActiveInactive = + nodeTimeseries.new( + 'Memory Active / Inactive', + description=||| + Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. + Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Inactive_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Inactive', + )) + .addTarget(commonPromTarget( + expr='node_memory_Active_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Active', + )), + + local memoryActiveInactiveDetail = + nodeTimeseries.new( + 'Memory Active / Inactive Details', + description=||| + Inactive_file: File-backed memory on inactive LRU list. + Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). + Active_file: File-backed memory on active LRU list. + Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. 
+ ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Inactive_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Inactive_file', + )) + .addTarget(commonPromTarget( + expr='node_memory_Inactive_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Inactive_anon', + )) + .addTarget(commonPromTarget( + expr='node_memory_Active_file_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Active_file', + )) + .addTarget(commonPromTarget( + expr='node_memory_Active_anon_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Active_anon', + )), + + local memoryCommited = + nodeTimeseries.new( + 'Memory Commited', + description=||| + Committed_AS - Amount of memory presently allocated on the system. + CommitLimit - Amount of memory currently available to be allocated on the system. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Committed_AS_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Committed_AS' + )) + .addTarget(commonPromTarget( + expr='node_memory_CommitLimit_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='CommitLimit' + )), + local memorySharedAndMapped = + nodeTimeseries.new( + 'Memory Shared and Mapped', + description=||| + Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. + Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. + ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. + ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. 
+ ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Mapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Mapped' + )) + .addTarget(commonPromTarget( + expr='node_memory_Shmem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Shmem' + )) + .addTarget(commonPromTarget( + expr='node_memory_ShmemHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ShmemHugePages' + )) + .addTarget(commonPromTarget( + expr='node_memory_ShmemPmdMapped_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ShmemPmdMapped' + )), + + local memoryWriteAndDirty = + nodeTimeseries.new( + 'Memory Writeback and Dirty', + description=||| + Writeback: This refers to the memory that is currently being actively written back to the disk. + WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. + Dirty: This type of memory is waiting to be written back to the disk. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Writeback_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Writeback' + )) + .addTarget(commonPromTarget( + expr='node_memory_WritebackTmp_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='WritebackTmp' + )) + .addTarget(commonPromTarget( + expr='node_memory_Dirty_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Dirty' + )), + + local memoryVmalloc = + nodeTimeseries.new( + 'Memory Vmalloc', + description=||| + Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. 
This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. + + VmallocChunk: Largest contiguous block of vmalloc area which is free. + VmallocTotal: Total size of vmalloc memory area. + VmallocUsed: Amount of vmalloc area which is used. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_VmallocChunk_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='VmallocChunk' + )) + .addTarget(commonPromTarget( + expr='node_memory_VmallocTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='VmallocTotal' + )) + .addTarget(commonPromTarget( + expr='node_memory_VmallocUsed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='VmallocUsed' + )), + + local memorySlab = + nodeTimeseries.new('Memory Slab', + description=||| + Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. + + The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. + + SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. + SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
+ |||) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_SUnreclaim_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='SUnreclaim' + )) + .addTarget(commonPromTarget( + expr='node_memory_SReclaimable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='SReclaimable' + )), + + local memoryAnonymous = + nodeTimeseries.new( + 'Memory Anonymous', + description=||| + Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. + + This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. + + Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, + and from Memory Shared with other processes, + which refers to memory regions that can be accessed and modified by multiple processes. + + AnonHugePages: Memory in anonymous huge pages. + AnonPages: Memory in user pages not backed by files. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_AnonHugePages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='AnonHugePages' + )) + .addTarget(commonPromTarget( + expr='node_memory_AnonPages_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='AnonPages' + )), + + local memoryHugePagesCounter = + nodeTimeseries.new( + 'Memory HugePages Counter', + description=||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. 
+ + HugePages_Free: Huge pages in the pool that are not yet allocated. + HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. + HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. + ||| + ) + .addTarget(commonPromTarget( + expr='node_memory_HugePages_Free{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='HugePages_Free' + )) + .addTarget(commonPromTarget( + expr='node_memory_HugePages_Rsvd{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='HugePages_Rsvd' + )) + .addTarget(commonPromTarget( + expr='node_memory_HugePages_Surp{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='HugePages_Surp' + )), + local memoryHugePagesSize = + nodeTimeseries.new( + 'Memory HugePages Size', + description=||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_HugePages_Total{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Huge pages total size' + )) + .addTarget(commonPromTarget( + expr='node_memory_Hugepagesize_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Huge page size' + )), + local memoryDirectMap = + nodeTimeseries.new( + 'Memory Direct Map', + description=||| + Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. 
This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. + ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_DirectMap1G_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='DirectMap1G' + )) + .addTarget(commonPromTarget( + expr='node_memory_DirectMap2M_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='DirectMap2M' + )) + .addTarget(commonPromTarget( + expr='node_memory_DirectMap4k_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='DirectMap4k' + )), + + local memoryBounce = + nodeTimeseries.new( + 'Memory Bounce', + description=||| + Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. + + To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. + + Bounce: Memory used for block device bounce buffers. 
+ ||| + ) + .withUnits('decbytes') + .addTarget(commonPromTarget( + expr='node_memory_Bounce_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Bounce' + )), + local panelsGrid = + [ + c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 0 } }, + c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 0 } }, + { type: 'row', title: 'Vmstat', gridPos: { y: 25 } }, + memoryPagesInOut { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, + memoryPagesSwapInOut { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, + memoryPagesFaults { gridPos: { x: 0, w: 12, h: 8, y: 25 } }, + memoryOOMkiller { gridPos: { x: 12, w: 12, h: 8, y: 25 } }, + { type: 'row', title: 'Memstat', gridPos: { y: 50 } }, + memoryActiveInactive { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memoryActiveInactiveDetail { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + memoryCommited { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memorySharedAndMapped { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + memoryWriteAndDirty { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memoryVmalloc { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + memorySlab { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memoryAnonymous { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + memoryHugePagesCounter { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memoryHugePagesSize { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + memoryDirectMap { gridPos: { x: 0, w: 12, h: 8, y: 50 } }, + memoryBounce { gridPos: { x: 12, w: 12, h: 8, y: 50 } }, + ], + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode Memory' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes-memory.json'], + ) + .addLink(c.links.fleetDash) + .addLink(c.links.nodeDash) + .addLink(c.links.otherDashes) + 
.addAnnotations(c.annotations) + .addTemplates(templates) + .addPanels(panelsGrid) + else if platform == 'Darwin' then {}, + }, +} diff --git a/docs/node-mixin/dashboards/network.libsonnet b/docs/node-mixin/dashboards/network.libsonnet new file mode 100644 index 0000000000..ceacd13e42 --- /dev/null +++ b/docs/node-mixin/dashboards/network.libsonnet @@ -0,0 +1,796 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; + +{ + + new(config=null, platform=null):: { + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + local templates = c.templates, + local q = c.queries, + + local networkTrafficPanel = + commonPanels.networkTrafficGraph.new( + 'Network Traffic', + description=||| + Network interfaces utilisation by device and direction. + ||| + ) + .addTarget(commonPromTarget( + expr=q.networkReceiveBitsPerSec, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + expr=q.networkTransmitBitsPerSec, + legendFormat='{{device}} transmitted', + )), + + local networkPacketsPanel = + nodeTimeseries.new( + 'Packets', + description=||| + packets received: Number of good packets received by the interface. + For hardware interfaces counts all good packets received from the device by the host, including packets which host had to drop at various stages of processing (even in the driver). + + packets transmitted: Number of packets successfully transmitted. 
+ For hardware interfaces counts packets which host was able to successfully hand over to the device, + which does not necessarily mean that packets had been successfully transmitted out of the device, only that device acknowledged it copied them out of host memory. + + https://docs.kernel.org/networking/statistics.html + ||| + ) + .addTarget(commonPromTarget( + 'irate(node_network_receive_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + 'irate(node_network_transmit_packets_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)'), + + local networkErrorsPanel = + nodeTimeseries.new( + 'Network Errors', + description=||| + errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. + + errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. + + https://docs.kernel.org/networking/statistics.html + ||| + ) + .addTarget(commonPromTarget( + expr=q.networkReceiveErrorsPerSec, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + expr=q.networkTransmitErrorsPerSec, + legendFormat='{{device}} transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)'), + + local networkDropsPanel = + nodeTimeseries.new( + 'Dropped Packets', + description=||| + drops received: Number of packets received but not processed, e.g. 
due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together).
+
+ drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources.
+ https://docs.kernel.org/networking/statistics.html
+ |||
+ )
+ .addTarget(commonPromTarget(
+ expr=q.networkReceiveDropsPerSec,
+ legendFormat='{{device}} received',
+ ))
+ .addTarget(commonPromTarget(
+ expr=q.networkTransmitDropsPerSec,
+ legendFormat='{{device}} transmitted',
+ ))
+ .withDecimals(1)
+ .withUnits('pps')
+ .withNegativeYByRegex('transmit')
+ .withAxisLabel('out(-) | in(+)'),
+ local networkCompressedPanel =
+ nodeTimeseries.new(
+ 'Compressed Packets',
+ description=|||
+ compressed received:
+ Number of correctly received compressed packets. This counter is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP).
+
+ compressed transmitted:
+ Number of transmitted compressed packets. This counter is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP).
+ + https://docs.kernel.org/networking/statistics.html + ||| + ) + .addTarget(commonPromTarget( + 'irate(node_network_receive_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + 'irate(node_network_transmit_compressed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)'), + + local networkMulticastPanel = + nodeTimeseries.new( + 'Multicast Packets', + description=||| + Multicast packets received and transmitted. + ||| + ) + .addTarget(commonPromTarget( + 'irate(node_network_receive_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + 'irate(node_network_transmit_multicast_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmit'), + + local networkFifoPanel = + nodeTimeseries.new( + 'Network FIFO', + description=||| + Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. + It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. + Packets are stored in the FIFO buffer until they can be transmitted or processed further. 
+ ||| + ) + .addTarget(commonPromTarget( + 'irate(node_network_receive_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + 'irate(node_network_transmit_fifo_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}} transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)'), + + local networkNFConntrack = + nodeTimeseries.new( + 'NF Conntrack', + description=||| + NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, + enforce firewall rules, perform NAT, and manage network address/port translation. + ||| + ) + .addTarget(commonPromTarget( + 'node_nf_conntrack_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='NF conntrack entries', + )) + .addTarget(commonPromTarget( + 'node_nf_conntrack_entries_limit{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='NF conntrack limits', + )) + .withFillOpacity(0), + + local networkSoftnetPanel = + nodeTimeseries.new( + 'Softnet Packets', + description=||| + Softnet packets are received by the network and queued for processing by the kernel's networking stack. + Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. 
+ |||
+ )
+ .addTarget(commonPromTarget(
+ 'irate(node_softnet_processed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='CPU {{cpu}} processed',
+ ))
+ .addTarget(commonPromTarget(
+ 'irate(node_softnet_dropped_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='CPU {{cpu}} dropped',
+ ))
+ .withDecimals(1)
+ .withUnits('pps')
+ .withNegativeYByRegex('dropped')
+ .withAxisLabel('Dropped(-) | Processed(+)'),
+
+ local networkSoftnetSqueezePanel =
+ nodeTimeseries.new(
+ 'Softnet Out of Quota',
+ description=|||
+ "Softnet Out of Quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity.
+ This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred.
+ ||| + ) + .addTarget(commonPromTarget( + 'irate(node_softnet_times_squeezed_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='CPU {{cpu}} out of quota', + )) + .withDecimals(1) + .withUnits('pps'), + + local networkInterfacesTable = + nodePanels.table.new( + title='Network Interfaces Overview' + ) + // "Value #A" + .addTarget(commonPromTarget( + expr='node_network_up{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "Value #B" + .addTarget(commonPromTarget( + expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "Value #C" + .addTarget(commonPromTarget( + expr=q.networkTransmitBitsPerSec, + format='table', + instant=true, + )) + // "Value #D" + .addTarget(commonPromTarget( + expr=q.networkReceiveBitsPerSec, + format='table', + instant=true, + )) + // "Value #E" + .addTarget(commonPromTarget( + expr='node_arp_entries{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "Value #F" + .addTarget(commonPromTarget( + expr='node_network_mtu_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "Value #G" + .addTarget(commonPromTarget( + expr='node_network_speed_bytes{%(nodeQuerySelector)s} * 8' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "Value #H" + .addTarget(commonPromTarget( + expr='node_network_transmit_queue_length{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + // "VALUE #I" + .addTarget(commonPromTarget( + expr='node_network_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + format='table', + instant=true, + )) + .withTransform() + 
.joinByField(field='device') + .filterFieldsByName('device|address|duplex|Value.+') + .organize( + excludeByName={ + 'Value #I': true, + }, + renameByName= + { + device: 'Interface', + address: 'Address', + duplex: 'Duplex', + 'Value #A': 'Up', + 'Value #B': 'Carrier', + 'Value #C': 'Transmit', + 'Value #D': 'Receive', + 'Value #E': 'ARP entries', + 'Value #F': 'MTU', + 'Value #G': 'Speed', + 'Value #H': 'Queue length', + } + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'Speed', + }, + properties=[ + { + id: 'unit', + value: 'bps', + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'Carrier|Up', + }, + properties=[ + { + id: 'custom.displayMode', + value: 'color-text', + }, + { + id: 'mappings', + value: [ + { + type: 'value', + options: { + '0': { + text: 'Down', + color: 'light-red', + index: 1, + }, + '1': { + text: 'Up', + color: 'light-green', + index: 0, + }, + }, + }, + ], + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: 'Transmit|Receive', + }, + properties=[ + { + id: 'unit', + value: 'bps', + }, + { + id: 'custom.displayMode', + value: 'gradient-gauge', + }, + { + id: 'color', + value: { + mode: 'continuous-BlYlRd', + }, + }, + { + id: 'max', + value: 1000 * 1000 * 100, + }, + ] + ) + , + + local networkOperStatus = + nodeTimeseries.new( + title='Network Interfaces Carrier Status', + description='Network Interfaces Carrier Status', + ) + .withColor(mode='palette-classic') + .withFillOpacity(100) + .withLegend(mode='list') + .addTarget(commonPromTarget( + expr='node_network_carrier{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='{{device}}' + )) + + { + maxDataPoints: 100, + type: 'status-history', + fieldConfig+: { + defaults+: { + mappings+: [ + { + type: 'value', + options: { + '1': { + text: 'Up', + color: 'light-green', + index: 1, + }, + }, + }, + { + type: 'value', + options: { + '0': { + text: 'Down', + color: 'light-red', + index: 0, + }, + }, + }, + 
+ ], + }, + }, + }, + // https://github.com/prometheus/node_exporter/pull/2346/files#diff-3699c850869aecf912f8e8272958b556913fc266534206833a5dcb7d6cca3610 + local networkSockstatTCP = + nodeTimeseries.new( + title='Sockets TCP', + description=||| + TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. + + Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. + ||| + ) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_alloc{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Allocated' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv6 In use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 In use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_orphan{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Orphan sockets' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_tw{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Time wait' + )), + + local networkSockstatUDP = + nodeTimeseries.new( + title='Sockets UDP', + description=||| + UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. + Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. 
+ ||| + ) + .addTarget(commonPromTarget( + expr='node_sockstat_UDPLITE_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 UDPLITE in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_UDP_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 UDP in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_UDPLITE6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv6 UDPLITE in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_UDP6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv6 UDP in use' + )), + + local networkSockstatOther = + nodeTimeseries.new( + title='Sockets Other', + description=||| + FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. + + RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. 
+ ||| + ) + .addTarget(commonPromTarget( + expr='node_sockstat_FRAG_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 Frag sockets in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_FRAG6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv6 Frag sockets in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_RAW_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 Raw sockets in use' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_RAW6_inuse{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv6 Raw sockets in use' + )), + + + local networkSockstatMemory = + nodeTimeseries.new( + title='Sockets Memory', + description=||| + Memory currently in use for sockets. + ||| + ) + .withMaxDataPoints(100) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Memory pages allocated for TCP sockets' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_UDP_mem{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Memory pages allocated for UDP sockets' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_TCP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Memory bytes allocated for TCP sockets' + )) + .addTarget(commonPromTarget( + expr='node_sockstat_UDP_mem_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Memory bytes allocated for UDP sockets' + )) + .addOverride( + matcher={ + id: 'byRegexp', + options: '/bytes/', + }, + properties=[ + { + id: 'unit', + value: 'bytes', + }, + { + id: 'custom.drawStyle', + value: 'lines', + }, + { + id: 'custom.drawStyle', + value: 'bars', + }, + { + id: 
'custom.stacking', + value: { + mode: 'normal', + group: 'A', + }, + }, + ] + ), + + local networkSockstatAll = + nodeTimeseries.new( + title='Sockets in use', + description='Number of sockets currently in use.', + ) + .addTarget(commonPromTarget( + expr='node_sockstat_sockets_used{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='IPv4 sockets in use' + )), + + local networkNetstatIP = + nodeTimeseries.new( + title='IP octets', + description='Rate of IP octets received and transmitted.' + ) + .withUnits('oct/s') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_netstat_IpExt_InOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Octets received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_IpExt_OutOctets{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Octets transmitted' + )), + + + local networkNetstatTCP = + nodeTimeseries.new( + title='TCP segments', + description='Rate of TCP segments received and transmitted.' + ) + .withUnits('seg/s') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_netstat_Tcp_InSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='TCP received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Tcp_OutSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='TCP transmitted' + )), + + local networkNetstatTCPerrors = + nodeTimeseries.new( + title='TCP errors rate', + description='Rate of TCP errors.' 
+ )
+ .withUnits('err/s')
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_TcpExt_ListenOverflows{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP overflow'
+ ))
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_TcpExt_ListenDrops{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP ListenDrops - SYNs to LISTEN sockets ignored'
+ ))
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_TcpExt_TCPSynRetrans{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP SYN retransmits'
+ ))
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_Tcp_RetransSegs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP retransmitted segments, containing one or more previously transmitted octets'
+ ))
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_Tcp_InErrs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP received with errors'
+ ))
+ .addTarget(commonPromTarget(
+ expr='irate(node_netstat_Tcp_OutRsts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector },
+ legendFormat='TCP segments sent with RST flag'
+ )),
+
+ local networkNetstatUDP =
+ nodeTimeseries.new(
+ title='UDP datagrams',
+ description='Rate of UDP datagrams received and transmitted.'
+ ) + .withUnits('dat/s') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP transmitted' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_InDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_OutDatagrams{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 transmitted' + )), + + local networkNetstatUDPerrors = + nodeTimeseries.new( + title='UDP errors rate', + description='Rate of UDP datagrams received and transmitted with errors.' 
+ ) + .withUnits('err/s') + .addTarget(commonPromTarget( + expr='irate(node_netstat_UdpLite_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDPLite InErrors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP InErrors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 InErrors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP NoPorts' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_NoPorts{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 NoPorts' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP receive buffer errors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_RcvbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 receive buffer errors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP send buffer errors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Udp6_SndbufErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='UDP6 send buffer errors' + )), + + + local networkNetstatICMP = + nodeTimeseries.new( + title='ICMP messages', 
+ description="Rate of ICMP messages, like 'ping', received and transmitted." + ) + .withUnits('msg/s') + .withNegativeYByRegex('transmit') + .withAxisLabel('out(-) | in(+)') + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP transmitted' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp6_InMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP6 received' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp6_OutMsgs{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP6 transmitted' + )), + + local networkNetstatICMPerrors = + nodeTimeseries.new( + title='ICMP errors rate', + description='Rate of ICMP messages received and transmitted with errors.' 
+ ) + .withUnits('err/s') + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP Errors' + )) + .addTarget(commonPromTarget( + expr='irate(node_netstat_Icmp6_InErrors{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='ICMP6 Errors' + )), + + + local rows = + [ + row.new('Network') + .addPanel(networkInterfacesTable { span: 12 }) + .addPanel(networkTrafficPanel { span: 6 }) + .addPanel(networkOperStatus { span: 6 }) + .addPanel(networkErrorsPanel { span: 6 }) + .addPanel(networkDropsPanel { span: 6 }) + .addPanel(networkPacketsPanel { span: 6 }) + .addPanel(networkMulticastPanel { span: 6 }) + .addPanel(networkFifoPanel { span: 6 }) + .addPanel(networkCompressedPanel { span: 6 }) + .addPanel(networkNFConntrack { span: 6 }) + .addPanel(networkSoftnetPanel { span: 6 }) + .addPanel(networkSoftnetSqueezePanel { span: 6 }), + row.new('Network Sockets') + .addPanel(networkSockstatAll { span: 12 }) + .addPanel(networkSockstatTCP { span: 6 }) + .addPanel(networkSockstatUDP { span: 6 }) + .addPanel(networkSockstatMemory { span: 6 }) + .addPanel(networkSockstatOther { span: 6 }), + + row.new('Network Netstat') + .addPanel(networkNetstatIP { span: 12 }) + .addPanel(networkNetstatTCP { span: 6 }) + .addPanel(networkNetstatTCPerrors { span: 6 }) + .addPanel(networkNetstatUDP { span: 6 }) + .addPanel(networkNetstatUDPerrors { span: 6 }) + .addPanel(networkNetstatICMP { span: 6 }) + .addPanel(networkNetstatICMPerrors { span: 6 }), + ], + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode Network' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + 
uid=config.grafanaDashboardIDs['nodes-network.json'] + ) + .addLink(c.links.fleetDash) + .addLink(c.links.nodeDash) + .addLink(c.links.otherDashes) + .addAnnotations(c.annotations) + .addTemplates(templates) + .addRows(rows) + else if platform == 'Darwin' then {}, + }, +} diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 898c912d56..a00eb1b9f7 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -1,7 +1,19 @@ { - local nodemixin = import '../lib/prom-mixin.libsonnet', + local nodemixin = import './prom-mixin.libsonnet', + local cpu = import './cpu.libsonnet', + local system = import './system.libsonnet', + local memory = import './memory.libsonnet', + local disk = import './disk.libsonnet', + local network = import './network.libsonnet', + local fleet = import './fleet.libsonnet', + grafanaDashboards+:: { 'nodes.json': nodemixin.new(config=$._config, platform='Linux').dashboard, 'nodes-darwin.json': nodemixin.new(config=$._config, platform='Darwin').dashboard, + 'nodes-system.json': system.new(config=$._config, platform='Linux').dashboard, + 'nodes-memory.json': memory.new(config=$._config, platform='Linux').dashboard, + 'nodes-network.json': network.new(config=$._config, platform='Linux').dashboard, + 'nodes-disk.json': disk.new(config=$._config, platform='Linux').dashboard, + 'nodes-fleet.json': fleet.new(config=$._config, platform='Linux').dashboard, }, } diff --git a/docs/node-mixin/dashboards/prom-mixin.libsonnet b/docs/node-mixin/dashboards/prom-mixin.libsonnet new file mode 100644 index 0000000000..a562844073 --- /dev/null +++ b/docs/node-mixin/dashboards/prom-mixin.libsonnet @@ -0,0 +1,180 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = 
grafana.graphPanel; +local statPanel = grafana.statPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; +local nodeTemplates = common.templates; + +{ + + new(config=null, platform=null):: { + + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + local templates = c.templates, + local q = c.queries, + + local uptimePanel = + commonPanels.uptimeStat.new() + .addTarget(commonPromTarget(expr=q.uptime)), + + local cpuCountPanel = + commonPanels.infoStat.new('CPU Count') + .addTarget(commonPromTarget(expr=q.cpuCount)), + + local memoryTotalPanel = + commonPanels.infoStat.new('Memory Total') + .addTarget(commonPromTarget(expr=q.memoryTotal)) + .withUnits('bytes') + .withDecimals(0), + + local osPanel = + commonPanels.infoStat.new('OS') + .addTarget(commonPromTarget( + expr=q.osInfo, format='table' + )) { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, + + local nodeNamePanel = + commonPanels.infoStat.new('Hostname') + .addTarget(commonPromTarget( + expr=q.nodeInfo, format='table' + )) + { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, + + local kernelVersionPanel = + + commonPanels.infoStat.new('Kernel version') + .addTarget(commonPromTarget( + expr=q.nodeInfo, format='table' + )) + { options+: { reduceOptions+: { fields: '/^release$/' } } } + , + + local totalSwapPanel = + commonPanels.infoStat.new('Total swap') + .addTarget(commonPromTarget( + expr=q.memorySwapTotal + )) + .withUnits('bytes') + .withDecimals(0), + + local totalRootFSPanel = + commonPanels.infoStat.new('Root mount size') + .addTarget(commonPromTarget( + expr=q.fsSizeTotalRoot, + )) + .withUnits('bytes') + .withDecimals(0), + + local networkTrafficPanel = + commonPanels.networkTrafficGraph.new( + 'Network Traffic', description='Network transmitted 
and received (bits/s)', + ) + .addTarget(commonPromTarget( + expr=q.networkReceiveBitsPerSec, + legendFormat='{{device}} received', + )) + .addTarget(commonPromTarget( + expr=q.networkTransmitBitsPerSec, + legendFormat='{{device}} transmitted', + )), + + local networkErrorsDropsPanel = + nodePanels.timeseries.new( + 'Network Errors and Dropped Packets', + description=||| + errors received: Total number of bad packets received on this network device. This counter must include events counted by rx_length_errors, rx_crc_errors, rx_frame_errors and other errors not otherwise counted. + + errors transmitted: Total number of transmit problems. This counter must include events counter by tx_aborted_errors, tx_carrier_errors, tx_fifo_errors, tx_heartbeat_errors, tx_window_errors and other errors not otherwise counted. + + drops received: Number of packets received but not processed, e.g. due to lack of resources or unsupported protocol. For hardware interfaces this counter may include packets discarded due to L2 address filtering but should not include packets dropped by the device due to buffer exhaustion which are counted separately in rx_missed_errors (since procfs folds those two counters together). + + drops transmitted: Number of packets dropped on their way to transmission, e.g. due to lack of resources. 
+ + https://docs.kernel.org/networking/statistics.html + ||| + ) + .addTarget(commonPromTarget( + expr=q.networkReceiveErrorsPerSec, + legendFormat='{{device}} errors received', + )) + .addTarget(commonPromTarget( + expr=q.networkTransmitErrorsPerSec, + legendFormat='{{device}} errors transmitted', + )) + .addTarget(commonPromTarget( + expr=q.networkReceiveDropsPerSec, + legendFormat='{{device}} drops received', + )) + .addTarget(commonPromTarget( + expr=q.networkTransmitDropsPerSec, + legendFormat='{{device}} drops transmitted', + )) + .withDecimals(1) + .withUnits('pps') + .withNegativeYByRegex('transmitted') + .withAxisLabel('out(-) | in(+)'), + + + local panelsGrid = + [ + // use negative gravity effect, max w=24, default h=8 + { type: 'row', title: 'Overview' }, + uptimePanel { gridPos: { x: 0, w: 6, h: 2 } }, + nodeNamePanel { gridPos: { x: 6, w: 6, h: 2 } }, + kernelVersionPanel { gridPos: { x: 12, w: 6, h: 2 } }, + osPanel { gridPos: { x: 18, w: 6, h: 2 } }, + cpuCountPanel { gridPos: { x: 0, w: 6, h: 2 } }, + memoryTotalPanel { gridPos: { x: 6, w: 6, h: 2 } }, + totalSwapPanel { gridPos: { x: 12, w: 6, h: 2 } }, + totalRootFSPanel { gridPos: { x: 18, w: 6, h: 2 } }, + { type: 'row', title: 'CPU' } { gridPos: { y: 25 } }, + c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6, y: 25 } }, + c.panelsWithTargets.idleCPU { gridPos: { x: 6, w: 12, h: 6, y: 25 } }, + c.panelsWithTargets.systemLoad { gridPos: { x: 18, w: 6, h: 6, y: 25 } }, + { type: 'row', title: 'Memory' } { gridPos: { y: 50 } }, + c.panelsWithTargets.memoryGauge { gridPos: { x: 0, w: 6, h: 6, y: 50 } }, + c.panelsWithTargets.memoryGraph { gridPos: { x: 6, w: 18, h: 6, y: 50 } }, + { type: 'row', title: 'Disk' } { gridPos: { y: 75 } }, + c.panelsWithTargets.diskIO { gridPos: { x: 0, w: 12, h: 8, y: 75 } }, + c.panelsWithTargets.diskSpaceUsage { gridPos: { x: 12, w: 12, h: 8, y: 75 } }, + { type: 'row', title: 'Network' } { gridPos: { y: 100 } }, + networkTrafficPanel { gridPos: { x: 0, 
w: 12, h: 8, y: 100 } }, + networkErrorsDropsPanel { gridPos: { x: 12, w: 12, h: 8, y: 100 } }, + ], + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode Overview' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes.json'], + ) + .addLink(c.links.fleetDash) + .addLink(c.links.otherDashes) + .addAnnotations(c.annotations) + .addTemplates(templates) + .addPanels(panelsGrid) + else if platform == 'Darwin' then + dashboard.new( + '%sMacOS' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes-darwin.json'], + ) + .addTemplates(templates) + .addPanels(panelsGrid), + + }, +} diff --git a/docs/node-mixin/dashboards/system.libsonnet b/docs/node-mixin/dashboards/system.libsonnet new file mode 100644 index 0000000000..e1bd58d759 --- /dev/null +++ b/docs/node-mixin/dashboards/system.libsonnet @@ -0,0 +1,150 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +local common = import '../lib/common.libsonnet'; + +{ + + new(config=null, platform=null):: { + local c = common.new(config=config, platform=platform), + local commonPromTarget = c.commonPromTarget, + local templates = c.templates, + local 
q = c.queries, + + local cpuUsageModes = + nodeTimeseries.new( + 'CPU Usage', + description=||| + System: Processes executing in kernel mode. + User: Normal processes executing in user mode. + Nice: Niced processes executing in user mode. + Idle: Waiting for something to happen. + Iowait: Waiting for I/O to complete. + Irq: Servicing interrupts. + Softirq: Servicing softirqs. + Steal: Time spent in other operating systems when running in a virtualized environment. + ||| + ) + .withStacking('normal') + .withUnits('percent') + .withFillOpacity(100) + .withMax(100) + .withMin(0) + .addTarget(commonPromTarget( + expr=q.cpuUsageModes, + legendFormat='{{mode}}', + )), + local timeZoneOffset = + commonPanels.infoStat.new( + 'Timezone', + description='Timezone set on instance.' + ) + .addTarget(commonPromTarget( + expr=q.node_time_zone_offset_seconds, format='table' + )) + { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, + local timeSyncDrift = + nodeTimeseries.new( + 'Time Synchronized Drift', + description=||| + Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. + ||| + ) + .withUnits('s') + .addTarget(commonPromTarget( + expr=q.node_timex_estimated_error_seconds, + legendFormat='Estimated error in seconds', + )) + .addTarget(commonPromTarget( + expr=q.node_timex_offset_seconds, + legendFormat='Time offset in between local system and reference clock', + )) + .addTarget(commonPromTarget( + expr=q.node_timex_maxerror_seconds, + legendFormat='Maximum error in seconds' + )), + + local timeSynchronizedStatus = + nodeTimeseries.new( + 'Time Synchronized Status', + description='Status of time synchronization.' 
+ ) + .withColor(mode='palette-classic') + .withFillOpacity(75) + .withLegend(show=false) + { + maxDataPoints: 100, + type: 'status-history', + fieldConfig+: { + defaults+: { + mappings+: [ + { + type: 'value', + options: { + '1': { + text: 'In sync', + color: 'light-green', + index: 1, + }, + }, + }, + { + type: 'value', + options: { + '0': { + text: 'Not in sync', + color: 'light-yellow', + index: 0, + }, + }, + }, + + ], + }, + }, + } + .addTarget(commonPromTarget( + expr=q.node_timex_sync_status, + legendFormat='Sync status', + )), + + local panelsGrid = + [ + //use negative gravity(skip y), max w=24, default h should be '6'. + c.panelsWithTargets.cpuStatPanel { gridPos: { x: 0, w: 6, h: 6 } }, + c.panelsWithTargets.idleCPU { gridPos: { x: 6, h: 6, w: 9 } }, + cpuUsageModes { gridPos: { x: 15, h: 6, w: 9 } }, + //pseudorow y:25 + c.panelsWithTargets.systemLoad { gridPos: { x: 0, h: 6, w: 12, y: 25 } }, + c.panelsWithTargets.systemContextSwitches { gridPos: { x: 12, h: 6, w: 12, y: 25 } }, + { type: 'row', title: 'Time', gridPos: { x: 0, w: 24, y: 75 } }, + timeZoneOffset { gridPos: { x: 0, h: 3, w: 3, y: 75 } }, + timeSynchronizedStatus { gridPos: { x: 3, h: 3, w: 21, y: 75 } }, + timeSyncDrift { gridPos: { x: 0, h: 6, w: 24, y: 80 } }, + ], + + dashboard: if platform == 'Linux' then + dashboard.new( + '%sNode CPU and System' % config { nodeQuerySelector: c.nodeQuerySelector }.dashboardNamePrefix, + time_from=config.dashboardInterval, + tags=(config.dashboardTags), + timezone=config.dashboardTimezone, + refresh=config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=config.grafanaDashboardIDs['nodes-system.json'], + ) + .addLink(c.links.fleetDash) + .addLink(c.links.nodeDash) + .addLink(c.links.otherDashes) + .addAnnotations(c.annotations) + .addTemplates(templates) + .addPanels(panelsGrid) + else if platform == 'Darwin' then {}, + }, +} diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 
65e96dd8dc..9de0c4103a 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -5,8 +5,6 @@ local prometheus = grafana.prometheus; local template = grafana.template; local graphPanel = grafana.graphPanel; -local c = import '../config.libsonnet'; - local datasourceTemplate = { current: { text: 'default', @@ -25,6 +23,7 @@ local datasourceTemplate = { local CPUUtilisation = graphPanel.new( 'CPU Utilisation', + description='Total CPU utilisation percent.', datasource='$datasource', span=6, format='percentunit', @@ -38,6 +37,7 @@ local CPUSaturation = // average relates to the "CPU saturation" in the title. graphPanel.new( 'CPU Saturation (Load1 per CPU)', + description='System load average over the last minute. A measurement of how many processes are waiting for CPU cycles. The value is as a percent compared to the number of CPU cores for the node.', datasource='$datasource', span=6, format='percentunit', @@ -49,6 +49,7 @@ local CPUSaturation = local memoryUtilisation = graphPanel.new( 'Memory Utilisation', + description='Total memory utilisation in percent.', datasource='$datasource', span=6, format='percentunit', @@ -60,6 +61,7 @@ local memoryUtilisation = local memorySaturation = graphPanel.new( 'Memory Saturation (Major Page Faults)', + description='Rate of major memory page faults.', datasource='$datasource', span=6, format='rds', @@ -71,6 +73,7 @@ local memorySaturation = local networkUtilisation = graphPanel.new( 'Network Utilisation (Bytes Receive/Transmit)', + description='Network Utilisation (Bytes Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -85,6 +88,7 @@ local networkUtilisation = local networkSaturation = graphPanel.new( 'Network Saturation (Drops Receive/Transmit)', + description='Network Saturation (Drops Receive/Transmit)', datasource='$datasource', span=6, format='Bps', @@ -99,6 +103,7 @@ local networkSaturation = local diskIOUtilisation = graphPanel.new( 'Disk IO Utilisation', 
+ description='Disk total IO seconds.', datasource='$datasource', span=6, format='percentunit', @@ -110,6 +115,7 @@ local diskIOUtilisation = local diskIOSaturation = graphPanel.new( 'Disk IO Saturation', + description='Disk saturation (weighted seconds spent, 1 second rate)', datasource='$datasource', span=6, format='percentunit', @@ -121,6 +127,7 @@ local diskIOSaturation = local diskSpaceUtilisation = graphPanel.new( 'Disk Space Utilisation', + description='Total disk utilisation percent', datasource='$datasource', span=12, format='percentunit', @@ -146,11 +153,12 @@ local diskSpaceUtilisation = dashboard.new( '%sUSE Method / Node' % $._config.dashboardNamePrefix, - time_from='now-1h', + time_from=$._config.dashboardInterval, tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' + timezone=$._config.dashboardTimezone, + refresh=$._config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=$._config.grafanaDashboardIDs['node-rsrc-use.json'], ) .addTemplate(datasourceTemplate) .addTemplate($._clusterTemplate) @@ -211,11 +219,12 @@ local diskSpaceUtilisation = 'node-cluster-rsrc-use.json': dashboard.new( '%sUSE Method / Cluster' % $._config.dashboardNamePrefix, - time_from='now-1h', + time_from=$._config.dashboardInterval, tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' + timezone=$._config.dashboardTimezone, + refresh=$._config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=$._config.grafanaDashboardIDs['node-cluster-rsrc-use.json'], ) .addTemplate(datasourceTemplate) .addTemplate($._clusterTemplate) @@ -322,11 +331,12 @@ local diskSpaceUtilisation = 'node-multicluster-rsrc-use.json': dashboard.new( '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix, - time_from='now-1h', + time_from=$._config.dashboardInterval, tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' + 
timezone=$._config.dashboardTimezone, + refresh=$._config.dashboardRefresh, + graphTooltip='shared_crosshair', + uid=$._config.grafanaDashboardIDs['node-multicluster-rsrc-use.json'], ) .addTemplate(datasourceTemplate) .addRow( diff --git a/docs/node-mixin/lib/common.libsonnet b/docs/node-mixin/lib/common.libsonnet new file mode 100644 index 0000000000..66d45fd5c5 --- /dev/null +++ b/docs/node-mixin/lib/common.libsonnet @@ -0,0 +1,707 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local nodePanels = import '../lib/panels/panels.libsonnet'; +local commonPanels = import '../lib/panels/common/panels.libsonnet'; +local nodeTimeseries = nodePanels.timeseries; +{ + + new(config=null, platform=null):: { + + local c = self, + + local labelsToRegexSelector(labels) = + std.join(',', ['%s=~"$%s"' % [label, label] for label in labels]), + local labelsToLegend(labels) = + std.join('/', ['{{%s}}' % [label] for label in labels]), + + local labelsToURLvars(labels, prefix) = + std.join('&', ['var-%s=${%s%s}' % [label, prefix, label] for label in labels]), + // export + labelsToLegend:: labelsToLegend, + labelsToURLvars:: labelsToURLvars, + // add to all queries but not templates + local nodeQuerySelector = labelsToRegexSelector(std.split(config.groupLabels + ',' + config.instanceLabels, ',')), + nodeQuerySelector:: nodeQuerySelector, + + // common templates + local prometheusDatasourceTemplate = { + current: { + text: 'default', + value: 'default', + }, + hide: 0, + label: 'Data Source', + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + + local chainLabelsfold(prev, label) = { + chain: + if std.length(prev) > 0 + then + [[label] + prev.chain[0]] + prev.chain + else + [[label]], + }, + + local chainLabels(labels) = + [ + { 
+ label: l[0:1][0], + chainSelector: labelsToRegexSelector(std.reverse(l[1:])), + } + for l in std.reverse(std.foldl(chainLabelsfold, labels, init={}).chain) + ], + + local groupTemplates = + [ + template.new( + name=label.label, + label=label.label, + datasource='$datasource', + query='', + current='', + refresh=2, + includeAll=true, + // do not use .*, will get series without such label at all when ALL is selected, ignoring nodeExporterSelector results + allValues=null, + multi=true, + sort=1 + ) + { + query: if platform == 'Darwin' then 'label_values(node_uname_info{sysname="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector } + else 'label_values(node_uname_info{sysname!="Darwin", %(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: label.chainSelector }, + } + for label in chainLabels(std.split(config.groupLabels, ',')) + ], + + local instanceTemplates = + [ + template.new( + label.label, + '$datasource', + 'label_values(node_uname_info{%(nodeExporterSelector)s, %(chainSelector)s}, %(label)s)' % config { label: label.label, chainSelector: labelsToRegexSelector(std.split(config.groupLabels, ',')) + ',' + label.chainSelector }, + sort=1, + refresh='time', + label=label.label, + ) + for label in chainLabels(std.split(config.instanceLabels, ',')) + ], + + // return common templates + templates: [prometheusDatasourceTemplate] + groupTemplates + instanceTemplates, + // return templates where instance select is not required + groupDashboardTemplates: [prometheusDatasourceTemplate] + groupTemplates, + + local rebootAnnotation = { + datasource: { + type: 'prometheus', + uid: '$datasource', + }, + enable: true, + hide: true, + expr: 'node_boot_time_seconds{%(nodeQuerySelector)s}*1000 > $__from < $__to' % config { nodeQuerySelector: nodeQuerySelector }, + name: 'Reboot', + iconColor: 'light-orange', + tagKeys: config.instanceLabels, + 
textFormat: '', + titleFormat: 'Reboot', + useValueForTime: 'on', + }, + local memoryOOMkillerAnnotation = { + datasource: { + type: 'prometheus', + uid: '$datasource', + }, + enable: true, + hide: true, + expr: 'increase(node_vmstat_oom_kill{%(nodeQuerySelector)s}[$__interval])' % config { nodeQuerySelector: nodeQuerySelector }, + name: 'OOMkill', + iconColor: 'light-purple', + tagKeys: config.instanceLabels, + textFormat: '', + titleFormat: 'OOMkill', + }, + local newKernelAnnotation = { + datasource: { + type: 'prometheus', + uid: '$datasource', + }, + enable: true, + hide: true, + expr: ||| + changes( + sum by (%(instanceLabels)s) ( + group by (%(instanceLabels)s,release) (node_uname_info{%(nodeQuerySelector)s}) + ) + [$__interval:1m] offset -$__interval) > 1 + ||| % config { nodeQuerySelector: nodeQuerySelector }, + name: 'Kernel update', + iconColor: 'light-blue', + tagKeys: config.instanceLabels, + textFormat: '', + titleFormat: 'Kernel update', + step: '5m', // must be larger than possible scrape periods + }, + // return common annotations + annotations: [rebootAnnotation, memoryOOMkillerAnnotation, newKernelAnnotation], + + // return common prometheus target (with project defaults) + commonPromTarget( + expr=null, + intervalFactor=1, + datasource='$datasource', + legendFormat=null, + format='timeseries', + instant=null, + hide=null, + interval=null, + ):: + prometheus.target( + expr=expr, + intervalFactor=intervalFactor, + datasource=datasource, + legendFormat=legendFormat, + format=format, + instant=instant, + hide=hide, + interval=interval + ), + // link to fleet panel + links:: { + fleetDash:: grafana.link.dashboards( + asDropdown=false, + title='Back to Node Fleet Overview', + tags=[], + includeVars=false, + keepTime=true, + url='d/' + config.grafanaDashboardIDs['nodes-fleet.json'] + ) { type: 'link', icon: 'dashboard' }, + nodeDash:: grafana.link.dashboards( + asDropdown=false, + title='Back to Node Overview', + tags=[], + includeVars=true, + 
keepTime=true, + url='d/' + config.grafanaDashboardIDs['nodes.json'] + ) { type: 'link', icon: 'dashboard' }, + otherDashes:: grafana.link.dashboards( + asDropdown=true, + title='Other Node Dashboards', + includeVars=true, + keepTime=true, + tags=(config.dashboardTags), + ), + // used in fleet table + instanceDataLinkForTable:: { + title: 'Drill down to instance ${__data.fields.%s}' % std.split(config.instanceLabels, ',')[0], + url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__data.fields.') + '&${__url_time_range}&var-datasource=${datasource}', + }, + // used in ts panels + instanceDataLink:: { + title: 'Drill down to instance ${__field.labels.%s}' % std.split(config.instanceLabels, ',')[0], + url: 'd/' + config.grafanaDashboardIDs['nodes.json'] + '?' + labelsToURLvars(std.split(config.instanceLabels, ','), prefix='__field.labels.') + '&${__url_time_range}&var-datasource=${datasource}', + }, + }, + // return common queries that could be used in multiple dashboards + queries:: { + systemLoad1:: 'avg by (%(instanceLabels)s) (node_load1{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, + systemLoad5:: 'avg by (%(instanceLabels)s) (node_load5{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, + systemLoad15:: 'avg by (%(instanceLabels)s) (node_load15{%(nodeQuerySelector)s})' % config { nodeQuerySelector: nodeQuerySelector }, + uptime:: 'time() - node_boot_time_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + cpuCount:: 'count by (%(instanceLabels)s) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"})' % config { nodeQuerySelector: nodeQuerySelector }, + cpuUsage:: + ||| + (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s))) + - + avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, 
mode)(irate(node_cpu_seconds_total{mode='idle',%(nodeQuerySelector)s}[$__rate_interval])))) * 100) + / + count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(nodeQuerySelector)s}) by (cpu, %(instanceLabels)s)) + ||| % config { nodeQuerySelector: nodeQuerySelector }, + cpuUsageModes:: + ||| + sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval])) + / on(%(instanceLabels)s) + group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(nodeQuerySelector)s}[$__rate_interval]))) * 100 + ||| % config { nodeQuerySelector: nodeQuerySelector }, + cpuUsagePerCore:: + ||| + ( + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeQuerySelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(nodeQuerySelector)s, mode="idle"}) + ) * 100 + ||| % config { nodeQuerySelector: nodeQuerySelector }, + memoryTotal:: 'node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + memorySwapTotal:: 'node_memory_SwapTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + memoryUsage:: + ||| + 100 - + ( + avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}) / + avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(nodeQuerySelector)s}) + * 100 + ) + ||| % config { nodeQuerySelector: nodeQuerySelector }, + + process_max_fds:: 'process_max_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + process_open_fds:: 'process_open_fds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + + fsSizeTotalRoot:: 'node_filesystem_size_bytes{%(nodeQuerySelector)s, mountpoint="/",fstype!="rootfs"}' % config { nodeQuerySelector: nodeQuerySelector }, + osInfo:: 'node_os_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + nodeInfo:: 
'node_uname_info{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_disk_reads_completed_total:: 'irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + node_disk_writes_completed_total:: 'irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + diskReadTime:: 'rate(node_disk_read_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + diskWriteTime:: 'rate(node_disk_written_bytes_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + diskIoTime:: 'rate(node_disk_io_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + diskWaitReadTime:: + ||| + irate(node_disk_read_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_reads_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % config { nodeQuerySelector: nodeQuerySelector }, + diskWaitWriteTime:: + ||| + irate(node_disk_write_time_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_writes_completed_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % config { nodeQuerySelector: nodeQuerySelector }, + diskAvgQueueSize:: 'irate(node_disk_io_time_weighted_seconds_total{%(nodeQuerySelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + diskSpaceUsage:: + ||| + sort_desc(1 - + ( + max by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) + / + max 
by (job, %(instanceLabels)s, fstype, device, mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) + ) != 0 + ) + ||| % config { nodeQuerySelector: nodeQuerySelector }, + node_filesystem_avail_bytes:: 'node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_filesystem_files_free:: 'node_filesystem_files_free{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_filesystem_files:: 'node_filesystem_files{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_filesystem_readonly:: 'node_filesystem_readonly{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_filesystem_device_error:: 'node_filesystem_device_error{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + networkReceiveBitsPerSec:: 'irate(node_network_receive_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, + networkTransmitBitsPerSec:: 'irate(node_network_transmit_bytes_total{%(nodeQuerySelector)s}[$__rate_interval])*8' % config { nodeQuerySelector: nodeQuerySelector }, + networkReceiveErrorsPerSec:: 'irate(node_network_receive_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + networkTransmitErrorsPerSec:: 'irate(node_network_transmit_errs_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + networkReceiveDropsPerSec:: 'irate(node_network_receive_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + networkTransmitDropsPerSec:: 
'irate(node_network_transmit_drop_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + + systemContextSwitches:: 'irate(node_context_switches_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + systemInterrupts:: 'irate(node_intr_total{%(nodeQuerySelector)s}[$__rate_interval])' % config { nodeQuerySelector: nodeQuerySelector }, + + //time + node_timex_estimated_error_seconds:: 'node_timex_estimated_error_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_timex_offset_seconds:: 'node_timex_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_timex_maxerror_seconds:: 'node_timex_maxerror_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + + node_timex_sync_status:: 'node_timex_sync_status{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_time_zone_offset_seconds:: 'node_time_zone_offset_seconds{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + node_systemd_units:: 'node_systemd_units{%(nodeQuerySelector)s}' % config { nodeQuerySelector: nodeQuerySelector }, + + + }, + // share across dashboards + panelsWithTargets:: { + // cpu + idleCPU:: + nodePanels.timeseries.new( + 'CPU Usage', + description='Total CPU utilisation percent.' + ) + .withUnits('percent') + .withStacking('normal') + .withMin(0) + .withMax(100) + .addTarget(c.commonPromTarget( + expr=c.queries.cpuUsagePerCore, + legendFormat='cpu {{cpu}}', + )), + + systemLoad:: + nodePanels.timeseries.new( + 'Load Average', + description='System load average over the previous 1, 5, and 15 minute ranges. A measurement of how many processes are waiting for CPU cycles. 
The maximum number is the number of CPU cores for the node.', + ) + .withUnits('short') + .withMin(0) + .withFillOpacity(0) + .addTarget(c.commonPromTarget(c.queries.systemLoad1, legendFormat='1m load average')) + .addTarget(c.commonPromTarget(c.queries.systemLoad5, legendFormat='5m load average')) + .addTarget(c.commonPromTarget(c.queries.systemLoad15, legendFormat='15m load average')) + .addTarget(c.commonPromTarget(c.queries.cpuCount, legendFormat='logical cores')) + .addOverride( + matcher={ + id: 'byName', + options: 'logical cores', + }, + properties=[ + { + id: 'custom.lineStyle', + value: { + fill: 'dash', + dash: [ + 10, + 10, + ], + }, + }, + ] + ), + cpuStatPanel:: + commonPanels.percentUsageStat.new( + 'CPU Usage', + description='Total CPU utilisation percent.' + ) + .addTarget(c.commonPromTarget( + expr=c.queries.cpuUsage + )), + systemContextSwitches:: + nodePanels.timeseries.new( + 'Context Switches / Interrupts', + description=||| + Context switches occur when the operating system switches from running one process to another. + Interrupts are signals sent to the CPU by external devices to request its attention. + + A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. + ||| + ) + .addTarget(c.commonPromTarget(c.queries.systemContextSwitches, legendFormat='Context Switches')) + .addTarget(c.commonPromTarget(c.queries.systemInterrupts, legendFormat='Interrupts')), + + diskSpaceUsage:: + nodePanels.table.new( + title='Disk Space Usage', + description='Disk utilisation in percent, by mountpoint. 
Some duplication can occur if the same filesystem is mounted in multiple locations.', + ) + .setFieldConfig(unit='decbytes') + //.addThresholdStep(color='light-green', value=null) + .addThresholdStep(color='light-blue', value=null) + .addThresholdStep(color='light-yellow', value=0.8) + .addThresholdStep(color='light-red', value=0.9) + .addTarget(c.commonPromTarget( + ||| + max by (mountpoint) (node_filesystem_size_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='', + instant=true, + format='table' + )) + .addTarget(c.commonPromTarget( + ||| + max by (mountpoint) (node_filesystem_avail_bytes{%(nodeQuerySelector)s, %(fsSelector)s, %(fsMountpointSelector)s}) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='', + instant=true, + format='table', + )) + .addOverride( + matcher={ + id: 'byName', + options: 'Mounted on', + }, + properties=[ + { + id: 'custom.width', + value: 260, + }, + ], + ) + .addOverride( + matcher={ + id: 'byName', + options: 'Size', + }, + properties=[ + + { + id: 'custom.width', + value: 93, + }, + + ], + ) + .addOverride( + matcher={ + id: 'byName', + options: 'Used', + }, + properties=[ + { + id: 'custom.width', + value: 72, + }, + ], + ) + .addOverride( + matcher={ + id: 'byName', + options: 'Available', + }, + properties=[ + { + id: 'custom.width', + value: 88, + }, + ], + ) + + .addOverride( + matcher={ + id: 'byName', + options: 'Used, %', + }, + properties=[ + { + id: 'unit', + value: 'percentunit', + }, + { + id: 'custom.displayMode', + value: 'basic', + }, + { + id: 'max', + value: 1, + }, + { + id: 'min', + value: 0, + }, + ] + ) + .sortBy('Mounted on') + + { + transformations+: [ + { + id: 'groupBy', + options: { + fields: { + 'Value #A': { + aggregations: [ + 'lastNotNull', + ], + operation: 'aggregate', + }, + 'Value #B': { + aggregations: [ + 'lastNotNull', + ], + operation: 'aggregate', + }, + mountpoint: { + 
aggregations: [], + operation: 'groupby', + }, + }, + }, + }, + { + id: 'merge', + options: {}, + }, + { + id: 'calculateField', + options: { + alias: 'Used', + binary: { + left: 'Value #A (lastNotNull)', + operator: '-', + reducer: 'sum', + right: 'Value #B (lastNotNull)', + }, + mode: 'binary', + reduce: { + reducer: 'sum', + }, + }, + }, + { + id: 'calculateField', + options: { + alias: 'Used, %', + binary: { + left: 'Used', + operator: '/', + reducer: 'sum', + right: 'Value #A (lastNotNull)', + }, + mode: 'binary', + reduce: { + reducer: 'sum', + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: {}, + indexByName: {}, + renameByName: { + 'Value #A (lastNotNull)': 'Size', + 'Value #B (lastNotNull)': 'Available', + mountpoint: 'Mounted on', + }, + }, + }, + ], + }, + memoryGraphPanelPrototype:: + nodePanels.timeseries.new( + 'Memory Usage', + description='Memory usage by category, measured in bytes.', + ) + .withMin(0) + .withUnits('bytes'), + memoryGraph:: + if platform == 'Linux' then + self.memoryGraphPanelPrototype + { + description: ||| + Used: The amount of physical memory currently in use by the system. + Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. + Free: The amount of physical memory that is currently not in use. + Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. + Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
+ |||, + } + { stack: true } + .addTarget(c.commonPromTarget( + ||| + ( + node_memory_MemTotal_bytes{%(nodeQuerySelector)s} + - + node_memory_MemFree_bytes{%(nodeQuerySelector)s} + - + node_memory_Buffers_bytes{%(nodeQuerySelector)s} + - + node_memory_Cached_bytes{%(nodeQuerySelector)s} + ) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, + legendFormat='Memory used' + )) + .addTarget(c.commonPromTarget('node_memory_Buffers_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory buffers')) + .addTarget(c.commonPromTarget('node_memory_Cached_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory cached')) + .addTarget(c.commonPromTarget('node_memory_MemFree_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory free')) + .addTarget(c.commonPromTarget('node_memory_MemAvailable_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory available')) + .addTarget(c.commonPromTarget('node_memory_MemTotal_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory total')) + else if platform == 'Darwin' then + // not useful to stack + self.memoryGraphPanelPrototype { stack: false } + .addTarget(c.commonPromTarget('node_memory_total_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Physical Memory')) + .addTarget(c.commonPromTarget( + ||| + ( + node_memory_internal_bytes{%(nodeQuerySelector)s} - + node_memory_purgeable_bytes{%(nodeQuerySelector)s} + + node_memory_wired_bytes{%(nodeQuerySelector)s} + + node_memory_compressed_bytes{%(nodeQuerySelector)s} + ) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Memory Used' + )) + .addTarget(c.commonPromTarget( + ||| + ( + node_memory_internal_bytes{%(nodeQuerySelector)s} - + 
node_memory_purgeable_bytes{%(nodeQuerySelector)s} + ) + ||| % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='App Memory' + )) + .addTarget(c.commonPromTarget('node_memory_wired_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Wired Memory')) + .addTarget(c.commonPromTarget('node_memory_compressed_bytes{%(nodeQuerySelector)s}' % config { nodeQuerySelector: c.nodeQuerySelector }, legendFormat='Compressed')), + + // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. + memoryGaugePanelPrototype:: + commonPanels.percentUsageStat.new( + 'Memory Usage', + description='Total memory utilisation.', + ), + + memoryGauge:: + if platform == 'Linux' then + self.memoryGaugePanelPrototype + + .addTarget(c.commonPromTarget(c.queries.memoryUsage)) + + else if platform == 'Darwin' then + self.memoryGaugePanelPrototype + .addTarget(c.commonPromTarget( + ||| + ( + ( + avg(node_memory_internal_bytes{%(nodeQuerySelector)s}) - + avg(node_memory_purgeable_bytes{%(nodeQuerySelector)s}) + + avg(node_memory_wired_bytes{%(nodeQuerySelector)s}) + + avg(node_memory_compressed_bytes{%(nodeQuerySelector)s}) + ) / + avg(node_memory_total_bytes{%(nodeQuerySelector)s}) + ) + * + 100 + ||| % config { nodeQuerySelector: c.nodeQuerySelector } + )), + diskIO:: + nodePanels.timeseries.new( + 'Disk I/O', + description='Disk read/writes in bytes, and total IO seconds.' 
+ ) + .withFillOpacity(0) + .withMin(0) + .addTarget(c.commonPromTarget( + c.queries.diskReadTime, + legendFormat='{{device}} read', + )) + .addTarget(c.commonPromTarget( + c.queries.diskWriteTime, + legendFormat='{{device}} written', + )) + .addTarget(c.commonPromTarget( + c.queries.diskIoTime, + legendFormat='{{device}} io time', + )) + .addOverride( + matcher={ + id: 'byRegexp', + options: '/ read| written/', + }, + properties=[ + { + id: 'unit', + value: 'bps', + }, + ] + ) + .addOverride( + matcher={ + id: 'byRegexp', + options: '/ io time/', + }, + properties=[ + { + id: 'unit', + value: 'percentunit', + }, + { + id: 'custom.axisSoftMax', + value: 1, + }, + { + id: 'custom.drawStyle', + value: 'points', + }, + ] + ), + }, + }, + +} diff --git a/docs/node-mixin/lib/panels/common/info.libsonnet b/docs/node-mixin/lib/panels/common/info.libsonnet new file mode 100644 index 0000000000..3b54d39382 --- /dev/null +++ b/docs/node-mixin/lib/panels/common/info.libsonnet @@ -0,0 +1,30 @@ +// Info panel text (number or text) +local statPanel = import '../stat.libsonnet'; +statPanel { + new( + title=null, + description=null, + datasource=null, + ):: + super.new( + title, + description, + datasource, + ) + + self.withColor(color='text') + + self.withTextSize(value=20) + + self.withGraphMode('none') + + + { + options+: { + reduceOptions: { + values: false, + calcs: [ + 'lastNotNull', + ], + fields: '', + }, + graphMode: 'none', + }, + }, +} diff --git a/docs/node-mixin/lib/panels/common/networktraffic.libsonnet b/docs/node-mixin/lib/panels/common/networktraffic.libsonnet new file mode 100644 index 0000000000..09f3370f67 --- /dev/null +++ b/docs/node-mixin/lib/panels/common/networktraffic.libsonnet @@ -0,0 +1,18 @@ +// Panels to graph network traffic in and out +local timeseries = import '../timeseries.libsonnet'; +timeseries { + new( + title=null, + description=null, + datasource=null, + ):: + super.new( + title, + description, + datasource, + ) + + self.withDecimals(1) + + 
self.withUnits('bps') + + self.withNegativeYByRegex('transmit|tx|out') + + self.withAxisLabel('out(-) | in(+)'), +} diff --git a/docs/node-mixin/lib/panels/common/panels.libsonnet b/docs/node-mixin/lib/panels/common/panels.libsonnet new file mode 100644 index 0000000000..88fea17a6e --- /dev/null +++ b/docs/node-mixin/lib/panels/common/panels.libsonnet @@ -0,0 +1,6 @@ +{ + uptimeStat:: import 'uptime.libsonnet', + infoStat:: import 'info.libsonnet', + percentUsageStat:: import 'percentusage.libsonnet', + networkTrafficGraph:: import 'networktraffic.libsonnet', +} diff --git a/docs/node-mixin/lib/panels/common/percentusage.libsonnet b/docs/node-mixin/lib/panels/common/percentusage.libsonnet new file mode 100644 index 0000000000..884878f673 --- /dev/null +++ b/docs/node-mixin/lib/panels/common/percentusage.libsonnet @@ -0,0 +1,30 @@ +// Panels to display metrics that can go from 0 to 100%. (cpu utilization, memory utilization etc). Full utilization is considered an issue. +local statPanel = import '../stat.libsonnet'; +statPanel { + new( + title=null, + description=null, + datasource=null, + ):: + super.new( + title, + description, + datasource, + ) + + self.withDecimals(1) + + self.withUnits('percent') + + self.withMax(100) + + self.withMin(0) + + self.withColor(mode='continuous-BlYlRd') + { + options+: { + reduceOptions: { + values: false, + calcs: [ + 'lastNotNull', + ], + fields: '', + }, + }, + }, +} diff --git a/docs/node-mixin/lib/panels/common/uptime.libsonnet b/docs/node-mixin/lib/panels/common/uptime.libsonnet new file mode 100644 index 0000000000..a64a179faa --- /dev/null +++ b/docs/node-mixin/lib/panels/common/uptime.libsonnet @@ -0,0 +1,43 @@ +local statPanel = import '../stat.libsonnet'; +statPanel { + new( + title='Uptime', + description=null, + datasource=null, + ):: + super.new( + title, + description, + datasource, + ) + + self.withDecimals(1) + + self.withGraphMode('none') + + self.withTextSize(value=20) + + self.withUnits('dtdurations') + + 
self.withThresholds( + mode='absolute', + steps=[ + { + color: 'orange', + value: null, + }, + { + color: 'text', + value: 300, + }, + ] + ) + + self.withColor(mode='thresholds') + + + { + options+: { + reduceOptions: { + values: false, + calcs: [ + 'lastNotNull', + ], + fields: '', + }, + }, + }, +} diff --git a/docs/node-mixin/lib/panels/panel.libsonnet b/docs/node-mixin/lib/panels/panel.libsonnet new file mode 100644 index 0000000000..8ede6ffe87 --- /dev/null +++ b/docs/node-mixin/lib/panels/panel.libsonnet @@ -0,0 +1,129 @@ +// generic grafana dashboard +{ + //feed grafonnet panel + new():: {}, + + withUnits(unit):: self { + + fieldConfig+: { + defaults+: { + unit: unit, + }, + }, + }, + + withLegend(show=true, mode='table', placement='bottom', calcs=['min', 'mean', 'max', 'lastNotNull']):: self { + options+: { + legend: { + showLegend: show, + displayMode: mode, + placement: placement, + calcs: calcs, + }, + }, + }, + withDecimals(decimals):: self { + + fieldConfig+: { + defaults+: { + decimals: decimals, + }, + }, + }, + + withThresholds(mode='absolute', steps=null):: self { + + fieldConfig+: { + defaults+: { + thresholds: { + mode: mode, + steps: steps, + }, + }, + }, + }, + withMin(value):: self { + fieldConfig+: { + defaults+: { + min: value, + }, + }, + }, + withMax(value):: self { + fieldConfig+: { + defaults+: { + max: value, + }, + }, + }, + withColor(color=null, mode='fixed'):: self { + fieldConfig+: { + defaults+: { + color: { + mode: mode, + fixedColor: if mode == 'fixed' then color else null, + }, + }, + }, + }, + withMaxDataPoints(value):: self { + maxDataPoints: value, + }, + withTransform():: self { + + merge():: self + { + transformations+: [ + { + id: 'merge', + options: {}, + }, + ], + }, + filterFieldsByName(pattern=null):: self + { + transformations+: [ + { + id: 'filterFieldsByName', + options: { + include: { + pattern: pattern, + }, + }, + }, + ], + }, + joinByField( + mode='outer', + field=null + ):: self { + transformations+: [ + { + 
id: 'joinByField', + options: { + byField: field, + mode: mode, + }, + }, + ], + }, + organize( + excludeByName={}, + indexByName={}, + renameByName={}, + + ):: self + { + transformations+: [ + { + id: 'organize', + options: { + excludeByName: excludeByName, + indexByName: indexByName, + renameByName: renameByName, + }, + }, + ], + }, + }, +} diff --git a/docs/node-mixin/lib/panels/panels.libsonnet b/docs/node-mixin/lib/panels/panels.libsonnet new file mode 100644 index 0000000000..19c9a1d896 --- /dev/null +++ b/docs/node-mixin/lib/panels/panels.libsonnet @@ -0,0 +1,5 @@ +{ + timeseries:: import 'timeseries.libsonnet', + stat:: import 'stat.libsonnet', + table:: import 'table.libsonnet', +} diff --git a/docs/node-mixin/lib/panels/stat.libsonnet b/docs/node-mixin/lib/panels/stat.libsonnet new file mode 100644 index 0000000000..e3fa4172f3 --- /dev/null +++ b/docs/node-mixin/lib/panels/stat.libsonnet @@ -0,0 +1,28 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local genericPanel = import 'panel.libsonnet'; +genericPanel { + new( + title=null, + description=null, + datasource=null, + ):: self + + grafana.statPanel.new( + title=title, + description=description, + datasource=datasource, + ), + withGraphMode(mode='none'):: self { + options+: + { + graphMode: mode, + }, + }, + withTextSize(value='auto', title='auto'):: self { + options+: + { text: { + valueSize: value, + titleSize: title, + } }, + }, + +} diff --git a/docs/node-mixin/lib/panels/table.libsonnet b/docs/node-mixin/lib/panels/table.libsonnet new file mode 100644 index 0000000000..4a9c36cc66 --- /dev/null +++ b/docs/node-mixin/lib/panels/table.libsonnet @@ -0,0 +1,37 @@ +local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; +local genericPanel = import 'panel.libsonnet'; +local table = grafana70.panel.table; +genericPanel +{ + new( + title=null, + description=null, + datasource=null, + ):: self + + 
table.new( + title=title, + description=description, + datasource=datasource, + ), + sortBy(field, desc=false):: self { + options+: { + sortBy: [ + { + displayName: field, + desc: desc, + }, + ], + }, + }, + withFooter(reducer=['mean'], fields=[]):: self { + + options+: { + footer: { + show: true, + reducer: reducer, + fields: fields, + }, + }, + }, + +} diff --git a/docs/node-mixin/lib/panels/timeseries.libsonnet b/docs/node-mixin/lib/panels/timeseries.libsonnet new file mode 100644 index 0000000000..816ec49ad0 --- /dev/null +++ b/docs/node-mixin/lib/panels/timeseries.libsonnet @@ -0,0 +1,145 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local genericPanel = import 'panel.libsonnet'; +genericPanel +{ + new( + title=null, + description=null, + datasource=null, + ):: self + + grafana.graphPanel.new( + title=title, + description=description, + datasource=datasource, + ) + + + { + type: 'timeseries', + } + + self.withFillOpacity(10) + + self.withGradientMode('opacity') + + self.withLineInterpolation('smooth') + + self.withShowPoints('never') + + self.withTooltip(mode='multi', sort='desc') + + self.withLegend(mode='list', calcs=[]), + withDrawStyle(style):: self { + fieldConfig+: { + defaults+: { + custom+: { + drawStyle: style, + }, + }, + }, + }, + withPointsSize(size):: self { + fieldConfig+: { + defaults+: { + custom+: { + pointSize: size, + }, + }, + }, + }, + withTooltip(mode=null, sort='none'):: self { + options+: { + tooltip: { + mode: 'multi', + sort: sort, + }, + }, + }, + withLineInterpolation(value):: self { + fieldConfig+: { + defaults+: { + custom+: { + lineInterpolation: value, + }, + }, + }, + }, + withShowPoints(value):: self { + fieldConfig+: { + defaults+: { + custom+: { + showPoints: value, + }, + }, + }, + }, + withStacking(stack='normal'):: self { + fieldConfig+: { + defaults+: { + custom+: { + stacking: { + mode: stack, + group: 'A', + }, + }, + }, + }, + }, + withGradientMode(mode):: 
self { + fieldConfig+: { + defaults+: { + custom+: { + gradientMode: mode, + }, + }, + }, + }, + addDataLink(title, url):: self { + + fieldConfig+: { + defaults+: { + links: [ + { + title: title, + url: url, + }, + ], + }, + }, + }, + withFillOpacity(opacity):: self { + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: opacity, + }, + }, + }, + + }, + + withAxisLabel(label):: self { + fieldConfig+: { + defaults+: { + custom+: { + axisLabel: label, + }, + }, + }, + }, + + withNegativeYByRegex(regex):: self { + fieldConfig+: { + overrides+: [ + { + matcher: { + id: 'byRegexp', + options: '/' + regex + '/', + }, + properties: [ + { + id: 'custom.transform', + value: 'negative-Y', + }, + ], + }, + + ], + }, + + + }, +} diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet deleted file mode 100644 index 6c4d990481..0000000000 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ /dev/null @@ -1,504 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; -local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; -local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; -local gaugePanel = grafana70.panel.gauge; -local table = grafana70.panel.table; - -{ - - new(config=null, platform=null):: { - - local prometheusDatasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - - local instanceTemplatePrototype = - template.new( - 'instance', - '$datasource', - '', - refresh='time', - label='Instance', - ), - local instanceTemplate = - if platform == 'Darwin' then - instanceTemplatePrototype - { query: 
'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, instance)' % config } - else - instanceTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, instance)' % config }, - - - local idleCPU = - graphPanel.new( - 'CPU Usage', - datasource='$datasource', - span=6, - format='percentunit', - max=1, - min=0, - stack=true, - ) - .addTarget(prometheus.target( - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) - ) - ||| % config, - legendFormat='{{cpu}}', - intervalFactor=5, - )), - - local systemLoad = - graphPanel.new( - 'Load Average', - datasource='$datasource', - span=6, - format='short', - min=0, - fill=0, - ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='1m load average')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='5m load average')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='15m load average')) - .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % config, legendFormat='logical cores')), - - local memoryGraphPanelPrototype = - graphPanel.new( - 'Memory Usage', - datasource='$datasource', - span=9, - format='bytes', - min=0, - ), - local memoryGraph = - if platform == 'Linux' then - memoryGraphPanelPrototype { stack: true } - .addTarget(prometheus.target( - ||| - ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - 
node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} - - - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) - ||| % config, - legendFormat='memory used' - )) - .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory buffers')) - .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory free')) - else if platform == 'Darwin' then - // not useful to stack - memoryGraphPanelPrototype { stack: false } - .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Physical Memory')) - .addTarget(prometheus.target( - ||| - ( - node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) - ||| % config, legendFormat='Memory Used' - )) - .addTarget(prometheus.target( - ||| - ( - node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} - ) - ||| % config, legendFormat='App Memory' - )) - .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Wired Memory')) - .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Compressed')), - - // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. 
- local memoryGaugePanelPrototype = - gaugePanel.new( - title='Memory Usage', - datasource='$datasource', - ) - .addThresholdStep('rgba(50, 172, 45, 0.97)') - .addThresholdStep('rgba(237, 129, 40, 0.89)', 80) - .addThresholdStep('rgba(245, 54, 54, 0.9)', 90) - .setFieldConfig(max=100, min=0, unit='percent') - + { - span: 3, - }, - - local memoryGauge = - if platform == 'Linux' then - memoryGaugePanelPrototype - - .addTarget(prometheus.target( - ||| - 100 - - ( - avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) / - avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - * 100 - ) - ||| % config, - )) - - else if platform == 'Darwin' then - memoryGaugePanelPrototype - .addTarget(prometheus.target( - ||| - ( - ( - avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - - avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"}) + - avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}) + - avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}) - ) / - avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}) - ) - * - 100 - ||| % config - )), - - local diskIO = - graphPanel.new( - 'Disk I/O', - datasource='$datasource', - span=6, - min=0, - fill=0, - ) - // TODO: Does it make sense to have those three in the same panel? 
- .addTarget(prometheus.target( - 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} read', - intervalFactor=1, - )) - .addTarget(prometheus.target( - 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} written', - intervalFactor=1, - )) - .addTarget(prometheus.target( - 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} io time', - intervalFactor=1, - )) + - { - seriesOverrides: [ - { - alias: '/ read| written/', - yaxis: 1, - }, - { - alias: '/ io time/', - yaxis: 2, - }, - ], - yaxes: [ - self.yaxe(format='Bps'), - self.yaxe(format='percentunit'), - ], - }, - - local diskSpaceUsage = - table.new( - title='Disk Space Usage', - datasource='$datasource', - ) - .setFieldConfig(unit='decbytes') - .addThresholdStep(color='green', value=null) - .addThresholdStep(color='yellow', value=0.8) - .addThresholdStep(color='red', value=0.9) - .addTarget(prometheus.target( - ||| - max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config, - legendFormat='', - instant=true, - format='table' - )) - .addTarget(prometheus.target( - ||| - max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config, - legendFormat='', - instant=true, - format='table' - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mounted on', - }, - properties=[ - { - id: 'custom.width', - value: 260, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Size', - }, - properties=[ - - { - id: 'custom.width', - value: 93, - }, - - ], - ) - .addOverride( - matcher={ - 
id: 'byName', - options: 'Used', - }, - properties=[ - { - id: 'custom.width', - value: 72, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Available', - }, - properties=[ - { - id: 'custom.width', - value: 88, - }, - ], - ) - - .addOverride( - matcher={ - id: 'byName', - options: 'Used, %', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.displayMode', - value: 'gradient-gauge', - }, - { - id: 'max', - value: 1, - }, - { - id: 'min', - value: 0, - }, - ] - ) - + { span: 6 } - + { - transformations: [ - { - id: 'groupBy', - options: { - fields: { - 'Value #A': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - 'Value #B': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - mountpoint: { - aggregations: [], - operation: 'groupby', - }, - }, - }, - }, - { - id: 'merge', - options: {}, - }, - { - id: 'calculateField', - options: { - alias: 'Used', - binary: { - left: 'Value #A (lastNotNull)', - operator: '-', - reducer: 'sum', - right: 'Value #B (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'calculateField', - options: { - alias: 'Used, %', - binary: { - left: 'Used', - operator: '/', - reducer: 'sum', - right: 'Value #A (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, - }, - }, - { - id: 'organize', - options: { - excludeByName: {}, - indexByName: {}, - renameByName: { - 'Value #A (lastNotNull)': 'Size', - 'Value #B (lastNotNull)': 'Available', - mountpoint: 'Mounted on', - }, - }, - }, - { - id: 'sortBy', - options: { - fields: {}, - sort: [ - { - field: 'Mounted on', - }, - ], - }, - }, - ], - }, - - - local networkReceived = - graphPanel.new( - 'Network Received', - description='Network received (bits/s)', - datasource='$datasource', - span=6, - format='bps', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, 
instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, - legendFormat='{{device}}', - intervalFactor=1, - )), - - local networkTransmitted = - graphPanel.new( - 'Network Transmitted', - description='Network transmitted (bits/s)', - datasource='$datasource', - span=6, - format='bps', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, - legendFormat='{{device}}', - intervalFactor=1, - )), - - local cpuRow = - row.new('CPU') - .addPanel(idleCPU) - .addPanel(systemLoad), - - local memoryRow = - row.new('Memory') - .addPanel(memoryGraph) - .addPanel(memoryGauge), - - local diskRow = - row.new('Disk') - .addPanel(diskIO) - .addPanel(diskSpaceUsage), - - local networkRow = - row.new('Network') - .addPanel(networkReceived) - .addPanel(networkTransmitted), - - local rows = - [ - cpuRow, - memoryRow, - diskRow, - networkRow, - ], - - local templates = - [ - prometheusDatasourceTemplate, - instanceTemplate, - ], - - - dashboard: if platform == 'Linux' then - dashboard.new( - '%sNodes' % config.dashboardNamePrefix, - time_from='now-1h', - tags=(config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' - ) - .addTemplates(templates) - .addRows(rows) - else if platform == 'Darwin' then - dashboard.new( - '%sMacOS' % config.dashboardNamePrefix, - time_from='now-1h', - tags=(config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair' - ) - .addTemplates(templates) - .addRows(rows), - - }, -} diff --git a/docs/node-observ-lib/.gitignore b/docs/node-observ-lib/.gitignore new file mode 100644 index 0000000000..f9bf6ba815 --- /dev/null +++ b/docs/node-observ-lib/.gitignore @@ -0,0 +1,2 @@ +jsonnetfile.lock.json +vendor diff --git a/docs/node-observ-lib/g.libsonnet b/docs/node-observ-lib/g.libsonnet new file mode 100644 index 0000000000..6da9f4eef9 --- /dev/null +++ 
b/docs/node-observ-lib/g.libsonnet @@ -0,0 +1 @@ +import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet' diff --git a/docs/node-observ-lib/jsonnetfile.json b/docs/node-observ-lib/jsonnetfile.json new file mode 100644 index 0000000000..b12b5dc0af --- /dev/null +++ b/docs/node-observ-lib/jsonnetfile.json @@ -0,0 +1,33 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true +} \ No newline at end of file diff --git a/docs/node-observ-lib/linux/README.md b/docs/node-observ-lib/linux/README.md new file mode 100644 index 0000000000..3582c0cbb9 --- /dev/null +++ b/docs/node-observ-lib/linux/README.md @@ -0,0 +1,163 @@ +# Node exporter observability lib + +This jsonnet observability lib can be used to generate observability package for node exporter. 
+ +## Import + +```sh +jb init +jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +``` + +## Examples + +### Example 1: Basic example + +You can use observ-lib to fill in monitoring-mixin structure: + +```jsonnet +// mixin.libsonnet file +local nodelib = import 'node-observ-lib/linux/main.libsonnet'; + +local linux = + nodelib.new() + + nodelib.withConfigMixin({ + filteringSelector: 'job=~".*node.*"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'Node exporter / ', + dashboardTags: ['node-exporter-mixin'], + uid: 'node', + // enable loki logs + enableLokiLogs: true, + }); + +{ + grafanaDashboards+:: linux.grafana.dashboards, + prometheusAlerts+:: linux.prometheus.alerts, + prometheusRules+:: linux.prometheus.recordingRules, +} + +``` + +### Example 2: Fill in monitoring-mixin with default config values and enable loki logs: + + +```jsonnet +// mixin.libsonnet file +local nodelib = import 'node-observ-lib/linux/main.libsonnet'; + +local linux = + nodelib.new() + + nodelib.withConfigMixin({ + enableLokiLogs: true, + }); + +{ + grafanaDashboards+:: linux.grafana.dashboards, + prometheusAlerts+:: linux.prometheus.alerts, + prometheusRules+:: linux.prometheus.recordingRules, +} + +``` + +### Example 3: Override some of default config values from file: + + +```jsonnet +// overrides.libsonnet +{ + // Memory utilzation (%) level on which to trigger the + // 'NodeMemoryHighUtilization' alert. + memoryHighUtilizationThreshold: 80, + + // Threshold for the rate of memory major page faults to trigger + // 'NodeMemoryMajorPagesFaults' alert. + memoryMajorPagesFaultsThreshold: 1000, + + // Disk IO queue level above which to trigger + // 'NodeDiskIOSaturation' alert. 
+ diskIOSaturationThreshold: 20, +} + +// mixin.libsonnet file +local configOverride = import './overrides.libsonnet'; +local nodelib = import 'node-observ-lib/linux/main.libsonnet'; + +local linux = + nodelib.new() + + nodelib.withConfigMixin(configOverride); + +{ + grafanaDashboards+:: linux.grafana.dashboards, + prometheusAlerts+:: linux.prometheus.alerts, + prometheusRules+:: linux.prometheus.recordingRules, +} + +``` + +### Example 4: Modify specific panel before rendering dashboards + +```jsonnet +local g = import './g.libsonnet'; +// mixin.libsonnet file +local nodelib = import 'node-observ-lib/linux/main.libsonnet'; + +local linux = + nodelib.new() + + nodelib.withConfigMixin({ + filteringSelector: 'job=~".*node.*"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'Node exporter / ', + dashboardTags: ['node-exporter-mixin'], + uid: 'node', + }) + + { + grafana+: { + panels+: { + networkSockstatAll+: + + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('bars') + } + } + }; + +{ + grafanaDashboards+:: linux.grafana.dashboards, + prometheusAlerts+:: linux.prometheus.alerts, + prometheusRules+:: linux.prometheus.recordingRules, +} + +``` + +## Collectors used: + +Grafana Agent or combination of node_exporter/promtail can be used in order to collect data required. + +### Logs collection + +Loki logs are used to populate logs dashboard and also for annotations. + +To use logs, you need to opt-in, with setting `enableLokiLogs: true` in config. + +See example above. 
+ +The following scrape snippet can be used in grafana-agent/promtail: + +```yaml + - job_name: integrations/node_exporter_journal_scrape + journal: + max_age: 24h + labels: + instance: '' + job: integrations/node_exporter + relabel_configs: + - source_labels: ['__journal__systemd_unit'] + target_label: 'unit' + - source_labels: ['__journal__boot_id'] + target_label: 'boot_id' + - source_labels: ['__journal__transport'] + target_label: 'transport' + - source_labels: ['__journal_priority_keyword'] + target_label: 'level' +``` diff --git a/docs/node-observ-lib/linux/alerts.libsonnet b/docs/node-observ-lib/linux/alerts.libsonnet new file mode 100644 index 0000000000..e7db3fba77 --- /dev/null +++ b/docs/node-observ-lib/linux/alerts.libsonnet @@ -0,0 +1,420 @@ +{ + new(this): { + groups: [ + { + name: if this.config.uid == 'node' then 'node-exporter-filesystem' else this.config.uid + '-filesystem-alerts', + rules: [ + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + ( + node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d + and + predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Filesystem is predicted to run out of space within the next 24 hours.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', + }, + }, + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + ( + node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / 
node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d + and + predict_linear(node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: '%(nodeCriticalSeverity)s' % this.config, + }, + annotations: { + summary: 'Filesystem is predicted to run out of space within the next 4 hours.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', + }, + }, + { + alert: 'NodeFilesystemAlmostOutOfSpace', + expr: ||| + ( + node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableWarningThreshold)d + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' 
% this.config, + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemAlmostOutOfSpace', + expr: ||| + ( + node_filesystem_avail_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceAvailableCriticalThreshold)d + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '30m', + labels: { + severity: '%(nodeCriticalSeverity)s' % this.config, + }, + annotations: { + summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % this.config, + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + ( + node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 40 + and + predict_linear(node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + ( + 
node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 20 + and + predict_linear(node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: '%(nodeCriticalSeverity)s' % this.config, + }, + annotations: { + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', + }, + }, + { + alert: 'NodeFilesystemAlmostOutOfFiles', + expr: ||| + ( + node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 5 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Filesystem has less than 5% inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + }, + }, + { + alert: 'NodeFilesystemAlmostOutOfFiles', + expr: ||| + ( + node_filesystem_files_free{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 3 + and + node_filesystem_readonly{%(filteringSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 + ) + ||| % this.config, + 'for': '1h', + labels: { + severity: '%(nodeCriticalSeverity)s' % this.config, + }, + 
annotations: { + summary: 'Filesystem has less than 3% inodes left.', + description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.', + }, + }, + ], + }, + { + // defaults to 'node-exporter for backward compatibility with old node-mixin + name: if this.config.uid == 'node' then 'node-exporter' else this.config.uid + '-alerts', + rules: [ + { + alert: 'NodeNetworkReceiveErrs', + expr: ||| + rate(node_network_receive_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_receive_packets_total{%(filteringSelector)s}[2m]) > 0.01 + ||| % this.config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Network interface is reporting many receive errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', + expr: ||| + rate(node_network_transmit_errs_total{%(filteringSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(filteringSelector)s}[2m]) > 0.01 + ||| % this.config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Network interface is reporting many transmit errors.', + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.', + }, + }, + { + alert: 'NodeHighNumberConntrackEntriesUsed', + expr: ||| + (node_nf_conntrack_entries{%(filteringSelector)s} / node_nf_conntrack_entries_limit) > 0.75 + ||| % this.config, + annotations: { + summary: 'Number of conntrack are getting close to the limit.', + description: '{{ $value | humanizePercentage }} of conntrack entries are used.', + }, + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeTextFileCollectorScrapeError', + expr: ||| + 
node_textfile_scrape_error{%(filteringSelector)s} == 1 + ||| % this.config, + annotations: { + summary: 'Node Exporter text file collector failed to scrape.', + description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.', + }, + labels: { + severity: 'warning', + }, + }, + { + alert: 'NodeClockSkewDetected', + expr: ||| + ( + node_timex_offset_seconds{%(filteringSelector)s} > 0.05 + and + deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{%(filteringSelector)s} < -0.05 + and + deriv(node_timex_offset_seconds{%(filteringSelector)s}[5m]) <= 0 + ) + ||| % this.config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Clock skew detected.', + description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.', + }, + }, + { + alert: 'NodeClockNotSynchronising', + expr: ||| + min_over_time(node_timex_sync_status{%(filteringSelector)s}[5m]) == 0 + and + node_timex_maxerror_seconds{%(filteringSelector)s} >= 16 + ||| % this.config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Clock not synchronising.', + description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.', + }, + }, + { + alert: 'NodeRAIDDegraded', + expr: ||| + node_md_disks_required{%(filteringSelector)s,%(diskDeviceSelector)s} - ignoring (state) (node_md_disks{state="active",%(filteringSelector)s,%(diskDeviceSelector)s}) > 0 + ||| % this.config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'RAID Array is degraded.', + description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. 
Number of spare drives is insufficient to fix issue automatically.", + }, + }, + { + alert: 'NodeRAIDDiskFailure', + expr: ||| + node_md_disks{state="failed",%(filteringSelector)s,%(diskDeviceSelector)s} > 0 + ||| % this.config, + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Failed device in RAID array.', + description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.", + }, + }, + { + alert: 'NodeFileDescriptorLimit', + expr: ||| + ( + node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 70 + ) + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Kernel is predicted to exhaust file descriptors limit soon.', + description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', + }, + }, + { + alert: 'NodeFileDescriptorLimit', + expr: ||| + ( + node_filefd_allocated{%(filteringSelector)s} * 100 / node_filefd_maximum{%(filteringSelector)s} > 90 + ) + ||| % this.config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Kernel is predicted to exhaust file descriptors limit soon.', + description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.', + }, + }, + { + alert: 'NodeCPUHighUsage', + expr: ||| + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'info', + }, + annotations: { + summary: 'High CPU usage.', + description: ||| + CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
+ ||| % this.config, + }, + }, + { + alert: 'NodeSystemSaturation', + expr: ||| + node_load1{%(filteringSelector)s} + / count without (cpu, mode) (node_cpu_seconds_total{%(filteringSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'System saturated, load per core is very high.', + description: ||| + System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + ||| % this.config, + }, + }, + { + alert: 'NodeMemoryMajorPagesFaults', + expr: ||| + rate(node_vmstat_pgmajfault{%(filteringSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Memory major page faults are occurring at very high rate.', + description: ||| + Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + Please check that there is enough memory available at this instance. + ||| % this.config, + }, + }, + { + alert: 'NodeMemoryHighUtilization', + expr: ||| + 100 - (node_memory_MemAvailable_bytes{%(filteringSelector)s} / node_memory_MemTotal_bytes{%(filteringSelector)s} * 100) > %(memoryHighUtilizationThreshold)d + ||| % this.config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Host is running out of memory.', + description: ||| + Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%. 
+ ||| % this.config, + }, + }, + { + alert: 'NodeDiskIOSaturation', + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(filteringSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d + ||| % this.config, + 'for': '30m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Disk IO queue is high.', + description: ||| + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}. + This symptom might indicate disk saturation. + ||| % this.config, + }, + }, + { + alert: 'NodeSystemdServiceFailed', + expr: ||| + node_systemd_unit_state{%(filteringSelector)s, state="failed"} == 1 + ||| % this.config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Systemd service has entered failed state.', + description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}', + }, + }, + ], + }, + ], + }, +} diff --git a/docs/node-observ-lib/linux/annotations.libsonnet b/docs/node-observ-lib/linux/annotations.libsonnet new file mode 100644 index 0000000000..5b8cb7ff79 --- /dev/null +++ b/docs/node-observ-lib/linux/annotations.libsonnet @@ -0,0 +1,69 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +{ + new(this): + { + reboot: + commonlib.annotations.reboot.new( + title='Reboot', + target=this.grafana.targets.reboot, + instanceLabels=std.join(',', this.config.instanceLabels), + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)), + memoryOOM: + commonlib.annotations.base.new( + 'OOMkill', + this.grafana.targets.memoryOOMkiller + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)) + + commonlib.annotations.base.withTextFormat('') + { + hide: true, + iconColor: 
'light-purple', + }, + kernelUpdate: + commonlib.annotations.base.new( + 'Kernel update', + this.grafana.targets.kernelUpdate + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)) + + commonlib.annotations.base.withTextFormat('') + { + hide: true, + iconColor: 'light-blue', + step: '5m', + }, + } + + + if + this.config.enableLokiLogs + then + { + serviceFailed: commonlib.annotations.serviceFailed.new( + title='Service failed', + target=this.grafana.targets.serviceFailed, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])), + criticalEvents: commonlib.annotations.fatal.new( + title='Critical system event', + target=this.grafana.targets.criticalEvents, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])), + sessionOpened: + commonlib.annotations.base.new( + title='Session opened', + target=this.grafana.targets.sessionOpened, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) + { hide: true }, + sessionClosed: + commonlib.annotations.base.new( + title='Session closed', + target=this.grafana.targets.sessionOpened, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) + { hide: true }, + } + else + {}, +} diff --git a/docs/node-observ-lib/linux/config.libsonnet b/docs/node-observ-lib/linux/config.libsonnet new file mode 100644 index 0000000000..eed54bbab0 --- /dev/null +++ b/docs/node-observ-lib/linux/config.libsonnet @@ -0,0 +1,104 @@ +{ + + // any modular observability library should inlcude as inputs: + // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups + // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and 
recording rules. + // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. + // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. + // 'uid' - UID to prefix all dashboards original uids + + filteringSelector: std.get(self, 'nodeExporterSelector', default='job="node"'), + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'Node exporter / ', + uid: 'node', + + dashboardTags: [self.uid], + + // Select the fstype for filesystem-related queries. If left + // empty, all filesystems are selected. If you have unusual + // filesystem you don't want to include in dashboards and + // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. + fsSelector: 'fstype!=""', + + // Select the mountpoint for filesystem-related queries. If left + // empty, all mountpoints are selected. For example if you have a + // special purpose tmpfs instance that has a fixed size and will + // always be 100% full, but you still want alerts and dashboards for + // other tmpfs instances, you can exclude those by mountpoint prefix + // like so: 'mountpoint!~"/var/lib/foo.*"'. + fsMountpointSelector: 'mountpoint!=""', + + // Select the device for disk-related queries. If left empty, all + // devices are selected. If you have unusual devices you don't + // want to include in dashboards and alerting, you can exclude + // them here, e.g. 'device!="tmpfs"'. + diskDeviceSelector: 'device!=""', + + // Some of the alerts are meant to fire if a criticadiskDeviceSelector failure of a + // node is imminent (e.g. the disk is about to run full). In a + // true “cloud native” setup, failures of a single node should be + // tolerated. Hence, even imminent failure of a single node is no + // reason to create a paging alert. 
However, in practice there are + // still many situations where operators like to get paged in time + // before a node runs out of disk space. nodeCriticalSeverity can + // be set to the desired severity for this kind of alerts. This + // can even be templated to depend on labels of the node, e.g. you + // could make this critical for traditional database masters but + // just a warning for K8s nodes. + nodeCriticalSeverity: 'critical', + + // CPU utilization (%) on which to trigger the + // 'NodeCPUHighUsage' alert. + cpuHighUsageThreshold: 90, + // Load average 1m (per core) on which to trigger the + // 'NodeSystemSaturation' alert. + systemSaturationPerCoreThreshold: 2, + + // Available disk space (%) thresholds on which to trigger the + // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk + // usage grows in a way that it is predicted to run out in 4h or 1d + // and if the provided thresholds have been reached right now. + // In some cases you'll want to adjust these, e.g. by default Kubernetes + // runs the image garbage collection when the disk usage reaches 85% + // of its available space. In that case, you'll want to reduce the + // critical threshold below to something like 14 or 15, otherwise + // the alert could fire under normal node usage. + fsSpaceFillingUpWarningThreshold: 40, + fsSpaceFillingUpCriticalThreshold: 20, + + // Available disk space (%) thresholds on which to trigger the + // 'NodeFilesystemAlmostOutOfSpace' alerts. + fsSpaceAvailableWarningThreshold: 5, + fsSpaceAvailableCriticalThreshold: 3, + + // Memory utilzation (%) level on which to trigger the + // 'NodeMemoryHighUtilization' alert. + memoryHighUtilizationThreshold: 90, + + // Threshold for the rate of memory major page faults to trigger + // 'NodeMemoryMajorPagesFaults' alert. + memoryMajorPagesFaultsThreshold: 500, + + // Disk IO queue level above which to trigger + // 'NodeDiskIOSaturation' alert. 
+ diskIOSaturationThreshold: 10, + + rateInterval: '5m', + + dashboardPeriod: 'now-1h', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + + // logs lib related + enableLokiLogs: false, + extraLogLabels: ['transport', 'unit', 'level'], + logsVolumeGroupBy: 'level', + showLogsVolume: true, + logsFilteringSelector: self.filteringSelector, + logsExtraFilters: + ||| + | label_format timestamp="{{__timestamp__}}" + | line_format `{{ if eq "[[instance]]" ".*" }}{{alignLeft 25 .instance}}|{{alignLeft 25 .unit}}|{{else}}{{alignLeft 25 .unit}}|{{end}} {{__line__}}` + |||, +} diff --git a/docs/node-observ-lib/linux/dashboards.libsonnet b/docs/node-observ-lib/linux/dashboards.libsonnet new file mode 100644 index 0000000000..f09bef6415 --- /dev/null +++ b/docs/node-observ-lib/linux/dashboards.libsonnet @@ -0,0 +1,225 @@ +local g = import '../g.libsonnet'; +local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; +{ + local root = self, + new(this): + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = g.util.string.slugify(this.config.uid); + local vars = this.grafana.variables; + local annotations = this.grafana.annotations; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardPeriod; + local timezone = this.config.dashboardTimezone; + local panels = this.grafana.panels; + local stat = g.panel.stat; + { + fleet: + local title = prefix + 'fleet overview'; + g.dashboard.new(title) + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + // g.panel.row.new("Overview"), + panels.fleetOverviewTable { gridPos+: { w: 24, h: 16 } }, + panels.cpuUsageTopk { gridPos+: { w: 24 } }, + panels.memotyUsageTopKPercent { gridPos+: { w: 24 } }, + panels.diskIOutilPercentTopK { gridPos+: { w: 12 } }, + panels.diskUsagePercentTopK { gridPos+: { w: 12 } }, + panels.networkErrorsAndDroppedPerSecTopK { gridPos+: { w: 24 } }, 
+ ], 12, 7 + ) + ) + // hide link to self + + root.applyCommon(vars.multiInstance, uid + '-fleet', tags, links { backToFleet+:: {}, backToOverview+:: {} }, annotations, timezone, refresh, period), + overview: + g.dashboard.new(prefix + 'overview') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('Overview'), + panels.uptime, + panels.hostname, + panels.kernelVersion, + panels.osInfo, + panels.cpuCount, + panels.memoryTotalBytes, + panels.memorySwapTotalBytes, + panels.diskTotalRoot, + g.panel.row.new('CPU'), + panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpuUsageTsPerCore { gridPos+: { w: 12, h: 6 } }, + panels.systemLoad { gridPos+: { w: 6, h: 6 } }, + g.panel.row.new('Memory'), + panels.memoryUsageStatPercent { gridPos+: { w: 6, h: 6 } }, + panels.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, + g.panel.row.new('Disk'), + panels.diskIOBytesPerSec { gridPos+: { w: 12, h: 8 } }, + panels.diskUsage { gridPos+: { w: 12, h: 8 } }, + g.panel.row.new('Network'), + panels.networkUsagePerSec { gridPos+: { w: 12, h: 8 } }, + panels.networkErrorsAndDroppedPerSec { gridPos+: { w: 12, h: 8 } }, + ], 6, 2 + ) + ) + // defaults to uid=nodes for backward compatibility with old node-mixins + + root.applyCommon(vars.singleInstance, (if uid == 'node' then 'nodes' else uid + '-overview'), tags, links { backToOverview+:: {} }, annotations, timezone, refresh, period), + network: + g.dashboard.new(prefix + 'network') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('Network'), + panels.networkOverviewTable { gridPos: { w: 24 } }, + panels.networkUsagePerSec, + panels.networkOperStatus, + panels.networkErrorsPerSec, + panels.networkDroppedPerSec, + panels.networkPacketsPerSec, + panels.networkMulticastPerSec, + panels.networkFifo, + panels.networkCompressedPerSec, + panels.networkNFConntrack, + panels.networkSoftnet, + panels.networkSoftnetSqueeze, + g.panel.row.new('Network sockets'), + panels.networkSockstatAll { 
gridPos: { w: 24 } }, + panels.networkSockstatTCP, + panels.networkSockstatUDP, + panels.networkSockstatMemory, + panels.networkSockstatOther, + g.panel.row.new('Network netstat'), + panels.networkNetstatIP { gridPos: { w: 24 } }, + panels.networkNetstatTCP, + panels.networkNetstatTCPerrors, + panels.networkNetstatUDP, + panels.networkNetstatUDPerrors, + panels.networkNetstatICMP, + panels.networkNetstatICMPerrors, + ], 12, 8 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-network', tags, links, annotations, timezone, refresh, period), + memory: + g.dashboard.new(prefix + 'memory') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + panels.memoryUsageStatPercent { gridPos+: { w: 6, h: 6 } }, + panels.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, + g.panel.row.new('Vmstat'), + panels.memoryPagesInOut, + panels.memoryPagesSwapInOut, + panels.memoryPagesFaults, + panels.memoryOOMkiller, + g.panel.row.new('Memstat'), + panels.memoryActiveInactive, + panels.memoryActiveInactiveDetail, + panels.memoryCommited, + panels.memorySharedAndMapped, + panels.memoryWriteAndDirty, + panels.memoryVmalloc, + panels.memorySlab, + panels.memoryAnonymous, + panels.memoryHugePagesCounter, + panels.memoryHugePagesSize, + panels.memoryDirectMap, + panels.memoryBounce, + ], 12, 8 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-memory', tags, links, annotations, timezone, refresh, period), + + system: + g.dashboard.new(prefix + 'CPU and system') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('System'), + panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpuUsageTsPerCore { gridPos+: { w: 9, h: 6 } }, + panels.cpuUsageByMode { gridPos+: { w: 9, h: 6 } }, + panels.systemLoad, + panels.systemContextSwitchesAndInterrupts, + g.panel.row.new('Time'), + panels.osTimezone { gridPos+: { w: 3, h: 4 } }, + panels.timeNtpStatus { gridPos+: { x: 0, y: 0, w: 21, h: 4 } }, + panels.timeSyncDrift { gridPos+: { w: 24, h: 7 } }, + ], 12, 7 
+ ) + ) + + root.applyCommon(vars.singleInstance, uid + '-system', tags, links, annotations, timezone, refresh, period), + + disks: + g.dashboard.new(prefix + 'filesystem and disks') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('Filesystem'), + panels.diskFreeTs, + panels.diskUsage, + panels.diskInodesFree, + panels.diskInodesTotal, + panels.diskErrorsandRO, + panels.fileDescriptors, + g.panel.row.new('Disk'), + panels.diskIOBytesPerSec, + panels.diskIOps, + panels.diskIOWaitTime, + panels.diskQueue, + ], 12, 8 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-disk', tags, links, annotations, timezone, refresh, period), + } + + + if this.config.enableLokiLogs + then + { + logs: + logslib.new( + prefix + 'logs', + datasourceName=this.grafana.variables.datasources.loki.name, + datasourceRegex=this.grafana.variables.datasources.loki.regex, + filterSelector=this.config.logsFilteringSelector, + labels=this.config.groupLabels + this.config.instanceLabels + this.config.extraLogLabels, + formatParser=null, + showLogsVolume=this.config.showLogsVolume, + logsVolumeGroupBy=this.config.logsVolumeGroupBy, + extraFilters=this.config.logsExtraFilters + ) + { + dashboards+: + { + logs+: + // reference to self, already generated variables, to keep them, but apply other common data in applyCommon + root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links, annotations=annotations, timezone=timezone, refresh=refresh, period=period), + }, + panels+: + { + // modify log panel + logs+: + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + }, + variables+: { + // add prometheus datasource for annotations processing + toArray+: [ + this.grafana.variables.datasources.prometheus { hide: 2 }, + ], + }, + }.dashboards.logs, + } + else {}, + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): + 
g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)), +} diff --git a/docs/node-observ-lib/linux/links.libsonnet b/docs/node-observ-lib/linux/links.libsonnet new file mode 100644 index 0000000000..cc24910a65 --- /dev/null +++ b/docs/node-observ-lib/linux/links.libsonnet @@ -0,0 +1,19 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +{ + new(this): + { + local link = g.dashboard.link, + backToFleet: + link.link.new('Back to ' + this.config.dashboardNamePrefix + 'fleet', '/d/' + this.grafana.dashboards.fleet.uid) + + link.link.options.withKeepTime(true), + backToOverview: + link.link.new('Back to ' + this.config.dashboardNamePrefix + 'overview', '/d/' + this.grafana.dashboards.overview.uid) + + link.link.options.withKeepTime(true), + otherDashboards: + link.dashboards.new('All ' + this.config.dashboardNamePrefix + ' dashboards', this.config.dashboardTags) + + link.dashboards.options.withIncludeVars(true) + + link.dashboards.options.withKeepTime(true) + + link.dashboards.options.withAsDropdown(true), + }, +} diff --git a/docs/node-observ-lib/linux/main.libsonnet b/docs/node-observ-lib/linux/main.libsonnet new file mode 100644 index 0000000000..0fb13f70d2 --- /dev/null +++ b/docs/node-observ-lib/linux/main.libsonnet @@ -0,0 +1,39 @@ +local alerts = import './alerts.libsonnet'; +local annotations = import './annotations.libsonnet'; +local config = import './config.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local datasources = import './datasources.libsonnet'; +local g = import './g.libsonnet'; +local links = import './links.libsonnet'; +local panels = import './panels.libsonnet'; +local rules = import './rules.libsonnet'; +local 
targets = import './targets.libsonnet'; +local variables = import './variables.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + withConfigMixin(config): { + config+: config, + }, + + new(): { + + local this = self, + config: config, + grafana: { + variables: variables.new(this), + targets: targets.new(this), + panels: panels.new(this), + annotations: annotations.new(this), + // common links here used across all dashboards + links: links.new(this), + dashboards: dashboards.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: rules.new(this), + }, + + }, +} diff --git a/docs/node-observ-lib/linux/panels.libsonnet b/docs/node-observ-lib/linux/panels.libsonnet new file mode 100644 index 0000000000..d924ac2eb9 --- /dev/null +++ b/docs/node-observ-lib/linux/panels.libsonnet @@ -0,0 +1,1142 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + fleetOverviewTable: + commonlib.panels.generic.table.base.new( + 'Fleet overview', + targets= + [ + t.osInfoCombined + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('OS Info'), + t.uptime + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Uptime'), + t.systemLoad1 + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Load 1'), + t.cpuCount + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Cores'), + t.cpuUsage + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CPU usage'), + 
t.memoryTotalBytes + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory total'), + t.memoryUsagePercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory usage'), + t.diskTotalRoot + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Root mount size'), + t.diskUsageRootPercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Root mount used'), + t.alertsCritical + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CRITICAL'), + t.alertsWarning + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('WARNING'), + ], + description="All nodes' perfomance at a glance." + ) + + g.panel.table.options.withFooter( + value={ + reducer: ['sum'], + show: true, + fields: [ + 'Value #Cores', + 'Value #Load 1', + 'Value #Memory total', + 'Value #Root mount size', + ], + } + ) + + commonlib.panels.system.table.uptime.stylizeByName('Uptime') + + table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Product|^Hostname$') + + fieldOverride.byRegexp.withProperty('custom.filterable', true), + fieldOverride.byName.new('Instance') + + fieldOverride.byName.withProperty('custom.filterable', true) + + fieldOverride.byName.withProperty('links', [ + { + targetBlank: false, + title: 'Drill down to ${__field.name} ${__value.text}', + url: 'd/%s?var-%s=${__data.fields.%s}&${__url_time_range}' % [this.grafana.dashboards.overview.uid, instanceLabel, instanceLabel], + }, + ]), + fieldOverride.byRegexp.new(std.join('|', std.map(utils.toSentenceCase, this.config.groupLabels))) + + fieldOverride.byRegexp.withProperty('custom.filterable', true) + + 
fieldOverride.byRegexp.withProperty('links', [ + { + targetBlank: false, + title: 'Filter by ${__field.name}', + url: 'd/%s?var-${__field.name}=${__value.text}&${__url_time_range}' % [this.grafana.dashboards.fleet.uid], + }, + ]), + fieldOverride.byName.new('Cores') + + fieldOverride.byName.withProperty('custom.width', '120'), + fieldOverride.byName.new('CPU usage') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Memory total') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Memory usage') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Root mount size') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Root mount used') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('percent') + ) + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + ]) + + table.queryOptions.withTransformationsMixin( + [ + { + id: 'joinByField', + options: { + byField: instanceLabel, + mode: 'outer', + }, + }, + { + id: 'filterFieldsByName', + options: { + include: { + //' 1' - would only match first occurence of group label, so no duplicates + pattern: 
instanceLabel + '|' + + + std.join( + '|', + std.map( + function(x) '%s 1' % x, this.config.instanceLabels + ) + ) + + '|' + + std.join( + '|', + std.map( + function(x) '%s 1' % x, this.config.groupLabels + ) + ) + + '|product|^hostname$|^nodename$|^pretty_name$|Value.+', + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Value #OS Info': true, + }, + indexByName: + { + [instanceLabel]: 0, + nodename: 1, + hostname: 1, + pretty_name: 2, + product: 2, + } + + + // group labels are named as 'job 1' and so on. + { + [label]: 3 + for label in this.config.groupLabels + }, + renameByName: + { + [label + ' 1']: utils.toSentenceCase(label) + for label in this.config.instanceLabels + } + { + [instanceLabel]: utils.toSentenceCase(instanceLabel), + product: 'OS', // windows + pretty_name: 'OS', // linux + hostname: 'Hostname', // windows + nodename: 'Hostname', // Linux + } + + + // group labels are named as 'job 1' and so on. + { + [label + ' 1']: utils.toSentenceCase(label) + for label in this.config.groupLabels + }, + + }, + }, + { + id: 'renameByRegex', + options: { + regex: 'Value #(.*)', + renamePattern: '$1', + }, + }, + ] + ), + uptime: commonlib.panels.system.stat.uptime.new(targets=[t.uptime]), + + systemLoad: + commonlib.panels.system.timeSeries.loadAverage.new( + loadTargets=[t.systemLoad1, t.systemLoad5, t.systemLoad15], + cpuCountTarget=t.cpuCount, + ), + + systemContextSwitchesAndInterrupts: + commonlib.panels.generic.timeSeries.base.new( + 'Context switches/Interrupts', + targets=[ + t.systemContextSwitches, + t.systemInterrupts, + ], + description=||| + Context switches occur when the operating system switches from running one process to another. Interrupts are signals sent to the CPU by external devices to request its attention. + + A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. 
+ ||| + ), + + timeNtpStatus: + commonlib.panels.system.statusHistory.ntp.new( + 'NTP status', + targets=[t.timeNtpStatus], + description='Status of time synchronization.' + ) + + g.panel.timeSeries.standardOptions.withNoValue('No data.') + + g.panel.statusHistory.options.withLegend(false), + timeSyncDrift: + commonlib.panels.generic.timeSeries.base.new( + 'Time synchronized drift', + targets=[ + t.timeEstimatedError, + t.timeOffset, + t.timeMaxError, + ], + description=||| + Time synchronization is essential to ensure accurate timekeeping, which is critical for many system operations such as logging, authentication, and network communication, as well as distributed systems or clusters where data consistency is important. + ||| + ) + + g.panel.timeSeries.standardOptions.withUnit('seconds') + + g.panel.timeSeries.standardOptions.withNoValue('No data.'), + cpuCount: commonlib.panels.cpu.stat.count.new(targets=[t.cpuCount]), + cpuUsageTsPerCore: commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.cpuUsagePerCore]) + + g.panel.timeSeries.fieldConfig.defaults.custom.withStacking({ mode: 'normal' }), + + cpuUsageTopk: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='CPU usage', + target=t.cpuUsage, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + cpuUsageStat: commonlib.panels.cpu.stat.usage.new(targets=[t.cpuUsage]), + cpuUsageByMode: commonlib.panels.cpu.timeSeries.utilizationByMode.new( + targets=[t.cpuUsageByMode], + description=||| + - System: Processes executing in kernel mode. + - User: Normal processes executing in user mode. + - Nice: Niced processes executing in user mode. + - Idle: Waiting for something to happen. + - Iowait: Waiting for I/O to complete. + - Irq: Servicing interrupts. + - Softirq: Servicing softirqs. + - Steal: Time spent in other operating systems when running in a virtualized environment. 
+ ||| + ), + + memoryTotalBytes: commonlib.panels.memory.stat.total.new(targets=[t.memoryTotalBytes]), + memorySwapTotalBytes: + commonlib.panels.memory.stat.total.new( + 'Total swap', + targets=[t.memorySwapTotal], + description=||| + Total swap available. + + Swap is a space on a storage device (usually a dedicated swap partition or a swap file) + used as virtual memory when the physical RAM (random-access memory) is fully utilized. + Swap space helps prevent memory-related performance issues by temporarily transferring less-used data from RAM to disk, + freeing up physical memory for active processes and applications. + ||| + ), + memoryUsageStatPercent: commonlib.panels.memory.stat.usage.new(targets=[t.memoryUsagePercent]), + memotyUsageTopKPercent: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Memory usage', + target=t.memoryUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + memoryUsageTsBytes: + commonlib.panels.memory.timeSeries.usageBytes.new( + targets=[ + t.memoryUsedBytes, + t.memoryCachedBytes, + t.memoryAvailableBytes, + t.memoryBuffersBytes, + t.memoryFreeBytes, + t.memoryTotalBytes, + ], + description= + ||| + - Used: The amount of physical memory currently in use by the system. + - Cached: The amount of physical memory used for caching data from disk. The Linux kernel uses available memory to cache data that is read from or written to disk. This helps speed up disk access times. + - Free: The amount of physical memory that is currently not in use. + - Buffers: The amount of physical memory used for temporary storage of data being transferred between devices or applications. + - Available: The amount of physical memory that is available for use by applications. This takes into account memory that is currently being used for caching but can be freed up if needed. 
+ ||| + ) + + g.panel.timeSeries.standardOptions.withOverridesMixin( + { + __systemRef: 'hideSeriesFrom', + matcher: { + id: 'byNames', + options: { + mode: 'exclude', + names: [ + t.memoryTotalBytes.legendFormat, + t.memoryUsedBytes.legendFormat, + ], + prefix: 'All except:', + readOnly: true, + }, + }, + properties: [ + { + id: 'custom.hideFrom', + value: { + viz: true, + legend: false, + tooltip: false, + }, + }, + ], + } + ), + + memoryPagesInOut: + commonlib.panels.memory.timeSeries.base.new( + 'Memory pages in / out', + targets=[t.memoryPagesIn, t.memoryPagesOut], + description=||| + Page-In - Return of pages to physical memory. This is a common and normal event. + + Page-Out - process of writing pages to disk. Unlike page-in, page-outs can indicate trouble. + When the kernel detects low memory, it attempts to free memory by paging out. + While occasional page-outs are normal, excessive and frequent page-outs can lead to thrashing. + Thrashing is a state in which the kernel spends more time managing paging activity than running applications, resulting in poor system performance. + ||| + ) + + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), + + memoryPagesSwapInOut: + commonlib.panels.memory.timeSeries.base.new( + 'Memory pages swapping in / out', + targets=[t.memoryPagesSwapIn, t.memoryPagesSwapOut], + description=||| + Compared to the speed of the CPU and main memory, writing pages out to disk is relatively slow. + Nonetheless, it is a preferable option to crashing or killing off processes. + + The process of writing pages out to disk to free memory is known as swapping-out. + If a page fault occurs because the page is on disk, in the swap area, rather than in memory, + the kernel will read the page back in from the disk to satisfy the page fault. + This is known as swapping-in. 
+ ||| + ) + + commonlib.panels.network.timeSeries.base.withNegateOutPackets(), + + memoryPagesFaults: + commonlib.panels.memory.timeSeries.base.new( + 'Memory page faults', + targets=[t.memoryPageMajorFaults, t.memoryPageMinorFaults], + description=||| + A page fault is an exception raised by the memory when a process accesses a memory page without the necessary preparations, + requiring a mapping to be added to the process's virtual address space. + + The page contents may also need to be loaded from a backing store such as a disk. + While the MMU detects the page fault, the operating system's kernel handles the exception by either making the required page accessible in physical memory or denying an illegal memory access. + Valid page faults are common and necessary to increase memory availability in any operating system that uses virtual memory, including Windows, macOS, and the Linux kernel. + |||, + ), + + memoryOOMkiller: + commonlib.panels.memory.timeSeries.base.new( + 'OOM Killer', + targets=[t.memoryOOMkiller], + description=||| + Out Of Memory killer is a process used by the Linux kernel when the system is running critically low on memory. + + This can happen when the kernel has allocated more memory than is available for its processes. + ||| + ), + + memoryActiveInactive: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory active / inactive', + targets=[t.memoryActiveBytes, t.memoryInactiveBytes], + description=||| + - Inactive: Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. + - Active: Memory that has been used more recently and usually not reclaimed unless absolutely necessary. + |||, + ), + + memoryActiveInactiveDetail: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory active / inactive details', + targets=[t.memoryInactiveFile, t.memoryInactiveAnon, t.memoryActiveFile, t.memoryActiveAnon], + description=||| + - Inactive_file: File-backed memory on inactive LRU list. 
+ - Inactive_anon: Anonymous and swap cache on inactive LRU list, including tmpfs (shmem). + - Active_file: File-backed memory on active LRU list. + - Active_anon: Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs. + |||, + ), + + memoryCommited: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory commited', + targets=[t.memoryCommitedAs, t.memoryCommitedLimit], + description=||| + - Committed_AS - Amount of memory presently allocated on the system. + - CommitLimit - Amount of memory currently available to be allocated on the system. + ||| + ), + + memorySharedAndMapped: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory shared and mapped', + targets=[t.memoryMappedBytes, t.memoryShmemBytes, t.memoryShmemPmdMappedBytes, t.memoryShmemHugePagesBytes], + description=||| + - Mapped: This refers to the memory used in mapped page files that have been memory mapped, such as libraries. + - Shmem: This is the memory used by shared memory, which is shared between multiple processes, including RAM disks. + - ShmemHugePages: This is the memory used by shared memory and tmpfs allocated with huge pages. + - ShmemPmdMapped: This is the amount of shared memory (shmem/tmpfs) backed by huge pages. + ||| + ), + memoryWriteAndDirty: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory writeback and dirty', + targets=[t.memoryWriteback, t.memoryWritebackTmp, t.memoryDirty], + description=||| + - Writeback: This refers to the memory that is currently being actively written back to the disk. + - WritebackTmp: This is the memory used by FUSE for temporary writeback buffers. + - Dirty: This type of memory is waiting to be written back to the disk. 
+ ||| + ), + memoryVmalloc: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory Vmalloc', + targets=[t.memoryVmallocChunk, t.memoryVmallocTotal, t.memoryVmallocUsed], + description=||| + Virtual Memory Allocation is a type of memory allocation in Linux that allows a process to request a contiguous block of memory larger than the amount of physically available memory. This is achieved by mapping the requested memory to virtual addresses that are backed by a combination of physical memory and swap space on disk. + + - VmallocChunk: Largest contiguous block of vmalloc area which is free. + - VmallocTotal: Total size of vmalloc memory area. + - VmallocUsed: Amount of vmalloc area which is used. + ||| + ), + memorySlab: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory slab', + targets=[t.memorySlabSUnreclaim, t.memorySlabSReclaimable], + description=||| + Slab Allocation is a type of memory allocation in Linux that allows the kernel to efficiently manage the allocation and deallocation of small and frequently used data structures, such as network packets, file system objects, and process descriptors. + + The Slab Allocator maintains a cache of pre-allocated objects of a fixed size and type, called slabs. When an application requests an object of a particular size and type, the Slab Allocator checks if a pre-allocated object of that size and type is available in the cache. If an object is available, it is returned to the application; if not, a new slab of objects is allocated and added to the cache. + + - SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure. + - SReclaimable: Part of Slab, that might be reclaimed, such as caches. 
+ ||| + ), + memoryAnonymous: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory slab', + targets=[t.memoryAnonHugePages, t.memoryAnonPages], + description=||| + Memory Anonymous refers to the portion of the virtual memory that is used by a process for dynamically allocated memory that is not backed by any file or device. + + This type of memory is commonly used for heap memory allocation, which is used by programs to allocate and free memory dynamically during runtime. + + Memory Anonymous is different from Memory Mapped files, which refer to portions of the virtual memory space that are backed by a file or device, + and from Memory Shared with other processes, + which refers to memory regions that can be accessed and modified by multiple processes. + + - AnonHugePages: Memory in anonymous huge pages. + - AnonPages: Memory in user pages not backed by files. + ||| + ), + + memoryHugePagesCounter: + commonlib.panels.memory.timeSeries.base.new( + 'Memory HugePages counter', + targets=[t.memoryHugePages_Free, t.memoryHugePages_Rsvd, t.memoryHugePages_Surp], + description= + ||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. + + - HugePages_Free: Huge pages in the pool that are not yet allocated. + - HugePages_Rsvd: Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made. + - HugePages_Surp: Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. 
+ ||| + ), + memoryHugePagesSize: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory HugePages size', + targets=[t.memoryHugePagesTotalSize, t.memoryHugePagesSize], + + description=||| + Huge Pages are a feature that allows for the allocation of larger memory pages than the standard 4KB page size. By using larger page sizes, the kernel can reduce the overhead associated with managing a large number of smaller pages, which can improve system performance for certain workloads. + ||| + ), + + memoryDirectMap: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory direct map', + targets=[t.memoryDirectMap1G, t.memoryDirectMap2M, t.memoryDirectMap4k], + + description=||| + Direct Map memory refers to the portion of the kernel's virtual address space that is directly mapped to physical memory. This mapping is set up by the kernel during boot time and is used to provide fast access to certain critical kernel data structures, such as page tables and interrupt descriptor tables. + ||| + ), + memoryBounce: + commonlib.panels.memory.timeSeries.usageBytes.new( + 'Memory bounce', + targets=[t.memoryBounce], + description=||| + Memory bounce is a technique used in the Linux kernel to handle situations where direct memory access (DMA) is required but the physical memory being accessed is not contiguous. This can happen when a device, such as a network interface card or a disk controller, requires access to a large amount of memory that is not available as a single contiguous block. + + To handle this situation, the kernel uses a technique called memory bouncing. In memory bouncing, the kernel sets up a temporary buffer in physical memory that is large enough to hold the entire data block being transferred by the device. The data is then copied from the non-contiguous source memory to the temporary buffer, which is physically contiguous. + + - Bounce: Memory used for block device bounce buffers. 
+ ||| + ), + diskTotalRoot: + commonlib.panels.disk.stat.total.new( + 'Root mount size', + targets=[t.diskTotalRoot], + description=||| + Total capacity on the primary mount point /. + ||| + ), + diskUsage: + commonlib.panels.disk.table.usage.new( + totalTarget= + ( + t.diskTotal + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + ), + freeTarget= + t.diskFree + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true), + groupLabel='mountpoint' + , + description='Disk utilisation in percent, by mountpoint. Some duplication can occur if the same filesystem is mounted in multiple locations.' + ), + diskFreeTs: + commonlib.panels.disk.timeSeries.available.new( + 'Filesystem space availabe', + targets=[ + t.diskFree, + ], + description='Filesystem space utilisation in bytes, by mountpoint.' + ), + diskInodesFree: + commonlib.panels.disk.timeSeries.base.new( + 'Free inodes', + targets=[t.diskInodesFree], + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.' + ) + + g.panel.timeSeries.standardOptions.withUnit('short'), + diskInodesTotal: + commonlib.panels.disk.timeSeries.base.new( + 'Total inodes', + targets=[t.diskInodesTotal], + description='The inode is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.', + ) + + g.panel.timeSeries.standardOptions.withUnit('short'), + diskErrorsandRO: + commonlib.panels.disk.timeSeries.base.new( + 'Filesystems with errors / read-only', + targets=[ + t.diskDeviceError, + t.diskReadOnly, + ], + description='', + ) + + g.panel.timeSeries.standardOptions.withMax(1), + fileDescriptors: + commonlib.panels.disk.timeSeries.base.new( + 'File descriptors', + targets=[ + t.processMaxFds, + t.processOpenFds, + ], + description=||| + File descriptor is a handle to an open file or input/output (I/O) resource, such as a network socket or a pipe. 
+ The operating system uses file descriptors to keep track of open files and I/O resources, and provides a way for programs to read from and write to them. + ||| + ), + diskUsagePercentTopK: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk space usage', + target=t.diskUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + diskIOBytesPerSec: commonlib.panels.disk.timeSeries.ioBytesPerSec.new( + targets=[t.diskIOreadBytesPerSec, t.diskIOwriteBytesPerSec, t.diskIOutilization] + ), + diskIOutilPercentTopK: + commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk IO', + target=t.diskIOutilization, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + diskIOps: + commonlib.panels.disk.timeSeries.iops.new( + targets=[ + t.diskIOReads, + t.diskIOWrites, + ] + ), + + diskQueue: + commonlib.panels.disk.timeSeries.ioQueue.new( + 'Disk average queue', + targets= + [ + t.diskAvgQueueSize, + ] + ), + diskIOWaitTime: commonlib.panels.disk.timeSeries.ioWaitTime.new( + targets=[ + t.diskIOWaitReadTime, + t.diskIOWaitWriteTime, + ] + ), + osInfo: commonlib.panels.generic.stat.info.new( + 'OS', + targets=[t.osInfo], + description='Operating system' + ) + { options+: { reduceOptions+: { fields: '/^pretty_name$/' } } }, + kernelVersion: + commonlib.panels.generic.stat.info.new('Kernel version', + targets=[t.unameInfo], + description='Kernel version of linux host.') + { options+: { reduceOptions+: { fields: '/^release$/' } } }, + osTimezone: + commonlib.panels.generic.stat.info.new( + 'Timezone', targets=[t.osTimezone], description='Current system timezone.' + ) + { options+: { reduceOptions+: { fields: '/^time_zone$/' } } }, + hostname: + commonlib.panels.generic.stat.info.new( + 'Hostname', + targets=[t.unameInfo], + description="System's hostname." 
+ ) + { options+: { reduceOptions+: { fields: '/^nodename$/' } } }, + networkErrorsAndDroppedPerSec: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors and dropped packets', + targets=std.map( + function(t) t + { + expr: t.expr + '>0', + }, + [ + t.networkOutErrorsPerSec, + t.networkInErrorsPerSec, + t.networkOutDroppedPerSec, + t.networkInDroppedPerSec, + ] + ), + description=||| + **Network errors**: + + Network errors refer to issues that occur during the transmission of data across a network. + + These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. + + Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. + + **Dropped packets**: + + Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. + + Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. + + Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. + ||| + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkErrorsAndDroppedPerSecTopK: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors and dropped packets', + targets=std.map( + function(t) t + { + expr: 'topk(25, ' + t.expr + ')>0', + legendFormat: '{{' + this.config.instanceLabels[0] + '}}: ' + std.get(t, 'legendFormat', '{{ nic }}'), + }, + [ + t.networkOutErrorsPerSec, + t.networkInErrorsPerSec, + t.networkOutDroppedPerSec, + t.networkInDroppedPerSec, + ] + ), + description=||| + Top 25. 
+ + **Network errors**: + + Network errors refer to issues that occur during the transmission of data across a network. + + These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. + + Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. + + **Dropped packets**: + + Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. + + Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. + + Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. + ||| + ) + + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('points') + + g.panel.timeSeries.fieldConfig.defaults.custom.withPointSize(5), + + networkErrorsPerSec: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors', + targets=[t.networkInErrorsPerSec, t.networkOutErrorsPerSec] + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkDroppedPerSec: + commonlib.panels.network.timeSeries.dropped.new( + targets=[t.networkInDroppedPerSec, t.networkOutDroppedPerSec] + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkUsagePerSec: + commonlib.panels.network.timeSeries.traffic.new( + targets=[t.networkInBitPerSecFiltered, t.networkOutBitPerSecFiltered] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkPacketsPerSec: + commonlib.panels.network.timeSeries.packets.new( + targets=[t.networkInPacketsPerSec, t.networkOutPacketsPerSec] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + 
networkMulticastPerSec: + commonlib.panels.network.timeSeries.multicast.new( + 'Multicast packets', + targets=[t.networkInMulticastPacketsPerSec, t.networkOutMulticastPacketsPerSec], + description='Multicast packets received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + + networkFifo: + commonlib.panels.network.timeSeries.packets.new( + 'Network FIFO', + targets=[t.networkFifoInPerSec, t.networkFifoOutPerSec], + description=||| + Network FIFO (First-In, First-Out) refers to a buffer used by the network stack to store packets in a queue. + It is a mechanism used to manage network traffic and ensure that packets are delivered to their destination in the order they were received. + Packets are stored in the FIFO buffer until they can be transmitted or processed further. + ||| + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkCompressedPerSec: + commonlib.panels.network.timeSeries.packets.new( + 'Compressed packets', + targets=[t.networkCompressedInPerSec, t.networkCompressedOutPerSec], + description=||| + - Compressed received: + Number of correctly received compressed packets. This counter is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). + + - Compressed transmitted: + Number of transmitted compressed packets. This counter is only meaningful for interfaces which support packet compression (e.g. CSLIP, PPP). 
+ + https://docs.kernel.org/networking/statistics.html + |||, + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkNFConntrack: + commonlib.panels.generic.timeSeries.base.new( + 'NF conntrack', + targets=[t.networkNFConntrackEntries, t.networkNFConntrackLimits], + description=||| + NF Conntrack is a component of the Linux kernel's netfilter framework that provides stateful packet inspection to track and manage network connections, + enforce firewall rules, perform NAT, and manage network address/port translation. + ||| + ) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(0), + + networkSoftnet: + commonlib.panels.network.timeSeries.packets.new( + 'Softnet packets', + targets=[t.networkSoftnetProcessedPerSec, t.networkSoftnetDroppedPerSec], + description=||| + Softnet packets are received by the network and queued for processing by the kernel's networking stack. + Softnet packets are usually generated by network traffic that is directed to the local host, and they are typically processed by the kernel's networking subsystem before being passed on to the relevant application. + ||| + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets('/dropped/') + + g.panel.timeSeries.fieldConfig.defaults.custom.withAxisLabel('Dropped(-) | Processed(+)'), + networkSoftnetSqueeze: + commonlib.panels.network.timeSeries.packets.new( + 'Softnet out of quota', + targets=[t.networkSoftnetSqueezedPerSec], + description=||| + "Softnet out of quota" is a network-related metric in Linux that measures the number of times the kernel's softirq processing was unable to handle incoming network traffic due to insufficient softirq processing capacity. + This means that the kernel has reached its processing capacity limit for incoming packets, and any additional packets will be dropped or deferred. 
+ ||| + ), + networkOperStatus: + commonlib.panels.network.statusHistory.interfaceStatus.new( + 'Network interfaces carrier status', + targets=[t.networkCarrier], + description='Network interfaces carrier status', + ), + networkOverviewTable: + commonlib.panels.generic.table.base.new( + 'Network interfaces overview', + targets= + [ + t.networkUp + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Up'), + t.networkCarrier + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Carrier'), + t.networkOutBitPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(false) + + g.query.prometheus.withRefId('Transmitted'), + t.networkInBitPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(false) + + g.query.prometheus.withRefId('Received'), + t.networkArpEntries + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('ARP entries'), + t.networkMtuBytes + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('MTU'), + t.networkSpeedBitsPerSec + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Speed'), + t.networkTransmitQueueLength + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Queue length'), + t.networkInfo + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Info'), + ], + description='Network interfaces overview.' 
+ ) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byName.new('Speed') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bps') + ), + ]) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Transmitted|Received') + + fieldOverride.byRegexp.withProperty('custom.displayMode', 'gradient-gauge') + + fieldOverride.byRegexp.withPropertiesFromOptions( + table.standardOptions.withUnit('bps') + + table.standardOptions.color.withMode('continuous-BlYlRd') + + table.standardOptions.withMax(1000 * 1000 * 100) + ), + ]) + + g.panel.table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Carrier|Up') + + fieldOverride.byRegexp.withProperty('custom.displayMode', 'color-text') + + fieldOverride.byRegexp.withPropertiesFromOptions( + table.standardOptions.withMappings( + { + type: 'value', + options: { + '0': { + text: 'Down', + color: 'light-red', + index: 0, + }, + '1': { + text: 'Up', + color: 'light-green', + index: 1, + }, + }, + } + ), + ), + ]) + + table.queryOptions.withTransformationsMixin( + [ + { + id: 'joinByField', + options: { + byField: 'device', + mode: 'outer', + }, + }, + { + id: 'filterFieldsByName', + options: { + include: { + pattern: 'device|duplex|address|Value.+', + }, + }, + }, + { + id: 'renameByRegex', + options: { + regex: '(Value) #(.*)', + renamePattern: '$2', + }, + }, + { + id: 'organize', + options: { + excludeByName: { + Info: true, + }, + renameByName: + { + device: 'Interface', + duplex: 'Duplex', + address: 'Address', + }, + }, + }, + { + id: 'organize', + options: { + indexByName: { + Interface: 0, + Up: 1, + Carrier: 2, + Received: 3, + Transmitted: 4, + }, + }, + }, + ] + ), + networkSockstatAll: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets in use', + targets=[t.networkSocketsUsed], + description='Number of sockets currently in use.', + ), + + networkSockstatTCP: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets 
TCP', + targets=[t.networkSocketsTCPAllocated, t.networkSocketsTCPIPv4, t.networkSocketsTCPIPv6, t.networkSocketsTCPOrphans, t.networkSocketsTCPTimeWait], + description=||| + TCP sockets are used for establishing and managing network connections between two endpoints over the TCP/IP protocol. + + Orphan sockets: If a process terminates unexpectedly or is terminated without closing its sockets properly, the sockets may become orphaned. + ||| + ), + networkSockstatUDP: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets UDP', + targets=[t.networkSocketsUDPLiteInUse, t.networkSocketsUDPInUse, t.networkSocketsUDPLiteIPv6InUse, t.networkSocketsUDPIPv6InUse], + description=||| + UDP (User Datagram Protocol) and UDPlite (UDP-Lite) sockets are used for transmitting and receiving data over the UDP and UDPlite protocols, respectively. + Both UDP and UDPlite are connectionless protocols that do not provide a reliable data delivery mechanism. + ||| + ), + networkSockstatOther: + commonlib.panels.generic.timeSeries.base.new( + 'Sockets other', + targets=[t.networkSocketsFragInUse, t.networkSocketsFragIPv6InUse, t.networkSocketsRawInUse, t.networkSocketsIPv6RawInUse], + description=||| + FRAG (IP fragment) sockets: Used to receive and process fragmented IP packets. FRAG sockets are useful in network monitoring and analysis. + + RAW sockets: Allow applications to send and receive raw IP packets directly without the need for a transport protocol like TCP or UDP. + ||| + ), + networkSockstatMemory: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.generic.timeSeries.base.new( + title='Sockets memory', + targets=[t.networkSocketsTCPMemoryPages, t.networkSocketsUDPMemoryPages, t.networkSocketsTCPMemoryBytes, t.networkSocketsUDPMemoryBytes], + description=||| + Memory currently in use for sockets. 
+ |||, + ) + + panel.queryOptions.withMaxDataPoints(100) + + panel.fieldConfig.defaults.custom.withAxisLabel('Pages') + + panel.standardOptions.withOverridesMixin( + panel.standardOptions.override.byRegexp.new('/bytes/') + + override.byType.withPropertiesFromOptions( + panel.standardOptions.withDecimals(2) + + panel.standardOptions.withUnit('bytes') + + panel.fieldConfig.defaults.custom.withDrawStyle('bars') + + panel.fieldConfig.defaults.custom.withStacking(value={ mode: 'normal', group: 'A' }) + + panel.fieldConfig.defaults.custom.withAxisLabel('Bytes') + ) + ), + + networkNetstatIP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'IP octets', + targets=[t.networkNetstatIPInOctetsPerSec, t.networkNetstatIPOutOctetsPerSec], + description='Rate of IP octets received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('oct/s'), + + networkNetstatTCP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'TCP segments', + targets=[t.networkNetstatTCPInSegmentsPerSec, t.networkNetstatTCPOutSegmentsPerSec], + description='Rate of TCP segments received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('seg/s'), + + networkNetstatTCPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='TCP errors rate', + targets=[ + t.networkNetstatTCPOverflowPerSec, + t.networkNetstatTCPListenDropsPerSec, + t.networkNetstatTCPRetransPerSec, + t.networkNetstatTCPRetransSegPerSec, + t.networkNetstatTCPInWithErrorsPerSec, + t.networkNetstatTCPOutWithRstPerSec, + ], + description='Rate of TCP errors.' 
+ ) + + panel.standardOptions.withUnit('err/s'), + + networkNetstatUDP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'UDP datagrams', + targets=[ + t.networkNetstatIPInUDPPerSec, + t.networkNetstatIPOutUDPPerSec, + t.networkNetstatIPInUDP6PerSec, + t.networkNetstatIPOutUDP6PerSec, + ], + description='Rate of UDP datagrams received and transmitted.' + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('dat/s'), + + networkNetstatUDPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='UDP errors rate', + targets=[ + t.networkNetstatUDPLiteInErrorsPerSec, + t.networkNetstatUDPInErrorsPerSec, + t.networkNetstatUDP6InErrorsPerSec, + t.networkNetstatUDPNoPortsPerSec, + t.networkNetstatUDP6NoPortsPerSec, + t.networkNetstatUDPRcvBufErrsPerSec, + t.networkNetstatUDP6RcvBufErrsPerSec, + t.networkNetstatUDPSndBufErrsPerSec, + t.networkNetstatUDP6SndBufErrsPerSec, + ], + description='Rate of UDP errors.' + ) + + panel.standardOptions.withUnit('err/s'), + + networkNetstatICMP: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.packets.new( + 'ICMP messages', + targets=[ + t.networkNetstatICMPInPerSec, + t.networkNetstatICMPOutPerSec, + t.networkNetstatICMP6InPerSec, + t.networkNetstatICMP6OutPerSec, + ], + description="Rate of ICMP messages, like 'ping', received and transmitted." 
+ ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets() + + panel.standardOptions.withUnit('msg/s'), + + networkNetstatICMPerrors: + local panel = g.panel.timeSeries; + local override = g.panel.timeSeries.standardOptions.override; + commonlib.panels.network.timeSeries.errors.new( + title='ICMP errors rate', + targets=[ + t.networkNetstatICMPInErrorsPerSec, + t.networkNetstatICM6PInErrorsPerSec, + ], + description='Rate of ICMP messages received and transmitted with errors.' + ) + + panel.standardOptions.withUnit('err/s'), + }, +} diff --git a/docs/node-observ-lib/linux/rules.libsonnet b/docs/node-observ-lib/linux/rules.libsonnet new file mode 100644 index 0000000000..97ed548de3 --- /dev/null +++ b/docs/node-observ-lib/linux/rules.libsonnet @@ -0,0 +1,119 @@ +{ + new(this): { + groups+: [ + { + name: if this.config.uid == 'node' then 'node-exporter.rules' else this.config.uid + '-linux-rules', + rules: [ + { + // This rule gives the number of CPUs per node. + record: 'instance:node_num_cpu:sum', + expr: ||| + count without (cpu, mode) ( + node_cpu_seconds_total{%(filteringSelector)s,mode="idle"} + ) + ||| % this.config, + }, + { + // CPU utilisation is % CPU without {idle,iowait,steal}. + record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % this.config, + expr: ||| + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{%(filteringSelector)s, mode=~"idle|iowait|steal"}[%(rateInterval)s])) + ) + ||| % this.config, + }, + { + // This is CPU saturation: 1min avg run queue length / number of CPUs. + // Can go over 1. + // TODO: There are situation where a run queue >1/core is just normal and fine. + // We need to clarify how to read this metric and if its usage is helpful at all. 
+ record: 'instance:node_load1_per_cpu:ratio', + expr: ||| + ( + node_load1{%(filteringSelector)s} + / + instance:node_num_cpu:sum{%(filteringSelector)s} + ) + ||| % this.config, + }, + { + // Memory utilisation (ratio of used memory per instance). + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + 1 - ( + ( + node_memory_MemAvailable_bytes{%(filteringSelector)s} + or + ( + node_memory_Buffers_bytes{%(filteringSelector)s} + + + node_memory_Cached_bytes{%(filteringSelector)s} + + + node_memory_MemFree_bytes{%(filteringSelector)s} + + + node_memory_Slab_bytes{%(filteringSelector)s} + ) + ) + / + node_memory_MemTotal_bytes{%(filteringSelector)s} + ) + ||| % this.config, + }, + { + record: 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s' % this.config, + expr: ||| + rate(node_vmstat_pgmajfault{%(filteringSelector)s}[%(rateInterval)s]) + ||| % this.config, + }, + { + // Disk utilisation (seconds spent, 1 second rate). + record: 'instance_device:node_disk_io_time_seconds:rate%(rateInterval)s' % this.config, + expr: ||| + rate(node_disk_io_time_seconds_total{%(filteringSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) + ||| % this.config, + }, + { + // Disk saturation (weighted seconds spent, 1 second rate). 
+ record: 'instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s' % this.config, + expr: ||| + rate(node_disk_io_time_weighted_seconds_total{%(filteringSelector)s, %(diskDeviceSelector)s}[%(rateInterval)s]) + ||| % this.config, + }, + { + record: 'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s' % this.config, + expr: ||| + sum without (device) ( + rate(node_network_receive_bytes_total{%(filteringSelector)s, device!="lo"}[%(rateInterval)s]) + ) + ||| % this.config, + }, + { + record: 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s' % this.config, + expr: ||| + sum without (device) ( + rate(node_network_transmit_bytes_total{%(filteringSelector)s, device!="lo"}[%(rateInterval)s]) + ) + ||| % this.config, + }, + // TODO: Find out if those drops ever happen on modern switched networks. + { + record: 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s' % this.config, + expr: ||| + sum without (device) ( + rate(node_network_receive_drop_total{%(filteringSelector)s, device!="lo"}[%(rateInterval)s]) + ) + ||| % this.config, + }, + { + record: 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s' % this.config, + expr: ||| + sum without (device) ( + rate(node_network_transmit_drop_total{%(filteringSelector)s, device!="lo"}[%(rateInterval)s]) + ) + ||| % this.config, + }, + ], + }, + ], + }, +} diff --git a/docs/node-observ-lib/linux/targets.libsonnet b/docs/node-observ-lib/linux/targets.libsonnet new file mode 100644 index 0000000000..f3b6dcb1ff --- /dev/null +++ b/docs/node-observ-lib/linux/targets.libsonnet @@ -0,0 +1,1139 @@ +local g = import '../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name 
+ '}', + uptimeQuery:: 'node_boot_time_seconds', + + reboot: + prometheusQuery.new( + prometheusDatasource, + self.uptimeQuery + '{%(queriesSelector)s}*1000 > $__from < $__to' % variables, + ), + + serviceFailed: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="init.scope"} |= "code=exited, status=1/FAILURE"' % variables + ), + // those events should be rare, so can be shown as annotations + criticalEvents: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, transport="kernel", level="emerg"}' % variables + ), + memoryOOMkiller: + prometheusQuery.new( + prometheusDatasource, + 'increase(node_vmstat_oom_kill{%(queriesSelector)s}[$__interval])' % variables, + ) + + prometheusQuery.withLegendFormat('OOM killer invocations'), + + kernelUpdate: + prometheusQuery.new( + prometheusDatasource, + expr=||| + changes( + sum by (%(instanceLabels)s) ( + group by (%(instanceLabels)s,release) (node_uname_info{%(queriesSelector)s}) + ) + [$__interval:1m] offset -$__interval) > 1 + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + + // new interactive session in logs: + sessionOpened: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="systemd-logind.service"}|= "New session"' % variables + ), + sessionClosed: + lokiQuery.new( + lokiDatasource, + '{%(queriesSelector)s, unit="systemd-logind.service"} |= "logged out"' % variables + ), + + alertsCritical: + prometheusQuery.new( + prometheusDatasource, + 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + alertsWarning: + prometheusQuery.new( + prometheusDatasource, + 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) 
(node_uname_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + + uptime: + prometheusQuery.new( + prometheusDatasource, + 'time() - ' + self.uptimeQuery + '{%(queriesSelector)s}' % variables + ), + cpuCount: + prometheusQuery.new( + prometheusDatasource, + 'count without (cpu) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"})' % variables + ) + + prometheusQuery.withLegendFormat('Cores'), + cpuUsage: + prometheusQuery.new( + prometheusDatasource, + ||| + (((count by (%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s))) + - + avg by (%(instanceLabels)s) (sum by (%(instanceLabels)s, mode)(irate(node_cpu_seconds_total{mode='idle',%(queriesSelector)s}[$__rate_interval])))) * 100) + / + count by(%(instanceLabels)s) (count(node_cpu_seconds_total{%(queriesSelector)s}) by (cpu, %(instanceLabels)s)) + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ) + + prometheusQuery.withLegendFormat('CPU usage'), + cpuUsagePerCore: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(queriesSelector)s, mode=~"idle|iowait|steal"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(queriesSelector)s, mode="idle"}) + ) * 100 + ||| % variables, + ) + + prometheusQuery.withLegendFormat('CPU {{cpu}}'), + cpuUsageByMode: + prometheusQuery.new( + prometheusDatasource, + ||| + sum by(%(instanceLabels)s, mode) (irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval])) + / on(%(instanceLabels)s) + group_left sum by (%(instanceLabels)s)((irate(node_cpu_seconds_total{%(queriesSelector)s}[$__rate_interval]))) * 100 + ||| % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ) + + prometheusQuery.withLegendFormat('{{ mode }}'), + memoryTotalBytes: + prometheusQuery.new( + prometheusDatasource, + 
'node_memory_MemTotal_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory total'), + memoryFreeBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_MemFree_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory free'), + memoryAvailableBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_MemAvailable_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory available'), + memoryCachedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Cached_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory cached'), + memoryBuffersBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Buffers_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory buffers'), + memoryUsedBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_MemTotal_bytes{%(queriesSelector)s} + - + node_memory_MemFree_bytes{%(queriesSelector)s} + - + node_memory_Buffers_bytes{%(queriesSelector)s} + - + node_memory_Cached_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + memoryUsagePercent: + prometheusQuery.new( + prometheusDatasource, + ||| + 100 - + ( + avg by (%(instanceLabels)s) (node_memory_MemAvailable_bytes{%(queriesSelector)s}) / + avg by (%(instanceLabels)s) (node_memory_MemTotal_bytes{%(queriesSelector)s}) + * 100 + ) + ||| + % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + memorySwapTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SwapTotal_bytes{%(queriesSelector)s}' % variables + ), + memoryPagesIn: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pgpgin{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Page-In'), + memoryPagesOut: + prometheusQuery.new( + 
prometheusDatasource, + 'irate(node_vmstat_pgpgout{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Page-Out'), + + memoryPagesSwapIn: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pswpin{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Pages swapped in'), + memoryPagesSwapOut: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pswpout{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Pages swapped out'), + + memoryPageMajorFaults: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Major page fault operations'), + memoryPageMinorFaults: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_vmstat_pgfault{%(queriesSelector)s}[$__rate_interval]) + - + irate(node_vmstat_pgmajfault{%(queriesSelector)s}[$__rate_interval]) + ||| % variables, + ) + + prometheusQuery.withLegendFormat('Minor page fault operations'), + + memoryInactiveBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Inactive_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive'), + memoryActiveBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Active_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active'), + + memoryInactiveFile: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Inactive_file_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive_file'), + + memoryInactiveAnon: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Inactive_anon_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Inactive_anon'), + + memoryActiveFile: + prometheusQuery.new( + prometheusDatasource, + 
'node_memory_Active_file_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active_file'), + + memoryActiveAnon: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Active_anon_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Active_anon'), + + memoryCommitedAs: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Committed_AS_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Commited_AS'), + memoryCommitedLimit: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_CommitLimit_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('CommitLimit'), + + memoryMappedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Mapped_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Mapped'), + memoryShmemBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Shmem_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Shmem'), + memoryShmemHugePagesBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_ShmemHugePages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('ShmemHugePages'), + memoryShmemPmdMappedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_ShmemPmdMapped_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('ShmemPmdMapped'), + memoryWriteback: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Writeback_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Writeback'), + memoryWritebackTmp: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_WritebackTmp_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('WritebackTmp'), + memoryDirty: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Dirty_bytes{%(queriesSelector)s}' % variables, + ) + 
+ prometheusQuery.withLegendFormat('Dirty'), + + memoryVmallocChunk: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocChunk_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocChunk'), + memoryVmallocTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocTotal_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocTotal'), + memoryVmallocUsed: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_VmallocUsed_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('VmallocUsed'), + memorySlabSUnreclaim: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SUnreclaim_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('SUnreclaim'), + memorySlabSReclaimable: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_SReclaimable_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('SReclaimable'), + + memoryAnonHugePages: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_AnonHugePages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('AnonHugePages'), + memoryAnonPages: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_AnonPages_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('AnonPages'), + + memoryHugePages_Free: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Free{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('HugePages_Free'), + memoryHugePages_Rsvd: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Rsvd{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('HugePages_Rsvd'), + memoryHugePages_Surp: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Surp{%(queriesSelector)s}' % variables, + ) + + 
prometheusQuery.withLegendFormat('HugePages_Surp'), + memoryHugePagesTotalSize: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_HugePages_Total{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Huge pages total size'), + memoryHugePagesSize: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Hugepagesize_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Huge page size'), + memoryDirectMap1G: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap1G_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap1G'), + memoryDirectMap2M: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap2M_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap2M'), + memoryDirectMap4k: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_DirectMap4k_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('DirectMap4k'), + memoryBounce: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_Bounce_bytes{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Bounce'), + + diskTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ), + diskTotalRoot: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_size_bytes{%(queriesSelector)s, mountpoint="/", fstype!="rootfs"}' % variables, + ), + diskUsageRoot: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_avail_bytes{%(queriesSelector)s, mountpoint="/",fstype!="rootfs"}' % variables + ), + diskUsageRootPercent: + prometheusQuery.new( + prometheusDatasource, + '100 - node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs", 
%(queriesSelector)s}/node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs", %(queriesSelector)s}*100' % variables + ), + diskFree: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} free'), + diskUsagePercent: + prometheusQuery.new( + prometheusDatasource, + '100 - node_filesystem_avail_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}/node_filesystem_size_bytes{%(fsSelector)s, %(fsMountpointSelector)s, %(queriesSelector)s}*100' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} used, %'), + + diskInodesFree: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_files_free{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector }, + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes free'), + diskInodesTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_files{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + prometheusQuery.withLegendFormat('{{ mountpoint }} inodes total'), + diskReadOnly: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_readonly{%(queriesSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} read-only'), + diskDeviceError: + prometheusQuery.new( + prometheusDatasource, + 'node_filesystem_device_error{%(queriesSelector)s, %(fsSelector)s, 
%(fsMountpointSelector)s}' % variables { fsMountpointSelector: config.fsMountpointSelector, fsSelector: config.fsSelector } + ) + + prometheusQuery.withLegendFormat('{{ mountpoint }} device error'), + // descriptors + processMaxFds: + prometheusQuery.new( + prometheusDatasource, + 'process_max_fds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Maximum open file descriptors'), + processOpenFds: + prometheusQuery.new( + prometheusDatasource, + 'process_open_fds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Open file descriptors'), + + // disk(device) + diskIOreadBytesPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_read_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} read'), + diskIOwriteBytesPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_written_bytes_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} written'), + diskIOutilization: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_io_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} io util'), + diskAvgQueueSize: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_disk_io_time_weighted_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval])' % variables { diskDeviceSelector: config.diskDeviceSelector }, + ) + + prometheusQuery.withLegendFormat('{{ device }} avg queue'), + + diskIOWaitWriteTime: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_write_time_seconds_total{%(queriesSelector)s, 
%(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} avg write time'), + diskIOWaitReadTime: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_read_time_seconds_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + / + irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} avg read time'), + diskIOReads: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_reads_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} reads'), + diskIOWrites: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_disk_writes_completed_total{%(queriesSelector)s, %(diskDeviceSelector)s}[$__rate_interval]) + ||| % variables { diskDeviceSelector: config.diskDeviceSelector } + ) + + prometheusQuery.withLegendFormat('{{ device }} writes'), + + unameInfo: + prometheusQuery.new( + prometheusDatasource, + 'node_uname_info{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withFormat('table'), + osInfo: + prometheusQuery.new( + prometheusDatasource, + ||| + node_os_info{%(queriesSelector)s} + ||| % variables, + ) + + prometheusQuery.withFormat('table'), + osInfoCombined: + prometheusQuery.new( + prometheusDatasource, + ||| + node_uname_info{%(queriesSelector)s} + * on (%(groupLabels)s,%(instanceLabels)s) + group_left(pretty_name) + node_os_info{%(queriesSelector)s} + ||| % variables { + instanceLabels: std.join(',', this.config.instanceLabels), + groupLabels: std.join(',', 
this.config.groupLabels), + }, + ) + + prometheusQuery.withFormat('table'), + + osTimezone: //timezone label + prometheusQuery.new( + prometheusDatasource, + 'node_time_zone_offset_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withFormat('table'), + + systemLoad1: + prometheusQuery.new( + prometheusDatasource, + 'node_load1{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('1m'), + systemLoad5: + prometheusQuery.new( + prometheusDatasource, + 'node_load5{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('5m'), + systemLoad15: + prometheusQuery.new( + prometheusDatasource, + 'node_load15{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('15m'), + + systemContextSwitches: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_context_switches_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Context switches'), + + systemInterrupts: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_intr_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Interrupts'), + + timeNtpStatus: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_sync_status{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('NTP status'), + + timeOffset: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_offset_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Time offset'), + + timeEstimatedError: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_estimated_error_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Estimated error in seconds'), + timeMaxError: + prometheusQuery.new( + prometheusDatasource, + 'node_timex_maxerror_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Maximum error in seconds'), + + networkUp: + 
prometheusQuery.new( + prometheusDatasource, + 'node_network_up{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('{{device}}'), + networkCarrier: + prometheusQuery.new( + prometheusDatasource, + 'node_network_carrier{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('{{device}}'), + networkArpEntries: + prometheusQuery.new( + prometheusDatasource, + 'node_arp_entries{%(queriesSelector)s}' % variables, + ), + networkMtuBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_network_mtu_bytes{%(queriesSelector)s}' % variables, + ), + networkSpeedBitsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'node_network_speed_bytes{%(queriesSelector)s} * 8' % variables, + ), + networkTransmitQueueLength: + prometheusQuery.new( + prometheusDatasource, + 'node_network_transmit_queue_length{%(queriesSelector)s}' % variables, + ), + networkInfo: + prometheusQuery.new( + prometheusDatasource, + 'node_network_info{%(queriesSelector)s}' % variables, + ), + + networkOutBitPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + networkInBitPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutBitPerSecFiltered: + prometheusQuery.new( + prometheusDatasource, + ||| + irate(node_network_transmit_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 + # only show interfaces that had traffic change at least once during selected dashboard interval: + and + increase( + node_network_transmit_bytes_total{%(queriesSelector)s}[$__range] + ) > 0 + ||| % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + networkInBitPerSecFiltered: + prometheusQuery.new( + 
prometheusDatasource, + ||| + irate(node_network_receive_bytes_total{%(queriesSelector)s}[$__rate_interval])*8 + # only show interfaces that had traffic change at least once during selected dashboard interval: + and + increase( + node_network_receive_bytes_total{%(queriesSelector)s}[$__range] + ) > 0 + ||| % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + + + networkOutErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} errors transmitted'), + networkInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_errs_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} errors received'), + networkOutDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted dropped'), + networkInDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_drop_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received dropped'), + + networkInPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_packets_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkInMulticastPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 
'irate(node_network_receive_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkOutMulticastPacketsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_multicast_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + networkFifoInPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkFifoOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_fifo_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkCompressedInPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_receive_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} received'), + networkCompressedOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_network_transmit_compressed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ device }} transmitted'), + + networkNFConntrackEntries: + prometheusQuery.new( + prometheusDatasource, + 'node_nf_conntrack_entries{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('NF conntrack entries'), + networkNFConntrackLimits: + prometheusQuery.new( + prometheusDatasource, + 'node_nf_conntrack_entries_limit{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('NF conntrack limits'), + + networkSoftnetProcessedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_processed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + 
prometheusQuery.withLegendFormat('CPU {{ cpu }} processed'), + networkSoftnetDroppedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_dropped_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('CPU {{ cpu }} dropped'), + networkSoftnetSqueezedPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_softnet_times_squeezed_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('CPU {{ cpu }} out of quota'), + + networkSocketsUsed: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_sockets_used{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 sockets in use'), + networkSocketsTCPAllocated: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_alloc{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Allocated'), + networkSocketsTCPIPv6: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 in use'), + networkSocketsTCPIPv4: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 in use'), + networkSocketsTCPOrphans: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_orphan{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Orphan sockets'), + networkSocketsTCPTimeWait: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_tw{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Time wait'), + + networkSocketsUDPLiteInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDPLITE_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 UDPLITE in use'), + networkSocketsUDPInUse: + prometheusQuery.new( + prometheusDatasource, + 
'node_sockstat_UDP_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 UDP in use'), + networkSocketsUDPLiteIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDPLITE6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 UDPLITE in use'), + networkSocketsUDPIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 UDP in use'), + + networkSocketsFragInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_FRAG_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 Frag sockets in use'), + networkSocketsFragIPv6InUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_FRAG6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 Frag sockets in use'), + networkSocketsRawInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_RAW_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv4 Raw sockets in use'), + networkSocketsIPv6RawInUse: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_RAW6_inuse{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('IPv6 Raw sockets in use'), + + networkSocketsTCPMemoryPages: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_mem{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory pages allocated for TCP sockets'), + networkSocketsUDPMemoryPages: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP_mem{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory pages allocated for UDP sockets'), + + networkSocketsTCPMemoryBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_TCP_mem_bytes{%(queriesSelector)s}' % variables + ) + + 
prometheusQuery.withLegendFormat('Memory bytes allocated for TCP sockets'), + networkSocketsUDPMemoryBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_sockstat_UDP_mem_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory bytes allocated for UDP sockets'), + + networkNetstatIPInOctetsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_IpExt_InOctets{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('Octets received'), + networkNetstatIPOutOctetsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_IpExt_OutOctets{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('Octets transmitted'), + + networkNetstatTCPInSegmentsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_InSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP received'), + networkNetstatTCPOutSegmentsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_OutSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP transmitted'), + + networkNetstatTCPOverflowPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_ListenOverflows{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP overflow'), + + networkNetstatTCPListenDropsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_ListenDrops{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP ListenDrops - SYNs to LISTEN sockets ignored'), + + networkNetstatTCPRetransPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_TcpExt_TCPSynRetrans{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP SYN retransmits'), + 
+ networkNetstatTCPRetransSegPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_RetransSegs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP retransmitted segments, containing one or more previously transmitted octets'), + networkNetstatTCPInWithErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_InErrs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP received with errors'), + + networkNetstatTCPOutWithRstPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Tcp_OutRsts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('TCP segments sent with RST flag'), + + networkNetstatIPInUDPPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP received'), + + networkNetstatIPOutUDPPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP transmitted'), + + networkNetstatIPInUDP6PerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_InDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 received'), + + networkNetstatIPOutUDP6PerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_OutDatagrams{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 transmitted'), + + //UDP errors + networkNetstatUDPLiteInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_UdpLite_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDPLite InErrors'), + + 
networkNetstatUDPInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP InErrors'), + networkNetstatUDP6InErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 InErrors'), + networkNetstatUDPNoPortsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP NoPorts'), + networkNetstatUDP6NoPortsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_NoPorts{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 NoPorts'), + networkNetstatUDPRcvBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP receive buffer errors'), + networkNetstatUDP6RcvBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_RcvbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 receive buffer errors'), + networkNetstatUDPSndBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP transmit buffer errors'), + networkNetstatUDP6SndBufErrsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Udp6_SndbufErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('UDP6 transmit buffer errors'), + + //ICMP + networkNetstatICMPInPerSec: + prometheusQuery.new( + prometheusDatasource, + 
'irate(node_netstat_Icmp_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP received'), + networkNetstatICMPOutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP transmitted'), + networkNetstatICMP6InPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_InMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 received'), + networkNetstatICMP6OutPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_OutMsgs{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 transmitted'), + + networkNetstatICMPInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP errors'), + networkNetstatICM6PInErrorsPerSec: + prometheusQuery.new( + prometheusDatasource, + 'irate(node_netstat_Icmp6_InErrors{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('ICMP6 errors'), + }, +} diff --git a/docs/node-observ-lib/linux/variables.libsonnet b/docs/node-observ-lib/linux/variables.libsonnet new file mode 100644 index 0000000000..e2f1ace8c4 --- /dev/null +++ b/docs/node-observ-lib/linux/variables.libsonnet @@ -0,0 +1,71 @@ +// variables.libsonnet +local g = import '../g.libsonnet'; +local var = g.dashboard.variable; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; + +{ + new( + this + ): { + local filteringSelector = this.config.filteringSelector, + local groupLabels = this.config.groupLabels, + local instanceLabels = this.config.instanceLabels, + local root = self, + local varMetric = 'node_uname_info', + local 
variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=true) = + local chainVarProto(index, chainVar) = + var.query.new(chainVar.label) + + var.query.withDatasourceFromVariable(root.datasources.prometheus) + + var.query.queryTypes.withLabelValues( + chainVar.label, + '%s{%s}' % [varMetric, chainVar.chainSelector], + ) + + var.query.generalOptions.withLabel(utils.toSentenceCase(chainVar.label)) + + var.query.selectionOptions.withIncludeAll( + value=if (!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true, + customAllValue='.+' + ) + + var.query.selectionOptions.withMulti( + if (!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true, + ) + + var.query.refresh.onTime() + + var.query.withSort( + i=1, + type='alphabetical', + asc=true, + caseInsensitive=false + ); + std.mapWithIndex(chainVarProto, utils.chainLabels(groupLabels + instanceLabels, [filteringSelector])), + datasources: { + prometheus: + var.datasource.new('datasource', 'prometheus') + + var.datasource.generalOptions.withLabel('Data source') + + var.datasource.withRegex(''), + loki: + var.datasource.new('loki_datasource', 'loki') + + var.datasource.generalOptions.withLabel('Loki data source') + + var.datasource.withRegex('') + + var.datasource.generalOptions.showOnDashboard.withNothing(), + }, + // Use on dashboards where multiple entities can be selected, like fleet dashboards + multiInstance: + [root.datasources.prometheus] + + variablesFromLabels(groupLabels, instanceLabels, filteringSelector), + // Use on dashboards where only single entity can be selected + singleInstance: + [root.datasources.prometheus] + + variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=false), + + queriesSelector: + '%s,%s' % [ + filteringSelector, + utils.labelsToPromQLSelector(groupLabels + instanceLabels), + ], + } + + if this.config.enableLokiLogs then self.withLokiLogs(this) else {}, + withLokiLogs(this): { + 
multiInstance+: [this.grafana.variables.datasources.loki], + singleInstance+: [this.grafana.variables.datasources.loki], + }, +} diff --git a/docs/node-observ-lib/macos/README.md b/docs/node-observ-lib/macos/README.md new file mode 100644 index 0000000000..815903ffc1 --- /dev/null +++ b/docs/node-observ-lib/macos/README.md @@ -0,0 +1,86 @@ +# MacOS exporter observability lib + +This jsonnet observability lib can be used to generate observability package for node exporter(MacOS). + +## Import + +```sh +jb init +jb install https://github.com/grafana/node_exporter/docs/node-observ-lib +``` + +## Examples + +### Example 1: Basic example + +You can use observ-lib to fill in monitoring-mixin structure: + +```jsonnet +// mixin.libsonnet file +local macoslib = import 'node-observ-lib/macos/main.libsonnet'; + +local mac = + macoslib.new() + + macoslib.withConfigMixin({ + filteringSelector: 'job=~".*mac.*"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'MacOS / ', + dashboardTags: ['macos-mixin'], + uid: 'darwin', + // enable loki logs + enableLokiLogs: true, + }); + +{ + grafanaDashboards+:: mac.grafana.dashboards, + prometheusAlerts+:: mac.prometheus.alerts, + prometheusRules+:: mac.prometheus.recordingRules, +} + +``` +For more examples see [node-observ-lib/linux](../linux). + +## Collectors used: + +Grafana Agent or combination of node_exporter/promtail can be used in order to collect data required. + +### Logs collection + +Loki logs are used to populate logs dashboard and also for annotations. + +To use logs, you need to opt-in, with setting `enableLokiLogs: true` in config. + +See example above. 
+ +The following scrape snippet can be used in grafana-agent/promtail: + +```yaml + - job_name: integrations/node_exporter_direct_scrape + static_configs: + - targets: + - localhost + labels: + __path__: /var/log/*.log + instance: '' + job: integrations/macos-node + pipeline_stages: + - multiline: + firstline: '^([\w]{3} )?[\w]{3} +[\d]+ [\d]+:[\d]+:[\d]+|[\w]{4}-[\w]{2}-[\w]{2} [\w]{2}:[\w]{2}:[\w]{2}(?:[+-][\w]{2})?' + - regex: + expression: '(?P([\w]{3} )?[\w]{3} +[\d]+ [\d]+:[\d]+:[\d]+|[\w]{4}-[\w]{2}-[\w]{2} [\w]{2}:[\w]{2}:[\w]{2}(?:[+-][\w]{2})?) (?P\S+) (?P.+?)\[(?P\d+)\]:? (?P(?s:.*))$' + - labels: + sender: + hostname: + pid: + - match: + selector: '{sender!="", pid!=""}' + stages: + - template: + source: message + template: '{{ .sender }}[{{ .pid }}]: {{ .message }}' + - labeldrop: + - pid + - output: + source: message +``` diff --git a/docs/node-observ-lib/macos/alerts.libsonnet b/docs/node-observ-lib/macos/alerts.libsonnet new file mode 100644 index 0000000000..88714f8d96 --- /dev/null +++ b/docs/node-observ-lib/macos/alerts.libsonnet @@ -0,0 +1,23 @@ +{ + new(this, parentPrometheus): + { + groups: + //keep only alerts listed in alertsMacKeep + std.filter( + function(group) std.length(group.rules) > 0, + [ + { + name: group.name, + rules: [ + rule + for rule in group.rules + if std.length(std.find(rule.alert, this.config.alertsMacKeep)) > 0 + ], + } + for group in parentPrometheus.alerts.groups + ], + + ), + + }, +} diff --git a/docs/node-observ-lib/macos/config.libsonnet b/docs/node-observ-lib/macos/config.libsonnet new file mode 100644 index 0000000000..49ea6ecc4a --- /dev/null +++ b/docs/node-observ-lib/macos/config.libsonnet @@ -0,0 +1,59 @@ +{ + + // any modular observability library should include as inputs: + // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups + // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules. 
+ // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. + // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. + // 'uid' - UID to prefix all dashboards original uids + + filteringSelector: 'job="integrations/macos-node"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'MacOS / ', + uid: 'darwin', + + dashboardTags: [self.uid], + + // Select the fstype for filesystem-related queries. If left + // empty, all filesystems are selected. If you have unusual + // filesystem you don't want to include in dashboards and + // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'. + fsSelector: 'fstype!=""', + + // Select the mountpoint for filesystem-related queries. If left + // empty, all mountpoints are selected. For example if you have a + // special purpose tmpfs instance that has a fixed size and will + // always be 100% full, but you still want alerts and dashboards for + // other tmpfs instances, you can exclude those by mountpoint prefix + // like so: 'mountpoint!~"/var/lib/foo.*"'. + fsMountpointSelector: 'mountpoint!=""', + + // Select the device for disk-related queries. If left empty, all + // devices are selected. If you have unusual devices you don't + // want to include in dashboards and alerting, you can exclude + // them here, e.g. 'device!="tmpfs"'. 
+ diskDeviceSelector: 'device!=""', + dashboardPeriod: 'now-1h', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + + // Alerts to keep from node-observ-lib: + alertsMacKeep: [ + 'NodeFilesystemAlmostOutOfSpace', + 'NodeNetworkReceiveErrs', + 'NodeNetworkTransmitErrs', + 'NodeTextFileCollectorScrapeError', + 'NodeFilesystemFilesFillingUp', + 'NodeFilesystemAlmostOutOfFiles', + ], + // logs lib related + enableLokiLogs: true, + extraLogLabels: ['filename', 'sender'], + logsVolumeGroupBy: 'sender', + showLogsVolume: true, + logsFilteringSelector: self.filteringSelector, + logsExtraFilters: '', + + +} diff --git a/docs/node-observ-lib/macos/main.libsonnet b/docs/node-observ-lib/macos/main.libsonnet new file mode 100644 index 0000000000..ca898f9b3e --- /dev/null +++ b/docs/node-observ-lib/macos/main.libsonnet @@ -0,0 +1,56 @@ +local g = import '../g.libsonnet'; +local nodelib = import '../linux/main.libsonnet'; +local alerts = import './alerts.libsonnet'; +local config = import './config.libsonnet'; +local panels = import './panels.libsonnet'; +local targets = import './targets.libsonnet'; + + +// inherit nodelib +nodelib +{ + + new(): + super.new() + + nodelib.withConfigMixin(config) + + + { + local this = self, + local parentGrafana = super.grafana, + local parentPrometheus = super.prometheus, + + grafana+: { + // drop backToFleet link + links+: { + backToFleet:: {}, + }, + annotations: { + // keep only reboot annotation + reboot: parentGrafana.annotations.reboot, + }, + // override targets (memory) + targets+: targets.new(this), + // override panels (update description and targets in panels) + panels+: panels.new(this), + + // keep only overview and logs(optionally) dashes + dashboards: + { + overview: parentGrafana.dashboards.overview, + } + + + ( + if this.config.enableLokiLogs + then + { + logs: parentGrafana.dashboards.logs, + } + ), + }, + prometheus+: { + recordingRules: {}, + alerts: alerts.new(this, parentPrometheus), + }, + }, + +} diff --git 
a/docs/node-observ-lib/macos/panels.libsonnet b/docs/node-observ-lib/macos/panels.libsonnet new file mode 100644 index 0000000000..e0cca3131d --- /dev/null +++ b/docs/node-observ-lib/macos/panels.libsonnet @@ -0,0 +1,38 @@ +local g = import '../g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + + // override description and targets + memoryUsageTsBytes+: + g.panel.timeSeries.panelOptions.withDescription( + ||| + - Physical memory: Total amount of memory installed in this computer; + - App memory: Physical memory allocated by apps and system processes; + - Wired memory: Physical memory, containing data that cannot be compressed or swapped to disk; + - Compressed memory: Physical memory used to store a compressed version of data that has not been used recently; + - Swap used: Amount of compressed data temporarily moved to disk to make room in memory for more recently used data. 
+ ||| + ) + + g.panel.timeSeries.queryOptions.withTargets([ + t.memoryUsedBytes, + t.memoryTotalBytes, + t.memoryAppBytes, + t.memoryWiredBytes, + t.memoryCompressedBytes, + t.memorySwapUsedBytes, + ]) + + commonlib.panels.generic.timeSeries.threshold.stylizeByRegexp('Physical memory'), + + //override reduceOption field to version + osInfo+: + g.panel.timeSeries.panelOptions.withTitle('OS version') + + { options+: { reduceOptions: { fields: '/^version$/' } } }, + }, +} diff --git a/docs/node-observ-lib/macos/targets.libsonnet b/docs/node-observ-lib/macos/targets.libsonnet new file mode 100644 index 0000000000..25efc16572 --- /dev/null +++ b/docs/node-observ-lib/macos/targets.libsonnet @@ -0,0 +1,87 @@ +local g = import '../g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables, + local config = this.config, + local prometheusDatasource = '${' + variables.datasources.prometheus.name + '}', + local lokiDatasource = '${' + variables.datasources.loki.name + '}', + + memoryTotalBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_total_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Physical memory'), + + memoryUsedBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_internal_bytes{%(queriesSelector)s} - + node_memory_purgeable_bytes{%(queriesSelector)s} + + node_memory_wired_bytes{%(queriesSelector)s} + + node_memory_compressed_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + memoryAppBytes: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + node_memory_internal_bytes{%(queriesSelector)s} - + node_memory_purgeable_bytes{%(queriesSelector)s} + ) + ||| % variables + ) + + prometheusQuery.withLegendFormat('App memory'), + memoryWiredBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_wired_bytes{%(queriesSelector)s}' 
% variables + ) + + prometheusQuery.withLegendFormat('Wired memory'), + memoryCompressedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_compressed_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Compressed memory'), + + memoryUsagePercent: + prometheusQuery.new( + prometheusDatasource, + ||| + ( + ( + avg(node_memory_internal_bytes{%(queriesSelector)s}) - + avg(node_memory_purgeable_bytes{%(queriesSelector)s}) + + avg(node_memory_wired_bytes{%(queriesSelector)s}) + + avg(node_memory_compressed_bytes{%(queriesSelector)s}) + ) / + avg(node_memory_total_bytes{%(queriesSelector)s}) + ) + * + 100 + ||| + % variables, + ), + memorySwapTotal: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_total_bytes{%(queriesSelector)s}' % variables + ), + + memorySwapUsedBytes: + prometheusQuery.new( + prometheusDatasource, + 'node_memory_swap_used_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Swap used'), + }, +} diff --git a/docs/node-observ-lib/mixin-mac.libsonnet b/docs/node-observ-lib/mixin-mac.libsonnet new file mode 100644 index 0000000000..d0b56adf12 --- /dev/null +++ b/docs/node-observ-lib/mixin-mac.libsonnet @@ -0,0 +1,8 @@ +local macoslib = import './macos/main.libsonnet'; +local macos = macoslib.new(); + +{ + grafanaDashboards+:: macos.grafana.dashboards, + prometheusAlerts+:: macos.prometheus.alerts, + prometheusRules+:: macos.prometheus.recordingRules, +} diff --git a/docs/node-observ-lib/mixin.libsonnet b/docs/node-observ-lib/mixin.libsonnet new file mode 100644 index 0000000000..284f307dd4 --- /dev/null +++ b/docs/node-observ-lib/mixin.libsonnet @@ -0,0 +1,16 @@ +local nodelib = import './linux/main.libsonnet'; +local linux = + nodelib.new() + + nodelib.withConfigMixin({ + filteringSelector: 'job=~".*node.*"', + groupLabels: ['job'], + instanceLabels: ['instance'], + dashboardNamePrefix: 'Node exporter / ', + dashboardTags: ['node-exporter-mixin'], + uid: 
'node', + }); +{ + grafanaDashboards+:: linux.grafana.dashboards, + prometheusAlerts+:: linux.prometheus.alerts, + prometheusRules+:: linux.prometheus.recordingRules, +}