diff --git a/windows-mixin/alerts/alerts.libsonnet b/windows-mixin/alerts/alerts.libsonnet deleted file mode 100644 index 99680e2f7..000000000 --- a/windows-mixin/alerts/alerts.libsonnet +++ /dev/null @@ -1,75 +0,0 @@ -{ - prometheusAlerts+:: { - groups+: [ - { - name: 'windows-alerts', - rules: [ - { - alert: 'WindowsCPUHigh', - expr: ||| - 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > %(alertsCPUThresholdWarning)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - summary: 'High CPU usage on Windows host.', - description: ||| - CPU usage on host {{ $labels.instance }} is above %(alertsCPUThresholdWarning)s%%. The currect value is {{ $value | printf "%%.2f" }}%%. - ||| % $._config, - }, - }, - { - alert: 'WindowsMemoryHigh', - expr: ||| - 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > %(alertMemoryUsageThresholdCritical)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'High memory usage on Windows host.', - description: ||| - Memory usage on host {{ $labels.instance }} is above %(alertMemoryUsageThresholdCritical)s%%. The currect value is {{ $value | printf "%%.2f" }}%%. - ||| % $._config, - }, - }, - { - alert: 'WindowsDiskUsageHigh', - expr: ||| - 100 - ((windows_logical_disk_free_bytes ) / (windows_logical_disk_size_bytes)) * 100 > %(alertDiskUsageThresholdCritical)s - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Disk is almost full on Windows host.', - description: ||| - Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than %(alertDiskUsageThresholdCritical)s%% of space is used. The currect volume utilization is {{ $value | printf "%%.2f" }}%%. - ||| % $._config, - }, - }, - { - alert: 'WindowsServiceNotHealthy', - expr: ||| - windows_service_status{status!~"starting|stopping|ok"} > 0 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - summary: 'Windows service is not healthy.', - description: ||| - Windows service {{ $labels.name }} is not in healthy state, currently in '{{ $labels.status }}'. 
- ||| % $._config, - }, - }, - ], - }, - ], - }, -} diff --git a/windows-mixin/config.libsonnet b/windows-mixin/config.libsonnet index 51a2bf5db..ac6560efc 100644 --- a/windows-mixin/config.libsonnet +++ b/windows-mixin/config.libsonnet @@ -1,5 +1,15 @@ { _config+:: { + // labels to group windows hosts: + groupLabels: ['job'], + // labels to identify single windows host: + instanceLabels: ['instance'], + // selector to include in all queries(including alerts) + filteringSelector: 'job=~".*windows.*"', + // prefix all dashboards uids and alert groups + uid: 'windows', + // prefix dashboards titles + dashboardNamePrefix: '', dashboardTags: ['windows'], dashboardPeriod: 'now-1h', dashboardTimezone: 'default', @@ -8,5 +18,7 @@ alertsCPUThresholdWarning: '90', alertMemoryUsageThresholdCritical: '90', alertDiskUsageThresholdCritical: '90', + // set to false to disable logs dashboard and logs annotations + enableLokiLogs: true, }, } diff --git a/windows-mixin/dashboards/dashboards.libsonnet b/windows-mixin/dashboards/dashboards.libsonnet deleted file mode 100644 index 5d95c5b39..000000000 --- a/windows-mixin/dashboards/dashboards.libsonnet +++ /dev/null @@ -1,2 +0,0 @@ -(import 'windows_exporter.libsonnet') + -(import 'windows_logs.libsonnet') diff --git a/windows-mixin/dashboards/windows_exporter.libsonnet b/windows-mixin/dashboards/windows_exporter.libsonnet deleted file mode 100644 index afd2ba6b9..000000000 --- a/windows-mixin/dashboards/windows_exporter.libsonnet +++ /dev/null @@ -1,448 +0,0 @@ -local win = import './wintable.libsonnet'; -local grafana = import 'grafonnet/grafana.libsonnet'; -local prometheus = grafana.prometheus; -local graphPanel = grafana.graphPanel; - -local host_matcher = 'job=~"$job", agent_hostname=~"$hostname"'; - -// Templates -local ds_template = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data source', - name: 'prometheus_datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', -}; - -local job_template = grafana.template.new( - 'job', - '$prometheus_datasource', - 'label_values(windows_cs_hostname, job)', - label='Job', - refresh='load', - multi=true, - includeAll=true, - allValues='.+', - sort=1, -); - -local hostname_template = grafana.template.new( - 'hostname', - '$prometheus_datasource', - 'label_values(windows_cs_hostname{job=~"$job"}, hostname)', - label='Hostname', - refresh='load', - multi=true, - allValues='.+', - sort=1, -); - -{ - grafanaDashboards+:: { - 'windows_exporter.json': - grafana.dashboard.new( - 'Windows overview', - time_from='%s' % $._config.dashboardPeriod, - editable=false, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - uid='windows-overview', - ) - - .addTemplates([ - ds_template, - job_template, - hostname_template, - ]) - - .addLink(grafana.link.dashboards( - asDropdown=false, - title='Windows dashboards', - includeVars=true, - keepTime=true, - tags=($._config.dashboardTags), - )) - - - // Status Row - .addPanel(grafana.row.new(title='Integration status'), gridPos={ x: 0, y: 0, w: 0, h: 0 }) - // Integration status - .addPanel(integration_status_panel, gridPos={ x: 0, y: 0, w: 8, h: 2 }) - // Latest metric received - .addPanel(latest_metric_panel, gridPos={ x: 8, y: 0, w: 8, h: 2 }) - - .addPanel(grafana.row.new(title='Overview'), gridPos={ x: 0, y: 2, w: 0, h: 0 }) - .addPanel(usageTable, gridPos={ x: 0, y: 2, w: 24, h: 8 }) - - .addPanel(grafana.row.new(title='Overview 
graphs'), gridPos={ x: 0, y: 10, w: 0, h: 0 }) - .addPanel(perCpu, gridPos={ x: 0, y: 10, w: 12, h: 6 }) - .addPanel(perMemory, gridPos={ x: 12, y: 10, w: 12, h: 6 }) - - .addPanel(grafana.row.new(title='Resource details'), gridPos={ x: 0, y: 16, w: 0, h: 0 }) - .addPanel(uptime, gridPos={ x: 0, y: 16, w: 8, h: 4 }) - .addPanel(errorService, gridPos={ x: 8, y: 16, w: 8, h: 4 }) - .addPanel(diskUsage, gridPos={ x: 16, y: 16, w: 8, h: 4 }) - .addPanel(diskIO, gridPos={ x: 0, y: 20, w: 8, h: 8 }) - .addPanel(networkUsage, gridPos={ x: 8, y: 20, w: 8, h: 8 }) - .addPanel(iisConnections, gridPos={ x: 16, y: 20, w: 8, h: 8 }), - - }, - - local integration_status_panel = - grafana.statPanel.new( - 'Integration status', - datasource='$prometheus_datasource', - colorMode='background', - graphMode='none', - noValue='No Data', - reducerFunction='lastNotNull', - description='Indicates if the agent is configured and sending metrics.', - ) - .addMappings( - [ - { - options: { - from: 1, - result: { - color: 'green', - index: 0, - text: 'Agent configured - sending metrics', - }, - to: 10000000000000, - }, - type: 'range', - }, - { - options: { - from: 0, - result: { - color: 'red', - index: 1, - text: 'No Data', - }, - to: 0, - }, - type: 'range', - }, - ] - ) - .addTarget( - grafana.prometheus.target('sum(windows_cpu_time_total{' + host_matcher + ',mode="idle"})') - ), - - local latest_metric_panel = - grafana.statPanel.new( - 'Latest metric received', - datasource='$prometheus_datasource', - colorMode='background', - fields='Time', - graphMode='none', - noValue='No Data', - reducerFunction='lastNotNull', - description='Date and time of the latest metric received from the Windows host', - ) - .addTarget( - grafana.prometheus.target('sum(windows_cpu_time_total{' + host_matcher + ',mode="idle"})') - ), - - local usageTable = - win.wintable('Usage', 'Shows overall CPU, memory and disk stats.', '$prometheus_datasource') - .addQuery('windows_os_info{' + host_matcher + '} * on(instance) group_right(product) windows_cs_hostname', 'group', 'group') - .addQuery('100 - (avg by (instance) (rate(windows_cpu_time_total{' + host_matcher + ',mode="idle"}[$__rate_interval])) * 100)', 'CPU Usage %', 'cpuusage') - .addQuery('time() - windows_system_system_up_time{' + host_matcher + '}', 'Uptime', 'uptime') - .addQuery('windows_cs_logical_processors{' + host_matcher + '} - 0', 'CPUs', 'cpus') - .addQuery('windows_cs_physical_memory_bytes{' + host_matcher + '} - 0', 'Memory', 'memory') - .addQuery('100 - 100 * windows_os_physical_memory_free_bytes{' + host_matcher + '} / windows_cs_physical_memory_bytes{' + host_matcher + '}', 'Memory Used', 'memoryused') - .addQuery('(windows_logical_disk_free_bytes{' + host_matcher + ',volume=~"C:"}/windows_logical_disk_size_bytes{' + host_matcher + ',volume=~"C:"}) * 100', 'C:\\ Free %', 'cfree') - .hideColumn('Time') - .hideColumn('domain') - .hideColumn('fqdn') - .hideColumn('job') - .hideColumn('agent_hostname') - .hideColumn('Value #group') - .hideColumn('instance') - .renameColumn('Value #cpuusage', 'CPU usage %') - .renameColumn('hostname', 'Hostname') - .renameColumn('product', 'OS version') - .addThreshold('CPU usage %', [ - { - color: 'dark-green', - value: 0, - }, - { - color: 'dark-yellow', - value: 40, - }, - { - color: 'dark-red', - value: 80, - }, - ], 'absolute') - .renameColumn('Value #uptime', 'Uptime') - .setColumnUnit('Uptime', 's') - .addThreshold('Uptime', [ - { - color: 'dark-red', - value: 0, - }, - { - color: 'dark-yellow', - value: 259200, - }, - { - color: 
'dark-green', - value: 432000, - }, - ], 'absolute') - .renameColumn('Value #cpus', 'CPUs') - .renameColumn('Value #memory', 'Total memory') - .setColumnUnit('Total memory', 'bytes') - .renameColumn('Value #memoryused', 'Memory used %') - .addThreshold('Memory used %', [ - { - color: 'dark-green', - value: 0, - }, - { - color: 'dark-yellow', - value: 60, - }, - { - color: 'dark-red', - value: 80, - }, - ], 'absolute') - .renameColumn('Value #cfree', 'C:\\ free %') - .hideColumn('volume') - .addThreshold('C:\\ free %', [ - { - color: 'dark-red', - value: null, - }, - { - color: 'dark-yellow', - value: 20, - }, - { - color: 'dark-green', - value: 80, - }, - ], 'absolute'), - - local perCpu = - graphPanel.new( - title='CPU usage % by host', - description='CPU usage % by host, measured by the percentage of time spent not idle over the last $__rate_interval.', - datasource='$prometheus_datasource', - span=6, - min=0, - max=1, - legend_show=false, - percentage=true, - format='percentunit' - ) - .addTarget(prometheus.target( - expr='1 - (avg by (agent_hostname) ( rate(windows_cpu_time_total{' + host_matcher + ',mode="idle"}[$__rate_interval])) )', - legendFormat='{{agent_hostname}}', - intervalFactor=2, - )), - - local perMemory = - graphPanel.new( - title='Memory usage % by host', - description='Physical memory usage % by host.', - datasource='$prometheus_datasource', - span=6, - min=0, - max=1, - legend_show=false, - percentage=true, - format='percentunit' - ) - .addTarget(prometheus.target( - expr='1 - windows_os_physical_memory_free_bytes{' + host_matcher + '} / windows_cs_physical_memory_bytes{' + host_matcher + '}', - legendFormat='{{agent_hostname}}', - )), - - - local iisConnections = - graphPanel.new( - title='IIS connections', - description='The number of active IIS connections by host.', - datasource='$prometheus_datasource', - span=3, - ) - .addTarget(prometheus.target( - expr='windows_iis_current_connections{' + host_matcher + '}', - legendFormat='{{agent_hostname}}', - )), - - local diskUsage = - win.winbargauge( - 'Usage of each partition', - 'Disk usage % per partition.', - [ - { - color: 'green', - value: null, - }, - { - color: '#EAB839', - value: 80, - }, - { - color: 'red', - value: 90, - }, - ], - '100 - (windows_logical_disk_free_bytes{' + host_matcher + '} / windows_logical_disk_size_bytes{' + host_matcher + '})*100', - '{{volume}}' - ), - - local diskIO = - graphPanel.new( - title='Disk read write', - description='Disk read and write rate by host.', - datasource='$prometheus_datasource', - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format='Bps', - span=2 - ) - .addTarget(prometheus.target( - expr='rate(windows_logical_disk_write_bytes_total{' + host_matcher + '}[$__rate_interval])', - legendFormat='write {{volume}}', - )) - .addTarget(prometheus.target( - expr='rate(windows_logical_disk_read_bytes_total{' + host_matcher + '}[$__rate_interval])', - legendFormat='read {{volume}}', - )), - - local errorService = - win.winstat( - span=1, - title='Services in error', - description='Services in error by host.', - datasource='$prometheus_datasource', - unit='short', - overrides=[ - { - matcher: { - id: 'byFrameRefID', - options: 'A', - }, - properties: [ - { - id: 'thresholds', - value: { - mode: 'absolute', - steps: [ - - { - color: 'dark-green', - value: 0, - }, - { - color: 'dark-red', - value: 1, - }, - ], - }, - }, - { - id: 'color', - }, - ], - }, - ], - ) - .addTarget(prometheus.target( 
- expr='sum(windows_service_status{status="error",' + host_matcher + '})', - instant=true, - - )), - - local networkUsage = - graphPanel.new( - title='Network usage', - description='Network usage by host, excluding isatap and VPN interfaces.', - datasource='$prometheus_datasource', - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format='percent', - bars=true, - lines=false, - min=0, - max=100, - span=3 - ) - .addTarget(prometheus.target( - expr='(rate(windows_net_bytes_total{' + host_matcher + ',nic!~"isatap.*|VPN.*"}[$__rate_interval]) * 8 / windows_net_current_bandwidth{' + host_matcher + ',nic!~"isatap.*|VPN.*"}) * 100', - legendFormat='{{nic}}', - )), - - - local uptime = - win.winstat( - span=1, - title='Uptime', - description='Uptime by host.', - datasource='$prometheus_datasource', - format='s', - overrides=[ - { - matcher: { - id: 'byFrameRefID', - options: 'A', - }, - properties: [ - { - id: 'thresholds', - value: { - mode: 'absolute', - steps: [ - { - color: 'dark-red', - value: null, - }, - { - color: 'dark-yellow', - value: 259200, - }, - { - color: 'dark-green', - value: 432000, - }, - ], - }, - }, - { - id: 'color', - }, - ], - }, - ], - ) - .addTarget(prometheus.target( - expr='time() - windows_system_system_up_time{' + host_matcher + '}', - instant=true, - - )), - -} diff --git a/windows-mixin/dashboards/windows_logs.libsonnet b/windows-mixin/dashboards/windows_logs.libsonnet deleted file mode 100644 index 449f66d4f..000000000 --- a/windows-mixin/dashboards/windows_logs.libsonnet +++ /dev/null @@ -1,362 +0,0 @@ -local g = (import 'grafana-builder/grafana.libsonnet'); -local grafana = (import 'grafonnet/grafana.libsonnet'); -local custom_barchart_grafonnet = import '../lib/custom-barchart-grafonnet/custom-barchart.libsonnet'; - -local host_matcher = 'job=~"$job", agent_hostname=~"$hostname"'; -local log_channel_matcher = host_matcher + ', channel=~"$channel", source=~"$source"'; -local windows_event_parser = '| json | line_format "ProcessID: {{.execution_processId}} Source: {{.source}} EventID: {{.event_id}} Level: {{.levelText}} Message: {{.message}}"'; - -local queries = { - total_log_lines: 'sum(count_over_time({' + log_channel_matcher + '}[$__interval]))', - total_log_warnings: 'sum(count_over_time({' + log_channel_matcher + '} |= "Warning" [$__interval]))', - total_log_errors: 'sum(count_over_time({' + log_channel_matcher + '} |= "Error" [$__interval]))', - error_percentage: 'sum(count_over_time({' + log_channel_matcher + '} |= "Error" [$__interval])) / sum(count_over_time({' + log_channel_matcher + '} [$__interval]))', - total_bytes: 'sum(bytes_over_time({' + log_channel_matcher + '} [$__interval]))', - error_log_lines: '{' + log_channel_matcher + '} |= "Error" ' + windows_event_parser, - warning_log_lines: '{' + log_channel_matcher + '} |= "Warning" ' + windows_event_parser, - log_full_lines: '{' + log_channel_matcher + '} ' + windows_event_parser, -}; - -local stackstyle = { - line: 1, - fill: 5, - fillGradient: 10, -}; - -// Templates -local prometheus_template = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Prometheus data source', - name: 'prometheus_datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', -}; - -local loki_template = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Loki Data Source', - name: 'loki_datasource', - options: [], - query: 'loki', - refresh: 1, - 
regex: '', - type: 'datasource', -}; - -local job_template = grafana.template.new( - 'job', - '$prometheus_datasource', - 'label_values(windows_cpu_time_total, job)', - label='Job', - refresh='load', - multi=true, - includeAll=true, - allValues='.+', - sort=1, - regex='' -); - -local host_template = grafana.template.new( - 'hostname', - '$prometheus_datasource', - 'label_values(windows_cpu_time_total{job=~"$job"}, agent_hostname)', - label='Hostname', - refresh='load', - multi=true, - includeAll=true, - allValues='.+', - sort=1, - regex='' -); - -local channel_template = grafana.template.new( - 'channel', - '$loki_datasource', - 'label_values({job=~"$job", agent_hostname=~"$hostname"}, channel)', - label='Channel', - refresh='load', - multi=true, - includeAll=true, - allValues='.+', - sort=1, -); - -local source_template = grafana.template.new( - 'source', - '$loki_datasource', - 'label_values({job=~"$job", agent_hostname=~"$hostname", channel=~"$channel"}, source)', - label='Source', - refresh='load', - multi=true, - includeAll=true, - allValues='.+', - sort=1, -); - -// Panels -local integration_status_panel = - grafana.statPanel.new( - 'Integration status', - datasource='$loki_datasource', - colorMode='background', - graphMode='none', - noValue='No Data', - reducerFunction='lastNotNull', - description='Indicates if the agent is configured and sending metrics.', - ) - .addMappings( - [ - { - options: { - from: 1, - result: { - color: 'green', - index: 0, - text: 'Agent configured - sending logs', - }, - to: 10000000000000, - }, - type: 'range', - }, - { - options: { - from: 0, - result: { - color: 'red', - index: 1, - text: 'No Data', - }, - to: 0, - }, - type: 'range', - }, - ] - ) - .addTarget( - grafana.loki.target(queries.total_log_lines) - ); - -local latest_metric_panel = - grafana.statPanel.new( - 'Latest metric received', - datasource='$loki_datasource', - colorMode='background', - fields='Time', - graphMode='none', - noValue='No Data', - reducerFunction='lastNotNull', - description='Date and time of the latest metric received from the Windows host', - ) - .addTarget( - grafana.loki.target(queries.total_log_lines) - ); - -local total_log_lines_panel = - grafana.statPanel.new( - 'Total log lines', - datasource='$loki_datasource', - graphMode='none', - reducerFunction='sum', - unit='short', - description='Total number of log lines received from the Windows host.', - ) - .addThreshold( - { color: 'rgb(192, 216, 255)', value: 0 } - ) - .addTarget( - grafana.loki.target(queries.total_log_lines) - ); - -local total_log_warnings_panel = - grafana.statPanel.new( - 'Warnings', - datasource='$loki_datasource', - graphMode='none', - reducerFunction='sum', - unit='short', - description='Total number of log lines with a severity of Warning.', - ).addThreshold( - { color: 'rgb(255, 152, 48)', value: 0 } - ) - .addTarget( - grafana.loki.target(queries.total_log_warnings) - ); - -local total_log_errors_panel = - grafana.statPanel.new( - 'Errors', - datasource='$loki_datasource', - graphMode='none', - reducerFunction='sum', - unit='short', - description='Total number of log lines with a severity of Error.', - ).addThreshold( - { color: 'rgb(242, 73, 92)', value: 0 } - ) - .addTarget( - grafana.loki.target(queries.total_log_errors) - ); - -local error_percentage_panel = - grafana.statPanel.new( - 'Error percentage', - datasource='$loki_datasource', - graphMode='none', - reducerFunction='lastNotNull', - unit='percentunit', - description='Percentage of log lines with a severity of Error against the 
total number of log lines.', - ).addThresholds([ - { color: 'rgb(255, 166, 176)', value: 0 }, - { color: 'rgb(255, 115, 131)', value: 25 }, - { color: 'rgb(196, 22, 42)', value: 50 }, - ]) - .addTarget( - grafana.loki.target(queries.error_percentage) - ); - -local total_bytes_panel = - grafana.statPanel.new( - 'Bytes used', - datasource='$loki_datasource', - graphMode='none', - reducerFunction='sum', - unit='bytes', - description='Total bytes of logs captured.', - ) - .addThreshold( - { color: 'rgb(184, 119, 217)', value: 0 } - ) - .addTarget( - grafana.loki.target(queries.total_bytes) - ); - -local historical_logs_errors_warnings_panel = - custom_barchart_grafonnet.new( - q1=queries.total_log_lines, - q2=queries.total_log_warnings, - q3=queries.total_log_errors, - ); - -local log_errors_panel = - grafana.logPanel.new( - 'Errors', - datasource='$loki_datasource', - ) - .addTarget( - grafana.loki.target(queries.error_log_lines) - ) + { - description: 'Log lines with a severity of Error', - }; - -local log_warnings_panel = - grafana.logPanel.new( - 'Warnings', - datasource='$loki_datasource', - ) - .addTarget( - grafana.loki.target(queries.warning_log_lines) - ) + { - description: 'Log lines with a severity of Warning', - }; - -local log_full_panel = - grafana.logPanel.new( - 'Full log file', - datasource='$loki_datasource', - ) - .addTarget( - grafana.loki.target(queries.log_full_lines) - ) + { - description: 'All log lines', - }; - -// Manifested stuff starts here -{ - grafanaDashboards+:: { - 'windows_logs.json': - grafana.dashboard.new( - 'Windows logs', - time_from='%s' % $._config.dashboardPeriod, - editable=false, - tags=($._config.dashboardTags), - timezone='%s' % $._config.dashboardTimezone, - refresh='%s' % $._config.dashboardRefresh, - uid='windows-logs' - ) - - .addTemplates([ - prometheus_template, - loki_template, - job_template, - host_template, - channel_template, - source_template, - ]) - - .addLink(grafana.link.dashboards( - asDropdown=false, - title='Windows dashboards', - includeVars=true, - keepTime=true, - tags=($._config.dashboardTags), - )) - - // Status Row - .addPanel(grafana.row.new(title='Integration status'), gridPos={ x: 0, y: 0, w: 0, h: 0 }) - // Integration status - .addPanel(integration_status_panel, gridPos={ x: 0, y: 0, w: 8, h: 2 }) - // Latest metric received - .addPanel(latest_metric_panel, gridPos={ x: 8, y: 0, w: 8, h: 2 }) - - // Overview Row - .addPanel(grafana.row.new(title='Overview'), gridPos={ x: 0, y: 2, w: 0, h: 0 }) - // Total Log Lines - .addPanel(total_log_lines_panel, gridPos={ x: 0, y: 2, w: 4, h: 4 }) - // Warnings - .addPanel(total_log_warnings_panel, gridPos={ x: 4, y: 2, w: 4, h: 4 }) - // Errors - .addPanel(total_log_errors_panel, gridPos={ x: 8, y: 2, w: 4, h: 4 }) - // Error Percentage - .addPanel(error_percentage_panel, gridPos={ x: 12, y: 2, w: 4, h: 4 }) - // Bytes Used - .addPanel(total_bytes_panel, gridPos={ x: 16, y: 2, w: 4, h: 4 }) - // Historical Logs / Warnings / Errors - .addPanel(historical_logs_errors_warnings_panel, gridPos={ x: 0, y: 6, w: 24, h: 6 }) - - // Errors Row - .addPanel( - grafana.row.new(title='Errors', collapse=true) - // Errors - .addPanel(log_errors_panel, gridPos={ x: 0, y: 12, w: 24, h: 8 }), - gridPos={ x: 0, y: 12, w: 0, h: 0 } - ) - - - // Warnings Row - .addPanel( - grafana.row.new(title='Warnings', collapse=true) - // Warnings - .addPanel(log_warnings_panel, gridPos={ x: 0, y: 20, w: 24, h: 8 }), - gridPos={ x: 0, y: 20, w: 0, h: 0 } - ) - - // Complete Log File - .addPanel( - 
grafana.row.new(title='Complete log file', collapse=true) - // Full Log File - .addPanel(log_full_panel, gridPos={ x: 0, y: 28, w: 24, h: 8 }), - gridPos={ x: 0, y: 28, w: 0, h: 0 } - ), - }, -} diff --git a/windows-mixin/dashboards/wintable.libsonnet b/windows-mixin/dashboards/wintable.libsonnet deleted file mode 100644 index 1a99c2a54..000000000 --- a/windows-mixin/dashboards/wintable.libsonnet +++ /dev/null @@ -1,308 +0,0 @@ -{ - wintable(title, description=null, datasource):: { - local s = self, - type: 'table', - title: title, - datasource: datasource, - description: description, - _overrides:: [], - _hiddenColumns:: [], - _originalNames:: [], - _newNames:: [], - _targets:: [], - _columns:: [], - addQuery(expression, name, id):: self { - _targets+: [{ - expr: expression, - format: 'table', - legendFormat: name, - instant: true, - refId: id, - }], - }, - hideColumn(columnName):: self { - _hiddenColumns+: [columnName], - }, - renameColumn(originalName, newName):: self { - _originalNames+: [originalName], - _newNames+: [newName], - }, - setColumnUnit(displayName, unit):: self { - _overrides+: [{ - matcher: { - id: 'byName', - options: displayName, - }, - properties: [ - { - id: 'unit', - value: unit, - }, - ], - }], - }, - addColumn(column):: self { - _columns+: [column], - _targets+: [{ - expr: [column.expression], - format: 'table', - legendFormat: [column.name], - instant: true, - refId: [column.id], - }], - }, - addThreshold(displayName, steps, mode):: self { - _overrides+: [{ - matcher: { - id: 'byName', - options: std.format('%s', displayName), - }, - properties: [ - { - id: 'thresholds', - value: { - mode: std.format('%s', mode), - steps: steps, - }, - }, - { - id: 'color', - value: { - mode: 'thresholds', - }, - }, - { - id: 'custom.displayMode', - value: 'color-background', - }, - - ], - }], - }, - targets: s._targets, - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - }, - overrides: s._overrides, - }, - transformations: [ - { - id: 'merge', - options: {}, - }, - { - id: 'organize', - options: { - excludeByName: { - [hiddenColumn.name]: true - for hiddenColumn in std.makeArray(std.length(s._hiddenColumns), function(x) { name: s._hiddenColumns[x] }) - }, - renameByName: { - [nameChange.old]: nameChange.new - for nameChange in std.makeArray(std.length(s._originalNames), function(x) { old: s._originalNames[x], new: s._newNames[x] }) - }, - }, - }, - ], - }, - - winrow(title, showLegend=false, repeat=null):: { - _panels:: [], - addWinPanel(panel):: self { - _panels+: [panel], - }, - - panels: self._panels, - collapse: false, - height: '250px', - repeatIteration: null, - repeatRowId: null, - showTitle: true, - title: title, - titleSize: 'h6', - [if repeat != null then 'repeat']: repeat, - }, - - winstat( - title, - format='none', - description='', - interval=null, - height=null, - datasource=null, - span=null, - min_span=null, - decimals=null, - valueName='avg', - valueFontSize='80%', - prefixFontSize='50%', - postfixFontSize='50%', - mappingType=1, - repeat=null, - repeatDirection=null, - prefix='', - postfix='', - colors=[ - '#299c46', - 'rgba(237, 129, 40, 0.89)', - '#d44a3a', - ], - colorBackground=false, - colorValue=false, - thresholds='', - valueMaps=[ - { - value: 'null', - op: '=', - text: 'N/A', - }, - ], - rangeMaps=[ - { - from: 'null', - to: 'null', - text: 'N/A', - }, - ], - transparent=null, - sparklineFillColor='rgba(31, 118, 189, 0.18)', - sparklineFull=false, - sparklineLineColor='rgb(31, 120, 193)', - sparklineShow=false, - gaugeShow=false, - 
gaugeMinValue=0, - gaugeMaxValue=100, - gaugeThresholdMarkers=true, - gaugeThresholdLabels=false, - timeFrom=null, - links=[], - tableColumn='', - maxPerRow=null, - overrides=null, - unit='s', - ):: - { - [if height != null then 'height']: height, - [if description != '' then 'description']: description, - [if repeat != null then 'repeat']: repeat, - [if repeatDirection != null then 'repeatDirection']: repeatDirection, - [if transparent != null then 'transparent']: transparent, - [if min_span != null then 'minSpan']: min_span, - title: title, - [if span != null then 'span']: span, - type: 'stat', - datasource: datasource, - targets: [ - ], - links: links, - [if decimals != null then 'decimals']: decimals, - maxDataPoints: 100, - interval: interval, - cacheTimeout: null, - format: format, - prefix: prefix, - postfix: postfix, - nullText: null, - valueMaps: valueMaps, - [if maxPerRow != null then 'maxPerRow']: maxPerRow, - nullPointMode: 'connected', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - mappings: [ - { - id: 0, - op: '=', - text: 'N/A', - type: 1, - value: 'null', - }, - ], - unit: unit, - }, - overrides: overrides, - }, - valueName: valueName, - prefixFontSize: prefixFontSize, - valueFontSize: valueFontSize, - postfixFontSize: postfixFontSize, - thresholds: thresholds, - [if timeFrom != null then 'timeFrom']: timeFrom, - colorBackground: colorBackground, - colorValue: colorValue, - colors: colors, - gauge: { - show: gaugeShow, - minValue: gaugeMinValue, - maxValue: gaugeMaxValue, - thresholdMarkers: gaugeThresholdMarkers, - thresholdLabels: gaugeThresholdLabels, - }, - sparkline: { - fillColor: sparklineFillColor, - full: sparklineFull, - lineColor: sparklineLineColor, - show: sparklineShow, - }, - tableColumn: tableColumn, - _nextTarget:: 0, - addTarget(target):: self { - local nextTarget = super._nextTarget, - _nextTarget: nextTarget + 1, - targets+: [target { refId: std.char(std.codepoint('A') + nextTarget) }], - }, - }, - winbargauge(title, description=null, thresholdSteps, expr, exprLegend, span=4):: - { - [if span != null then 'span']: span, - - datasource: '${prometheus_datasource}', - fieldConfig: { - defaults: { - color: { - mode: 'thresholds', - }, - custom: {}, - mappings: [], - max: 100, - min: 0, - thresholds: { - mode: 'absolute', - steps: thresholdSteps, - }, - unit: 'percent', - }, - overrides: [], - }, - links: [], - options: { - displayMode: 'lcd', - orientation: 'horizontal', - reduceOptions: { - calcs: [ - 'lastNotNull', - ], - fields: '', - values: false, - }, - showUnfilled: true, - }, - targets: [ - { - expr: expr, - instant: false, - interval: '', - legendFormat: '{{volume}}', - refId: 'A', - }, - ], - title: title, - description: description, - type: 'bargauge', - }, -} diff --git a/windows-mixin/g.libsonnet b/windows-mixin/g.libsonnet new file mode 100644 index 000000000..6da9f4eef --- /dev/null +++ b/windows-mixin/g.libsonnet @@ -0,0 +1 @@ +import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet' diff --git a/windows-mixin/jsonnetfile.json b/windows-mixin/jsonnetfile.json index 93f3316ec..0fa15c4ab 100644 --- a/windows-mixin/jsonnetfile.json +++ b/windows-mixin/jsonnetfile.json @@ -1,15 +1,23 @@ { "version": 1, "dependencies": [ + { + "source": { + "local": { + "directory": "../windows-observ-lib" + } + }, + "version": "" + }, { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": 
"https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" } }, - "version": "master" + "version": "main" } ], "legacyImports": true -} +} \ No newline at end of file diff --git a/windows-mixin/lib/custom-barchart-grafonnet/custom-barchart.libsonnet b/windows-mixin/lib/custom-barchart-grafonnet/custom-barchart.libsonnet deleted file mode 100644 index a4d0b4a6a..000000000 --- a/windows-mixin/lib/custom-barchart-grafonnet/custom-barchart.libsonnet +++ /dev/null @@ -1,156 +0,0 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; - -{ - new(q1, q2, q3):: - { - datasource: { - type: 'loki', - uid: '${loki_datasource}', - }, - fieldConfig: { - defaults: { - color: { - mode: 'fixed', - }, - custom: { - axisLabel: '', - axisPlacement: 'auto', - axisSoftMin: 0, - fillOpacity: 50, - gradientMode: 'none', - hideFrom: { - legend: false, - tooltip: false, - viz: false, - }, - lineWidth: 1, - scaleDistribution: { - type: 'linear', - }, - }, - mappings: [], - thresholds: { - mode: 'absolute', - steps: [ - { - color: 'green', - value: null, - }, - ], - }, - unit: 'short', - }, - overrides: [ - { - matcher: { - id: 'byFrameRefID', - options: 'A', - }, - properties: [ - { - id: 'displayName', - value: 'Lines', - }, - { - id: 'color', - value: { - fixedColor: 'super-light-blue', - mode: 'fixed', - }, - }, - ], - }, - { - matcher: { - id: 'byFrameRefID', - options: 'B', - }, - properties: [ - { - id: 'displayName', - value: 'Warnings', - }, - { - id: 'color', - value: { - fixedColor: 'orange', - mode: 'fixed', - }, - }, - ], - }, - { - matcher: { - id: 'byFrameRefID', - options: 'C', - }, - properties: [ - { - id: 'displayName', - value: 'Errors', - }, - { - id: 'color', - value: { - fixedColor: 'red', - mode: 'fixed', - }, - }, - ], - }, - ], - }, - maxDataPoints: 25, - interval: '10s', - options: { - barRadius: 0.25, - barWidth: 0.7, - groupWidth: 0.5, - legend: { - calcs: [], - displayMode: 'list', - placement: 'bottom', - }, - orientation: 'auto', - showValue: 'never', - stacking: 'none', - tooltip: { - mode: 'multi', - sort: 'none', - }, - xTickLabelRotation: 0, - xTickLabelSpacing: 100, - }, - targets: [ - { - datasource: { - type: 'loki', - uid: '${loki_datasource}', - }, - expr: q1, - refId: 'A', - }, - { - datasource: { - type: 'loki', - uid: '${loki_datasource}', - }, - expr: q2, - hide: false, - refId: 'B', - }, - { - datasource: { - type: 'loki', - uid: '${loki_datasource}', - }, - expr: q3, - hide: false, - refId: 'C', - }, - ], - title: 'Historical Logs / Warnings / Errors', - description: 'Historical Logs / Warnings / Errors', - type: 'barchart', - }, -} diff --git a/windows-mixin/mixin.libsonnet b/windows-mixin/mixin.libsonnet index 4d987cf31..2bae55c62 100644 --- a/windows-mixin/mixin.libsonnet +++ b/windows-mixin/mixin.libsonnet @@ -1,3 +1,29 @@ -(import 'dashboards/dashboards.libsonnet') + -(import 'alerts/alerts.libsonnet') + -(import 'config.libsonnet') +local g = import './g.libsonnet'; +local var = g.dashboard.variable; +local winlib = import 'windows-observ-lib/main.libsonnet'; +local config = (import 'config.libsonnet')._config; +{ + local windows = + winlib.new( + dashboardNamePrefix=config.dashboardNamePrefix, + uid=config.uid, + filteringSelector=config.filteringSelector, + ) + + + { + config+: config, + }, + prometheusAlerts+:: windows.prometheus.alerts, + grafanaDashboards+:: + (windows { + grafana+: { + variables+: { + datasources+: { + loki+: var.datasource.withRegex('Loki|.+logs'), + 
prometheus+: var.datasource.withRegex('Prometheus|Cortex|Mimir|grafanacloud-.+-prom'),
+          },
+        },
+      },
+    })
+    .grafana.dashboards,
+}
diff --git a/windows-observ-lib/README.md b/windows-observ-lib/README.md
new file mode 100644
index 000000000..c99ca7596
--- /dev/null
+++ b/windows-observ-lib/README.md
@@ -0,0 +1,178 @@
+# Windows observ lib
+
+This lib can be used to generate an observability package (dashboards and alerts) for Windows.
+
+## Import
+
+```sh
+jb init
+jb install https://github.com/grafana/jsonnet-libs/windows-observ-lib
+```
+
+## Examples
+
+### Basic example
+
+You can use this lib to fill in the monitoring-mixin structure:
+
+```jsonnet
+// mixin.libsonnet file
+local g = import './g.libsonnet';
+local var = g.dashboard.variable;
+local winlib = import 'github.com/grafana/jsonnet-libs/windows-observ-lib/main.libsonnet';
+local config = (import 'config.libsonnet')._config;
+{
+  local windows =
+    winlib.new(
+      dashboardNamePrefix=config.dashboardNamePrefix,
+      uid=config.uid,
+      filteringSelector=config.filteringSelector,
+    )
+    +
+    {
+      config+: config,
+    },
+
+  // get alerts from package:
+  prometheusAlerts+:: windows.alerts,
+
+  // get dashboards from package, but modify datasource regex filters first using grafonnet:
+  grafanaDashboards+::
+    (windows {
+      variables+: {
+        datasources+: {
+          loki+: var.datasource.withRegex('Loki|.+logs'),
+          prometheus+: var.datasource.withRegex('Prometheus|Cortex|Mimir|grafanacloud-.+-prom'),
+        },
+      },
+    })
+    .dashboards,
+}
+
+// config.libsonnet file
+
+{
+  _config+:: {
+    // labels to group windows hosts:
+    groupLabels: ['job'],
+    // labels to identify single windows host:
+    instanceLabels: ['instance'],
+    // selector to include in all queries(including alerts)
+    filteringSelector: 'job=~".*windows.*"',
+    // prefix all dashboards uids and alert groups
+    uid: 'windows',
+    // prefix dashboards titles
+    dashboardNamePrefix: '',
+    dashboardTags: ['windows'],
+    dashboardPeriod: 'now-1h',
+    dashboardTimezone: 'default',
+    dashboardRefresh: '1m',
+
+    alertsCPUThresholdWarning: '90',
+    alertMemoryUsageThresholdCritical: '90',
+    alertDiskUsageThresholdCritical: '90',
+    // set to false to disable logs dashboard and logs annotations
+    enableLokiLogs: true,
+    extraLogLabels: ['channel', 'source', 'keywords', 'level'],
+    logsVolumeGroupBy: 'level',
+    showLogsVolume: true,
+    logsExtraFilters:
+      |||
+        | label_format timestamp="{{__timestamp__}}"
+        | drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted
+        | line_format `{{ if eq "[[instance]]" ".*" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}`
+      |||,
+  },
+}
+
+
+```
+Examples:
+Fleet dashboard:
+![image](https://github.com/grafana/jsonnet-libs/assets/14870891/b36b6245-643a-426f-9745-5437d93815ad)
+Overview dashboard:
+![image](https://github.com/grafana/jsonnet-libs/assets/14870891/723df88c-a789-4e73-a85e-724d9ea06cd2)
+Logs dashboard:
+![image](https://github.com/grafana/jsonnet-libs/assets/14870891/ec136706-96c1-4bc4-b608-f7184327d845)
+Drill down disks dashboard:
+![image](https://github.com/grafana/jsonnet-libs/assets/14870891/dfcda70d-4c2e-494f-b092-7d37a13d65d1)
+
+
+## Collectors used:
+
+Grafana Agent, or a combination of windows_exporter and promtail, can be used to collect the required data.
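On the metrics side, this typically means enabling the bundled windows_exporter integration in Grafana Agent. The snippet below is only a rough sketch, assuming Grafana Agent static mode; the WAL path, remote_write URL and instance value are placeholders to adapt, and the collector set is the one listed just below:

```yaml
# Sketch: Grafana Agent (static mode) shipping Windows metrics via the
# windows_exporter integration. Paths and URLs are placeholders.
metrics:
  wal_directory: "C:\\ProgramData\\grafana-agent-wal"
  global:
    scrape_interval: 60s
    remote_write:
      - url: https://prometheus.example.com/api/v1/write  # hypothetical endpoint
integrations:
  windows_exporter:
    enabled: true
    instance: win-test  # must match the instance label used in the log jobs below
    # collector set this lib expects (see the list below)
    enabled_collectors: "cpu,cs,logical_disk,net,os,service,system,textfile,time,diskdrive"
```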
+
+The following collectors should be enabled in windows_exporter/windows integration:
+
+`enabled_collectors: cpu,cs,logical_disk,net,os,service,system,textfile,time,diskdrive`
+
+### Logs collection
+
+Loki logs are used to populate the logs dashboard and also for querying annotations.
+
+To opt-out, you can set `enableLokiLogs: false` in config. See example above.
+
+The following scrape snippet can be used:
+
+```yaml
+  - job_name: integrations/windows-exporter-application
+    windows_events:
+      use_incoming_timestamp: true
+      bookmark_path: "C:\\Program Files\\Grafana Agent\\bookmarks-app.xml"
+      eventlog_name: "Application"
+      labels:
+        job: integrations/windows_exporter
+        instance: 'win-test' # must match instance used in windows_exporter
+    relabel_configs:
+      - source_labels: ['computer']
+        target_label: 'agent_hostname'
+    pipeline_stages:
+      - json:
+          expressions:
+            source: source
+            level: levelText
+      - labels:
+          source:
+          level:
+  # disable or enable depending on your requirements
+  - job_name: integrations/windows-exporter-security
+    windows_events:
+      use_incoming_timestamp: true
+      bookmark_path: "C:\\Program Files\\Grafana Agent\\secsys.xml"
+      eventlog_name: Security
+      labels:
+        job: integrations/windows_exporter
+        instance: 'win-test' # must match instance used in windows_exporter
+    relabel_configs:
+      - source_labels: ['computer']
+        target_label: 'agent_hostname'
+    pipeline_stages:
+      - json:
+          expressions:
+            source: source
+            level: levelText
+      - labels:
+          source:
+          level:
+  - job_name: integrations/windows-exporter-system
+    windows_events:
+      use_incoming_timestamp: true
+      bookmark_path: "C:\\Program Files\\Grafana Agent\\bookmarks-sys.xml"
+      eventlog_name: "System"
+      labels:
+        job: integrations/windows_exporter
+        instance: 'win-test' # must match instance used in windows_exporter
+    relabel_configs:
+      - source_labels: ['computer']
+        target_label: 'agent_hostname'
+    pipeline_stages:
+      - json:
+          expressions:
+            source: source
+            level: levelText
+            keywords:
+      - labels:
+          source:
+          level:
+          keywords:
+```
diff --git a/windows-observ-lib/alerts.libsonnet b/windows-observ-lib/alerts.libsonnet
new file mode 100644
index 000000000..f649f8099
--- /dev/null
+++ b/windows-observ-lib/alerts.libsonnet
@@ -0,0 +1,127 @@
+{
+  new(this): {
+
+    groups: [
+      {
+        name: 'windows-alerts-' + this.config.uid,
+        rules: [
+          {
+            alert: 'WindowsCPUHighUsage',
+            expr: |||
+              100 - (avg without (mode, core) (rate(windows_cpu_time_total{%(filteringSelector)s, mode="idle"}[2m])) * 100) > %(alertsCPUThresholdWarning)s
+            ||| % this.config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'High CPU usage on Windows host.',
+              description: |||
+                CPU usage on host {{ $labels.instance }} is above %(alertsCPUThresholdWarning)s%%. The current value is {{ $value | printf "%%.2f" }}%%.
+              ||| % this.config,
+            },
+          },
+          {
+            alert: 'WindowsMemoryHighUtilization',
+            expr: |||
+              100 - ((windows_os_physical_memory_free_bytes{%(filteringSelector)s}
+              /
+              windows_cs_physical_memory_bytes{%(filteringSelector)s}) * 100) > %(alertMemoryUsageThresholdCritical)s
+            ||| % this.config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'High memory usage on Windows host.',
+              description: |||
+                Memory usage on host {{ $labels.instance }} is above %(alertMemoryUsageThresholdCritical)s%%. The current value is {{ $value | printf "%%.2f" }}%%.
+              ||| % this.config,
+            },
+          },
+          {
+            alert: 'WindowsDiskAlmostOutOfSpace',
+            expr: |||
+              100 - ((windows_logical_disk_free_bytes{%(filteringSelector)s} ) / (windows_logical_disk_size_bytes{%(filteringSelector)s})) * 100 > %(alertDiskUsageThresholdCritical)s
+            ||| % this.config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Disk is almost full on Windows host.',
+              description: |||
+                Volume {{ $labels.volume }} is almost full on host {{ $labels.instance }}, more than %(alertDiskUsageThresholdCritical)s%% of space is used. The current volume utilization is {{ $value | printf "%%.2f" }}%%.
+              ||| % this.config,
+            },
+          },
+          {
+            alert: 'WindowsServiceNotHealthy',
+            expr: |||
+              windows_service_status{%(filteringSelector)s, status!~"starting|stopping|ok"} > 0
+            ||| % this.config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Windows service is not healthy.',
+              description: |||
+                Windows service {{ $labels.name }} is not in a healthy state, currently in '{{ $labels.status }}'.
+              ||| % this.config,
+            },
+          },
+          // enable diskdrive collector for this alert
+          {
+            alert: 'WindowsDiskDriveNotHealthy',
+            expr: |||
+              windows_disk_drive_status{%(filteringSelector)s, status="OK"} != 1
+            ||| % this.config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Windows physical disk is not healthy.',
+              description: |||
+                Windows disk {{ $labels.name }} is not in a healthy state, currently in '{{ $labels.status }}' status.
+              ||| % this.config,
+            },
+          },
+          {
+            alert: 'WindowsNTPClientDelay',
+            expr: |||
+              windows_time_ntp_round_trip_delay_seconds{%(filteringSelector)s} > 1
+            ||| % this.config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'NTP client delay.',
+              description: |||
+                'Round-trip time of NTP client on instance {{ $labels.instance }} is greater than 1 second. Delay is {{ $value }} sec.'
+              ||| % this.config,
+            },
+          },
+          {
+            alert: 'WindowsNTPTimeOffset',
+            expr: |||
+              windows_time_computed_time_offset_seconds{%(filteringSelector)s} > 1
+            ||| % this.config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'NTP time offset is too large.',
+              description: |||
+                'NTP time offset for instance {{ $labels.instance }} is greater than 1 second. Offset is {{ $value }} sec.'
+ ||| % this.config, + }, + }, + ], + }, + ], + }, +} diff --git a/windows-observ-lib/dashboards.libsonnet b/windows-observ-lib/dashboards.libsonnet new file mode 100644 index 000000000..99300ff36 --- /dev/null +++ b/windows-observ-lib/dashboards.libsonnet @@ -0,0 +1,155 @@ +local g = import './g.libsonnet'; +local logslib = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; +{ + local root = self, + new(this): + local prefix = this.config.dashboardNamePrefix; + local links = this.grafana.links; + local tags = this.config.dashboardTags; + local uid = g.util.string.slugify(this.config.uid); + local vars = this.grafana.variables; + local annotations = this.grafana.annotations; + local refresh = this.config.dashboardRefresh; + local period = this.config.dashboardPeriod; + local timezone = this.config.dashboardTimezone; + local panels = this.grafana.panels; + local stat = g.panel.stat; + { + fleet: + local title = prefix + 'Windows fleet overview'; + g.dashboard.new(title) + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + // g.panel.row.new("Overview"), + panels.fleetOverviewTable { gridPos+: { w: 24, h: 16 } }, + panels.cpuUsageTopk { gridPos+: { w: 24 } }, + panels.memotyUsageTopKPercent { gridPos+: { w: 24 } }, + panels.diskIOutilPercentTopK { gridPos+: { w: 12 } }, + panels.diskUsagePercentTopK { gridPos+: { w: 12 } }, + panels.networkErrorsAndDroppedPerSec { gridPos+: { w: 24 } }, + ], 12, 7 + ) + ) + // hide link to self + + root.applyCommon(vars.multiInstance, uid + '-fleet', tags, links { backToFleet+:: {}, backToOverview+:: {} }, annotations, timezone, refresh, period), + overview: g.dashboard.new(prefix + 'Windows overview') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('Overview'), + panels.uptime, + panels.hostname, + panels.osVersion, + panels.osInfo, + panels.cpuCount, + panels.memoryTotalBytes, + panels.memoryPageTotalBytes, + panels.diskTotalC, + g.panel.row.new('CPU'), + panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpuUsageTs { gridPos+: { w: 18, h: 6 } }, + g.panel.row.new('Memory'), + panels.memoryUsageStatPercent { gridPos+: { w: 6, h: 6 } }, + panels.memoryUsageTsBytes { gridPos+: { w: 18, h: 6 } }, + g.panel.row.new('Disk'), + panels.diskIOBytesPerSec { gridPos+: { w: 12, h: 8 } }, + panels.diskUsage { gridPos+: { w: 12, h: 8 } }, + g.panel.row.new('Network'), + panels.networkUsagePerSec { gridPos+: { w: 12, h: 8 } }, + panels.networkErrorsPerSec { gridPos+: { w: 12, h: 8 } }, + ], 6, 2 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-overview', tags, links { backToOverview+:: {} }, annotations, timezone, refresh, period), + + // add TODO advanced memory dashboard (must enable memory collector) + // memory: + + system: g.dashboard.new(prefix + 'Windows CPU and system') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('System'), + panels.cpuUsageStat { gridPos+: { w: 6, h: 6 } }, + panels.cpuUsageTs { gridPos+: { w: 9, h: 6 } }, + panels.cpuUsageByMode { gridPos+: { w: 9, h: 6 } }, + panels.cpuQueue, + panels.systemContextSwitchesAndInterrupts, + // panels.systemThreads, + // panels.systemExceptions, + g.panel.row.new('Time'), + panels.osTimezone { gridPos+: { w: 3, h: 4 } }, + panels.timeNtpStatus { gridPos+: { x: 0, y: 0, w: 21, h: 4 } }, + panels.timeNtpDelay { gridPos+: { w: 24, h: 7 } }, + ], 12, 7 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-system', tags, links, annotations, timezone, refresh, period), + + disks: 
g.dashboard.new(prefix + 'Windows disks and filesystems') + + g.dashboard.withPanels( + g.util.grid.wrapPanels( + [ + g.panel.row.new('Disk'), + panels.diskUsagePercent, + panels.diskUsage, + panels.diskIOBytesPerSec, + panels.diskIOps, + panels.diskIOWaitTime, + panels.diskQueue, + ], 12, 8 + ) + ) + + root.applyCommon(vars.singleInstance, uid + '-disks', tags, links, annotations, timezone, refresh, period), + } + + + if this.config.enableLokiLogs + then + { + logs: + logslib.new( + prefix + 'Windows logs', + datasourceName=this.grafana.variables.datasources.loki.name, + datasourceRegex=this.grafana.variables.datasources.loki.regex, + filterSelector=this.config.filteringSelector, + labels=this.config.groupLabels + this.config.instanceLabels + this.config.extraLogLabels, + formatParser='json', + showLogsVolume=this.config.showLogsVolume, + logsVolumeGroupBy=this.config.logsVolumeGroupBy, + extraFilters=this.config.logsExtraFilters + ) + { + dashboards+: + { + logs+: + // reference to self, already generated variables, to keep them, but apply other common data in applyCommon + root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links, annotations=annotations, timezone=timezone, refresh=refresh, period=period), + }, + panels+: + { + // modify log panel + logs+: + g.panel.logs.options.withEnableLogDetails(true) + + g.panel.logs.options.withShowTime(false) + + g.panel.logs.options.withWrapLogMessage(false), + }, + variables+: { + // add prometheus datasource for annotations processing + toArray+: [ + this.grafana.variables.datasources.prometheus { hide: 2 }, + ], + }, + }.dashboards.logs, + } + else {}, + applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period): + g.dashboard.withTags(tags) + + g.dashboard.withUid(uid) + + g.dashboard.withLinks(std.objectValues(links)) + + g.dashboard.withTimezone(timezone) + + g.dashboard.withRefresh(refresh) + + g.dashboard.time.withFrom(period) + + g.dashboard.withVariables(vars) + + g.dashboard.withAnnotations(std.objectValues(annotations)), +} diff --git a/windows-observ-lib/g.libsonnet b/windows-observ-lib/g.libsonnet new file mode 100644 index 000000000..6da9f4eef --- /dev/null +++ b/windows-observ-lib/g.libsonnet @@ -0,0 +1 @@ +import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet' diff --git a/windows-observ-lib/jsonnetfile.json b/windows-observ-lib/jsonnetfile.json new file mode 100644 index 000000000..69b58cc74 --- /dev/null +++ b/windows-observ-lib/jsonnetfile.json @@ -0,0 +1,33 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "common-lib" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true +} \ No newline at end of file diff --git a/windows-observ-lib/main.libsonnet b/windows-observ-lib/main.libsonnet new file mode 100644 index 000000000..64f4911a4 --- /dev/null +++ b/windows-observ-lib/main.libsonnet @@ -0,0 +1,115 @@ +local alerts = import './alerts.libsonnet'; +local dashboards = import './dashboards.libsonnet'; +local datasources = import './datasources.libsonnet'; +local g = import './g.libsonnet'; +local panels = import 
'./panels.libsonnet'; +local targets = import './targets.libsonnet'; +local variables = import './variables.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; + +{ + new( + filteringSelector, + groupLabels=['job'], + instanceLabels=['instance'], + dashboardNamePrefix='', + dashboardTags=[uid], + uid, + ): { + + local this = self, + config: { + // any modular library should inlcude as inputs: + // 'dashboardNamePrefix' - Use as prefix for all Dashboards and (optional) rule groups + // 'filteringSelector' - Static selector to apply to ALL dashboard variables of type query, panel queries, alerts and recording rules. + // 'groupLabels' - one or more labels that can be used to identify 'group' of instances. In simple cases, can be 'job' or 'cluster'. + // 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'. + // 'uid' - UID to prefix all dashboards original uids + groupLabels: groupLabels, + instanceLabels: instanceLabels, + filteringSelector: filteringSelector, + dashboardTags: dashboardTags, + uid: uid, + dashboardNamePrefix: dashboardNamePrefix, + + // optional + ignoreVolumes: 'HarddiskVolume.*', + alertsCPUThresholdWarning: '90', + alertMemoryUsageThresholdCritical: '90', + alertDiskUsageThresholdCritical: '90', + dashboardPeriod: 'now-1h', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + + // logs lib related + enableLokiLogs: true, + extraLogLabels: ['channel', 'source', 'keywords', 'level'], + logsVolumeGroupBy: 'level', + showLogsVolume: true, + logsExtraFilters: + ||| + | label_format timestamp="{{__timestamp__}}" + | drop channel_extracted,source_extracted,computer_extracted,level_extracted,keywords_extracted + | line_format `{{ if eq "[[instance]]" ".*" }}{{ alignLeft 25 .instance}}|{{end}}{{alignLeft 12 .channel }}| {{ alignLeft 25 .source}}| {{ .message }}` + |||, + }, + grafana: { + variables: variables.new(this), + targets: targets.new(this), + annotations: + { + reboot: commonlib.annotations.reboot.new( + title='Reboot', + target=this.grafana.targets.reboot, + instanceLabels=std.join(',', instanceLabels), + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels)), + } + + + if + this.config.enableLokiLogs + then + { + serviceFailed: commonlib.annotations.serviceFailed.new( + title='Service failed', + target=this.grafana.targets.serviceFailed, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) + + commonlib.annotations.base.withTextFormat('{{message}}'), + criticalEvents: commonlib.annotations.fatal.new( + title='Critical system event', + target=this.grafana.targets.criticalEvents, + ) + + commonlib.annotations.base.withTagKeys(std.join(',', this.config.groupLabels + this.config.instanceLabels + ['level'])) + + commonlib.annotations.base.withTextFormat('{{message}}'), + } + else + {}, + // common links here + links: { + local link = g.dashboard.link, + backToFleet: + link.link.new('Back to Windows fleet', '/d/' + this.grafana.dashboards.fleet.uid) + + link.link.options.withKeepTime(true), + backToOverview: + link.link.new('Back to Windows overview', '/d/' + this.grafana.dashboards.overview.uid) + + link.link.options.withKeepTime(true), + otherDashboards: + link.dashboards.new('All Windows dashboards', this.config.dashboardTags) + + link.dashboards.options.withIncludeVars(true) + + link.dashboards.options.withKeepTime(true) + + 
link.dashboards.options.withAsDropdown(true), + }, + + panels: panels.new(this), + dashboards: dashboards.new(this), + }, + + prometheus: { + alerts: alerts.new(this), + recordingRules: {}, + }, + + }, + +} diff --git a/windows-observ-lib/panels.libsonnet b/windows-observ-lib/panels.libsonnet new file mode 100644 index 000000000..537c5d265 --- /dev/null +++ b/windows-observ-lib/panels.libsonnet @@ -0,0 +1,403 @@ +local g = import './g.libsonnet'; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; +{ + new(this): + { + local t = this.grafana.targets, + local table = g.panel.table, + local fieldOverride = g.panel.table.fieldOverride, + local instanceLabel = this.config.instanceLabels[0], + fleetOverviewTable: + commonlib.panels.generic.table.base.new( + 'Fleet overview', + targets= + [ + t.osInfo + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('OS Info'), + t.uptime + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Uptime'), + t.cpuCount + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CPU count'), + t.cpuUsage + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CPU usage'), + t.memoryTotalBytes + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory total'), + t.memoryUsagePercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Memory usage'), + t.diskTotalC + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Disk C: total'), + t.diskUsageCPercent + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('Disk C: used'), + t.alertsCritical + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('CRITICAL'), + t.alertsWarning + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true) + + g.query.prometheus.withRefId('WARNING'), + ], + description="All Windows instances' perfomance at a glance." 
+ ) + + commonlib.panels.system.table.uptime.stylizeByName('Uptime') + + table.standardOptions.withOverridesMixin([ + fieldOverride.byRegexp.new('Product|^Hostname$') + + fieldOverride.byRegexp.withProperty('custom.filterable', true), + fieldOverride.byName.new('Instance') + + fieldOverride.byName.withProperty('custom.filterable', true) + + fieldOverride.byName.withProperty('links', [ + { + targetBlank: false, + title: 'Drill down to ${__field.name} ${__value.text}', + url: 'd/%s?var-%s=${__data.fields.%s}&${__url_time_range}' % [this.grafana.dashboards.overview.uid, instanceLabel, instanceLabel], + }, + ]), + fieldOverride.byRegexp.new(std.join('|', std.map(utils.toSentenceCase, this.config.groupLabels))) + + fieldOverride.byRegexp.withProperty('custom.filterable', true) + + fieldOverride.byRegexp.withProperty('links', [ + { + targetBlank: false, + title: 'Filter by ${__field.name}', + url: 'd/%s?var-${__field.name}=${__value.text}&${__url_time_range}' % [this.grafana.dashboards.fleet.uid], + }, + ]), + fieldOverride.byName.new('CPU count') + + fieldOverride.byName.withProperty('custom.width', '120'), + fieldOverride.byName.new('CPU usage') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Memory total') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Memory usage') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + fieldOverride.byName.new('Disk C: total') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('bytes') + ), + fieldOverride.byName.new('Disk C: used') + + fieldOverride.byName.withProperty('custom.width', '120') + + fieldOverride.byName.withProperty('custom.displayMode', 'basic') + + fieldOverride.byName.withPropertiesFromOptions( + table.standardOptions.withUnit('percent') + ) + + fieldOverride.byName.withPropertiesFromOptions( + commonlib.panels.cpu.timeSeries.utilization.stylize() + ), + ]) + + table.queryOptions.withTransformationsMixin( + [ + { + id: 'joinByField', + options: { + byField: instanceLabel, + mode: 'outer', + }, + }, + { + id: 'filterFieldsByName', + options: { + include: { + //' 1' - would only match first occurence of group label, so no duplicates + pattern: std.join(' 1|', this.config.groupLabels) + ' 1|' + instanceLabel + '|product|^hostname$|Value.+', + }, + }, + }, + { + id: 'organize', + options: { + excludeByName: { + 'Value #OS Info': true, + }, + indexByName: {}, + renameByName: + { + product: 'Product', + [instanceLabel]: utils.toSentenceCase(instanceLabel), + hostname: 'Hostname', + } + + + // group labels are named as 'job 1' and so on. 
+ { + [label + ' 1']: utils.toSentenceCase(label) + for label in this.config.groupLabels + }, + + }, + }, + + { + id: 'renameByRegex', + options: { + regex: 'Value #(.*)', + renamePattern: '$1', + }, + }, + ] + ), + uptime: commonlib.panels.system.stat.uptime.new(targets=[t.uptime]), + systemContextSwitchesAndInterrupts: + commonlib.panels.generic.timeSeries.base.new( + 'Context switches/Interrupts', + targets=[ + t.systemContextSwitches, + t.systemInterrupts, + ], + description=||| + Context switches occur when the operating system switches from running one process to another. Interrupts are signals sent to the CPU by external devices to request its attention. + + A high number of context switches or interrupts can indicate that the system is overloaded or that there are problems with specific devices or processes. + ||| + ), + systemExceptions: + commonlib.panels.generic.timeSeries.base.new( + 'System calls and exceptions', + targets=[ + t.windowsSystemExceptions, + t.windowsSystemCalls, + ], + ), + systemThreads: + commonlib.panels.generic.timeSeries.base.new( + 'System threads', + targets=[ + t.windowsSystemThreads, + ], + ), + timeNtpStatus: + commonlib.panels.system.statusHistory.ntp.new( + 'NTP status', + targets=[t.timeNtpStatus], + description='' + ) + + g.panel.timeSeries.standardOptions.withNoValue('No data. Please check that "time" collector is enabled.'), + timeNtpDelay: + commonlib.panels.generic.timeSeries.base.new( + 'NTP delay', + targets=[ + t.timeNtpDelay, + t.timeOffset, + ], + description=||| + NTP trip delay: Total roundtrip delay experienced by the NTP client in receiving a response from the server for the most recent request, + in seconds. This is the time elapsed on the NTP client between transmitting a request to the NTP server and receiving a valid response from the server. + + Time offset: Absolute time offset between the system clock and the chosen time source, in seconds. + ||| + ) + + g.panel.timeSeries.standardOptions.withUnit('seconds') + + g.panel.timeSeries.standardOptions.withNoValue('No data. Please check that "time" collector is enabled.'), + cpuCount: commonlib.panels.cpu.stat.count.new(targets=[t.cpuCount]), + cpuUsageTs: commonlib.panels.cpu.timeSeries.utilization.new(targets=[t.cpuUsage]), + cpuUsageTopk: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='CPU usage', + target=t.cpuUsage, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + cpuUsageStat: commonlib.panels.cpu.stat.usage.new(targets=[t.cpuUsage]), + cpuUsageByMode: commonlib.panels.cpu.timeSeries.utilizationByMode.new( + targets=[t.cpuUsageByMode], + description=||| + CPU usage by different modes. + ||| + ), + cpuQueue: commonlib.panels.generic.timeSeries.base.new( + 'CPU average queue size', + targets=[t.cpuQueue], + description=||| + The CPU average queue size in Windows, often referred to as the "Processor Queue Length" or "CPU Queue Length," is a metric that measures the number of threads or tasks waiting to be processed by the central processing unit (CPU) at a given moment. + It is an essential performance indicator that reflects the workload and responsiveness of the CPU. + When the CPU queue length is high, it indicates that there are more tasks in line for processing than the CPU can handle immediately. + + This can lead to system slowdowns, decreased responsiveness, and potential performance issues. 
High CPU queue lengths are often associated with CPU saturation, where the CPU is struggling to keep up with the demands placed on it. + ||| + ), + memoryTotalBytes: commonlib.panels.memory.stat.total.new(targets=[t.memoryTotalBytes]), + memoryPageTotalBytes: + commonlib.panels.memory.stat.total.new( + 'Pagefile size', + targets=[t.memoryPageTotalBytes], + description=||| + A page file (also known as a "paging file") is an optional, hidden system file on a hard disk. + Page files enable the system to remove infrequently accessed modified pages from physical memory to let the system use physical memory more efficiently for more frequently accessed pages. + + https://learn.microsoft.com/en-us/troubleshoot/windows-client/performance/introduction-to-the-page-file + ||| + ), + memoryUsageStatPercent: commonlib.panels.memory.stat.usage.new(targets=[t.memoryUsagePercent]), + memotyUsageTopKPercent: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Memory usage', + target=t.memoryUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels, + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + memoryUsageTsBytes: commonlib.panels.memory.timeSeries.usageBytes.new(targets=[t.memoryUsedBytes, t.memoryTotalBytes]), + diskTotalC: + commonlib.panels.disk.stat.total.new( + 'Disk C: size', + targets=[t.diskTotalC], + description=||| + Total storage capacity on the primary hard drive (usually the system drive) of a computer running a Windows operating system. + ||| + ), + diskUsage: commonlib.panels.disk.table.usage.new( + totalTarget= + (t.diskTotal + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true)), + usageTarget=t.diskUsage + + g.query.prometheus.withFormat('table') + + g.query.prometheus.withInstant(true), + groupLabel='volume' + ), + diskUsagePercent: commonlib.panels.disk.timeSeries.usagePercent.new( + targets=[t.diskUsagePercent] + ), + diskUsagePercentTopK: commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk space usage', + target=t.diskUsagePercent, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + diskIOBytesPerSec: commonlib.panels.disk.timeSeries.ioBytesPerSec.new( + targets=[t.diskIOreadBytesPerSec, t.diskIOwriteBytesPerSec, t.diskIOutilization] + ), + diskIOutilPercentTopK: + commonlib.panels.generic.timeSeries.topkPercentage.new( + title='Disk IO', + target=t.diskIOutilization, + topk=25, + instanceLabels=this.config.instanceLabels + ['volume'], + drillDownDashboardUid=this.grafana.dashboards.overview.uid, + ), + diskIOps: + commonlib.panels.disk.timeSeries.iops.new( + targets=[ + t.diskIOReads, + t.diskIOWrites, + ] + ), + + diskQueue: + commonlib.panels.disk.timeSeries.ioQueue.new( + 'Disk average queue', + targets= + [ + t.diskReadQueue, + t.diskWriteQueue, + ] + ), + diskIOWaitTime: commonlib.panels.disk.timeSeries.ioWaitTime.new( + targets=[ + t.diskIOWaitReadTime, + t.diskIOWaitWriteTime, + ] + ) + , + osInfo: commonlib.panels.generic.stat.info.new( + 'OS family', + targets=[t.osInfo], + description='OS family includes various versions and editions of the Windows operating system.' 
+ ) + { options+: { reduceOptions+: { fields: '/^product$/' } } }, + osVersion: + commonlib.panels.generic.stat.info.new('OS version', + targets=[t.osInfo], + description='Version of Windows operating system.') + { options+: { reduceOptions+: { fields: '/^version$/' } } }, + osTimezone: + commonlib.panels.generic.stat.info.new( + 'Timezone', targets=[t.osTimezone], description='Current system timezone.' + ) + { options+: { reduceOptions+: { fields: '/^timezone$/' } } }, + hostname: + commonlib.panels.generic.stat.info.new( + 'Hostname', + targets=[t.osInfo], + description="System's hostname." + ) + { options+: { reduceOptions+: { fields: '/^hostname$/' } } }, + networkErrorsAndDroppedPerSec: + commonlib.panels.network.timeSeries.errors.new( + 'Network errors and dropped packets', + targets=std.map( + function(t) t + { + expr: '(' + t.expr + ')>0.5', + legendFormat: '{{' + this.config.instanceLabels[0] + '}}: ' + std.get(t, 'legendFormat', '{{ nic }}'), + }, + [ + t.networkOutErrorsPerSec, + t.networkInErrorsPerSec, + t.networkInUknownPerSec, + t.networkOutDroppedPerSec, + t.networkInDroppedPerSec, + ] + ), + description=||| + **Network errors**: + + Network errors refer to issues that occur during the transmission of data across a network. + + These errors can result from various factors, including physical issues, jitter, collisions, noise and interference. + + Monitoring network errors is essential for diagnosing and resolving issues, as they can indicate problems with network hardware or environmental factors affecting network quality. + + **Dropped packets**: + + Dropped packets occur when data packets traveling through a network are intentionally discarded or lost due to congestion, resource limitations, or network configuration issues. + + Common causes include network congestion, buffer overflows, QoS settings, and network errors, as corrupted or incomplete packets may be discarded by receiving devices. + + Dropped packets can impact network performance and lead to issues such as degraded voice or video quality in real-time applications. 
+ ||| + ), + + networkErrorsPerSec: commonlib.panels.network.timeSeries.errors.new('Network errors', + targets=[t.networkInErrorsPerSec, t.networkOutErrorsPerSec, t.networkInUknownPerSec]) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkDroppedPerSec: commonlib.panels.network.timeSeries.dropped.new( + targets=[t.networkInDroppedPerSec, t.networkOutDroppedPerSec] + ) + + commonlib.panels.network.timeSeries.errors.withNegateOutPackets(), + networkUsagePerSec: commonlib.panels.network.timeSeries.traffic.new( + targets=[t.networkInBitPerSec, t.networkOutBitPerSec] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + networkPacketsPerSec: commonlib.panels.network.timeSeries.packets.new( + targets=[t.networkInPacketsPerSec, t.networkOutPacketsPerSec] + ) + + commonlib.panels.network.timeSeries.traffic.withNegateOutPackets(), + }, +} diff --git a/windows-observ-lib/targets.libsonnet b/windows-observ-lib/targets.libsonnet new file mode 100644 index 000000000..b9bbf2bea --- /dev/null +++ b/windows-observ-lib/targets.libsonnet @@ -0,0 +1,333 @@ +local g = import './g.libsonnet'; +local prometheusQuery = g.query.prometheus; +local lokiQuery = g.query.loki; + +{ + new(this): { + local variables = this.grafana.variables, + local config = this.config, + uptimeQuery:: 'windows_system_system_up_time', + + reboot: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + self.uptimeQuery + '{%(queriesSelector)s}*1000 > $__from < $__to' % variables, + ), + serviceFailed: + lokiQuery.new( + '${' + variables.datasources.loki.name + '}', + '{%(queriesSelector)s, source="Service Control Manager", level="Error"} |= "terminated" | json' % variables + ), + // those events should be rare, so can be shown as annotations + criticalEvents: + lokiQuery.new( + '${' + variables.datasources.loki.name + '}', + '{%(queriesSelector)s, channel="System", level="Critical"} | json' % variables + ), + alertsCritical: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="critical"}[1m])) * group by (%(instanceLabels)s) (windows_os_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + alertsWarning: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'count by (%(instanceLabels)s) (max_over_time(ALERTS{%(queriesSelector)s, alertstate="firing", severity="warning"}[1m])) * group by (%(instanceLabels)s) (windows_os_info{%(queriesSelector)s})' % variables { instanceLabels: std.join(',', this.config.instanceLabels) }, + ), + + uptime: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'time() - ' + self.uptimeQuery + '{%(queriesSelector)s}' % variables + ), + cpuCount: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_cs_logical_processors{%(queriesSelector)s}' % variables + ), + cpuUsage: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + '100 - (avg without (mode,core) (rate(windows_cpu_time_total{mode="idle", %(queriesSelector)s}[$__rate_interval])*100))' % variables + ) + + prometheusQuery.withLegendFormat('CPU usage'), + cpuUsageByMode: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + sum by(instance, mode) (irate(windows_cpu_time_total{%(queriesSelector)s}[$__rate_interval])) + / on(instance) + group_left sum by 
(instance) ((irate(windows_cpu_time_total{%(queriesSelector)s}[$__rate_interval]))) * 100 + ||| % variables + ) + + prometheusQuery.withLegendFormat('{{ mode }}'), + + // https://learn.microsoft.com/en-us/previous-versions/windows/it-pro/windows-2000-server/cc940375(v=technet.10)?redirectedfrom=MSDN + cpuQueue: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + windows_system_processor_queue_length{%(queriesSelector)s} + ||| % variables + ) + + prometheusQuery.withLegendFormat('CPU average queue'), + + memoryTotalBytes: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_cs_physical_memory_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory total'), + memoryFreeBytes: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_os_physical_memory_free_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory free'), + memoryUsedBytes: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_cs_physical_memory_bytes{%(queriesSelector)s} - windows_os_physical_memory_free_bytes{%(queriesSelector)s}' % variables + ) + + prometheusQuery.withLegendFormat('Memory used'), + memoryUsagePercent: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + '100 - windows_os_physical_memory_free_bytes{%(queriesSelector)s} / windows_cs_physical_memory_bytes{%(queriesSelector)s} * 100' % variables + ), + memoryPageTotalBytes: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_os_paging_limit_bytes{%(queriesSelector)s}' % variables + ), + diskTotal: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_logical_disk_size_bytes{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}' % variables { ignoreVolumes: config.ignoreVolumes } + ), + diskTotalC: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_logical_disk_size_bytes{volume="C:", %(queriesSelector)s}' % variables + ), + diskUsageC: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_logical_disk_size_bytes{volume="C:", %(queriesSelector)s}-windows_logical_disk_free_bytes{volume="C:", %(queriesSelector)s}' % variables + ), + diskUsageCPercent: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + '100 - windows_logical_disk_free_bytes{volume="C:", %(queriesSelector)s}/windows_logical_disk_size_bytes{volume="C:", %(queriesSelector)s}*100' % variables + ), + diskUsage: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_logical_disk_size_bytes{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}-windows_logical_disk_free_bytes{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} used'), + diskUsagePercent: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + '100 - windows_logical_disk_free_bytes{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}/windows_logical_disk_size_bytes{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}*100' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} used, %'), + diskIOreadBytesPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 
'irate(windows_logical_disk_read_bytes_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval])' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} read'), + diskIOwriteBytesPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_logical_disk_write_bytes_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval])' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} written'), + diskIOutilization: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + '(1-clamp_max(irate(windows_logical_disk_idle_seconds_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]),1)) * 100' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} io util'), + diskReadQueue: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_logical_disk_avg_read_requests_queued{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval])' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} read queue'), + diskWriteQueue: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_logical_disk_avg_write_requests_queued{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval])' % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} write queue'), + + diskIOWaitWriteTime: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + irate(windows_logical_disk_write_seconds_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + / + irate(windows_logical_disk_writes_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + ||| % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} avg write time'), + diskIOWaitReadTime: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + irate(windows_logical_disk_read_seconds_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + / + irate(windows_logical_disk_reads_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + ||| % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} avg read time'), + diskIOReads: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + irate(windows_logical_disk_reads_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + ||| % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} reads'), + diskIOWrites: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + ||| + irate(windows_logical_disk_writes_total{volume!~"%(ignoreVolumes)s", %(queriesSelector)s}[$__rate_interval]) + ||| % variables { ignoreVolumes: config.ignoreVolumes } + ) + + prometheusQuery.withLegendFormat('{{ volume }} writes'), + + osInfo: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_os_info{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withFormat('table'), + + osTimezone: //timezone label + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_os_timezone{%(queriesSelector)s}' 
% variables, + ) + + prometheusQuery.withFormat('table'), + systemContextSwitches: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_system_context_switches_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Context switches'), + windowsSystemThreads: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_system_threads{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('System threads'), + windowsSystemExceptions: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_system_exception_dispatches_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('System exceptions dispatched'), + windowsSystemCalls: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_system_system_calls_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('System calls'), + + systemInterrupts: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'sum without (core) (irate(windows_cpu_interrupts_total{%(queriesSelector)s}[$__rate_interval]))' % variables, + ) + + prometheusQuery.withLegendFormat('Interrupts'), + + timeNtpStatus: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'clamp_max(windows_time_ntp_client_time_sources{%(queriesSelector)s}, 1)' % variables, + ) + + prometheusQuery.withLegendFormat('NTP status'), + timeNtpDelay: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_time_ntp_round_trip_delay_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('NTP trip delay'), + + timeOffset: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'windows_time_computed_time_offset_seconds{%(queriesSelector)s}' % variables, + ) + + prometheusQuery.withLegendFormat('Time offset'), + + // Total adjustment made to the local system clock frequency by W32Time in parts per billion (PPB) units. 1 PPB adjustment implies the system clock was adjusted at a rate of 1 nanosecond per second (1 ns/s). The smallest possible adjustment can vary and is expected to be in the order of 100's of PPB. 
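+    // For example, a reported adjustment rate of 1,000,000 PPB would mean the clock is being slewed by roughly 1 ms per second (illustrative arithmetic from the 1 PPB = 1 ns/s definition above).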
+ timeAdjustments: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'rate(windows_time_clock_frequency_adjustment_ppb_total{%(queriesSelector)s}[$__rate_interval])' % variables, + ) + + prometheusQuery.withLegendFormat('Time adjustments'), + + + networkOutBitPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_bytes_sent_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} transmitted'), + networkInBitPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_bytes_received_total{%(queriesSelector)s}[$__rate_interval])*8' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} received'), + networkOutErrorsPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_outbound_errors_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} transmitted'), + networkInErrorsPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_received_errors_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} received'), + networkInUknownPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_received_unknown_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} received (unknown)'), + networkOutDroppedPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_outbound_discarded_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} transmitted packets dropped'), + networkInDroppedPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_received_discarded_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} received packets dropped'), + + networkInPacketsPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_received_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} received'), + networkOutPacketsPerSec: + prometheusQuery.new( + '${' + variables.datasources.prometheus.name + '}', + 'irate(windows_net_packets_sent_total{%(queriesSelector)s}[$__rate_interval])' % variables + ) + + prometheusQuery.withLegendFormat('{{ nic }} transmitted'), + }, +} diff --git a/windows-observ-lib/variables.libsonnet b/windows-observ-lib/variables.libsonnet new file mode 100644 index 000000000..9e53d7a99 --- /dev/null +++ b/windows-observ-lib/variables.libsonnet @@ -0,0 +1,72 @@ +// variables.libsonnet +local g = import './g.libsonnet'; +local var = g.dashboard.variable; +local commonlib = import 'common-lib/common/main.libsonnet'; +local utils = commonlib.utils; + +{ + new( + this + ): { + + local filteringSelector = this.config.filteringSelector, + local groupLabels = this.config.groupLabels, + local instanceLabels = this.config.instanceLabels, + local root = self, + local varMetric = 'windows_os_info', + local variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=true) = + local chainVarProto(index, chainVar) = + 
var.query.new(chainVar.label)
+      + var.query.withDatasourceFromVariable(root.datasources.prometheus)
+      + var.query.queryTypes.withLabelValues(
+        chainVar.label,
+        '%s{%s}' % [varMetric, chainVar.chainSelector],
+      )
+      + var.query.generalOptions.withLabel(utils.toSentenceCase(chainVar.label))
+      + var.query.selectionOptions.withIncludeAll(
+        value=if (!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true,
+        customAllValue='.+'
+      )
+      + var.query.selectionOptions.withMulti(
+        if (!multiInstance && std.member(instanceLabels, chainVar.label)) then false else true,
+      )
+      + var.query.refresh.onTime()
+      + var.query.withSort(
+        i=1,
+        type='alphabetical',
+        asc=true,
+        caseInsensitive=false
+      );
+      std.mapWithIndex(chainVarProto, utils.chainLabels(groupLabels + instanceLabels, [filteringSelector])),
+    datasources: {
+      prometheus:
+        var.datasource.new('datasource', 'prometheus')
+        + var.datasource.generalOptions.withLabel('Data source')
+        + var.datasource.withRegex(''),
+      loki:
+        var.datasource.new('loki_datasource', 'loki')
+        + var.datasource.generalOptions.withLabel('Loki data source')
+        + var.datasource.withRegex('')
+        + var.datasource.generalOptions.showOnDashboard.withNothing(),
+    },
+    // Use on dashboards where multiple entities can be selected, like fleet dashboards
+    multiInstance:
+      [root.datasources.prometheus]
+      + variablesFromLabels(groupLabels, instanceLabels, filteringSelector),
+    // Use on dashboards where only a single entity can be selected
+    singleInstance:
+      [root.datasources.prometheus]
+      + variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=false),
+
+    queriesSelector:
+      '%s,%s' % [
+        filteringSelector,
+        utils.labelsToPromQLSelector(groupLabels + instanceLabels),
+      ],
+  }
+  + if this.config.enableLokiLogs then self.withLokiLogs(this) else {},
+  withLokiLogs(this): {
+    multiInstance+: [this.grafana.variables.datasources.loki],
+    singleInstance+: [this.grafana.variables.datasources.loki],
+  },
+}
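A rough illustration of how queriesSelector composes, assuming utils.labelsToPromQLSelector renders each label as label=~"$label" (an assumption about common-lib, not shown in this change) and a purely hypothetical config of groupLabels: ['job'], instanceLabels: ['instance'] and filteringSelector: 'job="windows_exporter"'. Every target's %(queriesSelector)s would then expand to roughly:

    job="windows_exporter",job=~"$job",instance=~"$instance"

so the static filtering selector and the chained dashboard-variable matchers are always applied together in the queries above.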