diff --git a/pkg/prom/wal/wal.go b/pkg/prom/wal/wal.go index 6f7f9e86c38b..75b5e763d258 100644 --- a/pkg/prom/wal/wal.go +++ b/pkg/prom/wal/wal.go @@ -22,10 +22,11 @@ import ( type storageMetrics struct { r prometheus.Registerer - numActiveSeries prometheus.Gauge - numDeletedSeries prometheus.Gauge - totalCreatedSeries prometheus.Counter - totalRemovedSeries prometheus.Counter + numActiveSeries prometheus.Gauge + numDeletedSeries prometheus.Gauge + totalCreatedSeries prometheus.Counter + totalRemovedSeries prometheus.Counter + totalAppendedSamples prometheus.Counter } func newStorageMetrics(r prometheus.Registerer) *storageMetrics { @@ -50,6 +51,11 @@ func newStorageMetrics(r prometheus.Registerer) *storageMetrics { Help: "Total number of created series removed from the WAL", }) + m.totalAppendedSamples = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "agent_wal_samples_appended_total", + Help: "Total number of samples appended to the WAL", + }) + if r != nil { r.MustRegister( m.numActiveSeries, diff --git a/production/grafana-agent-mixin/dashboards.libsonnet b/production/grafana-agent-mixin/dashboards.libsonnet index 4c484b14a561..5ff76559d198 100644 --- a/production/grafana-agent-mixin/dashboards.libsonnet +++ b/production/grafana-agent-mixin/dashboards.libsonnet @@ -12,18 +12,20 @@ local template = grafana.template; grafanaDashboards+:: { 'agent.json': g.dashboard('Agent') - .addMultiTemplate('job', 'agent_build_info', 'job') - .addMultiTemplate('instance', 'agent_build_info', 'instance') + .addMultiTemplate('cluster', 'agent_build_info', 'cluster') + .addMultiTemplate('namespace', 'agent_build_info', 'namespace') + .addMultiTemplate('container', 'agent_build_info', 'container') + .addMultiTemplate('pod', 'agent_build_info{container=~"$container"}', 'pod') .addRow( g.row('Agent Stats') .addPanel( g.panel('Agent Stats') + g.tablePanel([ - 'count by (job, instance, version) (agent_build_info{job=~"$job", instance=~"$instance"})', - 'max by (job, instance) 
(time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})', + 'count by (pod, container, version) (agent_build_info{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', + 'max by (pod, container) (time() - process_start_time_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', ], { - job: { alias: 'Job' }, - instance: { alias: 'Instance' }, + pod: { alias: 'Pod' }, + container: { alias: 'Container' }, version: { alias: 'Version' }, 'Value #A': { alias: 'Count', type: 'hidden' }, 'Value #B': { alias: 'Uptime' }, @@ -34,12 +36,12 @@ local template = grafana.template; g.row('Prometheus Discovery') .addPanel( g.panel('Target Sync') + - g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') + + g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])) by (pod, scrape_job) * 1e3', '{{pod}}/{{scrape_job}}') + { yaxes: g.yaxes('ms') } ) .addPanel( g.panel('Targets') + - g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') + + g.queryPanel('sum(prometheus_sd_discovered_targets{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', 'Targets') + g.stack ) ) @@ -47,7 +49,12 @@ local template = grafana.template; g.row('Prometheus Retrieval') .addPanel( g.panel('Average Scrape Interval Duration') + - g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') + + g.queryPanel(||| + rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) + / + rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", 
namespace=~"$namespace", container=~"$container"}[5m]) + * 1e3 + |||, '{{pod}} {{interval}} configured') + { yaxes: g.yaxes('ms') } ) .addPanel( @@ -67,7 +74,7 @@ local template = grafana.template; ) .addPanel( g.panel('Appended Samples') + - g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') + + g.queryPanel('sum by (job, instance_name) (rate(agent_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]))', '{{job}} {{instance_name}}') + g.stack ) ), @@ -83,12 +90,13 @@ local template = grafana.template; .addTarget(prometheus.target( ||| ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - - - ignoring(url, remote_name) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"} + prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"} + - + ignoring(url, instance_name, remote_name) group_right(pod) + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"} ) |||, - legendFormat='{{cluster}}:{{instance}}-{{url}}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local timestampComparisonRate = @@ -100,12 +108,13 @@ local template = grafana.template; .addTarget(prometheus.target( ||| ( - rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - - - ignoring (url, remote_name) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) + rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) + - + ignoring(url, instance_name, remote_name) 
group_right(pod) + rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) ) |||, - legendFormat='{{cluster}}:{{instance}}-{{url}}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local samplesRate = @@ -117,13 +126,14 @@ local template = grafana.template; .addTarget(prometheus.target( ||| rate( - prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) + prometheus_remote_storage_samples_in_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) - - ignoring(url, remote_name) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) + ignoring(url, instance_name, remote_name) group_right(pod) + rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) - - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) + rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m]) |||, - legendFormat='{{cluster}}:{{instance}}-{{url}}' + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local currentShards = @@ -134,8 +144,8 @@ local template = grafana.template; min_span=6, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_shards{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local maxShards = @@ -145,8 +155,8 @@ local template = grafana.template; span=4, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}', - 
legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_shards_max{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local minShards = @@ -156,8 +166,8 @@ local template = grafana.template; span=4, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_shards_min{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local desiredShards = @@ -167,8 +177,8 @@ local template = grafana.template; span=4, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_shards_desired{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local shardsCapacity = @@ -178,8 +188,8 @@ local template = grafana.template; span=6, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local pendingSamples = @@ -189,8 +199,8 @@ local template = grafana.template; span=6, ) .addTarget(prometheus.target( - 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_remote_storage_pending_samples{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local 
queueSegment = @@ -201,8 +211,8 @@ local template = grafana.template; formatY1='none', ) .addTarget(prometheus.target( - 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'prometheus_wal_watcher_current_segment{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local droppedSamples = @@ -212,8 +222,8 @@ local template = grafana.template; span=3, ) .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local failedSamples = @@ -223,8 +233,8 @@ local template = grafana.template; span=3, ) .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local retriedSamples = @@ -234,8 +244,8 @@ local template = grafana.template; span=3, ) .addTarget(prometheus.target( - 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); local enqueueRetries = @@ -245,8 +255,8 @@ local template = grafana.template; span=3, ) .addTarget(prometheus.target( - 
'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', - legendFormat='{{cluster}}:{{instance}}-{{url}}' + 'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])', + legendFormat='{{cluster}}:{{pod}}-{{instance_name}}-{{url}}', )); dashboard.new('Agent Prometheus Remote Write', editable=true) @@ -264,9 +274,9 @@ local template = grafana.template; ) .addTemplate( template.new( - 'instance', + 'cluster', '$datasource', - 'label_values(agent_build_info, instance)', + 'label_values(agent_build_info, cluster)', refresh='time', current={ selected: true, @@ -278,9 +288,37 @@ local template = grafana.template; ) .addTemplate( template.new( - 'cluster', + 'namespace', '$datasource', - 'label_values(agent_build_info, cluster)', + 'label_values(agent_build_info, namespace)', + refresh='time', + current={ + selected: true, + text: 'All', + value: '$__all', + }, + includeAll=true, + ), + ) + .addTemplate( + template.new( + 'container', + '$datasource', + 'label_values(agent_build_info, container)', + refresh='time', + current={ + selected: true, + text: 'All', + value: '$__all', + }, + includeAll=true, + ), + ) + .addTemplate( + template.new( + 'pod', + '$datasource', + 'label_values(agent_build_info{container=~"$container"}, pod)', refresh='time', current={ selected: true, @@ -294,7 +332,7 @@ local template = grafana.template; template.new( 'url', '$datasource', - 'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)', + 'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", pod=~"$pod"}, url)', refresh='time', includeAll=true, ) diff --git a/production/grafana-agent-mixin/debugging.libsonnet b/production/grafana-agent-mixin/debugging.libsonnet index 5edd7478acd0..851fda4ab8b0 100644 --- a/production/grafana-agent-mixin/debugging.libsonnet +++ b/production/grafana-agent-mixin/debugging.libsonnet @@ 
-6,15 +6,15 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.dashboard('Agent Operational') .addMultiTemplate('cluster', 'agent_build_info', 'cluster') .addMultiTemplate('namespace', 'agent_build_info', 'namespace') - .addMultiTemplate('job', 'agent_build_info', 'job') - .addMultiTemplate('instance', 'agent_build_info', 'instance') + .addMultiTemplate('container', 'agent_build_info', 'container') + .addMultiTemplate('pod', 'agent_build_info{container=~"$container"}', 'pod') .addRow( g.row('General') .addPanel( g.panel('GCs') + g.queryPanel( - 'rate(go_gc_duration_seconds_count{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}[5m])', - '{{job}}', + 'rate(go_gc_duration_seconds_count{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[5m])', + '{{pod}}', ) ) .addPanel( @@ -22,29 +22,29 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes('decbytes') } + { stack: 'true' } + g.queryPanel( - 'go_memstats_heap_inuse_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}', - '{{job}}', + 'go_memstats_heap_inuse_bytes{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + '{{pod}}', ) ) .addPanel( g.panel('Goroutines') + g.queryPanel( - 'go_goroutines{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}', - '{{job}}', + 'go_goroutines{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}', + '{{pod}}', ) ) .addPanel( g.panel('CPU') + g.queryPanel( - 'rate(container_cpu_usage_seconds_total{cluster=~"$cluster", pod_name=~".*grafana-agent.*"}[5m])', - '{{job}}', + 'rate(container_cpu_usage_seconds_total{cluster=~"$cluster", container=~"$container"}[5m])', + '{{pod}}', ) ) .addPanel( g.panel('WSS') + g.queryPanel( - 'container_memory_working_set_bytes{cluster=~"$cluster", pod_name=~".*grafana-agent.*"}', - '{{job}}', + 'container_memory_working_set_bytes{cluster=~"$cluster", container=~"$container"}', + '{{pod}}', ) ) .addPanel( @@ -58,33 +58,33 @@ local g = import 
'grafana-builder/grafana.libsonnet'; .addRow( g.row('Network') .addPanel( - g.panel('RX') + + g.panel('RX by Pod') + g.queryPanel( - 'rate(container_network_receive_bytes_total{cluster=~"$cluster", namespace=~"$namespace", pod_name=~".*grafana-agent.*"}[5m])', - '{{job}}', + 'rate(container_network_receive_bytes_total{cluster=~"$cluster", namespace=~"$namespace", pod=~"$pod"}[5m])', + '{{pod}}', ) ) .addPanel( - g.panel('TX') + + g.panel('TX by Pod') + g.queryPanel( - 'rate(container_network_transmit_bytes_total{cluster=~"$cluster", namespace=~"$namespace", pod_name=~".*grafana-agent.*"}[5m])', - '{{job}}', + 'rate(container_network_transmit_bytes_total{cluster=~"$cluster", namespace=~"$namespace", pod=~"$pod"}[5m])', + '{{pod}}', ) ) ) .addRow( g.row('Prometheus Read') .addPanel( - g.panel('Bytes/Series/Instance') + + g.panel('Bytes/Series/Pod') + { yaxes: g.yaxes('decbytes') } + { stack: 'true' } + g.queryPanel( ||| - (sum by (job, instance) (avg_over_time(go_memstats_heap_inuse_bytes{cluster=~"$cluster", job=~"$job", instance=~"$instance"}[1m]))) + (sum by (pod) (avg_over_time(go_memstats_heap_inuse_bytes{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))) / - (sum by (job, instance) (agent_wal_storage_active_series{cluster=~"$cluster", job=~"$job", instance=~"$instance"})) + (sum by (pod) (agent_wal_storage_active_series{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})) |||, - '{{instance}}', + '{{pod}}', ) ) .addPanel( @@ -93,26 +93,26 @@ local g = import 'grafana-builder/grafana.libsonnet'; { stack: 'true' } + g.queryPanel( ||| - (sum by (job) (avg_over_time(go_memstats_heap_inuse_bytes{cluster=~"$cluster", job=~"$job", instance=~"$instance"}[1m]))) + (sum by (container) (avg_over_time(go_memstats_heap_inuse_bytes{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"}[1m]))) / - (sum by (job) (agent_wal_storage_active_series{cluster=~"$cluster", job=~"$job", instance=~"$instance"})) + (sum by 
(container) (agent_wal_storage_active_series{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})) |||, - '{{job}}', + '{{container}}', ) ) .addPanel( - g.panel('Series/Instance') + + g.panel('Series/Pod') + { stack: 'true' } + g.queryPanel( - 'sum by (instance) (agent_wal_storage_active_series{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - '{{instance}}', + 'sum by (pod) (agent_wal_storage_active_series{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', + '{{pod}}', ) ) .addPanel( g.panel('Series/Config') + { stack: 'true' } + g.queryPanel( - 'sum by (instance_name) (agent_wal_storage_active_series{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', + 'sum by (instance_name) (agent_wal_storage_active_series{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', '{{instance_name}}', ) ) @@ -120,8 +120,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Series') + { stack: 'true' } + g.queryPanel( - 'sum by (job) (agent_wal_storage_active_series{cluster=~"$cluster", job=~"$job", instance=~"$instance"})', - '{{job}}', + 'sum by (container) (agent_wal_storage_active_series{cluster=~"$cluster", namespace=~"$namespace", container=~"$container"})', + '{{container}}', ) ) ), diff --git a/production/grafana-agent-mixin/mixin.libsonnet b/production/grafana-agent-mixin/mixin.libsonnet index 25574e683817..1f11045a118a 100644 --- a/production/grafana-agent-mixin/mixin.libsonnet +++ b/production/grafana-agent-mixin/mixin.libsonnet @@ -1,7 +1,7 @@ local dashboards = import 'dashboards.libsonnet'; { - grafanaDashboards:: std.mapWithKey(function(field, obj) obj { + grafanaDashboards+:: std.mapWithKey(function(field, obj) obj { grafanaDashboardFolder: 'Agent', }, dashboards.grafanaDashboards), } diff --git a/production/tanka/grafana-agent/config.libsonnet b/production/tanka/grafana-agent/config.libsonnet index 6e9e7bfd85ad..a7614a666763 100644 --- 
a/production/tanka/grafana-agent/config.libsonnet +++ b/production/tanka/grafana-agent/config.libsonnet @@ -1,6 +1,7 @@ { _images+:: { agent: 'grafana/agent:latest', + agentctl: 'grafana/agentctl:latest', }, _config+:: { @@ -29,7 +30,7 @@ // as a DaemonSet (like it is here by default), then disabling this will // scrape all metrics multiple times, once per node, leading to // duplicate samples being rejected and might hit limits. - agent_host_filter: true, + agent_host_filter: false, // The directory where the WAL is stored for all instances. agent_wal_dir: '/var/lib/agent/data', @@ -151,11 +152,27 @@ action: 'replace', target_label: 'namespace', }, - - // Rename instances to be the pod name { source_labels: ['__meta_kubernetes_pod_name'], action: 'replace', + target_label: 'pod', // Not 'pod_name', which disappeared in K8s 1.16. + }, + { + source_labels: ['__meta_kubernetes_pod_container_name'], + action: 'replace', + target_label: 'container', // Not 'container_name', which disappeared in K8s 1.16. + }, + + // Rename instances to the concatenation of pod:container:port. + // All three components are needed to guarantee a unique instance label. + { + source_labels: [ + '__meta_kubernetes_pod_name', + '__meta_kubernetes_pod_container_name', + '__meta_kubernetes_pod_container_port_name', + ], + action: 'replace', + separator: ':', target_label: 'instance', }, @@ -203,12 +220,16 @@ action: 'keep', }, - // Rename instances to be the pod name. As the scrape two - // ports of kube-state-metrics, include the port name in the - // interface name. Otherwise, alerts about scrape failures and - // timeouts won't work. + // Rename instances to the concatenation of pod:container:port. + // In the specific case of KSM, we could leave out the container + // name and still have a unique instance label, but we leave it + // in here for consistency with the normal pod scraping. 
{ - source_labels: ['__meta_kubernetes_pod_name', '__meta_kubernetes_pod_container_port_name'], + source_labels: [ + '__meta_kubernetes_pod_name', + '__meta_kubernetes_pod_container_name', + '__meta_kubernetes_pod_container_port_name', + ], action: 'replace', separator: ':', target_label: 'instance',