Skip to content

Commit 3eaf82a

Browse files
committed
cr: refactor sidecar prober logic
1 parent 5ff4082 commit 3eaf82a

File tree

3 files changed

+38
-9
lines changed

3 files changed

+38
-9
lines changed

cmd/thanos/compact.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,9 @@ func runCompact(
169169

170170
downsampleMetrics := newDownsampleMetrics(reg)
171171

172-
readinessProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
172+
statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
173173
// Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes.
174-
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil {
174+
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil {
175175
return errors.Wrap(err, "create readiness prober")
176176
}
177177

@@ -326,7 +326,7 @@ func runCompact(
326326
})
327327

328328
level.Info(logger).Log("msg", "starting compact node")
329-
readinessProber.SetReady()
329+
statusProber.SetReady()
330330
return nil
331331
}
332332

cmd/thanos/sidecar.go

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ func runSidecar(
120120
uploads = false
121121
}
122122

123-
readinessProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
123+
statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
124124
// Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes.
125-
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil {
125+
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil {
126126
return errors.Wrap(err, "create readiness prober")
127127
}
128128

@@ -148,6 +148,12 @@ func runSidecar(
148148
}
149149
}
150150

151+
// When the heartbeat to Prometheus fails, the sidecar is marked as not ready.
152+
// But after `heartbeatFailLimit` consecutive failures it is also marked as not healthy,
153+
// so the orchestrator (if any) can try restarting it if it would help.
154+
heartbeatFailCount := 0
155+
heartbeatFailLimit := 6
156+
151157
// Blocking query of external labels before joining as a Source Peer into gossip.
152158
// We retry infinitely until we reach and fetch labels from our Prometheus.
153159
err := runutil.Retry(2*time.Second, ctx.Done(), func() error {
@@ -157,7 +163,11 @@ func runSidecar(
157163
"err", err,
158164
)
159165
promUp.Set(0)
160-
readinessProber.SetNotReady(err)
166+
statusProber.SetNotReady(err)
167+
if heartbeatFailCount >= heartbeatFailLimit {
168+
statusProber.SetNotHealthy(err)
169+
}
170+
heartbeatFailCount++
161171
return err
162172
}
163173

@@ -166,7 +176,8 @@ func runSidecar(
166176
"external_labels", m.Labels().String(),
167177
)
168178
promUp.Set(1)
169-
readinessProber.SetReady()
179+
statusProber.SetReady()
180+
heartbeatFailCount = 0
170181
lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9)
171182
return nil
172183
})
@@ -187,10 +198,15 @@ func runSidecar(
187198
if err := m.UpdateLabels(iterCtx, logger); err != nil {
188199
level.Warn(logger).Log("msg", "heartbeat failed", "err", err)
189200
promUp.Set(0)
190-
readinessProber.SetNotReady(err)
201+
statusProber.SetNotReady(err)
202+
if heartbeatFailCount >= heartbeatFailLimit {
203+
statusProber.SetNotHealthy(err)
204+
}
205+
heartbeatFailCount++
191206
} else {
192207
promUp.Set(1)
193-
readinessProber.SetReady()
208+
statusProber.SetReady()
209+
heartbeatFailCount = 0
194210
lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9)
195211
}
196212

pkg/prober/prober.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ type Prober struct {
3737
func NewProber(component component.Component, logger log.Logger, reg prometheus.Registerer) *Prober {
3838
initialErr := fmt.Errorf(initialErrorFmt, component)
3939

40+
// From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ :
41+
//
42+
// liveness (healthy): Many applications running for long periods of time eventually transition to broken states,
42+
// and cannot recover except by being restarted.
44+
// Kubernetes provides liveness probes to detect and remedy such situations.
45+
//
46+
// readiness (ready): Sometimes, applications are temporarily unable to serve traffic.
47+
// For example, an application might need to load large data or configuration files during startup,
48+
// or depend on external services after startup. In such cases, you don’t want to kill the application,
49+
// but you don’t want to send it requests either. Kubernetes provides readiness probes to detect
50+
// and mitigate these situations. A pod with containers reporting that they are not ready
51+
// does not receive traffic through Kubernetes Services.
52+
4053
p := &Prober{
4154
component: component,
4255
logger: logger,

0 commit comments

Comments
 (0)