@@ -120,9 +120,9 @@ func runSidecar(
120120 uploads = false
121121 }
122122
123- readinessProber := prober .NewProber (comp , logger , prometheus .WrapRegistererWithPrefix ("thanos_" , reg ))
123+ statusProber := prober .NewProber (comp , logger , prometheus .WrapRegistererWithPrefix ("thanos_" , reg ))
124124 // Initiate default HTTP listener providing metrics endpoint and readiness/liveness probes.
125- if err := defaultHTTPListener (g , logger , reg , httpBindAddr , readinessProber ); err != nil {
125+ if err := defaultHTTPListener (g , logger , reg , httpBindAddr , statusProber ); err != nil {
126126 return errors .Wrap (err , "create readiness prober" )
127127 }
128128
@@ -148,6 +148,12 @@ func runSidecar(
148148 }
149149 }
150150
151+ // When the heartbeat to Prometheus fails, the sidecar is marked as not ready.
152+ // But after `heartbeatFailLimit` consecutive failures it is also marked as not healthy,
153+ // so that the orchestrator (if any) can try restarting it in case that helps.
154+ heartbeatFailCount := 0
155+ heartbeatFailLimit := 6
156+
151157 // Blocking query of external labels before joining as a Source Peer into gossip.
152158 // We retry infinitely until we reach and fetch labels from our Prometheus.
153159 err := runutil .Retry (2 * time .Second , ctx .Done (), func () error {
@@ -157,7 +163,11 @@ func runSidecar(
157163 "err" , err ,
158164 )
159165 promUp .Set (0 )
160- readinessProber .SetNotReady (err )
166+ statusProber .SetNotReady (err )
167+ if heartbeatFailCount >= heartbeatFailLimit {
168+ statusProber .SetNotHealthy (err )
169+ }
170+ heartbeatFailCount ++
161171 return err
162172 }
163173
@@ -166,7 +176,8 @@ func runSidecar(
166176 "external_labels" , m .Labels ().String (),
167177 )
168178 promUp .Set (1 )
169- readinessProber .SetReady ()
179+ statusProber .SetReady ()
180+ heartbeatFailCount = 0
170181 lastHeartbeat .Set (float64 (time .Now ().UnixNano ()) / 1e9 )
171182 return nil
172183 })
@@ -187,10 +198,15 @@ func runSidecar(
187198 if err := m .UpdateLabels (iterCtx , logger ); err != nil {
188199 level .Warn (logger ).Log ("msg" , "heartbeat failed" , "err" , err )
189200 promUp .Set (0 )
190- readinessProber .SetNotReady (err )
201+ statusProber .SetNotReady (err )
202+ if heartbeatFailCount >= heartbeatFailLimit {
203+ statusProber .SetNotHealthy (err )
204+ }
205+ heartbeatFailCount ++
191206 } else {
192207 promUp .Set (1 )
193- readinessProber .SetReady ()
208+ statusProber .SetReady ()
209+ heartbeatFailCount = 0
194210 lastHeartbeat .Set (float64 (time .Now ().UnixNano ()) / 1e9 )
195211 }
196212
0 commit comments