Skip to content
Prev Previous commit
Next Next commit
Add reconcile errors counter
  • Loading branch information
JoelSpeed committed Oct 31, 2018
commit d18803a523790b6fa20b5d42369606822c171fe5
1 change: 1 addition & 0 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ func (c *Controller) processNextWorkItem() bool {
if result, err := c.Do.Reconcile(req); err != nil {
c.Queue.AddRateLimited(req)
log.Error(err, "Reconciler error", "Controller", c.Name, "Request", req)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc()

return false
} else if result.RequeueAfter > 0 {
Expand Down
13 changes: 12 additions & 1 deletion pkg/internal/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,10 @@ var _ = Describe("controller", func() {
Name: "controller_runtime_reconcile_queue_length",
Help: "Length of reconcile queue per controller",
}, []string{"controller"})
ctrlmetrics.ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconcile errors per controller",
}, []string{"controller"})

fakeReconcile.Err = fmt.Errorf("expected error: reconcile")
go func() {
Expand All @@ -427,14 +431,21 @@ var _ = Describe("controller", func() {

By("Invoking Reconciler which will give an error")
Expect(<-reconciled).To(Equal(request))
var queueLength dto.Metric
var queueLength, reconcileErrs dto.Metric
Eventually(func() error {
ctrlmetrics.QueueLength.WithLabelValues(ctrl.Name).Write(&queueLength)
if queueLength.GetGauge().GetValue() != 1.0 {
return fmt.Errorf("metrics not updated")
}
return nil
}, 2.0).Should(Succeed())
Eventually(func() error {
ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)
if reconcileErrs.GetCounter().GetValue() != 1.0 {
return fmt.Errorf("metrics not updated")
}
return nil
}, 2.0).Should(Succeed())

By("Invoking Reconciler a second time without error")
fakeReconcile.Err = nil
Expand Down
8 changes: 8 additions & 0 deletions pkg/internal/controller/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,16 @@ var (
Name: "controller_runtime_reconcile_queue_length",
Help: "Length of reconcile queue per controller",
}, []string{"controller"})

// ReconcileErrors is a prometheus counter metric which holds the total
// number of errors from the Reconciler.
ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Copy link

@lilic lilic Sep 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides ReconcileErrors, I would suggest also adding ReconcileTotal. That way we can see whether the rate of errors over the past 5 minutes was too high.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small typo in the suggested new metric name, should probably be ReconcileTotal

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@droot @DirectXMan12 Do you think this would be a worthwhile metric to integrate into this PR?

Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconcile errors per controller",
}, []string{"controller"})
)

// init registers this package's collectors, QueueLength and ReconcileErrors,
// with metrics.Registry.
func init() {
	// One variadic call; the collectors are passed in the same order as before.
	metrics.Registry.MustRegister(QueueLength, ReconcileErrors)
}