Skip to content

Commit 896d662

Browse files
l0wl3velBenjamin RitterJaydipGabani
authored
fix: unreliable webhook behaviour on gatekeeper pod startup and shutdown (#3780)
Signed-off-by: Benjamin Ritter <[email protected]> Signed-off-by: Benjamin Ritter <[email protected]> Signed-off-by: Benjamin Ritter <[email protected]> Signed-off-by: Jaydip Gabani <[email protected]> Co-authored-by: Benjamin Ritter <[email protected]> Co-authored-by: Jaydip Gabani <[email protected]>
1 parent 7627e3d commit 896d662

File tree

2 files changed

+62
-1
lines changed

2 files changed

+62
-1
lines changed

main.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ import (
2626
"net/http"
2727
_ "net/http/pprof"
2828
"os"
29+
"os/signal"
2930
"path/filepath"
31+
"syscall"
3032
"time"
3133

3234
"github.com/go-logr/zapr"
@@ -117,6 +119,7 @@ var (
117119
enableK8sCel = flag.Bool("enable-k8s-native-validation", true, "enable the validating admission policy driver")
118120
externaldataProviderResponseCacheTTL = flag.Duration("external-data-provider-response-cache-ttl", 3*time.Minute, "TTL for the external data provider response cache. Specify the duration in 'h', 'm', or 's' for hours, minutes, or seconds respectively. Defaults to 3 minutes if unspecified. Setting the TTL to 0 disables the cache.")
119121
enableReferential = flag.Bool("enable-referential-rules", true, "Enable referential rules. This flag defaults to true. Set this value to false if you want to disallow referential constraints. Because referential constraints read objects other than the object-under-test, they may be subject to race conditions. Users concerned about this may want to disable referential rules")
122+
shutdownDelay = flag.Int("shutdown-delay", 10, "Time in seconds the controller runtime shutdown gets delayed after receiving a pod termination event. Prevents failing webhooks on pod shutdown. default: 10")
120123
)
121124

122125
func init() {
@@ -310,9 +313,39 @@ func innerMain() int {
310313
}
311314
}
312315

316+
// Always enable downstream checking for the webhooks, if enabled.
317+
if len(webhooks) > 0 {
318+
tlsChecker := webhook.NewTLSChecker(*certDir, *port)
319+
setupLog.Info("setting up TLS readiness probe")
320+
if err := mgr.AddReadyzCheck("tls-check", tlsChecker); err != nil {
321+
setupLog.Error(err, "unable to create tls readiness check")
322+
return 1
323+
}
324+
}
325+
313326
// Setup controllers asynchronously, they will block for certificate generation if needed.
314327
setupErr := make(chan error)
315-
ctx := ctrl.SetupSignalHandler()
328+
329+
// Setup termination with grace period. Required to give K8s Services time to disconnect the Pod endpoint on termination.
330+
// Derived from how the controller-runtime sets up a signal handler with ctrl.SetupSignalHandler()
331+
// controller-runtime upstream issue: https://github.com/kubernetes-sigs/controller-runtime/issues/3113
332+
ctx, cancel := context.WithCancel(context.Background())
333+
334+
c := make(chan os.Signal, 2)
335+
signal.Notify(c, []os.Signal{os.Interrupt, syscall.SIGTERM}...)
336+
go func() {
337+
<-c
338+
setupLog.Info(fmt.Sprintf("Shutting Down, waiting for %ds", *shutdownDelay))
339+
go func() {
340+
time.Sleep(time.Duration(*shutdownDelay) * time.Second)
341+
setupLog.Info("Shutdown grace period finished")
342+
cancel()
343+
}()
344+
<-c
345+
setupLog.Info("Second signal received, killing now")
346+
os.Exit(1) // second signal. Exit directly.
347+
}()
348+
316349
go func() {
317350
setupErr <- setupControllers(ctx, mgr, tracker, setupFinished)
318351
}()

test/bats/test.bats

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,3 +694,31 @@ __expansion_audit_test() {
694694
assert_failure
695695
wait_for_process ${WAIT_TIME} ${SLEEP_TIME} "kubectl delete --ignore-not-found -f ${BATS_TESTS_DIR}/templates/k8srequiredlabels_template_regov1.yaml"
696696
}
697+
698+
@test "shutdown delay" {
699+
# Get the name of the Gatekeeper pod
700+
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
701+
702+
# Trigger the termination of the Gatekeeper pod
703+
TERMINATION_START=$(date +%s)
704+
run kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE}
705+
706+
# Monitor the termination process to ensure the grace period is respected
707+
echo "Monitoring the termination process..."
708+
run kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found
709+
710+
# Wait for the pod to be fully terminated
711+
run kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s
712+
713+
TERMINATION_END=$(date +%s)
714+
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START))
715+
716+
# Validate that the pod is terminated after the specified grace period
717+
GRACE_PERIOD=10
718+
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then
719+
echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds."
720+
else
721+
echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds."
722+
assert_failure
723+
fi
724+
}

0 commit comments

Comments
 (0)