diff --git a/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation.go b/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation.go index 5ffde09f0e51f1..2d198613ee7be5 100644 --- a/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation.go +++ b/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation.go @@ -395,17 +395,25 @@ func initContainerResourceRequirements(pod *corev1.Pod, conf initResourceRequire } podRequirements := podSumRessourceRequirements(pod) insufficientResourcesMessage := "The overall pod's containers limit is too low" + for _, k := range [2]corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory} { - if q, ok := conf[k]; ok { - requirements.Limits[k] = q - requirements.Requests[k] = q - } else { + // Check if explicit requests/limits are configured + if req, hasReq := conf.requests[k]; hasReq { + requirements.Requests[k] = req + } + if lim, hasLim := conf.limits[k]; hasLim { + requirements.Limits[k] = lim + } + + // Auto-calculate requests and limits independently + if _, hasReq := conf.requests[k]; !hasReq { + if maxPodReq, ok := podRequirements.Requests[k]; ok { + requirements.Requests[k] = maxPodReq + } + } + if _, hasLim := conf.limits[k]; !hasLim { if maxPodLim, ok := podRequirements.Limits[k]; ok { - // If the pod before adding instrumentation init containers would have had a limits smaller than - // a certain amount, we just don't do anything, for two reasons: - // 1. The init containers need quite a lot of memory/CPU in order to not OOM or initialize in reasonnable time - // 2. The APM libraries themselves will increase footprint of the container by a - // non trivial amount, and we don't want to cause issues for constrained apps + // Check minimum requirements switch k { case corev1.ResourceMemory: if minimumMemoryLimit.Cmp(maxPodLim) == 1 { @@ -417,16 +425,12 @@ func initContainerResourceRequirements(pod *corev1.Pod, conf initResourceRequire decision.skipInjection = true insufficientResourcesMessage += fmt.Sprintf(", %v pod_limit=%v needed=%v", k, maxPodLim.String(), minimumCPULimit.String()) } - default: - // We don't support other resources } requirements.Limits[k] = maxPodLim } - if maxPodReq, ok := podRequirements.Requests[k]; ok { - requirements.Requests[k] = maxPodReq - } } } + if decision.skipInjection { log.Debug(insufficientResourcesMessage) decision.message = insufficientResourcesMessage diff --git a/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation_test.go b/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation_test.go index c387cca3c54408..d984fb84eebc7a 100644 --- a/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation_test.go +++ b/pkg/clusteragent/admission/mutate/autoinstrumentation/auto_instrumentation_test.go @@ -1970,3 +1970,181 @@ func languageSetOf(languages ...string) languagemodels.LanguageSet { } return set } +func TestSeparateRequestsAndLimits(t *testing.T) { + tests := []struct { + name string + requestsCPU string + requestsMem string + limitsCPU string + limitsMem string + wantErr bool + errMsg string + }{ + { + name: "separate requests and limits", + requestsCPU: "25m", + requestsMem: "50Mi", + limitsCPU: "500m", + limitsMem: "1Gi", + wantErr: false, + }, + { + name: "limits less than requests should fail", + requestsCPU: "500m", + requestsMem: "1Gi", + limitsCPU: "25m", + limitsMem: "50Mi", + wantErr: true, + errMsg: "limit", + }, + { + name: "only requests set", + requestsCPU: "100m", + requestsMem: "128Mi", + wantErr: false, + }, + { + name: "only limits set", + limitsCPU: "500m", + limitsMem: "512Mi", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockConfig := configmock.New(t) + + if tt.requestsCPU != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_requests.cpu", tt.requestsCPU) + } + if tt.requestsMem != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_requests.memory", tt.requestsMem) + } + if tt.limitsCPU != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_limits.cpu", tt.limitsCPU) + } + if tt.limitsMem != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_limits.memory", tt.limitsMem) + } + + _, err := initDefaultResources(mockConfig) + if tt.wantErr { + require.Error(t, err) + if tt.errMsg != "" { + require.Contains(t, err.Error(), tt.errMsg) + } + } else { + require.NoError(t, err) + } + }) + } +} +func TestSeparateRequestsAndLimitsIntegration(t *testing.T) { + tests := []struct { + name string + requestsCPU string + requestsMem string + limitsCPU string + limitsMem string + wantReqCPU string + wantReqMem string + wantLimCPU string + wantLimMem string + }{ + { + name: "separate requests and limits", + requestsCPU: "25m", + requestsMem: "50Mi", + limitsCPU: "500m", + limitsMem: "1Gi", + wantReqCPU: "25m", + wantReqMem: "50Mi", + wantLimCPU: "500m", + wantLimMem: "1Gi", + }, + { + name: "only requests set - limits should be empty", + requestsCPU: "100m", + requestsMem: "128Mi", + wantReqCPU: "100m", + wantReqMem: "128Mi", + wantLimCPU: "0", + wantLimMem: "0", + }, + { + name: "only limits set - requests should be empty", + limitsCPU: "500m", + limitsMem: "512Mi", + wantReqCPU: "0", + wantReqMem: "0", + wantLimCPU: "500m", + wantLimMem: "512Mi", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + wmeta := fxutil.Test[workloadmeta.Component](t, + core.MockBundle(), + workloadmetafxmock.MockModule(workloadmeta.NewParams()), + ) + imageResolver := NewImageResolver(nil, configmock.New(t)) + + mockConfig := configmock.New(t) + + if tt.requestsCPU != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_requests.cpu", tt.requestsCPU) + } + if tt.requestsMem != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_requests.memory", tt.requestsMem) + } + if tt.limitsCPU != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_limits.cpu", tt.limitsCPU) + } + if tt.limitsMem != "" { + mockConfig.SetWithoutSource("admission_controller.auto_instrumentation.init_limits.memory", tt.limitsMem) + } + + config, err := NewConfig(mockConfig) + require.NoError(t, err) + + mutator, err := NewNamespaceMutator(config, wmeta, imageResolver) + require.NoError(t, err) + + pod := common.FakePod("test-pod") + lang := java + c := lang.libInfo("", "").initContainers(config.version, imageResolver)[0] + requirements, injectionDecision := initContainerResourceRequirements(pod, config.defaultResourceRequirements) + require.False(t, injectionDecision.skipInjection) + + c.Mutators = mutator.core.newInitContainerMutators(requirements, pod.Namespace) + initialInitContainerCount := len(pod.Spec.InitContainers) + err = c.mutatePod(pod) + require.NoError(t, err) + require.Len(t, pod.Spec.InitContainers, initialInitContainerCount+1) + + initContainer := pod.Spec.InitContainers[initialInitContainerCount] + + // Check CPU requests + actualReqCPU := initContainer.Resources.Requests[corev1.ResourceCPU] + expectedReqCPU := resource.MustParse(tt.wantReqCPU) + require.Zero(t, expectedReqCPU.Cmp(actualReqCPU), "expected CPU request: %s, actual: %s", expectedReqCPU.String(), actualReqCPU.String()) + + // Check CPU limits + actualLimCPU := initContainer.Resources.Limits[corev1.ResourceCPU] + expectedLimCPU := resource.MustParse(tt.wantLimCPU) + require.Zero(t, expectedLimCPU.Cmp(actualLimCPU), "expected CPU limit: %s, actual: %s", expectedLimCPU.String(), actualLimCPU.String()) + + // Check Memory requests + actualReqMem := initContainer.Resources.Requests[corev1.ResourceMemory] + expectedReqMem := resource.MustParse(tt.wantReqMem) + require.Zero(t, expectedReqMem.Cmp(actualReqMem), "expected memory request: %s, actual: %s", expectedReqMem.String(), actualReqMem.String()) + + // Check Memory limits + actualLimMem := initContainer.Resources.Limits[corev1.ResourceMemory] + expectedLimMem := resource.MustParse(tt.wantLimMem) + require.Zero(t, expectedLimMem.Cmp(actualLimMem), "expected memory limit: %s, actual: %s", expectedLimMem.String(), actualLimMem.String()) + }) + } +} diff --git a/pkg/clusteragent/admission/mutate/autoinstrumentation/config.go b/pkg/clusteragent/admission/mutate/autoinstrumentation/config.go index d606e45609f60b..969a6fe341cff1 100644 --- a/pkg/clusteragent/admission/mutate/autoinstrumentation/config.go +++ b/pkg/clusteragent/admission/mutate/autoinstrumentation/config.go @@ -352,7 +352,10 @@ var ( minimumMemoryLimit resource.Quantity = resource.MustParse("100Mi") // 100 MB (recommended minimum by Alpine) ) -type initResourceRequirementConfiguration map[corev1.ResourceName]resource.Quantity +type initResourceRequirementConfiguration struct { + requests map[corev1.ResourceName]resource.Quantity + limits map[corev1.ResourceName]resource.Quantity +} // getOptionalBoolValue returns a pointer to a bool corresponding to the config value if the key is set in the config func getOptionalBoolValue(datadogConfig config.Component, key string) *bool { @@ -413,27 +416,79 @@ func getPinnedLibraries(libVersions map[string]string, registry string, checkDef } func initDefaultResources(datadogConfig config.Component) (initResourceRequirementConfiguration, error) { - conf := initResourceRequirementConfiguration{} + conf := initResourceRequirementConfiguration{ + requests: make(map[corev1.ResourceName]resource.Quantity), + limits: make(map[corev1.ResourceName]resource.Quantity), + } + + // Handle new separate requests/limits configuration + if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_requests.cpu") { + quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_requests.cpu")) + if err != nil { + return conf, err + } + conf.requests[corev1.ResourceCPU] = quantity + } + + if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_requests.memory") { + quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_requests.memory")) + if err != nil { + return conf, err + } + conf.requests[corev1.ResourceMemory] = quantity + } + if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_limits.cpu") { + quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_limits.cpu")) + if err != nil { + return conf, err + } + conf.limits[corev1.ResourceCPU] = quantity + } + + if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_limits.memory") { + quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_limits.memory")) + if err != nil { + return conf, err + } + conf.limits[corev1.ResourceMemory] = quantity + } + + // Handle legacy configuration (requests = limits) if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_resources.cpu") { quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_resources.cpu")) if err != nil { return conf, err } - conf[corev1.ResourceCPU] = quantity - } /* else { - conf[corev1.ResourceCPU] = *resource.NewMilliQuantity(minimumCPULimit, resource.DecimalSI) - }*/ + if _, exists := conf.requests[corev1.ResourceCPU]; !exists { + conf.requests[corev1.ResourceCPU] = quantity + } + if _, exists := conf.limits[corev1.ResourceCPU]; !exists { + conf.limits[corev1.ResourceCPU] = quantity + } + } if datadogConfig.IsSet("admission_controller.auto_instrumentation.init_resources.memory") { quantity, err := resource.ParseQuantity(datadogConfig.GetString("admission_controller.auto_instrumentation.init_resources.memory")) if err != nil { return conf, err } - conf[corev1.ResourceMemory] = quantity - } /*else { - conf[corev1.ResourceCPU] = *resource.NewMilliQuantity(minimumMemoryLimit, resource.DecimalSI) - }*/ + if _, exists := conf.requests[corev1.ResourceMemory]; !exists { + conf.requests[corev1.ResourceMemory] = quantity + } + if _, exists := conf.limits[corev1.ResourceMemory]; !exists { + conf.limits[corev1.ResourceMemory] = quantity + } + } + + // Validate limits >= requests + for resource, limit := range conf.limits { + if request, exists := conf.requests[resource]; exists { + if limit.Cmp(request) < 0 { + return conf, fmt.Errorf("init container %s limit (%s) must be greater than or equal to request (%s)", resource, limit.String(), request.String()) + } + } + } return conf, nil } diff --git a/pkg/config/setup/config.go b/pkg/config/setup/config.go index bd96231222bda6..ed688841f55f60 100644 --- a/pkg/config/setup/config.go +++ b/pkg/config/setup/config.go @@ -884,7 +884,7 @@ func InitConfig(config pkgconfigmodel.Setup) { config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.patcher.enabled", false) config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.patcher.fallback_to_file_provider", false) // to be enabled only in e2e tests config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.patcher.file_provider_path", "/etc/datadog-agent/patch/auto-instru.json") // to be used only in e2e tests - config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.inject_auto_detected_libraries", true) // allows injecting libraries for languages detected by automatic language detection feature + config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.inject_auto_detected_libraries", true) config.BindEnv("admission_controller.auto_instrumentation.init_resources.cpu") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' config.BindEnv("admission_controller.auto_instrumentation.init_resources.memory") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' config.BindEnv("admission_controller.auto_instrumentation.init_security_context") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' @@ -892,6 +892,10 @@ func InitConfig(config pkgconfigmodel.Setup) { config.BindEnv("admission_controller.auto_instrumentation.iast.enabled", "DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_IAST_ENABLED") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' // config for IAST which is implemented in the client libraries config.BindEnv("admission_controller.auto_instrumentation.asm_sca.enabled", "DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_APPSEC_SCA_ENABLED") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' // config for SCA config.BindEnv("admission_controller.auto_instrumentation.profiling.enabled", "DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_PROFILING_ENABLED") //nolint:forbidigo // TODO: replace by 'SetDefaultAndBindEnv' // config for profiling + config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.init_requests.cpu", "") + config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.init_requests.memory", "") + config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.init_limits.cpu", "") + config.BindEnvAndSetDefault("admission_controller.auto_instrumentation.init_limits.memory", "") config.BindEnvAndSetDefault("admission_controller.cws_instrumentation.enabled", false) config.BindEnvAndSetDefault("admission_controller.cws_instrumentation.pod_endpoint", "/inject-pod-cws") config.BindEnvAndSetDefault("admission_controller.cws_instrumentation.command_endpoint", "/inject-command-cws") diff --git a/releasenotes/notes/separate-init-container-requests-limits-38831-5974c05b90677e00.yaml b/releasenotes/notes/separate-init-container-requests-limits-38831-5974c05b90677e00.yaml new file mode 100644 index 00000000000000..f0ffdf936c6e8a --- /dev/null +++ b/releasenotes/notes/separate-init-container-requests-limits-38831-5974c05b90677e00.yaml @@ -0,0 +1,22 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +features: + - | + Add support for separate CPU/Memory requests and limits for auto-instrumentation init containers. + New environment variables allow independent configuration of requests and limits: + + * ``DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_INIT_REQUESTS_CPU`` + * ``DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_INIT_REQUESTS_MEMORY`` + * ``DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_INIT_LIMITS_CPU`` + * ``DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_INIT_LIMITS_MEMORY`` + + This resolves OOMKilled issues while maintaining scheduling efficiency by allowing + low requests for scheduling and higher limits for initialization spikes. + Existing ``DD_ADMISSION_CONTROLLER_AUTO_INSTRUMENTATION_INIT_RESOURCES_*`` + environment variables continue to work for backward compatibility.