Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Support resource retention policy for apps.
  • Loading branch information
jiangzho committed Apr 9, 2024
commit 91bc916728d8a820774233c4eef7260092d427c3
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,12 @@ spec:
maxExecutors:
type: integer
type: object
deleteOnTermination:
type: boolean
resourceRetentionPolicy:
enum:
- AlwaysDelete
- RetainOnFailure
- NeverDelete
type: string
type: object
driverSpec:
properties:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@ public class ApplicationTolerations {
@Builder.Default
protected InstanceConfig instanceConfig = new InstanceConfig();
/**
* If disabled, operator would not attempt to delete resources after app terminates.
* While this can be helpful in dev phase, it shall not be enabled for prod use cases.
* Caution: in order to avoid resource conflicts among multiple attempts, this can be disabled
* iff restart policy is set to Never.
* Configure operator to delete / retain resources for an app after it terminates.
* While retaining resources can be helpful in dev phase, it shall not be enabled (or shall be
* enabled with caution) for prod use cases: it could cause resource quota usage to increase
* unexpectedly.
* Caution: in order to avoid resource conflicts among multiple attempts, this should be set to
* 'AlwaysDelete' unless restart policy is set to 'Never'.
*/
@Builder.Default
protected Boolean deleteOnTermination = true;
protected ResourceRetentionPolicy resourceRetentionPolicy = ResourceRetentionPolicy.AlwaysDelete;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.spark.kubernetes.operator.spec;

/**
 * Policy that tells the operator whether to delete or retain the operator-created resources
 * of an application after the application terminates.
 */
public enum ResourceRetentionPolicy {
  /** Resources are never retained: they are released regardless of how the app ended. */
  AlwaysDelete,

  /** Resources are retained only when the app's current state summary is a failure. */
  RetainOnFailure,

  /** Resources are always retained, whether the app failed or not. */
  NeverDelete;
}
2 changes: 1 addition & 1 deletion spark-operator-docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ spec:
forceTerminationGracePeriodMillis: 300000
sparkSessionStartTimeoutMillis: 300000
terminationRequeuePeriodMillis: 2000
deleteOnTermination: false
resourceRetentionPolicy: RetainOnFailure
instanceConfig:
initExecutors: 0
maxExecutors: 0
Expand Down
23 changes: 18 additions & 5 deletions spark-operator-docs/spark_application.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,17 @@ applicationTimeoutConfig:
# time to wait for force delete resources at the end of attempt
forceTerminationGracePeriodMillis: 300000
```



| Field                                                                                   | Type    | Default Value | Description                                                                                                        |
|-----------------------------------------------------------------------------------------|---------|---------------|--------------------------------------------------------------------------------------------------------------------|
| .spec.applicationTolerations.applicationTimeoutConfig.driverStartTimeoutMillis          | integer | 300000        | Time to wait for the driver to reach running state after it is requested.                                          |
| .spec.applicationTolerations.applicationTimeoutConfig.executorStartTimeoutMillis | integer | 300000 | Time to wait for driver to acquire minimal number of running executors. |
| .spec.applicationTolerations.applicationTimeoutConfig.forceTerminationGracePeriodMillis | integer | 300000 | Time to wait for force delete resources at the end of attempt. |
| .spec.applicationTolerations.applicationTimeoutConfig.sparkSessionStartTimeoutMillis    | integer | 300000        | Time to wait for the driver to reach ready state.                                                                  |
| .spec.applicationTolerations.applicationTimeoutConfig.terminationRequeuePeriodMillis    | integer | 2000          | Back-off time when releasing resources needs to be re-attempted for application.                                   |


### Instance Config

Instance Config helps operator to decide whether an application is running healthy. When
Expand Down Expand Up @@ -191,12 +201,15 @@ On the other hand, when developing an application, it's possible to configure

```yaml
applicationTolerations:
deleteOnTermination: false
# Acceptable values are 'AlwaysDelete', 'RetainOnFailure', 'NeverDelete'
resourceRetentionPolicy: RetainOnFailure
```

So operator would not attempt to delete resources after app terminates. Note that this
applies only to operator-created resources (driver .etc). You may also want to tune
`spark.kubernetes.executor.deleteOnTermination` to control the behavior of driver-created
With this setting, the operator does not attempt to delete the driver pod and driver resources
if the app fails. Similarly, if `resourceRetentionPolicy` is set to `NeverDelete`, the operator
does not delete driver resources when the app ends. Note that this applies only to
operator-created resources (driver pod, SparkConf ConfigMap, etc.). You may also want to tune
`spark.kubernetes.driver.service.deleteOnTermination` and
`spark.kubernetes.executor.deleteOnTermination` to control the behavior of driver-created
resources.

## Supported Spark Versions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.spark.kubernetes.operator.reconciler.SparkApplicationReconcileUtils;
import org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils;
import org.apache.spark.kubernetes.operator.spec.ApplicationTolerations;
import org.apache.spark.kubernetes.operator.spec.ResourceRetentionPolicy;
import org.apache.spark.kubernetes.operator.spec.RestartPolicy;
import org.apache.spark.kubernetes.operator.status.ApplicationState;
import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
Expand All @@ -59,8 +60,10 @@ public ReconcileProgress reconcile(SparkApplicationContext context,
ApplicationStatus currentStatus = context.getSparkApplication().getStatus();
ApplicationTolerations tolerations =
context.getSparkApplication().getSpec().getApplicationTolerations();
ResourceRetentionPolicy resourceRetentionPolicy = tolerations.getResourceRetentionPolicy();
String stateMessage = null;
if (!tolerations.getDeleteOnTermination()) {

if (retainReleaseResource(resourceRetentionPolicy, currentStatus.getCurrentState())) {
if (tolerations.getRestartConfig() != null
&& !RestartPolicy.Never.equals(
tolerations.getRestartConfig().getRestartPolicy())) {
Expand Down Expand Up @@ -139,6 +142,18 @@ public ReconcileProgress reconcile(SparkApplicationContext context,

}

/**
 * Decides whether the resources of an application should be retained rather than released,
 * based on the configured retention policy and the application's current state.
 *
 * @param resourceRetentionPolicy retention policy configured for the application
 * @param currentState current state of the application; only consulted for
 *                     {@code RetainOnFailure}
 * @return {@code false} for {@code AlwaysDelete}; for {@code RetainOnFailure}, whether the
 *         current state summary is a failure; {@code true} for any other policy
 */
protected boolean retainReleaseResource(ResourceRetentionPolicy resourceRetentionPolicy,
                                        ApplicationState currentState) {
  final boolean retain;
  switch (resourceRetentionPolicy) {
    case RetainOnFailure:
      // Keep resources around only if the app ended up in a failure state.
      retain = currentState.getCurrentStateSummary().isFailure();
      break;
    case AlwaysDelete:
      retain = false;
      break;
    default:
      // Any remaining policy (NeverDelete) retains resources unconditionally.
      retain = true;
      break;
  }
  return retain;
}

private ReconcileProgress updateStateAndProceed(SparkApplicationContext context,
StatusRecorder statusRecorder,
ApplicationStatus updatedStatus,
Expand Down
2 changes: 1 addition & 1 deletion spark-operator/src/main/resources/spark-pi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ spec:
spark.kubernetes.namespace: "spark-test"
spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
applicationTolerations:
deleteOnTermination: false
resourceRetentionPolicy: RetainOnFailure
runtimeVersions:
scalaVersion: v2_12
sparkVersion: v3_5_1