diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..1f808de5
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,2 @@
+[*]
+insert_final_newline = true
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..617550d6
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,5 @@
+#
+# https://help.github.com/articles/dealing-with-line-endings/
+#
+# Linux start script should use lf
+/gradlew text eol=lf
diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml
index 26ac0c1e..09a4d377 100644
--- a/.github/.licenserc.yaml
+++ b/.github/.licenserc.yaml
@@ -9,6 +9,7 @@ header:
paths-ignore:
- '**/*.md'
- '**/.gitignore'
+ - '**/.gitattributes'
- '.github/**'
- 'dev/**'
- 'LICENSE'
@@ -16,5 +17,11 @@ header:
- '.asf.yaml'
- '**/*.gradle'
- gradlew
+ - '**/.helmignore'
+ - '**/EcsLayout.json'
+ - '.editorconfig'
+ - 'gradle/**'
+ - '**/sparkapplications.org.apache.spark-v1.yml'
+
comment: on-failure
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6a5a1475..732f482c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -26,4 +26,20 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
config: .github/.licenserc.yaml
-
+ test_ci:
+ name: "Test CI"
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ java-version: [ 11, 17, 21 ]
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v3
+ - name: Set up JDK ${{ matrix.java-version }}
+ uses: actions/setup-java@v2
+ with:
+ java-version: ${{ matrix.java-version }}
+ distribution: 'adopt'
+ - name: Build with Gradle
+ run: |
+ set -o pipefail; ./gradlew clean build; set +o pipefail
diff --git a/.gitignore b/.gitignore
index 5e0e9b6b..3cecc1cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,30 @@ build
dependencies.lock
**/dependencies.lock
gradle/wrapper/gradle-wrapper.jar
+
+# Compiled source #
+###################
+*.class
+*.dll
+*.exe
+*.o
+*.so
+*.pyc
+
+# Packages #
+############
+*.7z
+*.dmg
+*.gz
+*.iso
+*.rar
+*.tar
+*.zip
+
+# Logs and databases #
+######################
+*.log
+
+# Other build and generated files #
+###################################
+build-tools/helm/spark-kubernetes-operator/crds/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..4596f276
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+FROM gradle:8.7-jdk17-alpine AS builder
+ARG BASE_VERSION
+WORKDIR /app
+COPY . .
+RUN ./gradlew clean build -x test
+
+FROM eclipse-temurin:17-jre-jammy
+ARG BASE_VERSION
+
+ENV SPARK_OPERATOR_HOME=/opt/spark-operator
+ENV SPARK_OPERATOR_WORK_DIR=/opt/spark-operator/operator
+ENV BASE_VERSION=$BASE_VERSION
+ENV OPERATOR_JAR=spark-kubernetes-operator-$BASE_VERSION-all.jar
+
+WORKDIR $SPARK_OPERATOR_WORK_DIR
+
+RUN groupadd --system --gid=9999 spark && \
+ useradd --system --home-dir $SPARK_OPERATOR_HOME --uid=9999 --gid=spark spark
+
+COPY --from=builder /app/spark-operator/build/libs/$OPERATOR_JAR .
+COPY docker-entrypoint.sh .
+
+RUN chown -R spark:spark $SPARK_OPERATOR_HOME && \
+ chown spark:spark $OPERATOR_JAR && \
+ chown spark:spark docker-entrypoint.sh
+
+USER spark
+ENTRYPOINT ["/docker-entrypoint.sh"]
+CMD ["help"]
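+
+# Illustrative local build command (the tag and BASE_VERSION value are placeholders, not
+# fixed project conventions):
+#   docker build --build-arg BASE_VERSION=0.1.0 -t spark-kubernetes-operator:0.1.0 .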
diff --git a/README.md b/README.md
index 52a85bad..ae604944 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,44 @@
-# spark-kubernetes-operator
+# Spark-Kubernetes-Operator
+
+Welcome to the **Spark-Kubernetes-Operator**, a Kubernetes operator designed to simplify and
+automate the management of Spark applications in Kubernetes environments.
+
+## Project Status
+
+As of Apr 1, 2024, the Spark-Kubernetes-Operator is under active development.
+
+- We are actively working on new features and improvements, and we welcome contributions and
+ feedback to make the operator even better. Check out the **Issues** section to see what's
+ currently in progress or to suggest new features.
+- Current API Version: `v1alpha1`
+
+## Key Features
+
+- Deploy and monitor SparkApplications throughout their lifecycle
+- Start and stop Spark applications with a simple YAML schema
+- Spark version agnostic
+- Full logging and metrics integration
+- Flexible deployments and native integration with Kubernetes tooling
+
+Please refer to the [design](spark-operator-docs/architecture.md) section for architecture and
+design details.
+
+## Quickstart
+
+The [getting started doc](./spark-operator-docs/getting_started.md) gives an example of installing the
+operator and running Spark applications locally.
+
+In addition, the [SparkApplication](./spark-operator-docs/spark_application.md) section
+describes how to write your own applications, and the [Operations](./spark-operator-docs/operations.md)
+section describes how to install the operator with custom configuration overrides.
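+
+As a quick illustration (a minimal sketch: it assumes the chart's generated CRDs have been
+built, and the release name below is just an example), the operator can be installed from this
+repository with Helm and one of the bundled example applications submitted with `kubectl`:
+
+```bash
+# Install the operator chart from the local source tree
+helm install spark-kubernetes-operator build-tools/helm/spark-kubernetes-operator/
+
+# Submit a bundled example application
+kubectl apply -f e2e-tests/spark-apps/spark_3_5_1/spark-pi_scala_2.12.yaml
+
+# Watch the application resource
+kubectl get sparkapplications -w
+```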
+
+## Contributing
+
+You can learn more about how to contribute on the [Apache Spark website](https://spark.apache.org/contributing.html).
+
+## License
+
+The code in this repository is licensed under the [Apache Software License 2](./LICENSE).
diff --git a/build-tools/helm/spark-kubernetes-operator/.helmignore b/build-tools/helm/spark-kubernetes-operator/.helmignore
new file mode 100644
index 00000000..e37341e8
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/.helmignore
@@ -0,0 +1,18 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/build-tools/helm/spark-kubernetes-operator/Chart.yaml b/build-tools/helm/spark-kubernetes-operator/Chart.yaml
new file mode 100644
index 00000000..81b501bd
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/Chart.yaml
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+---
+apiVersion: v2
+name: spark-kubernetes-operator
+description: A Helm chart for the Apache Spark Kubernetes Operator
+type: application
+version: 0.1.0
+appVersion: 0.1.0
diff --git a/build-tools/helm/spark-kubernetes-operator/conf/log4j2.properties b/build-tools/helm/spark-kubernetes-operator/conf/log4j2.properties
new file mode 100644
index 00000000..f3ad671b
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/conf/log4j2.properties
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+status=debug
+strict=true
+dest=out
+name=PropertiesConfig
+property.filename=/opt/spark-operator/logs/spark-operator
+filter.threshold.type=ThresholdFilter
+filter.threshold.level=debug
+# console
+appender.console.type=Console
+appender.console.name=STDOUT
+appender.console.layout.type=PatternLayout
+appender.console.layout.pattern=%d %p %X %C{1.} [%t] %m%n
+appender.console.filter.threshold.type=ThresholdFilter
+appender.console.filter.threshold.level=info
+# rolling JSON
+appender.rolling.type=RollingFile
+appender.rolling.name=RollingFile
+appender.rolling.append=true
+appender.rolling.fileName=${filename}.log
+appender.rolling.filePattern=${filename}-%i.log.gz
+appender.rolling.layout.type=JsonTemplateLayout
+appender.rolling.layout.eventTemplateUri=classpath:EcsLayout.json
+appender.rolling.policies.type=Policies
+appender.rolling.policies.size.type=SizeBasedTriggeringPolicy
+appender.rolling.policies.size.size=100MB
+appender.rolling.strategy.type=DefaultRolloverStrategy
+appender.rolling.strategy.max=20
+appender.rolling.immediateFlush=true
+# chatty loggers
+rootLogger.level=all
+logger.netty.name=io.netty
+logger.netty.level=warn
+log4j2.contextSelector=org.apache.logging.log4j.core.async.AsyncLoggerContextSelector
+rootLogger.appenderRef.stdout.ref=STDOUT
+rootLogger.appenderRef.rolling.ref=RollingFile
diff --git a/build-tools/helm/spark-kubernetes-operator/conf/spark-operator.properties b/build-tools/helm/spark-kubernetes-operator/conf/spark-operator.properties
new file mode 100644
index 00000000..c852810b
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/conf/spark-operator.properties
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+# Property Overrides. e.g.
+# kubernetes.operator.reconcile.interval=15s
+# Enable this for hot property loading
+# spark.operator.dynamic.config.enabled=false
diff --git a/build-tools/helm/spark-kubernetes-operator/templates/_helpers.tpl b/build-tools/helm/spark-kubernetes-operator/templates/_helpers.tpl
new file mode 100644
index 00000000..cb442b18
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/templates/_helpers.tpl
@@ -0,0 +1,163 @@
+{{/*
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+*/}}
+
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "spark-operator.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "spark-operator.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "spark-operator.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "spark-operator.commonLabels" -}}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+helm.sh/chart: {{ include "spark-operator.chart" . }}
+{{- end }}
+
+{{/*
+Dynamic config labels
+*/}}
+{{- define "spark-operator.dynamicConfigLabels" -}}
+app.kubernetes.io/name: {{ include "spark-operator.name" . }}
+app.kubernetes.io/component: "operator-dynamic-config-overrides"
+{{ include "spark-operator.commonLabels" . }}
+{{- end }}
+
+{{/*
+Initial config labels
+*/}}
+{{- define "spark-operator.initialConfigLabels" -}}
+app.kubernetes.io/name: {{ include "spark-operator.name" . }}
+app.kubernetes.io/component: "operator-config"
+{{ include "spark-operator.commonLabels" . }}
+{{- end }}
+
+{{/*
+Deployment selector labels
+*/}}
+{{- define "spark-operator.deploymentSelectorLabels" -}}
+app.kubernetes.io/name: {{ include "spark-operator.name" . }}
+app.kubernetes.io/component: "operator-deployment"
+{{- end }}
+
+{{/*
+Create the path of the operator image to use
+*/}}
+{{- define "spark-operator.imagePath" -}}
+{{- if .Values.image.digest }}
+{{- .Values.image.repository }}@{{ .Values.image.digest }}
+{{- else }}
+{{- .Values.image.repository }}:{{ default .Chart.AppVersion .Values.image.tag }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the operator service account to use
+*/}}
+{{- define "spark-operator.serviceAccountName" -}}
+{{- if .Values.operatorRbac.serviceAccount.create }}
+{{- default (include "spark-operator.fullname" .) .Values.operatorRbac.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.operatorRbac.serviceAccount.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the operator dynamic configmap to use
+*/}}
+{{- define "spark-operator.dynamicConfigMapName" -}}
+{{- default (include "spark-operator.fullname" .) .Values.operatorConfiguration.dynamicConfig.name }}
+{{- end }}
+
+{{/*
+Default property overrides
+*/}}
+{{- define "spark-operator.defaultPropertyOverrides" -}}
+# Runtime resolved properties
+spark.operator.namespace={{ .Release.Namespace }}
+spark.operator.name={{- include "spark-operator.name" . }}
+spark.operator.dynamic.config.enabled={{ .Values.operatorConfiguration.dynamicConfig.create }}
+{{- if .Values.appResources.namespaces.watchGivenNamespacesOnly }}
+spark.operator.watched.namespaces={{- join "," .Values.appResources.namespaces.data }}
+{{- end }}
+{{- end }}
+
+{{/*
+Readiness Probe properties overrides
+*/}}
+{{- define "spark-operator.readinessProbe.failureThreshold" -}}
+{{- default 30 .Values.operatorDeployment.operatorPod.operatorContainer.probes.startupProbe.failureThreshold }}
+{{- end }}
+{{- define "spark-operator.readinessProbe.periodSeconds" -}}
+{{- default 10 .Values.operatorDeployment.operatorPod.operatorContainer.probes.startupProbe.periodSeconds }}
+{{- end }}
+
+{{/*
+Liveness Probe properties override
+*/}}
+{{- define "spark-operator.livenessProbe.initialDelaySeconds" -}}
+{{- default 30 .Values.operatorDeployment.operatorPod.operatorContainer.probes.livenessProbe.initialDelaySeconds }}
+{{- end }}
+{{- define "spark-operator.livenessProbe.periodSeconds" -}}
+{{- default 10 .Values.operatorDeployment.operatorPod.operatorContainer.probes.livenessProbe.periodSeconds }}
+{{- end }}
+
+{{/*
+Readiness Probe property overrides
+*/}}
+{{- define "spark-operator.probePort" -}}
+{{- default 18080 .Values.operatorDeployment.operatorPod.operatorContainer.probes.port }}
+{{- end }}
+
+{{/*
+Metrics port overrides
+*/}}
+{{- define "spark-operator.metricsPort" -}}
+{{- default 19090 .Values.operatorDeployment.operatorPod.operatorContainer.metrics.port }}
+{{- end }}
diff --git a/build-tools/helm/spark-kubernetes-operator/templates/rbac.yaml b/build-tools/helm/spark-kubernetes-operator/templates/rbac.yaml
new file mode 100644
index 00000000..0f8b8f09
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/templates/rbac.yaml
@@ -0,0 +1,124 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+{{/*
+RBAC rules used to create the operator (cluster)role
+*/}}
+{{- define "spark-operator.operatorRbacRules" }}
+rules:
+ - apiGroups:
+ - ""
+ resources:
+ - pods
+ - services
+ - configmaps
+ - persistentvolumeclaims
+ verbs:
+ - '*'
+ - apiGroups:
+ - "org.apache.spark"
+ resources:
+ - '*'
+ verbs:
+ - '*'
+{{- end }}
+
+{{/*
+Labels and annotations to be applied on rbacResources
+*/}}
+{{- define "spark-operator.rbacLabelsAnnotations" }}
+ labels:
+ {{- with .Values.operatorRbac.labels }}
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+ {{ include "spark-operator.commonLabels" . | nindent 4 }}
+ {{- with .Values.operatorRbac.annotations }}
+ annotations:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+{{- end }}
+
+---
+# Service account and role bindings for the operator
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ include "spark-operator.serviceAccountName" $ }}
+ namespace: {{ .Release.Namespace }}
+{{- template "spark-operator.rbacLabelsAnnotations" $ }}
+---
+{{- if .Values.operatorRbac.clusterRoleBinding.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: {{ .Values.operatorRbac.clusterRoleBinding.name }}
+{{- template "spark-operator.rbacLabelsAnnotations" $ }}
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: {{ .Values.operatorRbac.clusterRole.name }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ include "spark-operator.serviceAccountName" $ }}
+ namespace: {{ .Release.Namespace }}
+{{- end }}
+---
+{{- if .Values.operatorRbac.clusterRole.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: {{ .Values.operatorRbac.clusterRole.name }}
+ namespace: {{ .Release.Namespace }}
+{{- template "spark-operator.rbacLabelsAnnotations" $ }}
+{{- template "spark-operator.operatorRbacRules" $ }}
+{{- end }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: {{ .Values.operatorRbac.configManagement.roleName }}
+ namespace: {{ .Release.Namespace }}
+{{- template "spark-operator.rbacLabelsAnnotations" $ }}
+rules:
+ - apiGroups:
+ - ""
+ resources:
+ - configmaps
+ verbs:
+ - '*'
+ - apiGroups:
+ - coordination.k8s.io
+ resources:
+ - leases
+ verbs:
+ - "*"
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: {{ .Values.operatorRbac.configManagement.roleBindingName }}
+ namespace: {{ .Release.Namespace }}
+{{- template "spark-operator.rbacLabelsAnnotations" $ }}
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: Role
+ name: {{ .Values.operatorRbac.configManagement.roleName }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ include "spark-operator.serviceAccountName" $ }}
+ namespace: {{ .Release.Namespace }}
diff --git a/build-tools/helm/spark-kubernetes-operator/templates/spark-operator.yaml b/build-tools/helm/spark-kubernetes-operator/templates/spark-operator.yaml
new file mode 100644
index 00000000..91f64da0
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/templates/spark-operator.yaml
@@ -0,0 +1,211 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ include "spark-operator.name" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "spark-operator.deploymentSelectorLabels" . | nindent 4 }}
+ {{- include "spark-operator.commonLabels" . | nindent 4 }}
+spec:
+ replicas: {{ .Values.operatorDeployment.replicas }}
+ revisionHistoryLimit: 2
+ strategy:
+ {{- toYaml .Values.operatorDeployment.strategy | nindent 4 }}
+ selector:
+ matchLabels:
+ {{- include "spark-operator.deploymentSelectorLabels" . | nindent 6 }}
+ template:
+ metadata:
+ labels:
+ {{- include "spark-operator.deploymentSelectorLabels" . | nindent 8 }}
+ {{- if index (.Values.operatorDeployment.operatorPod) "labels" }}
+ {{- with .Values.operatorDeployment.operatorPod.labels }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- end }}
+ annotations:
+ kubectl.kubernetes.io/default-container: {{ .Chart.Name }}
+ {{- if index (.Values.operatorDeployment.operatorPod) "annotations" }}
+ {{- with .Values.operatorDeployment.operatorPod.annotations }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- end }}
+ spec:
+ {{- with .Values.operatorDeployment.operatorPod.priorityClassName }}
+ priorityClassName: {{ . }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.securityContext }}
+ securityContext:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- if .Values.operatorDeployment.operatorPod.nodeSelector }}
+ nodeSelector: {{ toYaml .Values.operatorDeployment.operatorPod.nodeSelector | nindent 8 }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.tolerations }}
+ tolerations:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.affinity }}
+ affinity:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- with .Values.imagePullSecrets }}
+ imagePullSecrets:
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ serviceAccountName: {{ include "spark-operator.serviceAccountName" $ }}
+ {{- if .Values.operatorDeployment.operatorPod.topologySpreadConstraints }}
+ topologySpreadConstraints: {{ toYaml .Values.operatorDeployment.operatorPod.topologySpreadConstraints | nindent 8 }}
+ {{- end }}
+ containers:
+ - name: {{ .Chart.Name }}
+ image: {{ include "spark-operator.imagePath" . }}
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+ command: [ "./docker-entrypoint.sh", "operator" ]
+ ports:
+ - containerPort: {{ include "spark-operator.probePort" . }}
+ name: probe-port
+ - containerPort: {{ include "spark-operator.metricsPort" . }}
+ name: metrics-port
+ env:
+ - name: OPERATOR_NAMESPACE
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.namespace
+ - name: HOST_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.hostIP
+ - name: POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ - name: POD_NAME
+ valueFrom:
+ fieldRef:
+ fieldPath: metadata.name
+ - name: OPERATOR_NAME
+ value: {{ include "spark-operator.name" . }}
+ - name: LOG_CONFIG
+ value: -Dlog4j.configurationFile=/opt/spark-operator/conf/log4j2.properties
+ - name: OPERATOR_JAVA_OPTS
+ value: {{ .Values.operatorDeployment.operatorPod.operatorContainer.jvmArgs }}
+ {{- with .Values.operatorDeployment.operatorPod.operatorContainer.env }}
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.operatorContainer.envFrom }}
+ envFrom:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.operatorContainer.resources }}
+ resources:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ readinessProbe:
+ httpGet:
+ port: probe-port
+ path: /readyz
+ failureThreshold: {{ include "spark-operator.readinessProbe.failureThreshold" . }}
+ periodSeconds: {{ include "spark-operator.readinessProbe.periodSeconds" . }}
+ livenessProbe:
+ httpGet:
+ port: probe-port
+ path: /healthz
+ initialDelaySeconds: {{ include "spark-operator.livenessProbe.initialDelaySeconds" . }}
+ periodSeconds: {{ include "spark-operator.livenessProbe.periodSeconds" . }}
+ {{- with .Values.operatorDeployment.operatorPod.operatorContainer.securityContext }}
+ securityContext:
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ volumeMounts:
+ - name: spark-operator-config-volume
+ mountPath: /opt/spark-operator/conf
+ - name: logs-volume
+ mountPath: /opt/spark-operator/logs
+ {{- with .Values.operatorDeployment.operatorPod.operatorContainer.volumeMounts }}
+ {{- toYaml . | nindent 12 }}
+ {{- end }}
+ {{- with .Values.operatorDeployment.operatorPod.additionalContainers }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- if index (.Values.operatorDeployment.operatorPod) "dnsPolicy" }}
+ dnsPolicy: {{ .Values.operatorDeployment.operatorPod.dnsPolicy | quote }}
+ {{- end }}
+ {{- if index (.Values.operatorDeployment.operatorPod) "dnsConfig" }}
+ dnsConfig:
+ {{- with .Values.operatorDeployment.operatorPod.dnsConfig }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+ {{- end }}
+ restartPolicy: Always
+ volumes:
+ - name: spark-operator-config-volume
+ configMap:
+ name: spark-kubernetes-operator-configuration
+ - name: logs-volume
+ emptyDir: { }
+ {{- with .Values.operatorDeployment.operatorPod.volumes }}
+ {{- toYaml . | nindent 8 }}
+ {{- end }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: spark-kubernetes-operator-configuration
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "spark-operator.initialConfigLabels" . | nindent 4 }}
+data:
+ log4j2.properties: |+
+{{- if .Values.operatorConfiguration.append }}
+ {{- $.Files.Get "conf/log4j2.properties" | nindent 4 -}}
+{{- end }}
+{{- if index (.Values.operatorConfiguration) "log4j2.properties" }}
+ {{- index (.Values.operatorConfiguration) "log4j2.properties" | nindent 4 -}}
+{{- end }}
+ spark-operator.properties: |+
+ {{- include "spark-operator.defaultPropertyOverrides" . | nindent 4 }}
+{{- if .Values.operatorConfiguration.append }}
+ {{- $.Files.Get "conf/spark-operator.properties" | nindent 4 -}}
+{{- end }}
+{{- if index (.Values.operatorConfiguration) "spark-operator.properties" }}
+ {{- index (.Values.operatorConfiguration) "spark-operator.properties" | nindent 4 -}}
+{{- end }}
+ metrics.properties: |+
+{{- if index (.Values.operatorConfiguration) "metrics.properties" }}
+ {{- index (.Values.operatorConfiguration) "metrics.properties" | nindent 4 -}}
+{{- end }}
+---
+{{- if .Values.operatorConfiguration.dynamicConfig.create }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ include "spark-operator.dynamicConfigMapName" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ {{- include "spark-operator.dynamicConfigLabels" . | nindent 4 }}
+ annotations:
+ {{- toYaml .Values.operatorConfiguration.dynamicConfig.annotations | nindent 4 }}
+{{- with .Values.operatorConfiguration.dynamicConfig.data }}
+data:
+ {{- toYaml . | nindent 2 }}
+{{- end }}
+{{- end }}
diff --git a/build-tools/helm/spark-kubernetes-operator/templates/sparkapps-resource.yaml b/build-tools/helm/spark-kubernetes-operator/templates/sparkapps-resource.yaml
new file mode 100644
index 00000000..cd598cdd
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/templates/sparkapps-resource.yaml
@@ -0,0 +1,216 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+{{/*
+RBAC rules used to create the app (cluster)role based on the scope
+*/}}
+{{- define "spark-operator.appRbacRules" }}
+rules:
+ - apiGroups:
+ - ""
+ resources:
+ - pods
+ - services
+ - configmaps
+ - persistentvolumeclaims
+ verbs:
+ - '*'
+{{- end }}
+
+{{/*
+RoleRef for app service account rolebindings
+*/}}
+{{- define "spark-operator.appRoleRef" }}
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+{{- if .Values.appResources.clusterRole.create }}
+ kind: ClusterRole
+ name: {{ .Values.appResources.clusterRole.name }}
+{{- else if .Values.appResources.roles.create }}
+ kind: Role
+ name: {{ .Values.appResources.roles.name }}
+{{- else }}
+ kind: ClusterRole
+ name: {{ .Values.operatorRbac.clusterRole.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Labels and annotations to be applied
+*/}}
+{{- define "spark-operator.appLabels" }}
+ {{- with .Values.appResources.labels }}
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+ {{ include "spark-operator.commonLabels" . | nindent 4 }}
+{{- end }}
+
+{{- define "spark-operator.appAnnotations" }}
+ {{- with .Values.appResources.annotations }}
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
+{{- end }}
+
+{{- define "spark-operator.appLabelsAnnotations" }}
+ labels:
+ {{ template "spark-operator.appLabels" $ }}
+ annotations:
+ {{ template "spark-operator.appAnnotations" $ }}
+{{- end }}
+---
+{{- $appResources := .Values.appResources -}}
+{{- $systemNs := .Release.Namespace -}}
+{{- $operatorRbac := .Values.operatorRbac -}}
+{{- if index (.Values.appResources.namespaces) "data" }}
+{{- range $appNs := .Values.appResources.namespaces.data }}
+{{- if $appResources.namespaces.create }}
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: {{ $appNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+---
+{{- end }}
+{{- if $appResources.serviceAccounts.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ $appResources.serviceAccounts.name }}
+ namespace: {{ $appNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+---
+{{- end }}
+{{- if $appResources.roles.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: {{ $appResources.roles.name }}
+ namespace: {{ $appNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+{{- template "spark-operator.appRbacRules" $ }}
+---
+{{- end }}
+{{- if $appResources.roleBindings.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: {{ $appResources.roleBindings.name }}
+ namespace: {{ $appNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+{{- template "spark-operator.appRoleRef" $ }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ $appResources.serviceAccounts.name }}
+ namespace: {{ $appNs }}
+---
+{{- end }}
+{{- if not $operatorRbac.clusterRoleBinding.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: spark-operator-rolebinding
+ namespace: {{ $appNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: {{ $operatorRbac.clusterRole.name }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ $operatorRbac.serviceAccount.name }}
+ namespace: {{ $systemNs }}
+---
+{{- end }}
+{{- end }}
+{{- else }}
+{{- if $appResources.serviceAccounts.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: {{ $appResources.serviceAccounts.name }}
+ namespace: {{ $systemNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+---
+{{- end }}
+{{- if $appResources.roles.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: {{ $appResources.roles.name }}
+ namespace: {{ $systemNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+{{- template "spark-operator.appRbacRules" $ }}
+---
+{{- end }}
+{{- if $appResources.roleBindings.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: {{ $appResources.serviceAccounts.name }}
+ namespace: {{ $systemNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+{{- template "spark-operator.appRoleRef" $ }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ $appResources.serviceAccounts.name }}
+ namespace: {{ $systemNs }}
+---
+{{- end }}
+{{- if not $operatorRbac.clusterRoleBinding.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: spark-operator-rolebinding
+ namespace: {{ $systemNs }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: {{ $operatorRbac.clusterRole.name }}
+subjects:
+ - kind: ServiceAccount
+ name: {{ $operatorRbac.serviceAccount.name }}
+ namespace: {{ $systemNs }}
+---
+{{- end }}
+{{- end }}
+
+{{- if $appResources.clusterRole.create }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: {{ $appResources.clusterRole.name }}
+{{- template "spark-operator.appLabelsAnnotations" $ }}
+{{- template "spark-operator.appRbacRules" $ }}
+---
+{{- end }}
+{{- if $appResources.sparkApplicationSentinel.create }}
+{{- range $sentinelNs := .Values.appResources.sparkApplicationSentinel.sentinelNamespaces.data }}
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: {{ $appResources.sparkApplicationSentinel.name }}
+ namespace: {{ $sentinelNs }}
+ labels:
+ "spark.operator/sentinel": "true"
+ {{- template "spark-operator.appLabels" $ }}
+ annotations:
+ {{- template "spark-operator.appAnnotations" $ }}
+{{- end }}
+---
+{{- end }}
diff --git a/build-tools/helm/spark-kubernetes-operator/values.yaml b/build-tools/helm/spark-kubernetes-operator/values.yaml
new file mode 100644
index 00000000..567d39bf
--- /dev/null
+++ b/build-tools/helm/spark-kubernetes-operator/values.yaml
@@ -0,0 +1,178 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+image:
+ repository: spark-kubernetes-operator
+ pullPolicy: IfNotPresent
+ # tag: latest
+ # If image digest is set then it takes precedence and the image tag will be ignored
+ # digest: ""
+
+imagePullSecrets: [ ]
+
+operatorDeployment:
+ # Replicas must be 1
+ replicas: 1
+ # Strategy type must be 'Recreate' unless leader election is configured
+ strategy:
+ type: Recreate
+ operatorPod:
+ priorityClassName: null
+ annotations: { }
+ labels: { }
+ affinity: { }
+ nodeSelector: { }
+ # Node tolerations for operator pod assignment
+ # https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
+ tolerations: [ ]
+ # Topology spread constraints
+ # https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
+ topologySpreadConstraints: [ ]
+ operatorContainer:
+ jvmArgs: "-XX:+UseG1GC -Xms3G -Xmx3G -Dfile.encoding=UTF8"
+ env:
+ envFrom:
+ volumeMounts: { }
+ resources:
+ limits:
+ cpu: "1"
+ ephemeral-storage: 2Gi
+ memory: 4Gi
+ requests:
+ cpu: "1"
+ ephemeral-storage: 2Gi
+ memory: 4Gi
+ probes:
+ port: 18080
+ livenessProbe:
+ periodSeconds: 10
+ initialDelaySeconds: 30
+ startupProbe:
+ failureThreshold: 30
+ periodSeconds: 10
+ metrics:
+ port: 19090
+ securityContext:
+ allowPrivilegeEscalation: false
+ capabilities:
+ drop:
+ - ALL
+ runAsNonRoot: true
+ runAsUser: 9999
+ seccompProfile:
+ type: RuntimeDefault
+ additionalContainers: { }
+ # additionalContainers:
+ # - name: ""
+ # image: ""
+ volumes: { }
+ # volumes:
+ # - name: spark-artifacts
+ # hostPath:
+ # path: /tmp/spark/artifacts
+ # type: DirectoryOrCreate
+ securityContext: { }
+ dnsPolicy:
+ dnsConfig:
+
+operatorRbac:
+ serviceAccount:
+ create: true
+ name: "spark-operator"
+ # If disabled, a Role would be created inside each app namespace for app operations
+ clusterRole:
+ create: true
+ name: "spark-operator-clusterrole"
+ # If disabled, a RoleBinding would be created inside each app namespace for app operations
+ clusterRoleBinding:
+ create: true
+ name: "spark-operator-clusterrolebinding"
+ configManagement:
+ roleName: "spark-operator-config-role"
+ roleBindingName: "spark-operator-config-role-binding"
+
+appResources:
+ # Create namespace(s), service account(s) and rolebinding(s) for SparkApps, if configured
+ # Operator would act at cluster level by default if no app namespace(s) are provided
+ namespaces:
+ create: true
+ # When enabled, operator would by default only watch namespace(s) provided in data field
+ watchGivenNamespacesOnly: false
+ data:
+ # - "spark-demo"
+ # - "spark-demo"
+ serviceAccounts:
+ create: true
+ name: "spark"
+ roleBindings:
+ create: true
+ name: "spark-app-rolebinding"
+ roles:
+ # if enabled, a role would be created in each app namespace for Spark apps
+ create: false
+ name: "spark-app-role"
+ clusterRole:
+ # if enabled, a clusterrole would be created for Spark app service accounts to use
+ # If neither role nor clusterrole is enabled: Spark app would use the same access as operator
+ create: false
+ name: "spark-app-cluster-role"
+ sparkApplicationSentinel:
+ create: false
+ name: "spark-app-sentinel"
+ sentinelNamespaces:
+ data:
+ # When enabled, sentinel resources will be deployed to namespace(s) provided in data field.
+ # Note that sentinelNamespaces list shall be a subset of appResources.namespaces.data.
+ # - "spark-demo"
+ # App resources are by default annotated to avoid app abort due to operator upgrade
+ annotations:
+ # "helm.sh/resource-policy": keep
+ # labels to be added on app resources
+ labels:
+ "app.kubernetes.io/component": "spark-apps"
+
+operatorConfiguration:
+ # If set to true, the conf files & properties below are appended to the default conf.
+ # Otherwise, they override the default properties.
+ append: true
+ log4j2.properties: |+
+ # Logging Overrides
+ # rootLogger.level=DEBUG
+ spark-operator.properties: |+
+ # Property Overrides.
+ #
+ # e.g. to watch namespace 'spark' and 'default' only, instead of
+ # the cluster, use
+ # spark.operator.watched.namespaces=spark,default
+ # When deployed via Helm, please note that the value of spark.operator.watched.namespaces
+ # should be a subset of .Values.appResources.namespaces.data so that the app namespaces are
+ # properly configured by Helm before the operator starts.
+ #
+ # Enable this for hot property loading
+ # spark.operator.dynamic.config.enabled=false
+ metrics.properties: |+
+ # Metrics Properties Overrides
+ dynamicConfig:
+ # If set to true, a ConfigMap is created and watched by the operator as the source of truth
+ # for hot property loading.
+ create: false
+ name: spark-kubernetes-operator-dynamic-configuration
+ annotations:
+ # "helm.sh/resource-policy": keep
+ data:
+ # Spark Operator Config Runtime Properties Overrides
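+
+# Illustrative install with value overrides (the release name and chosen values are examples
+# only, not required settings):
+#   helm install spark-kubernetes-operator build-tools/helm/spark-kubernetes-operator/ \
+#     --set image.tag=0.1.0 --set operatorConfiguration.dynamicConfig.create=true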
diff --git a/build.gradle b/build.gradle
index 6732f5a2..d0134304 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,3 +1,16 @@
+buildscript {
+ repositories {
+ maven {
+ url = uri("https://plugins.gradle.org/m2/")
+ }
+ }
+ dependencies {
+ classpath "com.github.spotbugs.snom:spotbugs-gradle-plugin:${spotBugsGradlePluginVersion}"
+ }
+}
+
+assert JavaVersion.current().isJava11Compatible(): "Java 11 or newer is required"
+
subprojects {
apply plugin: 'idea'
apply plugin: 'eclipse'
@@ -6,7 +19,48 @@ subprojects {
targetCompatibility = 17
repositories {
- mavenCentral()
- jcenter()
+ mavenCentral()
+ jcenter()
+ }
+
+ configurations.all {
+ resolutionStrategy {
+ force "org.slf4j:slf4j-api:$slf4jVersion"
+ force "io.fabric8:kubernetes-model-core:$fabric8Version"
+ }
+ }
+
+ apply plugin: 'checkstyle'
+ checkstyle {
+ toolVersion = checkstyleVersion
+ configFile = file("$rootDir/config/checkstyle/checkstyle.xml")
+ ignoreFailures = false
+ showViolations = true
+ }
+
+ apply plugin: 'pmd'
+ pmd {
+ ruleSets = ["java-basic", "java-braces"]
+ ruleSetFiles = files("$rootDir/config/pmd/ruleset.xml")
+ toolVersion = pmdVersion
+ consoleOutput = true
+ ignoreFailures = false
+ }
+
+ apply plugin: 'com.github.spotbugs'
+ spotbugs {
+ toolVersion = spotBugsVersion
+ afterEvaluate {
+ reportsDir = file("${project.reporting.baseDir}/findbugs")
+ }
+ ignoreFailures = false
+ }
+
+ apply plugin: 'jacoco'
+ jacoco {
+ toolVersion = jacocoVersion
+ }
+ jacocoTestReport {
+ dependsOn test
}
}
diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml
new file mode 100644
index 00000000..aefd442b
--- /dev/null
+++ b/config/checkstyle/checkstyle.xml
@@ -0,0 +1,195 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/config/pmd/ruleset.xml b/config/pmd/ruleset.xml
new file mode 100644
index 00000000..689ef744
--- /dev/null
+++ b/config/pmd/ruleset.xml
@@ -0,0 +1,33 @@
+
+
+
+
+
+ Spark Operator Ruleset
+
+
+
+
+
+
+
+
+
+ .*/src/generated/.*
+
diff --git a/dev/.rat-excludes b/dev/.rat-excludes
index a24671bc..c4845e69 100644
--- a/dev/.rat-excludes
+++ b/dev/.rat-excludes
@@ -12,3 +12,10 @@ LICENSE
NOTICE
TAGS
RELEASE
+build
+.helmignore
+.editorconfig
+gradle
+build.gradle
+gradlew
+sparkapplications.org.apache.spark-v1.yml
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
new file mode 100755
index 00000000..62d2261d
--- /dev/null
+++ b/docker-entrypoint.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+args=("$@")
+
+if [ "$1" = "help" ]; then
+ printf "Usage: $(basename "$0") (operator)\n"
+ printf " Or $(basename "$0") help\n\n"
+ exit 0
+elif [ "$1" = "operator" ]; then
+ echo "Starting Operator"
+
+ exec java -cp "./$OPERATOR_JAR" $LOG_CONFIG $OPERATOR_JAVA_OPTS org.apache.spark.kubernetes.operator.SparkOperator
+fi
+
+args=("${args[@]}")
+
+# Running command in pass-through mode
+exec "${args[@]}"
diff --git a/e2e-tests/spark-apps/spark_3_5_1/pyspark-example.yaml b/e2e-tests/spark-apps/spark_3_5_1/pyspark-example.yaml
new file mode 100644
index 00000000..950a7018
--- /dev/null
+++ b/e2e-tests/spark-apps/spark_3_5_1/pyspark-example.yaml
@@ -0,0 +1,31 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: py-spark-pi-351
+spec:
+ pyFiles: "local:///opt/spark/examples/src/main/python/pi.py"
+ sparkConf:
+ spark.executor.instances: "2"
+ # see also https://hub.docker.com/_/spark
+ spark.kubernetes.container.image: "spark:3.5.1-scala2.12-java17-python3-ubuntu"
+ spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
+ runtimeVersions:
+ sparkVersion: v3_5_1
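+
+# Illustrative submission (assumes the operator and the "spark" service account are already
+# installed in the target namespace):
+#   kubectl apply -f pyspark-example.yaml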
diff --git a/e2e-tests/spark-apps/spark_3_5_1/spark-pi_scala_2.12.yaml b/e2e-tests/spark-apps/spark_3_5_1/spark-pi_scala_2.12.yaml
new file mode 100644
index 00000000..53a5a389
--- /dev/null
+++ b/e2e-tests/spark-apps/spark_3_5_1/spark-pi_scala_2.12.yaml
@@ -0,0 +1,33 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: spark-pi-351-212
+spec:
+ mainClass: "org.apache.spark.examples.SparkPi"
+ jars: "local:///opt/spark/examples/jars/spark-examples_2.12-3.5.1.jar"
+ sparkConf:
+ spark.executor.instances: "2"
+ # see also https://hub.docker.com/_/spark
+ spark.kubernetes.container.image: "spark:3.5.1-scala2.12-java17-python3-ubuntu"
+ spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
+ runtimeVersions:
+ scalaVersion: v2_12
+ sparkVersion: v3_5_1
diff --git a/e2e-tests/spark-apps/spark_3_5_1/sparkr-example.yaml b/e2e-tests/spark-apps/spark_3_5_1/sparkr-example.yaml
new file mode 100644
index 00000000..08414459
--- /dev/null
+++ b/e2e-tests/spark-apps/spark_3_5_1/sparkr-example.yaml
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: sparkr-example-351
+spec:
+ driverSpec:
+ podTemplateSpec:
+ metadata:
+ spec:
+ containers:
+ - name: driver
+ workingDir: /opt/spark
+ executorSpec:
+ podTemplateSpec:
+ metadata:
+ spec:
+ containers:
+ - name: executor
+ workingDir: /opt/spark
+ sparkRFiles: "local:///opt/spark/examples/src/main/r/dataframe.R"
+ sparkConf:
+ spark.executor.instances: "1"
+ # see also https://hub.docker.com/_/spark
+ spark.kubernetes.container.image: "spark:3.5.1-java17-r"
+ spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
+ spark.home: "/opt/spark"
+ runtimeVersions:
+ sparkVersion: v3_5_1
diff --git a/gradle.properties b/gradle.properties
new file mode 100644
index 00000000..a7fe26e1
--- /dev/null
+++ b/gradle.properties
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+group=org.apache.spark.kubernetes.operator
+version=0.1.0
+commonsLang3Version=3.12.0
+commonsIOVersion=2.11.0
+commonsConfigurationVersion=2.9.0
+dropwizardMetricsVersion=4.2.25
+# Caution: fabric8 version should be aligned with Spark dependency
+fabric8Version=6.7.2
+lombokVersion=1.18.30
+operatorSDKVersion=4.7.0
+okHttpVersion=4.11.0
+# Spark
+sparkVersion=3.5.1
+sparkScalaVersion=2.12
+# Logging
+slf4jVersion=1.7.36
+log4jVersion=2.23.1
+log4jLayoutVersion=2.17.1
+# Test
+junitVersion=5.9.2
+mockitoVersion=5.10.0
+jacocoVersion=0.8.11
+# Build
+checkstyleVersion=10.8.1
+pmdVersion=6.55.0
+spotBugsGradlePluginVersion=5.2.5
+spotBugsVersion=4.2.3
+shadowJarPluginVersion=8.1.1
diff --git a/gradlew.bat b/gradlew.bat
new file mode 100644
index 00000000..25da30db
--- /dev/null
+++ b/gradlew.bat
@@ -0,0 +1,92 @@
+@rem
+@rem Copyright 2015 the original author or authors.
+@rem
+@rem Licensed under the Apache License, Version 2.0 (the "License");
+@rem you may not use this file except in compliance with the License.
+@rem You may obtain a copy of the License at
+@rem
+@rem https://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing, software
+@rem distributed under the License is distributed on an "AS IS" BASIS,
+@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@rem See the License for the specific language governing permissions and
+@rem limitations under the License.
+@rem
+
+@if "%DEBUG%"=="" @echo off
+@rem ##########################################################################
+@rem
+@rem Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%"=="" set DIRNAME=.
+@rem This is normally unused
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Resolve any "." and ".." in APP_HOME to make it shorter.
+for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if %ERRORLEVEL% equ 0 goto execute
+
+echo. 1>&2
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto execute
+
+echo. 1>&2
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
+
+goto fail
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
+
+:end
+@rem End local scope for the variables with windows NT shell
+if %ERRORLEVEL% equ 0 goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+set EXIT_CODE=%ERRORLEVEL%
+if %EXIT_CODE% equ 0 set EXIT_CODE=1
+if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
+exit /b %EXIT_CODE%
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/settings.gradle b/settings.gradle
new file mode 100644
index 00000000..ca9eb466
--- /dev/null
+++ b/settings.gradle
@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+rootProject.name = 'apache-spark-kubernetes-operator'
+
+include 'spark-operator-api'
+include 'spark-operator'
+include 'spark-submission-worker'
+include 'spark-operator-tests'
diff --git a/spark-operator-api/build.gradle b/spark-operator-api/build.gradle
new file mode 100644
index 00000000..4c32ce4f
--- /dev/null
+++ b/spark-operator-api/build.gradle
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+dependencies {
+ // fabric8
+ implementation("io.fabric8:kubernetes-client:$fabric8Version") {
+ exclude group: 'com.squareup.okhttp3'
+ }
+ compileOnly("io.fabric8:crd-generator-apt:$fabric8Version")
+ annotationProcessor("io.fabric8:crd-generator-apt:$fabric8Version")
+
+ // utils
+ implementation("org.apache.commons:commons-lang3:$commonsLang3Version")
+ implementation("commons-io:commons-io:$commonsIOVersion")
+ implementation("org.projectlombok:lombok:$lombokVersion")
+ annotationProcessor("org.projectlombok:lombok:$lombokVersion")
+
+ // logging
+ implementation("org.apache.logging.log4j:log4j-slf4j-impl:$log4jVersion")
+ implementation("org.apache.logging.log4j:log4j-core:$log4jVersion")
+
+ testImplementation platform("org.junit:junit-bom:$junitVersion")
+ testImplementation 'org.junit.jupiter:junit-jupiter'
+}
+
+test {
+ useJUnitPlatform()
+}
+
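+// Post-process the CRD generated by the fabric8 crd-generator (update printer columns via
+// printer-columns.sh) and copy it into the Helm chart's crds/ directory.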
+task finalizeGeneratedCRD(type: Exec, dependsOn: jar) {
+ println "Updating PrinterColumns for generated CRD"
+ commandLine 'sh', './src/main/resources/printer-columns.sh'
+}
+
+task copyGeneratedCRD(type: Copy, dependsOn: finalizeGeneratedCRD) {
+ from "build/classes/java/main/META-INF/fabric8/sparkapplications.org.apache.spark-v1.yml"
+ into "../build-tools/helm/spark-kubernetes-operator/crds"
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/BaseResource.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/BaseResource.java
new file mode 100644
index 00000000..695ecfc0
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/BaseResource.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+import io.fabric8.kubernetes.api.model.Namespaced;
+import io.fabric8.kubernetes.client.CustomResource;
+
+import org.apache.spark.kubernetes.operator.spec.BaseSpec;
+import org.apache.spark.kubernetes.operator.status.BaseAttemptSummary;
+import org.apache.spark.kubernetes.operator.status.BaseState;
+import org.apache.spark.kubernetes.operator.status.BaseStateSummary;
+import org.apache.spark.kubernetes.operator.status.BaseStatus;
+
+public class BaseResource<STATE_SUMMARY extends BaseStateSummary,
+    STATE extends BaseState<STATE_SUMMARY>, ATTEMPT_SUMMARY extends BaseAttemptSummary,
+    SPEC extends BaseSpec, STATUS extends BaseStatus<STATE_SUMMARY, STATE, ATTEMPT_SUMMARY>>
+    extends CustomResource<SPEC, STATUS> implements Namespaced {
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/Constants.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/Constants.java
new file mode 100644
index 00000000..c7b88642
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/Constants.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+public class Constants {
+ public static final String API_GROUP = "org.apache.spark";
+ public static final String API_VERSION = "v1alpha1";
+ public static final String LABEL_SPARK_APPLICATION_NAME = "spark.operator/spark-app-name";
+ public static final String LABEL_SPARK_OPERATOR_NAME = "spark.operator/name";
+ public static final String LABEL_RESOURCE_NAME = "app.kubernetes.io/name";
+ public static final String LABEL_SPARK_ROLE_NAME = "spark-role";
+ public static final String LABEL_SPARK_ROLE_DRIVER_VALUE = "driver";
+ public static final String LABEL_SPARK_ROLE_EXECUTOR_VALUE = "executor";
+ public static final String SPARK_CONF_SENTINEL_DUMMY_FIELD = "sentinel.dummy.number";
+
+ public static final String SENTINEL_LABEL = "spark.operator/sentinel";
+
+ // Default state messages
+ public static final String DriverRequestedMessage = "Requested driver from resource scheduler. ";
+ public static final String DriverCompletedMessage = "Spark application completed successfully. ";
+ public static final String DriverTerminatedBeforeInitializationMessage =
+ "Driver container is terminated without SparkContext / SparkSession initialization. ";
+    public static final String DriverFailedInitContainersMessage =
+        "Driver has failed init container(s). Refer to the last observed status for details. ";
+    public static final String DriverFailedMessage =
+        "Driver has one or more failed critical container(s); refer to the last observed " +
+            "status for details. ";
+    public static final String DriverSucceededMessage =
+        "Driver critical container(s) exited with code 0. ";
+    public static final String DriverRestartedMessage =
+        "Driver has one or more critical container(s) restarted unexpectedly; refer to the " +
+            "last observed status for details. ";
+    public static final String AppCancelledMessage =
+        "Spark application has been shut down as requested. ";
+    public static final String DriverUnexpectedRemovedMessage =
+        "Driver removed. This could be caused by 'exit' being called in the driver process " +
+            "with a non-zero code, by involuntary disruptions, or by unintentional deletion; " +
+            "check Kubernetes events for more details. ";
+ public static final String DriverLaunchTimeoutMessage =
+ "The driver has not responded to the initial health check request within the " +
+ "allotted start-up time. This can be configured by setting " +
+ ".spec.applicationTolerations.applicationTimeoutConfig ";
+ public static final String DriverRunning = "Driver has started running. ";
+ public static final String DriverReady = "Driver has reached ready state. ";
+ public static final String SubmittedStateMessage =
+ "Spark application has been created on Kubernetes Cluster. ";
+ public static final String UnknownStateMessage = "Cannot process application status. ";
+ public static final String ExceedMaxRetryAttemptMessage =
+ "The maximum number of restart attempts (%d) has been exceeded. ";
+ public static final String ScheduleFailureMessage =
+ "Failed to request driver from scheduler backend. ";
+ public static final String RunningHealthyMessage = "Application is running healthy. ";
+    public static final String InitializedWithBelowThresholdExecutorsMessage =
+        "The application is running with fewer than the minimal number of requested initial " +
+            "executors. ";
+    public static final String RunningWithBelowThresholdExecutorsMessage =
+        "The Spark application is running with fewer than the minimal number of requested " +
+            "executors. ";
+ public static final String ExecutorLaunchTimeoutMessage =
+ "The Spark application failed to get enough executors in the given time threshold. ";
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplication.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplication.java
new file mode 100644
index 00000000..0b7fa7bd
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplication.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import io.fabric8.kubernetes.model.annotation.Group;
+import io.fabric8.kubernetes.model.annotation.ShortNames;
+import io.fabric8.kubernetes.model.annotation.Version;
+
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.status.ApplicationAttemptSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+
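+/**
+ * Custom resource describing a Spark application managed by the operator. An illustrative
+ * manifest (hypothetical values) looks like:
+ * <pre>
+ *   apiVersion: org.apache.spark/v1alpha1
+ *   kind: SparkApplication
+ *   metadata:
+ *     name: spark-pi
+ *   spec:
+ *     mainClass: org.apache.spark.examples.SparkPi
+ *     runtimeVersions:
+ *       sparkVersion: v3_5_1
+ * </pre>
+ */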
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonDeserialize()
+@Group(Constants.API_GROUP)
+@Version(Constants.API_VERSION)
+@ShortNames({"sparkapp"})
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class SparkApplication extends
+    BaseResource<ApplicationStateSummary, ApplicationState, ApplicationAttemptSummary,
+        ApplicationSpec, ApplicationStatus> {
+ @Override
+ public ApplicationStatus initStatus() {
+ return new ApplicationStatus();
+ }
+
+ @Override
+ public ApplicationSpec initSpec() {
+ return new ApplicationSpec();
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplicationList.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplicationList.java
new file mode 100644
index 00000000..a2435b46
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/SparkApplicationList.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+import io.fabric8.kubernetes.api.model.DefaultKubernetesResourceList;
+import lombok.NoArgsConstructor;
+
+@NoArgsConstructor
+public class SparkApplicationList extends DefaultKubernetesResourceList<SparkApplication> {
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/decorators/ResourceDecorator.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/decorators/ResourceDecorator.java
new file mode 100644
index 00000000..1c705eef
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/decorators/ResourceDecorator.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.decorators;
+
+import io.fabric8.kubernetes.api.model.HasMetadata;
+
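+/**
+ * Decorates a Kubernetes resource, returning the (possibly modified) resource to apply.
+ */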
+public interface ResourceDecorator<T extends HasMetadata> {
+ T decorate(T resource);
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/diff/Diffable.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/diff/Diffable.java
new file mode 100644
index 00000000..5361554f
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/diff/Diffable.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.diff;
+
+public interface Diffable {
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationSpec.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationSpec.java
new file mode 100644
index 00000000..f6032d58
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationSpec.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.fabric8.generator.annotation.Required;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@EqualsAndHashCode(callSuper = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationSpec extends BaseSpec {
+ protected String mainClass;
+ @Required
+ protected RuntimeVersions runtimeVersions;
+ protected String jars;
+ protected String pyFiles;
+ protected String sparkRFiles;
+ protected String files;
+ @Builder.Default
+ protected DeploymentMode deploymentMode = DeploymentMode.ClusterMode;
+ protected String proxyUser;
+ @Builder.Default
+    protected List<String> driverArgs = new ArrayList<>();
+ @Builder.Default
+ protected ApplicationTolerations applicationTolerations = new ApplicationTolerations();
+ protected BaseApplicationTemplateSpec driverSpec;
+ protected BaseApplicationTemplateSpec executorSpec;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTimeoutConfig.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTimeoutConfig.java
new file mode 100644
index 00000000..4f51a7af
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTimeoutConfig.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationTimeoutConfig {
+ @Builder.Default
+ protected Long driverStartTimeoutMillis = 300 * 1000L;
+ @Builder.Default
+ protected Long sparkSessionStartTimeoutMillis = 300 * 1000L;
+ @Builder.Default
+ protected Long executorStartTimeoutMillis = 300 * 1000L;
+ @Builder.Default
+ protected Long forceTerminationGracePeriodMillis = 300 * 1000L;
+ @Builder.Default
+ protected Long terminationRequeuePeriodMillis = 2 * 1000L;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTolerations.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTolerations.java
new file mode 100644
index 00000000..1f20e021
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ApplicationTolerations.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationTolerations {
+ @Builder.Default
+ protected RestartConfig restartConfig = new RestartConfig();
+ @Builder.Default
+ protected ApplicationTimeoutConfig applicationTimeoutConfig = new ApplicationTimeoutConfig();
+ /**
+ * Determine the toleration behavior for executor / worker instances.
+ */
+ @Builder.Default
+ protected InstanceConfig instanceConfig = new InstanceConfig();
+ /**
+ * Configure operator to delete / retain resources for an app after it terminates.
+     * While retaining resources can be helpful during development, it should be disabled (or
+     * enabled with caution) for production use cases, since it can unexpectedly increase
+     * resource quota usage.
+     * Caution: to avoid resource conflicts among multiple attempts, keep this set to
+     * 'AlwaysDelete' unless the restart policy is set to 'Never'.
+ */
+ @Builder.Default
+ protected ResourceRetentionPolicy resourceRetentionPolicy = ResourceRetentionPolicy.AlwaysDelete;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseApplicationTemplateSpec.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseApplicationTemplateSpec.java
new file mode 100644
index 00000000..54d52290
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseApplicationTemplateSpec.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.fabric8.kubernetes.api.model.PodTemplateSpec;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BaseApplicationTemplateSpec {
+ protected PodTemplateSpec podTemplateSpec;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseSpec.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseSpec.java
new file mode 100644
index 00000000..8a00f780
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/BaseSpec.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Data;
+
+import org.apache.spark.kubernetes.operator.diff.Diffable;
+
+@Data
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BaseSpec implements Diffable {
+    protected Map<String, String> sparkConf = new HashMap<>();
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/DeploymentMode.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/DeploymentMode.java
new file mode 100644
index 00000000..8512d50b
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/DeploymentMode.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
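+/**
+ * Spark deploy mode for the application: in ClusterMode the driver runs inside the Kubernetes
+ * cluster, while in ClientMode the driver runs in the submitting client process.
+ */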
+public enum DeploymentMode {
+ ClusterMode,
+ ClientMode
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/InstanceConfig.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/InstanceConfig.java
new file mode 100644
index 00000000..c5e7fd00
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/InstanceConfig.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Configures executor-instance tolerations for the application.
+ * This is used when the target cluster lacks batch / gang scheduling support, and it is
+ * separate from the SparkConf setting 'spark.executor.instances'.
+ *
+ * For example, suppose SparkConf requests 10 executors ('spark.executor.instances=10') and the
+ * minimal threshold configured here is 5 executors. Spark still tries to bring up 10 executors;
+ * in addition, from the SparkApp perspective:
+ * + If the app acquires fewer than 5 executors in the given time window
+ * (.spec.applicationTolerations.applicationTimeoutConfig.executorStartTimeoutMillis) after
+ * being submitted, it is shut down proactively in order to avoid resource deadlock.
+ * + The app is marked as 'RUNNING_WITH_BELOW_THRESHOLD_EXECUTORS' if it loses executors after
+ * successfully starting up.
+ * + The app is marked as 'RUNNING_HEALTHY' if it keeps at least the minimal number of executors
+ * after successfully starting up.
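+ *
+ * An illustrative snippet (hypothetical values) of the corresponding spec section:
+ * <pre>
+ *   applicationTolerations:
+ *     instanceConfig:
+ *       initExecutors: 5
+ *       minExecutors: 5
+ *       maxExecutors: 10
+ * </pre>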
+ */
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class InstanceConfig {
+ @Builder.Default
+ protected long initExecutors = 0L;
+ @Builder.Default
+ protected long minExecutors = 0L;
+ @Builder.Default
+ protected long maxExecutors = 0L;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/JDKVersion.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/JDKVersion.java
new file mode 100644
index 00000000..5163ceef
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/JDKVersion.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+public enum JDKVersion {
+ Java11,
+ Java17,
+ Java21
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ResourceRetentionPolicy.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ResourceRetentionPolicy.java
new file mode 100644
index 00000000..39014d8b
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ResourceRetentionPolicy.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
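+/**
+ * Controls what happens to an application's resources after an attempt terminates:
+ * 'AlwaysDelete' removes them, 'RetainOnFailure' keeps them only if the attempt failed, and
+ * 'NeverDelete' always keeps them. See also ApplicationTolerations#resourceRetentionPolicy.
+ */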
+public enum ResourceRetentionPolicy {
+ AlwaysDelete,
+ RetainOnFailure,
+ NeverDelete
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartConfig.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartConfig.java
new file mode 100644
index 00000000..fefc033a
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartConfig.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class RestartConfig {
+ @Builder.Default
+ protected RestartPolicy restartPolicy = RestartPolicy.Never;
+ @Builder.Default
+ protected Long maxRestartAttempts = 3L;
+ @Builder.Default
+ protected Long restartBackoffMillis = 30000L;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartPolicy.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartPolicy.java
new file mode 100644
index 00000000..f5fbf9a6
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RestartPolicy.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import org.apache.spark.kubernetes.operator.status.BaseStateSummary;
+
+public enum RestartPolicy {
+ Always,
+ Never,
+ OnFailure,
+ OnInfrastructureFailure;
+
+ public static boolean attemptRestartOnState(final RestartPolicy policy,
+ final BaseStateSummary stateSummary) {
+ switch (policy) {
+ case Always:
+ return true;
+ case Never:
+ return false;
+ case OnFailure:
+ return stateSummary.isFailure();
+ case OnInfrastructureFailure:
+ return stateSummary.isInfrastructureFailure();
+ }
+ return false;
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RuntimeVersions.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RuntimeVersions.java
new file mode 100644
index 00000000..ec8517cd
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/RuntimeVersions.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.fabric8.generator.annotation.Required;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class RuntimeVersions {
+ @Required
+ protected SparkVersion sparkVersion;
+ protected ScalaVersion scalaVersion;
+ protected JDKVersion jdkVersion;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ScalaVersion.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ScalaVersion.java
new file mode 100644
index 00000000..496eba86
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/ScalaVersion.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+public enum ScalaVersion {
+ v2_12,
+ v2_13
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/SparkVersion.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/SparkVersion.java
new file mode 100644
index 00000000..d4e13de5
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/spec/SparkVersion.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.spec;
+
+/**
+ * Spark versions supported by the Spark Operator, each available as an official Spark image.
+ */
+public enum SparkVersion {
+ v3_5_1,
+ v3_4_2
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationAttemptSummary.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationAttemptSummary.java
new file mode 100644
index 00000000..e65a471a
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationAttemptSummary.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.util.Map;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@EqualsAndHashCode(callSuper = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationAttemptSummary extends BaseAttemptSummary {
+    // The state transition history for the given attempt.
+    // This is used when state-history trimming is enabled.
+    protected Map<Long, ApplicationState> stateTransitionHistory;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationState.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationState.java
new file mode 100644
index 00000000..d67e32af
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationState.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.io.Serializable;
+import java.time.Instant;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import io.fabric8.kubernetes.api.model.PodStatus;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+
+import static org.apache.spark.kubernetes.operator.Constants.SubmittedStateMessage;
+
+@ToString(callSuper = true)
+@EqualsAndHashCode(callSuper = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationState extends BaseState<ApplicationStateSummary> implements Serializable {
+
+ @Getter
+ @Setter
+ PodStatus lastObservedDriverStatus;
+
+ public ApplicationState() {
+ super(ApplicationStateSummary.SUBMITTED, Instant.now().toString(), SubmittedStateMessage);
+ }
+
+ public ApplicationState(ApplicationStateSummary currentStateSummary, String message) {
+ super(currentStateSummary, Instant.now().toString(), message);
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStateSummary.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStateSummary.java
new file mode 100644
index 00000000..96a7aa28
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStateSummary.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.util.Set;
+
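+/**
+ * Lifecycle states of a SparkApplication, declared roughly in the order they are reached.
+ * Note that the declaration order is significant: isStarting() and isStopping() compare
+ * ordinals against SCHEDULED_TO_RESTART and RUNNING_HEALTHY.
+ */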
+public enum ApplicationStateSummary implements BaseStateSummary {
+ /**
+     * Spark application is submitted to the cluster but not yet scheduled.
+ */
+ SUBMITTED,
+
+ /**
+     * Spark application will be restarted with the same configuration.
+ */
+ SCHEDULED_TO_RESTART,
+
+ /**
+     * A request has been made to start the driver pod in the cluster.
+ */
+ DRIVER_REQUESTED,
+
+ /**
+ * Driver pod has reached running state
+ */
+ DRIVER_STARTED,
+
+ /**
+ * Spark session is initialized
+ */
+ DRIVER_READY,
+
+ /**
+     * Fewer than the minimal required executor pods became ready during start-up.
+ */
+ INITIALIZED_BELOW_THRESHOLD_EXECUTORS,
+
+ /**
+ * All required executor pods started
+ */
+ RUNNING_HEALTHY,
+
+ /**
+ * The application has lost a fraction of executors for external reasons
+ */
+ RUNNING_WITH_BELOW_THRESHOLD_EXECUTORS,
+
+ /**
+     * The launch request for the driver timed out.
+ */
+ DRIVER_LAUNCH_TIMED_OUT,
+
+ /**
+     * The launch request for executors timed out.
+ */
+ EXECUTORS_LAUNCH_TIMED_OUT,
+
+ /**
+     * Timed out waiting for the SparkContext / SparkSession to be initialized.
+ */
+ SPARK_SESSION_INITIALIZATION_TIMED_OUT,
+
+ /**
+     * The application completed successfully, or System.exit was called explicitly with code zero.
+ */
+ SUCCEEDED,
+
+ /**
+     * The application has failed: the JVM exited abnormally, or System.exit was called
+     * explicitly with a non-zero exit code.
+ */
+ FAILED,
+
+ /**
+     * The job has failed because of a scheduler-side issue, e.g. the driver was scheduled on a
+     * node with insufficient resources.
+ */
+ SCHEDULING_FAILURE,
+
+ /**
+     * The driver pod failed with reason 'Evicted'.
+ */
+ DRIVER_EVICTED,
+
+ /**
+     * All resources (pods, services, etc.) have been cleaned up.
+ */
+ RESOURCE_RELEASED,
+
+ /**
+     * If configured, the operator may mark the app as terminated without releasing its
+     * resources. While this can be helpful during development, it should not be enabled for
+     * production use cases.
+ */
+ TERMINATED_WITHOUT_RELEASE_RESOURCES;
+
+ public boolean isInitializing() {
+ return SUBMITTED.equals(this) || SCHEDULED_TO_RESTART.equals(this);
+ }
+
+ public boolean isStarting() {
+ return SCHEDULED_TO_RESTART.ordinal() < this.ordinal()
+ && RUNNING_HEALTHY.ordinal() > this.ordinal();
+ }
+
+ public boolean isTerminated() {
+ return RESOURCE_RELEASED.equals(this)
+ || TERMINATED_WITHOUT_RELEASE_RESOURCES.equals(this);
+ }
+
+ public boolean isStopping() {
+ return RUNNING_HEALTHY.ordinal() < this.ordinal() && !isTerminated();
+ }
+
+    public static final Set<ApplicationStateSummary> infrastructureFailures =
+ Set.of(DRIVER_LAUNCH_TIMED_OUT,
+ EXECUTORS_LAUNCH_TIMED_OUT, SCHEDULING_FAILURE);
+
+    public static final Set<ApplicationStateSummary> failures = Set.of(DRIVER_LAUNCH_TIMED_OUT,
+ EXECUTORS_LAUNCH_TIMED_OUT, SCHEDULING_FAILURE, FAILED,
+ SPARK_SESSION_INITIALIZATION_TIMED_OUT);
+
+ @Override
+ public boolean isFailure() {
+ return failures.contains(this);
+ }
+
+ @Override
+ public boolean isInfrastructureFailure() {
+ return infrastructureFailures.contains(this);
+ }
+
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStatus.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStatus.java
new file mode 100644
index 00000000..ce77b67f
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/ApplicationStatus.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.EqualsAndHashCode;
+import lombok.ToString;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.spark.kubernetes.operator.spec.RestartConfig;
+import org.apache.spark.kubernetes.operator.spec.RestartPolicy;
+
+import static org.apache.spark.kubernetes.operator.Constants.ExceedMaxRetryAttemptMessage;
+
+@EqualsAndHashCode(callSuper = true)
+@ToString(callSuper = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class ApplicationStatus
+    extends BaseStatus<ApplicationStateSummary, ApplicationState, ApplicationAttemptSummary> {
+
+ public ApplicationStatus() {
+ super(new ApplicationState(), new ApplicationAttemptSummary());
+ }
+
+ public ApplicationStatus(ApplicationState currentState,
+                             Map<Long, ApplicationState> stateTransitionHistory,
+ ApplicationAttemptSummary previousAttemptSummary,
+ ApplicationAttemptSummary currentAttemptSummary) {
+ super(currentState, stateTransitionHistory, previousAttemptSummary, currentAttemptSummary);
+ }
+
+ /**
+     * Create a new ApplicationStatus with the given state set as current and the state
+     * transition history updated accordingly.
+ */
+ public ApplicationStatus appendNewState(ApplicationState state) {
+ return new ApplicationStatus(state, createUpdatedHistoryWithNewState(state),
+ previousAttemptSummary, currentAttemptSummary);
+ }
+
+ /**
+ * Create ApplicationStatus to be updated upon termination of current attempt, with respect
+ * to current state and restart config.
+ *
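+     * <p>Illustrative usage (hypothetical call site):
+     * <pre>
+     *   ApplicationStatus updated = status.terminateOrRestart(
+     *       spec.getApplicationTolerations().getRestartConfig(),
+     *       Constants.DriverFailedMessage,
+     *       true);
+     * </pre>
+     *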
+ * @param restartConfig restart config for the app
+ * @param stateMessageOverride state message to be applied
+ * @param trimStateTransitionHistory if enabled, operator would trim the state history,
+ * keeping only previous and current attempt.
+ * @return updated ApplicationStatus
+ */
+ public ApplicationStatus terminateOrRestart(final RestartConfig restartConfig,
+ String stateMessageOverride,
+ boolean trimStateTransitionHistory) {
+ if (!currentState.currentStateSummary.isStopping()) {
+ // application is not stopping, skip
+ throw new RuntimeException(
+ "Spark application cannot be directly terminated unless in stopping " +
+ "state, current state is: " + currentState);
+ }
+
+ if (!RestartPolicy.attemptRestartOnState(restartConfig.getRestartPolicy(),
+ currentState.getCurrentStateSummary())) {
+ // no restart configured
+ ApplicationState state = new ApplicationState(ApplicationStateSummary.RESOURCE_RELEASED,
+ stateMessageOverride);
+ return new ApplicationStatus(state, createUpdatedHistoryWithNewState(state),
+ previousAttemptSummary, currentAttemptSummary);
+ }
+
+ if (currentAttemptSummary.getAttemptInfo().getId() >=
+ restartConfig.getMaxRestartAttempts()) {
+ String stateMessage = String.format(ExceedMaxRetryAttemptMessage,
+ restartConfig.getMaxRestartAttempts());
+ if (StringUtils.isNotEmpty(stateMessageOverride)) {
+ stateMessage += stateMessageOverride;
+ }
+ // max number of restart attempt reached
+ ApplicationState state =
+ new ApplicationState(ApplicationStateSummary.RESOURCE_RELEASED, stateMessage);
+ // still use previous & current attempt summary - they are to be updated only upon
+ // new restart
+ return new ApplicationStatus(state, createUpdatedHistoryWithNewState(state),
+ previousAttemptSummary, currentAttemptSummary);
+ }
+
+ ApplicationAttemptSummary nextAttemptSummary = new ApplicationAttemptSummary();
+ nextAttemptSummary.setAttemptInfo(
+ currentAttemptSummary.getAttemptInfo().createNextAttemptInfo());
+ ApplicationState state = new ApplicationState(ApplicationStateSummary.SCHEDULED_TO_RESTART,
+ stateMessageOverride);
+
+ if (trimStateTransitionHistory) {
+ currentAttemptSummary.setStateTransitionHistory(stateTransitionHistory);
+ return new ApplicationStatus(state,
+ Collections.singletonMap(getCurrentStateId() + 1, state), currentAttemptSummary,
+ nextAttemptSummary);
+ } else {
+ return new ApplicationStatus(state, createUpdatedHistoryWithNewState(state),
+ currentAttemptSummary, nextAttemptSummary);
+ }
+ }
+
+    private Map<Long, ApplicationState> createUpdatedHistoryWithNewState(ApplicationState state) {
+        TreeMap<Long, ApplicationState> updatedHistory = new TreeMap<>(stateTransitionHistory);
+ updatedHistory.put(updatedHistory.lastKey() + 1L, state);
+ return updatedHistory;
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/AttemptInfo.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/AttemptInfo.java
new file mode 100644
index 00000000..35e200cc
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/AttemptInfo.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class AttemptInfo {
+ @Builder.Default
+ protected final Long id = 0L;
+
+ public AttemptInfo createNextAttemptInfo() {
+ return new AttemptInfo(id + 1L);
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseAttemptSummary.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseAttemptSummary.java
new file mode 100644
index 00000000..bd9bc2fb
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseAttemptSummary.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Data;
+
+@Data
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BaseAttemptSummary {
+ protected AttemptInfo attemptInfo = new AttemptInfo();
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseState.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseState.java
new file mode 100644
index 00000000..aba5b7c4
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseState.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.io.Serializable;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+
+@Data
+@AllArgsConstructor
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BaseState<T> implements Serializable {
+ protected T currentStateSummary;
+ protected String lastTransitionTime;
+ protected String message;
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStateSummary.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStateSummary.java
new file mode 100644
index 00000000..21519b6f
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStateSummary.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+public interface BaseStateSummary {
+ boolean isFailure();
+
+ boolean isInfrastructureFailure();
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStatus.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStatus.java
new file mode 100644
index 00000000..5f085ad4
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/status/BaseStatus.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.TreeMap;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.ToString;
+
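+/**
+ * Base class for resource status: keeps the current state, a state transition history keyed by
+ * a monotonically increasing id, and the previous / current attempt summaries.
+ */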
+@ToString
+@EqualsAndHashCode
+@JsonInclude(JsonInclude.Include.NON_NULL)
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class BaseStatus<STATE extends BaseState<?>, AS extends BaseAttemptSummary> {
+ @Getter
+ STATE currentState;
+ @Getter
+ Map<Long, STATE> stateTransitionHistory;
+ @Getter
+ AS previousAttemptSummary;
+ @Getter
+ AS currentAttemptSummary;
+
+ public BaseStatus(STATE initState, AS currentAttemptSummary) {
+ this.currentState = initState;
+ this.stateTransitionHistory = new TreeMap<>();
+ this.stateTransitionHistory.put(0L, initState);
+ this.previousAttemptSummary = null;
+ this.currentAttemptSummary = currentAttemptSummary;
+ }
+
+ public BaseStatus(STATE currentState,
+ Map<Long, STATE> stateTransitionHistory,
+ AS previousAttemptSummary,
+ AS currentAttemptSummary) {
+ this.currentState = currentState;
+ this.stateTransitionHistory = new TreeMap<>(stateTransitionHistory);
+ this.previousAttemptSummary = previousAttemptSummary;
+ this.currentAttemptSummary = currentAttemptSummary;
+ }
+
+ protected long getCurrentStateId() {
+ return Collections.max(stateTransitionHistory.keySet());
+ }
+}
diff --git a/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/utils/ModelUtils.java b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/utils/ModelUtils.java
new file mode 100644
index 00000000..fd71fbf2
--- /dev/null
+++ b/spark-operator-api/src/main/java/org/apache/spark/kubernetes/operator/utils/ModelUtils.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.utils;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.api.model.OwnerReference;
+import io.fabric8.kubernetes.api.model.OwnerReferenceBuilder;
+import io.fabric8.kubernetes.api.model.Pod;
+import io.fabric8.kubernetes.api.model.PodBuilder;
+import io.fabric8.kubernetes.api.model.PodTemplateSpec;
+
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+
+public class ModelUtils {
+ public static final String DRIVER_SPARK_CONTAINER_PROP_KEY =
+ "spark.kubernetes.driver.podTemplateContainerName";
+ public static final String DRIVER_SPARK_TEMPLATE_FILE_PROP_KEY =
+ "spark.kubernetes.driver.podTemplateFile";
+ public static final String EXECUTOR_SPARK_TEMPLATE_FILE_PROP_KEY =
+ "spark.kubernetes.executor.podTemplateFile";
+ public static final ObjectMapper objectMapper = new ObjectMapper();
+
+ public static Pod defaultPod() {
+ return new PodBuilder()
+ .withNewMetadata()
+ .endMetadata()
+ .withNewSpec()
+ .endSpec()
+ .build();
+ }
+
+ public static Pod getPodFromTemplateSpec(PodTemplateSpec podTemplateSpec) {
+ if (podTemplateSpec != null) {
+ return new PodBuilder()
+ .withMetadata(podTemplateSpec.getMetadata())
+ .withSpec(podTemplateSpec.getSpec())
+ .withAdditionalProperties(podTemplateSpec.getAdditionalProperties())
+ .build();
+ } else {
+ return defaultPod();
+ }
+ }
+
+ /**
+ * Returns true if the given container name is the main container in the driver pod.
+ * If `spark.kubernetes.driver.podTemplateContainerName` is not set, all containers are
+ * considered main containers.
+ */
+ public static boolean isDriverMainContainer(final ApplicationSpec appSpec,
+ final String containerName) {
+ if (appSpec == null || appSpec.getSparkConf() == null
+ || !appSpec.getSparkConf().containsKey(DRIVER_SPARK_CONTAINER_PROP_KEY)) {
+ return true;
+ }
+ return appSpec.getSparkConf().get(DRIVER_SPARK_CONTAINER_PROP_KEY)
+ .equalsIgnoreCase(containerName);
+ }
+
+ /**
+ * Build OwnerReference to the given resource
+ *
+ * @param owner the owner
+ * @return OwnerReference to be used for subresources
+ */
+ public static OwnerReference buildOwnerReferenceTo(HasMetadata owner) {
+ return new OwnerReferenceBuilder()
+ .withName(owner.getMetadata().getName())
+ .withApiVersion(owner.getApiVersion())
+ .withKind(owner.getKind())
+ .withUid(owner.getMetadata().getUid())
+ .withBlockOwnerDeletion(true)
+ .build();
+ }
+
+ public static <T> String asJsonString(T resource) {
+ try {
+ return objectMapper.writeValueAsString(resource);
+ } catch (JsonProcessingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public static boolean overrideDriverTemplate(ApplicationSpec applicationSpec) {
+ return applicationSpec != null && applicationSpec.getDriverSpec() != null
+ && applicationSpec.getDriverSpec().getPodTemplateSpec() != null;
+ }
+
+ public static boolean overrideExecutorTemplate(ApplicationSpec applicationSpec) {
+ return applicationSpec != null && applicationSpec.getExecutorSpec() != null
+ && applicationSpec.getExecutorSpec().getPodTemplateSpec() != null;
+ }
+}
diff --git a/spark-operator-api/src/main/resources/printer-columns.sh b/spark-operator-api/src/main/resources/printer-columns.sh
new file mode 100755
index 00000000..b64a56eb
--- /dev/null
+++ b/spark-operator-api/src/main/resources/printer-columns.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+# This is a workaround. See https://github.com/fabric8io/kubernetes-client/issues/3069
+# We do a yq to add printer columns
+
+script_path=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+crd_path="${script_path}/../../../build/classes/java/main/META-INF/fabric8/sparkapplications.org.apache.spark-v1.yml"
+yq -i '.spec.versions[0] += ({"additionalPrinterColumns": [{"jsonPath": ".status.currentState.currentStateSummary", "name": "Current State", "type": "string"}, {"jsonPath": ".metadata.creationTimestamp", "name": "Age", "type": "date"}]})' $crd_path
+
diff --git a/spark-operator-api/src/test/java/org/apache/spark/kubernetes/operator/status/ApplicationStatusTest.java b/spark-operator-api/src/test/java/org/apache/spark/kubernetes/operator/status/ApplicationStatusTest.java
new file mode 100644
index 00000000..0cbe94c8
--- /dev/null
+++ b/spark-operator-api/src/test/java/org/apache/spark/kubernetes/operator/status/ApplicationStatusTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.status;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import static org.apache.spark.kubernetes.operator.status.ApplicationStateSummary.SUBMITTED;
+
+class ApplicationStatusTest {
+
+ @Test
+ void testInitStatus() {
+ ApplicationStatus applicationStatus = new ApplicationStatus();
+ Assertions.assertEquals(SUBMITTED, applicationStatus.currentState.currentStateSummary);
+ Assertions.assertEquals(1, applicationStatus.stateTransitionHistory.size());
+ Assertions.assertEquals(applicationStatus.currentState,
+ applicationStatus.stateTransitionHistory.get(0L));
+ }
+
+ @Test
+ void testAppendNewState() {
+ ApplicationStatus applicationStatus = new ApplicationStatus();
+ ApplicationState newState =
+ new ApplicationState(ApplicationStateSummary.RUNNING_HEALTHY, "foo");
+ ApplicationStatus newStatus = applicationStatus.appendNewState(newState);
+ Assertions.assertEquals(2, newStatus.stateTransitionHistory.size());
+ Assertions.assertEquals(newState, newStatus.stateTransitionHistory.get(1L));
+ }
+
+}
diff --git a/spark-operator-docs/.gitignore b/spark-operator-docs/.gitignore
new file mode 100644
index 00000000..3348a208
--- /dev/null
+++ b/spark-operator-docs/.gitignore
@@ -0,0 +1,42 @@
+.gradle
+build/
+!gradle/wrapper/gradle-wrapper.jar
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### IntelliJ IDEA ###
+.idea/modules.xml
+.idea/jarRepositories.xml
+.idea/compiler.xml
+.idea/libraries/
+*.iws
+*.iml
+*.ipr
+out/
+!**/src/main/**/out/
+!**/src/test/**/out/
+
+### Eclipse ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+bin/
+!**/src/main/**/bin/
+!**/src/test/**/bin/
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+
+### VS Code ###
+.vscode/
+
+### Mac OS ###
+.DS_Store
diff --git a/spark-operator-docs/architecture.md b/spark-operator-docs/architecture.md
new file mode 100644
index 00000000..69347448
--- /dev/null
+++ b/spark-operator-docs/architecture.md
@@ -0,0 +1,64 @@
+
+
+# Design & Architecture
+
+**Spark-Kubernetes-Operator** (Operator) acts as a control plane to manage the complete
+deployment lifecycle of Spark applications. The Operator can be installed on a Kubernetes
+cluster using Helm. In most production environments it is typically deployed in a designated
+namespace and controls Spark deployments in one or more managed namespaces. The custom resource
+definition (CRD) that describes the schema of a SparkApplication is a cluster-wide resource.
+For a CRD, the declaration must be registered before any resources of that CRD's kind(s) can be
+used, and the registration process sometimes takes a few seconds.
+
+Users can interact with the operator using kubectl or the k8s API. The Operator continuously
+tracks cluster events relating to the SparkApplication custom resources. When the operator
+receives a new resource update, it will take action to adjust the Kubernetes cluster to the
+desired state as part of its reconciliation loop. The initial loop consists of the following
+high-level steps:
+
+* User submits a SparkApplication custom resource (CR) using kubectl / API
+* Operator launches the driver and observes its status
+* Operator observes driver-spawned resources (e.g. executors) until the app terminates
+* Operator releases all Spark-app-owned resources back to the cluster
+* The SparkApplication CR can be (re)applied on the cluster at any time - e.g. to issue a
+  proactive termination of an application. The Operator makes continuous adjustments until the
+  current state becomes the desired state. All lifecycle management operations are realized
+  using this very simple principle in the Operator.
+
+The Operator is built with the Java Operator SDK and uses the Native Kubernetes Integration for
+launching Spark deployments and submitting jobs under the hood. The Java Operator SDK is a
+higher-level framework and related tooling to support writing Kubernetes Operators in Java. Both
+the Java Operator SDK and Spark's native Kubernetes integration use the Fabric8 Kubernetes
+Client to interact with the Kubernetes API Server.
+
+## State Transition
+
+![State transition](resources/state.png)
+
+* A Spark application is expected to run from submitted to succeeded before releasing resources
+* User may configure the app CR to time out after a given threshold of time
+* In addition, user may configure the app CR to skip releasing resources after it terminates.
+  This is typically used in the dev phase: pods / configmaps etc. would be kept for debugging.
+  They have an ownerReference to the Application CR and therefore can still be cleaned up when
+  the owner SparkApplication CR is deleted.
diff --git a/spark-operator-docs/configuration.md b/spark-operator-docs/configuration.md
new file mode 100644
index 00000000..db256fac
--- /dev/null
+++ b/spark-operator-docs/configuration.md
@@ -0,0 +1,99 @@
+
+
+# Configuration
+
+## Configure Operator
+
+Spark Operator supports different ways to configure the behavior:
+
+* **spark-operator.properties** provided when deploying the operator. In addition to the
+  [property file](../build-tools/helm/spark-kubernetes-operator/conf/spark-operator.properties),
+  it is also possible to override or append config properties in the helm
+  [values file](../build-tools/helm/spark-kubernetes-operator/values.yaml).
+* **System properties**: when provided as system properties (e.g. via -D options to the
+  operator JVM), they override the values provided in the property file.
+* **Hot property loading**: when enabled, a
+  [configmap](https://kubernetes.io/docs/concepts/configuration/configmap/) would be created
+  with the operator in the same namespace. The operator would monitor updates performed on the
+  configmap. Hot property overrides take the highest precedence.
+  - An example use case: the operator uses hot properties to figure out the list of namespace(s)
+    in which to operate Spark applications. The hot properties config map can be updated and
+    maintained by the user or an additional microservice to tune the operator behavior without
+    rebooting it.
+  - Please be advised that not all properties can be hot-loaded and honored at runtime.
+    Refer to the list of supported properties below for more details.
+
+To enable hot properties loading, update the **helm chart values file** with
+
+```
+
+operatorConfiguration:
+ spark-operator.properties: |+
+ spark.operator.dynamic.config.enabled=true
+ # ... all other config overrides...
+ dynamicConfig:
+ create: true
+
+```
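+
+When `dynamicConfig.create` is enabled, the chart creates the hot properties config map for the
+operator. As a rough sketch (the label keys and the data key below are illustrative and may
+differ from what the chart actually renders), such a config map could look like:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  # default name from the chart values
+  name: spark-kubernetes-operator-dynamic-configuration
+  # must live in the operator namespace (spark.operator.namespace, default spark-system)
+  namespace: spark-system
+  labels:
+    app.kubernetes.io/name: spark-kubernetes-operator
+    # matches the default spark.operator.dynamic.config.selector.str
+    app.kubernetes.io/component: dynamic-config
+data:
+  # hot-loadable properties, e.g. the watched namespaces
+  spark-operator.properties: |
+    spark.operator.watched.namespaces=spark-team-a,spark-team-b
+```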
+
+## Supported Config Properties
+
+| Name | Type | Default Value | Allow Hot Property Override | Description |
+|------------------------------------------------------------------------------|---------|----------------------------------------------|-----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| spark.operator.name | string | spark-kubernetes-operator | false | Name of the operator. |
+| spark.operator.namespace | string | spark-system | false | Namespace that operator is deployed within. |
+| spark.operator.watched.namespaces | string | | true | Comma-separated list of namespaces that the operator would be watching for Spark resources. If unset, operator would watch all namespaces by default. When deployed via Helm, please note that the value should be a subset of .Values.appResources.namespaces.data. |
+| spark.operator.dynamic.config.enabled | boolean | false | false | When enabled, operator would use config map as source of truth for config property override. The config map needs to be created in spark.operator.namespace, and labeled with the operator name. |
+| spark.operator.dynamic.config.selector.str | string | `app.kubernetes.io/component=dynamic-config` | false | The selector str applied to dynamic config map. |
+| spark.operator.terminate.on.informer.failure | boolean | false | false | Enable to indicate informer errors should stop operator startup. If disabled, operator startup will ignore recoverable errors, caused for example by RBAC issues and will retry periodically |
+| spark.operator.termination.timeout.seconds | integer | 30 | false | Grace period for operator shutdown before reconciliation threads are killed. |
+| spark.operator.reconciler.parallelism | integer | 30 | false | Thread pool size for Spark Operator reconcilers. Use -1 for unbounded pool. |
+| spark.operator.rate.limiter.refresh.period.seconds | integer | 15 | false | Operator rate limiter refresh period(in seconds) for each resource. |
+| spark.operator.rate.limiter.limit | integer | 5 | false | Max number of reconcile loops triggered within the rate limiter refresh period for each resource. Setting the limit <= 0 disables the limiter. |
+| spark.operator.retry.initial.internal.seconds | integer | 5 | false | Initial interval(in seconds) of retries on unhandled controller errors |
+| spark.operator.retry.internal.multiplier | double | 1.5 | false | Interval multiplier of retries on unhandled controller errors. |
+| spark.operator.retry.max.interval.seconds | integer | -1 | false | Max interval(in seconds) of retries on unhandled controller errors. Set to -1 for unlimited. |
+| spark.operator.retry.max.attempts | integer | 15 | false | Max attempts of retries on unhandled controller errors. |
+| spark.operator.driver.create.max.attempts | integer | 3 | true | Maximal number of retry attempts of requesting driver for Spark application. |
+| spark.operator.max.retry.attempts.on.k8s.failure | long | 3 | true | Maximal number of retry attempts of requests to k8s server upon response 429 and 5xx. |
+| spark.operator.retry.attempt.after.seconds | long | 1 | true | Default time (in seconds) to wait till next request. This would be used if server does not set Retry-After in response. |
+| spark.operator.max.retry.attempt.after.seconds | long | 15 | true | Maximal time (in seconds) to wait till next request. |
+| spark.operator.status.patch.max.retry | long | 3 | true | Maximal number of retry attempts of requests to k8s server for resource status update. |
+| spark.operator.status.patch.failure.backoff.seconds | long | 3 | true | Default time (in seconds) to wait till next request to patch resource status update. |
+| spark.operator.app.reconcile.interval.seconds | long | 120 | true | Interval (in seconds) to reconcile when application is starting up. Note that reconcile is always expected to be triggered per update - this interval controls the reconcile behavior when the operator still needs to reconcile even when there is no update, e.g. for timeout checks. |
+| spark.operator.foreground.request.timeout.seconds | long | 120 | true | Timeout (in seconds) for requests made to the API server. This applies only to foreground requests. |
+| spark.operator.trim.attempt.state.transition.history | boolean | true | true | When enabled, operator would trim state transition history when a new attempt starts, keeping previous attempt summary only. |
+| spark.operator.josdk.metrics.enabled | boolean | true | true | When enabled, the josdk metrics will be added in metrics source and configured for operator. |
+| spark.operator.kubernetes.client.metrics.enabled | boolean | true | true | Enable KubernetesClient metrics for measuring the HTTP traffic to the Kubernetes API Server. Since the metrics is collected via Okhttp interceptors, can be disabled when opt in customized interceptors. |
+| spark.operator.kubernetes.client.metrics.group.by.response.code.group.enable | boolean | true | true | When enabled, additional metrics group by http response code group(1xx, 2xx, 3xx, 4xx, 5xx) received from API server will be added. Users can disable it when their monitoring system can combine lower level kubernetes.client.http.response.<3-digit-response-code> metrics. |
+| spark.operator.probe.port | integer | 18080 | false | The port used for health/readiness check probe status. |
+| spark.operator.sentinel.executor.pool.size | integer | 3 | false | Size of executor service in Sentinel Managers to check the health of sentinel resources. |
+| spark.operator.health.sentinel.resource.reconciliation.delay.seconds | integer | 60 | true | Allowed max time(seconds) between spec update and reconciliation for sentinel resources. |
+| spark.operator.leader.election.enabled | boolean | false | false | Enable leader election for the operator to allow running standby instances. |
+| spark.operator.leader.election.lease.name | string | spark-operator-lease | false | Leader election lease name, must be unique for leases in the same namespace. |
+| spark.operator.leader.election.lease.duration.seconds | long | 1200 | false | Leader election lease duration. |
+| spark.operator.leader.election.renew.deadline.seconds | long | 600 | false | Leader election renew deadline. |
+| spark.operator.leader.election.retry.period.seconds | long | 180 | false | Leader election retry period. |
+| spark.operator.metrics.port | integer | 19090 | false | The port used for export metrics. |
+
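+For example, a highly-available setup would enable leader election and only then raise the
+replica count in the chart values. The snippet below is only a sketch of how these properties
+fit together; exact durations should be tuned for your environment:
+
+```yaml
+operatorDeployment:
+  # more than one replica is only safe once leader election is enabled
+  replica: 2
+operatorConfiguration:
+  spark-operator.properties: |+
+    spark.operator.leader.election.enabled=true
+    spark.operator.leader.election.lease.name=spark-operator-lease
+    spark.operator.leader.election.lease.duration.seconds=1200
+```
+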
+## Config Metrics Publishing Behavior
+
+Spark Operator uses the same source & sink interface as Apache Spark. This means you can
+use existing Spark metrics sinks for both applications and the operator.
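+
+For example, to reuse one of the sinks shipped with Apache Spark, the sink class can be wired up
+in the operator metrics properties via the helm values. The snippet below is a sketch only; the
+Graphite host and intervals are placeholders to adapt to your environment:
+
+```yaml
+operatorConfiguration:
+  metrics.properties: |+
+    spark.metrics.conf.operator.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink
+    spark.metrics.conf.operator.sink.graphite.host=graphite.example.com
+    spark.metrics.conf.operator.sink.graphite.port=2003
+    spark.metrics.conf.operator.sink.graphite.period=10
+    spark.metrics.conf.operator.sink.graphite.unit=seconds
+```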
diff --git a/spark-operator-docs/developer_guide.md b/spark-operator-docs/developer_guide.md
new file mode 100644
index 00000000..4a298611
--- /dev/null
+++ b/spark-operator-docs/developer_guide.md
@@ -0,0 +1,84 @@
+
+
+# Developer Guide
+
+## Build Operator Locally
+
+To build operator locally, use
+
+```shell
+./gradlew clean build
+```
+
+If you are working on API (CRD) changes, remember to update CRD yaml in chart as well
+
+```shell
+# This requires yq installed locally to add additional printer columns
+# could be removed after fixing https://github.com/fabric8io/kubernetes-client/issues/3069
+./gradlew :spark-operator-api:copyGeneratedCRD
+```
+
+## Build Operator Image Locally
+
+ ```bash
+ # Build a local container image which can be used for minikube etc.
+ # For testing in remote k8s cluster, please also do `docker push` to make it available
+ # to the cluster / nodes
+ docker build --build-arg BASE_VERSION=0.1.0 -t spark-kubernetes-operator:0.1.0 .
+ ```
+
+## Deploy Operator
+
+### Install the Spark Operator
+
+ ```bash
+ helm install spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ build-tools/helm/spark-kubernetes-operator/
+ ```
+
+### Upgrade the operator to a new version
+
+ ```bash
+ # update CRD as applicable
+ kubectl replace -f /path/to/build-tools/helm/spark-kubernetes-operator/crds/sparkapplications.org.apache.spark-v1.yml
+
+ # upgrade deployment
+ helm upgrade spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ --set image.tag=<new-version> \
+ build-tools/helm/spark-kubernetes-operator/
+ ```
+
+## Run Tests
+
+In addition to unit tests, we are actively working on the e2e test framework for the
+operator. This depends on the CI integration for operator.
+
+For now, in order to manually run e2e tests:
+
+* Build operator image and install the built image in k8s cluster
+* Run AppSubmitToSucceedTest
+
+```shell
+java -cp /path/to/spark-operator-test.jar \
+ -Dspark.operator.test.app.yaml.files.dir=/path/to/e2e-tests/ \
+ org.apache.spark.kubernetes.operator.SparkAppSubmitToSucceedTest
+```
diff --git a/spark-operator-docs/getting_started.md b/spark-operator-docs/getting_started.md
new file mode 100644
index 00000000..6a87ccf2
--- /dev/null
+++ b/spark-operator-docs/getting_started.md
@@ -0,0 +1,195 @@
+
+
+## Getting Started
+
+This doc provides a quick introduction to creating and managing Spark applications with
+Operator.
+
+To follow along with this guide, first, clone this repository and have a
+[Minikube](https://minikube.sigs.k8s.io/docs/) cluster ready for a quick start of running examples
+locally. Make sure to update kube config as well - this example would deploy Spark Operator
+and run Spark application(s) in the current context and namespace.
+
+It is possible to try the operator on a remote k8s cluster (EKS / GKE etc.). To do so, make
+sure you publish the built operator image to a docker registry that's accessible to the
+cluster.
+
+### Compatibility
+
+- JDK 11, 17, or 21
+- Operator uses fabric8, which is assumed to be compatible with the available k8s versions.
+- Spark versions 3.4 and above
+
+### Start minikube
+
+Start Minikube and make it use the locally-built image
+
+```shell
+minikube start
+eval $(minikube docker-env)
+```
+
+### Build Spark Operator Locally
+
+ ```bash
+ # Build a local container image which can be used for minikube etc.
+ # For testing in remote k8s cluster, please also do `docker push` to make it available
+ # to the cluster / nodes
+ docker build --build-arg BASE_VERSION=0.1.0 -t spark-kubernetes-operator:0.1.0 .
+
+ # Generate CRD yaml and make it available for chart deployment
+ ./gradlew spark-operator-api:copyGeneratedCRD
+ ```
+### Install the Spark Operator
+
+ ```bash
+ helm install spark-kubernetes-operator -f build-tools/helm/spark-kubernetes-operator/values.yaml build-tools/helm/spark-kubernetes-operator/
+ ```
+### Verify the Installation
+
+Check if the pods are up and running:
+ ```shell
+ $ kubectl get pods
+ ```
+
+This should show the operator pod, for example:
+
+```
+NAME READY STATUS RESTARTS AGE
+spark-kubernetes-operator-995d88bdf-nwr7r 1/1 Running 0 16s
+```
+
+You may also find the installed CRD with
+
+ ```shell
+ $ kubectl get crd sparkapplications.org.apache.spark
+ ```
+
+
+### Start Spark Application
+
+Start Spark-pi with
+
+ ```bash
+ kubectl create -f spark-operator/src/main/resources/spark-pi.yaml
+ ```
+
+### Monitor Spark Application State Transition
+
+ ```bash
+ kubectl get sparkapp spark-pi -o yaml
+ ```
+
+It should give Spark application spec as well as the state transition history, for example
+
+```
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ creationTimestamp: "2024-04-02T22:24:47Z"
+ finalizers:
+ - sparkapplications.org.apache.spark/finalizer
+ generation: 2
+ name: spark-pi
+ namespace: default
+ resourceVersion: "963"
+ uid: 356dedb1-0c09-4515-9233-165d28ae6d27
+spec:
+ applicationTolerations:
+ applicationTimeoutConfig:
+ driverStartTimeoutMillis: 300000
+ executorStartTimeoutMillis: 300000
+ forceTerminationGracePeriodMillis: 300000
+ sparkSessionStartTimeoutMillis: 300000
+ terminationRequeuePeriodMillis: 2000
+ resourceRetentionPolicy: OnFailure
+ instanceConfig:
+ initExecutors: 0
+ maxExecutors: 0
+ minExecutors: 0
+ restartConfig:
+ maxRestartAttempts: 3
+ restartBackoffMillis: 30000
+ restartPolicy: NEVER
+ deploymentMode: CLUSTER_MODE
+ driverArgs: []
+ jars: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.1.jar
+ mainClass: org.apache.spark.examples.SparkPi
+ runtimeVersions:
+ scalaVersion: v2_12
+ sparkVersion: v3_5_1
+ sparkConf:
+ spark.executor.instances: "5"
+ spark.kubernetes.authenticate.driver.serviceAccountName: spark
+ spark.kubernetes.container.image: spark:3.5.1-scala2.12-java11-python3-r-ubuntu
+status:
+ currentAttemptSummary:
+ attemptInfo:
+ id: 0
+ currentState:
+ currentStateSummary: RUNNING_HEALTHY
+ lastTransitionTime: "2024-04-02T22:24:52.342061Z"
+ message: 'Application is running healthy. '
+ stateTransitionHistory:
+ "0":
+ currentStateSummary: SUBMITTED
+ lastTransitionTime: "2024-04-02T22:24:47.592355Z"
+ message: 'Spark application has been created on Kubernetes Cluster. '
+ "1":
+ currentStateSummary: DRIVER_REQUESTED
+ lastTransitionTime: "2024-04-02T22:24:50.268363Z"
+ message: 'Requested driver from resource scheduler. '
+ "2":
+ currentStateSummary: DRIVER_STARTED
+ lastTransitionTime: "2024-04-02T22:24:52.238794Z"
+ message: 'Driver has started running. '
+ "3":
+ currentStateSummary: DRIVER_READY
+ lastTransitionTime: "2024-04-02T22:24:52.239101Z"
+ message: 'Driver has reached ready state. '
+ "4":
+ currentStateSummary: RUNNING_HEALTHY
+ lastTransitionTime: "2024-04-02T22:24:52.342061Z"
+ message: 'Application is running healthy. '
+```
+
+Delete application Spark-pi and its secondary resources with
+
+ ```bash
+ kubectl delete -f spark-operator/src/main/resources/spark-pi.yaml
+ ```
+
+
+#### Uninstallation
+
+To remove the installed resources from your cluster, reset the environment to the defaults and
+shut down the cluster:
+
+```bash
+helm uninstall spark-kubernetes-operator
+eval $(minikube docker-env --unset)
+minikube stop
+```
+
+### More examples
+
+More PySpark / SparkR examples can be found under [e2e-tests](../e2e-tests).
+
+Read more about how to understand, write and build your SparkApplication [here](spark_application.md).
diff --git a/spark-operator-docs/metrics_logging.md b/spark-operator-docs/metrics_logging.md
new file mode 100644
index 00000000..d943dbec
--- /dev/null
+++ b/spark-operator-docs/metrics_logging.md
@@ -0,0 +1,109 @@
+
+
+# Metrics
+
+Spark operator,
+following [Apache Spark](https://spark.apache.org/docs/latest/monitoring.html#metrics),
+has a configurable metrics system based on
+the [Dropwizard Metrics Library](https://metrics.dropwizard.io/4.2.25/). Note that because Spark
+Operator does not have a Spark UI, MetricsServlet and PrometheusServlet from the
+org.apache.spark.metrics.sink package are not supported. If you are interested in exporting
+Prometheus metrics, please take a look at the `Forward Metrics to Prometheus` section below.
+
+## JVM Metrics
+
+Spark Operator collects JVM metrics
+via [Codahale JVM Metrics](https://javadoc.io/doc/com.codahale.metrics/metrics-jvm/latest/index.html)
+
+- BufferPoolMetricSet
+- FileDescriptorRatioGauge
+- GarbageCollectorMetricSet
+- MemoryUsageGaugeSet
+- ThreadStatesGaugeSet
+
+## Kubernetes Client Metrics
+
+| Metrics Name | Type | Description |
+|-----------------------------------------------------------|------------|--------------------------------------------------------------------------------------------------------------------------|
+| kubernetes.client.http.request | Meter | Tracking the rates of HTTP request sent to the Kubernetes API Server |
+| kubernetes.client.http.response | Meter | Tracking the rates of HTTP response from the Kubernetes API Server |
+| kubernetes.client.http.response.failed | Meter | Tracking the rates of HTTP requests which have no response from the Kubernetes API Server |
+| kubernetes.client.http.response.latency.nanos | Histograms | Measures the statistical distribution of HTTP response latency from the Kubernetes API Server |
+| kubernetes.client.http.response.<responseCode> | Meter | Tracking the rates of HTTP response based on response code from the Kubernetes API Server |
+| kubernetes.client.http.request.<method> | Meter | Tracking the rates of HTTP request based on the type of method sent to the Kubernetes API Server |
+| kubernetes.client.http.response.1xx | Meter | Tracking the rates of HTTP Code 1xx responses (informational) received from the Kubernetes API Server per response code. |
+| kubernetes.client.http.response.2xx | Meter | Tracking the rates of HTTP Code 2xx responses (success) received from the Kubernetes API Server per response code. |
+| kubernetes.client.http.response.3xx | Meter | Tracking the rates of HTTP Code 3xx responses (redirection) received from the Kubernetes API Server per response code. |
+| kubernetes.client.http.response.4xx | Meter | Tracking the rates of HTTP Code 4xx responses (client error) received from the Kubernetes API Server per response code. |
+| kubernetes.client.http.response.5xx | Meter | Tracking the rates of HTTP Code 5xx responses (server error) received from the Kubernetes API Server per response code. |
+| kubernetes.client.<resource>.<method> | Meter | Tracking the rates of HTTP request for a combination of one Kubernetes resource and one http method |
+| kubernetes.client.<namespace>.<resource>.<method> | Meter | Tracking the rates of HTTP request for a combination of one namespace-scoped Kubernetes resource and one http method |
+
+## Forward Metrics to Prometheus
+
+In this section, we will show you how to forward spark operator metrics
+to [Prometheus](https://prometheus.io).
+
+* Modify the
+  build-tools/helm/spark-kubernetes-operator/values.yaml file's metrics properties section:
+
+```properties
+metrics.properties: |+
+  spark.metrics.conf.operator.sink.prometheus.class=org.apache.spark.kubernetes.operator.metrics.sink.PrometheusPullModelSink
+```
+
+* Install the Spark Operator
+
+```bash
+helm install spark-kubernetes-operator -f build-tools/helm/spark-kubernetes-operator/values.yaml build-tools/helm/spark-kubernetes-operator/
+```
+
+* Install Prometheus via Helm Chart
+
+```bash
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm install prometheus prometheus-community/prometheus
+```
+
+* Find and Annotate Spark Operator Pods
+
+```bash
+kubectl get pods -l app.kubernetes.io/name=spark-kubernetes-operator
+NAME READY STATUS RESTARTS AGE
+spark-kubernetes-operator-598cb5d569-bvvd2 1/1 Running 0 24m
+
+kubectl annotate pods spark-kubernetes-operator-598cb5d569-bvvd2 prometheus.io/scrape=true
+kubectl annotate pods spark-kubernetes-operator-598cb5d569-bvvd2 prometheus.io/path=/prometheus
+kubectl annotate pods spark-kubernetes-operator-598cb5d569-bvvd2 prometheus.io/port=19090
+```
+
+* Check Metrics via Prometheus UI
+
+```bash
+kubectl get pods | grep "prometheus-server"
+prometheus-server-654bc74fc9-8hgkb 2/2 Running 0 59m
+
+kubectl port-forward --address 0.0.0.0 pod/prometheus-server-654bc74fc9-8hgkb 8080:9090
+```
+
+Open your browser at `localhost:8080`. Click on the Status -> Targets tab, and you should be
+able to find the target as below.
+![Prometheus targets](resources/prometheus.png)
diff --git a/spark-operator-docs/operations.md b/spark-operator-docs/operations.md
new file mode 100644
index 00000000..2002de23
--- /dev/null
+++ b/spark-operator-docs/operations.md
@@ -0,0 +1,124 @@
+
+
+## Manage Your Spark Operator
+
+The operator installation is managed by a helm chart. To install run:
+
+```
+helm install spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ build-tools/helm/spark-kubernetes-operator/
+```
+
+Alternatively to install the operator (and also the helm chart) to a specific namespace:
+
+```
+helm install spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ build-tools/helm/spark-kubernetes-operator/ \
+ --namespace spark-system --create-namespace
+```
+
+Note that in this case you will need to update the namespace in the examples accordingly.
+
+### Spark Application Namespaces
+
+By default, Spark applications are created in the same namespace as the operator deployment.
+You may also configure the chart deployment to add the necessary RBAC resources for
+applications to enable them to run in additional namespaces.
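+
+As a sketch, the relevant chart values (the namespace names below are placeholders) could look
+like:
+
+```yaml
+appResources:
+  namespaces:
+    # create dedicated namespaces for Spark apps
+    create: true
+    # only watch the namespaces listed under data
+    watchGivenNamespacesOnly: true
+    data:
+      - spark-team-a
+      - spark-team-b
+  role:
+    # create a Role in each app namespace for the Spark apps
+    create: true
+  serviceAccounts:
+    create: true
+    name: spark
+```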
+
+## Overriding configuration parameters during Helm install
+
+Helm provides different ways to override the default installation parameters (contained
+in `values.yaml`) for the Helm chart.
+
+To override single parameters you can use `--set`, for example:
+
+```
+helm install spark-kubernetes-operator \
+ --set image.repository=<registry>/spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ build-tools/helm/spark-kubernetes-operator/
+```
+
+You can also provide multiple custom values files by using the `-f` flag; the last one takes
+the highest precedence:
+
+```
+helm install spark-kubernetes-operator \
+ -f build-tools/helm/spark-kubernetes-operator/values.yaml \
+ -f my_values.yaml \
+ build-tools/helm/spark-kubernetes-operator/
+```
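+
+As an illustration, `my_values.yaml` might only carry the overrides you care about (the values
+below are placeholders, not recommendations):
+
+```yaml
+image:
+  repository: spark-kubernetes-operator
+  tag: 0.1.0
+operatorDeployment:
+  operatorContainer:
+    # override the default JVM sizing for a smaller test environment
+    jvmArgs: "-XX:+UseG1GC -Xms1G -Xmx1G -Dfile.encoding=UTF8"
+```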
+
+The configurable parameters of the Helm chart and their default values are detailed in the
+following table:
+
+| Parameters | Description | Default value |
+|----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|
+| image.repository | The image repository of spark-kubernetes-operator. | spark-kubernetes-operator |
+| image.pullPolicy | The image pull policy of spark-kubernetes-operator. | IfNotPresent |
+| image.tag | The image tag of spark-kubernetes-operator. | |
+| image.digest | The image digest of spark-kubernetes-operator. If set then it takes precedence and the image tag will be ignored. | |
+| imagePullSecrets | The image pull secrets of spark-kubernetes-operator. | |
+| operatorDeployment.replica | Operator replica count. Must be 1 unless leader election is configured. | 1 |
+| operatorDeployment.strategy.type | Operator pod upgrade strategy. Must be Recreate unless leader election is configured. | Recreate |
+| operatorDeployment.operatorPod.annotations | Custom annotations to be added to the operator pod | |
+| operatorDeployment.operatorPod.labels | Custom labels to be added to the operator pod | |
+| operatorDeployment.operatorPod.nodeSelector | Custom nodeSelector to be added to the operator pod. | |
+| operatorDeployment.operatorPod.topologySpreadConstraints | Custom topologySpreadConstraints to be added to the operator pod. | |
+| operatorDeployment.operatorPod.dnsConfig | DNS configuration to be used by the operator pod. | |
+| operatorDeployment.operatorPod.volumes | Additional volumes to be added to the operator pod. | |
+| operatorDeployment.operatorPod.priorityClassName | Priority class name to be used for the operator pod | |
+| operatorDeployment.operatorPod.securityContext | Security context overrides for the operator pod | |
+| operatorDeployment.operatorContainer.jvmArgs | JVM arg override for the operator container. | `-XX:+UseG1GC -Xms3G -Xmx3G -Dfile.encoding=UTF8` |
+| operatorDeployment.operatorContainer.env | Custom env to be added to the operator container. | |
+| operatorDeployment.operatorContainer.envFrom | Custom envFrom to be added to the operator container, e.g. for downward API. | |
+| operatorDeployment.operatorContainer.probes | Probe config for the operator container. | |
+| operatorDeployment.operatorContainer.securityContext | Security context overrides for the operator container. | run as non-root for baseline security standard compliance |
+| operatorDeployment.operatorContainer.resources | Resources for the operator container. | memory 4Gi, ephemeral storage 2Gi and 1 cpu |
+| operatorDeployment.additionalContainers | Additional containers to be added to the operator pod, e.g. sidecar. | |
+| operatorRbac.serviceAccount.create | Whether to create service account for operator to use. | |
+| operatorRbac.clusterRole.create | Whether to create ClusterRole for operator to use. If disabled, a role would be created in operator & app namespaces | true |
+| operatorRbac.clusterRoleBinding.create | Whether to create ClusterRoleBinding for operator to use. If disabled, a rolebinding would be created in operator & app namespaces | true |
+| operatorRbac.clusterRole.configManagement.roleName | Role name for operator configuration management (hot property loading and leader election) | `spark-operator-config-role` |
+| appResources.namespaces.create | Whether to create dedicated namespaces for Spark apps. | true |
+| appResources.namespaces.watchGivenNamespacesOnly | When enabled, operator would by default only watch namespace(s) provided in data field. | false |
+| appResources.namespaces.data | list of namespaces to create for apps | |
+| appResources.clusterRole.create | Enable a ClusterRole to be created for apps. If neither role nor clusterrole is enabled: Spark app would use the same access as operator. | false |
+| appResources.role.create | Enable a Role to be created in each app namespace for apps. If neither role nor clusterrole is enabled: Spark app would use the same access as operator. | false |
+| appResources.serviceAccounts.create | Whether to create a service account for apps | true |
+| appResources.serviceAccounts.name | The name of Spark app service account | `spark` |
+| appResources.labels | Labels to be applied for all app resources | `"app.kubernetes.io/component": "spark-apps"` |
+| appResources.annotations | Annotations to be applied for all app resources | |
+| appResources.sparkApplicationSentinel.create | If enabled, sentinel resources will be created for operator to watch and reconcile for the health probe purpose. | false |
+| appResources.sparkApplicationSentinel.sentinelNamespaces | A list of namespaces where sentinel resources will be created in. Note that these namespaces have to be a subset of appResources.namespaces.data | |
+| operatorConfiguration.append | If set to true, below conf file & properties would be appended to default conf. Otherwise, they would override default properties | true |
+| operatorConfiguration.log4j2.properties | The default log4j2 configuration | Refer default [log4j2.properties](../build-tools/helm/spark-kubernetes-operator/conf/log4j2.properties) |
+| operatorConfiguration.spark-operator.properties | The default operator configuration | |
+| operatorConfiguration.metrics.properties | The default operator metrics (sink) configuration | |
+| operatorConfiguration.dynamicConfig.create | If set to true, a config map would be created & watched by operator as source of truth for hot properties loading. | false |
+| operatorConfiguration.dynamicConfig.name | Name of the dynamic config map for hot property loading. | spark-kubernetes-operator-dynamic-configuration |
+
+For more information check the [Helm documentation](https://helm.sh/docs/helm/helm_install/).
+
+__Notice__: The pod resources should be set according to your workload in different environments
+to achieve a matched K8s pod QoS. See
+also [Pod Quality of Service Classes](https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#quality-of-service-classes).
+
diff --git a/spark-operator-docs/operator_probes.md b/spark-operator-docs/operator_probes.md
new file mode 100644
index 00000000..f3dabf84
--- /dev/null
+++ b/spark-operator-docs/operator_probes.md
@@ -0,0 +1,82 @@
+
+
+# Operator Probes
+
+In the Kubernetes world, the kubelet uses readiness probes to know when a container is ready to
+start accepting traffic, and it uses liveness probes to know when to restart a container. For
+the Spark Operator, the following defaults are provided. You can override the values in
+values.yaml if you use the Helm chart to deploy the Spark Operator.
+
+```
+ports:
+- containerPort: 18080
+ name: probe-port
+livenessProbe:
+ httpGet:
+ port: probe-port
+ path: /healthz
+ initialDelaySeconds: 30
+ periodSeconds: 10
+readinessProbe:
+ httpGet:
+ port: probe-port
+ path: /readyz
+ failureThreshold: 30
+ periodSeconds: 10
+```
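+
+When deploying with the Helm chart, these probe settings can be tuned through the values file.
+A rough sketch, assuming the overrides live under `operatorDeployment.operatorContainer.probes`
+as listed in [operations](operations.md) (check the chart's values.yaml for the exact
+structure):
+
+```yaml
+operatorDeployment:
+  operatorContainer:
+    probes:
+      livenessProbe:
+        initialDelaySeconds: 60
+        periodSeconds: 20
+      readinessProbe:
+        failureThreshold: 30
+        periodSeconds: 10
+```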
+
+## Operator Readiness Probe
+
+A readiness probe helps to determine whether the current instance can serve traffic.
+Therefore, Spark Operator's readiness probe has to make sure that the operator has started and
+also verify the existence of the required RBAC access.
+
+## Operator Health(Liveness) Probe
+
+A built-in health endpoint serves as the information source for Operator liveness. The
+Java Operator SDK provides [runtimeInfo](https://javaoperatorsdk.io/docs/features#runtime-info)
+to check the actual health of event sources, so Spark Operator's health probe will check:
+
+* operator runtimeInfo health state
+* Sentinel resources health state
+
+### Operator Sentinel Resource
+
+Learning
+from [Apache Flink Operator](https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-main/docs/operations/health/#canary-resources),
+a dummy Spark application resource in any watched namespace can help the Spark Operator health
+probe monitor whether reconciliation is functioning.
+
+Below is a Spark sentinel resource example with the label `"spark.operator/sentinel": "true"`;
+it will not result in the creation of any other Kubernetes resources. Controlled by the
+property `health.sentinel.resource.reconciliation.delay.seconds`, the timeout to
+reconcile the sentinel resources is 60 seconds by default. If the operator cannot reconcile
+these resources within the limited time, the operator health probe will return HTTP code 500
+when the kubelet sends the HTTP GET to the liveness endpoint, and the
+kubelet will then kill the Spark Operator container and restart it.
+
+```yaml
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: spark-sentinel-resources
+ labels:
+ "spark.operator/sentinel": "true"
+```
diff --git a/spark-operator-docs/resources/prometheus.png b/spark-operator-docs/resources/prometheus.png
new file mode 100644
index 00000000..5507d577
Binary files /dev/null and b/spark-operator-docs/resources/prometheus.png differ
diff --git a/spark-operator-docs/resources/state.png b/spark-operator-docs/resources/state.png
new file mode 100644
index 00000000..a75b9ce4
Binary files /dev/null and b/spark-operator-docs/resources/state.png differ
diff --git a/spark-operator-docs/spark_application.md b/spark-operator-docs/spark_application.md
new file mode 100644
index 00000000..30d96609
--- /dev/null
+++ b/spark-operator-docs/spark_application.md
@@ -0,0 +1,223 @@
+## Spark Application API
+
+The core user-facing API of the Spark Kubernetes Operator is the SparkApplication Custom
+Resource Definition (CRD). The SparkApplication custom resource extends the standard k8s API,
+defines the Spark application spec, and tracks its status.
+
+Once the Spark Operator is installed and running in your Kubernetes environment, it will
+continuously watch SparkApplication(s) submitted by the user via the k8s API client or kubectl,
+and orchestrate secondary resources (pods, configmaps, etc.).
+
+Please check out the [quickstart](getting_started.md) as well for installing operator.
+
+## SparkApplication
+
+A SparkApplication can be defined in YAML format with the bare minimum of required fields in
+order to start:
+
+```
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+ name: spark-pi
+spec:
+ mainClass: "org.apache.spark.examples.SparkPi"
+ jars: "local:///opt/spark/examples/jars/spark-examples_2.12-3.5.1.jar"
+ sparkConf:
+ spark.executor.instances: "5"
+ spark.kubernetes.container.image: "spark:3.5.1-scala2.12-java17-python3-ubuntu"
+ spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
+ runtimeVersions:
+ scalaVersion: v2_12
+ sparkVersion: v3_5_1
+
+```
+
+
+After the application is submitted, the Operator will add status information to your application
+based on the observed state:
+
+```
+kubectl get sparkapp spark-pi -o yaml
+```
+
+### Write and build your SparkApplication
+
+It's straightforward to convert your spark-submit application to a `SparkApplication` yaml.
+The Operator constructs the driver spec in a similar approach. To submit a Java / Scala
+application, use `.spec.jars` and `.spec.mainClass`. Similarly, set `pyFiles` or `sparkRFiles`
+for Python / SparkR applications.
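+
+For instance, a PySpark variant of the earlier example could look roughly like the sketch below
+(the script path and container image are illustrative placeholders; `pyFiles` is the field
+mentioned above):
+
+```yaml
+apiVersion: org.apache.spark/v1alpha1
+kind: SparkApplication
+metadata:
+  name: spark-pi-python
+spec:
+  pyFiles: "local:///opt/spark/examples/src/main/python/pi.py"
+  sparkConf:
+    spark.executor.instances: "2"
+    spark.kubernetes.container.image: "spark:3.5.1-scala2.12-java17-python3-ubuntu"
+    spark.kubernetes.authenticate.driver.serviceAccountName: "spark"
+  runtimeVersions:
+    scalaVersion: v2_12
+    sparkVersion: v3_5_1
+```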
+
+While building images to be used by the driver and executors, it's recommended to use the
+official [Spark Docker](https://github.com/apache/spark-docker) images as base images. Check the
+pod template support (`.spec.driverSpec.podTemplateSpec` and `.spec.executorSpec.podTemplateSpec`)
+as well for setting a custom Spark home and work dir.
+
+### Pod Template Support
+
+It is possible to configure pod templates for driver & executor pods in order to configure spec
+fields that are not configurable from SparkConf.
+
+Spark Operator supports defining pod template for driver and executor pods in two ways:
+
+1. Set `PodTemplateSpec` in `SparkApplication`
+2. Config `spark.kubernetes.[driver/executor].podTemplateFile`
+
+See [this example](../spark-operator/src/main/resources/streaming.yaml) for configuring the pod
+template in a SparkApplication.
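+
+As a minimal sketch (the labels, container name and env var below are illustrative; the template
+follows the standard Kubernetes PodTemplateSpec schema):
+
+```yaml
+spec:
+  driverSpec:
+    podTemplateSpec:
+      metadata:
+        labels:
+          team: data-platform
+      spec:
+        containers:
+          - name: driver
+            env:
+              - name: CUSTOM_ENV
+                value: "example"
+```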
+
+If the pod template spec is set in the application spec (option 1), it would take higher
+precedence than option 2. Also, `spark.kubernetes.[driver/executor].podTemplateFile` would be
+unset to avoid multiple overrides.
+
+When the pod template is set as a remote file in conf properties (option 2), please ensure the
+Spark Operator has the necessary permission to access the remote file location, e.g. deploy the
+operator with a proper workload identity that has access to the target S3 / Cloud Storage bucket.
+Similar permission requirements also apply to the driver pod: the operator needs template file
+access to create the driver, and the driver needs the same for creating executors.
+
+Please be advised that Spark still overrides necessary pod configuration in both options. For
+more details,
+refer [Spark doc](https://spark.apache.org/docs/latest/running-on-kubernetes.html#pod-template).
+
+## Understanding Failure Types
+
+In addition to the general `FAILURE` state (where the driver pod fails or the driver container
+exits with a non-zero code), Spark Operator introduces a few different failure states for ease
+of app status monitoring at a high level, and for ease of setting up different handlers if users
+are creating / managing SparkApplications with external microservices or workflow engines.
+
+
+Spark Operator recognizes "infrastructure failure" in a best-effort way. It is possible to
+configure different restart policies on general failure(s) vs. on potential infrastructure
+failure(s). For example, you may configure the app to restart only upon infrastructure
+failures. If a Spark application fails as a result of
+
+```
+DRIVER_LAUNCH_TIMED_OUT
+EXECUTORS_LAUNCH_TIMED_OUT
+SCHEDULING_FAILURE
+```
+
+it is more likely that the app failed as a result of infrastructure reason(s). This includes
+scenarios where the driver or executors cannot be scheduled or cannot initialize in the
+configured time window for scheduler reasons, e.g. insufficient capacity, failure to get an IP
+allocated, failure to pull images, or a k8s API server issue at scheduling, etc.
+
+Please be advised that this is a best-effort failure identification. You may still need to
+debug the actual failure from the driver pods. Spark Operator would stage the last observed
+driver pod status with the stopping state for audit purposes.
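+
+For example, restarting only upon the infrastructure failure types listed above could be
+expressed with a restart config like the sketch below (values are illustrative; see the Restart
+section below for the full set of fields):
+
+```yaml
+applicationTolerations:
+  restartConfig:
+    restartPolicy: ON_INFRASTRUCTURE_FAILURE
+    maxRestartAttempts: 3
+    restartBackoffMillis: 30000
+```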
+
+## Configure the Tolerations for SparkApplication
+
+### Restart
+
+Spark Operator enables configuring app restart behavior for different failure types. Here's a
+sample restart config snippet:
+
+``` yaml
+restartConfig:
+ # acceptable values are 'NEVER', 'ALWAYS', 'ON_FAILURE' and 'ON_INFRASTRUCTURE_FAILURE'
+ restartPolicy: NEVER
+ # operator would retry the application if configured. All resources from the current attempt
+ # would be deleted before starting next attempt
+ maxRestartAttempts: 3
+ # backoff time (in millis) that operator would wait before next attempt
+ restartBackoffMillis: 30000
+```
+
+### Timeouts
+
+Example for configure timeouts:
+
+```yaml
+applicationTimeoutConfig:
+ # timeouts set to 5min
+
+ # time to wait for driver reaches running state after requested driver
+ driverStartTimeoutMillis: 300000
+
+ # time to wait for driver reaches ready state
+ sparkSessionStartTimeoutMillis: 300000
+
+ # time to wait for driver to acquire minimal number of running executors
+ executorStartTimeoutMillis: 300000
+
+ # time to wait for force delete resources at the end of attempt
+ forceTerminationGracePeriodMillis: 300000
+```
+
+
+| Field | Type | Default Value | Description |
+|-----------------------------------------------------------------------------------------|---------|---------------|--------------------------------------------------------------------------------------------------------------------|
+| .spec.applicationTolerations.applicationTimeoutConfig.driverStartTimeoutMillis | integer | 300000 | Time to wait for driver reaches running state after requested driver. |
+| .spec.applicationTolerations.applicationTimeoutConfig.executorStartTimeoutMillis | integer | 300000 | Time to wait for driver to acquire minimal number of running executors. |
+| .spec.applicationTolerations.applicationTimeoutConfig.forceTerminationGracePeriodMillis | integer | 300000 | Time to wait for force delete resources at the end of attempt. |
+| .spec.applicationTolerations.applicationTimeoutConfig.sparkSessionStartTimeoutMillis | integer | 300000 | Time to wait for driver reaches ready state. |
+| .spec.applicationTolerations.applicationTimeoutConfig.terminationRequeuePeriodMillis | integer | 2000 | Back-off time when releasing resource need to be re-attempted for application. |
+
+
+### Instance Config
+
+Instance Config helps the operator decide whether an application is running healthy. When
+the underlying cluster has a batch scheduler enabled, you may configure apps to be
+started if and only if there are sufficient resources. If, however, the cluster does not
+have a batch scheduler, the operator may help avoid app hanging with an `InstanceConfig`
+that describes the bare minimal tolerable scenario.
+
+For example, with the below spec:
+
+```yaml
+applicationTolerations:
+ instanceConfig:
+ minExecutors: 3
+ initExecutors: 5
+ maxExecutors: 10
+sparkConf:
+ spark.executor.instances: "10"
+```
+
+Spark would try to bring up 10 executors as defined in SparkConf. In addition, from the
+operator's perspective (a combined sketch follows this list):
+
+* If the Spark app acquires fewer than 5 executors in the given time window
+  (`.spec.applicationTolerations.applicationTimeoutConfig.executorStartTimeoutMillis`) after
+  being submitted, it would be shut down proactively in order to avoid resource deadlock.
+* The Spark app would be marked as 'RUNNING_WITH_PARTIAL_CAPACITY' if it loses executors after
+  successfully starting up.
+* The Spark app would be marked as 'RUNNING_HEALTHY' if it has at least the minimum number of
+  executors after successfully starting up.
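+
+Putting the two together, a sketch of how the instance config interacts with the executor
+start timeout (values reused from the examples above) might look like:
+
+```yaml
+applicationTolerations:
+  instanceConfig:
+    minExecutors: 3
+    initExecutors: 5
+    maxExecutors: 10
+  applicationTimeoutConfig:
+    # window in which at least the initial number of executors must be acquired
+    executorStartTimeoutMillis: 300000
+sparkConf:
+  spark.executor.instances: "10"
+```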
+
+### Delete Resources On Termination
+
+By default, the operator would delete all created resources at the end of an attempt. It would
+try to record the last observed driver status in the `status` field of the application for
+troubleshooting purposes.
+
+On the other hand, when developing an application, it's possible to configure
+
+```yaml
+applicationTolerations:
+ # Acceptable values are 'AlwaysDelete', 'RetainOnFailure', 'NeverDelete'
+ resourceRetentionPolicy: RetainOnFailure
+```
+
+So the operator would not attempt to delete the driver pod and driver resources if the app
+fails. Similarly, if resourceRetentionPolicy is set to `NeverDelete`, the operator would not
+delete driver resources when the app ends. Note that this applies only to operator-created
+resources (driver pod, SparkConf configmap, etc.). You may also want to tune
+`spark.kubernetes.driver.service.deleteOnTermination` and
+`spark.kubernetes.executor.deleteOnTermination` to control the behavior of driver-created
+resources, as shown in the sketch below.
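+
+For example, a sketch that retains operator-created resources on failure while also asking
+Spark (rather than the operator) to keep the driver-created service and executor pods:
+
+```yaml
+applicationTolerations:
+  resourceRetentionPolicy: RetainOnFailure
+sparkConf:
+  spark.kubernetes.driver.service.deleteOnTermination: "false"
+  spark.kubernetes.executor.deleteOnTermination: "false"
+```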
+
+## Supported Spark Versions
+
+Spark version is a required field for SparkApplication. At the current phase, the operator uses
+a single submission-worker mode to support all listed versions.
+
+```yaml
+runtimeVersions:
+ # Supported values are:
+ # v3_5_1, v3_4_2
+ sparkVersion: v3_4_2
+```
diff --git a/spark-operator-tests/.gitignore b/spark-operator-tests/.gitignore
new file mode 100644
index 00000000..b63da455
--- /dev/null
+++ b/spark-operator-tests/.gitignore
@@ -0,0 +1,42 @@
+.gradle
+build/
+!gradle/wrapper/gradle-wrapper.jar
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### IntelliJ IDEA ###
+.idea/modules.xml
+.idea/jarRepositories.xml
+.idea/compiler.xml
+.idea/libraries/
+*.iws
+*.iml
+*.ipr
+out/
+!**/src/main/**/out/
+!**/src/test/**/out/
+
+### Eclipse ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+bin/
+!**/src/main/**/bin/
+!**/src/test/**/bin/
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+
+### VS Code ###
+.vscode/
+
+### Mac OS ###
+.DS_Store
\ No newline at end of file
diff --git a/spark-operator-tests/build.gradle b/spark-operator-tests/build.gradle
new file mode 100644
index 00000000..42cc60e6
--- /dev/null
+++ b/spark-operator-tests/build.gradle
@@ -0,0 +1,13 @@
+dependencies {
+ testImplementation project(":spark-operator-api")
+
+ testImplementation("io.fabric8:kubernetes-client:$fabric8Version")
+ testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:$log4jVersion")
+ testImplementation("org.apache.logging.log4j:log4j-core:$log4jVersion")
+ testImplementation platform("org.junit:junit-bom:$junitVersion")
+ testImplementation 'org.junit.jupiter:junit-jupiter'
+}
+
+test {
+ useJUnitPlatform()
+}
diff --git a/spark-operator-tests/src/test/java/org/apache/spark/kubernetes/operator/SparkAppSubmitToSucceedTest.java b/spark-operator-tests/src/test/java/org/apache/spark/kubernetes/operator/SparkAppSubmitToSucceedTest.java
new file mode 100644
index 00000000..1dfa196d
--- /dev/null
+++ b/spark-operator-tests/src/test/java/org/apache/spark/kubernetes/operator/SparkAppSubmitToSucceedTest.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+import java.io.File;
+import java.time.Duration;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.fabric8.kubernetes.client.KubernetesClientBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
+
+class SparkAppSubmitToSucceedTest {
+ private static final Logger logger = LoggerFactory.getLogger(SparkAppSubmitToSucceedTest.class);
+
+ /**
+     * Create Spark app(s) and wait for them to complete.
+     * This sample would check apps periodically, and force delete them after timeout if they
+     * have not completed.
+     * Exits 0 iff all given app(s) terminated successfully.
+     * E.g. when the test cluster is up and kube config is configured, this can be invoked as
+     * java -cp /path/to/test.jar -Dspark.operator.test.app.yaml.files.dir=/path/to/e2e-tests/
+     * org.apache.spark.kubernetes.operator.SparkAppSubmitToSucceedTest
+ *
+ * @param args directory path(s) to load SparkApp yaml file(s) from
+ */
+ public static void main(String[] args) throws InterruptedException {
+ KubernetesClient client = new KubernetesClientBuilder().build();
+
+ Duration observeInterval = Duration.ofMinutes(
+ Long.parseLong(
+ System.getProperty("spark.operator.test.observe.interval.min", "1")));
+ Duration appExecTimeout = Duration.ofMinutes(
+ Long.parseLong(
+ System.getProperty("spark.operator.test.app.timeout.min", "10")));
+ Duration testTimeout = Duration.ofMinutes(
+ Long.parseLong(
+ System.getProperty("spark.operator.test.timeout.min", "30")));
+ Integer execParallelism = Integer.parseInt(
+ System.getProperty("spark.operator.test.exec.parallelism", "2"));
+ String testAppYamlFilesDir = System.getProperty("spark.operator.test.app.yaml.files.dir",
+ "e2e-tests/spark-apps/");
+ String testAppNamespace = System.getProperty("spark.operator.test.app.namespace",
+ "default");
+
+        Set<SparkApplication> testApps =
+ loadSparkAppsFromFile(client, new File(testAppYamlFilesDir));
+        ConcurrentMap<String, String> failedApps = new ConcurrentHashMap<>();
+
+ ExecutorService execPool = Executors.newFixedThreadPool(execParallelism);
+        List<Callable<Void>> todos = new ArrayList<>(testApps.size());
+
+ for (SparkApplication app : testApps) {
+ todos.add(() -> {
+ try {
+ Instant timeoutTime = Instant.now().plus(appExecTimeout);
+ SparkApplication updatedApp =
+ client.resource(app).inNamespace(testAppNamespace).create();
+ if (logger.isInfoEnabled()) {
+ logger.info("Submitting app {}", updatedApp.getMetadata().getName());
+ }
+ while (Instant.now().isBefore(timeoutTime)) {
+ Thread.sleep(observeInterval.toMillis());
+ updatedApp = client.resource(app).inNamespace(testAppNamespace).get();
+ if (appCompleted(updatedApp)) {
+ boolean succeeded = updatedApp.getStatus().getStateTransitionHistory()
+ .entrySet()
+ .stream()
+ .anyMatch(e -> ApplicationStateSummary.SUCCEEDED.equals(
+ e.getValue().getCurrentStateSummary()));
+ if (succeeded) {
+ if (logger.isInfoEnabled()) {
+ logger.info("App succeeded: {}",
+ updatedApp.getMetadata().getName());
+ }
+ } else {
+ if (logger.isErrorEnabled()) {
+ logger.error("App failed: {}",
+ updatedApp.getMetadata().getName());
+ }
+ failedApps.put(updatedApp.getMetadata().getName(),
+ updatedApp.getStatus().toString());
+ }
+ return null;
+ } else {
+ if (logger.isInfoEnabled()) {
+ logger.info("Application {} not completed...",
+ app.getMetadata().getName());
+ }
+ }
+ }
+ if (logger.isInfoEnabled()) {
+ logger.info("App {} timed out.", app.getMetadata().getName());
+ }
+ failedApps.put(updatedApp.getMetadata().getName(),
+ "timed out: " + updatedApp.getStatus().toString());
+ return null;
+ } catch (Exception e) {
+ failedApps.put(app.getMetadata().getName(), "failed: " + e.getMessage());
+ return null;
+ }
+ });
+ }
+
+ int testSucceeded = 1;
+ try {
+ execPool.invokeAll(todos, testTimeout.toMillis(), TimeUnit.MILLISECONDS);
+ if (failedApps.isEmpty()) {
+ if (logger.isInfoEnabled()) {
+ logger.info("Test completed successfully");
+ }
+ testSucceeded = 0;
+ } else {
+ if (logger.isErrorEnabled()) {
+ logger.error("Failed apps found. ");
+ failedApps.forEach((k, v) -> {
+ logger.error("Application failed: {}", k);
+ logger.error("\t status: {}", v);
+ });
+ }
+ }
+ } finally {
+ for (SparkApplication app : testApps) {
+ try {
+ client.resource(app).inNamespace(testAppNamespace).delete();
+ } catch (Exception e) {
+ if (logger.isErrorEnabled()) {
+ logger.error("Failed to remove app {}", app.getMetadata().getName());
+ }
+ }
+ }
+ }
+ System.exit(testSucceeded);
+ }
+
+    private static Set<SparkApplication> loadSparkAppsFromFile(KubernetesClient client,
+                                                               File appsFile) {
+ if (appsFile.exists()) {
+ if (appsFile.isFile()) {
+ return Collections.singleton(
+ client.resources(SparkApplication.class).load(appsFile).item());
+ } else {
+                Set<SparkApplication> applications = new HashSet<>();
+ File[] subDirs = appsFile.listFiles();
+ if (subDirs != null) {
+ for (File file : subDirs) {
+ applications.addAll(loadSparkAppsFromFile(client, file));
+ }
+ }
+ return applications;
+ }
+ }
+ if (logger.isErrorEnabled()) {
+ logger.error("No SparkApp found at {}", appsFile.getAbsolutePath());
+ }
+ return Collections.emptySet();
+ }
+
+ private static boolean appCompleted(SparkApplication app) {
+ return app != null && app.getStatus() != null && app.getStatus().getCurrentState() != null
+ && app.getStatus().getStateTransitionHistory() != null
+ && app.getStatus().getCurrentState().getCurrentStateSummary().isTerminated();
+ }
+}
diff --git a/spark-operator-tests/src/test/resources/EcsLayout.json b/spark-operator-tests/src/test/resources/EcsLayout.json
new file mode 100644
index 00000000..8d215ab5
--- /dev/null
+++ b/spark-operator-tests/src/test/resources/EcsLayout.json
@@ -0,0 +1,49 @@
+{
+ "@timestamp": {
+ "$resolver": "timestamp",
+ "pattern": {
+ "format": "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'",
+ "timeZone": "UTC"
+ }
+ },
+ "ecs.version": "1.2.0",
+ "log.level": {
+ "$resolver": "level",
+ "field": "name"
+ },
+ "message": {
+ "$resolver": "message",
+ "stringified": true
+ },
+ "process.thread.name": {
+ "$resolver": "thread",
+ "field": "name"
+ },
+ "log.logger": {
+ "$resolver": "logger",
+ "field": "name"
+ },
+ "labels": {
+ "$resolver": "mdc",
+ "flatten": true,
+ "stringified": true
+ },
+ "tags": {
+ "$resolver": "ndc"
+ },
+ "error.type": {
+ "$resolver": "exception",
+ "field": "className"
+ },
+ "error.message": {
+ "$resolver": "exception",
+ "field": "message"
+ },
+ "error.stack_trace": {
+ "$resolver": "exception",
+ "field": "stackTrace",
+ "stackTrace": {
+ "stringified": true
+ }
+ }
+}
diff --git a/spark-operator-tests/src/test/resources/log4j2.properties b/spark-operator-tests/src/test/resources/log4j2.properties
new file mode 100644
index 00000000..9285fa00
--- /dev/null
+++ b/spark-operator-tests/src/test/resources/log4j2.properties
@@ -0,0 +1,52 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+status=debug
+strict=true
+dest=out
+name=PropertiesConfig
+property.filename=/tmp/spark-operator
+filter.threshold.type=ThresholdFilter
+filter.threshold.level=debug
+# console
+appender.console.type=Console
+appender.console.name=STDOUT
+appender.console.layout.type=PatternLayout
+appender.console.layout.pattern=%d %p %X %C{1.} [%t] %m%n
+appender.console.filter.threshold.type=ThresholdFilter
+appender.console.filter.threshold.level=info
+# rolling JSON
+appender.rolling.type=RollingFile
+appender.rolling.name=RollingFile
+appender.rolling.append=true
+appender.rolling.fileName=${filename}.log
+appender.rolling.filePattern=${filename}-%i.log.gz
+appender.rolling.layout.type=JsonTemplateLayout
+appender.rolling.layout.eventTemplateUri=classpath:EcsLayout.json
+appender.rolling.policies.type=Policies
+appender.rolling.policies.size.type=SizeBasedTriggeringPolicy
+appender.rolling.policies.size.size=100MB
+appender.rolling.strategy.type=DefaultRolloverStrategy
+appender.rolling.strategy.max=20
+appender.rolling.immediateFlush=true
+# chatty loggers
+rootLogger.level=all
+logger.netty.name=io.netty
+logger.netty.level=warn
+log4j2.contextSelector=org.apache.logging.log4j.core.async.AsyncLoggerContextSelector
+rootLogger.appenderRef.stdout.ref=STDOUT
+rootLogger.appenderRef.rolling.ref=RollingFile
diff --git a/spark-operator/.gitignore b/spark-operator/.gitignore
new file mode 100644
index 00000000..b63da455
--- /dev/null
+++ b/spark-operator/.gitignore
@@ -0,0 +1,42 @@
+.gradle
+build/
+!gradle/wrapper/gradle-wrapper.jar
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### IntelliJ IDEA ###
+.idea/modules.xml
+.idea/jarRepositories.xml
+.idea/compiler.xml
+.idea/libraries/
+*.iws
+*.iml
+*.ipr
+out/
+!**/src/main/**/out/
+!**/src/test/**/out/
+
+### Eclipse ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+bin/
+!**/src/main/**/bin/
+!**/src/test/**/bin/
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+
+### VS Code ###
+.vscode/
+
+### Mac OS ###
+.DS_Store
\ No newline at end of file
diff --git a/spark-operator/build.gradle b/spark-operator/build.gradle
new file mode 100644
index 00000000..9719270d
--- /dev/null
+++ b/spark-operator/build.gradle
@@ -0,0 +1,78 @@
+apply plugin: 'com.github.johnrengelman.shadow'
+
+buildscript {
+ repositories {
+ maven {
+ url = uri("https://plugins.gradle.org/m2/")
+ }
+ }
+ dependencies {
+ classpath "com.github.johnrengelman:shadow:$shadowJarPluginVersion"
+ }
+}
+
+dependencies {
+ implementation project(":spark-operator-api")
+ implementation project(":spark-submission-worker")
+
+ implementation("io.javaoperatorsdk:operator-framework:$operatorSDKVersion") {
+ exclude group: 'com.squareup.okio'
+ }
+
+ testImplementation("io.javaoperatorsdk:operator-framework-junit-5:$operatorSDKVersion") {
+ exclude group: 'com.squareup.okio'
+ }
+
+ implementation("io.fabric8:kubernetes-httpclient-okhttp:$fabric8Version") {
+ exclude group: 'com.squareup.okhttp3'
+ }
+ implementation("com.squareup.okhttp3:okhttp:$okHttpVersion")
+ implementation("com.squareup.okhttp3:logging-interceptor:$okHttpVersion")
+ implementation("io.dropwizard.metrics:metrics-core:$dropwizardMetricsVersion")
+ implementation("io.dropwizard.metrics:metrics-jvm:$dropwizardMetricsVersion")
+ compileOnly("org.projectlombok:lombok:$lombokVersion")
+ implementation("io.dropwizard.metrics:metrics-healthchecks:$dropwizardMetricsVersion")
+ compileOnly("org.apache.spark:spark-core_$sparkScalaVersion:$sparkVersion") {
+ exclude group: "org.apache.logging.log4j"
+ exclude group: "org.slf4j"
+ }
+ annotationProcessor("org.projectlombok:lombok:$lombokVersion")
+
+ // logging
+ implementation("org.apache.logging.log4j:log4j-api:$log4jVersion")
+ implementation("org.apache.logging.log4j:log4j-core:$log4jVersion")
+ implementation("org.apache.logging.log4j:log4j-slf4j-impl:$log4jVersion")
+ implementation("org.apache.logging.log4j:log4j-1.2-api:$log4jVersion")
+ implementation("org.apache.logging.log4j:log4j-layout-template-json:$log4jLayoutVersion")
+
+ testImplementation("io.fabric8:kubernetes-server-mock:$fabric8Version") {
+ exclude group: 'junit'
+ }
+ testImplementation("org.apache.spark:spark-core_$sparkScalaVersion:$sparkVersion")
+ testImplementation("com.squareup.okhttp3:mockwebserver:$okHttpVersion")
+ testImplementation platform("org.junit:junit-bom:$junitVersion")
+ testImplementation("org.junit.jupiter:junit-jupiter:$junitVersion")
+ testImplementation("org.mockito:mockito-core:$mockitoVersion")
+}
+
+test {
+ useJUnitPlatform()
+}
+
+jar.dependsOn shadowJar
+
+jar {
+ zip64 = true
+ archiveVersion.set('')
+}
+
+
+shadowJar {
+ zip64 = true
+ mergeServiceFiles()
+ transform(com.github.jengelman.gradle.plugins.shadow.transformers.Log4j2PluginsCacheFileTransformer)
+}
+
+description = "Spark Kubernetes Operator"
+def artifact = "spark-kubernetes-operator"
+archivesBaseName = artifact
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/SparkOperator.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/SparkOperator.java
new file mode 100644
index 00000000..f2484c84
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/SparkOperator.java
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator;
+
+import java.time.Duration;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.javaoperatorsdk.operator.Operator;
+import io.javaoperatorsdk.operator.RegisteredController;
+import io.javaoperatorsdk.operator.api.config.ConfigurationServiceOverrider;
+import io.javaoperatorsdk.operator.api.config.ControllerConfigurationOverrider;
+import io.javaoperatorsdk.operator.processing.event.rate.LinearRateLimiter;
+import io.javaoperatorsdk.operator.processing.event.rate.RateLimiter;
+import io.javaoperatorsdk.operator.processing.retry.GenericRetry;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.collections.CollectionUtils;
+
+import org.apache.spark.kubernetes.operator.client.KubernetesClientFactory;
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConf;
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConfigMapReconciler;
+import org.apache.spark.kubernetes.operator.health.SentinelManager;
+import org.apache.spark.kubernetes.operator.metrics.MetricsService;
+import org.apache.spark.kubernetes.operator.metrics.MetricsSystem;
+import org.apache.spark.kubernetes.operator.metrics.MetricsSystemFactory;
+import org.apache.spark.kubernetes.operator.metrics.source.OperatorJosdkMetrics;
+import org.apache.spark.kubernetes.operator.probe.ProbeService;
+import org.apache.spark.kubernetes.operator.reconciler.SparkAppReconciler;
+import org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils;
+import org.apache.spark.kubernetes.operator.utils.SparkAppStatusRecorder;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.DynamicConfigEnabled;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.DynamicConfigSelectorStr;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorNamespace;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.ReconcilerParallelism;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.TerminateOnInformerFailure;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.TerminationTimeoutSeconds;
+
+/**
+ * Entry point for Spark Operator.
+ * Sets up reconcilers for CustomResource and health check servers
+ */
+@Slf4j
+public class SparkOperator {
+ private Operator sparkOperator;
+ private Operator sparkOperatorConfMonitor;
+ private KubernetesClient client;
+ private SparkAppSubmissionWorker appSubmissionWorker;
+ private SparkAppStatusRecorder sparkAppStatusRecorder;
+    protected Set<RegisteredController<SparkApplication>> registeredSparkControllers;
+    protected Set<String> watchedNamespaces;
+
+ private MetricsSystem metricsSystem;
+ private SentinelManager sentinelManager;
+ private ProbeService probeService;
+ private MetricsService metricsService;
+ private ExecutorService metricsResourcesSingleThreadPool;
+
+ public SparkOperator() {
+ this.metricsSystem = MetricsSystemFactory.createMetricsSystem();
+ this.client = KubernetesClientFactory.buildKubernetesClient(metricsSystem);
+ this.appSubmissionWorker = new SparkAppSubmissionWorker();
+ this.sparkAppStatusRecorder = new SparkAppStatusRecorder(
+ SparkOperatorConf.getAppStatusListener());
+ this.registeredSparkControllers = new HashSet<>();
+ this.watchedNamespaces = SparkReconcilerUtils.getWatchedNamespaces();
+ this.sentinelManager = new SentinelManager();
+ this.sparkOperator = createOperator();
+ this.sparkOperatorConfMonitor = createSparkOperatorConfMonitor();
+ var operators = Stream.of(this.sparkOperator, this.sparkOperatorConfMonitor)
+ .filter(Objects::nonNull).collect(Collectors.toList());
+ this.probeService = new ProbeService(operators, this.sentinelManager);
+ this.metricsService = new MetricsService(metricsSystem);
+ this.metricsResourcesSingleThreadPool = Executors.newSingleThreadExecutor();
+ }
+
+ protected Operator createOperator() {
+ Operator op = new Operator(this::overrideOperatorConfigs);
+ registeredSparkControllers.add(
+ op.register(new SparkAppReconciler(appSubmissionWorker, sparkAppStatusRecorder,
+ sentinelManager), this::overrideControllerConfigs));
+ return op;
+ }
+
+ protected Operator createSparkOperatorConfMonitor() {
+ if (DynamicConfigEnabled.getValue()) {
+ Operator op = new Operator(client, c -> {
+ c.withStopOnInformerErrorDuringStartup(true);
+ c.withCloseClientOnStop(false);
+ c.withInformerStoppedHandler(
+ (informer, ex) -> log.error(
+ "Dynamic config informer stopped: operator will not accept " +
+ "config updates.")
+ );
+ });
+ op.register(new SparkOperatorConfigMapReconciler(this::updateWatchingNamespaces), c -> {
+ c.settingNamespaces(OperatorNamespace.getValue());
+ c.withLabelSelector(DynamicConfigSelectorStr.getValue());
+ });
+ return op;
+ } else {
+ return null;
+ }
+ }
+
+ protected Operator getOperator() {
+ return this.sparkOperator;
+ }
+
+ protected ProbeService getProbeService() {
+ return this.probeService;
+ }
+
+    protected boolean updateWatchingNamespaces(Set<String> namespaces) {
+ if (watchedNamespaces.equals(namespaces)) {
+ log.info("No watched namespace change detected");
+ return false;
+ }
+ if (CollectionUtils.isEmpty(namespaces)) {
+ log.error("Cannot updating namespaces to empty");
+ return false;
+ }
+ registeredSparkControllers.forEach(c -> {
+ if (c.allowsNamespaceChanges()) {
+ log.info("Updating operator namespaces to {}", namespaces);
+ c.changeNamespaces(namespaces);
+ }
+ });
+ this.watchedNamespaces = new HashSet<>(namespaces);
+ return true;
+ }
+
+ protected void overrideOperatorConfigs(ConfigurationServiceOverrider overrider) {
+ overrider.withKubernetesClient(client);
+ overrider.withStopOnInformerErrorDuringStartup(TerminateOnInformerFailure.getValue());
+ overrider.withTerminationTimeoutSeconds(TerminationTimeoutSeconds.getValue());
+ int parallelism = ReconcilerParallelism.getValue();
+ if (parallelism > 0) {
+ log.info("Configuring operator with {} reconciliation threads.", parallelism);
+ overrider.withConcurrentReconciliationThreads(parallelism);
+ } else {
+ log.info("Configuring operator with unbounded reconciliation thread pool.");
+ overrider.withExecutorService(Executors.newCachedThreadPool());
+ }
+ if (SparkOperatorConf.LEADER_ELECTION_ENABLED.getValue()) {
+ overrider.withLeaderElectionConfiguration(SparkOperatorConf.getLeaderElectionConfig());
+ }
+ if (SparkOperatorConf.JOSDKMetricsEnabled.getValue()) {
+ log.info("Adding OperatorJosdkMetrics.");
+ OperatorJosdkMetrics operatorJosdkMetrics = new OperatorJosdkMetrics();
+ overrider.withMetrics(operatorJosdkMetrics);
+ metricsSystem.registerSource(operatorJosdkMetrics);
+ }
+ }
+
+    protected void overrideControllerConfigs(ControllerConfigurationOverrider<?> overrider) {
+ if (watchedNamespaces.isEmpty()) {
+ log.info("Initializing operator watching at cluster level.");
+ } else {
+ log.info("Initializing with watched namespaces {}", watchedNamespaces);
+ }
+ overrider.settingNamespaces(watchedNamespaces);
+
+        RateLimiter<?> rateLimiter = new LinearRateLimiter(
+ Duration.ofSeconds(SparkOperatorConf.RateLimiterRefreshPeriodSeconds.getValue()),
+ SparkOperatorConf.RateLimiterLimit.getValue());
+ overrider.withRateLimiter(rateLimiter);
+
+ GenericRetry genericRetry = new GenericRetry()
+ .setMaxAttempts(SparkOperatorConf.RetryMaxAttempts.getValue())
+ .setInitialInterval(
+ Duration.ofSeconds(SparkOperatorConf.RetryInitialInternalSeconds.getValue())
+ .toMillis())
+ .setIntervalMultiplier(SparkOperatorConf.RetryInternalMultiplier.getValue());
+ if (SparkOperatorConf.RetryMaxIntervalSeconds.getValue() > 0) {
+ genericRetry.setMaxInterval(
+ Duration.ofSeconds(SparkOperatorConf.RetryMaxIntervalSeconds.getValue())
+ .toMillis());
+ }
+ overrider.withRetry(genericRetry);
+ }
+
+ public static void main(String[] args) {
+ SparkOperator sparkOperator = new SparkOperator();
+ sparkOperator.getOperator().start();
+ if (DynamicConfigEnabled.getValue() && sparkOperator.sparkOperatorConfMonitor != null) {
+ sparkOperator.sparkOperatorConfMonitor.start();
+ }
+ sparkOperator.probeService.start();
+ // MetricsServer start follows the MetricsSystem start
+ // so that MetricsSystem::getSinks will not return an empty list
+ sparkOperator.metricsResourcesSingleThreadPool.submit(() -> {
+ sparkOperator.metricsSystem.start();
+ });
+ sparkOperator.metricsResourcesSingleThreadPool.submit(() -> {
+ sparkOperator.metricsService.start();
+ });
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/KubernetesClientFactory.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/KubernetesClientFactory.java
new file mode 100644
index 00000000..249118a8
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/KubernetesClientFactory.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.client;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import io.fabric8.kubernetes.client.Config;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.fabric8.kubernetes.client.KubernetesClientBuilder;
+import io.fabric8.kubernetes.client.okhttp.OkHttpClientFactory;
+import okhttp3.Interceptor;
+import okhttp3.OkHttpClient;
+
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConf;
+import org.apache.spark.kubernetes.operator.metrics.MetricsSystem;
+import org.apache.spark.kubernetes.operator.metrics.source.KubernetesMetricsInterceptor;
+
+/**
+ * Build Kubernetes Client with metrics configured
+ */
+public class KubernetesClientFactory {
+ private static final KubernetesMetricsInterceptor kubernetesMetricsInterceptor =
+ new KubernetesMetricsInterceptor();
+
+ public static KubernetesClient buildKubernetesClient(MetricsSystem metricsSystem) {
+ return buildKubernetesClient(metricsSystem, null);
+ }
+
+ public static KubernetesClient buildKubernetesClient(MetricsSystem metricsSystem,
+ Config kubernetesClientConfig) {
+        List<Interceptor> clientInterceptors = new ArrayList<>();
+ clientInterceptors.add(new RetryInterceptor());
+
+ if (SparkOperatorConf.KubernetesClientMetricsEnabled.getValue()) {
+ clientInterceptors.add(kubernetesMetricsInterceptor);
+ // Avoid duplicate register metrics exception
+ if (!metricsSystem.getSources().contains(kubernetesMetricsInterceptor)) {
+ metricsSystem.registerSource(kubernetesMetricsInterceptor);
+ }
+ }
+
+ return new KubernetesClientBuilder()
+ .withConfig(kubernetesClientConfig)
+ .withHttpClientFactory(
+ new OkHttpClientFactory() {
+ @Override
+ protected void additionalConfig(OkHttpClient.Builder builder) {
+ for (Interceptor interceptor : clientInterceptors) {
+ builder.addInterceptor(interceptor);
+ }
+ }
+ }
+ )
+ .build();
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/RetryInterceptor.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/RetryInterceptor.java
new file mode 100644
index 00000000..4f8e8d44
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/client/RetryInterceptor.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.client;
+
+import java.io.IOException;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.Interceptor;
+import okhttp3.Request;
+import okhttp3.Response;
+import okhttp3.ResponseBody;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConf;
+
+import static io.fabric8.kubernetes.client.utils.Utils.closeQuietly;
+
+/**
+ * Intercepts HTTP requests and adds custom retries on 429 and 5xx to overcome server instability
+ */
+@Slf4j
+public class RetryInterceptor implements Interceptor {
+ private static final String RETRY_AFTER_HEADER_NAME = "Retry-After";
+
+ private final Long maxAttemptCount;
+ private final Long maxRetryAfterInSecs;
+ private final Long defaultRetryAfterInSecs;
+
+ public RetryInterceptor() {
+ this.maxAttemptCount = SparkOperatorConf.MaxRetryAttemptOnKubeServerFailure.getValue();
+ this.maxRetryAfterInSecs = SparkOperatorConf.MaxRetryAttemptAfterSeconds.getValue();
+ this.defaultRetryAfterInSecs = SparkOperatorConf.RetryAttemptAfterSeconds.getValue();
+ }
+
+ @Override
+ public Response intercept(Chain chain) throws IOException {
+ Request request = chain.request();
+ Response response = chain.proceed(request);
+ int tryCount = 0;
+ while (!response.isSuccessful() && (response.code() == 429 || response.code() >= 500) &&
+ tryCount < maxAttemptCount) {
+ // only retry on consecutive 429 and 5xx failure responses
+ if (log.isWarnEnabled()) {
+ log.warn(
+ "Request is not successful. attempt={} response-code={} " +
+ "response-headers={}",
+ tryCount, response.code(), response.headers());
+ }
+            Optional<Long> retryAfter = getRetryAfter(response);
+ if (retryAfter.isPresent()) {
+ try {
+ TimeUnit.SECONDS.sleep(retryAfter.get());
+ } catch (InterruptedException e) {
+ if (log.isErrorEnabled()) {
+ log.error("Aborting retry.", e);
+ }
+ }
+ }
+ tryCount++;
+
+ ResponseBody responseBody = response.body();
+ if (responseBody != null) {
+ closeQuietly(responseBody);
+ }
+ // retry the request for 429 and 5xx
+ response = chain.proceed(request);
+ }
+ return response;
+ }
+
+    private Optional<Long> getRetryAfter(Response response) {
+ String retryAfter = response.header(RETRY_AFTER_HEADER_NAME);
+ if (StringUtils.isNotEmpty(retryAfter)) {
+ try {
+ return Optional.of(Math.min(Long.parseLong(retryAfter), maxRetryAfterInSecs));
+ } catch (Exception e) {
+ if (log.isErrorEnabled()) {
+ log.error(String.format(
+ "Error while parsing Retry-After header %s. Retrying with default %s",
+ retryAfter, defaultRetryAfterInSecs), e);
+ }
+ return Optional.of(defaultRetryAfterInSecs);
+ }
+ }
+ return Optional.empty();
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/ConfigOption.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/ConfigOption.java
new file mode 100644
index 00000000..94e7aa7e
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/ConfigOption.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.config;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.ToString;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Config options for Spark Operator. Supports primitive and serialized JSON
+ */
+
+@RequiredArgsConstructor
+@AllArgsConstructor
+@EqualsAndHashCode
+@ToString
+@Builder
+@Slf4j
+public class ConfigOption<T> {
+ private static final ObjectMapper objectMapper = new ObjectMapper();
+
+ @Getter
+ @Builder.Default
+ private boolean enableDynamicOverride = true;
+ @Getter
+ private String key;
+ @Getter
+ private String description;
+ private T defaultValue;
+    private Class<T> typeParameterClass;
+
+ public T getValue() {
+ return resolveValue();
+ }
+
+ private T resolveValue() {
+ try {
+ String value = SparkOperatorConfManager.INSTANCE.getValue(key);
+ if (!enableDynamicOverride) {
+ value = SparkOperatorConfManager.INSTANCE.getInitialValue(key);
+ }
+ if (StringUtils.isNotEmpty(value)) {
+ if (typeParameterClass.isPrimitive() || typeParameterClass == String.class) {
+ return (T) resolveValueToPrimitiveType(typeParameterClass, value);
+ } else {
+ return objectMapper.readValue(value, typeParameterClass);
+ }
+ } else {
+ return defaultValue;
+ }
+ } catch (Throwable t) {
+ log.error("Failed to resolve value for config key {}, using default value {}", key,
+ defaultValue, t);
+ return defaultValue;
+ }
+ }
+
+    public static Object resolveValueToPrimitiveType(Class<?> clazz, String value) {
+ if (Boolean.class == clazz || Boolean.TYPE == clazz) {
+ return Boolean.parseBoolean(value);
+ }
+ if (Byte.class == clazz || Byte.TYPE == clazz) {
+ return Byte.parseByte(value);
+ }
+ if (Short.class == clazz || Short.TYPE == clazz) {
+ return Short.parseShort(value);
+ }
+ if (Integer.class == clazz || Integer.TYPE == clazz) {
+ return Integer.parseInt(value);
+ }
+ if (Long.class == clazz || Long.TYPE == clazz) {
+ return Long.parseLong(value);
+ }
+ if (Float.class == clazz || Float.TYPE == clazz) {
+ return Float.parseFloat(value);
+ }
+ if (Double.class == clazz || Double.TYPE == clazz) {
+ return Double.parseDouble(value);
+ }
+ return value;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConf.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConf.java
new file mode 100644
index 00000000..ac73bddd
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConf.java
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.config;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import io.javaoperatorsdk.operator.api.config.LeaderElectionConfiguration;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.spark.kubernetes.operator.listeners.SparkAppStatusListener;
+
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.defaultOperatorConfigLabels;
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.labelsAsStr;
+
+/**
+ * Spark Operator Configuration options.
+ */
+@Slf4j
+public class SparkOperatorConf {
+ public static final String METRIC_PREFIX = "spark.metrics.conf.operator.";
+ public static final String SINK = "sink.";
+ public static final String CLASS = "class";
+
+    public static final ConfigOption<String> OperatorAppName = ConfigOption.<String>builder()
+ .key("spark.operator.name")
+ .typeParameterClass(String.class)
+ .description("Name of the operator.")
+ .defaultValue("spark-kubernetes-operator")
+ .enableDynamicOverride(false)
+ .build();
+    public static final ConfigOption<String> OperatorNamespace = ConfigOption.<String>builder()
+ .key("spark.operator.namespace")
+ .typeParameterClass(String.class)
+ .description("Namespace that operator is deployed within.")
+ .defaultValue("spark-system")
+ .enableDynamicOverride(false)
+ .build();
+    public static final ConfigOption<Boolean> DynamicConfigEnabled = ConfigOption.<Boolean>builder()
+ .key("spark.operator.dynamic.config.enabled")
+ .typeParameterClass(Boolean.class)
+ .description(
+ "When enabled, operator would use config map as source of truth for config " +
+ "property override. The config map need to be created in " +
+ "spark.operator.namespace, and labeled with operator name.")
+ .defaultValue(false)
+ .enableDynamicOverride(false)
+ .build();
+    public static final ConfigOption<String> DynamicConfigSelectorStr =
+            ConfigOption.<String>builder()
+ .key("spark.operator.dynamic.config.selector.str")
+ .typeParameterClass(String.class)
+ .description("The selector str applied to dynamic config map.")
+ .defaultValue(labelsAsStr(defaultOperatorConfigLabels()))
+ .enableDynamicOverride(false)
+ .build();
+    public static final ConfigOption<Boolean> TerminateOnInformerFailure =
+            ConfigOption.<Boolean>builder()
+ .key("spark.operator.terminate.on.informer.failure")
+ .typeParameterClass(Boolean.class)
+ .description(
+ "Enable to indicate informer errors should stop operator startup. If " +
+ "disabled, operator startup will ignore recoverable errors, " +
+ "caused for example by RBAC issues and will retry " +
+ "periodically.")
+ .defaultValue(false)
+ .enableDynamicOverride(false)
+ .build();
+    public static final ConfigOption<Integer> TerminationTimeoutSeconds =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.termination.timeout.seconds")
+ .description(
+ "Grace period for operator shutdown before reconciliation threads " +
+ "are killed.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(30)
+ .build();
+    public static final ConfigOption<Integer> ReconcilerParallelism =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.reconciler.parallelism")
+ .description(
+ "Thread pool size for Spark Operator reconcilers. Use -1 for " +
+ "unbounded pool.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(30)
+ .build();
+    public static final ConfigOption<Integer> RateLimiterRefreshPeriodSeconds =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.rate.limiter.refresh.period.seconds")
+ .description(
+ "Operator rate limiter refresh period(in seconds) for each resource.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(15)
+ .build();
+    public static final ConfigOption<Integer> RateLimiterLimit = ConfigOption.<Integer>builder()
+ .key("spark.operator.rate.limiter.limit")
+ .description(
+ "Max number of reconcile loops triggered within the rate limiter refresh " +
+ "period for each resource. Setting the limit <= 0 disables the " +
+ "limiter.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(5)
+ .build();
+    public static final ConfigOption<Integer> RetryInitialInternalSeconds =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.retry.initial.internal.seconds")
+ .description(
+ "Initial interval(in seconds) of retries on unhandled controller " +
+ "errors.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(5)
+ .build();
+    public static final ConfigOption<Double> RetryInternalMultiplier =
+            ConfigOption.<Double>builder()
+ .key("spark.operator.retry.internal.multiplier")
+ .description("Interval multiplier of retries on unhandled controller errors.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Double.class)
+ .defaultValue(1.5)
+ .build();
+    public static final ConfigOption<Integer> RetryMaxIntervalSeconds =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.retry.max.interval.seconds")
+ .description(
+ "Max interval(in seconds) of retries on unhandled controller errors. " +
+ "Set to -1 for unlimited.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(-1)
+ .build();
+    public static final ConfigOption<Integer> RetryMaxAttempts = ConfigOption.<Integer>builder()
+ .key("spark.operator.retry.max.attempts")
+ .description("Max attempts of retries on unhandled controller errors.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .defaultValue(15)
+ .build();
+    public static final ConfigOption<Long> DriverCreateMaxAttempts = ConfigOption.<Long>builder()
+ .key("spark.operator.driver.create.max.attempts")
+ .description(
+ "Maximal number of retry attempts of requesting driver for Spark application.")
+ .defaultValue(3L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> MaxRetryAttemptOnKubeServerFailure =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.max.retry.attempts.on.k8s.failure")
+ .description(
+ "Maximal number of retry attempts of requests to k8s server upon " +
+ "response 429 and 5xx.")
+ .defaultValue(3L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> RetryAttemptAfterSeconds = ConfigOption.<Long>builder()
+ .key("spark.operator.retry.attempt.after.seconds")
+ .description(
+ "Default time (in seconds) to wait till next request. This would be used if " +
+ "server does not set Retry-After in response.")
+ .defaultValue(1L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> MaxRetryAttemptAfterSeconds =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.max.retry.attempt.after.seconds")
+ .description("Maximal time (in seconds) to wait till next request.")
+ .defaultValue(15L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> StatusPatchMaxRetry = ConfigOption.<Long>builder()
+ .key("spark.operator.status.patch.max.retry")
+ .description(
+ "Maximal number of retry attempts of requests to k8s server for resource " +
+ "status update.")
+ .defaultValue(3L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> StatusPatchFailureBackoffSeconds =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.status.patch.failure.backoff.seconds")
+ .description(
+ "Default time (in seconds) to wait till next request to patch " +
+ "resource status update.")
+ .defaultValue(3L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> SparkAppReconcileIntervalSeconds =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.application.reconcile.interval.seconds")
+ .description(
+ "Interval (in seconds) to reconcile when application is is starting " +
+ "up. Note that reconcile is always expected to be triggered " +
+ "per update - this interval controls the reconcile behavior " +
+ "when operator still need to reconcile even when there's no " +
+ "update ,e.g. for timeout checks.")
+ .defaultValue(120L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> ForegroundRequestTimeoutSeconds =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.foreground.request.timeout.seconds")
+ .description(
+ "Timeout (in seconds) to for requests made to API server. this " +
+ "applies only to foreground requests.")
+ .defaultValue(120L)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<String> OperatorWatchedNamespaces =
+            ConfigOption.<String>builder()
+ .key("spark.operator.watched.namespaces")
+ .description(
+ "Comma-separated list of namespaces that the operator would be " +
+ "watching for Spark resources. If unset, operator would " +
+ "watch all namespaces by default.")
+ .defaultValue(null)
+ .typeParameterClass(String.class)
+ .build();
+    public static final ConfigOption<Boolean> TrimAttemptStateTransitionHistory =
+            ConfigOption.<Boolean>builder()
+ .key("spark.operator.trim.attempt.state.transition.history")
+ .description(
+ "When enabled, operator would trim state transition history when a " +
+ "new attempt starts, keeping previous attempt summary only.")
+ .defaultValue(true)
+ .typeParameterClass(Boolean.class)
+ .build();
+
+    public static final ConfigOption<Boolean> JOSDKMetricsEnabled = ConfigOption.<Boolean>builder()
+ .key("spark.operator.josdk.metrics.enabled")
+ .description(
+ "When enabled, the josdk metrics will be added in metrics source and " +
+ "configured for operator.")
+ .defaultValue(true)
+ .build();
+
+    public static final ConfigOption<Boolean> KubernetesClientMetricsEnabled =
+            ConfigOption.<Boolean>builder()
+ .key("spark.operator.kubernetes.client.metrics.enabled")
+ .defaultValue(true)
+ .description(
+ "Enable KubernetesClient metrics for measuring the HTTP traffic to " +
+ "the Kubernetes API Server. Since the metrics is collected " +
+ "via Okhttp interceptors, can be disabled when opt in " +
+ "customized interceptors.")
+ .build();
+
+    public static final ConfigOption<Boolean>
+            KubernetesClientMetricsGroupByResponseCodeGroupEnabled = ConfigOption.<Boolean>builder()
+ .key("spark.operator.kubernetes.client.metrics.group.by.response.code.group.enable")
+ .description(
+ "When enabled, additional metrics group by http response code group(1xx, " +
+ "2xx, 3xx, 4xx, 5xx) received from API server will be added. Users " +
+ "can disable it when their monitoring system can combine lower level " +
+ "kubernetes.client.http.response.<3-digit-response-code> metrics.")
+ .defaultValue(true)
+ .build();
+    public static final ConfigOption<Integer> OperatorProbePort = ConfigOption.<Integer>builder()
+ .key("spark.operator.probe.port")
+ .defaultValue(18080)
+ .description("The port used for health/readiness check probe status.")
+ .typeParameterClass(Integer.class)
+ .enableDynamicOverride(false)
+ .build();
+
+    public static final ConfigOption<Integer> OperatorMetricsPort = ConfigOption.<Integer>builder()
+ .key("spark.operator.metrics.port")
+ .defaultValue(19090)
+ .description("The port used for checking metrics")
+ .typeParameterClass(Integer.class)
+ .enableDynamicOverride(false)
+ .build();
+
+    public static final ConfigOption<Integer> SentinelExecutorServicePoolSize =
+            ConfigOption.<Integer>builder()
+ .key("spark.operator.sentinel.executor.pool.size")
+ .description(
+ "Size of executor service in Sentinel Managers to check the health " +
+ "of sentinel resources.")
+ .defaultValue(3)
+ .enableDynamicOverride(false)
+ .typeParameterClass(Integer.class)
+ .build();
+
+    public static final ConfigOption<Long> SENTINEL_RESOURCE_RECONCILIATION_DELAY =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.health.sentinel.resource.reconciliation.delay.seconds")
+ .defaultValue(60L)
+ .description(
+ "Allowed max time(seconds) between spec update and reconciliation " +
+ "for sentinel resources.")
+ .enableDynamicOverride(true)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<String> SPARK_APP_STATUS_LISTENER_CLASS_NAMES =
+            ConfigOption.<String>builder()
+ .key("spark.operator.application.status.listener.class.names")
+ .defaultValue("")
+ .description(
+ "Comma-separated names of SparkAppStatusListener class " +
+ "implementations")
+ .enableDynamicOverride(false)
+ .typeParameterClass(String.class)
+ .build();
+    public static final ConfigOption<Boolean> LEADER_ELECTION_ENABLED =
+            ConfigOption.<Boolean>builder()
+ .key("spark.operator.leader.election.enabled")
+ .defaultValue(false)
+ .description(
+ "Enable leader election for the operator to allow running standby " +
+ "instances.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Boolean.class)
+ .build();
+    public static final ConfigOption<String> LEADER_ELECTION_LEASE_NAME =
+            ConfigOption.<String>builder()
+ .key("spark.operator.leader.election.lease.name")
+ .defaultValue("spark-operator-lease")
+ .description(
+ "Leader election lease name, must be unique for leases in the same " +
+ "namespace.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(String.class)
+ .build();
+    public static final ConfigOption<Long> LEADER_ELECTION_LEASE_DURATION_SECONDS =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.leader.election.lease.duration.seconds")
+ .defaultValue(1200L)
+ .description("Leader election lease duration.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> LEADER_ELECTION_RENEW_DEADLINE_SECONDS =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.leader.election.renew.deadline.seconds")
+ .defaultValue(600L)
+ .description("Leader election renew deadline.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Long.class)
+ .build();
+    public static final ConfigOption<Long> LEADER_ELECTION_RETRY_PERIOD_SECONDS =
+            ConfigOption.<Long>builder()
+ .key("spark.operator.leader.election.retry.period.seconds")
+ .defaultValue(180L)
+ .description("Leader election retry period.")
+ .enableDynamicOverride(false)
+ .typeParameterClass(Long.class)
+ .build();
+
+    public static List<SparkAppStatusListener> getAppStatusListener() {
+        List<SparkAppStatusListener> listeners = new ArrayList<>();
+ String listenerNamesStr =
+ SparkOperatorConf.SPARK_APP_STATUS_LISTENER_CLASS_NAMES.getValue();
+ if (StringUtils.isNotBlank(listenerNamesStr)) {
+ try {
+                List<String> listenerNames =
+ Arrays.stream(listenerNamesStr.split(",")).map(String::trim)
+ .collect(Collectors.toList());
+ for (String name : listenerNames) {
+                    Class<?> listenerClass = Class.forName(name);
+ if (SparkAppStatusListener.class.isAssignableFrom(listenerClass)) {
+ listeners.add((SparkAppStatusListener)
+ listenerClass.getConstructor().newInstance());
+ }
+ }
+ } catch (Exception e) {
+ if (log.isErrorEnabled()) {
+ log.error("Failed to initialize listeners for operator with {}",
+ listenerNamesStr, e);
+ }
+ }
+ }
+ return listeners;
+ }
+
+ public static LeaderElectionConfiguration getLeaderElectionConfig() {
+ return new LeaderElectionConfiguration(LEADER_ELECTION_LEASE_NAME.getValue(),
+ OperatorNamespace.getValue(),
+ Duration.ofSeconds(LEADER_ELECTION_LEASE_DURATION_SECONDS.getValue()),
+ Duration.ofSeconds(LEADER_ELECTION_RENEW_DEADLINE_SECONDS.getValue()),
+ Duration.ofSeconds(LEADER_ELECTION_RETRY_PERIOD_SECONDS.getValue()));
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfManager.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfManager.java
new file mode 100644
index 00000000..ca914263
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfManager.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.config;
+
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.Properties;
+
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Loads ConfigOption values from the properties file. In addition, loads hot property overrides
+ * from the config map if dynamic config is enabled.
+ */
+@Slf4j
+public class SparkOperatorConfManager {
+ public static final String INITIAL_CONFIG_FILE_PATH =
+ "/opt/spark-operator/conf/spark-operator.properties";
+
+ public static final String METRICS_CONFIG_FILE_PATH =
+ "/opt/spark-operator/conf/metrics.properties";
+
+ public static final String INITIAL_CONFIG_FILE_PATH_PROPS_KEY =
+ "spark.operator.base.property.file.name";
+
+ public static final String METRICS_CONFIG_FILE_PATH_PROPS_KEY =
+ "spark.operator.metrics.property.file.name";
+
+ public static final SparkOperatorConfManager INSTANCE = new SparkOperatorConfManager();
+ protected final Properties initialConfig;
+ protected final Properties metricsConfig;
+ protected Properties configOverrides;
+
+ protected SparkOperatorConfManager() {
+ this.initialConfig = new Properties();
+ this.configOverrides = new Properties();
+ this.metricsConfig = new Properties();
+ initialize();
+ }
+
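+ /**
+ * Resolves a property value: dynamic overrides from the config map take precedence over the
+ * values loaded at startup.
+ */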
+ public String getValue(String key) {
+ String currentValue = configOverrides.getProperty(key);
+ return StringUtils.isEmpty(currentValue) ? getInitialValue(key) : currentValue;
+ }
+
+ public String getInitialValue(String key) {
+ return initialConfig.getProperty(key);
+ }
+
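+ /**
+ * Replaces all current overrides with the latest config map data.
+ */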
+ public void refresh(Map<String, String> updatedConfig) {
+ synchronized (this) {
+ this.configOverrides = new Properties();
+ configOverrides.putAll(updatedConfig);
+ }
+ }
+
+ public Properties getMetricsProperties() {
+ return metricsConfig;
+ }
+
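+ // Seed the initial config from JVM system properties, then overlay the base property file
+ // (its path can be overridden via the spark.operator.base.property.file.name system property).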
+ private void initialize() {
+ initialConfig.putAll(System.getProperties());
+ Properties properties = getProperties(
+ System.getProperty(INITIAL_CONFIG_FILE_PATH_PROPS_KEY, INITIAL_CONFIG_FILE_PATH));
+ initialConfig.putAll(properties);
+ initializeMetricsProperties();
+ }
+
+ private void initializeMetricsProperties() {
+ Properties properties = getProperties(
+ System.getProperty(METRICS_CONFIG_FILE_PATH_PROPS_KEY, METRICS_CONFIG_FILE_PATH));
+ metricsConfig.putAll(properties);
+ }
+
+ private Properties getProperties(String filePath) {
+ Properties properties = new Properties();
+ try (InputStream inputStream = new FileInputStream(filePath)) {
+ properties.load(inputStream);
+ } catch (Exception e) {
+ log.error("Failed to load properties from {}.", filePath, e);
+ }
+ return properties;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfigMapReconciler.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfigMapReconciler.java
new file mode 100644
index 00000000..89e7bc83
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/config/SparkOperatorConfigMapReconciler.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.config;
+
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+
+import io.fabric8.kubernetes.api.model.ConfigMap;
+import io.javaoperatorsdk.operator.api.config.informer.InformerConfiguration;
+import io.javaoperatorsdk.operator.api.reconciler.Context;
+import io.javaoperatorsdk.operator.api.reconciler.ControllerConfiguration;
+import io.javaoperatorsdk.operator.api.reconciler.ErrorStatusHandler;
+import io.javaoperatorsdk.operator.api.reconciler.ErrorStatusUpdateControl;
+import io.javaoperatorsdk.operator.api.reconciler.EventSourceContext;
+import io.javaoperatorsdk.operator.api.reconciler.EventSourceInitializer;
+import io.javaoperatorsdk.operator.api.reconciler.Reconciler;
+import io.javaoperatorsdk.operator.api.reconciler.UpdateControl;
+import io.javaoperatorsdk.operator.processing.event.rate.RateLimited;
+import io.javaoperatorsdk.operator.processing.event.source.EventSource;
+import io.javaoperatorsdk.operator.processing.event.source.informer.InformerEventSource;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorNamespace;
+
+/**
+ * Serves dynamic configuration for the Spark Operator.
+ * When enabled, the operator assumes the config file is located in the given config map.
+ * It keeps watching the config map and applies the changes when an update is detected.
+ */
+@ControllerConfiguration
+@RateLimited(maxReconciliations = 1, within = 30)
+@RequiredArgsConstructor
+@Slf4j
+public class SparkOperatorConfigMapReconciler implements Reconciler<ConfigMap>,
+ ErrorStatusHandler<ConfigMap>, EventSourceInitializer<ConfigMap> {
+ private final Function<Set<String>, Boolean> namespaceUpdater;
+
+ @Override
+ public ErrorStatusUpdateControl<ConfigMap> updateErrorStatus(ConfigMap resource,
+ Context<ConfigMap> context,
+ Exception e) {
+ log.error("Failed to reconcile dynamic config change.");
+ return ErrorStatusUpdateControl.noStatusUpdate();
+ }
+
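+ // Watches ConfigMaps in the operator namespace so config changes trigger reconciliation.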
+ @Override
+ public Map<String, EventSource> prepareEventSources(EventSourceContext<ConfigMap> context) {
+ var configMapEventSource =
+ new InformerEventSource<>(InformerConfiguration.from(ConfigMap.class, context)
+ .withNamespaces(OperatorNamespace.getValue())
+ .build(), context);
+ return EventSourceInitializer.nameEventSources(configMapEventSource);
+ }
+
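+ // Refreshes the hot property overrides from the config map data and pushes the latest
+ // watched-namespace set to the registered updater.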
+ @Override
+ public UpdateControl<ConfigMap> reconcile(ConfigMap resource, Context<ConfigMap> context)
+ throws Exception {
+ SparkOperatorConfManager.INSTANCE.refresh(resource.getData());
+ namespaceUpdater.apply(SparkReconcilerUtils.getWatchedNamespaces());
+ return UpdateControl.noUpdate();
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/BaseContext.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/BaseContext.java
new file mode 100644
index 00000000..6eafc440
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/BaseContext.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.controller;
+
+import io.fabric8.kubernetes.client.KubernetesClient;
+
+import org.apache.spark.kubernetes.operator.BaseResource;
+
+public abstract class BaseContext<CR extends BaseResource<?, ?, ?, ?, ?>> {
+ public abstract CR getResource();
+ public abstract KubernetesClient getClient();
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/SparkAppContext.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/SparkAppContext.java
new file mode 100644
index 00000000..ab4884b9
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/controller/SparkAppContext.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.controller;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.api.model.Pod;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.javaoperatorsdk.operator.api.reconciler.Context;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.SparkAppResourceSpec;
+import org.apache.spark.kubernetes.operator.SparkAppSubmissionWorker;
+import org.apache.spark.kubernetes.operator.SparkApplication;
+import org.apache.spark.kubernetes.operator.reconciler.SparkAppReconcileUtils;
+
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.driverLabels;
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.executorLabels;
+
+/**
+ * Context for the {@link org.apache.spark.kubernetes.operator.SparkApplication} resource.
+ * Includes secondary resource(s) and the desired secondary resource spec.
+ */
+@RequiredArgsConstructor
+@Slf4j
+public class SparkAppContext extends BaseContext<SparkApplication> {
+ private final SparkApplication sparkApplication;
+ private final Context<SparkApplication> josdkContext;
+ private final SparkAppSubmissionWorker submissionWorker;
+ private SparkAppResourceSpec secondaryResourceSpec;
+
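+ // Locates the driver pod among the secondary resources by matching the driver labels.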
+ public Optional<Pod> getDriverPod() {
+ return josdkContext.getSecondaryResourcesAsStream(Pod.class)
+ .filter(p -> p.getMetadata().getLabels().entrySet()
+ .containsAll(driverLabels(sparkApplication).entrySet()))
+ .findAny();
+ }
+
+ public Set<Pod> getExecutorsForApplication() {
+ return josdkContext.getSecondaryResourcesAsStream(Pod.class)
+ .filter(p -> p.getMetadata().getLabels().entrySet()
+ .containsAll(executorLabels(sparkApplication).entrySet()))
+ .collect(Collectors.toSet());
+ }
+
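+ // Lazily builds and caches the secondary resource spec for this context.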
+ private SparkAppResourceSpec getSecondaryResourceSpec() {
+ synchronized (this) {
+ if (secondaryResourceSpec == null) {
+ secondaryResourceSpec = SparkAppReconcileUtils.buildResourceSpec(sparkApplication,
+ josdkContext.getClient(), submissionWorker);
+ }
+ return secondaryResourceSpec;
+ }
+ }
+
+ @Override
+ public SparkApplication getResource() {
+ return sparkApplication;
+ }
+
+ @Override
+ public KubernetesClient getClient() {
+ return josdkContext.getClient();
+ }
+
+ public List<HasMetadata> getDriverPreResourcesSpec() {
+ return getSecondaryResourceSpec().getDriverPreResources();
+ }
+
+ public Pod getDriverPodSpec() {
+ return getSecondaryResourceSpec().getConfiguredPod();
+ }
+
+ public List<HasMetadata> getDriverResourcesSpec() {
+ return getSecondaryResourceSpec().getDriverResources();
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverDecorator.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverDecorator.java
new file mode 100644
index 00000000..618bc818
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverDecorator.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.decorators;
+
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.api.model.ObjectMeta;
+import io.fabric8.kubernetes.api.model.ObjectMetaBuilder;
+import lombok.RequiredArgsConstructor;
+
+import org.apache.spark.kubernetes.operator.SparkApplication;
+
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.sparkAppResourceLabels;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.buildOwnerReferenceTo;
+
+/**
+ * Decorates the driver pod to make sure its metadata matches the event source.
+ * Also adds an owner reference to the owning SparkApplication for garbage collection.
+ */
+@RequiredArgsConstructor
+public class DriverDecorator implements ResourceDecorator {
+
+ private final SparkApplication app;
+
+ /**
+ * Adds labels and an owner reference to the app for all secondary resources.
+ */
+ @Override
+ public <T extends HasMetadata> T decorate(T resource) {
+ ObjectMeta metaData = new ObjectMetaBuilder(resource.getMetadata())
+ .addToOwnerReferences(buildOwnerReferenceTo(app))
+ .addToLabels(sparkAppResourceLabels(app))
+ .withNamespace(app.getMetadata().getNamespace())
+ .build();
+ resource.setMetadata(metaData);
+ return resource;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverResourceDecorator.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverResourceDecorator.java
new file mode 100644
index 00000000..00ef9af6
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/decorators/DriverResourceDecorator.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.decorators;
+
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.api.model.ObjectMeta;
+import io.fabric8.kubernetes.api.model.ObjectMetaBuilder;
+import io.fabric8.kubernetes.api.model.OwnerReference;
+import io.fabric8.kubernetes.api.model.Pod;
+import lombok.RequiredArgsConstructor;
+import org.apache.commons.collections.CollectionUtils;
+
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.buildOwnerReferenceTo;
+
+/**
+ * Decorates Driver resources (except the pod).
+ * This makes sure all resources have an owner reference to the driver pod, so they can
+ * be garbage collected upon termination.
+ * Secondary resources are garbage-collected only when ALL of their owners are deleted.
+ * Therefore, the operator makes only the driver pod owned by the SparkApplication, while all
+ * other secondary resources are owned by the driver. In this way, after the driver pod is
+ * deleted at the end of each attempt, all other resources are garbage collected automatically.
+ */
+@RequiredArgsConstructor
+public class DriverResourceDecorator implements ResourceDecorator {
+ private final Pod driverPod;
+
+ @Override
+ public <T extends HasMetadata> T decorate(T resource) {
+ boolean ownerReferenceExists = false;
+ if (CollectionUtils.isNotEmpty(resource.getMetadata().getOwnerReferences())) {
+ for (OwnerReference o : resource.getMetadata().getOwnerReferences()) {
+ if (driverPod.getKind().equals(o.getKind())
+ && driverPod.getMetadata().getName().equals(o.getName())
+ && driverPod.getMetadata().getUid().equals(o.getUid())) {
+ ownerReferenceExists = true;
+ break;
+ }
+ }
+ }
+ if (!ownerReferenceExists) {
+ ObjectMeta metaData = new ObjectMetaBuilder(resource.getMetadata())
+ .addToOwnerReferences(buildOwnerReferenceTo(driverPod))
+ .addToLabels(driverPod.getMetadata().getLabels())
+ .build();
+ resource.setMetadata(metaData);
+ }
+ return resource;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/health/SentinelManager.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/health/SentinelManager.java
new file mode 100644
index 00000000..874fab8e
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/health/SentinelManager.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.health;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.javaoperatorsdk.operator.processing.event.ResourceID;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+
+import org.apache.spark.kubernetes.operator.BaseResource;
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConf;
+import org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils;
+
+import static org.apache.spark.kubernetes.operator.Constants.SENTINEL_LABEL;
+import static org.apache.spark.kubernetes.operator.Constants.SPARK_CONF_SENTINEL_DUMMY_FIELD;
+
+/**
+ * Sentinel manager monitors dedicated sentinel resources to make sure the operator is healthy
+ *
+ * @param <CR> custom resource type
+ */
+@RequiredArgsConstructor
+@Slf4j
+public class SentinelManager<CR extends BaseResource<?, ?, ?, ?, ?>> {
+
+ private final ConcurrentHashMap<ResourceID, SentinelResourceState> sentinelResources =
+ new ConcurrentHashMap<>();
+
+ private final ScheduledExecutorService executorService = Executors.newScheduledThreadPool(
+ SparkOperatorConf.SentinelExecutorServicePoolSize.getValue());
+
+ public static boolean isSentinelResource(HasMetadata resource) {
+ var labels = resource.getMetadata().getLabels();
+ if (labels == null) {
+ return false;
+ }
+ var namespace = resource.getMetadata().getNamespace();
+ return shouldSentinelWatchGivenNamespace(namespace)
+ && Boolean.TRUE.toString()
+ .equalsIgnoreCase(labels.getOrDefault(SENTINEL_LABEL, Boolean.FALSE.toString()));
+ }
+
+ private static boolean shouldSentinelWatchGivenNamespace(String namespace) {
+ if ((!SparkReconcilerUtils.getWatchedNamespaces().isEmpty())
+ && !SparkReconcilerUtils.getWatchedNamespaces().contains(namespace)) {
+ if (log.isErrorEnabled()) {
+ log.error("Skip watching sentinel resource in namespace {}", namespace);
+ }
+ return false;
+ }
+ return true;
+ }
+
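+ // Reports overall health across all tracked sentinels; sentinels in namespaces that are no
+ // longer watched are dropped from tracking.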
+ public boolean allSentinelsAreHealthy() {
+ Set<ResourceID> unWatchedKey = new HashSet<>();
+ var result = sentinelResources.entrySet().stream().filter(
+ x -> {
+ if (x.getKey().getNamespace().isPresent()
+ && shouldSentinelWatchGivenNamespace(x.getKey().getNamespace().get())) {
+ return true;
+ }
+ unWatchedKey.add(x.getKey());
+ return false;
+ }
+ ).map(Map.Entry::getValue).allMatch(SentinelResourceState::isHealthy);
+ sentinelResources.keySet().removeAll(unWatchedKey);
+ return result;
+ }
+
+ public void checkHealth(ResourceID resourceID, KubernetesClient client) {
+ SentinelResourceState sentinelResourceState = sentinelResources.get(resourceID);
+ if (sentinelResourceState == null) {
+ if (log.isErrorEnabled()) {
+ log.error("Sentinel resources {} not found. Stopping sentinel health checks",
+ resourceID);
+ }
+ return;
+ }
+
+ if (sentinelResourceState.reconciledSinceUpdate()) {
+ log.info("Sentinel reports healthy state globally");
+ sentinelResourceState.isHealthy = true;
+ } else {
+ if (log.isErrorEnabled()) {
+ log.error(
+ "Sentinel deployment {} latest spec was not reconciled. Expected " +
+ "generation larger than {}, received {}",
+ resourceID,
+ sentinelResourceState.previousGeneration,
+ sentinelResourceState.resource.getMetadata().getGeneration());
+ }
+ sentinelResourceState.isHealthy = false;
+ }
+
+ updateSpecAndScheduleHealthCheck(resourceID, sentinelResourceState, client);
+ }
+
+ public boolean handleSentinelResourceReconciliation(CR resource, KubernetesClient client) {
+ if (!isSentinelResource(resource)) {
+ return false;
+ }
+
+ var resourceId = ResourceID.fromResource(resource);
+ sentinelResources.compute(
+ resourceId,
+ (id, previousState) -> {
+ boolean firstReconcile = false;
+ if (previousState == null) {
+ firstReconcile = true;
+ previousState = new SentinelResourceState();
+ }
+ previousState.onReconcile(resource);
+ if (firstReconcile) {
+ updateSpecAndScheduleHealthCheck(resourceId, previousState, client);
+ }
+ return previousState;
+ });
+ return true;
+ }
+
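+ // Bumps a dummy value in the sentinel's spark conf so the spec (and hence its generation)
+ // changes, records the current generation, and schedules the next health check after the
+ // configured delay.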
+ private void updateSpecAndScheduleHealthCheck(ResourceID resourceID,
+ SentinelResourceState sentinelResourceState,
+ KubernetesClient client) {
+ var sparkConf = sentinelResourceState.resource.getSpec().getSparkConf();
+ sparkConf.compute(SPARK_CONF_SENTINEL_DUMMY_FIELD, (key, value) -> {
+ if (value == null) {
+ return "1";
+ } else {
+ return String.valueOf(Long.parseLong(value) + 1);
+ }
+ });
+ sentinelResourceState.previousGeneration =
+ sentinelResourceState.resource.getMetadata().getGeneration();
+ try {
+ if (log.isDebugEnabled()) {
+ log.debug("Update the sentinel kubernetes resource spec {}", sentinelResourceState);
+ }
+ client.resource(SparkReconcilerUtils.clone(sentinelResourceState.resource)).replace();
+ } catch (Throwable t) {
+ if (log.isWarnEnabled()) {
+ log.warn("Could not replace the sentinel deployment spark conf {}",
+ SPARK_CONF_SENTINEL_DUMMY_FIELD, t);
+ }
+ }
+ var delay = SparkOperatorConf.SENTINEL_RESOURCE_RECONCILIATION_DELAY.getValue();
+ if (log.isInfoEnabled()) {
+ log.info("Scheduling sentinel check for {} in {} seconds", resourceID, delay);
+ }
+ executorService.schedule(() -> checkHealth(resourceID, client),
+ delay,
+ TimeUnit.SECONDS);
+ }
+
+ public class SentinelResourceState {
+ CR resource;
+ long previousGeneration;
+
+ @Getter
+ boolean isHealthy = true;
+
+ void onReconcile(CR cr) {
+ resource = cr;
+ }
+
+ boolean reconciledSinceUpdate() {
+ return resource.getMetadata().getGeneration() > previousGeneration;
+ }
+
+ @Override
+ public String toString() {
+ return new ToStringBuilder(this)
+ .append("resource", resource)
+ .append("previousGeneration", previousGeneration)
+ .append("isHealthy", isHealthy)
+ .toString();
+ }
+ }
+
+ @VisibleForTesting
+ public ConcurrentHashMap<ResourceID, SentinelResourceState> getSentinelResources() {
+ return sentinelResources;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/BaseStatusListener.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/BaseStatusListener.java
new file mode 100644
index 00000000..3c6367ba
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/BaseStatusListener.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.listeners;
+
+import org.apache.spark.kubernetes.operator.BaseResource;
+import org.apache.spark.kubernetes.operator.status.BaseStatus;
+
+/**
+ * Custom listeners, if added, listen to resource status changes.
+ */
+public abstract class BaseStatusListener<STATUS extends BaseStatus<?, ?, ?>,
+ CR extends BaseResource<?, ?, ?, ?, STATUS>> {
+ public abstract void listenStatus(CR resource, STATUS prevStatus, STATUS updatedStatus);
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/SparkAppStatusListener.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/SparkAppStatusListener.java
new file mode 100644
index 00000000..e9043361
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/listeners/SparkAppStatusListener.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.listeners;
+
+import org.apache.spark.kubernetes.operator.SparkApplication;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+
+/**
+ * Custom listeners, if added, listen to Spark app status changes.
+ */
+public abstract class SparkAppStatusListener extends BaseStatusListener<ApplicationStatus, SparkApplication> {
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/JVMMetricSet.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/JVMMetricSet.java
new file mode 100644
index 00000000..7ca4bcb7
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/JVMMetricSet.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics;
+
+import java.lang.management.ManagementFactory;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.codahale.metrics.Metric;
+import com.codahale.metrics.MetricSet;
+import com.codahale.metrics.jvm.BufferPoolMetricSet;
+import com.codahale.metrics.jvm.FileDescriptorRatioGauge;
+import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
+import com.codahale.metrics.jvm.MemoryUsageGaugeSet;
+import com.codahale.metrics.jvm.ThreadStatesGaugeSet;
+
+public class JVMMetricSet implements MetricSet {
+ public static final String FILE_DESC_RATIO_OPEN_MAX = "fileDesc.ratio.open/max";
+ private final BufferPoolMetricSet bufferPoolMetricSet;
+ private final FileDescriptorRatioGauge fileDescriptorRatioGauge;
+ private final GarbageCollectorMetricSet garbageCollectorMetricSet;
+ private final MemoryUsageGaugeSet memoryUsageGaugeSet;
+ private final ThreadStatesGaugeSet threadStatesGaugeSet;
+
+ public JVMMetricSet() {
+ bufferPoolMetricSet = new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer());
+ fileDescriptorRatioGauge = new FileDescriptorRatioGauge();
+ garbageCollectorMetricSet = new GarbageCollectorMetricSet();
+ memoryUsageGaugeSet = new MemoryUsageGaugeSet();
+ threadStatesGaugeSet = new ThreadStatesGaugeSet();
+ }
+
+ @Override
+ public Map<String, Metric> getMetrics() {
+ final Map<String, Metric> jvmMetrics = new HashMap<>();
+ putAllMetrics(jvmMetrics, bufferPoolMetricSet, "bufferPool");
+ jvmMetrics.put(FILE_DESC_RATIO_OPEN_MAX, fileDescriptorRatioGauge);
+ putAllMetrics(jvmMetrics, garbageCollectorMetricSet, "gc");
+ putAllMetrics(jvmMetrics, memoryUsageGaugeSet, "memoryUsage");
+ putAllMetrics(jvmMetrics, threadStatesGaugeSet, "threadStates");
+ return jvmMetrics;
+ }
+
+ private void putAllMetrics(final Map<String, Metric> destination, final MetricSet origin,
+ final String prefix) {
+ for (Map.Entry<String, Metric> entry : origin.getMetrics().entrySet()) {
+ destination.put(prefix + "." + entry.getKey(), entry.getValue());
+ }
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsService.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsService.java
new file mode 100644
index 00000000..313a2d9d
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsService.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.metrics;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.List;
+import java.util.Optional;
+
+import com.sun.net.httpserver.HttpServer;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.metrics.sink.PrometheusPullModelSink;
+import org.apache.spark.metrics.sink.Sink;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorMetricsPort;
+
+@Slf4j
+public class MetricsService {
+ HttpServer server;
+ MetricsSystem metricsSystem;
+
+ public MetricsService(MetricsSystem metricsSystem) {
+ this.metricsSystem = metricsSystem;
+ try {
+ server = HttpServer.create(new InetSocketAddress(OperatorMetricsPort.getValue()), 0);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to create Metrics Server", e);
+ }
+ server.setExecutor(null);
+ }
+
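+ // Exposes the Prometheus pull sink, if one is configured, at the /prometheus endpoint
+ // before starting the HTTP server.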
+ public void start() {
+ log.info("Metrics Service started");
+ List<Sink> sinks = metricsSystem.getSinks();
+ Optional<Sink> instanceOptional =
+ sinks.stream().filter(x -> x instanceof PrometheusPullModelSink).findAny();
+ instanceOptional.ifPresent(sink ->
+ server.createContext("/prometheus", (PrometheusPullModelSink) sink));
+ server.start();
+ }
+
+ public void stop() {
+ log.info("Metrics Service stopped");
+ server.stop(0);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystem.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystem.java
new file mode 100644
index 00000000..11203e7b
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystem.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.codahale.metrics.MetricFilter;
+import com.codahale.metrics.MetricRegistry;
+import lombok.Data;
+import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.metrics.source.JVMSource;
+import org.apache.spark.metrics.sink.Sink;
+import org.apache.spark.metrics.source.Source;
+import org.apache.spark.util.Utils;
+
+@Slf4j
+public class MetricsSystem {
+ private AtomicBoolean running = new AtomicBoolean(false);
+ @Getter
+ private List<Sink> sinks;
+ @Getter
+ private List<Source> sources;
+ @Getter
+ private MetricRegistry registry;
+ private Properties properties;
+ private Map<String, SinkProps> sinkPropertiesMap;
+
+ public MetricsSystem() {
+ this.sinks = new ArrayList<>();
+ this.sources = new ArrayList<>();
+ this.registry = new MetricRegistry();
+ this.properties = new Properties();
+ this.sinkPropertiesMap = new HashMap<>();
+ }
+
+ public MetricsSystem(Properties properties) {
+ this.sinks = new ArrayList<>();
+ this.sources = new ArrayList<>();
+ this.registry = new MetricRegistry();
+ this.properties = properties;
+ this.sinkPropertiesMap = MetricsSystemFactory.parseSinkProperties(this.properties);
+ }
+
+ public void start() {
+ if (running.get()) {
+ throw new IllegalStateException(
+ "Attempting to start a MetricsSystem that is already running");
+ }
+ running.set(true);
+ registerSources();
+ registerSinks();
+ sinks.forEach(Sink::start);
+ }
+
+ public void stop() {
+ if (running.get()) {
+ sinks.forEach(Sink::stop);
+ registry.removeMatching(MetricFilter.ALL);
+ } else {
+ log.error("Stopping a MetricsSystem that is not running");
+ }
+ running.set(false);
+ }
+
+ public void report() {
+ sinks.forEach(Sink::report);
+ }
+
+ public void registerSinks() {
+ log.info("sinkPropertiesMap: {}", sinkPropertiesMap);
+ sinkPropertiesMap.values().forEach(sinkProp -> {
+ Class<Sink> sink = Utils.classForName(sinkProp.getClassName(), true, false);
+ Sink sinkInstance;
+ try {
+ sinkInstance = sink.getConstructor(Properties.class, MetricRegistry.class)
+ .newInstance(sinkProp.getProperties(), registry);
+ } catch (Exception e) {
+ if (log.isErrorEnabled()) {
+ log.error("Fail to create metrics sink for sink name {}, sink properties {}",
+ sinkProp.getClassName(), sinkProp.getProperties());
+ }
+ throw new RuntimeException("Fail to create metrics sink", e);
+ }
+ sinks.add(sinkInstance);
+ });
+ }
+
+ private void registerSources() {
+ // TODO: parse the properties to configure sources
+ registerSource(new JVMSource());
+ }
+
+ public void registerSource(Source source) {
+ sources.add(source);
+ try {
+ String regName = MetricRegistry.name(source.sourceName());
+ registry.register(regName, source.metricRegistry());
+ } catch (IllegalArgumentException e) {
+ log.error("Metrics already registered", e);
+ }
+ }
+
+ @Data
+ public static class SinkProps {
+ String className;
+ Properties properties;
+
+ public SinkProps() {
+ this.className = "";
+ this.properties = new Properties();
+ }
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystemFactory.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystemFactory.java
new file mode 100644
index 00000000..ce2243fe
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/MetricsSystemFactory.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics;
+
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.spark.kubernetes.operator.config.SparkOperatorConfManager;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.CLASS;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.METRIC_PREFIX;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.SINK;
+
+public class MetricsSystemFactory {
+ public static MetricsSystem createMetricsSystem() {
+ Properties properties =
+ parseMetricsProperties(SparkOperatorConfManager.INSTANCE.getMetricsProperties());
+ return new MetricsSystem(properties);
+ }
+
+ private static Properties parseMetricsProperties(Properties userProperties) {
+ Properties properties = new Properties();
+ Enumeration<?> valueEnumeration = userProperties.propertyNames();
+ while (valueEnumeration.hasMoreElements()) {
+ String key = (String) valueEnumeration.nextElement();
+ if (key.startsWith(METRIC_PREFIX)) {
+ properties.put(key.substring(METRIC_PREFIX.length()),
+ userProperties.getProperty(key));
+ }
+ }
+ return properties;
+ }
+
+ public static Map<String, MetricsSystem.SinkProps> parseSinkProperties(
+ Properties metricsProperties) {
+ Map<String, MetricsSystem.SinkProps> propertiesMap = new HashMap<>();
+ // e.g: "sink.graphite.class"="org.apache.spark.metrics.sink.GraphiteSink"
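+ // yields a SinkProps keyed by the short name "graphite" with its className set; the
+ // remaining "sink.graphite.*" entries become that sink's properties.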
+ Enumeration<?> valueEnumeration = metricsProperties.propertyNames();
+ while (valueEnumeration.hasMoreElements()) {
+ String key = (String) valueEnumeration.nextElement();
+ int firstDotIndex = StringUtils.ordinalIndexOf(key, ".", 1);
+ int secondDotIndex = StringUtils.ordinalIndexOf(key, ".", 2);
+ if (key.startsWith(SINK)) {
+ String shortName = key.substring(firstDotIndex + 1, secondDotIndex);
+ MetricsSystem.SinkProps sinkProps =
+ propertiesMap.getOrDefault(shortName, new MetricsSystem.SinkProps());
+ if (key.endsWith(CLASS)) {
+ sinkProps.setClassName(metricsProperties.getProperty(key));
+ } else {
+ sinkProps.getProperties().put(key.substring(secondDotIndex + 1),
+ metricsProperties.getProperty(key));
+ }
+ propertiesMap.put(shortName, sinkProps);
+ }
+ }
+ sinkPropertiesSanityCheck(propertiesMap);
+ return propertiesMap;
+ }
+
+ private static void sinkPropertiesSanityCheck(
+ Map<String, MetricsSystem.SinkProps> sinkPropsMap) {
+ for (Map.Entry<String, MetricsSystem.SinkProps> pair : sinkPropsMap.entrySet()) {
+ // Each Sink should have mapping class full name
+ if (StringUtils.isBlank(pair.getValue().className)) {
+ String errorMessage = String.format(
+ "%s provides properties, but does not provide full class name",
+ pair.getKey());
+ throw new RuntimeException(errorMessage);
+ }
+ // Check the existence of each class full name
+ try {
+ Class.forName(pair.getValue().getClassName());
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(
+ String.format("Fail to find class %s", pair.getValue().getClassName()), e);
+ }
+ }
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/sink/PrometheusPullModelSink.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/sink/PrometheusPullModelSink.java
new file mode 100644
index 00000000..52aae782
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/sink/PrometheusPullModelSink.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.metrics.sink;
+
+import javax.servlet.http.HttpServletRequest;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import com.codahale.metrics.MetricRegistry;
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.metrics.sink.PrometheusServlet;
+
+import static org.apache.spark.kubernetes.operator.utils.ProbeUtil.sendMessage;
+
+@Slf4j
+public class PrometheusPullModelSink extends PrometheusServlet implements HttpHandler {
+ public PrometheusPullModelSink(Properties properties, MetricRegistry registry) {
+ super(properties, registry);
+ }
+
+ @Override
+ public void start() {
+ log.info("PrometheusPullModelSink started");
+ }
+
+ @Override
+ public void stop() {
+ log.info("PrometheusPullModelSink stopped");
+ }
+
+ @Override
+ public void report() {
+ //no-op
+ }
+
+ @Override
+ public void handle(HttpExchange exchange) throws IOException {
+ // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala#L50
+ // Temporary solution since PrometheusServlet.getMetricsSnapshot method does not use
+ // httpServletRequest at all
+ HttpServletRequest httpServletRequest = null;
+ String value = getMetricsSnapshot(httpServletRequest);
+ // Prometheus will have invalid syntax exception while parsing value equal to "[]", e.g:
+ // metrics_jvm_threadStates_deadlocks_Number{type="gauges"} []
+ // metrics_jvm_threadStates_deadlocks_Value{type="gauges"} []
+ String[] records = value.split("\n");
+ List filteredRecords = new ArrayList<>();
+ for (String record : records) {
+ String[] keyValuePair = record.split(" ");
+ if ("[]".equals(keyValuePair[1])) {
+ log.info("Bug identified strconv.ParseFloat: parsing []");
+ continue;
+ }
+ filteredRecords.add(record);
+ }
+ sendMessage(exchange, 200, String.join("\n", filteredRecords));
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/JVMSource.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/JVMSource.java
new file mode 100644
index 00000000..07c44a78
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/JVMSource.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics.source;
+
+import com.codahale.metrics.MetricRegistry;
+
+import org.apache.spark.kubernetes.operator.metrics.JVMMetricSet;
+import org.apache.spark.metrics.source.Source;
+
+public class JVMSource implements Source {
+
+ @Override
+ public String sourceName() {
+ return "jvm";
+ }
+
+ @Override
+ public MetricRegistry metricRegistry() {
+ MetricRegistry metricRegistry = new MetricRegistry();
+ metricRegistry.registerAll(new JVMMetricSet());
+ return metricRegistry;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/KubernetesMetricsInterceptor.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/KubernetesMetricsInterceptor.java
new file mode 100644
index 00000000..efad902d
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/KubernetesMetricsInterceptor.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics.source;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+
+import com.codahale.metrics.Histogram;
+import com.codahale.metrics.Meter;
+import com.codahale.metrics.MetricRegistry;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.Interceptor;
+import okhttp3.Request;
+import okhttp3.Response;
+import org.apache.commons.lang3.tuple.Pair;
+import org.jetbrains.annotations.NotNull;
+
+import org.apache.spark.metrics.source.Source;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.KubernetesClientMetricsGroupByResponseCodeGroupEnabled;
+
+@Slf4j
+public class KubernetesMetricsInterceptor implements Interceptor, Source {
+ MetricRegistry metricRegistry;
+ public static final String NAMESPACES = "namespaces";
+ public static final String HTTP_REQUEST_GROUP = "http.request";
+ public static final String HTTP_REQUEST_FAILED_GROUP = "failed";
+ public static final String HTTP_RESPONSE_GROUP = "http.response";
+ public static final String HTTP_RESPONSE_1XX = "1xx";
+ public static final String HTTP_RESPONSE_2XX = "2xx";
+ public static final String HTTP_RESPONSE_3XX = "3xx";
+ public static final String HTTP_RESPONSE_4XX = "4xx";
+ public static final String HTTP_RESPONSE_5XX = "5xx";
+ private final Histogram responseLatency;
+ private final Map<Integer, Meter> responseCodeMeters =
+ new ConcurrentHashMap<>();
+ private final Map<String, Meter> requestMethodCounter = new ConcurrentHashMap<>();
+ private final List<Meter> responseCodeGroupMeters = new ArrayList<>(5);
+ private final Meter requestFailedRateMeter;
+ private final Meter requestRateMeter;
+ private final Meter responseRateMeter;
+ private final Map<String, Meter> namespacedResourceMethodMeters = new ConcurrentHashMap<>();
+
+ public KubernetesMetricsInterceptor() {
+ metricRegistry = new MetricRegistry();
+
+ responseLatency = metricRegistry.histogram(
+ MetricRegistry.name(HTTP_RESPONSE_GROUP, "latency", "nanos").toLowerCase());
+ requestFailedRateMeter =
+ metricRegistry.meter(MetricRegistry.name(HTTP_REQUEST_FAILED_GROUP).toLowerCase());
+ requestRateMeter =
+ metricRegistry.meter(MetricRegistry.name(HTTP_REQUEST_GROUP).toLowerCase());
+ responseRateMeter =
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_GROUP).toLowerCase());
+
+ if (KubernetesClientMetricsGroupByResponseCodeGroupEnabled.getValue()) {
+ responseCodeGroupMeters.add(
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_1XX).toLowerCase()));
+ responseCodeGroupMeters.add(
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_2XX).toLowerCase()));
+ responseCodeGroupMeters.add(
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_3XX).toLowerCase()));
+ responseCodeGroupMeters.add(
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_4XX).toLowerCase()));
+ responseCodeGroupMeters.add(
+ metricRegistry.meter(MetricRegistry.name(HTTP_RESPONSE_5XX).toLowerCase()));
+ }
+ }
+
+ @NotNull
+ @Override
+ public Response intercept(@NotNull Chain chain) throws IOException {
+ Request request = chain.request();
+ updateRequestMetrics(request);
+ Response response = null;
+ final long startTime = System.nanoTime();
+ try {
+ response = chain.proceed(request);
+ return response;
+ } finally {
+ updateResponseMetrics(response, startTime);
+ }
+ }
+
+ @Override
+ public String sourceName() {
+ return "kubernetes.client";
+ }
+
+ @Override
+ public MetricRegistry metricRegistry() {
+ return this.metricRegistry;
+ }
+
+ private void updateRequestMetrics(Request request) {
+ this.requestRateMeter.mark();
+ getMeterByRequestMethod(request.method()).mark();
+ Optional<Pair<String, String>> resourceNamePairOptional =
+ parseNamespaceScopedResource(request.url().uri().getPath());
+ resourceNamePairOptional.ifPresent(pair -> {
+ getMeterByRequestMethodAndResourceName(
+ pair.getValue(), request.method()).mark();
+ getMeterByRequestMethodAndResourceName(
+ pair.getKey() + "." + pair.getValue(),
+ request.method()).mark();
+ }
+ );
+ }
+
+ private void updateResponseMetrics(Response response, long startTimeNanos) {
+ final long latency = System.nanoTime() - startTimeNanos;
+ if (response != null) {
+ this.responseRateMeter.mark();
+ this.responseLatency.update(latency);
+ getMeterByResponseCode(response.code()).mark();
+ if (KubernetesClientMetricsGroupByResponseCodeGroupEnabled.getValue()) {
+ responseCodeGroupMeters.get(response.code() / 100 - 1).mark();
+ }
+ } else {
+ this.requestFailedRateMeter.mark();
+ }
+ }
+
+ private Meter getMeterByRequestMethod(String method) {
+ return requestMethodCounter.computeIfAbsent(
+ method,
+ key ->
+ metricRegistry.meter(
+ MetricRegistry.name(HTTP_REQUEST_GROUP, method).toLowerCase()));
+ }
+
+ private Meter getMeterByRequestMethodAndResourceName(String resourceName, String method) {
+ String metricsName = MetricRegistry.name(resourceName, method);
+ return namespacedResourceMethodMeters.computeIfAbsent(
+ metricsName,
+ key ->
+ metricRegistry.meter(metricsName.toLowerCase()));
+ }
+
+ private Meter getMeterByResponseCode(int code) {
+ return responseCodeMeters.computeIfAbsent(code,
+ key -> metricRegistry.meter(
+ MetricRegistry.name(HTTP_RESPONSE_GROUP, String.valueOf(code))));
+ }
+
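+ // Extracts the (namespace, resource type) pair from a namespaced request path such as
+ // "/api/v1/namespaces/{namespace}/pods/...".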
+ public Optional<Pair<String, String>> parseNamespaceScopedResource(String path) {
+ if (path.contains(NAMESPACES)) {
+ var index = path.indexOf(NAMESPACES) + NAMESPACES.length();
+ String namespaceAndResources = path.substring(index + 1);
+ String[] parts = namespaceAndResources.split("/");
+ return Optional.of(Pair.of(parts[0], parts[1]));
+ } else {
+ return Optional.empty();
+ }
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/OperatorJosdkMetrics.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/OperatorJosdkMetrics.java
new file mode 100644
index 00000000..8fa15110
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/metrics/source/OperatorJosdkMetrics.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.metrics.source;
+
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.Histogram;
+import com.codahale.metrics.MetricRegistry;
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.javaoperatorsdk.operator.api.monitoring.Metrics;
+import io.javaoperatorsdk.operator.api.reconciler.Constants;
+import io.javaoperatorsdk.operator.api.reconciler.RetryInfo;
+import io.javaoperatorsdk.operator.processing.Controller;
+import io.javaoperatorsdk.operator.processing.GroupVersionKind;
+import io.javaoperatorsdk.operator.processing.event.Event;
+import io.javaoperatorsdk.operator.processing.event.ResourceID;
+import io.javaoperatorsdk.operator.processing.event.source.controller.ResourceEvent;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.BaseResource;
+import org.apache.spark.kubernetes.operator.SparkApplication;
+import org.apache.spark.metrics.source.Source;
+import org.apache.spark.util.Clock;
+import org.apache.spark.util.SystemClock;
+
+import static io.javaoperatorsdk.operator.api.reconciler.Constants.CONTROLLER_NAME;
+
+@Slf4j
+public class OperatorJosdkMetrics implements Source, Metrics {
+ public static final String FINISHED = "finished";
+ public static final String CLEANUP = "cleanup";
+ public static final String FAILED = "failed";
+ public static final String RETRIES = "retries";
+    private final Map<String, Histogram> histograms = new ConcurrentHashMap<>();
+    private final Map<String, Counter> counters = new ConcurrentHashMap<>();
+    private final Map<String, Gauge<?>> gauges = new ConcurrentHashMap<>();
+ private static final String RECONCILIATION = "reconciliation";
+ private static final String RESOURCE = "resource";
+ private static final String EVENT = "event";
+ private static final String SUCCESS = "success";
+ private static final String FAILURE = "failure";
+ private static final String EXCEPTION = "exception";
+ private static final String PREFIX = "operator.sdk";
+ private static final String RECONCILIATIONS = "reconciliations";
+ private static final String RECONCILIATIONS_EXECUTIONS = RECONCILIATIONS + ".executions";
+ private static final String RECONCILIATIONS_QUEUE_SIZE = RECONCILIATIONS + ".queue.size";
+ private static final String SIZE = "size";
+
+ private final Clock clock;
+ private final MetricRegistry metricRegistry;
+
+ public OperatorJosdkMetrics() {
+ this.clock = new SystemClock();
+ this.metricRegistry = new MetricRegistry();
+ }
+
+ @Override
+ public String sourceName() {
+ return PREFIX;
+ }
+
+ @Override
+ public MetricRegistry metricRegistry() {
+ return metricRegistry;
+ }
+
+ @Override
+    public void controllerRegistered(Controller<? extends HasMetadata> controller) {
+ // no-op
+ log.debug("Controller has been registered");
+ }
+
+ @Override
+    public void receivedEvent(Event event, Map<String, Object> metadata) {
+ log.debug("received event {}, metadata {}", event, metadata);
+ if (event instanceof ResourceEvent) {
+ final var action = ((ResourceEvent) event).getAction();
+ final var resource = getResourceClass(metadata);
+ final var namespaceOptional = event.getRelatedCustomResourceID().getNamespace();
+ resource.ifPresent(aClass -> getCounter(aClass, action.name().toLowerCase(), RESOURCE,
+ EVENT).inc());
+ if (resource.isPresent() && namespaceOptional.isPresent()) {
+ getCounter(resource.get(), namespaceOptional.get(), action.name().toLowerCase(),
+ RESOURCE, EVENT).inc();
+ }
+ }
+ }
+
+ @Override
+    public <T> T timeControllerExecution(ControllerExecution<T> execution) throws Exception {
+ log.debug("Time controller execution");
+ final var name = execution.controllerName();
+ final var resourceID = execution.resourceID();
+ final var namespaceOptional = resourceID.getNamespace();
+ final var metadata = execution.metadata();
+ final var resourceClass = getResourceClass(metadata);
+ final var execName = execution.name();
+
+ long startTime = clock.getTimeMillis();
+ try {
+ T result = execution.execute();
+ final var successType = execution.successTypeName(result);
+ if (resourceClass.isPresent()) {
+ getHistogram(resourceClass.get(), name, execName, successType).update(
+ toSeconds(startTime));
+ getCounter(resourceClass.get(), name, execName, SUCCESS, successType).inc();
+ if (namespaceOptional.isPresent()) {
+ getHistogram(resourceClass.get(), namespaceOptional.get(), name, execName,
+ successType).update(toSeconds(startTime));
+ getCounter(resourceClass.get(), namespaceOptional.get(), name, execName,
+ SUCCESS, successType).inc();
+ }
+ }
+ return result;
+ } catch (Exception e) {
+ log.error("Controller execution failed for resource {}, metadata {}", resourceID,
+ metadata, e);
+ final var exception = e.getClass().getSimpleName();
+ if (resourceClass.isPresent()) {
+ getHistogram(resourceClass.get(), name, execName, FAILURE).update(
+ toSeconds(startTime));
+ getCounter(resourceClass.get(), name, execName, FAILURE, EXCEPTION,
+ exception).inc();
+ if (namespaceOptional.isPresent()) {
+ getHistogram(resourceClass.get(), namespaceOptional.get(), name, execName,
+ FAILURE).update(toSeconds(startTime));
+ getCounter(resourceClass.get(), namespaceOptional.get(), name, execName,
+ FAILURE, EXCEPTION, exception).inc();
+ }
+ }
+ throw e;
+ }
+ }
+
+ @Override
+ public void reconcileCustomResource(HasMetadata resource, RetryInfo retryInfo,
+                                        Map<String, Object> metadata) {
+ log.debug("Reconcile custom resource {}, with retryInfo {} metadata {}", resource,
+ retryInfo, metadata);
+ if (retryInfo != null) {
+ final var namespace = resource.getMetadata().getNamespace();
+ getCounter(resource.getClass(), RECONCILIATION, RETRIES).inc();
+ getCounter(resource.getClass(), namespace, RECONCILIATION, RETRIES).inc();
+ }
+ getCounter(resource.getClass(), (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_QUEUE_SIZE).inc();
+ }
+
+ @Override
+ public void failedReconciliation(HasMetadata resource, Exception exception,
+                                     Map<String, Object> metadata) {
+        log.error("Failed reconciliation for resource {} with metadata {}", resource, metadata,
+ exception);
+ getCounter(resource.getClass(), RECONCILIATION, FAILED).inc();
+ getCounter(resource.getClass(), resource.getMetadata().getNamespace(), RECONCILIATION,
+ FAILED).inc();
+ }
+
+ @Override
+    public void finishedReconciliation(HasMetadata resource, Map<String, Object> metadata) {
+ log.debug("Finished reconciliation for resource {} with metadata {}", resource, metadata);
+ getCounter(resource.getClass(), RECONCILIATION, FINISHED).inc();
+ getCounter(resource.getClass(), resource.getMetadata().getNamespace(), RECONCILIATION,
+                FINISHED).inc();
+ }
+
+ @Override
+    public void cleanupDoneFor(ResourceID resourceID, Map<String, Object> metadata) {
+ log.debug("Cleanup Done for resource {} with metadata {}", resourceID, metadata);
+ getCounter(resourceID.getClass(), RECONCILIATION, CLEANUP).inc();
+ resourceID.getNamespace().ifPresent(
+ ns -> getCounter(resourceID.getClass(), ns, RECONCILIATION, CLEANUP).inc());
+ }
+
+ @Override
+    public <T extends Map<?, ?>> T monitorSizeOf(T map, String name) {
+ log.debug("Monitor size for {}", name);
+ var gauge = new Gauge<>() {
+ @Override
+ public Integer getValue() {
+ return map.size();
+ }
+ };
+ gauges.put(MetricRegistry.name(name, SIZE), gauge);
+ return map;
+ }
+
+ @Override
+    public void reconciliationExecutionStarted(HasMetadata resource,
+                                               Map<String, Object> metadata) {
+ log.debug("Reconciliation execution started");
+ var namespace = resource.getMetadata().getNamespace();
+ getCounter(resource.getClass(), (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_EXECUTIONS).inc();
+ getCounter(resource.getClass(), namespace, (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_EXECUTIONS).inc();
+ }
+
+ @Override
+ public void reconciliationExecutionFinished(HasMetadata resource,
+                                                Map<String, Object> metadata) {
+ log.debug("Reconciliation execution finished");
+ var namespace = resource.getMetadata().getNamespace();
+ getCounter(resource.getClass(), (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_EXECUTIONS).dec();
+ getCounter(resource.getClass(), namespace, (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_EXECUTIONS).dec();
+ getCounter(resource.getClass(), (String) metadata.get(CONTROLLER_NAME),
+ RECONCILIATIONS_QUEUE_SIZE).dec();
+ }
+
+ private long toSeconds(long startTimeInMilliseconds) {
+ return TimeUnit.MILLISECONDS.toSeconds(clock.getTimeMillis() - startTimeInMilliseconds);
+ }
+
+    private Histogram getHistogram(Class<?> kclass, String... names) {
+ String name = MetricRegistry.name(kclass.getSimpleName(), names).toLowerCase();
+ Histogram histogram;
+ if (!histograms.containsKey(name)) {
+ histogram = metricRegistry.histogram(name);
+ histograms.put(name, histogram);
+ } else {
+ histogram = histograms.get(name);
+ }
+ return histogram;
+ }
+
+    private Counter getCounter(Class<?> klass, String... names) {
+ String name = MetricRegistry.name(klass.getSimpleName(), names).toLowerCase();
+ Counter counter;
+ if (!counters.containsKey(name)) {
+ counter = metricRegistry.counter(name);
+ counters.put(name, counter);
+ } else {
+ counter = counters.get(name);
+ }
+ return counter;
+ }
+
+    private Optional<Class<? extends BaseResource<?, ?, ?, ?, ?>>> getResourceClass(
+            Map<String, Object> metadata) {
+ var resourceGvk = (GroupVersionKind) metadata.get(Constants.RESOURCE_GVK_KEY);
+
+ if (resourceGvk == null) {
+ return Optional.empty();
+ }
+
+        Class<? extends BaseResource<?, ?, ?, ?, ?>> resourceClass;
+
+ if (resourceGvk.getKind().equals(SparkApplication.class.getSimpleName())) {
+ resourceClass = SparkApplication.class;
+ } else {
+ return Optional.empty();
+ }
+ return Optional.of(resourceClass);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/HealthProbe.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/HealthProbe.java
new file mode 100644
index 00000000..1128b511
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/HealthProbe.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.probe;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+import io.javaoperatorsdk.operator.Operator;
+import io.javaoperatorsdk.operator.RuntimeInfo;
+import io.javaoperatorsdk.operator.health.InformerHealthIndicator;
+import io.javaoperatorsdk.operator.health.Status;
+import lombok.Getter;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.health.SentinelManager;
+
+import static org.apache.spark.kubernetes.operator.utils.ProbeUtil.areOperatorsStarted;
+import static org.apache.spark.kubernetes.operator.utils.ProbeUtil.sendMessage;
+
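+/**
+ * Handler for the health (liveness) probe endpoint. The operator is considered healthy only
+ * when all operators have started, all registered informers report a healthy status and every
+ * registered sentinel manager reports its sentinels as healthy.
+ */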
+@Getter
+@Slf4j
+public class HealthProbe implements HttpHandler {
+    private final List<Operator> operators;
+    private final List<SentinelManager<?>> sentinelManagers = new ArrayList<>();
+
+    public HealthProbe(List<Operator> operators) {
+ this.operators = operators;
+ }
+
+    public void registerSentinelResourceManager(SentinelManager<?> sentinelManager) {
+ sentinelManagers.add(sentinelManager);
+ }
+
+ public boolean isHealthy() {
+ var operatorsAreReady = areOperatorsStarted(operators);
+ if (operatorsAreReady.isEmpty() || !operatorsAreReady.get()) {
+ return false;
+ }
+
+ var runtimeInfosAreHealthy = operators.stream().map(operator ->
+ checkInformersHealth(operator.getRuntimeInfo())
+ ).reduce((a, b) -> a && b);
+
+ if (runtimeInfosAreHealthy.isEmpty() || !runtimeInfosAreHealthy.get()) {
+ return false;
+ }
+
+        for (SentinelManager<?> sentinelManager : sentinelManagers) {
+ if (!sentinelManager.allSentinelsAreHealthy()) {
+ log.error("One sentinel manager {} reported an unhealthy condition.",
+ sentinelManager);
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ @Override
+ public void handle(HttpExchange exchange) throws IOException {
+ if (isHealthy()) {
+ sendMessage(exchange, 200, "healthy");
+ } else {
+ sendMessage(exchange, 500, "unhealthy");
+ }
+ }
+
+ private boolean checkInformersHealth(RuntimeInfo operatorRuntimeInfo) {
+ log.debug("Checking informer health");
+        List<Boolean> informersHealthList = new ArrayList<>();
+ for (var controllerEntry :
+ operatorRuntimeInfo.unhealthyInformerWrappingEventSourceHealthIndicator()
+ .entrySet()) {
+ for (var eventSourceEntry : controllerEntry.getValue().entrySet()) {
+                Map<String, InformerHealthIndicator> informers =
+ eventSourceEntry.getValue().informerHealthIndicators();
+ for (var informerEntry : informers.entrySet()) {
+ if (informerEntry.getValue().getStatus() == Status.HEALTHY) {
+ informersHealthList.add(true);
+ } else {
+ if (log.isErrorEnabled()) {
+ log.error(
+ "Controller: {}, Event Source: {}, Informer: {} is not in a " +
+ "healthy state",
+ controllerEntry.getKey(), eventSourceEntry.getKey(),
+ informerEntry.getKey());
+ }
+ informersHealthList.add(false);
+ }
+ }
+ }
+ }
+ return informersHealthList.stream().reduce((a, b) -> a && b).orElse(true);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ProbeService.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ProbeService.java
new file mode 100644
index 00000000..8eb48856
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ProbeService.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.probe;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.List;
+
+import com.sun.net.httpserver.HttpServer;
+import io.javaoperatorsdk.operator.Operator;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.health.SentinelManager;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorProbePort;
+
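+/**
+ * Lightweight HTTP server that exposes the readiness ({@code /readyz}) and health
+ * ({@code /healthz}) probe endpoints on the configured operator probe port.
+ */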
+@Slf4j
+public class ProbeService {
+ public static final String HEALTHZ = "/healthz";
+ public static final String READYZ = "/readyz";
+ HttpServer server;
+
+    public ProbeService(List<Operator> operators, SentinelManager<?> sentinelManager) {
+ HealthProbe healthProbe = new HealthProbe(operators);
+ healthProbe.registerSentinelResourceManager(sentinelManager);
+ try {
+ server = HttpServer.create(new InetSocketAddress(OperatorProbePort.getValue()), 0);
+ } catch (IOException e) {
+ throw new RuntimeException("Failed to create Probe Service Server", e);
+ }
+ server.createContext(READYZ, new ReadinessProbe(operators));
+ server.createContext(HEALTHZ, healthProbe);
+ server.setExecutor(null);
+ }
+
+ public void start() {
+ log.info("Probe service started");
+ server.start();
+ }
+
+ public void stop() {
+ log.info("Probe service stopped");
+ server.stop(0);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ReadinessProbe.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ReadinessProbe.java
new file mode 100644
index 00000000..6ea7be8e
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/probe/ReadinessProbe.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.kubernetes.operator.probe;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+import io.javaoperatorsdk.operator.Operator;
+import lombok.extern.slf4j.Slf4j;
+
+import static org.apache.spark.kubernetes.operator.utils.ProbeUtil.areOperatorsStarted;
+import static org.apache.spark.kubernetes.operator.utils.ProbeUtil.sendMessage;
+
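+/**
+ * Handler for the readiness probe endpoint. The operator is reported ready once all operator
+ * instances have started and the RBAC pre-check passes.
+ */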
+@Slf4j
+public class ReadinessProbe implements HttpHandler {
+    private final List<Operator> operators;
+
+    public ReadinessProbe(List<Operator> operators) {
+ this.operators = operators;
+ }
+
+ @Override
+ public void handle(HttpExchange httpExchange) throws IOException {
+ var operatorsAreReady = areOperatorsStarted(operators);
+        if (operatorsAreReady.isEmpty() || !operatorsAreReady.get()) {
+            sendMessage(httpExchange, 400, "spark operators are not ready yet");
+            return;
+        }
+
+        if (!passRbacCheck()) {
+            sendMessage(httpExchange, 403, "required rbac test failed, operators are not ready");
+            return;
+        }
+
+ sendMessage(httpExchange, 200, "started");
+ }
+
+ public boolean passRbacCheck() {
+ return true;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/ReconcileProgress.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/ReconcileProgress.java
new file mode 100644
index 00000000..b115733c
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/ReconcileProgress.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler;
+
+import java.time.Duration;
+
+import lombok.Data;
+
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.SparkAppReconcileIntervalSeconds;
+
+/**
+ * Represents the progress of a reconcile request.
+ * - completed : set to true if no more actions are expected in the same reconciliation
+ * - requeue : describes whether the resource needs to be reconciled again and, if so,
+ * after what delay
+ */
+@Data
+public class ReconcileProgress {
+ private boolean completed;
+    private boolean requeue;
+ private Duration requeueAfterDuration;
+
+ private ReconcileProgress(boolean completed, boolean requeue, Duration requeueAfterDuration) {
+ this.completed = completed;
+ this.requeue = requeue;
+ this.requeueAfterDuration = requeueAfterDuration;
+ }
+
+ public static ReconcileProgress proceed() {
+ return new ReconcileProgress(false, true,
+ Duration.ofSeconds(SparkAppReconcileIntervalSeconds.getValue()));
+ }
+
+ public static ReconcileProgress completeAndDefaultRequeue() {
+ return new ReconcileProgress(true, true,
+ Duration.ofSeconds(SparkAppReconcileIntervalSeconds.getValue()));
+ }
+
+ public static ReconcileProgress completeAndRequeueAfter(Duration requeueAfterDuration) {
+ return new ReconcileProgress(true, true, requeueAfterDuration);
+ }
+
+ public static ReconcileProgress completeAndImmediateRequeue() {
+ return new ReconcileProgress(true, true, Duration.ZERO);
+ }
+
+ public static ReconcileProgress completeAndNoRequeue() {
+ return new ReconcileProgress(true, false, Duration.ZERO);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconcileUtils.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconcileUtils.java
new file mode 100644
index 00000000..5f582145
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconcileUtils.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.time.Instant;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+
+import io.fabric8.kubernetes.api.model.PodTemplateSpec;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.SparkAppResourceSpec;
+import org.apache.spark.kubernetes.operator.SparkAppSubmissionWorker;
+import org.apache.spark.kubernetes.operator.SparkApplication;
+import org.apache.spark.kubernetes.operator.decorators.DriverDecorator;
+import org.apache.spark.kubernetes.operator.utils.ModelUtils;
+
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.DRIVER_SPARK_TEMPLATE_FILE_PROP_KEY;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.EXECUTOR_SPARK_TEMPLATE_FILE_PROP_KEY;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.overrideDriverTemplate;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.overrideExecutorTemplate;
+
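+/**
+ * Utilities for reconciling SparkApplication resources: building the app resource spec,
+ * overriding Spark conf with operator-managed labels, and handling the temporary pod template
+ * files referenced from the Spark conf.
+ */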
+@Slf4j
+public class SparkAppReconcileUtils {
+ public static boolean enableForceDelete(SparkApplication app) {
+ long timeoutThreshold = app.getSpec().getApplicationTolerations()
+ .getApplicationTimeoutConfig().getForceTerminationGracePeriodMillis();
+ Instant lastTransitionTime =
+ Instant.parse(app.getStatus().getCurrentState().getLastTransitionTime());
+ return lastTransitionTime.plusMillis(timeoutThreshold).isBefore(Instant.now());
+ }
+
+ public static SparkAppResourceSpec buildResourceSpec(final SparkApplication app,
+ final KubernetesClient client,
+ final SparkAppSubmissionWorker worker) {
+        Map<String, String> confOverrides = overrideDependencyConf(app);
+ SparkAppResourceSpec resourceSpec = worker.getResourceSpec(app, client, confOverrides);
+ cleanUpTempResourcesForApp(app, confOverrides);
+ DriverDecorator decorator = new DriverDecorator(app);
+ decorator.decorate(resourceSpec.getConfiguredPod());
+ return resourceSpec;
+ }
+
+    private static Map<String, String> overrideDependencyConf(final SparkApplication app) {
+        Map<String, String> confOverrides = new HashMap<>();
+ SparkReconcilerUtils.sparkAppResourceLabels(app).forEach((k, v) -> {
+ confOverrides.put("spark.kubernetes.driver.label." + k, v);
+ confOverrides.put("spark.kubernetes.driver.service.label." + k, v);
+ confOverrides.put("spark.kubernetes.executor.label." + k, v);
+ });
+ confOverrides.put("spark.kubernetes.namespace", app.getMetadata().getNamespace());
+ if (app.getSpec().getSparkConf().containsKey("spark.app.name")) {
+ confOverrides.put("spark.app.name", app.getMetadata().getName());
+ }
+ // FIXME: avoid this file flushing
+ confOverrides.putAll(getOrCreateLocalFileForDriverSpec(app, confOverrides));
+ confOverrides.putAll(getOrCreateLocalFileForExecutorSpec(app, confOverrides));
+ return confOverrides;
+ }
+
+ private static void cleanUpTempResourcesForApp(final SparkApplication app,
+                                                   Map<String, String> confOverrides) {
+ if (overrideDriverTemplate(app.getSpec())) {
+ deleteLocalFileFromPathKey(confOverrides, DRIVER_SPARK_TEMPLATE_FILE_PROP_KEY);
+ }
+ if (overrideExecutorTemplate(app.getSpec())) {
+ deleteLocalFileFromPathKey(confOverrides, EXECUTOR_SPARK_TEMPLATE_FILE_PROP_KEY);
+ }
+ }
+
+    private static Optional<File> getLocalFileFromPathKey(Map<String, String> confOverrides,
+ String pathKey) {
+ if (confOverrides.containsKey(pathKey)) {
+ String filePath = confOverrides.get(pathKey);
+ if (filePath.startsWith("local") || filePath.startsWith("file") ||
+ filePath.startsWith("/")) {
+ return Optional.of(new File(filePath));
+ }
+ }
+ return Optional.empty();
+ }
+
+    private static void deleteLocalFileFromPathKey(Map<String, String> confOverrides,
+ String pathKey) {
+        Optional<File> localFile = Optional.empty();
+ boolean deleted = false;
+ try {
+ localFile = getLocalFileFromPathKey(confOverrides, pathKey);
+ if (localFile.isPresent() && localFile.get().exists() && localFile.get().isFile()) {
+ deleted = localFile.get().delete();
+ } else {
+ log.warn("Local temp file not found at {}", pathKey);
+ }
+ } catch (Throwable t) {
+ log.error("Failed to delete temp file. Attempting delete upon exit.", t);
+ } finally {
+ if (!deleted && localFile.isPresent() && localFile.get().exists()) {
+ localFile.get().deleteOnExit();
+ }
+ }
+ }
+
+    private static Map<String, String> getOrCreateLocalFileForDriverSpec(
+            final SparkApplication app,
+            final Map<String, String> confOverrides) {
+        if (overrideDriverTemplate(app.getSpec())) {
+            Optional<File> localFile =
+ getLocalFileFromPathKey(confOverrides, DRIVER_SPARK_TEMPLATE_FILE_PROP_KEY);
+ if (localFile.isEmpty() || !localFile.get().exists() || !localFile.get().isFile()) {
+ String filePath = createLocalFileForPodTemplateSpec(
+ app.getSpec().getDriverSpec().getPodTemplateSpec(),
+ app.getMetadata().getUid() + "-driver-");
+ return Collections.singletonMap(DRIVER_SPARK_TEMPLATE_FILE_PROP_KEY, filePath);
+ }
+ }
+ return Collections.emptyMap();
+ }
+
+    private static Map<String, String> getOrCreateLocalFileForExecutorSpec(
+            final SparkApplication app,
+            final Map<String, String> confOverrides) {
+        if (overrideExecutorTemplate(app.getSpec())) {
+            Optional<File> localFile =
+ getLocalFileFromPathKey(confOverrides, EXECUTOR_SPARK_TEMPLATE_FILE_PROP_KEY);
+ if (localFile.isEmpty() || !localFile.get().exists() || !localFile.get().isFile()) {
+ String filePath = createLocalFileForPodTemplateSpec(
+ app.getSpec().getExecutorSpec().getPodTemplateSpec(),
+ app.getMetadata().getUid() + "-executor-");
+ return Collections.singletonMap(EXECUTOR_SPARK_TEMPLATE_FILE_PROP_KEY, filePath);
+ }
+ }
+ return Collections.emptyMap();
+ }
+
+ /**
+     * Flush a pod template spec to a local temp file
+ *
+ * @return temp file path
+ */
+ private static String createLocalFileForPodTemplateSpec(final PodTemplateSpec podTemplateSpec,
+ final String tempFilePrefix) {
+ try {
+ File tmpFile = File.createTempFile(tempFilePrefix, ".json");
+            // use try-with-resources so the writer and stream are closed even if the write fails
+            try (OutputStreamWriter writer =
+                         new OutputStreamWriter(new FileOutputStream(tmpFile), "UTF-8")) {
+                writer.write(ModelUtils.asJsonString(
+                        ModelUtils.getPodFromTemplateSpec(podTemplateSpec)));
+            }
+            String path = tmpFile.getAbsolutePath();
+            if (log.isDebugEnabled()) {
+                log.debug("Temp file written to {}", tmpFile.getAbsolutePath());
+ }
+ return path;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconciler.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconciler.java
new file mode 100644
index 00000000..eb2cf2d5
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkAppReconciler.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import io.fabric8.kubernetes.api.model.Pod;
+import io.javaoperatorsdk.operator.api.config.informer.InformerConfiguration;
+import io.javaoperatorsdk.operator.api.reconciler.Cleaner;
+import io.javaoperatorsdk.operator.api.reconciler.Context;
+import io.javaoperatorsdk.operator.api.reconciler.ControllerConfiguration;
+import io.javaoperatorsdk.operator.api.reconciler.DeleteControl;
+import io.javaoperatorsdk.operator.api.reconciler.ErrorStatusHandler;
+import io.javaoperatorsdk.operator.api.reconciler.ErrorStatusUpdateControl;
+import io.javaoperatorsdk.operator.api.reconciler.EventSourceContext;
+import io.javaoperatorsdk.operator.api.reconciler.EventSourceInitializer;
+import io.javaoperatorsdk.operator.api.reconciler.Reconciler;
+import io.javaoperatorsdk.operator.api.reconciler.UpdateControl;
+import io.javaoperatorsdk.operator.processing.event.source.EventSource;
+import io.javaoperatorsdk.operator.processing.event.source.informer.InformerEventSource;
+import io.javaoperatorsdk.operator.processing.event.source.informer.Mappers;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.SparkAppSubmissionWorker;
+import org.apache.spark.kubernetes.operator.Constants;
+import org.apache.spark.kubernetes.operator.SparkApplication;
+import org.apache.spark.kubernetes.operator.controller.SparkAppContext;
+import org.apache.spark.kubernetes.operator.health.SentinelManager;
+import org.apache.spark.kubernetes.operator.reconciler.observers.AppDriverReadyObserver;
+import org.apache.spark.kubernetes.operator.reconciler.observers.AppDriverRunningObserver;
+import org.apache.spark.kubernetes.operator.reconciler.observers.AppDriverStartObserver;
+import org.apache.spark.kubernetes.operator.reconciler.observers.AppDriverTimeoutObserver;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppCleanUpStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppInitStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppReconcileStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppResourceObserveStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppRunningStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppTerminatedStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.AppValidateStep;
+import org.apache.spark.kubernetes.operator.reconciler.reconcilesteps.UnknownStateStep;
+import org.apache.spark.kubernetes.operator.utils.SparkAppStatusUtils;
+import org.apache.spark.kubernetes.operator.utils.LoggingUtils;
+import org.apache.spark.kubernetes.operator.utils.SparkAppStatusRecorder;
+
+import static org.apache.spark.kubernetes.operator.reconciler.ReconcileProgress.completeAndDefaultRequeue;
+import static org.apache.spark.kubernetes.operator.reconciler.SparkReconcilerUtils.commonResourceLabelsStr;
+
+/**
+ * Reconciler for Spark Application.
+ * Performs sanity check on the app, identify the reconcile steps based on App status
+ * and execute the steps.
+ */
+@ControllerConfiguration
+@Slf4j
+@RequiredArgsConstructor
+public class SparkAppReconciler
+        implements Reconciler<SparkApplication>,
+        ErrorStatusHandler<SparkApplication>,
+        EventSourceInitializer<SparkApplication>,
+        Cleaner<SparkApplication> {
+ private final SparkAppSubmissionWorker submissionWorker;
+ private final SparkAppStatusRecorder sparkAppStatusRecorder;
+    private final SentinelManager<SparkApplication> sentinelManager;
+
+ @Override
+    public UpdateControl<SparkApplication> reconcile(SparkApplication sparkApplication,
+                                                     Context<SparkApplication> context)
+ throws Exception {
+ LoggingUtils.TrackedMDC trackedMDC = new LoggingUtils.TrackedMDC();
+ try {
+ trackedMDC.set(sparkApplication);
+ if (sentinelManager.handleSentinelResourceReconciliation(sparkApplication,
+ context.getClient())) {
+ return UpdateControl.noUpdate();
+ }
+ log.debug("Start reconciliation.");
+ sparkAppStatusRecorder.updateStatusFromCache(sparkApplication);
+ SparkAppContext ctx = new SparkAppContext(sparkApplication, context,
+ submissionWorker);
+            List<AppReconcileStep> reconcileSteps = getReconcileSteps(sparkApplication);
+ for (AppReconcileStep step : reconcileSteps) {
+ ReconcileProgress progress = step.reconcile(ctx, sparkAppStatusRecorder);
+ if (progress.isCompleted()) {
+ return SparkReconcilerUtils.toUpdateControl(sparkApplication, progress);
+ }
+ }
+ return SparkReconcilerUtils.toUpdateControl(sparkApplication,
+ completeAndDefaultRequeue());
+
+ } finally {
+ log.debug("Reconciliation completed.");
+ trackedMDC.reset();
+ }
+ }
+
+ @Override
+    public ErrorStatusUpdateControl<SparkApplication> updateErrorStatus(
+ SparkApplication sparkApplication,
+            Context<SparkApplication> context,
+ Exception e) {
+ LoggingUtils.TrackedMDC trackedMDC = new LoggingUtils.TrackedMDC();
+ try {
+ trackedMDC.set(sparkApplication);
+ context.getRetryInfo().ifPresent(retryInfo -> {
+ if (log.isErrorEnabled()) {
+ log.error("Failed attempt: {}, last attempt: {}", retryInfo.getAttemptCount(),
+ retryInfo.isLastAttempt());
+ }
+ });
+ return ErrorStatusUpdateControl.noStatusUpdate();
+ } finally {
+ trackedMDC.reset();
+ }
+ }
+
+ @Override
+    public Map<String, EventSource> prepareEventSources(
+            EventSourceContext<SparkApplication> context) {
+ var podEventSource =
+ new InformerEventSource<>(InformerConfiguration.from(Pod.class, context)
+ .withSecondaryToPrimaryMapper(
+ Mappers.fromLabel(Constants.LABEL_SPARK_APPLICATION_NAME))
+ .withLabelSelector(commonResourceLabelsStr())
+ .build(), context);
+ return EventSourceInitializer.nameEventSources(podEventSource);
+ }
+
+    protected List<AppReconcileStep> getReconcileSteps(final SparkApplication app) {
+        List<AppReconcileStep> steps = new ArrayList<>();
+ steps.add(new AppValidateStep());
+ steps.add(new AppTerminatedStep());
+ switch (app.getStatus().getCurrentState().getCurrentStateSummary()) {
+ case SUBMITTED:
+ case SCHEDULED_TO_RESTART:
+ steps.add(new AppInitStep());
+ break;
+ case DRIVER_REQUESTED:
+ case DRIVER_STARTED:
+ steps.add(new AppResourceObserveStep(
+ List.of(new AppDriverStartObserver(), new AppDriverReadyObserver())));
+ steps.add(new AppResourceObserveStep(
+ Collections.singletonList(new AppDriverRunningObserver())));
+ steps.add(new AppResourceObserveStep(
+ Collections.singletonList(new AppDriverTimeoutObserver())));
+ break;
+ case DRIVER_READY:
+ case INITIALIZED_BELOW_THRESHOLD_EXECUTORS:
+ case RUNNING_HEALTHY:
+ case RUNNING_WITH_BELOW_THRESHOLD_EXECUTORS:
+ steps.add(new AppRunningStep());
+ steps.add(new AppResourceObserveStep(
+ Collections.singletonList(new AppDriverRunningObserver())));
+ steps.add(new AppResourceObserveStep(
+ Collections.singletonList(new AppDriverTimeoutObserver())));
+ break;
+ case SPARK_SESSION_INITIALIZATION_TIMED_OUT:
+ case DRIVER_LAUNCH_TIMED_OUT:
+ case EXECUTORS_LAUNCH_TIMED_OUT:
+ case SUCCEEDED:
+ case DRIVER_EVICTED:
+ case FAILED:
+ case SCHEDULING_FAILURE:
+ steps.add(new AppCleanUpStep());
+ break;
+ default:
+ steps.add(new UnknownStateStep());
+ break;
+ }
+ return steps;
+ }
+
+ /**
+ * Best-effort graceful termination upon delete.
+ *
+ * @param sparkApplication the resource that is marked for deletion
+ * @param context the context with which the operation is executed
+ * @return DeleteControl, with requeue if needed
+ */
+ @Override
+    public DeleteControl cleanup(SparkApplication sparkApplication,
+                                 Context<SparkApplication> context) {
+ LoggingUtils.TrackedMDC trackedMDC = new LoggingUtils.TrackedMDC();
+ DeleteControl deleteControl = DeleteControl.defaultDelete();
+ try {
+ trackedMDC.set(sparkApplication);
+ log.info("Cleaning up resources for SparkApp.");
+ SparkAppContext ctx = new SparkAppContext(sparkApplication, context,
+ submissionWorker);
+            List<AppReconcileStep> cleanupSteps = new ArrayList<>();
+ cleanupSteps.add(new AppValidateStep());
+ cleanupSteps.add(new AppTerminatedStep());
+ cleanupSteps.add(new AppCleanUpStep(SparkAppStatusUtils::appCancelled));
+ for (AppReconcileStep step : cleanupSteps) {
+ ReconcileProgress progress = step.reconcile(ctx, sparkAppStatusRecorder);
+ if (progress.isCompleted()) {
+ if (progress.isRequeue()) {
+ return DeleteControl.noFinalizerRemoval().rescheduleAfter(
+ progress.getRequeueAfterDuration());
+ } else {
+ break;
+ }
+ }
+ }
+ } finally {
+ log.info("Cleanup completed");
+ trackedMDC.reset();
+ }
+ sparkAppStatusRecorder.removeCachedStatus(sparkApplication);
+ return deleteControl;
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkReconcilerUtils.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkReconcilerUtils.java
new file mode 100644
index 00000000..3233a375
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/SparkReconcilerUtils.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.fabric8.kubernetes.api.model.DeletionPropagation;
+import io.fabric8.kubernetes.api.model.HasMetadata;
+import io.fabric8.kubernetes.api.model.ObjectMeta;
+import io.fabric8.kubernetes.api.model.ObjectMetaBuilder;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import io.fabric8.kubernetes.client.KubernetesClientException;
+import io.javaoperatorsdk.operator.api.reconciler.DeleteControl;
+import io.javaoperatorsdk.operator.api.reconciler.UpdateControl;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.spark.kubernetes.operator.BaseResource;
+import org.apache.spark.kubernetes.operator.Constants;
+import org.apache.spark.kubernetes.operator.SparkApplication;
+
+import static org.apache.spark.kubernetes.operator.Constants.LABEL_RESOURCE_NAME;
+import static org.apache.spark.kubernetes.operator.Constants.LABEL_SPARK_OPERATOR_NAME;
+import static org.apache.spark.kubernetes.operator.Constants.LABEL_SPARK_ROLE_DRIVER_VALUE;
+import static org.apache.spark.kubernetes.operator.Constants.LABEL_SPARK_ROLE_EXECUTOR_VALUE;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.DriverCreateMaxAttempts;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.ForegroundRequestTimeoutSeconds;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorAppName;
+import static org.apache.spark.kubernetes.operator.config.SparkOperatorConf.OperatorWatchedNamespaces;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.buildOwnerReferenceTo;
+import static org.apache.spark.kubernetes.operator.utils.SparkExceptionUtils.isConflictForExistingResource;
+
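+/**
+ * Shared reconciler utilities: common labels for operator-managed resources, watched namespace
+ * resolution, and helpers for creating, fetching and deleting secondary Kubernetes resources
+ * with retry and owner-reference handling.
+ */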
+@Slf4j
+public class SparkReconcilerUtils {
+
+ private static final ObjectMapper objectMapper = new ObjectMapper();
+
+    public static Map<String, String> commonOperatorResourceLabels() {
+        Map<String, String> labels = new HashMap<>();
+ labels.put(LABEL_RESOURCE_NAME, OperatorAppName.getValue());
+ return labels;
+ }
+
+    public static Map<String, String> defaultOperatorConfigLabels() {
+        Map<String, String> labels = new HashMap<>(commonOperatorResourceLabels());
+ labels.put("app.kubernetes.io/component", "operator-dynamic-config-overrides");
+ return labels;
+ }
+
+    public static Map<String, String> commonManagedResourceLabels() {
+        Map<String, String> labels = new HashMap<>();
+ labels.put(LABEL_SPARK_OPERATOR_NAME, OperatorAppName.getValue());
+ return labels;
+ }
+
+    public static Map<String, String> sparkAppResourceLabels(final SparkApplication app) {
+ return sparkAppResourceLabels(app.getMetadata().getName());
+ }
+
+    public static Map<String, String> sparkAppResourceLabels(final String appName) {
+        Map<String, String> labels = commonManagedResourceLabels();
+ labels.put(Constants.LABEL_SPARK_APPLICATION_NAME, appName);
+ return labels;
+ }
+
+    public static Map<String, String> driverLabels(final SparkApplication sparkApplication) {
+        Map<String, String> labels = sparkAppResourceLabels(sparkApplication);
+ labels.put(Constants.LABEL_SPARK_ROLE_NAME, LABEL_SPARK_ROLE_DRIVER_VALUE);
+ return labels;
+ }
+
+    public static Map<String, String> executorLabels(final SparkApplication sparkApplication) {
+        Map<String, String> labels = sparkAppResourceLabels(sparkApplication);
+ labels.put(Constants.LABEL_SPARK_ROLE_NAME, LABEL_SPARK_ROLE_EXECUTOR_VALUE);
+ return labels;
+ }
+
+    public static Set<String> getWatchedNamespaces() {
+ String namespaces = OperatorWatchedNamespaces.getValue();
+ if (StringUtils.isNotEmpty(namespaces)) {
+ return Arrays.stream(namespaces.split(",")).map(String::trim)
+ .collect(Collectors.toSet());
+ }
+ return Collections.emptySet();
+ }
+
+ /**
+ * Labels to be applied to all created resources, as a comma-separated string
+ *
+ * @return labels string
+ */
+ public static String commonResourceLabelsStr() {
+ return labelsAsStr(commonManagedResourceLabels());
+ }
+
+    public static String labelsAsStr(Map<String, String> labels) {
+ return labels
+ .entrySet()
+ .stream()
+ .map(e -> String.join("=", e.getKey(), e.getValue()))
+ .collect(Collectors.joining(","));
+ }
+
+    public static <O extends BaseResource<?, ?, ?, ?, ?>> UpdateControl<O> toUpdateControl(
+ O resource, ReconcileProgress reconcileProgress) {
+ // reconciler already handled resource and status update, skip update at lower level
+        UpdateControl<O> updateControl = UpdateControl.noUpdate();
+ if (reconcileProgress.isRequeue()) {
+ return updateControl.rescheduleAfter(reconcileProgress.getRequeueAfterDuration());
+ } else {
+ return updateControl;
+ }
+ }
+
+    public static <O extends BaseResource<?, ?, ?, ?, ?>> DeleteControl toDeleteControl(
+ O resource, ReconcileProgress reconcileProgress) {
+ if (reconcileProgress.isRequeue()) {
+ return DeleteControl.noFinalizerRemoval().rescheduleAfter(
+ reconcileProgress.getRequeueAfterDuration());
+ } else {
+ return DeleteControl.defaultDelete();
+ }
+ }
+
+    public static <T extends HasMetadata> Optional<T> getOrCreateSecondaryResource(
+ final KubernetesClient client,
+ final T resource) {
+        Optional<T> current = getResource(client, resource);
+ if (current.isEmpty()) {
+ // Adding retry logic to overcome a k8s bug:
+ // https://github.com/kubernetes/kubernetes/issues/67761
+ long maxAttempts = DriverCreateMaxAttempts.getValue();
+ long attemptCount = 1;
+ while (true) {
+ try {
+ current = Optional.ofNullable(client.resource(resource).create());
+ break;
+ } catch (KubernetesClientException e) {
+ if (log.isErrorEnabled()) {
+ log.error(
+ "Failed to request resource with responseCode={} " +
+ "attemptCount={}/{}",
+ e.getCode(), attemptCount, maxAttempts);
+ }
+ // retry only on 409 Conflict
+ if (e.getCode() != 409) {
+ throw e;
+ } else {
+ if (isConflictForExistingResource(e)) {
+ current = getResource(client, resource);
+ if (current.isPresent()) {
+ return current;
+ }
+ }
+ if (++attemptCount > maxAttempts) {
+ log.error("Max Retries exceeded while trying to create resource");
+ throw e;
+ }
+ }
+ }
+ }
+ }
+ return current;
+ }
+
+    public static void addOwnerReferenceSecondaryResource(final KubernetesClient client,
+                                                           final List<HasMetadata> resources,
+                                                           final HasMetadata owner) {
+
+ resources.forEach(r -> {
+ ObjectMeta metaData = new ObjectMetaBuilder(r.getMetadata())
+ .addToOwnerReferences(buildOwnerReferenceTo(owner))
+ .build();
+ r.setMetadata(metaData);
+ });
+ client.resourceList(resources).forceConflicts().serverSideApply();
+ }
+
+    public static <T extends HasMetadata> Optional<T> getResource(
+            final KubernetesClient client, final T desired) {
+ T resource = null;
+ try {
+ resource = client.resource(desired).get();
+ } catch (KubernetesClientException e) {
+ if (e.getCode() == 404) {
+ return Optional.empty();
+ }
+ }
+ return Optional.ofNullable(resource);
+ }
+
+    public static <T extends HasMetadata> void deleteResourceIfExists(
+            final KubernetesClient client, final T resource, boolean forceDelete) {
+ try {
+ if (forceDelete) {
+ client.resource(resource)
+ .withGracePeriod(0L)
+ .delete();
+ } else {
+ client.resource(resource)
+ .withPropagationPolicy(DeletionPropagation.FOREGROUND)
+ .withTimeout(ForegroundRequestTimeoutSeconds.getValue(), TimeUnit.SECONDS)
+ .delete();
+ }
+ } catch (KubernetesClientException e) {
+ if (e.getCode() != 404) {
+ throw e;
+ } else {
+ log.info("Pod to delete does not exist, proceeding...");
+ }
+ }
+ }
+
+    public static <T> T clone(T object) {
+ if (object == null) {
+ return null;
+ }
+ try {
+ return (T)
+ objectMapper.readValue(
+ objectMapper.writeValueAsString(object), object.getClass());
+ } catch (JsonProcessingException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverReadyObserver.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverReadyObserver.java
new file mode 100644
index 00000000..ea123457
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverReadyObserver.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler.observers;
+
+import java.util.Optional;
+
+import io.fabric8.kubernetes.api.model.Pod;
+
+import org.apache.spark.kubernetes.operator.Constants;
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+import org.apache.spark.kubernetes.operator.utils.PodUtils;
+
+/**
+ * Observes whether driver is ready
+ */
+public class AppDriverReadyObserver extends BaseAppDriverObserver {
+ @Override
+    public Optional<ApplicationState> observe(Pod driver,
+ ApplicationSpec spec,
+ ApplicationStatus currentStatus) {
+ if (ApplicationStateSummary.DRIVER_READY.ordinal()
+ <= currentStatus.getCurrentState().getCurrentStateSummary().ordinal()) {
+ return Optional.empty();
+ }
+ if (PodUtils.isPodReady(driver)) {
+ return Optional.of(new ApplicationState(ApplicationStateSummary.DRIVER_READY,
+ Constants.DriverReady));
+ }
+ return observeDriverTermination(driver, true, spec);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverRunningObserver.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverRunningObserver.java
new file mode 100644
index 00000000..8cba5854
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverRunningObserver.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler.observers;
+
+import java.util.Optional;
+
+import io.fabric8.kubernetes.api.model.Pod;
+
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+
+/**
+ * Observes whether the driver reaches running state (in other words, whether it's at least scheduled)
+ */
+public class AppDriverRunningObserver extends BaseAppDriverObserver {
+ @Override
+    public Optional<ApplicationState> observe(Pod driver,
+ ApplicationSpec spec,
+ ApplicationStatus currentStatus) {
+ return observeDriverTermination(driver, true, spec);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverStartObserver.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverStartObserver.java
new file mode 100644
index 00000000..567d0a1c
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverStartObserver.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler.observers;
+
+import java.util.Optional;
+
+import io.fabric8.kubernetes.api.model.Pod;
+
+import org.apache.spark.kubernetes.operator.Constants;
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+import org.apache.spark.kubernetes.operator.utils.PodUtils;
+
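+/**
+ * Observes whether the driver pod has started
+ */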
+public class AppDriverStartObserver extends BaseAppDriverObserver {
+ @Override
+    public Optional<ApplicationState> observe(Pod driver,
+ ApplicationSpec spec,
+ ApplicationStatus currentStatus) {
+ if (ApplicationStateSummary.DRIVER_STARTED.ordinal()
+ <= currentStatus.getCurrentState().getCurrentStateSummary().ordinal()) {
+ return Optional.empty();
+ }
+ if (PodUtils.isPodStarted(driver, spec)) {
+ return Optional.of(new ApplicationState(ApplicationStateSummary.DRIVER_STARTED,
+ Constants.DriverRunning));
+ }
+ return observeDriverTermination(driver, false, spec);
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverTimeoutObserver.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverTimeoutObserver.java
new file mode 100644
index 00000000..a6e61426
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/AppDriverTimeoutObserver.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler.observers;
+
+import java.time.Instant;
+import java.util.Optional;
+import java.util.function.Supplier;
+
+import io.fabric8.kubernetes.api.model.Pod;
+
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.spec.ApplicationTimeoutConfig;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+import org.apache.spark.kubernetes.operator.utils.SparkAppStatusUtils;
+
+/**
+ * Observes driver status and time-out as configured in app spec
+ */
+public class AppDriverTimeoutObserver extends BaseAppDriverObserver {
+
+ /**
+     * The operator may proactively terminate an application if it has stayed in a certain state
+     * for a while. This helps to avoid resource deadlock when the app cannot proceed.
+ * Such states include
+ * - DRIVER_REQUESTED -> goes to DRIVER_LAUNCH_TIMED_OUT if driver pod cannot be scheduled or
+ * cannot start running
+ * - DRIVER_STARTED -> goes to SPARK_SESSION_INITIALIZATION_TIMED_OUT if Spark session cannot
+ * be initialized
+ * - DRIVER_READY / EXECUTOR_REQUESTED / EXECUTOR_SCHEDULED /
+ * INITIALIZED_BELOW_THRESHOLD_EXECUTORS
+     * -> go to EXECUTORS_LAUNCH_TIMED_OUT if the app cannot acquire at least the minimal number
+     * of executors in the given time
+     * The operator will NOT proactively stop the app if it has acquired enough executors and
+     * later loses them. Users may build additional layers to alert and act on such a scenario.
+     * The timeout check is performed at the end of reconcile, and only if there are no other
+     * updates to be performed in the same reconcile action.
+ */
+ @Override
+    public Optional<ApplicationState> observe(Pod driver,
+ ApplicationSpec spec,
+ ApplicationStatus currentStatus) {
+ Instant lastTransitionTime =
+ Instant.parse(currentStatus.getCurrentState().getLastTransitionTime());
+ long timeoutThreshold;
+        Supplier<ApplicationState> supplier;
+ ApplicationTimeoutConfig timeoutConfig =
+ spec.getApplicationTolerations().getApplicationTimeoutConfig();
+ switch (currentStatus.getCurrentState().getCurrentStateSummary()) {
+ case DRIVER_REQUESTED:
+ timeoutThreshold = timeoutConfig.getDriverStartTimeoutMillis();
+ supplier = SparkAppStatusUtils::driverLaunchTimedOut;
+ break;
+ case DRIVER_STARTED:
+ timeoutThreshold = timeoutConfig.getSparkSessionStartTimeoutMillis();
+ supplier = SparkAppStatusUtils::driverReadyTimedOut;
+ break;
+ case DRIVER_READY:
+ case INITIALIZED_BELOW_THRESHOLD_EXECUTORS:
+ timeoutThreshold = timeoutConfig.getExecutorStartTimeoutMillis();
+ supplier = SparkAppStatusUtils::executorLaunchTimedOut;
+ break;
+ default:
+ // No timeout check needed for other states
+ return Optional.empty();
+ }
+ if (timeoutThreshold > 0L &&
+ lastTransitionTime.plusMillis(timeoutThreshold).isBefore(Instant.now())) {
+ ApplicationState state = supplier.get();
+ state.setLastObservedDriverStatus(driver.getStatus());
+ return Optional.of(state);
+ }
+ return Optional.empty();
+ }
+}
diff --git a/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/BaseAppDriverObserver.java b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/BaseAppDriverObserver.java
new file mode 100644
index 00000000..9d04e5c8
--- /dev/null
+++ b/spark-operator/src/main/java/org/apache/spark/kubernetes/operator/reconciler/observers/BaseAppDriverObserver.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package org.apache.spark.kubernetes.operator.reconciler.observers;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+import io.fabric8.kubernetes.api.model.ContainerStatus;
+import io.fabric8.kubernetes.api.model.Pod;
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.spark.kubernetes.operator.spec.ApplicationSpec;
+import org.apache.spark.kubernetes.operator.status.ApplicationAttemptSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationState;
+import org.apache.spark.kubernetes.operator.status.ApplicationStateSummary;
+import org.apache.spark.kubernetes.operator.status.ApplicationStatus;
+import org.apache.spark.kubernetes.operator.utils.PodPhase;
+import org.apache.spark.kubernetes.operator.utils.PodUtils;
+
+import static org.apache.spark.kubernetes.operator.Constants.DriverCompletedMessage;
+import static org.apache.spark.kubernetes.operator.Constants.DriverFailedInitContainersMessage;
+import static org.apache.spark.kubernetes.operator.Constants.DriverFailedMessage;
+import static org.apache.spark.kubernetes.operator.Constants.DriverRestartedMessage;
+import static org.apache.spark.kubernetes.operator.Constants.DriverSucceededMessage;
+import static org.apache.spark.kubernetes.operator.Constants.DriverTerminatedBeforeInitializationMessage;
+import static org.apache.spark.kubernetes.operator.utils.ModelUtils.isDriverMainContainer;
+
+/**
+ * Observes driver pod status and update Application status as needed
+ */
+@Slf4j
+public abstract class BaseAppDriverObserver extends
+ BaseSecondaryResourceObserver