From f1e5b2f1084ae7c36066468b78806200c0a83560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Fri, 15 Feb 2019 12:46:52 +0100 Subject: [PATCH 1/6] Added Pod Anti Affinity --- pkg/cluster/k8sres.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index cbc5f8bd7..147a2600b 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -290,6 +290,26 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } } +func generatePodAffinity(team string, version string) *v1.Affinity { + // generate pod anti affinity to avoid multiple on instances on the same node + matchLabels := make(map[string]string) + + matchLabels["application"] = "spilo" + matchLabels["team"] = team + matchLabels["version"] = version + + return &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: matchLabels, + }, + TopologyKey: "kubernetes.io/hostname", + }}, + }, + } +} + func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { // allow to override tolerations by postgresql manifest if len(*tolerationsSpec) > 0 { @@ -437,9 +457,7 @@ func generatePodTemplate( addShmVolume(&podSpec) } - if nodeAffinity != nil { - podSpec.Affinity = nodeAffinity - } + podSpec.Affinity = generatePodAffinity(labels.Get("team"), labels.Get("version")) if priorityClassName != "" { podSpec.PriorityClassName = priorityClassName From d5481e8ab05061dbd2d9039fdce4d9103082260b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Fri, 15 Feb 2019 15:58:43 +0100 Subject: [PATCH 2/6] Add Pod Anti Affinity * use `cluster_labels` for pod anti affinity * use `node_readiness_label` when defined * added property `enable_pod_antiaffinity` to operator config * added property `pod_antiaffinity_topology_key` --- 
pkg/cluster/k8sres.go | 32 ++++++++++++++++++++------------ pkg/util/config/config.go | 2 ++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 147a2600b..444da2fc3 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -290,24 +290,24 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } } -func generatePodAffinity(team string, version string) *v1.Affinity { +func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity { // generate pod anti affinity to avoid multiple on instances on the same node - matchLabels := make(map[string]string) - - matchLabels["application"] = "spilo" - matchLabels["team"] = team - matchLabels["version"] = version - - return &v1.Affinity{ + podAffinity := v1.Affinity{ PodAntiAffinity: &v1.PodAntiAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ LabelSelector: &metav1.LabelSelector{ - MatchLabels: matchLabels, + MatchLabels: labels, }, - TopologyKey: "kubernetes.io/hostname", + TopologyKey: topologyKey, }}, }, } + + if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil { + podAffinity.NodeAffinity = nodeAffinity.NodeAffinity + } + + return &podAffinity } func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { @@ -439,6 +439,8 @@ func generatePodTemplate( kubeIAMRole string, priorityClassName string, shmVolume bool, + podAntiAffinity bool, + podAntiAffinityTopologyKey string, ) (*v1.PodTemplateSpec, error) { terminateGracePeriodSeconds := terminateGracePeriod @@ -457,7 +459,11 @@ func generatePodTemplate( addShmVolume(&podSpec) } - podSpec.Affinity = generatePodAffinity(labels.Get("team"), labels.Get("version")) + if podAntiAffinity { + podSpec.Affinity = generatePodAffinity(labels, podAntiAffinityTopologyKey, nodeAffinity) + } else if nodeAffinity != nil { + podSpec.Affinity = nodeAffinity + } if priorityClassName != "" 
{ podSpec.PriorityClassName = priorityClassName @@ -831,7 +837,9 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State c.OpConfig.PodServiceAccountName, c.OpConfig.KubeIAMRole, effectivePodPriorityClassName, - mountShmVolumeNeeded(c.OpConfig, spec)); err != nil { + mountShmVolumeNeeded(c.OpConfig, spec), + c.OpConfig.EnablePodAntiAffinity, + c.OpConfig.PodAntiAffinityTopologyKey); err != nil { return nil, fmt.Errorf("could not generate pod template: %v", err) } diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 31cda4b98..f6199a2da 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -94,6 +94,8 @@ type Config struct { EnableMasterLoadBalancer bool `name:"enable_master_load_balancer" default:"true"` EnableReplicaLoadBalancer bool `name:"enable_replica_load_balancer" default:"false"` CustomServiceAnnotations map[string]string `name:"custom_service_annotations"` + EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` // deprecated and kept for backward compatibility EnableLoadBalancer *bool `name:"enable_load_balancer"` MasterDNSNameFormat StringTemplate `name:"master_dns_name_format" default:"{cluster}.{team}.{hostedzone}"` From 4153af3df56681fe1e84272bdaffea83a4fdbc75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Fri, 15 Feb 2019 12:46:52 +0100 Subject: [PATCH 3/6] Added Pod Anti Affinity --- pkg/cluster/k8sres.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index c0ae1648c..cd923897c 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -290,6 +290,26 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } } +func generatePodAffinity(team string, version string) *v1.Affinity { + // generate pod anti affinity to 
avoid multiple on instances on the same node + matchLabels := make(map[string]string) + + matchLabels["application"] = "spilo" + matchLabels["team"] = team + matchLabels["version"] = version + + return &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: matchLabels, + }, + TopologyKey: "kubernetes.io/hostname", + }}, + }, + } +} + func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { // allow to override tolerations by postgresql manifest if len(*tolerationsSpec) > 0 { @@ -437,9 +457,7 @@ func generatePodTemplate( addShmVolume(&podSpec) } - if nodeAffinity != nil { - podSpec.Affinity = nodeAffinity - } + podSpec.Affinity = generatePodAffinity(labels.Get("team"), labels.Get("version")) if priorityClassName != "" { podSpec.PriorityClassName = priorityClassName From 85b3a8ad464a40dd421cd611fe42c270062310fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Fri, 15 Feb 2019 15:58:43 +0100 Subject: [PATCH 4/6] Add Pod Anti Affinity * use `cluster_labels` for pod anti affinity * use `node_readiness_label` when defined * added property `enable_pod_antiaffinity` to operator config * added property `pod_antiaffinity_topology_key` --- pkg/cluster/k8sres.go | 32 ++++++++++++++++++++------------ pkg/util/config/config.go | 2 ++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index cd923897c..4513c10c3 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -290,24 +290,24 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } } -func generatePodAffinity(team string, version string) *v1.Affinity { +func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity { // generate pod anti affinity to avoid multiple on instances on the same node - 
matchLabels := make(map[string]string) - - matchLabels["application"] = "spilo" - matchLabels["team"] = team - matchLabels["version"] = version - - return &v1.Affinity{ + podAffinity := v1.Affinity{ PodAntiAffinity: &v1.PodAntiAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ LabelSelector: &metav1.LabelSelector{ - MatchLabels: matchLabels, + MatchLabels: labels, }, - TopologyKey: "kubernetes.io/hostname", + TopologyKey: topologyKey, }}, }, } + + if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil { + podAffinity.NodeAffinity = nodeAffinity.NodeAffinity + } + + return &podAffinity } func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { @@ -439,6 +439,8 @@ func generatePodTemplate( kubeIAMRole string, priorityClassName string, shmVolume bool, + podAntiAffinity bool, + podAntiAffinityTopologyKey string, ) (*v1.PodTemplateSpec, error) { terminateGracePeriodSeconds := terminateGracePeriod @@ -457,7 +459,11 @@ func generatePodTemplate( addShmVolume(&podSpec) } - podSpec.Affinity = generatePodAffinity(labels.Get("team"), labels.Get("version")) + if podAntiAffinity { + podSpec.Affinity = generatePodAffinity(labels, podAntiAffinityTopologyKey, nodeAffinity) + } else if nodeAffinity != nil { + podSpec.Affinity = nodeAffinity + } if priorityClassName != "" { podSpec.PriorityClassName = priorityClassName @@ -831,7 +837,9 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State c.OpConfig.PodServiceAccountName, c.OpConfig.KubeIAMRole, effectivePodPriorityClassName, - mountShmVolumeNeeded(c.OpConfig, spec)); err != nil { + mountShmVolumeNeeded(c.OpConfig, spec), + c.OpConfig.EnablePodAntiAffinity, + c.OpConfig.PodAntiAffinityTopologyKey); err != nil { return nil, fmt.Errorf("could not generate pod template: %v", err) } diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 371b7cb65..a82f4c17d 100644 --- a/pkg/util/config/config.go +++ 
b/pkg/util/config/config.go @@ -95,6 +95,8 @@ type Config struct { EnableMasterLoadBalancer bool `name:"enable_master_load_balancer" default:"true"` EnableReplicaLoadBalancer bool `name:"enable_replica_load_balancer" default:"false"` CustomServiceAnnotations map[string]string `name:"custom_service_annotations"` + EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` // deprecated and kept for backward compatibility EnableLoadBalancer *bool `name:"enable_load_balancer"` MasterDNSNameFormat StringTemplate `name:"master_dns_name_format" default:"{cluster}.{team}.{hostedzone}"` From 15817b9f0fbd5b5657a03e62a53300c07bc3d493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Thu, 21 Feb 2019 12:45:41 +0100 Subject: [PATCH 5/6] Changed comment and added documentation as requested --- docs/administrator.md | 30 +++++++++++++++++++++++++++ docs/reference/operator_parameters.md | 8 +++++++ pkg/cluster/k8sres.go | 2 +- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/administrator.md b/docs/administrator.md index 8abd31c55..77213f2e7 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -146,6 +146,36 @@ data: ... ``` +### Enable pod anti affinity + +To ensure Postgres pods are running on different topologies, you can use [pod anti affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) +and configure the required topology in the operator ConfigMap. + +Enable pod anti affinity by adding following line to the operator ConfigMap: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + enable_pod_antiaffinity: "true" +``` + +By default the topology key for the pod anti affinity is set to `kubernetes.io/hostname`, +you can set another topology key e.g. 
`failure-domain.beta.kubernetes.io/zone` by adding the following line +to the operator ConfigMap, see [built-in node labels](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) for available topology keys: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + enable_pod_antiaffinity: "true" + pod_antiaffinity_topology_key: "failure-domain.beta.kubernetes.io/zone" +``` + ### Add cluster-specific labels In some cases, you might want to add `labels` that are specific to a given diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 06a779c1e..69d903427 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -213,6 +213,14 @@ configuration they are grouped under the `kubernetes` key. that should be assigned to the Postgres pods. The priority class itself must be defined in advance. Default is empty (use the default priority class). +* **enable_pod_antiaffinity** + toggles [pod anti affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) on the Postgres pods, to avoid multiple pods + of the same Postgres cluster in the same topology, e.g. node. The default is `false`. + +* **pod_antiaffinity_topology_key** + overrides the + [topology key](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) + for pod anti affinity. The default is `kubernetes.io/hostname`. 
## Kubernetes resource requests diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 4513c10c3..9a58f0516 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -291,7 +291,7 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity { - // generate pod anti affinity to avoid multiple on instances on the same node + // generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology , e.g. node podAffinity := v1.Affinity{ PodAntiAffinity: &v1.PodAntiAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ From 986d3abbf27ad0c5eadae2f0465e10d646195250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=96zg=C3=BCr=20Caner?= Date: Thu, 21 Feb 2019 13:08:12 +0100 Subject: [PATCH 6/6] Added pod anti affinity parameters to CRD-based operator configuration --- pkg/apis/acid.zalan.do/v1/operator_configuration_type.go | 2 ++ pkg/controller/operator_config.go | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index 1b6939dfa..99d79b64b 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -60,6 +60,8 @@ type KubernetesMetaConfiguration struct { // TODO: use namespacedname PodEnvironmentConfigMap string `json:"pod_environment_configmap,omitempty"` PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` + EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` } // PostgresPodResourcesDefaults defines the spec of default resources diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 74549cbb8..08df7e97c 100644 
--- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -53,6 +53,9 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName + result.EnablePodAntiAffinity = fromCRD.Kubernetes.EnablePodAntiAffinity; + result.PodAntiAffinityTopologyKey = fromCRD.Kubernetes.PodAntiAffinityTopologyKey; + result.DefaultCPURequest = fromCRD.PostgresPodResources.DefaultCPURequest result.DefaultMemoryRequest = fromCRD.PostgresPodResources.DefaultMemoryRequest result.DefaultCPULimit = fromCRD.PostgresPodResources.DefaultCPULimit