diff --git a/design/improve-csi-snaphot-perf.md b/design/improve-csi-snaphot-perf.md
new file mode 100644
index 0000000000..1f06701c2f
--- /dev/null
+++ b/design/improve-csi-snaphot-perf.md
@@ -0,0 +1,68 @@
+# Proposal to improve performance of CSI snapshotting through Velero
+
+- [Proposal to improve performance of CSI snapshotting through Velero](#proposal-to-improve-performance-of-csi-snapshotting-through-velero)
+
+## Abstract
+Currently Velero uses the CSI plugin to take CSI snapshots. The CSI plugin is modeled as a BackupItemAction (BIA): whenever Velero encounters a PVC, it invokes the CSI plugin's PVCAction BIA. In the plugin's Execute() phase, the CSI plugin waits for a default of 10 minutes for snapshotting to complete. This is a blocking call, so Velero waits for snapshotting to complete before proceeding to the next resource. In case of failures (due to permissions, etc.), Velero can end up waiting 10*N minutes. This tracking cannot simply be made async, because we need to ensure the appearance of the snapshotHandle on the VolumeSnapshotContent before proceeding: pre-hooks run on pods first, then PVCs are snapshotted, and then post-hooks run. Waiting for the actual snapshot is key to ensuring that the post-hooks are not executed before snapshotting is complete.
+Further, the core Velero code waits up to 10 minutes for the CSI snapshot to report readyToUse=true.
+
+## Goals
+- Reduce the time to take CSI snapshots.
+- Preserve the current behaviour of pre and post hooks in the context of PVCs attached to a pod.
+
+## Non Goals
+
+## Considerations
+- Ensure no existing flows break.
+- Ensure thread safety in case of concurrency.
+- Provide control over the maximum concurrency, tweakable by the end user.
+
+## How to Group Snapshots
+- PVCs which are being used by a pod/deployment should be snapshotted together.
+- PVCs which are not being used by any pod can be snapshotted at any time.
+- If no hooks are provided, snapshotting for all PVCs can be done in parallel.
+
+## How to Group Resources
+- This is additional thinking on how to group resources in Velero so that they can be backed up more efficiently - potentially in parallel.
+- Current issues with PVs being backed up through different flows may be solved through related items currently under discussion.
+
+## Approaches
+
+### Invoke the CSI plugin in parallel for a group of PVCs.
+- Invoke the current CSI plugin's Execute() in parallel for a group of PVCs.
+
+## Implementation
+- Current code flow: `backupItem` in backup.go is invoked for each resource -> this further invokes `itembackupper.backupItem` -> `backupItemInternal`.
+- For a Pod -> first pre-hooks are run -> then `executeActions` -> iterate over all BIAs applicable to the Pod -> which invokes the `PodAction`.
+- After all actions are run, `executeActions` gets the additional items to back up (PVCs).
+- For all these PVCs and other additional items, we iterate and call `itembackupper.backupItem`.
+- After all additional items are backed up -> control returns to `backupItemInternal` -> post-hooks are run -> and then `backupItem` returns.
+- The change proposed here: when the additional items are backed up, for PVCs we run `itembackupper.backupItem` asynchronously (a simplified sketch of the concurrency pattern follows this list).
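+
+The sketch below illustrates the intended fan-out/fan-in shape of this change. It is illustrative only: `backupPVC`, `maxConcurrency`, and the channel sizes are stand-ins and not existing Velero code or configuration.
+
+```go
+// Illustrative only: back up a group of PVCs with a bounded number of
+// concurrent workers, collecting results and errors without blocking.
+package main
+
+import (
+	"fmt"
+	"sync"
+)
+
+// backupPVC stands in for itembackupper.backupItem invoked on a PVC.
+func backupPVC(name string) (string, error) {
+	return "backed up " + name, nil
+}
+
+func main() {
+	pvcs := []string{"pvc-a", "pvc-b", "pvc-c"}
+
+	maxConcurrency := 2 // hypothetical knob exposed to the end user
+	sem := make(chan struct{}, maxConcurrency)
+
+	// Buffer both channels to the number of items so the goroutines never
+	// block on send and wg.Wait() cannot deadlock.
+	results := make(chan string, len(pvcs))
+	errs := make(chan error, len(pvcs))
+
+	var wg sync.WaitGroup
+	for _, pvc := range pvcs {
+		wg.Add(1)
+		go func(name string) {
+			defer wg.Done()
+			sem <- struct{}{}        // acquire a worker slot
+			defer func() { <-sem }() // release it
+			res, err := backupPVC(name)
+			if err != nil {
+				errs <- err
+				return
+			}
+			results <- res
+		}(pvc)
+	}
+
+	wg.Wait()
+	close(results)
+	close(errs)
+
+	for r := range results {
+		fmt.Println(r)
+	}
+	for err := range errs {
+		fmt.Println("error:", err)
+	}
+}
+```
+
+Because both channels are buffered to `len(pvcs)`, a goroutine can never block on send, so `wg.Wait()` cannot deadlock; the semaphore gives the end user a single knob for the maximum number of concurrent snapshots.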
+
+## Open Problems
+- Ensuring thread safety.
+- Ensuring that PVBs triggered in parallel work as expected.
+
+## Alternatives Considered
+
+### Approach 1: Add support for VolumeGroupSnapshot in Velero.
+- [Volume Group Snapshots](https://kubernetes.io/blog/2023/05/08/kubernetes-1-27-volume-group-snapshot-alpha/) were introduced as an alpha feature in Kubernetes v1.27. The feature introduces a Kubernetes API that allows users to take crash-consistent snapshots of multiple volumes together. It uses a **label selector to group multiple PersistentVolumeClaims** for snapshotting.
+- This is out of scope for the current design since the API is not yet Beta and does not affect the performance improvements proposed here.
+
+### Approach 2: Create a Pod BIA plugin which invokes the CSI plugin in parallel for a group of PVCs.
+- Create a Pod BIA plugin which invokes the CSI plugin in parallel for the group of PVCs used by the pod.
+- This would lead to code and logic duplication across the CSI plugin and the pod plugin.
+- With BIAv2 this is complicated to achieve, since a single pod plugin would have to return N operation IDs for N PVCs, while there is only support for one operation ID at a time. Hacking around this by overloading the single operation ID field would complicate the code and would not be a clean approach.
+
+## Security Considerations
+No security impact.
+
+## Compatibility
+
+## Future enhancement
+
+## Open Issues
+NA
diff --git a/pkg/backup/item_backupper.go b/pkg/backup/item_backupper.go
index f1cafbfcad..7cfbb5385c 100644
--- a/pkg/backup/item_backupper.go
+++ b/pkg/backup/item_backupper.go
@@ -22,6 +22,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strings"
+	"sync"
 	"time"
 
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -166,7 +167,7 @@ func (ib *itemBackupper) backupItemInternal(logger logrus.FieldLogger, obj runti
 		namespace: namespace,
 		name:      name,
 	}
-
+	// A mutex on BackedUpItems is needed once items are backed up concurrently.
 	if _, exists := ib.backupRequest.BackedUpItems[key]; exists {
 		log.Info("Skipping item because it's already been backed up.")
 		// returning true since this item *is* in the backup, even though we're not backing it up here
@@ -408,7 +409,22 @@
 			itemOperList := ib.backupRequest.GetItemOperationsList()
 			*itemOperList = append(*itemOperList, &newOperation)
 		}
-
+		// ## Approach 1
+		// Extract all PVCs from the additional items.
+		// Create a label such as :<>
+		// Apply the label to all of these PVCs.
+		// Create a VolumeSnapshotGroup CR with a matching label selector.
+		// Invoke the VolumeSnapshotGroup action with the CR.
+		// Poll for the VSG in the CSI VSG plugin's Execute().
+		// Return the additional items and continue the backup.
+
+		// ## Approach 2
+		// Asynchronously call the current CSI plugin's Execute() and wait for
+		// all snapshots to complete. Max parallelism can be controlled by
+		// bounding the number of goroutines (e.g. with a semaphore channel).
+		// Both channels are buffered to the number of additional items so the
+		// goroutines below can never block on send before wg.Wait() returns.
+		additionalItemFilesChannel := make(chan []FileForArchive, len(additionalItemIdentifiers))
+		errChannel := make(chan error, len(additionalItemIdentifiers))
+		var wg sync.WaitGroup
 		for _, additionalItem := range additionalItemIdentifiers {
 			gvr, resource, err := ib.discoveryHelper.ResourceFor(additionalItem.GroupResource.WithVersion(""))
 			if err != nil {
 				return nil, itemFiles, errors.WithStack(err)
 			}
@@ -433,12 +449,32 @@
 			if err != nil {
 				return nil, itemFiles, errors.WithStack(err)
 			}
-
-			_, additionalItemFiles, err := ib.backupItem(log, item, gvr.GroupResource(), gvr, mustInclude, finalize)
-			if err != nil {
-				return nil, itemFiles, err
-			}
-			itemFiles = append(itemFiles, additionalItemFiles...)
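+			// Back up each additional item in its own goroutine. Results and
+			// errors are reported on the buffered channels created above, so
+			// no send can block before wg.Wait() returns.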
+			wg.Add(1)
+			go func(additionalItem velero.ResourceIdentifier, log logrus.FieldLogger, item runtime.Unstructured, gvr schema.GroupVersionResource, mustInclude, finalize bool) {
+				defer wg.Done()
+				log.WithFields(logrus.Fields{
+					"groupResource": additionalItem.GroupResource,
+					"namespace":     additionalItem.Namespace,
+					"name":          additionalItem.Name,
+				}).Infof("Triggering async backupItem for additional item")
+				_, additionalItemFiles, err := ib.backupItem(log, item, gvr.GroupResource(), gvr, mustInclude, finalize)
+				if err != nil {
+					errChannel <- err
+					return
+				}
+				// Send all files for this item as one slice; the channel is
+				// buffered, so this never blocks.
+				additionalItemFilesChannel <- additionalItemFiles
+			}(additionalItem, log, item.DeepCopy(), gvr, mustInclude, finalize)
+		}
+		// Every send above is buffered, so Wait() cannot deadlock; drain the
+		// channels only after all goroutines have finished.
+		wg.Wait()
+		close(additionalItemFilesChannel)
+		close(errChannel)
+		for itemFilesFromChannel := range additionalItemFilesChannel {
+			itemFiles = append(itemFiles, itemFilesFromChannel...)
+		}
+		for err := range errChannel {
+			return nil, itemFiles, err
 		}
 	}
 	return obj, itemFiles, nil
@@ -658,6 +694,7 @@ func (ib *itemBackupper) getMatchAction(obj runtime.Unstructured, groupResource
 // this function will be called throughout the process of backup, it needs to handle any object
 func (ib *itemBackupper) trackSkippedPV(obj runtime.Unstructured, groupResource schema.GroupResource, approach string, reason string, log logrus.FieldLogger) {
 	if name, err := getPVName(obj, groupResource); len(name) > 0 && err == nil {
+		// SkippedPVTracker.Track already takes its own mutex, so this call is safe under concurrency.
 		ib.backupRequest.SkippedPVTracker.Track(name, approach, reason)
 	} else if err != nil {
 		log.WithError(err).Warnf("unable to get PV name, skip tracking.")
diff --git a/pkg/backup/pvc_snapshot_tracker.go b/pkg/backup/pvc_snapshot_tracker.go
index fd9d68899a..cdf9ce3638 100644
--- a/pkg/backup/pvc_snapshot_tracker.go
+++ b/pkg/backup/pvc_snapshot_tracker.go
@@ -77,12 +77,14 @@ func (t *pvcSnapshotTracker) recordStatus(pod *corev1api.Pod, volumeName string,
 	for _, volume := range pod.Spec.Volumes {
 		if volume.Name == volumeName {
 			if volume.PersistentVolumeClaim != nil {
+				// A lock on pvcPod is needed: concurrent backups may write to it at the same time.
 				t.pvcPod[key(pod.Namespace, volume.PersistentVolumeClaim.ClaimName)] = pod.Name
 				currStatus, ok := t.pvcs[key(pod.Namespace, volume.PersistentVolumeClaim.ClaimName)]
 				if !ok {
 					currStatus = pvcSnapshotStatusNotTracked
 				}
 				if currStatus == preReqStatus {
+					// A lock on pvcs is needed here as well.
 					t.pvcs[key(pod.Namespace, volume.PersistentVolumeClaim.ClaimName)] = status
 				}
 			}