-
Notifications
You must be signed in to change notification settings - Fork 524
CreateAndInitializeAsync: Adds Code to Optimize Rntbd Open Connection Logic to Open Connections in Parallel. #3939
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,11 @@ internal class GatewayAddressCache : IAddressCache, IDisposable | |
| private const string AddressResolutionBatchSize = "AddressResolutionBatchSize"; | ||
| private const int DefaultBatchSize = 50; | ||
|
|
||
| // This warmup cache and connection timeout is meant to mimic an indefinite timeframe till which | ||
| // a delay task will run, until a cancellation token is requested to cancel the task. The default | ||
| // value for this timeout is 45 minutes at the moment. | ||
| private static readonly TimeSpan WarmupCacheAndOpenConnectionTimeout = TimeSpan.FromMinutes(45); | ||
|
|
||
| private readonly Uri serviceEndpoint; | ||
| private readonly Uri addressEndpoint; | ||
|
|
||
|
|
@@ -113,7 +118,7 @@ public async Task OpenConnectionsAsync( | |
| bool shouldOpenRntbdChannels, | ||
| CancellationToken cancellationToken) | ||
| { | ||
| List<Task<TryCatch<DocumentServiceResponse>>> tasks = new (); | ||
| List<Task> tasks = new (); | ||
| int batchSize = GatewayAddressCache.DefaultBatchSize; | ||
|
|
||
| #if !(NETSTANDARD15 || NETSTANDARD16) | ||
|
|
@@ -147,50 +152,33 @@ public async Task OpenConnectionsAsync( | |
| { | ||
| for (int i = 0; i < partitionKeyRangeIdentities.Count; i += batchSize) | ||
| { | ||
| tasks | ||
| .Add(this.GetAddressesAsync( | ||
| request: request, | ||
| collectionRid: collection.ResourceId, | ||
| partitionKeyRangeIds: partitionKeyRangeIdentities.Skip(i).Take(batchSize).Select(range => range.PartitionKeyRangeId))); | ||
| tasks.Add( | ||
| this.WarmupCachesAndOpenConnectionsAsync( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wire CancellationTokens?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my understanding, the |
||
| request: request, | ||
| collectionRid: collection.ResourceId, | ||
| partitionKeyRangeIds: partitionKeyRangeIdentities.Skip(i).Take(batchSize).Select(range => range.PartitionKeyRangeId), | ||
| containerProperties: collection, | ||
| shouldOpenRntbdChannels: shouldOpenRntbdChannels)); | ||
| } | ||
| } | ||
|
|
||
| foreach (TryCatch<DocumentServiceResponse> task in await Task.WhenAll(tasks)) | ||
| { | ||
| if (task.Failed) | ||
| { | ||
| continue; | ||
| } | ||
| using CancellationTokenSource linkedTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); | ||
|
|
||
| using (DocumentServiceResponse response = task.Result) | ||
| { | ||
| FeedResource<Address> addressFeed = response.GetResource<FeedResource<Address>>(); | ||
|
|
||
| bool inNetworkRequest = this.IsInNetworkRequest(response); | ||
| // The `timeoutTask` is a background task which adds a delay for a period of WarmupCacheAndOpenConnectionTimeout. The task will | ||
| // be cancelled either by - a) when `linkedTokenSource` expires, which means the original `cancellationToken` expires or | ||
| // b) the the `linkedTokenSource.Cancel()` is called. | ||
| Task timeoutTask = Task.Delay(GatewayAddressCache.WarmupCacheAndOpenConnectionTimeout, linkedTokenSource.Token); | ||
| Task resultTask = await Task.WhenAny(Task.WhenAll(tasks), timeoutTask); | ||
|
|
||
| IEnumerable<Tuple<PartitionKeyRangeIdentity, PartitionAddressInformation>> addressInfos = | ||
| addressFeed.Where(addressInfo => ProtocolFromString(addressInfo.Protocol) == this.protocol) | ||
| .GroupBy(address => address.PartitionKeyRangeId, StringComparer.Ordinal) | ||
| .Select(group => this.ToPartitionAddressAndRange(collection.ResourceId, @group.ToList(), inNetworkRequest)); | ||
|
|
||
| foreach (Tuple<PartitionKeyRangeIdentity, PartitionAddressInformation> addressInfo in addressInfos) | ||
| { | ||
| this.serverPartitionAddressCache.Set( | ||
| new PartitionKeyRangeIdentity(collection.ResourceId, addressInfo.Item1.PartitionKeyRangeId), | ||
| addressInfo.Item2); | ||
|
|
||
| // The `shouldOpenRntbdChannels` boolean flag indicates whether the SDK should establish Rntbd connections to the | ||
| // backend replica nodes. For the `CosmosClient.CreateAndInitializeAsync()` flow, the flag should be passed as | ||
| // `true` so that the Rntbd connections to the backend replicas could be established deterministically. For any | ||
| // other flow, the flag should be passed as `false`. | ||
| if (this.openConnectionsHandler != null && shouldOpenRntbdChannels) | ||
| { | ||
| await this.openConnectionsHandler | ||
| .TryOpenRntbdChannelsAsync( | ||
| addresses: addressInfo.Item2.Get(Protocol.Tcp)?.ReplicaTransportAddressUris); | ||
| } | ||
| } | ||
| } | ||
| if (resultTask == timeoutTask) | ||
| { | ||
| // Operation has been cancelled. | ||
| DefaultTrace.TraceWarning("The open connection task was cancelled because the cancellation token was expired. '{0}'", | ||
| System.Diagnostics.Trace.CorrelationManager.ActivityId); | ||
| } | ||
| else | ||
| { | ||
| linkedTokenSource.Cancel(); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -350,6 +338,81 @@ public async Task<PartitionAddressInformation> TryGetAddressesAsync( | |
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Gets the address information from the gateway using the partition key range ids, and warms up the async non blocking cache | ||
| /// by inserting them as a key value pair for later lookup. Additionally attempts to establish Rntbd connections to the backend | ||
| /// replicas based on `shouldOpenRntbdChannels` boolean flag. | ||
| /// </summary> | ||
| /// <param name="request">An instance of <see cref="DocumentServiceRequest"/> containing the request payload.</param> | ||
| /// <param name="collectionRid">A string containing the collection ids.</param> | ||
| /// <param name="partitionKeyRangeIds">An instance of <see cref="IEnumerable{T}"/> containing the list of partition key range ids.</param> | ||
| /// <param name="containerProperties">An instance of <see cref="ContainerProperties"/> containing the collection properties.</param> | ||
| /// <param name="shouldOpenRntbdChannels">A boolean flag indicating whether Rntbd connections are required to be established to the backend replica nodes.</param> | ||
| private async Task WarmupCachesAndOpenConnectionsAsync( | ||
| DocumentServiceRequest request, | ||
| string collectionRid, | ||
| IEnumerable<string> partitionKeyRangeIds, | ||
| ContainerProperties containerProperties, | ||
| bool shouldOpenRntbdChannels) | ||
| { | ||
| TryCatch<DocumentServiceResponse> documentServiceResponseWrapper = await this.GetAddressesAsync( | ||
| request: request, | ||
| collectionRid: collectionRid, | ||
| partitionKeyRangeIds: partitionKeyRangeIds); | ||
|
|
||
| if (documentServiceResponseWrapper.Failed) | ||
| { | ||
| return; | ||
ealsur marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| try | ||
| { | ||
| using (DocumentServiceResponse response = documentServiceResponseWrapper.Result) | ||
| { | ||
| FeedResource<Address> addressFeed = response.GetResource<FeedResource<Address>>(); | ||
|
|
||
| bool inNetworkRequest = this.IsInNetworkRequest(response); | ||
|
|
||
| IEnumerable<Tuple<PartitionKeyRangeIdentity, PartitionAddressInformation>> addressInfos = | ||
| addressFeed.Where(addressInfo => ProtocolFromString(addressInfo.Protocol) == this.protocol) | ||
| .GroupBy(address => address.PartitionKeyRangeId, StringComparer.Ordinal) | ||
| .Select(group => this.ToPartitionAddressAndRange(containerProperties.ResourceId, @group.ToList(), inNetworkRequest)); | ||
|
|
||
| List<Task> openConnectionTasks = new (); | ||
| foreach (Tuple<PartitionKeyRangeIdentity, PartitionAddressInformation> addressInfo in addressInfos) | ||
| { | ||
| this.serverPartitionAddressCache.Set( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yield based on cancelaltion after wiring through?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please help me understanding this better ? Currently there is no need to have the
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. stop the foreach if the cancellation is signaling (if ct.IsCancellationRequested break)
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OnCancellation behavior: With new changes its trying to gracefully complete right?
|
||
| new PartitionKeyRangeIdentity(containerProperties.ResourceId, addressInfo.Item1.PartitionKeyRangeId), | ||
| addressInfo.Item2); | ||
|
|
||
| // The `shouldOpenRntbdChannels` boolean flag indicates whether the SDK should establish Rntbd connections to the | ||
| // backend replica nodes. For the `CosmosClient.CreateAndInitializeAsync()` flow, the flag should be passed as | ||
| // `true` so that the Rntbd connections to the backend replicas could be established deterministically. For any | ||
| // other flow, the flag should be passed as `false`. | ||
| if (this.openConnectionsHandler != null && shouldOpenRntbdChannels) | ||
| { | ||
| openConnectionTasks | ||
| .Add(this.openConnectionsHandler | ||
| .TryOpenRntbdChannelsAsync( | ||
| addresses: addressInfo.Item2.Get(Protocol.Tcp)?.ReplicaTransportAddressUris)); | ||
| } | ||
| } | ||
|
|
||
| if (openConnectionTasks.Any()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: What is more performant? Any() or Count > 0?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the check necessary?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, I think we can remove this condition. The |
||
| { | ||
| await Task.WhenAll(openConnectionTasks); | ||
| } | ||
| } | ||
| } | ||
| catch (Exception ex) | ||
| { | ||
| DefaultTrace.TraceWarning("Failed to warm-up caches and open connections for the server addresses: {0} with exception: {1}. '{2}'", | ||
| collectionRid, | ||
| ex, | ||
| System.Diagnostics.Trace.CorrelationManager.ActivityId); | ||
| } | ||
| } | ||
|
|
||
| private static void SetTransportAddressUrisToUnhealthy( | ||
| PartitionAddressInformation stalePartitionAddressInformation, | ||
| Lazy<HashSet<TransportAddressUri>> failedEndpoints) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How is 45M determined/tuned future?
Incoming cancellatoinToken is an option right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Incoming cancellation is honored through:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
True, just incoming cancellationToken might suffice right? Why is extra 45M cap?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The 45 Min cap is there to honor the connection opening time, when there is no cancellation token provided. In order to understand the wiring better, please see the code snippet below:
In the above code, the
resultTaskwill be completed, either when thetimeoutTaskis completed, i.e. thecancellationTokenis expired OR theTask.WhenAll(tasks), which is opening the rntbd connections is completed. Today, in .net standard, theTask.WhenAll()doesn't allow a cancellation token to cancel the tasks. There is a Task.WaitAsync on .NET >= 6, which allows cancellation token, however it is not supported in the current .net framework. This is the reason we are taking this alternate route to honor the cancellation token.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Incoming cancellation is optional, what happens if the user does not provide one? How do we timebox this in case something hangs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personal opinion, 45M too long for any startup task (might be as bad as deadlock for them).
Its a new behavior right, is it an ASK from customers?