-
Notifications
You must be signed in to change notification settings - Fork 524
[Internal] Upgrade Resiliency: Fixes Duplicate Channel and Task Creation. #4123
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f81eef2
27832ba
adfaf8e
94d2eff
0f35c93
5c06a8a
9c2b7b9
8426716
c21d09c
f026c12
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,12 +50,14 @@ internal class GatewayAddressCache : IAddressCache, IDisposable | |
| private readonly ICosmosAuthorizationTokenProvider tokenProvider; | ||
| private readonly bool enableTcpConnectionEndpointRediscovery; | ||
|
|
||
| private readonly SemaphoreSlim semaphore; | ||
| private readonly CosmosHttpClient httpClient; | ||
| private readonly bool isReplicaAddressValidationEnabled; | ||
|
|
||
| private Tuple<PartitionKeyRangeIdentity, PartitionAddressInformation> masterPartitionAddressCache; | ||
| private DateTime suboptimalMasterPartitionTimestamp; | ||
| private bool disposedValue; | ||
| private bool validateUnknownReplicas; | ||
| private IOpenConnectionsHandler openConnectionsHandler; | ||
|
|
||
| public GatewayAddressCache( | ||
|
|
@@ -90,8 +92,10 @@ public GatewayAddressCache( | |
| Constants.Properties.Protocol, | ||
| GatewayAddressCache.ProtocolString(this.protocol)); | ||
|
|
||
| this.semaphore = new SemaphoreSlim(1, 1); | ||
| this.openConnectionsHandler = openConnectionsHandler; | ||
| this.isReplicaAddressValidationEnabled = replicaAddressValidationEnabled; | ||
| this.validateUnknownReplicas = false; | ||
| } | ||
|
|
||
| public Uri ServiceEndpoint => this.serviceEndpoint; | ||
|
|
@@ -120,6 +124,14 @@ public async Task OpenConnectionsAsync( | |
| List<Task> tasks = new (); | ||
| int batchSize = GatewayAddressCache.DefaultBatchSize; | ||
|
|
||
| // By design, the Unknown replicas are validated only when the following two conditions meet: | ||
| // 1) The CosmosClient is initiated using the CreateAndInitializaAsync() flow. | ||
| // 2) The advanced replica selection feature enabled. | ||
| if (shouldOpenRntbdChannels) | ||
| { | ||
| this.validateUnknownReplicas = true; | ||
| } | ||
|
|
||
| #if !(NETSTANDARD15 || NETSTANDARD16) | ||
| #if NETSTANDARD20 | ||
| // GetEntryAssembly returns null when loaded from native netstandard2.0 | ||
|
|
@@ -302,11 +314,12 @@ public async Task<PartitionAddressInformation> TryGetAddressesAsync( | |
| .ReplicaTransportAddressUris | ||
| .Any(x => x.ShouldRefreshHealthStatus())) | ||
| { | ||
| Task refreshAddressesInBackgroundTask = Task.Run(async () => | ||
| bool slimAcquired = await this.semaphore.WaitAsync(0); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Clarification: What will happen if this check not exists?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the below condition doesn't exist, then there is no gate to detect if other threads has acquired the semaphore in the past and already scheduled a background refresh task. So, without the check, the Also, the code snippet |
||
| try | ||
kundadebdatta marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| { | ||
| try | ||
| if (slimAcquired) | ||
| { | ||
| await this.serverPartitionAddressCache.RefreshAsync( | ||
| this.serverPartitionAddressCache.Refresh( | ||
| key: partitionKeyRangeIdentity, | ||
| singleValueInitFunc: (currentCachedValue) => this.GetAddressesForRangeIdAsync( | ||
| request, | ||
|
|
@@ -315,14 +328,21 @@ await this.serverPartitionAddressCache.RefreshAsync( | |
| partitionKeyRangeIdentity.PartitionKeyRangeId, | ||
| forceRefresh: true)); | ||
| } | ||
| catch (Exception ex) | ||
| else | ||
| { | ||
| DefaultTrace.TraceWarning("Failed to refresh addresses in the background for the collection rid: {0} with exception: {1}. '{2}'", | ||
| DefaultTrace.TraceVerbose("Failed to refresh addresses in the background for the collection rid: {0}, partition key range id: {1}, because the semaphore is already acquired. '{2}'", | ||
| partitionKeyRangeIdentity.CollectionRid, | ||
| ex, | ||
| partitionKeyRangeIdentity.PartitionKeyRangeId, | ||
| System.Diagnostics.Trace.CorrelationManager.ActivityId); | ||
| } | ||
| }); | ||
| } | ||
| finally | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: try-finally be moved inside IF (code refractoring and fainally clause then don't need explicit check
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed. I will refactor this with the next PR to master. Good catch! |
||
| { | ||
| if (slimAcquired) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should not this semaphore be released only when the refresh completed?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed this offline. The below code should cover this: The semaphore will be released as soon as the task is successfully scheduled. and the above check is good enough to block other parallel threads to create duplicate tasks. |
||
| { | ||
| this.semaphore.Release(); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return addresses; | ||
|
|
@@ -1008,18 +1028,26 @@ private static PartitionAddressInformation MergeAddresses( | |
| /// Returns a list of <see cref="TransportAddressUri"/> needed to validate their health status. Validating | ||
| /// a uri is done by opening Rntbd connection to the backend replica, which is a costly operation by nature. Therefore | ||
| /// vaidating both Unhealthy and Unknown replicas at the same time could impose a high CPU utilization. To avoid this | ||
| /// situation, the RntbdOpenConnectionHandler has good concurrency control mechanism to open the connections gracefully/>. | ||
| /// situation, the RntbdOpenConnectionHandler has good concurrency control mechanism to open the connections gracefully. | ||
| /// By default, this method only returns the Unhealthy replicas that requires to validate it's connectivity status. The | ||
| /// Unknown replicas are validated only when the CosmosClient is initiated using the CreateAndInitializaAsync() flow. | ||
| /// </summary> | ||
| /// <param name="transportAddresses">A read only list of <see cref="TransportAddressUri"/>s.</param> | ||
| /// <returns>A list of <see cref="TransportAddressUri"/> that needs to validate their status.</returns> | ||
| private IEnumerable<TransportAddressUri> GetAddressesNeededToValidateStatus( | ||
| IReadOnlyList<TransportAddressUri> transportAddresses) | ||
| { | ||
| return transportAddresses | ||
| .Where(address => address | ||
| return this.validateUnknownReplicas | ||
| ? transportAddresses | ||
| .Where(address => address | ||
| .GetCurrentHealthState() | ||
| .GetHealthStatus() is | ||
| TransportAddressHealthState.HealthStatus.UnhealthyPending or | ||
| TransportAddressHealthState.HealthStatus.Unknown) | ||
| : transportAddresses | ||
| .Where(address => address | ||
| .GetCurrentHealthState() | ||
| .GetHealthStatus() is | ||
| TransportAddressHealthState.HealthStatus.Unknown or | ||
| TransportAddressHealthState.HealthStatus.UnhealthyPending); | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.