Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
e18aa95
Initial code changes to throw 503 on 429/3092.
kundadebdatta Sep 11, 2024
e44f080
Updated client retry policy. Added more tests to cover 429/3092.
kundadebdatta Sep 11, 2024
9006b5f
Code changes to update direct package version. Updating the tests.
kundadebdatta Sep 12, 2024
e7710c1
Code changes to refactor client retry policy.
kundadebdatta Sep 12, 2024
5a1b505
Minor code cleanup.
kundadebdatta Sep 12, 2024
ebf1a11
Merge branch 'master' into users/kundadebdatta/4390_cross_regional_re…
kundadebdatta Sep 12, 2024
91cc3ea
Reverting the direct version bump up change.
kundadebdatta Sep 13, 2024
b479714
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 13, 2024
6ebb9e6
Code changes to address some of the review comments.
kundadebdatta Sep 13, 2024
47cacdf
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 13, 2024
e6e2766
Code changes to move failover logic in client retry policy.
kundadebdatta Sep 14, 2024
9575986
Minor code clean up.
kundadebdatta Sep 14, 2024
0c5304d
Code changes to clean up some cosmetic items.
kundadebdatta Sep 15, 2024
ce3c62f
Further clean up.
kundadebdatta Sep 15, 2024
bf5e649
Merge branch 'master' into users/kundadebdatta/4656_cross_regional_re…
kundadebdatta Sep 16, 2024
973f1f1
Code changes to address review comments.
kundadebdatta Sep 17, 2024
40560cc
Minor refactor to address cosmetic update.
kundadebdatta Sep 17, 2024
a198a04
Code changes to address cosmetic review comment.
kundadebdatta Sep 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Code changes to refactor client retry policy.
  • Loading branch information
kundadebdatta committed Sep 12, 2024
commit e7710c1a2d0f65539f67f5e63ac5e28c55a55a03
71 changes: 39 additions & 32 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,13 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(

ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken);

// Received 503 due to client connect timeout or Gateway
// Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy
// is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503.
if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

return this.ShouldRetryOnServiceUnavailable();
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return throttleRetryResult;
Expand Down Expand Up @@ -163,19 +157,13 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(

ShouldRetryResult throttleRetryResult = await this.throttlingRetry.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);

// Received 503 due to client connect timeout or Gateway
// Today, the only scenario where we would receive a ServiceUnavailableException from the Throttling Retry Policy
// is when we get 410 (Gone) with sub status code 3092 (System Resource Not Available). Note that this is applicable
// for write requests targeted to a multiple master account. In such case, the 410/3092 will get converted into 503.
if (throttleRetryResult.ExceptionToThrow is ServiceUnavailableException)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

return this.ShouldRetryOnServiceUnavailable();
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return throttleRetryResult;
Expand Down Expand Up @@ -310,16 +298,8 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
// Received 503 due to client connect timeout or Gateway
if (statusCode == HttpStatusCode.ServiceUnavailable)
{
DefaultTrace.TraceWarning("ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

// Mark the partition as unavailable.
// Let the ClientRetry logic decide if the request should be retried
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);

return this.ShouldRetryOnServiceUnavailable();
return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
shouldMarkEndpointUnavailableForPkRange: true);
}

return null;
Expand Down Expand Up @@ -442,6 +422,33 @@ private ShouldRetryResult ShouldRetryOnSessionNotAvailable(DocumentServiceReques
}
}

/// <summary>
/// Handles a ServiceUnavailable (503) outcome. A warning is traced that names the failed location and the
/// resource address of the current request. If requested, the partition key range location cache is then asked
/// to mark the endpoint as unavailable for the request's partition key range. Finally, the retry decision is
/// delegated to <see cref="ShouldRetryOnServiceUnavailable"/>.
/// </summary>
/// <param name="shouldMarkEndpointUnavailableForPkRange">When <c>true</c>, the endpoint is marked as unavailable
/// for the current partition key range before the retry decision is made. When <c>false</c>, that step is skipped
/// and only the trace and the retry decision take place.</param>
/// <returns>The <see cref="ShouldRetryResult"/> returned by <see cref="ShouldRetryOnServiceUnavailable"/>.</returns>
private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable(
    bool shouldMarkEndpointUnavailableForPkRange)
{
    // Resolve the trace arguments up front; either may be missing, in which case an empty string is logged.
    string failedLocation = this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty;
    string resourceAddress = this.documentServiceRequest?.ResourceAddress ?? string.Empty;

    DefaultTrace.TraceWarning(
        "ClientRetryPolicy: ServiceUnavailable. Refresh cache and retry. Failed Location: {0}; ResourceAddress: {1}",
        failedLocation,
        resourceAddress);

    if (!shouldMarkEndpointUnavailableForPkRange)
    {
        // Caller opted out of marking the partition; go straight to the retry decision.
        return this.ShouldRetryOnServiceUnavailable();
    }

    // Mark the partition as unavailable first, then let the ClientRetry logic decide whether to retry.
    this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(this.documentServiceRequest);

    return this.ShouldRetryOnServiceUnavailable();
}

/// <summary>
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally, or a Gateway request that returned a similar response because an endpoint is not yet available.
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
Expand Down