Skip to content

Commit 35edd39

Browse files
committed
Move updating connectivity state outside of subchannel lock
1 parent 60d9d2c commit 35edd39

File tree

2 files changed

+193
-2
lines changed

2 files changed

+193
-2
lines changed

src/Grpc.Net.Client/Balancer/Subchannel.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ public void UpdateAddresses(IReadOnlyList<BalancerAddress> addresses)
237237
/// </summary>
238238
public void RequestConnection()
239239
{
240+
var connectionRequested = false;
240241
lock (Lock)
241242
{
242243
switch (_state)
@@ -245,7 +246,8 @@ public void RequestConnection()
245246
SubchannelLog.ConnectionRequested(_logger, Id);
246247

247248
// Only start connecting underlying transport if in an idle state.
248-
UpdateConnectivityState(ConnectivityState.Connecting, "Connection requested.");
249+
// Update connectivity state outside of subchannel lock to avoid deadlock.
250+
connectionRequested = true;
249251
break;
250252
case ConnectivityState.Connecting:
251253
case ConnectivityState.Ready:
@@ -264,6 +266,11 @@ public void RequestConnection()
264266
}
265267
}
266268

269+
if (connectionRequested)
270+
{
271+
UpdateConnectivityState(ConnectivityState.Connecting, "Connection requested.");
272+
}
273+
267274
// Don't capture the current ExecutionContext and its AsyncLocals onto the connect
268275
var restoreFlow = false;
269276
if (!ExecutionContext.IsFlowSuppressed())
@@ -448,6 +455,8 @@ internal bool UpdateConnectivityState(ConnectivityState state, string successDet
448455

449456
internal bool UpdateConnectivityState(ConnectivityState state, Status status)
450457
{
458+
Debug.Assert(!Monitor.IsEntered(Lock), "Ensure the subchannel lock isn't held here. Updating channel state with the subchannel lock can cause a deadlock.");
459+
451460
lock (Lock)
452461
{
453462
// Don't update subchannel state if the state is the same or the subchannel has been shutdown.
@@ -462,7 +471,7 @@ internal bool UpdateConnectivityState(ConnectivityState state, Status status)
462471
}
463472
_state = state;
464473
}
465-
474+
466475
// Notify channel outside of lock to avoid deadlocks.
467476
_manager.OnSubchannelStateChange(this, state, status);
468477
return true;

test/Grpc.Net.Client.Tests/Balancer/ConnectionManagerTests.cs

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#endregion
1818

1919
#if SUPPORT_LOAD_BALANCING
20+
using System.Diagnostics;
2021
using System.Net;
2122
using System.Threading.Channels;
2223
using Greet;
@@ -29,6 +30,7 @@
2930
using Grpc.Tests.Shared;
3031
using Microsoft.Extensions.DependencyInjection;
3132
using Microsoft.Extensions.Logging;
33+
using Microsoft.Extensions.Logging.Abstractions;
3234
using Microsoft.Extensions.Logging.Testing;
3335
using NUnit.Framework;
3436
using ChannelState = Grpc.Net.Client.Balancer.ChannelState;
@@ -535,6 +537,186 @@ public async Task PickAsync_DoesNotDeadlockAfterReconnect_WithZeroAddressResolve
535537
await pickTask.DefaultTimeout();
536538
}
537539

540+
[Test]
541+
public async Task PickAsync_UpdateAddressesWhileRequestingConnection_DoesNotDeadlock()
542+
{
543+
var services = new ServiceCollection();
544+
services.AddNUnitLogger();
545+
546+
var testSink = new TestSink();
547+
var testProvider = new TestLoggerProvider(testSink);
548+
549+
services.AddLogging(b =>
550+
{
551+
b.AddProvider(testProvider);
552+
});
553+
554+
await using var serviceProvider = services.BuildServiceProvider();
555+
var loggerFactory = serviceProvider.GetRequiredService<ILoggerFactory>();
556+
557+
var resolver = new TestResolver(loggerFactory);
558+
resolver.UpdateAddresses(new List<BalancerAddress>
559+
{
560+
new BalancerAddress("localhost", 80)
561+
});
562+
563+
var channelOptions = new GrpcChannelOptions();
564+
565+
var transportFactory = new TestSubchannelTransportFactory();
566+
var clientChannel = CreateConnectionManager(loggerFactory, resolver, transportFactory, new[] { new PickFirstBalancerFactory() });
567+
// Configure balancer similar to how GrpcChannel constructor does it
568+
clientChannel.ConfigureBalancer(c => new ChildHandlerLoadBalancer(
569+
c,
570+
channelOptions.ServiceConfig,
571+
clientChannel));
572+
573+
await clientChannel.ConnectAsync(waitForReady: true, cancellationToken: CancellationToken.None);
574+
575+
transportFactory.Transports.ForEach(t => t.Disconnect());
576+
577+
var requestConnectionSyncPoint = new SyncPoint(runContinuationsAsynchronously: true);
578+
testSink.MessageLogged += (w) =>
579+
{
580+
if (w.EventId.Name == "ConnectionRequested")
581+
{
582+
requestConnectionSyncPoint.WaitToContinue().Wait();
583+
}
584+
};
585+
586+
// Task should pause when requesting connection because of the logger sink.
587+
var pickTask = Task.Run(() => clientChannel.PickAsync(
588+
new PickContext { Request = new HttpRequestMessage() },
589+
waitForReady: true,
590+
CancellationToken.None).AsTask());
591+
592+
// Wait until we're paused on requesting a connection.
593+
await requestConnectionSyncPoint.WaitForSyncPoint().DefaultTimeout();
594+
595+
// Update addresses while requesting a connection.
596+
var updateAddressesTcs = new TaskCompletionSource<object?>(TaskCreationOptions.RunContinuationsAsynchronously);
597+
var updateAddressesTask = Task.Run(() =>
598+
{
599+
updateAddressesTcs.TrySetResult(null);
600+
resolver.UpdateAddresses(new List<BalancerAddress>
601+
{
602+
new BalancerAddress("localhost", 81)
603+
});
604+
});
605+
606+
// There isn't a clean way to wait for UpdateAddresses to be waiting for the subchannel lock.
607+
// Use a long delay to ensure we're waiting for the lock and are in the right state.
608+
await updateAddressesTcs.Task.DefaultTimeout();
609+
await Task.Delay(500);
610+
requestConnectionSyncPoint.Continue();
611+
612+
// Ensure the pick completes without deadlock.
613+
try
614+
{
615+
await pickTask.DefaultTimeout();
616+
}
617+
catch (TimeoutException ex)
618+
{
619+
throw new InvalidOperationException("Likely deadlock when picking subchannel.", ex);
620+
}
621+
}
622+
623+
[Test]
624+
public async Task PickAsync_ResolverUpdating_DoesNotDeadlock()
625+
{
626+
// Arrange
627+
var services = new ServiceCollection();
628+
//services.AddNUnitLogger();
629+
await using var serviceProvider = services.BuildServiceProvider();
630+
var loggerFactory = NullLoggerFactory.Instance;// serviceProvider.GetRequiredService<ILoggerFactory>();
631+
632+
var resolver = new TestResolver(loggerFactory);
633+
634+
var channelOptions = new GrpcChannelOptions();
635+
636+
var transportFactory = new TestSubchannelTransportFactory();
637+
var clientChannel = CreateConnectionManager(loggerFactory, resolver, transportFactory, new[] { new PickFirstBalancerFactory() });
638+
// Configure balancer similar to how GrpcChannel constructor does it
639+
clientChannel.ConfigureBalancer(c => new ChildHandlerLoadBalancer(
640+
c,
641+
channelOptions.ServiceConfig,
642+
clientChannel));
643+
644+
// Act
645+
var connectTask = clientChannel.ConnectAsync(waitForReady: true, cancellationToken: CancellationToken.None);
646+
647+
var t = Task.Run(async () =>
648+
{
649+
while (true)
650+
{
651+
await Task.Delay(1000);
652+
}
653+
});
654+
655+
// continiously update addresses
656+
_ = Task.Run(async () =>
657+
{
658+
var a1 = new List<BalancerAddress>
659+
{
660+
new BalancerAddress("localhost", 80)
661+
};
662+
var a2 = new List<BalancerAddress>
663+
{
664+
new BalancerAddress("localhost", 81)
665+
};
666+
var current = a1;
667+
668+
var count = 0;
669+
while (true)
670+
{
671+
current = count % 2 == 0 ? a1 : a2;
672+
//if (count > 10_000)
673+
//{
674+
// updateAddressesTcs.TrySetResult(null);
675+
//}
676+
677+
resolver.UpdateAddresses(current);
678+
count++;
679+
680+
//updateAddressChannel.Writer.TryWrite(true);
681+
682+
await Task.Delay(1);
683+
}
684+
});
685+
686+
// Simulate transport/network issue (with resolver reporting no addresses)
687+
//transportFactory.Transports.ForEach(t => t.Disconnect());
688+
689+
try
690+
{
691+
for (int i = 0; i < 10_000; i++)
692+
{
693+
694+
var pickTask = clientChannel.PickAsync(
695+
new PickContext { Request = new HttpRequestMessage() },
696+
waitForReady: true,
697+
CancellationToken.None).AsTask();
698+
//resolver.UpdateAddresses(new List<BalancerAddress>
699+
//{
700+
// new BalancerAddress("localhost", 80)
701+
//});
702+
703+
// Assert
704+
// Should not timeout (deadlock)
705+
await pickTask.DefaultTimeout();
706+
await Task.Delay(1);
707+
}
708+
709+
}
710+
catch (Exception ex)
711+
{
712+
_ = ex;
713+
Debugger.Launch();
714+
throw;
715+
}
716+
717+
await t;
718+
}
719+
538720
[Test]
539721
public async Task PickAsync_ExecutionContext_DoesNotCaptureAsyncLocalsInConnect()
540722
{

0 commit comments

Comments
 (0)