-
Notifications
You must be signed in to change notification settings - Fork 4.6k
pickfirstleaf: Avoid getting stuck in IDLE on connection breakage #8615
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1504,6 +1504,101 @@ func (s) TestPickFirstLeaf_AddressUpdateWithMetadata(t *testing.T) { | |
| } | ||
| } | ||
|
|
||
| // Tests the scenario where a connection is established and then breaks, leading | ||
| // to a reconnection attempt. While the reconnection is in progress, a resolver | ||
| // update with a new address is received. The test verifies that the balancer | ||
| // creates a new SubConn for the new address and that the ClientConn eventually | ||
| // becomes READY. | ||
| func (s) TestPickFirstLeaf_Reconnection(t *testing.T) { | ||
| ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout) | ||
| defer cancel() | ||
| cc := testutils.NewBalancerClientConn(t) | ||
| bal := balancer.Get(pickfirstleaf.Name).Build(cc, balancer.BuildOptions{}) | ||
| defer bal.Close() | ||
| ccState := balancer.ClientConnState{ | ||
| ResolverState: resolver.State{ | ||
| Endpoints: []resolver.Endpoint{ | ||
| {Addresses: []resolver.Address{{Addr: "1.1.1.1:1"}}}, | ||
| }, | ||
| }, | ||
| } | ||
| if err := bal.UpdateClientConnState(ccState); err != nil { | ||
| t.Fatalf("UpdateClientConnState(%v) returned error: %v", ccState, err) | ||
| } | ||
|
|
||
| select { | ||
| case state := <-cc.NewStateCh: | ||
| if got, want := state, connectivity.Connecting; got != want { | ||
| t.Fatalf("Received unexpected ClientConn sate: got %v, want %v", got, want) | ||
| } | ||
| case <-ctx.Done(): | ||
| t.Fatal("Context timed out waiting for ClientConn state update.") | ||
| } | ||
|
|
||
| sc1 := <-cc.NewSubConnCh | ||
| select { | ||
| case <-sc1.ConnectCh: | ||
| case <-ctx.Done(): | ||
| t.Fatal("Context timed out waiting for Connect() to be called on sc1.") | ||
| } | ||
| sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) | ||
| sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready}) | ||
|
|
||
| if err := cc.WaitForConnectivityState(ctx, connectivity.Ready); err != nil { | ||
| t.Fatalf("Context timed out waiting for ClientConn to become READY.") | ||
| } | ||
|
|
||
| // Simulate a connection breakage, this should result the channel | ||
| // transitioning to IDLE. | ||
| sc1.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Idle}) | ||
| if err := cc.WaitForConnectivityState(ctx, connectivity.Idle); err != nil { | ||
| t.Fatalf("Context timed out waiting for ClientConn to enter IDLE.") | ||
| } | ||
|
|
||
| // Calling the idle picker should result in the SubConn being re-connected. | ||
| if err := cc.WaitForPickerWithErr(ctx, balancer.ErrNoSubConnAvailable); err != nil { | ||
|
||
| t.Fatalf("cc.WaitForPickerWithErr(%v) returned error: %v", balancer.ErrNoSubConnAvailable, err) | ||
| } | ||
|
|
||
| select { | ||
| case <-sc1.ConnectCh: | ||
| case <-ctx.Done(): | ||
| t.Fatal("Context timed out waiting for Connect() to be called on sc1.") | ||
| } | ||
|
|
||
| // Send a resolver update, removing the existing SubConn. Since the balancer | ||
| // is connecting, it should create a new SubConn for the new backend | ||
| // address. | ||
| ccState = balancer.ClientConnState{ | ||
| ResolverState: resolver.State{ | ||
| Endpoints: []resolver.Endpoint{ | ||
| {Addresses: []resolver.Address{{Addr: "2.2.2.2:2"}}}, | ||
| }, | ||
| }, | ||
| } | ||
| if err := bal.UpdateClientConnState(ccState); err != nil { | ||
| t.Fatalf("UpdateClientConnState(%v) returned error: %v", ccState, err) | ||
| } | ||
|
|
||
| var sc2 *testutils.TestSubConn | ||
| select { | ||
| case sc2 = <-cc.NewSubConnCh: | ||
| case <-ctx.Done(): | ||
| t.Fatal("Context timed out waiting for new SubConn to be created.") | ||
| } | ||
|
|
||
| select { | ||
| case <-sc2.ConnectCh: | ||
| case <-ctx.Done(): | ||
| t.Fatal("Context timed out waiting for Connect() to be called on sc2.") | ||
| } | ||
| sc2.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Connecting}) | ||
| sc2.UpdateState(balancer.SubConnState{ConnectivityState: connectivity.Ready}) | ||
| if err := cc.WaitForConnectivityState(ctx, connectivity.Ready); err != nil { | ||
| t.Fatalf("Context timed out waiting for ClientConn to become READY.") | ||
| } | ||
| } | ||
|
|
||
| // healthListenerCapturingCCWrapper is used to capture the health listener so | ||
| // that health updates can be mocked for testing. | ||
| type healthListenerCapturingCCWrapper struct { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to have this as an e2e-style test. We wouldn't be able to use the blocking context dialer testutil that we have, since we can only control the connection after we start connecting. So, by that time, the subchannel would have already moved to
Connecting. I spent a couple of minutes on it, but couldn't think of a way to be able to do this in an e2e style. But if you have an idea that can work without much work, that would be great. But I wouldn't block this PR for that test. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought of using a real channel, but couldn't figure out a simple way to delay/drop the
Connecting update, so decided to use the fake channel. One solution I thought of was to wrap the SubConn state listener in a parent LB to intercept and drop connectivity updates. That was making the test more complex than using a fake channel.