Skip to content

Commit 397398d

Browse files
committed
[CLOUDGA-30570] Multiple tasks failed due to StatusRuntimeException: UNAVAILABLE: Channel shutdown invoked
Summary: A connection can be evicted early when the cache becomes full. Set an unlimited cache size for YBM; with no size limit, entries are evicted only by the idle-access expiry (expireAfterAccess). Test Plan: Sanity tests were run by creating a universe. For the port change, tested by killing the precheck midway (added -x to the generated script). ``` + ports=(7000 7100 9000 9100 18018 5433 9042 9070 9300 12000 13000) + vm_ip=127.0.0.1 + for port in "${ports[@]}" ++ start_server 7000 ++ local port=7000 ++ echo 275659 ++ python3 -m http.server 7000 + server_pid=275659 + '[' -n 275659 ']' + trap 'force_kill_on_exit 275659' EXIT + sleep 2 ^C++ force_kill_on_exit 275659 ++ local pid=275659 ++ ps -fp 275659 ++ grep http.server ++ kill -9 275659 [ec2-user@ip-10-9-94-179 scripts]$ ``` Reviewers: amalyshev, anijhawan, amindrov Reviewed By: amindrov Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D48791
1 parent 894b560 commit 397398d

File tree

8 files changed

+52
-13
lines changed

8 files changed

+52
-13
lines changed

managed/RUNTIME-FLAGS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@
176176
| "Enable Node Agent Message Compression" | "yb.node_agent.enable_message_compression" | "GLOBAL" | "Enable compression for message sent over node agent channel." | "Boolean" |
177177
| "GCP Blob Delete Retry Count" | "yb.gcp.blob_delete_retry_count" | "GLOBAL" | "Number of times to retry deleting blobs in GCP. This is used to handle the case where the blob deletion fails due to some transient error." | "Integer" |
178178
| "Node Agent Client Connection Cache Size" | "yb.node_agent.connection_cache_size" | "GLOBAL" | "Cache size for node agent client connections" | "Integer" |
179+
| "Ignore Node Agent Client Connection Cache Size" | "yb.node_agent.ignore_connection_cache_size" | "GLOBAL" | "Ignore the cache size (limit) for node agent client connections" | "Boolean" |
179180
| "Node Agent Client Connection Time-out" | "yb.node_agent.connect_timeout" | "GLOBAL" | "Client connection time-out for node agent." | "Duration" |
180181
| "Node Agent Client Idle Connection Time-out" | "yb.node_agent.idle_connection_timeout" | "GLOBAL" | "Client idle connection timeout for node agent." | "Duration" |
181182
| "Node Agent Client Keep Alive Time" | "yb.node_agent.connection_keep_alive_time" | "GLOBAL" | "Client connection keep-alive time for node agent." | "Duration" |

managed/node-agent/resources/ynp/configs/config.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ prometheus_user = prometheus
104104

105105
[ConfigureNetwork]
106106
ip_address={{ ynp_config['ynp'].node_ip }}
107-
ports = 7000 7100 9000 9100 18018 22 5433 9042 9070 9300 12000 13000
107+
ports = 7000 7100 9000 9100 18018 5433 9042 9070 9300 12000 13000
108108

109109
[InstallNodeAgent]
110110
{{ render_section(ynp_config['yba']) -}}

managed/node-agent/resources/ynp/modules/provision/network/templates/precheck.j2

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,21 @@ force_kill() {
4040
fi
4141
}
4242

43+
force_kill_on_exit() {
44+
local pid=$1
45+
# Check if the process is still running and kill it.
46+
if ps -fp $pid | grep "http.server" > /dev/null; then
47+
kill -9 $pid 2>/dev/null
48+
fi
49+
}
50+
4351
# Iterate through the list of ports, start server, check port, and stop server
4452
for port in "${ports[@]}"; do
4553
# Start the server and get its PID
4654
server_pid=$(start_server $port)
55+
if [ -n "$server_pid" ]; then
56+
trap "force_kill_on_exit $server_pid" EXIT
57+
fi
4758

4859
# Give the server a moment to start
4960
sleep 2

managed/src/main/java/com/yugabyte/yw/common/NodeAgentClient.java

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -178,25 +178,34 @@ public NodeAgentClient(
178178
this.confGetter = confGetter;
179179
this.nodeAgentEnablerProvider = nodeAgentEnablerProvider;
180180
this.channelFactory = channelFactory;
181-
this.cachedChannels =
181+
CacheBuilder<Object, Object> cacheBuilder =
182182
CacheBuilder.newBuilder()
183183
.removalListener(
184184
n -> {
185185
ManagedChannel channel = (ManagedChannel) n.getValue();
186-
log.debug("Channel for {} expired", n.getKey());
186+
log.debug(
187+
"Channel for {} expired. Current size: {}", n.getKey(), getClientCacheSize());
187188
if (!channel.isShutdown() && !channel.isTerminated()) {
188189
channel.shutdown();
189190
}
190191
})
191-
.expireAfterAccess(10, TimeUnit.MINUTES)
192-
.maximumSize(confGetter.getGlobalConf(GlobalConfKeys.nodeAgentConnectionCacheSize))
193-
.build(
194-
new CacheLoader<ChannelConfig, ManagedChannel>() {
195-
@Override
196-
public ManagedChannel load(ChannelConfig config) {
197-
return NodeAgentClient.this.channelFactory.get(config);
198-
}
199-
});
192+
.expireAfterAccess(10, TimeUnit.MINUTES);
193+
if (confGetter.getGlobalConf(GlobalConfKeys.nodeAgentIgnoreConnectionCacheSize)) {
194+
// Only LRU is effective.
195+
log.debug("Ignoring max cache size for node agent client connections");
196+
} else {
197+
int maxClients = confGetter.getGlobalConf(GlobalConfKeys.nodeAgentConnectionCacheSize);
198+
log.debug("Setting max cache size for node agent client connections to {}", maxClients);
199+
cacheBuilder = cacheBuilder.maximumSize(maxClients);
200+
}
201+
this.cachedChannels =
202+
cacheBuilder.build(
203+
new CacheLoader<ChannelConfig, ManagedChannel>() {
204+
@Override
205+
public ManagedChannel load(ChannelConfig config) {
206+
return NodeAgentClient.this.channelFactory.get(config);
207+
}
208+
});
200209
}
201210

202211
@Builder
@@ -1154,4 +1163,8 @@ private List<String> getBashCommand(List<String> command) {
11541163
.collect(Collectors.joining(" ")));
11551164
return shellCommand;
11561165
}
1166+
1167+
private long getClientCacheSize() {
1168+
return cachedChannels.size();
1169+
}
11571170
}

managed/src/main/java/com/yugabyte/yw/common/config/GlobalConfKeys.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,6 +1771,14 @@ public class GlobalConfKeys extends RuntimeConfigKeysModule {
17711771
"Cache size for node agent client connections",
17721772
ConfDataType.IntegerType,
17731773
ImmutableList.of(ConfKeyTags.PUBLIC));
1774+
public static final ConfKeyInfo<Boolean> nodeAgentIgnoreConnectionCacheSize =
1775+
new ConfKeyInfo<>(
1776+
"yb.node_agent.ignore_connection_cache_size",
1777+
ScopeType.GLOBAL,
1778+
"Ignore Node Agent Client Connection Cache Size",
1779+
"Ignore the cache size (limit) for node agent client connections",
1780+
ConfDataType.BooleanType,
1781+
ImmutableList.of(ConfKeyTags.PUBLIC));
17741782
public static final ConfKeyInfo<Duration> nodeAgentConnectTimeout =
17751783
new ConfKeyInfo<>(
17761784
"yb.node_agent.connect_timeout",

managed/src/main/resources/reference.conf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1352,7 +1352,8 @@ yb {
13521352
}
13531353

13541354
node_agent {
1355-
connection_cache_size = 100
1355+
connection_cache_size = 500
1356+
ignore_connection_cache_size = ${yb.cloud.enabled}
13561357
connect_timeout = 10 seconds
13571358
idle_connection_timeout = 5 minutes
13581359
connection_keep_alive_time = 10 seconds

managed/src/test/java/com/yugabyte/yw/commissioner/tasks/subtasks/YNPProvisioningTest.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ protected Application provideApplication() {
116116
lenient()
117117
.when(confGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentConnectionCacheSize)))
118118
.thenReturn(100);
119+
lenient()
120+
.when(confGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentIgnoreConnectionCacheSize)))
121+
.thenReturn(false);
119122
lenient()
120123
.when(confGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentTokenLifetime)))
121124
.thenReturn(Duration.ofHours(1));

managed/src/test/java/com/yugabyte/yw/common/NodeAgentClientTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,8 @@ public void describeTask(
190190
mockConfGetter = mock(RuntimeConfGetter.class);
191191
when(mockConfGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentConnectionCacheSize)))
192192
.thenReturn(100);
193+
when(mockConfGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentIgnoreConnectionCacheSize)))
194+
.thenReturn(false);
193195
when(mockConfGetter.getGlobalConf(eq(GlobalConfKeys.nodeAgentDescribePollDeadline)))
194196
.thenReturn(Duration.ofSeconds(5));
195197

0 commit comments

Comments
 (0)