Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
tcp_conn_tuner: notice retransmit events for tcp conn, apply BBR
....if these exceed 1/32 of packets sent.  Also only apply RL techniques
for connections we repeatedly use; this saves us trying to apply
RL to cases where we do not have enough data to make good choices.

Add file download test using python3 http.server; it allows us to
show BBR kicking in for connections where loss/latency are induced
via netem.

Signed-off-by: Alan Maguire <[email protected]>
  • Loading branch information
alan-maguire committed Dec 11, 2024
commit f8fd7193c477182ab013eda16e20c73edf16adc4
9 changes: 9 additions & 0 deletions docs/bpftune-tcp-conn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ DESCRIPTION
such cases, BBR is a good fit since it continuously estimates bottleneck
bandwidth and attempts to fit the congestion algorithm to it.

When we have limited information about a remote host - i.e. we have
had fewer than REMOTE_HOST_MIN_INSTANCES connections involving it -
the only auto-selection involved is to use BBR in cases where
loss rates exceed 1/32 of the packet sent rate - in such scenarios,
BBR performs much better than other congestion control algorithms.

For cases where we connect multiple times we can try different
algorithms to select the best.

In selecting the appropriate congestion control algorithm, a
reinforcement learning-based method is used whereby we choose the
congestion control algorithm that best fits the optimal bandwidth
Expand Down
74 changes: 53 additions & 21 deletions src/tcp_conn_tuner.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,44 @@

#include "tcp_conn_tuner.h"

__u64 tcp_cong_choices[NUM_TCP_CONG_ALGS];

BPF_MAP_DEF(remote_host_map, BPF_MAP_TYPE_HASH, struct in6_addr, struct remote_host, 1024, 0);

BPF_MAP_DEF(sk_storage_map, BPF_MAP_TYPE_SK_STORAGE, int, __u64, 0, BPF_F_NO_PREALLOC);

static __always_inline struct remote_host *get_remote_host(struct in6_addr *key)
/* if we have not seen >= REMOTE_HOST_MIN_INSTANCES connections for the
* host, return NULL. This ensures we only apply RL to hosts with which
* we have multiple interactions.
*/
static __always_inline struct remote_host *get_remote_host(struct in6_addr *key,
bool initial)
{
struct remote_host *remote_host = NULL;

remote_host = bpf_map_lookup_elem(&remote_host_map, key);
if (!remote_host) {
struct remote_host new_remote_host = {};
if (!remote_host) {
struct remote_host new_remote_host = { .instances = 1};

bpf_map_update_elem(&remote_host_map, key, &new_remote_host,
BPF_ANY);
remote_host = bpf_map_lookup_elem(&remote_host_map, key);
}
return NULL;
}
/* bump for initial conn established */
if (initial)
remote_host->instances++;
if (remote_host->instances < REMOTE_HOST_MIN_INSTANCES)
return NULL;
return remote_host;
}

static __always_inline void set_cong(struct bpf_sock_ops *ops, struct remote_host *remote_host,
__u8 i)
static __always_inline void set_cong(struct bpf_sock_ops *ops, __u8 i)
{
int ret;

ret = bpf_setsockopt(ops, SOL_TCP, TCP_CONGESTION, (void *)congs[i],
sizeof(congs[i]));
tcp_cong_choices[i & (NUM_TCP_CONG_ALGS - 1)]++;
/* update state */
if (!ret) {
struct bpf_sock *sk = ops->sk;
Expand All @@ -64,22 +76,39 @@ static __always_inline void set_cong(struct bpf_sock_ops *ops, struct remote_hos
SEC("sockops")
int conn_tuner_sockops(struct bpf_sock_ops *ops)
{
int cb_flags = BPF_SOCK_OPS_STATE_CB_FLAG;
int cb_flags = BPF_SOCK_OPS_STATE_CB_FLAG|BPF_SOCK_OPS_RETRANS_CB_FLAG;
struct remote_host *remote_host;
struct bpftune_event event = {};
struct tcp_conn_event_data *event_data = (struct tcp_conn_event_data *)&event.raw_data;
struct in6_addr *key = &event_data->raddr;
struct bpf_sock *sk = ops->sk;
struct tcp_sock *tp = NULL;
__u64 *statep = NULL;
bool initial = false;
int state;

switch (ops->op) {
case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
/* enable other needed events */
bpf_sock_ops_cb_flags_set(ops, cb_flags);
initial = true;
break;
case BPF_SOCK_OPS_RETRANS_CB:
/* set individual cong algorithm to BBR if retransmit rate
* is > 1/32 of packets out.
*/
if (ops->total_retrans > (ops->segs_out >> DROP_SHIFT)) {
if (sk) {
statep = bpf_sk_storage_get(&sk_storage_map, sk,
0, 0);
}
if (!statep || *statep != TCP_STATE_CONG_BBR) {
set_cong(ops, TCP_STATE_CONG_BBR);
/* no more need for retrans events... */
bpf_sock_ops_cb_flags_set(ops, BPF_SOCK_OPS_STATE_CB_FLAG);
}
}
return 1;
case BPF_SOCK_OPS_STATE_CB:
state = ops->args[1];
switch (state) {
Expand All @@ -91,14 +120,6 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops)
default:
return 1;
}
tp = bpf_skc_to_tcp_sock(sk);
if (!tp)
return 1;

/* retrieve state indicating which cong alg was set */
statep = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0);
if (!statep)
return 1;
break;
default:
return 1;
Expand All @@ -117,7 +138,8 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops)
default:
return 1;
}
remote_host = get_remote_host(key);
remote_host = get_remote_host(key, initial);
/* no RL unless seen a number of times... */
if (!remote_host)
return 1;

Expand Down Expand Up @@ -170,20 +192,30 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops)
s = epsilon_greedy(minindex, NUM_TCP_CONN_METRICS, 20);
s &= 0x3;

set_cong(ops, remote_host, s);
set_cong(ops, s);

return 1;
}
case BPF_SOCK_OPS_STATE_CB: {
/* update metric/send metric event on connection close. */
__u64 metric, metric_old, min_rtt, rate_interval_us, rate_delivered, mss;
struct tcp_conn_metric *m;
__u8 i, s = *statep & 0x3;
struct tcp_sock *tp;
bool greedy = true;
__u8 i, s;

if (!sk)
return 1;
if (!remote_host)
return 1;
/* retrieve state indicating which cong alg was set */
statep = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0);
if (!statep)
return 1;
s = *statep & 0x3;
tp = bpf_skc_to_tcp_sock(sk);
if (!tp)
return 1;

min_rtt = (__u64)tp->rtt_min.s[0].v;
rate_interval_us = (__u64)tp->rate_interval_us;
rate_delivered = (__u64)tp->rate_delivered;
Expand Down
20 changes: 16 additions & 4 deletions src/tcp_conn_tuner.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
#include "tcp_conn_tuner.skel.nobtf.h"

static struct bpftunable_desc descs[] = {
{
TCP_CONG, BPFTUNABLE_OTHER, "TCP congestion control", 0, 0 },

{ TCP_CONG, BPFTUNABLE_OTHER, "TCP congestion control", 0, 0 },
};

static struct bpftunable_scenario scenarios[] = {
Expand Down Expand Up @@ -90,11 +90,23 @@ static void summarize_conn_choices(struct bpftuner *tuner)
struct in6_addr key, *prev_key = NULL;
int map_fd = bpf_map__fd(map);
unsigned long greedy_count = 0;

__u64 *cong_choices;
int i;

cong_choices = bpftuner_bpf_var_get(tcp_conn, tuner, tcp_cong_choices);
if (cong_choices) {
bpftune_log(BPFTUNE_LOG_LEVEL,
"Summary: tcp_conn_tuner: %20s %20s\n",
"CongAlg", "Count");
for (i = 0; i < NUM_TCP_CONG_ALGS; i++) {
bpftune_log(BPFTUNE_LOG_LEVEL,
"Summary: tcp_conn_tuner: %20s %20lu\n",
congs[i], cong_choices[i]);
}
}
while (!bpf_map_get_next_key(map_fd, prev_key, &key)) {
char buf[INET6_ADDRSTRLEN];
struct remote_host r;
int i;

prev_key = &key;

Expand Down
11 changes: 10 additions & 1 deletion src/tcp_conn_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ enum tcp_cong_tunables {
};

enum tcp_cong_scenarios {
TCP_CONG_SET
TCP_CONG_SET,
};

#define CONG_MAXNAME 16
Expand Down Expand Up @@ -67,9 +67,18 @@ struct tcp_conn_event_data {
struct remote_host {
__u64 min_rtt;
__u64 max_rate_delivered;
__u64 instances;
struct tcp_conn_metric metrics[NUM_TCP_CONN_METRICS];
};

/* collect per-conn data once we see >= REMOTE_HOST_MIN_INSTANCES connections */
#define REMOTE_HOST_MIN_INSTANCES 4

/* if total retrans/segs_out > 1/(2^DROP_SHIFT) (1/32 by default)
* apply BBR congestion control.
*/
#define DROP_SHIFT 5

#define RTT_SCALE 1000000
#define DELIVERY_SCALE 1000000

Expand Down
1 change: 1 addition & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ TUNER_TESTS = support_test log_test service_test inotify_test cap_test \
podman_globalonly_test podman_globalonly_legacy_test \
sysctl_test sysctl_legacy_test sysctl_netns_test \
netns_test netns_legacy_test \
file_download_test file_download_legacy_test \
budget_test \
backlog_test backlog_legacy_test \
frag_test frag_legacy_test \
Expand Down
128 changes: 128 additions & 0 deletions test/file_download_legacy_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/bash
#
# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# Copyright (c) 2023, Oracle and/or its affiliates.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License v2 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#

# download file via wget with various drop/latencies; bbr should
# be used when drops >= 2%
#
# For every combination of drop percentage, latency, namespace placement
# and address family, serve a file with python3 http.server and fetch it
# with wget, once without bpftune ("baseline") and once with bpftune
# running in legacy mode ("test"); then look for "bbr" in the log lines
# written since the baseline began.

. ./test_lib.sh

LOGFILE=$TESTLOG_LAST

SLEEPTIME=1
# NOTE(review): TIMEOUT and MAX_CONN are not referenced in this script;
# presumably they are consumed by helpers in test_lib.sh - confirm.
TIMEOUT=30
MAX_CONN=50

for DROP_PERCENT in 2 0; do
 for LATENCY in "" "delay 100" ; do
  for NS in nonglobal global; do
   for FAMILY in ipv4 ipv6 ; do

	# pick server address and wget/http.server arguments for the family
	case $FAMILY in
	ipv4)
		if [[ $NS == "global" ]]; then
			ADDR=$VETH2_IPV4
		else
			ADDR=$VETH1_IPV4
		fi
		WGET_ARG=""
		WGET_ADDR=$ADDR
		HTTP_BIND_ADDR=""
		;;
	ipv6)
		pyversion=$(python3 --version | awk -F '.' '{ print $2 }')
		# http.server supports IPv6 for 3.8 and later.
		if [[ $pyversion -lt 8 ]]; then
			echo "IPv6 test needs python 3.8 or later, skipping"
			continue
		fi
		if [[ $NS == "global" ]]; then
			ADDR=$VETH2_IPV6
		else
			ADDR=$VETH1_IPV6
		fi
		WGET_ARG="-6"
		# IPv6 literals must be bracketed in URLs
		WGET_ADDR="[${ADDR}]"
		HTTP_BIND_ADDR="--bind $ADDR"
		;;
	esac

	test_start "$0|file legacy test to $ADDR:$PORT $FAMILY $NS drop $DROP_PERCENT $LATENCY"

	# DROP is presumably read by test_setup (test_lib.sh) to configure
	# loss - confirm. It must be cleared for the 0% case: the outer loop
	# runs 2% first, so without the reset the "drop 0" iterations would
	# silently reuse the 2% value left over from earlier iterations.
	if [[ $DROP_PERCENT -gt 0 ]]; then
		DROP=$DROP_PERCENT
	else
		DROP=""
	fi

	# decide which side (global netns or $NETNS) hosts client and server
	if [[ $NS == "global" ]]; then
		CLIENT_PREFIX="ip netns exec $NETNS"
		CLIENT_VETH=$VETH1
		SERVER_PREFIX=""
		SERVER_VETH=$VETH2
	else
		CLIENT_PREFIX=""
		CLIENT_VETH=$VETH2
		SERVER_PREFIX="ip netns exec $NETNS"
		SERVER_VETH=$VETH1
	fi
	test_setup true
	# create the file to be served
	mkdir -p $SERVERDIR
	dd if=/dev/zero of=${SERVERFILE} bs=$SERVERFILE_SIZE count=1

	# pgrep exits non-zero when firewalld is not running; do not let
	# that abort the script.
	set +e
	FIREWALLD_PID=$(pgrep firewalld)
	set -e
	# stop firewalld for the duration of the transfer; restarted below
	if [[ -n "$FIREWALLD_PID" ]]; then
		service firewalld stop
	fi
	for MODE in baseline test ; do

		echo "Running ${MODE}..."
		if [[ $MODE != "baseline" ]]; then
			test_run_cmd_local "$BPFTUNE -sL &" true
			sleep $SETUPTIME
		else
			# remember where the log ends now so that only lines
			# added afterwards are searched for "bbr" below
			LOGSZ=$(wc -l $LOGFILE | awk '{print $1}')
			LOGSZ=$(expr $LOGSZ + 1)
		fi
		pushd $SERVERDIR
		test_run_cmd_local "$SERVER_PREFIX python3 -m http.server $HTTP_BIND_ADDR $PORT &" true
		sleep $SLEEPTIME
		$CLIENT_PREFIX wget $WGET_ARG http://${WGET_ADDR}:${PORT}/file
		popd
		# presumably wget saved its copy as "file.1" because "file"
		# already exists in the server directory - confirm
		rm -f ${SERVERFILE}.1
		if [[ $MODE != "baseline" ]]; then
			# stop bpftune, give it time to write its final log
			# lines, then check the new log lines mention bbr
			pkill -TERM bpftune
			sleep $SETUPTIME
			tail -n +${LOGSZ} $LOGFILE | grep bbr
		else
			sleep $SLEEPTIME
		fi
	done
	if [[ -n "$FIREWALLD_PID" ]]; then
		service firewalld start
	fi
	test_pass
	test_cleanup
   done
  done
 done
done

test_exit
Loading