diff --git a/docs/bpftune-tcp-conn.rst b/docs/bpftune-tcp-conn.rst index a3ddb15..b9ad970 100644 --- a/docs/bpftune-tcp-conn.rst +++ b/docs/bpftune-tcp-conn.rst @@ -19,6 +19,15 @@ DESCRIPTION such cases, BBR is a good fit since it continuously estimates bottleneck bandwidth and attempts to fit the congestion algorithm to it. + When we have limited information about a remote host - i.e. we have + not had >= REMOTE_HOST_MIN_INSTANCES connections involving it, + the only auto-selection involved is to use BBR in cases where + loss rates exceed 1/32 of the packet sent rate - in such scenarios, + BBR performs much better than other congestion control algorithms. + + For cases where we connect multiple times we can try different + algorithms to select the best. + In selecting the appropriate congestion control algorithm, a reinforcement reinforcement learning-based method is used whereby we choose the congestion control algorithm that best fits the optimal bandwidth diff --git a/src/tcp_conn_tuner.bpf.c b/src/tcp_conn_tuner.bpf.c index 3dc8974..be29d6e 100644 --- a/src/tcp_conn_tuner.bpf.c +++ b/src/tcp_conn_tuner.bpf.c @@ -21,32 +21,44 @@ #include "tcp_conn_tuner.h" +__u64 tcp_cong_choices[NUM_TCP_CONG_ALGS]; + BPF_MAP_DEF(remote_host_map, BPF_MAP_TYPE_HASH, struct in6_addr, struct remote_host, 1024, 0); BPF_MAP_DEF(sk_storage_map, BPF_MAP_TYPE_SK_STORAGE, int, __u64, 0, BPF_F_NO_PREALLOC); -static __always_inline struct remote_host *get_remote_host(struct in6_addr *key) +/* if we have not looked up the host >= REMOTE_HOST_MIN_INSTANCES times, return NULL. + * This ensures we only apply RL to hosts with which we have multiple + * interactions. 
+ */ +static __always_inline struct remote_host *get_remote_host(struct in6_addr *key, + bool initial) { struct remote_host *remote_host = NULL; remote_host = bpf_map_lookup_elem(&remote_host_map, key); - if (!remote_host) { - struct remote_host new_remote_host = {}; + if (!remote_host) { + struct remote_host new_remote_host = { .instances = 1}; bpf_map_update_elem(&remote_host_map, key, &new_remote_host, BPF_ANY); - remote_host = bpf_map_lookup_elem(&remote_host_map, key); - } + return NULL; + } + /* bump for initial conn established */ + if (initial) + remote_host->instances++; + if (remote_host->instances < REMOTE_HOST_MIN_INSTANCES) + return NULL; return remote_host; } -static __always_inline void set_cong(struct bpf_sock_ops *ops, struct remote_host *remote_host, - __u8 i) +static __always_inline void set_cong(struct bpf_sock_ops *ops, __u8 i) { int ret; ret = bpf_setsockopt(ops, SOL_TCP, TCP_CONGESTION, (void *)congs[i], sizeof(congs[i])); + tcp_cong_choices[i & (NUM_TCP_CONG_ALGS - 1)]++; /* update state */ if (!ret) { struct bpf_sock *sk = ops->sk; @@ -64,14 +76,14 @@ static __always_inline void set_cong(struct bpf_sock_ops *ops, struct remote_hos SEC("sockops") int conn_tuner_sockops(struct bpf_sock_ops *ops) { - int cb_flags = BPF_SOCK_OPS_STATE_CB_FLAG; + int cb_flags = BPF_SOCK_OPS_STATE_CB_FLAG|BPF_SOCK_OPS_RETRANS_CB_FLAG; struct remote_host *remote_host; struct bpftune_event event = {}; struct tcp_conn_event_data *event_data = (struct tcp_conn_event_data *)&event.raw_data; struct in6_addr *key = &event_data->raddr; struct bpf_sock *sk = ops->sk; - struct tcp_sock *tp = NULL; __u64 *statep = NULL; + bool initial = false; int state; switch (ops->op) { @@ -79,7 +91,24 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops) case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: /* enable other needed events */ bpf_sock_ops_cb_flags_set(ops, cb_flags); + initial = true; break; + case BPF_SOCK_OPS_RETRANS_CB: + /* set individual cong algorithm to BBR if retransmit rate + * 
is > 1/32 of packets out. + */ + if (ops->total_retrans > (ops->segs_out >> DROP_SHIFT)) { + if (sk) { + statep = bpf_sk_storage_get(&sk_storage_map, sk, + 0, 0); + } + if (!statep || *statep != TCP_STATE_CONG_BBR) { + set_cong(ops, TCP_STATE_CONG_BBR); + /* no more need for retrans events... */ + bpf_sock_ops_cb_flags_set(ops, BPF_SOCK_OPS_STATE_CB_FLAG); + } + } + return 1; case BPF_SOCK_OPS_STATE_CB: state = ops->args[1]; switch (state) { @@ -91,14 +120,6 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops) default: return 1; } - tp = bpf_skc_to_tcp_sock(sk); - if (!tp) - return 1; - - /* retrieve state indicating which cong alg was set */ - statep = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); - if (!statep) - return 1; break; default: return 1; @@ -117,7 +138,8 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops) default: return 1; } - remote_host = get_remote_host(key); + remote_host = get_remote_host(key, initial); + /* no RL unless seen a number of times... */ if (!remote_host) return 1; @@ -170,7 +192,7 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops) s = epsilon_greedy(minindex, NUM_TCP_CONN_METRICS, 20); s &= 0x3; - set_cong(ops, remote_host, s); + set_cong(ops, s); return 1; } @@ -178,12 +200,22 @@ int conn_tuner_sockops(struct bpf_sock_ops *ops) /* update metric/send metric event on connection close. 
*/ __u64 metric, metric_old, min_rtt, rate_interval_us, rate_delivered, mss; struct tcp_conn_metric *m; - __u8 i, s = *statep & 0x3; + struct tcp_sock *tp; bool greedy = true; + __u8 i, s; + if (!sk) + return 1; + if (!remote_host) + return 1; + /* retrieve state indicating which cong alg was set */ + statep = bpf_sk_storage_get(&sk_storage_map, sk, 0, 0); + if (!statep) + return 1; + s = *statep & 0x3; + tp = bpf_skc_to_tcp_sock(sk); if (!tp) return 1; - min_rtt = (__u64)tp->rtt_min.s[0].v; rate_interval_us = (__u64)tp->rate_interval_us; rate_delivered = (__u64)tp->rate_delivered; diff --git a/src/tcp_conn_tuner.c b/src/tcp_conn_tuner.c index 0eba2f7..9aae83b 100644 --- a/src/tcp_conn_tuner.c +++ b/src/tcp_conn_tuner.c @@ -33,8 +33,8 @@ #include "tcp_conn_tuner.skel.nobtf.h" static struct bpftunable_desc descs[] = { -{ - TCP_CONG, BPFTUNABLE_OTHER, "TCP congestion control", 0, 0 }, + +{ TCP_CONG, BPFTUNABLE_OTHER, "TCP congestion control", 0, 0 }, }; static struct bpftunable_scenario scenarios[] = { @@ -90,11 +90,23 @@ static void summarize_conn_choices(struct bpftuner *tuner) struct in6_addr key, *prev_key = NULL; int map_fd = bpf_map__fd(map); unsigned long greedy_count = 0; - + __u64 *cong_choices; + int i; + + cong_choices = bpftuner_bpf_var_get(tcp_conn, tuner, tcp_cong_choices); + if (cong_choices) { + bpftune_log(BPFTUNE_LOG_LEVEL, + "Summary: tcp_conn_tuner: %20s %20s\n", + "CongAlg", "Count"); + for (i = 0; i < NUM_TCP_CONG_ALGS; i++) { + bpftune_log(BPFTUNE_LOG_LEVEL, + "Summary: tcp_conn_tuner: %20s %20lu\n", + congs[i], cong_choices[i]); + } + } while (!bpf_map_get_next_key(map_fd, prev_key, &key)) { char buf[INET6_ADDRSTRLEN]; struct remote_host r; - int i; prev_key = &key; diff --git a/src/tcp_conn_tuner.h b/src/tcp_conn_tuner.h index b2e7857..015d0ba 100644 --- a/src/tcp_conn_tuner.h +++ b/src/tcp_conn_tuner.h @@ -24,7 +24,7 @@ enum tcp_cong_tunables { }; enum tcp_cong_scenarios { - TCP_CONG_SET + TCP_CONG_SET, }; #define CONG_MAXNAME 16 @@ -67,9 
+67,18 @@ struct tcp_conn_event_data { struct remote_host { __u64 min_rtt; __u64 max_rate_delivered; + __u64 instances; struct tcp_conn_metric metrics[NUM_TCP_CONN_METRICS]; }; +/* collect per-conn data once we see >= REMOTE_HOST_MIN_INSTANCES */ +#define REMOTE_HOST_MIN_INSTANCES 4 + +/* if total retrans/segs_out > 1/(2^DROP_SHIFT) (1/32 by default) + * apply BBR congestion control. + */ +#define DROP_SHIFT 5 + #define RTT_SCALE 1000000 #define DELIVERY_SCALE 1000000 diff --git a/test/Makefile b/test/Makefile index dc56d32..bce33b7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -28,6 +28,7 @@ TUNER_TESTS = support_test log_test service_test inotify_test cap_test \ podman_globalonly_test podman_globalonly_legacy_test \ sysctl_test sysctl_legacy_test sysctl_netns_test \ netns_test netns_legacy_test \ + file_download_test file_download_legacy_test \ budget_test \ backlog_test backlog_legacy_test \ frag_test frag_legacy_test \ diff --git a/test/file_download_legacy_test.sh b/test/file_download_legacy_test.sh new file mode 100644 index 0000000..81e3db3 --- /dev/null +++ b/test/file_download_legacy_test.sh @@ -0,0 +1,128 @@ +#!/usr/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# Copyright (c) 2023, Oracle and/or its affiliates. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public +# License v2 as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public +# License along with this program; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 021110-1307, USA. 
+# + +# download file via wget with various drop/latencies; bbr should +# be used when drops >= 2% + +. ./test_lib.sh + +LOGFILE=$TESTLOG_LAST + +SLEEPTIME=1 +TIMEOUT=30 +MAX_CONN=50 + +for DROP_PERCENT in 2 0; do +for LATENCY in "" "delay 100" ; do +for NS in nonglobal global; do + for FAMILY in ipv4 ipv6 ; do + + case $FAMILY in + ipv4) + if [[ $NS == "global" ]]; then + ADDR=$VETH2_IPV4 + else + ADDR=$VETH1_IPV4 + fi + WGET_ARG="" + WGET_ADDR=$ADDR + HTTP_BIND_ADDR="" + ;; + ipv6) + pyversion=$(python3 --version | awk -F '.' '{ print $2 }') + # http.server supports IPv6 for 3.8 and later. + if [[ $pyversion -lt 8 ]]; then + echo "IPv6 test needs python 3.8 or later, skipping" + continue + fi + if [[ $NS == "global" ]]; then + ADDR=$VETH2_IPV6 + else + ADDR=$VETH1_IPV6 + fi + WGET_ARG="-6" + WGET_ADDR="[${ADDR}]" + HTTP_BIND_ADDR="--bind $ADDR" + ;; + esac + + test_start "$0|file legacy test to $ADDR:$PORT $FAMILY $NS drop $DROP_PERCENT $LATENCY" + + if [[ $DROP_PERCENT -gt 0 ]]; then + DROP=$DROP_PERCENT + fi + + if [[ $NS == "global" ]]; then + CLIENT_PREFIX="ip netns exec $NETNS" + CLIENT_VETH=$VETH1 + SERVER_PREFIX="" + SERVER_VETH=$VETH2 + else + CLIENT_PREFIX="" + CLIENT_VETH=$VETH2 + SERVER_PREFIX="ip netns exec $NETNS" + SERVER_VETH=$VETH1 + fi + test_setup true + mkdir -p $SERVERDIR + dd if=/dev/zero of=${SERVERFILE} bs=$SERVERFILE_SIZE count=1 + + set +e + FIREWALLD_PID=$(pgrep firewalld) + set -e + if [[ -n "$FIREWALLD_PID" ]]; then + service firewalld stop + fi + for MODE in baseline test ; do + + echo "Running ${MODE}..." 
+ if [[ $MODE != "baseline" ]]; then + test_run_cmd_local "$BPFTUNE -sL &" true + sleep $SETUPTIME + else + LOGSZ=$(wc -l $LOGFILE | awk '{print $1}') + LOGSZ=$(expr $LOGSZ + 1) + fi + pushd $SERVERDIR + test_run_cmd_local "$SERVER_PREFIX python3 -m http.server $HTTP_BIND_ADDR $PORT &" true + sleep $SLEEPTIME + $CLIENT_PREFIX wget $WGET_ARG http://${WGET_ADDR}:${PORT}/file + popd + rm -f ${SERVERFILE}.1 + if [[ $MODE != "baseline" ]]; then + pkill -TERM bpftune + sleep $SETUPTIME + tail -n +${LOGSZ} $LOGFILE | grep bbr + else + sleep $SLEEPTIME + fi + done + if [[ -n "$FIREWALLD_PID" ]]; then + service firewalld start + fi + test_pass + test_cleanup + done +done +done +done + +test_exit diff --git a/test/file_download_test.sh b/test/file_download_test.sh new file mode 100644 index 0000000..ca33a04 --- /dev/null +++ b/test/file_download_test.sh @@ -0,0 +1,128 @@ +#!/usr/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# Copyright (c) 2023, Oracle and/or its affiliates. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public +# License v2 as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public +# License along with this program; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 021110-1307, USA. +# + +# download file via wget with various drop/latencies; bbr should +# be used when drops >= 2% + +. 
./test_lib.sh + +LOGFILE=$TESTLOG_LAST + +SLEEPTIME=1 +TIMEOUT=30 +MAX_CONN=50 + +for DROP_PERCENT in 2 0; do +for LATENCY in "" "delay 100" ; do +for NS in nonglobal global; do + for FAMILY in ipv4 ipv6 ; do + + case $FAMILY in + ipv4) + if [[ $NS == "global" ]]; then + ADDR=$VETH2_IPV4 + else + ADDR=$VETH1_IPV4 + fi + WGET_ARG="" + WGET_ADDR=$ADDR + HTTP_BIND_ADDR="" + ;; + ipv6) + pyversion=$(python3 --version | awk -F '.' '{ print $2 }') + # http.server supports IPv6 for 3.8 and later. + if [[ $pyversion -lt 8 ]]; then + echo "IPv6 test needs python 3.8 or later, skipping" + continue + fi + if [[ $NS == "global" ]]; then + ADDR=$VETH2_IPV6 + else + ADDR=$VETH1_IPV6 + fi + WGET_ARG="-6" + WGET_ADDR="[${ADDR}]" + HTTP_BIND_ADDR="--bind $ADDR" + ;; + esac + + test_start "$0|file test to $ADDR:$PORT $FAMILY $NS drop $DROP_PERCENT $LATENCY" + + if [[ $DROP_PERCENT -gt 0 ]]; then + DROP=$DROP_PERCENT + fi + + if [[ $NS == "global" ]]; then + CLIENT_PREFIX="ip netns exec $NETNS" + CLIENT_VETH=$VETH1 + SERVER_PREFIX="" + SERVER_VETH=$VETH2 + else + CLIENT_PREFIX="" + CLIENT_VETH=$VETH2 + SERVER_PREFIX="ip netns exec $NETNS" + SERVER_VETH=$VETH1 + fi + test_setup true + mkdir -p $SERVERDIR + dd if=/dev/zero of=${SERVERFILE} bs=$SERVERFILE_SIZE count=1 + + set +e + FIREWALLD_PID=$(pgrep firewalld) + set -e + if [[ -n "$FIREWALLD_PID" ]]; then + service firewalld stop + fi + for MODE in baseline test ; do + + echo "Running ${MODE}..." 
+ if [[ $MODE != "baseline" ]]; then + test_run_cmd_local "$BPFTUNE -s &" true + sleep $SETUPTIME + else + LOGSZ=$(wc -l $LOGFILE | awk '{print $1}') + LOGSZ=$(expr $LOGSZ + 1) + fi + pushd $SERVERDIR + test_run_cmd_local "$SERVER_PREFIX python3 -m http.server $HTTP_BIND_ADDR $PORT &" true + sleep $SLEEPTIME + $CLIENT_PREFIX wget $WGET_ARG http://${WGET_ADDR}:${PORT}/file + popd + rm -f ${SERVERFILE}.1 + if [[ $MODE != "baseline" ]]; then + pkill -TERM bpftune + sleep $SETUPTIME + tail -n +${LOGSZ} $LOGFILE | grep bbr + else + sleep $SLEEPTIME + fi + done + if [[ -n "$FIREWALLD_PID" ]]; then + service firewalld start + fi + test_pass + test_cleanup + done +done +done +done + +test_exit diff --git a/test/test_lib.sh b/test/test_lib.sh index 8ecfa2b..453ad3d 100644 --- a/test/test_lib.sh +++ b/test/test_lib.sh @@ -70,6 +70,10 @@ export LOGFILE=$SYSLOGFILE export BPFTUNE_LEGACY=${BPFTUNE_LEGACY:-0} export BPFTUNE_NETNS=${BPFTUNE_NETNS:-1} +export SERVERDIR=${TESTDIR}/https +export SERVERFILE=${SERVERDIR}/file +export SERVERFILE_SIZE=500M + export B=$(tput -Tvt100 bold) export N=$(tput -Tvt100 sgr0) @@ -293,8 +297,8 @@ test_cleanup_local() ip link del $VETH2 2>/dev/null ip link del bpftunelocal 2>/dev/null sysctl -w net.ipv6.conf.all.disable_ipv6=0 + rm -fr $SERVERDIR set -e - if [[ ! -f /usr/lib64/bpftune/tcp_buffer_tuner.so ]]; then mv /tmp/tcp_buffer_tuner.so /usr/lib64/bpftune fi