diff --git a/docs/Makefile b/docs/Makefile index 2c08807..aa1df67 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -36,7 +36,7 @@ man8dir = $(mandir)/man8 MAN8_RST = bpftune.rst bpftune-sysctl.rst bpftune-tcp-conn.rst \ bpftune-neigh.rst bpftune-tcp-buffer.rst bpftune-netns.rst \ - bpftune-net-buffer.rst + bpftune-net-buffer.rst bpftune-ip-frag.rst _DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) diff --git a/docs/bpftune-ip-frag.rst b/docs/bpftune-ip-frag.rst new file mode 100644 index 0000000..4a37316 --- /dev/null +++ b/docs/bpftune-ip-frag.rst @@ -0,0 +1,29 @@ +=============== +BPFTUNE-IP-FRAG +=============== +------------------------------------------------------------------------------- +IP fragmentation bpftune plugin for managing fragment reassembly memory limits +------------------------------------------------------------------------------- + +:Manual section: 8 + + +DESCRIPTION +=========== + + For IPv[46] fragmentation reassembly, memory is capped at + + net.ipv[46].ip[6]frag_high_thresh + + Fragmentation reassembly can fail if this value is set too low; + monitor for fragmentation reassembly and bump value if needed. + + Avoid bumping it if reassembly failures constitute too high a + proportion of reassembly events; this may signify a DoS.
+
+        Tunables:
+
+        - net.ipv4.ipfrag_high_thresh: number of bytes devoted to
+          IPv4 fragmentation reassembly; default 4MB
+        - net.ipv6.ip6frag_high_thresh: number of bytes devoted to
+          IPv6 fragmentation reassembly; default 4MB
diff --git a/src/Makefile b/src/Makefile
index d0f12ac..448f16b 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -84,7 +84,7 @@ submake_extras := feature_display=0
 endif
 
 TUNERS = tcp_buffer_tuner route_table_tuner neigh_table_tuner sysctl_tuner \
-	 tcp_conn_tuner netns_tuner net_buffer_tuner
+	 tcp_conn_tuner netns_tuner net_buffer_tuner ip_frag_tuner
 
 TUNER_OBJS = $(patsubst %,%.o,$(TUNERS))
 TUNER_SRCS = $(patsubst %,%.c,$(TUNERS))
diff --git a/src/ip_frag_tuner.bpf.c b/src/ip_frag_tuner.bpf.c
new file mode 100644
index 0000000..8380e5c
--- /dev/null
+++ b/src/ip_frag_tuner.bpf.c
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include
+#include "ip_frag_tuner.h"
+
+/* True when reassembly failures exceed half of the successes (ratio of
+ * failure to success is > 1/2); never true with fewer than 2 successes.
+ */
+#define REASM_FAIL_THRESHOLD(success, fail)	\
+	(((success) >= 2) && ((fail) > ((success) >> 1)))
+
+/* Shared IPv4/IPv6 check: when fragment reassembly memory in @fqdir is
+ * nearly at its high threshold, send a sysctl event for @tunable proposing
+ * a larger threshold, unless reassembly failures are too frequent relative
+ * to successes in @ip_stats.  Always returns 0.
+ */
+static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
+				  struct ipstats_mib *ip_stats, int tunable)
+{
+	long mem, high_thresh;
+
+	/* validate the pointer before reading through it */
+	if (!fqdir)
+		return 0;
+	mem = BPFTUNE_CORE_READ(fqdir, mem.counter);
+	high_thresh = BPFTUNE_CORE_READ(fqdir, high_thresh);
+
+	bpftune_debug("defrag: mem %ld high thresh %ld\n",
+		      mem, high_thresh);
+	if (!mem || !high_thresh)
+		return 0;
+
+	/* FragmentSmack DoS relied on small packets overwhelming defragmentation;
+	 * do not raise limits when we see a significant number of
+	 * fragmentation reassembly failures versus successes.
+	 */
+	if (NEARLY_FULL(mem, high_thresh)) {
+		__u64 reasm_success = BPFTUNE_CORE_READ(ip_stats,
+						mibs[IPSTATS_MIB_REASMOKS]);
+		__u64 reasm_fails = BPFTUNE_CORE_READ(ip_stats,
+						mibs[IPSTATS_MIB_REASMFAILS]);
+		struct bpftune_event event = { 0 };
+		long old[3] = {};
+		long new[3] = {};
+
+		bpftune_debug("nearly full, reasm success %ld reasm fail %ld\n",
+			      reasm_success, reasm_fails);
+
+		/* too many fragmentation reassembly fails? */
+		if (REASM_FAIL_THRESHOLD(reasm_success, reasm_fails))
+			return 0;
+		old[0] = high_thresh;
+		new[0] = BPFTUNE_GROW_BY_DELTA(high_thresh);
+		send_net_sysctl_event(net, IP_FRAG_THRESHOLD_INCREASE,
+				      tunable,
+				      old, new, &event);
+	}
+	return 0;
+}
+
+/* fentry on ip_defrag(): run the shared check for IPv4 state in @net. */
+BPF_FENTRY(ip_defrag, struct net *net, struct sk_buff *skb, u32 user)
+{
+	struct fqdir *fqdir = BPFTUNE_CORE_READ(net, ipv4.fqdir);
+	struct ipstats_mib *ip_stats;
+
+	if (!fqdir)
+		return 0;
+	ip_stats = BPFTUNE_CORE_READ(net, mib.ip_statistics);
+
+	if (!ip_stats)
+		return 0;
+	return defrag(net, fqdir, ip_stats, IP_FRAG_MAX_THRESHOLD);
+}
+
+/* bit 0 of skb->_skb_refdst is masked off to recover the dst pointer */
+#define SKB_DST_NOREF	1UL
+#define SKB_DST_PTRMASK	(~(SKB_DST_NOREF))
+
+/* fentry on ipv6_frag_rcv(): find the netns via skb dst -> dev -> nd_net,
+ * then run the shared check for its IPv6 state.
+ */
+BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
+{
+	long unsigned int refdst = BPFTUNE_CORE_READ(skb, _skb_refdst);
+	struct dst_entry *dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
+	struct ipstats_mib *ipv6_stats;
+	struct net_device *dev;
+	struct fqdir *fqdir;
+	struct net *net;
+
+	if (!dst)
+		return 0;
+	dev = BPFTUNE_CORE_READ(dst, dev);
+	if (!dev)
+		return 0;
+	net = BPFTUNE_CORE_READ(dev, nd_net.net);
+	if (!net)
+		return 0;
+	fqdir = BPFTUNE_CORE_READ(net, ipv6.fqdir);
+	if (!fqdir)
+		return 0;
+	ipv6_stats = BPFTUNE_CORE_READ(net, mib.ipv6_statistics);
+	if (!ipv6_stats)
+		return 0;
+	return defrag(net, fqdir, ipv6_stats, IP6_FRAG_MAX_THRESHOLD);
+}
diff --git a/src/ip_frag_tuner.c b/src/ip_frag_tuner.c
new file mode 100644
index 0000000..b3cc8d8
--- /dev/null
+++ b/src/ip_frag_tuner.c
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2023, Oracle and/or its affiliates. */
+
+#include
+#include "ip_frag_tuner.h"
+#include "ip_frag_tuner.skel.h"
+#include "ip_frag_tuner.skel.legacy.h"
+#include "ip_frag_tuner.skel.nobtf.h"
+
+#include
+#include
+
+struct ip_frag_tuner_bpf *skel;
+
+/* sysctls this tuner manages; ids come from enum ip_frag_tunables */
+static struct bpftunable_desc descs[] = {
+{ IP_FRAG_MAX_THRESHOLD, BPFTUNABLE_SYSCTL, "net.ipv4.ipfrag_high_thresh",
+				BPFTUNABLE_NAMESPACED, 1 },
+{ IP6_FRAG_MAX_THRESHOLD, BPFTUNABLE_SYSCTL, "net.ipv6.ip6frag_high_thresh",
+				BPFTUNABLE_NAMESPACED, 1 },
+};
+
+static struct bpftunable_scenario scenarios[] = {
+{ IP_FRAG_THRESHOLD_INCREASE, "need to increase IP fragmentation high threshold",
+	"this allows additional memory to be used to accommodate more defragmentation." },
+};
+
+/* Set up the tuner: returns the error from bpftuner_bpf_init() if it
+ * fails, otherwise the result of bpftuner_tunables_init().
+ */
+int init(struct bpftuner *tuner)
+{
+	int err;
+
+	err = bpftuner_bpf_init(ip_frag, tuner, NULL);
+	if (err)
+		return err;
+	return bpftuner_tunables_init(tuner, IP_FRAG_NUM_TUNABLES, descs,
+				      ARRAY_SIZE(scenarios), scenarios);
+}
+
+/* Log the teardown and release the tuner via bpftuner_bpf_fini(). */
+void fini(struct bpftuner *tuner)
+{
+	bpftune_log(LOG_DEBUG, "calling fini for %s\n", tuner->name);
+	bpftuner_bpf_fini(tuner);
+}
+
+/* Handle an event for this tuner: write the new threshold carried in
+ * update[0] to the matching sysctl, passing the event's netns cookie on.
+ */
+void event_handler(struct bpftuner *tuner,
+		   struct bpftune_event *event,
+		   __attribute__((unused))void *ctx)
+{
+	int scenario = event->scenario_id;
+	const char *tunable;
+	long new, old;
+	int id;
+
+	/* netns cookie not supported; ignore */
+	if (event->netns_cookie == (unsigned long)-1)
+		return;
+
+	id = event->update[0].id;
+
+	memcpy(&new, event->update[0].new, sizeof(new));
+	memcpy(&old, event->update[0].old, sizeof(old));
+
+	tunable = bpftuner_tunable_name(tuner, id);
+	if (!tunable) {
+		bpftune_log(LOG_DEBUG, "unknown tunable [%d] for ip_frag_tuner\n", id);
+		return;
+	}
+	switch (id) {
+	case IP_FRAG_MAX_THRESHOLD:
+	case IP6_FRAG_MAX_THRESHOLD:
+		bpftuner_tunable_sysctl_write(tuner, id, scenario,
+					      event->netns_cookie, 1, &new,
+"Due to approaching fragmentation maximum threshold change %s from (%ld) -> (%ld)\n",
+					      tunable, old, new);
+		break;
+	default:
+		break;
+	}
+}
diff --git
a/src/ip_frag_tuner.h b/src/ip_frag_tuner.h
new file mode 100644
index 0000000..03e545a
--- /dev/null
+++ b/src/ip_frag_tuner.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include
+
+
+enum ip_frag_tunables {
+	IP_FRAG_MAX_THRESHOLD,		/* net.ipv4.ipfrag_high_thresh */
+	IP6_FRAG_MAX_THRESHOLD,		/* net.ipv6.ip6frag_high_thresh */
+	IP_FRAG_NUM_TUNABLES		/* number of tunables above */
+};
+
+enum ip_frag_scenarios {
+	IP_FRAG_THRESHOLD_INCREASE,	/* high threshold should be raised */
+};
diff --git a/test/Makefile b/test/Makefile
index 176baa4..94fc1e1 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -27,6 +27,7 @@ TUNER_TESTS = support_test log_test service_test inotify_test cap_test \
 	      sysctl_test sysctl_legacy_test sysctl_netns_test \
 	      netns_test netns_legacy_test \
 	      backlog_test backlog_legacy_test \
+	      frag_test frag_legacy_test \
 	      neigh_table_test neigh_table_v4only_test \
 	      neigh_table_legacy_test \
 	      mem_pressure_test mem_pressure_legacy_test \
diff --git a/test/frag_legacy_test.sh b/test/frag_legacy_test.sh
new file mode 100644
index 0000000..67bd5aa
--- /dev/null
+++ b/test/frag_legacy_test.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License v2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 021110-1307, USA.
+#
+
+# send frags > MTU via ping with netns with too-low high_thresh for
+# fragment memory; ensure we bump up memory limits.
+
+. ./test_lib.sh
+
+LOGFILE=$TESTLOG_LAST
+
+SLEEPTIME=1
+TIMEOUT=30
+MAX_CONN=50
+
+for FAMILY in ipv6 ipv4 ; do
+	for NS in nonglobal global; do
+		case $FAMILY in
+		ipv4)
+			if [[ $NS == "global" ]]; then
+				ADDR=$VETH2_IPV4
+			else
+				ADDR=$VETH1_IPV4
+			fi
+			SYSCTL_PREFIX=net.ipv4.ipfrag_
+			SYSCTL_NAME="${SYSCTL_PREFIX}high_thresh"
+			;;
+		ipv6)
+			if [[ $NS == "global" ]]; then
+				ADDR=$VETH2_IPV6
+			else
+				ADDR=$VETH1_IPV6
+			fi
+			SYSCTL_PREFIX=net.ipv6.ip6frag_
+			SYSCTL_NAME="${SYSCTL_PREFIX}high_thresh"
+			;;
+		esac
+
+		test_start "$0|frag legacy test to $ADDR:$PORT $FAMILY $NS"
+
+		if [[ $NS == "global" ]]; then
+			CLIENT_PREFIX="ip netns exec $NETNS"
+			CLIENT_VETH=$VETH1
+			SERVER_PREFIX=""
+			SERVER_VETH=$VETH2
+		else
+			CLIENT_PREFIX=""
+			CLIENT_VETH=$VETH2
+			SERVER_PREFIX="ip netns exec $NETNS"
+			SERVER_VETH=$VETH1
+		fi
+		test_setup true
+
+		$CLIENT_PREFIX ethtool --offload $CLIENT_VETH rx off tx off gso off gro off lro off tso off
+		$SERVER_PREFIX ethtool --offload $SERVER_VETH rx off tx off gso off gro off lro off tso off
+		frag_orig=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+		low_orig=($($SERVER_PREFIX sysctl -n ${SYSCTL_PREFIX}low_thresh))
+		$SERVER_PREFIX sysctl -w ${SYSCTL_PREFIX}low_thresh=8192
+		$SERVER_PREFIX sysctl -w $SYSCTL_NAME="8192"
+
+		frag_pre=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+
+		# prevent firewall from reassembling packets.
+		set +e
+		FIREWALLD_PID=$(pgrep firewalld)
+		set -e
+		if [[ -n "$FIREWALLD_PID" ]]; then
+			service firewalld stop
+		fi
+		for MODE in baseline test ; do
+
+			echo "Running ${MODE}..."
+			if [[ $MODE != "baseline" ]]; then
+				test_run_cmd_local "$BPFTUNE -dsL &" true
+				sleep $SETUPTIME
+			else
+				LOGSZ=$(wc -l $LOGFILE | awk '{print $1}')
+				LOGSZ=$(expr $LOGSZ + 1)
+			fi
+			set +e
+			echo "Running $CLIENT_PREFIX ping -v -c 20 -M want -s 8192 $ADDR"
+			$CLIENT_PREFIX ping -v -c 20 -M want -s 8192 $ADDR
+			set -e
+
+			if [[ $MODE != "baseline" ]]; then
+				pkill -TERM bpftune
+				sleep $SETUPTIME
+			else
+				sleep $SLEEPTIME
+			fi
+		done
+		if [[ -n "$FIREWALLD_PID" ]]; then
+			service firewalld start
+		fi
+		frag_post=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+		# server ran in the global netns; restore the thresholds lowered above
+		if [[ -z $SERVER_PREFIX ]]; then
+			sysctl -w ${SYSCTL_NAME}=$frag_orig
+			sysctl -w ${SYSCTL_PREFIX}low_thresh=$low_orig
+		fi
+		echo "$SYSCTL_NAME before ${frag_pre}"
+		echo "$SYSCTL_NAME after ${frag_post}"
+		if [[ $MODE == "test" ]]; then
+			if [[ "${frag_post}" -gt ${frag_pre} ]]; then
+				grep "approaching fragmentation maximum threshold" $LOGFILE
+				test_pass
+			else
+				test_cleanup
+			fi
+		fi
+
+		test_cleanup
+	done
+done
+
+test_exit
diff --git a/test/frag_test.sh b/test/frag_test.sh
new file mode 100644
index 0000000..1b21ce8
--- /dev/null
+++ b/test/frag_test.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License v2 as published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 021110-1307, USA.
+#
+
+# send frags > MTU via ping with netns with too-low high_thresh for
+# fragment memory; ensure we bump up memory limits.
+
+. ./test_lib.sh
+
+LOGFILE=$TESTLOG_LAST
+
+SLEEPTIME=1
+TIMEOUT=30
+MAX_CONN=50
+
+for FAMILY in ipv6 ipv4 ; do
+	for NS in nonglobal global; do
+		case $FAMILY in
+		ipv4)
+			if [[ $NS == "global" ]]; then
+				ADDR=$VETH2_IPV4
+			else
+				ADDR=$VETH1_IPV4
+			fi
+			SYSCTL_PREFIX=net.ipv4.ipfrag_
+			SYSCTL_NAME="${SYSCTL_PREFIX}high_thresh"
+			;;
+		ipv6)
+			if [[ $NS == "global" ]]; then
+				ADDR=$VETH2_IPV6
+			else
+				ADDR=$VETH1_IPV6
+			fi
+			SYSCTL_PREFIX=net.ipv6.ip6frag_
+			SYSCTL_NAME="${SYSCTL_PREFIX}high_thresh"
+			;;
+		esac
+
+		test_start "$0|frag test to $ADDR:$PORT $FAMILY $NS"
+
+		if [[ $NS == "global" ]]; then
+			CLIENT_PREFIX="ip netns exec $NETNS"
+			CLIENT_VETH=$VETH1
+			SERVER_PREFIX=""
+			SERVER_VETH=$VETH2
+		else
+			CLIENT_PREFIX=""
+			CLIENT_VETH=$VETH2
+			SERVER_PREFIX="ip netns exec $NETNS"
+			SERVER_VETH=$VETH1
+		fi
+		test_setup true
+
+		$CLIENT_PREFIX ethtool --offload $CLIENT_VETH rx off tx off gso off gro off lro off tso off
+		$SERVER_PREFIX ethtool --offload $SERVER_VETH rx off tx off gso off gro off lro off tso off
+		frag_orig=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+		low_orig=($($SERVER_PREFIX sysctl -n ${SYSCTL_PREFIX}low_thresh))
+		$SERVER_PREFIX sysctl -w ${SYSCTL_PREFIX}low_thresh=8192
+		$SERVER_PREFIX sysctl -w $SYSCTL_NAME="8192"
+
+		frag_pre=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+
+		# prevent firewall from reassembling packets.
+		set +e
+		FIREWALLD_PID=$(pgrep firewalld)
+		set -e
+		if [[ -n "$FIREWALLD_PID" ]]; then
+			service firewalld stop
+		fi
+		for MODE in baseline test ; do
+
+			echo "Running ${MODE}..."
+			if [[ $MODE != "baseline" ]]; then
+				test_run_cmd_local "$BPFTUNE -ds &" true
+				sleep $SETUPTIME
+			else
+				LOGSZ=$(wc -l $LOGFILE | awk '{print $1}')
+				LOGSZ=$(expr $LOGSZ + 1)
+			fi
+			set +e
+			echo "Running $CLIENT_PREFIX ping -v -c 20 -M want -s 8192 $ADDR"
+			$CLIENT_PREFIX ping -v -c 20 -M want -s 8192 $ADDR
+			set -e
+
+			if [[ $MODE != "baseline" ]]; then
+				pkill -TERM bpftune
+				sleep $SETUPTIME
+			else
+				sleep $SLEEPTIME
+			fi
+		done
+		if [[ -n "$FIREWALLD_PID" ]]; then
+			service firewalld start
+		fi
+		frag_post=($($SERVER_PREFIX sysctl -n $SYSCTL_NAME))
+		# server ran in the global netns; restore the thresholds lowered above
+		if [[ -z $SERVER_PREFIX ]]; then
+			sysctl -w ${SYSCTL_NAME}=$frag_orig
+			sysctl -w ${SYSCTL_PREFIX}low_thresh=$low_orig
+		fi
+		echo "$SYSCTL_NAME before ${frag_pre}"
+		echo "$SYSCTL_NAME after ${frag_post}"
+		if [[ $MODE == "test" ]]; then
+			if [[ "${frag_post}" -gt ${frag_pre} ]]; then
+				grep "approaching fragmentation maximum threshold" $LOGFILE
+				test_pass
+			else
+				test_cleanup
+			fi
+		fi
+
+		test_cleanup
+	done
+done
+
+test_exit
diff --git a/test/test_lib.sh b/test/test_lib.sh
index e0ae631..4b270f8 100644
--- a/test/test_lib.sh
+++ b/test/test_lib.sh
@@ -320,7 +320,11 @@ test_cleanup()
 {
 	trap - EXIT
 
-	test_cleanup_local $EXITCODE
+	if [[ $SKIP_CLEANUP -ne 0 ]]; then
+		echo "skipping cleanup as requested"
+	else
+		test_cleanup_local $EXITCODE
+	fi
 	if [ $EXITCODE -ne 0 ]; then
 		test_log_result
 		exit 1
@@ -333,13 +337,6 @@ test_cleanup_exit()
 	if [[ -n "$BC" ]]; then
 		echo "Last command executed: '$BC'"
 	fi
-	if [[ $SKIP_CLEANUP -ne 0 ]]; then
-		echo "skipping cleanup as requested"
-		if [ $EXITCODE -ne 0 ]; then
-			test_log_result
-		fi
-	else
-		test_cleanup
-	fi
+	test_cleanup
 	test_exit
 }