Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ man8dir = $(mandir)/man8

MAN8_RST = bpftune.rst bpftune-sysctl.rst bpftune-tcp-conn.rst \
bpftune-neigh.rst bpftune-tcp-buffer.rst bpftune-netns.rst \
bpftune-net-buffer.rst
bpftune-net-buffer.rst bpftune-ip-frag.rst

_DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST))
DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8))
Expand Down
29 changes: 29 additions & 0 deletions docs/bpftune-ip-frag.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
===============
BPFTUNE-IP-FRAG
===============
-------------------------------------------------------------------------------
IP fragmentation bpftune plugin for managing fragment reassembly memory limits
-------------------------------------------------------------------------------

:Manual section: 8


DESCRIPTION
===========

For IPv[46] fragmentation reassembly, memory is capped at

net.ipv[46].ip[6]frag_high_thresh

Fragmentation reassembly can fail if this value is set too low;
monitor fragmentation reassembly and raise the value if needed.

Avoid bumping it if reassembly failures constitute too high a
proportion of reassembly events; this may signify a DoS.

Tunables:

- net.ipv4.ipfrag_high_thresh: number of bytes devoted to
IPv4 fragmentation reassembly; default 4MB
- net.ipv6.ip6frag_high_thresh: number of bytes devoted to
IPv6 fragmentation reassembly; default 4MB
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ submake_extras := feature_display=0
endif

TUNERS = tcp_buffer_tuner route_table_tuner neigh_table_tuner sysctl_tuner \
tcp_conn_tuner netns_tuner net_buffer_tuner
tcp_conn_tuner netns_tuner net_buffer_tuner ip_frag_tuner

TUNER_OBJS = $(patsubst %,%.o,$(TUNERS))
TUNER_SRCS = $(patsubst %,%.c,$(TUNERS))
Expand Down
105 changes: 105 additions & 0 deletions src/ip_frag_tuner.bpf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Copyright (c) 2023, Oracle and/or its affiliates.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/

#include <bpftune/bpftune.bpf.h>
#include "ip_frag_tuner.h"

/*
 * True when the ratio of reassembly failures to successes is > 1/2,
 * i.e. fail > success / 2. At least two successes are required before
 * the ratio is considered at all.
 *
 * Every argument is parenthesized so that expressions (not just plain
 * identifiers) can be passed safely. Note that @success is evaluated
 * twice, so do not pass expressions with side effects.
 */
#define REASM_FAIL_THRESHOLD(success, fail) \
	(((success) >= 2) && ((fail) > ((success) >> 1)))

/*
 * Inspect fragment reassembly memory usage and, when it is nearly
 * exhausted, send an event asking for the high threshold to be raised.
 *
 * @net:      network namespace handed on to send_net_sysctl_event()
 * @fqdir:    fragment queue directory; mem.counter and high_thresh are read
 * @ip_stats: MIB counters; the REASMOKS and REASMFAILS entries are read
 * @tunable:  tunable id reported in the event
 *
 * Always returns 0.
 */
static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
				  struct ipstats_mib *ip_stats, int tunable)
{
	long mem, high_thresh;

	/* validate the pointer before reading through it */
	if (!fqdir)
		return 0;

	mem = BPFTUNE_CORE_READ(fqdir, mem.counter);
	high_thresh = BPFTUNE_CORE_READ(fqdir, high_thresh);

	bpftune_debug("defrag: mem %ld high thresh %ld\n",
		      mem, high_thresh);
	if (!mem || !high_thresh)
		return 0;

	/* FragmentSmack DoS relied on small packets overwhelming
	 * defragmentation; do not raise the limit when the number of
	 * reassembly failures is significant relative to successes.
	 * Only the failure/success ratio is checked here; fragment
	 * sizes are not examined.
	 */
	if (NEARLY_FULL(mem, high_thresh)) {
		__u64 reasm_success = BPFTUNE_CORE_READ(ip_stats,
							mibs[IPSTATS_MIB_REASMOKS]);
		__u64 reasm_fails = BPFTUNE_CORE_READ(ip_stats,
						      mibs[IPSTATS_MIB_REASMFAILS]);
		struct bpftune_event event = { 0 };
		long old[3] = {};
		long new[3] = {};

		bpftune_debug("nearly full, reasm success %ld reasm fail %ld\n",
			      reasm_success, reasm_fails);

		/* too many fragmentation reassembly fails? */
		if (REASM_FAIL_THRESHOLD(reasm_success, reasm_fails))
			return 0;

		/* only the first slot of old/new carries a value */
		old[0] = high_thresh;
		new[0] = BPFTUNE_GROW_BY_DELTA(high_thresh);
		send_net_sysctl_event(net, IP_FRAG_THRESHOLD_INCREASE,
				      tunable,
				      old, new, &event);
	}
	return 0;
}

/*
 * Entry hook for ip_defrag(): look up the IPv4 fragment queue directory
 * and IP statistics for @net and hand them to defrag(). Returns 0 when
 * either lookup yields a NULL pointer.
 */
BPF_FENTRY(ip_defrag, struct net *net, struct sk_buff *skb, u32 user)
{
	struct ipstats_mib *stats;
	struct fqdir *frag_dir;

	frag_dir = BPFTUNE_CORE_READ(net, ipv4.fqdir);
	if (!frag_dir)
		return 0;

	stats = BPFTUNE_CORE_READ(net, mib.ip_statistics);
	if (!stats)
		return 0;

	return defrag(net, frag_dir, stats, IP_FRAG_MAX_THRESHOLD);
}

/* Bit 0 of skb->_skb_refdst is masked off before the value is used as a
 * struct dst_entry pointer (presumably mirroring the kernel's own
 * SKB_DST_NOREF/SKB_DST_PTRMASK definitions - confirm).
 */
#define SKB_DST_NOREF 1UL
#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)

/*
 * Entry hook for ipv6_frag_rcv(): only the skb is available, so walk
 * skb -> dst entry -> device -> network namespace to find the IPv6
 * fragment queue directory and statistics, then call defrag().
 * Returns 0 as soon as any pointer along the way is NULL.
 */
BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
{
	struct ipstats_mib *stats;
	struct net_device *netdev;
	struct dst_entry *entry;
	struct fqdir *frag_dir;
	long unsigned int raw_dst;
	struct net *netns;

	raw_dst = BPFTUNE_CORE_READ(skb, _skb_refdst);
	entry = (struct dst_entry *)(raw_dst & SKB_DST_PTRMASK);
	if (!entry)
		return 0;

	netdev = BPFTUNE_CORE_READ(entry, dev);
	if (!netdev)
		return 0;

	netns = BPFTUNE_CORE_READ(netdev, nd_net.net);
	if (!netns)
		return 0;

	frag_dir = BPFTUNE_CORE_READ(netns, ipv6.fqdir);
	if (!frag_dir)
		return 0;

	stats = BPFTUNE_CORE_READ(netns, mib.ipv6_statistics);
	if (!stats)
		return 0;

	return defrag(netns, frag_dir, stats, IP6_FRAG_MAX_THRESHOLD);
}
78 changes: 78 additions & 0 deletions src/ip_frag_tuner.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* Copyright (c) 2023, Oracle and/or its affiliates. */

#include <bpftune/libbpftune.h>
#include "ip_frag_tuner.h"
#include "ip_frag_tuner.skel.h"
#include "ip_frag_tuner.skel.legacy.h"
#include "ip_frag_tuner.skel.nobtf.h"

#include <unistd.h>
#include <linux/limits.h>

/* Skeleton pointer for this tuner; typed to match the ip_frag skeleton
 * included above rather than another tuner's skeleton type. It is not
 * referenced elsewhere in the visible part of this file.
 */
struct ip_frag_tuner_bpf *skel;

/*
 * Tunables handled by this tuner, one entry per id in
 * enum ip_frag_tunables: the IPv4 and IPv6 fragment reassembly
 * high-threshold sysctls, both flagged BPFTUNABLE_NAMESPACED.
 * NOTE(review): the trailing 1 is presumably the number of values the
 * sysctl holds - confirm against struct bpftunable_desc.
 */
static struct bpftunable_desc descs[] = {
{ IP_FRAG_MAX_THRESHOLD, BPFTUNABLE_SYSCTL, "net.ipv4.ipfrag_high_thresh",
BPFTUNABLE_NAMESPACED, 1 },
{ IP6_FRAG_MAX_THRESHOLD, BPFTUNABLE_SYSCTL, "net.ipv6.ip6frag_high_thresh",
BPFTUNABLE_NAMESPACED, 1 },
};

/*
 * Scenarios this tuner can report, one entry per id in
 * enum ip_frag_scenarios; each entry pairs the id with two descriptive
 * strings (a short summary followed by a longer explanation).
 */
static struct bpftunable_scenario scenarios[] = {
{ IP_FRAG_THRESHOLD_INCREASE, "need to increase IP fragmentation high threshold",
"this allows additional memory to be used to accommodate more defragmentation." },
};

/*
 * Tuner entry point: set up the ip_frag BPF programs, then register the
 * tunables and scenarios declared in this file. Returns 0 on success or
 * the first non-zero error reported by either step.
 */
int init(struct bpftuner *tuner)
{
	int err;

	err = bpftuner_bpf_init(ip_frag, tuner, NULL);
	if (!err)
		err = bpftuner_tunables_init(tuner, IP_FRAG_NUM_TUNABLES,
					     descs, ARRAY_SIZE(scenarios),
					     scenarios);
	return err;
}

/*
 * Tuner teardown: log the call at debug level, then release the BPF
 * side of the tuner via bpftuner_bpf_fini().
 */
void fini(struct bpftuner *tuner)
{
	const char *tuner_name = tuner->name;

	bpftune_log(LOG_DEBUG, "calling fini for %s\n", tuner_name);
	bpftuner_bpf_fini(tuner);
}

/*
 * Handle an event sent by the BPF side of this tuner.
 *
 * The first update slot of @event carries the tunable id together with
 * its old and new values. For either fragmentation threshold tunable the
 * new value is written to the sysctl in the namespace identified by the
 * event's netns cookie; any other id is ignored. @ctx is unused.
 */
void event_handler(struct bpftuner *tuner,
		   struct bpftune_event *event,
		   __attribute__((unused))void *ctx)
{
	int scenario_id = event->scenario_id;
	const char *name;
	long prev, next;
	int tunable_id;

	/* netns cookie not supported; ignore */
	if (event->netns_cookie == (unsigned long)-1)
		return;

	tunable_id = event->update[0].id;

	/* copy out a single long from each of the new/old value slots */
	memcpy(&next, event->update[0].new, sizeof(next));
	memcpy(&prev, event->update[0].old, sizeof(prev));

	name = bpftuner_tunable_name(tuner, tunable_id);
	if (!name) {
		bpftune_log(LOG_DEBUG, "unknown tunable [%d] for ip_frag_tuner\n", tunable_id);
		return;
	}

	/* only the two fragmentation thresholds are acted upon */
	if (tunable_id != IP_FRAG_MAX_THRESHOLD &&
	    tunable_id != IP6_FRAG_MAX_THRESHOLD)
		return;

	bpftuner_tunable_sysctl_write(tuner, tunable_id, scenario_id,
				      event->netns_cookie, 1, &next,
"Due to approaching fragmentation maximum threshold change %s from (%ld) -> (%ld)\n",
				      name, prev, next);
}
31 changes: 31 additions & 0 deletions src/ip_frag_tuner.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Copyright (c) 2023, Oracle and/or its affiliates.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/

#include <bpftune/bpftune.h>


/* Ids of the tunables managed by the IP fragmentation tuner;
 * IP_FRAG_NUM_TUNABLES is the number of tunables, not a tunable itself.
 */
enum ip_frag_tunables {
	IP_FRAG_MAX_THRESHOLD	= 0,
	IP6_FRAG_MAX_THRESHOLD	= 1,
	IP_FRAG_NUM_TUNABLES	= 2
};

/* Ids of the scenarios the IP fragmentation tuner can report. */
enum ip_frag_scenarios {
	IP_FRAG_THRESHOLD_INCREASE = 0,
};
1 change: 1 addition & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ TUNER_TESTS = support_test log_test service_test inotify_test cap_test \
sysctl_test sysctl_legacy_test sysctl_netns_test \
netns_test netns_legacy_test \
backlog_test backlog_legacy_test \
frag_test frag_legacy_test \
neigh_table_test neigh_table_v4only_test \
neigh_table_legacy_test \
mem_pressure_test mem_pressure_legacy_test \
Expand Down
Loading