Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
ip frag tuner improvements
correlate increases in ip frag memory with fragmentation error rate.
if increasing memory is not helping - just allowing us to defrag more
bad packets - we should see a correlation between fragmentation memory
increase and fragmentation error rate. If this is observed, reduce
fragmentation memory.  Helps protect against bad actors that send
bad frags and could bloat frag memory.

Add functions to read snmp stats from /proc/net/snmp[6] ; needed
to compute frag rate.  Also allow computation of correlation in
userspace.

Signed-off-by: Alan Maguire <[email protected]>
  • Loading branch information
alan-maguire committed Nov 26, 2024
commit 6c427cab9c9ba6935a4a2af4d25f5ae125a49deb
6 changes: 4 additions & 2 deletions docs/bpftune-ip-frag.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ DESCRIPTION
Fragmentation reassembly can fail if this value is set too low;
monitor for fragmentation reassembly and bump value if needed.

Avoid bumping it if assembly faiures constitute too high a
proportion of reassembly events; this may signify a DoS.
Avoid bumping it if assembly failures are correlated with
increases in frag_high_thresh; this suggests that increasing
available memory does not help. While correlation is high,
tune down the frag_high_thresh value.

Tunables:

Expand Down
2 changes: 2 additions & 0 deletions include/bpftune/bpftune.bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ BPF_RINGBUF(ring_buffer_map, 128 * 1024);

BPF_MAP_DEF(netns_map, BPF_MAP_TYPE_HASH, __u64, __u64, 65536, 0);

BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0);

unsigned int tuner_id;
unsigned int strategy_id;
unsigned int bpftune_pid;
Expand Down
16 changes: 15 additions & 1 deletion include/bpftune/corr.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
#define CORR_MIN_SAMPLES 10

/* threshold at which we determine correlation is significant */
#define CORR_THRESHOLD ((long double)0.7)
#define CORR_THRESHOLD ((long double)0.75)
#define CORR_HIGH_THRESHOLD ((long double)0.9)

/* correlate tunables via id + netns cookie */
struct corr_key {
Expand Down Expand Up @@ -114,6 +115,19 @@ static inline long double corr_compute(struct corr *c)
return 0;
return cov/(sqrtl(var_x)*sqrtl(var_y));
}

/* Userspace-side correlation update: fold one (x, y) sample into the
 * correlation state stored in the shared corr map for (id, netns_cookie).
 *
 * @map: fd of the corr BPF hash map (struct corr_key -> struct corr).
 * @id: tunable id half of the key.
 * @netns_cookie: network-namespace cookie half of the key.
 * @x: independent-variable sample (e.g. tunable value).
 * @y: dependent-variable sample (e.g. observed failure rate).
 *
 * Returns 0 on success, negative error from bpf_map_update_elem() otherwise.
 */
static inline int corr_update_user(int map, __u64 id,
				   __u64 netns_cookie,
				   __u64 x, __u64 y)
{
	struct corr_key key = { .id = id, .netns_cookie = netns_cookie };
	struct corr corr = {};

	/* lookup failure is deliberately ignored: for a first sample the
	 * zero-initialized corr above is the correct starting state.
	 */
	bpf_map_lookup_elem(map, &key, &corr);
	corr_update(&corr, x, y);
	return bpf_map_update_elem(map, &key, &corr, 0);
}

#endif /* __KERNEL__ */

#endif /* _CORR_H */
4 changes: 3 additions & 1 deletion include/bpftune/libbpftune.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <string.h>
#include <syslog.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
Expand Down Expand Up @@ -165,6 +166,7 @@ void bpftuner_tunables_fini(struct bpftuner *tuner);
tuner->obj = __skel->obj; \
tuner->ring_buffer_map = __skel->maps.ring_buffer_map; \
tuner->netns_map = __skel->maps.netns_map; \
tuner->corr_map = __skel->maps.corr_map; \
} while (0)

#define bpftuner_bpf_open(tuner_name, tuner) ({ \
Expand Down Expand Up @@ -289,7 +291,7 @@ void bpftune_ring_buffer_fini(void *ring_buffer);
void bpftune_sysctl_name_to_path(const char *name, char *path, size_t path_sz);
int bpftune_sysctl_read(int netns_fd, const char *name, long *values);
int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *values);

int bpftune_snmpstat_read(unsigned long netns_cookie, int family, const char *name, long *value);
bool bpftune_netns_cookie_supported(void);
int bpftune_netns_set(int fd, int *orig_fd, bool quiet);
int bpftune_netns_info(int pid, int *fd, unsigned long *cookie);
Expand Down
17 changes: 11 additions & 6 deletions src/ip_frag_tuner.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@

#include <bpftune/bpftune.bpf.h>
#include "ip_frag_tuner.h"
#include <bpftune/corr.h>

static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
int tunable)
struct ipstats_mib *mib, int tunable)
{
long mem = BPFTUNE_CORE_READ(fqdir, mem.counter);
long high_thresh = BPFTUNE_CORE_READ(fqdir, high_thresh);
Expand All @@ -31,9 +32,8 @@ static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
if (!fqdir || !mem || !high_thresh)
return 0;

/* FragmentSmack DoS relied on small packets overwhelming defragmentation;
* do not raise limits when we see small fragments and a significant
* number of fragmentation reassembly failures versus successes.
/* do not raise limits when we see a correlation between raised fragment
* threshold and fragmentation failures; this suggests DoS
*/
if (NEARLY_FULL(mem, high_thresh)) {
struct bpftune_event event = { 0 };
Expand All @@ -52,10 +52,11 @@ static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
/* fentry probe on IPv4 defragmentation entry point; feeds the shared
 * defrag() tuner logic with the netns fqdir and IPv4 SNMP mib stats.
 */
BPF_FENTRY(ip_defrag, struct net *net, struct sk_buff *skb, u32 user)
{
	struct fqdir *fqdir = BPFTUNE_CORE_READ(net, ipv4.fqdir);
	struct ipstats_mib *mib = BPFTUNE_CORE_READ(net, mib.ip_statistics);

	if (!fqdir)
		return 0;
	/* match the IPv6 handler (ipv6_frag_rcv): bail out if stats are
	 * unavailable rather than passing a NULL mib to defrag().
	 */
	if (!mib)
		return 0;
	return defrag(net, fqdir, mib, IP_FRAG_MAX_THRESHOLD);
}

#define SKB_DST_NOREF 1UL
Expand All @@ -64,6 +65,7 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
{
long unsigned int refdst = BPFTUNE_CORE_READ(skb, _skb_refdst);
struct dst_entry *dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
struct ipstats_mib *mib;
struct net_device *dev;
struct fqdir *fqdir;
struct net *net;
Expand All @@ -79,5 +81,8 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
fqdir = BPFTUNE_CORE_READ(net, ipv6.fqdir);
if (!fqdir)
return 0;
return defrag(net, fqdir, IP6_FRAG_MAX_THRESHOLD);
mib = BPFTUNE_CORE_READ(net, mib.ipv6_statistics);
if (!mib)
return 0;
return defrag(net, fqdir, mib, IP6_FRAG_MAX_THRESHOLD);
}
38 changes: 36 additions & 2 deletions src/ip_frag_tuner.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
/* Copyright (c) 2023, Oracle and/or its affiliates. */

#include <bpftune/libbpftune.h>
#include <bpftune/corr.h>

#include "ip_frag_tuner.h"
#include "ip_frag_tuner.skel.h"
#include "ip_frag_tuner.skel.legacy.h"
Expand All @@ -22,6 +24,8 @@ static struct bpftunable_desc descs[] = {
static struct bpftunable_scenario scenarios[] = {
{ IP_FRAG_THRESHOLD_INCREASE, "need to increase IP fragmentation high threshold",
"this allows additional memory to be used to accommodate more defragmentation." },
{ IP_FRAG_THRESHOLD_DECREASE, "need to decrease IP fragmentation high threshold",
"as we increased fragmentation high threshold we saw a correlation in reassembly failures; this indicates that we received more invalid fragments as we added memory to process them. As such, further increases are likely to be ineffective so reduce high threshold." },
};

int init(struct bpftuner *tuner)
Expand All @@ -45,10 +49,13 @@ void event_handler(struct bpftuner *tuner,
struct bpftune_event *event,
__attribute__((unused))void *ctx)
{
long new, old, reasmfails, reasmreqds, reasm_failrate;
int scenario = event->scenario_id;
struct corr c = { 0 };
long double corr = 0;
struct corr_key key;
const char *tunable;
long new, old;
int id;
int id, af;

/* netns cookie not supported; ignore */
if (event->netns_cookie == (unsigned long)-1)
Expand All @@ -64,6 +71,33 @@ void event_handler(struct bpftuner *tuner,
bpftune_log(LOG_DEBUG, "unknown tunable [%d] for ip_frag_tuner\n", id);
return;
}
key.id = (__u64)id;
key.netns_cookie = event->netns_cookie;

af = id == IP_FRAG_MAX_THRESHOLD ? AF_INET : AF_INET6;
if (!bpftune_snmpstat_read(event->netns_cookie, af,
"ReasmFails", &reasmfails) &&
!bpftune_snmpstat_read(event->netns_cookie, af,
"ReasmReqds", &reasmreqds)) {
/* % of reasm fails */
reasm_failrate = (reasmfails * 100)/reasmreqds;
bpftune_log(LOG_DEBUG, "got %ld reasmfails, %ld reasmreqds, %ld reasm fail rate (% of reasm failures)\n",
reasmfails, reasmreqds, reasm_failrate);
if (corr_update_user(tuner->corr_map_fd, key.id, key.netns_cookie,
(__u64)new, (__u64)reasm_failrate)) {
bpftune_log(LOG_DEBUG, "corr map fd %d xxx update failed %d\n", tuner->corr_map_fd, errno);
}
}
if (!bpf_map_lookup_elem(tuner->corr_map_fd, &key, &c)) {
corr = corr_compute(&c);
bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF ; corr %LF\n",
tunable, key.netns_cookie, new, covar_compute(&c), corr);
if (corr > CORR_HIGH_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
scenario = IP_FRAG_THRESHOLD_DECREASE;
new = BPFTUNE_SHRINK_BY_DELTA(old);
}
}

switch (id) {
case IP_FRAG_MAX_THRESHOLD:
case IP6_FRAG_MAX_THRESHOLD:
Expand Down
1 change: 1 addition & 0 deletions src/ip_frag_tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@ enum ip_frag_tunables {

enum ip_frag_scenarios {
IP_FRAG_THRESHOLD_INCREASE,
IP_FRAG_THRESHOLD_DECREASE,
};
107 changes: 101 additions & 6 deletions src/libbpftune.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ int bpftune_loglevel = BPFTUNE_LOG_LEVEL;
struct ring_buffer *ring_buffer;
int ring_buffer_map_fd;
int netns_map_fd;
int corr_map_fd;

int bpftune_log_level(void)
{
Expand Down Expand Up @@ -564,7 +565,10 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals)
if (bpftuner_map_reuse("ring_buffer", tuner->ring_buffer_map,
ring_buffer_map_fd, &tuner->ring_buffer_map_fd) ||
bpftuner_map_reuse("netns_map", tuner->netns_map,
netns_map_fd, &tuner->netns_map_fd)) {
netns_map_fd, &tuner->netns_map_fd) ||
bpftuner_map_reuse("corr_map", tuner->corr_map,
corr_map_fd, &tuner->corr_map_fd)) {
bpftune_log(LOG_DEBUG, "got here!!\n");
err = -1;
goto out;
}
Expand Down Expand Up @@ -604,6 +608,8 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals)
&ring_buffer_map_fd, &tuner->ring_buffer_map_fd);
bpftuner_map_init(tuner, "netns_map", &tuner->netns_map,
&netns_map_fd, &tuner->netns_map_fd);
bpftuner_map_init(tuner, "corr_map", &tuner->corr_map,
&corr_map_fd, &tuner->corr_map_fd);
out:
bpftune_cap_drop();
return err;
Expand All @@ -621,6 +627,7 @@ int __bpftuner_bpf_attach(struct bpftuner *tuner)
bpftune_log_bpf_err(err, "could not attach skeleton: %s\n");
} else {
tuner->ring_buffer_map_fd = bpf_map__fd(tuner->ring_buffer_map);
tuner->corr_map_fd = bpf_map__fd(tuner->corr_map);
}
bpftune_cap_drop();
return err;
Expand All @@ -639,7 +646,9 @@ void bpftuner_bpf_fini(struct bpftuner *tuner)
close(ring_buffer_map_fd);
if (netns_map_fd > 0)
close(netns_map_fd);
ring_buffer_map_fd = netns_map_fd = 0;
if (corr_map_fd > 0)
close(corr_map_fd);
ring_buffer_map_fd = netns_map_fd = corr_map_fd = 0;
}
bpftune_cap_drop();
}
Expand Down Expand Up @@ -967,6 +976,89 @@ int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *
return err;
}

/**
 * bpftune_snmpstat_read - read a named SNMP statistic for a network namespace.
 * @netns_cookie: cookie identifying the target netns.
 * @family: AF_INET (/proc/net/snmp) or AF_INET6 (/proc/net/snmp6).
 * @name: statistic name to look for (e.g. "ReasmFails"); for IPv6 the
 *        match is by substring since names are prefixed (e.g. "Ip6ReasmFails").
 * @value: out parameter; written only when the statistic is found.
 *
 * Temporarily enters the target netns to read the proc file, then restores
 * the original netns and drops the capabilities added for the operation.
 *
 * /proc/net/snmp6 uses one "Name value" pair per line; /proc/net/snmp uses
 * a header line of names followed by a line of values at matching indices.
 *
 * Return: 0 on success; -EINVAL for an unsupported family or unresolvable
 * cookie; -ENOENT if the statistic was not found; other negative errno on
 * open/netns-switch failure.
 */
int bpftune_snmpstat_read(unsigned long netns_cookie, int family,
			  const char *name, long *value)
{
	int err, netns_fd = 0, orig_netns_fd = 0, stat_index = 0;
	int found = 0;
	const char *file;
	char line[1024];
	FILE *fp = NULL;

	switch (family) {
	case AF_INET:
		file = "/proc/net/snmp";
		break;
	case AF_INET6:
		file = "/proc/net/snmp6";
		break;
	default:
		return -EINVAL;
	}
	err = bpftune_cap_add();
	if (err)
		return err;
	netns_fd = bpftuner_netns_fd_from_cookie(NULL, netns_cookie);
	if (netns_fd < 0) {
		bpftune_log(LOG_DEBUG, "could not get netns fd for cookie %lu\n",
			    netns_cookie);
		/* must go through cleanup so the capability added above is
		 * dropped; previously this path returned directly and leaked it.
		 */
		err = -EINVAL;
		goto out_unset;
	}
	err = bpftune_netns_set(netns_fd, &orig_netns_fd, false);
	if (err < 0)
		goto out_unset;
	err = 0;
	fp = fopen(file, "r");
	if (!fp) {
		err = -errno;
		goto out;
	}
	while (fgets(line, sizeof(line) - 1, fp) != NULL) {
		char *next, *s, *saveptr = NULL;
		int index = 0;

		/* for IPv6 it is a "key value" format per line; for
		 * IPv4 it is a set of parameter names on one line
		 * followed by the values on the next.
		 */
		if (family == AF_INET6) {
			char nextname[128];
			long v;

			/* bound %s to the buffer; parse into a local so
			 * *value is not clobbered by non-matching lines.
			 */
			if (sscanf(line, "%127s %ld", nextname, &v) != 2)
				continue;
			/* names are ip6<Name> etc */
			if (strstr(nextname, name)) {
				*value = v;
				found = 1;
				break;
			}
			continue;
		}
		for (s = line;
		     (next = strtok_r(s, " ", &saveptr)) != NULL;
		     s = NULL, index++) {
			/* found the stat value at index; set it in value */
			if (stat_index && index == stat_index) {
				if (sscanf(next, "%ld", value) == 1)
					found = 1;
				goto out;
			}
			/* find index of stat in stat string; value will
			 * have same index on the next line.
			 */
			if (strcmp(next, name) == 0) {
				stat_index = index;
				break;
			}
		}
	}
out:
	if (fp)
		fclose(fp);
	bpftune_netns_set(orig_netns_fd, NULL, true);
out_unset:
	if (netns_fd > 0)
		close(netns_fd);
	if (orig_netns_fd > 0)
		close(orig_netns_fd);
	/* report "not found" explicitly instead of returning success with
	 * an unset *value.
	 */
	if (!err && !found)
		err = -ENOENT;
	bpftune_cap_drop();
	return err;
}

int bpftuner_tunables_init(struct bpftuner *tuner, unsigned int num_descs,
struct bpftunable_desc *descs,
unsigned int num_scenarios,
Expand Down Expand Up @@ -1423,18 +1515,21 @@ static int bpftune_netns_find(unsigned long cookie)

/**
 * bpftuner_netns_fd_from_cookie - resolve a netns cookie to an open fd.
 * @tuner: owning tuner, or NULL when no per-tuner netns tracking is wanted.
 * @cookie: netns cookie identifying the namespace.
 *
 * Return: open fd for the namespace (> 0); -ENOENT if the namespace has
 * been manually disabled for @tuner; negative value if lookup fails.
 */
int bpftuner_netns_fd_from_cookie(struct bpftuner *tuner, unsigned long cookie)
{
	struct bpftuner_netns *netns = NULL;
	int fd;

	if (tuner)
		netns = bpftuner_netns_from_cookie(tuner->id, cookie);
	if (netns && netns->state >= BPFTUNE_MANUAL) {
		/* fixed mismatched bracket in message; %lu since cookie is
		 * unsigned long.
		 */
		bpftune_log(LOG_DEBUG, "netns (cookie %lu) manually disabled\n",
			    cookie);
		return -ENOENT;
	}
	fd = bpftune_netns_find(cookie);
	/* first sighting of this netns for a tuner: start tracking it */
	if (fd > 0 && !netns && tuner)
		bpftuner_netns_init(tuner, cookie);
	return fd;
}

Expand Down
1 change: 1 addition & 0 deletions src/libbpftune.map
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ LIBBPFTUNE_0.1.1 {
bpftune_sysctl_name_to_path;
bpftune_sysctl_read;
bpftune_sysctl_write;
bpftune_snmpstat_read;
bpftune_netns_init_all;
bpftune_netns_set;
bpftune_netns_info;
Expand Down
2 changes: 0 additions & 2 deletions src/tcp_buffer_tuner.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@

#define TCP_BUFFER_MAX 2147483647

BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0);

bool under_memory_pressure = false;
bool near_memory_pressure = false;
bool near_memory_exhaustion = false;
Expand Down