From 6c427cab9c9ba6935a4a2af4d25f5ae125a49deb Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 22 Nov 2024 14:41:59 +0000 Subject: [PATCH] ip frag tuner improvements correlate increases in ip frag memory with fragmentation error rate. if increasing memory is not helping - just allowing us to defrag more bad packets - we should see a correlation between fragmentation memory increase and fragmentation error rate. If this is observed, reduce fragmentation memory. Helps protect against bad actors that send bad frags and could bloat frag memory. Add functions to read snmp stats from /proc/net/snmp[6] ; needed to compute frag rate. Also allow computation of correlation in userspace. Signed-off-by: Alan Maguire --- docs/bpftune-ip-frag.rst | 6 +- include/bpftune/bpftune.bpf.h | 2 + include/bpftune/corr.h | 16 ++++- include/bpftune/libbpftune.h | 4 +- src/ip_frag_tuner.bpf.c | 17 ++++-- src/ip_frag_tuner.c | 38 +++++++++++- src/ip_frag_tuner.h | 1 + src/libbpftune.c | 107 ++++++++++++++++++++++++++++++++-- src/libbpftune.map | 1 + src/tcp_buffer_tuner.bpf.c | 2 - 10 files changed, 174 insertions(+), 20 deletions(-) diff --git a/docs/bpftune-ip-frag.rst b/docs/bpftune-ip-frag.rst index 4a37316..84c84f7 100644 --- a/docs/bpftune-ip-frag.rst +++ b/docs/bpftune-ip-frag.rst @@ -18,8 +18,10 @@ DESCRIPTION Fragmentation reassembly can fail if this value is set too low; monitor for fragmentation reassembly and bump value if needed. - Avoid bumping it if assembly faiures constitute too high a - proportion of reassembly events; this may signify a DoS. + Avoid bumping it if assembly failures are correlated with + increases in frag_high_thresh; this suggests that increasing + available memory does not help. While correlation is high, + tune down the frag_high_thresh value. 
Tunables: diff --git a/include/bpftune/bpftune.bpf.h b/include/bpftune/bpftune.bpf.h index 613c19b..5f881b9 100644 --- a/include/bpftune/bpftune.bpf.h +++ b/include/bpftune/bpftune.bpf.h @@ -199,6 +199,8 @@ BPF_RINGBUF(ring_buffer_map, 128 * 1024); BPF_MAP_DEF(netns_map, BPF_MAP_TYPE_HASH, __u64, __u64, 65536, 0); +BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0); + unsigned int tuner_id; unsigned int strategy_id; unsigned int bpftune_pid; diff --git a/include/bpftune/corr.h b/include/bpftune/corr.h index f9cbad7..f4ae6f3 100644 --- a/include/bpftune/corr.h +++ b/include/bpftune/corr.h @@ -23,7 +23,8 @@ #define CORR_MIN_SAMPLES 10 /* threshold at which we determine correlation is significant */ -#define CORR_THRESHOLD ((long double)0.7) +#define CORR_THRESHOLD ((long double)0.75) +#define CORR_HIGH_THRESHOLD ((long double)0.9) /* correlate tunables via id + netns cookie */ struct corr_key { @@ -114,6 +115,19 @@ static inline long double corr_compute(struct corr *c) return 0; return cov/(sqrtl(var_x)*sqrtl(var_y)); } + +static inline int corr_update_user(int map, __u64 id, + __u64 netns_cookie, + __u64 x, __u64 y) +{ + struct corr_key key = { .id = id, .netns_cookie = netns_cookie }; + struct corr corr = {}; + + bpf_map_lookup_elem(map, &key, &corr); + corr_update(&corr, x, y); + return bpf_map_update_elem(map, &key, &corr, 0); +} + #endif /* __KERNEL__ */ #endif /* _CORR_H */ diff --git a/include/bpftune/libbpftune.h b/include/bpftune/libbpftune.h index 47598e0..4417507 100644 --- a/include/bpftune/libbpftune.h +++ b/include/bpftune/libbpftune.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -165,6 +166,7 @@ void bpftuner_tunables_fini(struct bpftuner *tuner); tuner->obj = __skel->obj; \ tuner->ring_buffer_map = __skel->maps.ring_buffer_map; \ tuner->netns_map = __skel->maps.netns_map; \ + tuner->corr_map = __skel->maps.corr_map; \ } while (0) #define bpftuner_bpf_open(tuner_name, tuner) ({ \ @@ 
-289,7 +291,7 @@ void bpftune_ring_buffer_fini(void *ring_buffer); void bpftune_sysctl_name_to_path(const char *name, char *path, size_t path_sz); int bpftune_sysctl_read(int netns_fd, const char *name, long *values); int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *values); - +int bpftune_snmpstat_read(unsigned long netns_cookie, int family, const char *name, long *value); bool bpftune_netns_cookie_supported(void); int bpftune_netns_set(int fd, int *orig_fd, bool quiet); int bpftune_netns_info(int pid, int *fd, unsigned long *cookie); diff --git a/src/ip_frag_tuner.bpf.c b/src/ip_frag_tuner.bpf.c index 594c9a2..4d76e76 100644 --- a/src/ip_frag_tuner.bpf.c +++ b/src/ip_frag_tuner.bpf.c @@ -19,9 +19,10 @@ #include #include "ip_frag_tuner.h" +#include static __always_inline int defrag(struct net *net, struct fqdir *fqdir, - int tunable) + struct ipstats_mib *mib, int tunable) { long mem = BPFTUNE_CORE_READ(fqdir, mem.counter); long high_thresh = BPFTUNE_CORE_READ(fqdir, high_thresh); @@ -31,9 +32,8 @@ static __always_inline int defrag(struct net *net, struct fqdir *fqdir, if (!fqdir || !mem || !high_thresh) return 0; - /* FragmentSmack DoS relied on small packets overwhelming defragmentation; - * do not raise limits when we see small fragments and a significant - * number of fragmentation reassembly failures versus successes. 
+ /* do not raise limits when we see a correlation between raised fragment + * threshold and fragmentation failures; this suggests DoS */ if (NEARLY_FULL(mem, high_thresh)) { struct bpftune_event event = { 0 }; @@ -52,10 +52,11 @@ static __always_inline int defrag(struct net *net, struct fqdir *fqdir, BPF_FENTRY(ip_defrag, struct net *net, struct sk_buff *skb, u32 user) { struct fqdir *fqdir = BPFTUNE_CORE_READ(net, ipv4.fqdir); + struct ipstats_mib *mib = BPFTUNE_CORE_READ(net, mib.ip_statistics); if (!fqdir) return 0; - return defrag(net, fqdir, IP_FRAG_MAX_THRESHOLD); + return defrag(net, fqdir, mib, IP_FRAG_MAX_THRESHOLD); } #define SKB_DST_NOREF 1UL @@ -64,6 +65,7 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb) { long unsigned int refdst = BPFTUNE_CORE_READ(skb, _skb_refdst); struct dst_entry *dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK); + struct ipstats_mib *mib; struct net_device *dev; struct fqdir *fqdir; struct net *net; @@ -79,5 +81,8 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb) fqdir = BPFTUNE_CORE_READ(net, ipv6.fqdir); if (!fqdir) return 0; - return defrag(net, fqdir, IP6_FRAG_MAX_THRESHOLD); + mib = BPFTUNE_CORE_READ(net, mib.ipv6_statistics); + if (!mib) + return 0; + return defrag(net, fqdir, mib, IP6_FRAG_MAX_THRESHOLD); } diff --git a/src/ip_frag_tuner.c b/src/ip_frag_tuner.c index a32a607..92b77df 100644 --- a/src/ip_frag_tuner.c +++ b/src/ip_frag_tuner.c @@ -2,6 +2,8 @@ /* Copyright (c) 2023, Oracle and/or its affiliates. */ #include +#include + #include "ip_frag_tuner.h" #include "ip_frag_tuner.skel.h" #include "ip_frag_tuner.skel.legacy.h" @@ -22,6 +24,8 @@ static struct bpftunable_desc descs[] = { static struct bpftunable_scenario scenarios[] = { { IP_FRAG_THRESHOLD_INCREASE, "need to increase IP fragmentation high threshold", "this allows additional memory to be used to accommodate more defragmentation." 
}, +{ IP_FRAG_THRESHOLD_DECREASE, "need to decrease IP fragmentation high threshold", + "as we increased fragmentation high threshold we saw a correlation in reassembly failures; this indicates that we received more invalid fragments as we added memory to process them. As such, further increases are likely to be ineffective so reduce high threshold." }, }; int init(struct bpftuner *tuner) @@ -45,10 +49,13 @@ void event_handler(struct bpftuner *tuner, struct bpftune_event *event, __attribute__((unused))void *ctx) { + long new, old, reasmfails, reasmreqds, reasm_failrate; int scenario = event->scenario_id; + struct corr c = { 0 }; + long double corr = 0; + struct corr_key key; const char *tunable; - long new, old; - int id; + int id, af; /* netns cookie not supported; ignore */ if (event->netns_cookie == (unsigned long)-1) @@ -64,6 +71,33 @@ void event_handler(struct bpftuner *tuner, bpftune_log(LOG_DEBUG, "unknown tunable [%d] for ip_frag_tuner\n", id); return; } + key.id = (__u64)id; + key.netns_cookie = event->netns_cookie; + + af = id == IP_FRAG_MAX_THRESHOLD ? 
AF_INET : AF_INET6; + if (!bpftune_snmpstat_read(event->netns_cookie, af, + "ReasmFails", &reasmfails) && + !bpftune_snmpstat_read(event->netns_cookie, af, + "ReasmReqds", &reasmreqds)) { + /* % of reasm fails */ + reasm_failrate = reasmreqds ? (reasmfails * 100)/reasmreqds : 0; + bpftune_log(LOG_DEBUG, "got %ld reasmfails, %ld reasmreqds, %ld reasm fail rate (%% of reasm failures)\n", + reasmfails, reasmreqds, reasm_failrate); + if (corr_update_user(tuner->corr_map_fd, key.id, key.netns_cookie, + (__u64)new, (__u64)reasm_failrate)) { + bpftune_log(LOG_DEBUG, "corr map fd %d update failed %d\n", tuner->corr_map_fd, errno); + } + } + if (!bpf_map_lookup_elem(tuner->corr_map_fd, &key, &c)) { + corr = corr_compute(&c); + bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF ; corr %LF\n", + tunable, key.netns_cookie, new, covar_compute(&c), corr); + if (corr > CORR_HIGH_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) { + scenario = IP_FRAG_THRESHOLD_DECREASE; + new = BPFTUNE_SHRINK_BY_DELTA(old); + } + } + switch (id) { case IP_FRAG_MAX_THRESHOLD: case IP6_FRAG_MAX_THRESHOLD: diff --git a/src/ip_frag_tuner.h b/src/ip_frag_tuner.h index 03e545a..0b71b58 100644 --- a/src/ip_frag_tuner.h +++ b/src/ip_frag_tuner.h @@ -28,4 +28,5 @@ enum ip_frag_tunables { enum ip_frag_scenarios { IP_FRAG_THRESHOLD_INCREASE, + IP_FRAG_THRESHOLD_DECREASE, }; diff --git a/src/libbpftune.c b/src/libbpftune.c index f75981b..c768572 100644 --- a/src/libbpftune.c +++ b/src/libbpftune.c @@ -69,6 +69,7 @@ int bpftune_loglevel = BPFTUNE_LOG_LEVEL; struct ring_buffer *ring_buffer; int ring_buffer_map_fd; int netns_map_fd; +int corr_map_fd; int bpftune_log_level(void) { @@ -564,7 +565,10 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals) if (bpftuner_map_reuse("ring_buffer", tuner->ring_buffer_map, ring_buffer_map_fd, &tuner->ring_buffer_map_fd) || bpftuner_map_reuse("netns_map", tuner->netns_map, - netns_map_fd, &tuner->netns_map_fd)
|| + bpftuner_map_reuse("corr_map", tuner->corr_map, + corr_map_fd, &tuner->corr_map_fd)) { + bpftune_log(LOG_DEBUG, "could not reuse shared maps\n"); err = -1; goto out; } @@ -604,6 +608,8 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals) &ring_buffer_map_fd, &tuner->ring_buffer_map_fd); bpftuner_map_init(tuner, "netns_map", &tuner->netns_map, &netns_map_fd, &tuner->netns_map_fd); + bpftuner_map_init(tuner, "corr_map", &tuner->corr_map, + &corr_map_fd, &tuner->corr_map_fd); out: bpftune_cap_drop(); return err; @@ -621,6 +627,7 @@ int __bpftuner_bpf_attach(struct bpftuner *tuner) bpftune_log_bpf_err(err, "could not attach skeleton: %s\n"); } else { tuner->ring_buffer_map_fd = bpf_map__fd(tuner->ring_buffer_map); + tuner->corr_map_fd = bpf_map__fd(tuner->corr_map); } bpftune_cap_drop(); return err; @@ -639,7 +646,9 @@ void bpftuner_bpf_fini(struct bpftuner *tuner) close(ring_buffer_map_fd); if (netns_map_fd > 0) close(netns_map_fd); - ring_buffer_map_fd = netns_map_fd = 0; + if (corr_map_fd > 0) + close(corr_map_fd); + ring_buffer_map_fd = netns_map_fd = corr_map_fd = 0; } bpftune_cap_drop(); } @@ -967,6 +976,90 @@ int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long * return err; } +int bpftune_snmpstat_read(unsigned long netns_cookie, int family, + const char *name, long *value) +{ + int err, netns_fd = 0, orig_netns_fd = 0, stat_index = 0; + const char *file; + char line[1024]; + FILE *fp = NULL; + + switch (family) { + case AF_INET: + file = "/proc/net/snmp"; + break; + case AF_INET6: + file = "/proc/net/snmp6"; + break; + default: + return -EINVAL; + } + err = bpftune_cap_add(); + if (err) + return err; + netns_fd = bpftuner_netns_fd_from_cookie(NULL, netns_cookie); + if (netns_fd < 0) { + bpftune_log(LOG_DEBUG, "could not get netns fd for cookie %ld\n", + netns_cookie); + bpftune_cap_drop(); + return -EINVAL; + } + err = bpftune_netns_set(netns_fd, &orig_netns_fd, false); + if (err < 0) + goto out_unset; + fp = fopen(file, "r"); + if (!fp) { + err
= -errno; + goto out; + } + err = -ENOENT; while (fgets(line, sizeof(line) - 1, fp) != NULL) { + char *next, *s, *saveptr = NULL; + int index = 0; + + /* for IPv6 it is a "key value" format per line; for + * IPv4 it is a set of parameter names on one line + * followed by the values on the next. + */ + if (family == AF_INET6) { + char nextname[128]; + + sscanf(line, "%127s %ld", nextname, value); + /* names are ip6 etc */ + if (strstr(nextname, name)) + { err = 0; break; } + continue; + } + for (s = line; + (next = strtok_r(s, " ", &saveptr)) != NULL; + s = NULL, index++) { + /* found the stat value at index; set it in value */ + if (stat_index && index == stat_index) { + if (sscanf(next, "%ld", value) != 1) + err = -ENOENT; else err = 0; + goto out; + } + /* find index of stat in stat string; value will + * have same index on the next line. + */ + if (strcmp(next, name) == 0) { + stat_index = index; + break; + } + } + } +out: + if (fp) + fclose(fp); + bpftune_netns_set(orig_netns_fd, NULL, true); +out_unset: + if (netns_fd) + close(netns_fd); + if (orig_netns_fd) + close(orig_netns_fd); + bpftune_cap_drop(); + return err; +} + int bpftuner_tunables_init(struct bpftuner *tuner, unsigned int num_descs, struct bpftunable_desc *descs, unsigned int num_scenarios, @@ -1423,18 +1515,21 @@ static int bpftune_netns_find(unsigned long cookie) int bpftuner_netns_fd_from_cookie(struct bpftuner *tuner, unsigned long cookie) { - struct bpftuner_netns *netns = bpftuner_netns_from_cookie(tuner->id, - cookie); + struct bpftuner_netns *netns = NULL; int fd; + if (tuner) + netns = bpftuner_netns_from_cookie(tuner->id, cookie); if (netns && netns->state >= BPFTUNE_MANUAL) { bpftune_log(LOG_DEBUG, "netns (cookie %ld} manually disabled\n", cookie); return -ENOENT; } fd = bpftune_netns_find(cookie); - if (fd > 0 && !netns) - bpftuner_netns_init(tuner, cookie); + if (fd > 0 && !netns) { + if (tuner) + bpftuner_netns_init(tuner, cookie); + } return fd; } diff --git a/src/libbpftune.map b/src/libbpftune.map index fcc7c43..c512f42
100644 --- a/src/libbpftune.map +++ b/src/libbpftune.map @@ -50,6 +50,7 @@ LIBBPFTUNE_0.1.1 { bpftune_sysctl_name_to_path; bpftune_sysctl_read; bpftune_sysctl_write; + bpftune_snmpstat_read; bpftune_netns_init_all; bpftune_netns_set; bpftune_netns_info; diff --git a/src/tcp_buffer_tuner.bpf.c b/src/tcp_buffer_tuner.bpf.c index 73cbfba..2c0202f 100644 --- a/src/tcp_buffer_tuner.bpf.c +++ b/src/tcp_buffer_tuner.bpf.c @@ -23,8 +23,6 @@ #define TCP_BUFFER_MAX 2147483647 -BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0); - bool under_memory_pressure = false; bool near_memory_pressure = false; bool near_memory_exhaustion = false;