diff --git a/docs/bpftune-ip-frag.rst b/docs/bpftune-ip-frag.rst
index 4a37316..84c84f7 100644
--- a/docs/bpftune-ip-frag.rst
+++ b/docs/bpftune-ip-frag.rst
@@ -18,8 +18,10 @@ DESCRIPTION
         Fragmentation reassembly can fail if this value is set too
         low; monitor for fragmentation reassembly and bump value
         if needed.
-        Avoid bumping it if assembly faiures constitute too high a
-        proportion of reassembly events; this may signify a DoS.
+        Avoid bumping it if assembly failures are correlated with
+        increases in frag_high_thresh; this suggests that increasing
+        available memory does not help.  While the correlation stays
+        high, tune the frag_high_thresh value down.
 
         Tunables:
 
diff --git a/include/bpftune/bpftune.bpf.h b/include/bpftune/bpftune.bpf.h
index 613c19b..5f881b9 100644
--- a/include/bpftune/bpftune.bpf.h
+++ b/include/bpftune/bpftune.bpf.h
@@ -199,6 +199,8 @@ BPF_RINGBUF(ring_buffer_map, 128 * 1024);
 
 BPF_MAP_DEF(netns_map, BPF_MAP_TYPE_HASH, __u64, __u64, 65536, 0);
 
+BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0);
+
 unsigned int tuner_id;
 unsigned int strategy_id;
 unsigned int bpftune_pid;
diff --git a/include/bpftune/corr.h b/include/bpftune/corr.h
index f9cbad7..f4ae6f3 100644
--- a/include/bpftune/corr.h
+++ b/include/bpftune/corr.h
@@ -23,7 +23,8 @@
 #define CORR_MIN_SAMPLES	10
 
 /* threshold at which we determine correlation is significant */
-#define CORR_THRESHOLD		((long double)0.7)
+#define CORR_THRESHOLD		((long double)0.75)
+#define CORR_HIGH_THRESHOLD	((long double)0.9)
 
 /* correlate tunables via id + netns cookie */
 struct corr_key {
@@ -114,6 +115,19 @@ static inline long double corr_compute(struct corr *c)
 		return 0;
 	return cov/(sqrtl(var_x)*sqrtl(var_y));
 }
+
+static inline int corr_update_user(int map, __u64 id,
+				   __u64 netns_cookie,
+				   __u64 x, __u64 y)
+{
+	struct corr_key key = { .id = id, .netns_cookie = netns_cookie };
+	struct corr corr = {};
+
+	bpf_map_lookup_elem(map, &key, &corr);
+	corr_update(&corr, x, y);
+	return bpf_map_update_elem(map, &key, &corr, 0);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _CORR_H */
diff --git a/include/bpftune/libbpftune.h b/include/bpftune/libbpftune.h
index 47598e0..4417507 100644
--- a/include/bpftune/libbpftune.h
+++ b/include/bpftune/libbpftune.h
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include <bpftune/corr.h>
 #include 
 #include 
 #include 
@@ -165,6 +166,7 @@ void bpftuner_tunables_fini(struct bpftuner *tuner);
 		tuner->obj = __skel->obj;				\
 		tuner->ring_buffer_map = __skel->maps.ring_buffer_map;	\
 		tuner->netns_map = __skel->maps.netns_map;		\
+		tuner->corr_map = __skel->maps.corr_map;		\
 	} while (0)
 
 #define bpftuner_bpf_open(tuner_name, tuner) ({ \
@@ -289,7 +291,7 @@ void bpftune_ring_buffer_fini(void *ring_buffer);
 void bpftune_sysctl_name_to_path(const char *name, char *path, size_t path_sz);
 int bpftune_sysctl_read(int netns_fd, const char *name, long *values);
 int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *values);
-
+int bpftune_snmpstat_read(unsigned long netns_cookie, int family, const char *name, long *value);
 bool bpftune_netns_cookie_supported(void);
 int bpftune_netns_set(int fd, int *orig_fd, bool quiet);
 int bpftune_netns_info(int pid, int *fd, unsigned long *cookie);
diff --git a/src/ip_frag_tuner.bpf.c b/src/ip_frag_tuner.bpf.c
index 594c9a2..4d76e76 100644
--- a/src/ip_frag_tuner.bpf.c
+++ b/src/ip_frag_tuner.bpf.c
@@ -19,9 +19,10 @@
 #include <bpftune/bpftune.bpf.h>
 
 #include "ip_frag_tuner.h"
+#include <bpftune/corr.h>
 
 static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
-				  int tunable)
+				  struct ipstats_mib *mib, int tunable)
 {
 	long mem = BPFTUNE_CORE_READ(fqdir, mem.counter);
 	long high_thresh = BPFTUNE_CORE_READ(fqdir, high_thresh);
@@ -31,9 +32,8 @@ static __always_inline int defrag(struct net *net, struct fqdir *fqdir,
 	if (!fqdir || !mem || !high_thresh)
 		return 0;
 
-	/* FragmentSmack DoS relied on small packets overwhelming defragmentation;
-	 * do not raise limits when we see small fragments and a significant
-	 * number of fragmentation reassembly failures versus successes.
+	/* do not raise limits when we see a correlation between raised fragment
+	 * threshold and fragmentation failures; this suggests a DoS
 	 */
 	if (NEARLY_FULL(mem, high_thresh)) {
 		struct bpftune_event event = { 0 };
@@ -52,10 +52,11 @@
 BPF_FENTRY(ip_defrag, struct net *net, struct sk_buff *skb, u32 user)
 {
 	struct fqdir *fqdir = BPFTUNE_CORE_READ(net, ipv4.fqdir);
+	struct ipstats_mib *mib = BPFTUNE_CORE_READ(net, mib.ip_statistics);
 
 	if (!fqdir)
 		return 0;
-	return defrag(net, fqdir, IP_FRAG_MAX_THRESHOLD);
+	return defrag(net, fqdir, mib, IP_FRAG_MAX_THRESHOLD);
 }
 
 #define SKB_DST_NOREF	1UL
@@ -64,6 +65,7 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
 {
 	long unsigned int refdst = BPFTUNE_CORE_READ(skb, _skb_refdst);
 	struct dst_entry *dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
+	struct ipstats_mib *mib;
 	struct net_device *dev;
 	struct fqdir *fqdir;
 	struct net *net;
@@ -79,5 +81,8 @@ BPF_FENTRY(ipv6_frag_rcv, struct sk_buff *skb)
 	fqdir = BPFTUNE_CORE_READ(net, ipv6.fqdir);
 	if (!fqdir)
 		return 0;
-	return defrag(net, fqdir, IP6_FRAG_MAX_THRESHOLD);
+	mib = BPFTUNE_CORE_READ(net, mib.ipv6_statistics);
+	if (!mib)
+		return 0;
+	return defrag(net, fqdir, mib, IP6_FRAG_MAX_THRESHOLD);
 }
diff --git a/src/ip_frag_tuner.c b/src/ip_frag_tuner.c
index a32a607..92b77df 100644
--- a/src/ip_frag_tuner.c
+++ b/src/ip_frag_tuner.c
@@ -2,6 +2,8 @@
 /* Copyright (c) 2023, Oracle and/or its affiliates. */
 
 #include <bpftune/libbpftune.h>
+#include <bpftune/corr.h>
+
 #include "ip_frag_tuner.h"
 #include "ip_frag_tuner.skel.h"
 #include "ip_frag_tuner.skel.legacy.h"
@@ -22,6 +24,8 @@ static struct bpftunable_desc descs[] = {
 static struct bpftunable_scenario scenarios[] = {
 { IP_FRAG_THRESHOLD_INCREASE,	"need to increase IP fragmentation high threshold",
 	"this allows additional memory to be used to accommodate more defragmentation." },
+{ IP_FRAG_THRESHOLD_DECREASE,	"need to decrease IP fragmentation high threshold",
+	"as we increased the fragmentation high threshold we saw a correlated rise in reassembly failures; this indicates that we received more invalid fragments as we added memory to process them. As such, further increases are likely to be ineffective, so reduce the high threshold." },
 };
 
 int init(struct bpftuner *tuner)
@@ -45,10 +49,13 @@ void event_handler(struct bpftuner *tuner,
 		   struct bpftune_event *event,
 		   __attribute__((unused))void *ctx)
 {
+	long new, old, reasmfails, reasmreqds, reasm_failrate;
 	int scenario = event->scenario_id;
+	struct corr c = { 0 };
+	long double corr = 0;
+	struct corr_key key;
 	const char *tunable;
-	long new, old;
-	int id;
+	int id, af;
 
 	/* netns cookie not supported; ignore */
 	if (event->netns_cookie == (unsigned long)-1)
@@ -64,6 +71,33 @@
 		bpftune_log(LOG_DEBUG, "unknown tunable [%d] for ip_frag_tuner\n", id);
 		return;
 	}
+	key.id = (__u64)id;
+	key.netns_cookie = event->netns_cookie;
+
+	af = id == IP_FRAG_MAX_THRESHOLD ? AF_INET : AF_INET6;
+	if (!bpftune_snmpstat_read(event->netns_cookie, af,
+				   "ReasmFails", &reasmfails) &&
+	    !bpftune_snmpstat_read(event->netns_cookie, af,
+				   "ReasmReqds", &reasmreqds)) {
+		/* % of reasm fails; guard against zero reassembly requests */
+		reasm_failrate = reasmreqds ? (reasmfails * 100)/reasmreqds : 0;
+		bpftune_log(LOG_DEBUG, "got %ld reasmfails, %ld reasmreqds, %ld reasm fail rate (%% of reasm failures)\n",
+			    reasmfails, reasmreqds, reasm_failrate);
+		if (corr_update_user(tuner->corr_map_fd, key.id, key.netns_cookie,
+				     (__u64)new, (__u64)reasm_failrate)) {
+			bpftune_log(LOG_DEBUG, "corr map fd %d update failed: %d\n", tuner->corr_map_fd, errno);
+		}
+	}
+	if (!bpf_map_lookup_elem(tuner->corr_map_fd, &key, &c)) {
+		corr = corr_compute(&c);
+		bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF ; corr %LF\n",
+			    tunable, key.netns_cookie, new, covar_compute(&c), corr);
+		if (corr > CORR_HIGH_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
+			scenario = IP_FRAG_THRESHOLD_DECREASE;
+			new = BPFTUNE_SHRINK_BY_DELTA(old);
+		}
+	}
+
 	switch (id) {
 	case IP_FRAG_MAX_THRESHOLD:
 	case IP6_FRAG_MAX_THRESHOLD:
diff --git a/src/ip_frag_tuner.h b/src/ip_frag_tuner.h
index 03e545a..0b71b58 100644
--- a/src/ip_frag_tuner.h
+++ b/src/ip_frag_tuner.h
@@ -28,4 +28,5 @@ enum ip_frag_tunables {
 
 enum ip_frag_scenarios {
 	IP_FRAG_THRESHOLD_INCREASE,
+	IP_FRAG_THRESHOLD_DECREASE,
 };
diff --git a/src/libbpftune.c b/src/libbpftune.c
index f75981b..c768572 100644
--- a/src/libbpftune.c
+++ b/src/libbpftune.c
@@ -69,6 +69,7 @@ int bpftune_loglevel = BPFTUNE_LOG_LEVEL;
 struct ring_buffer *ring_buffer;
 int ring_buffer_map_fd;
 int netns_map_fd;
+int corr_map_fd;
 
 int bpftune_log_level(void)
 {
@@ -564,7 +565,10 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals)
 	if (bpftuner_map_reuse("ring_buffer", tuner->ring_buffer_map,
 			       ring_buffer_map_fd, &tuner->ring_buffer_map_fd) ||
 	    bpftuner_map_reuse("netns_map", tuner->netns_map,
-			       netns_map_fd, &tuner->netns_map_fd)) {
+			       netns_map_fd, &tuner->netns_map_fd) ||
+	    bpftuner_map_reuse("corr_map", tuner->corr_map,
+			       corr_map_fd, &tuner->corr_map_fd)) {
+		bpftune_log(LOG_DEBUG, "could not reuse maps\n");
 		err = -1;
 		goto out;
 	}
@@ -604,6 +608,8 @@ int __bpftuner_bpf_load(struct bpftuner *tuner, const char **optionals)
 				 &ring_buffer_map_fd, &tuner->ring_buffer_map_fd);
 	bpftuner_map_init(tuner, "netns_map", &tuner->netns_map,
 			  &netns_map_fd, &tuner->netns_map_fd);
+	bpftuner_map_init(tuner, "corr_map", &tuner->corr_map,
+			  &corr_map_fd, &tuner->corr_map_fd);
 out:
 	bpftune_cap_drop();
 	return err;
@@ -621,6 +627,7 @@ int __bpftuner_bpf_attach(struct bpftuner *tuner)
 		bpftune_log_bpf_err(err, "could not attach skeleton: %s\n");
 	} else {
 		tuner->ring_buffer_map_fd = bpf_map__fd(tuner->ring_buffer_map);
+		tuner->corr_map_fd = bpf_map__fd(tuner->corr_map);
 	}
 	bpftune_cap_drop();
 	return err;
@@ -639,7 +646,9 @@ void bpftuner_bpf_fini(struct bpftuner *tuner)
 			close(ring_buffer_map_fd);
 		if (netns_map_fd > 0)
 			close(netns_map_fd);
-		ring_buffer_map_fd = netns_map_fd = 0;
+		if (corr_map_fd > 0)
+			close(corr_map_fd);
+		ring_buffer_map_fd = netns_map_fd = corr_map_fd = 0;
 	}
 	bpftune_cap_drop();
 }
@@ -967,6 +976,90 @@ int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *
 	return err;
 }
 
+int bpftune_snmpstat_read(unsigned long netns_cookie, int family,
+			  const char *name, long *value)
+{
+	int err, netns_fd = 0, orig_netns_fd = 0, stat_index = 0;
+	const char *file;
+	char line[1024];
+	FILE *fp = NULL;
+
+	switch (family) {
+	case AF_INET:
+		file = "/proc/net/snmp";
+		break;
+	case AF_INET6:
+		file = "/proc/net/snmp6";
"/proc/net/snmp6"; + break; + default: + return -EINVAL; + } + err = bpftune_cap_add(); + if (err) + return err; + netns_fd = bpftuner_netns_fd_from_cookie(NULL, netns_cookie); + if (netns_fd < 0) { + bpftune_log(LOG_DEBUG, "could not get netns fd for cookie %ld\n", + netns_cookie); + return -EINVAL; + } + err = bpftune_netns_set(netns_fd, &orig_netns_fd, false); + if (err < 0) + goto out_unset; + fp = fopen(file, "r"); + if (!fp) { + err = -errno; + goto out; + } + while (fgets(line, sizeof(line) - 1, fp) != NULL) { + char *next, *s, *saveptr = NULL; + int index = 0; + + /* for IPv6 it is a "key value" format per line; for + * IPv4 it is a set of parameter names on one line + * followed by the values on the next. + */ + if (family == AF_INET6) { + char nextname[128]; + + sscanf(line, "%s %ld", nextname, value); + /* names are ip6 etc */ + if (strstr(nextname, name)) + break; + continue; + } + for (s = line; + (next = strtok_r(s, " ", &saveptr)) != NULL; + s = NULL, index++) { + /* found the stat value at index; set it in value */ + if (stat_index && index == stat_index) { + if (sscanf(next, "%ld", value) != 1) + err = -ENOENT; + goto out; + } + /* find index of stat in stat string; value will + * have same index on the next line. + */ + if (strcmp(next, name) == 0) { + stat_index = index; + break; + } + } + } +out: + if (fp) + fclose(fp); + bpftune_netns_set(orig_netns_fd, NULL, true); +out_unset: + if (netns_fd) + close(netns_fd); + if (orig_netns_fd) + close(orig_netns_fd); + bpftune_cap_drop(); + return err; +} + int bpftuner_tunables_init(struct bpftuner *tuner, unsigned int num_descs, struct bpftunable_desc *descs, unsigned int num_scenarios, @@ -1423,18 +1515,21 @@ static int bpftune_netns_find(unsigned long cookie) int bpftuner_netns_fd_from_cookie(struct bpftuner *tuner, unsigned long cookie) { - struct bpftuner_netns *netns = bpftuner_netns_from_cookie(tuner->id, - cookie); + struct bpftuner_netns *netns = NULL; int fd; + if (tuner) + netns = bpftuner_netns_from_cookie(tuner->id, cookie); if (netns && netns->state >= BPFTUNE_MANUAL) { bpftune_log(LOG_DEBUG, "netns (cookie %ld} manually disabled\n", cookie); return -ENOENT; } fd = bpftune_netns_find(cookie); - if (fd > 0 && !netns) - bpftuner_netns_init(tuner, cookie); + if (fd > 0 && !netns) { + if (tuner) + bpftuner_netns_init(tuner, cookie); + } return fd; } diff --git a/src/libbpftune.map b/src/libbpftune.map index fcc7c43..c512f42 100644 --- a/src/libbpftune.map +++ b/src/libbpftune.map @@ -50,6 +50,7 @@ LIBBPFTUNE_0.1.1 { bpftune_sysctl_name_to_path; bpftune_sysctl_read; bpftune_sysctl_write; + bpftune_snmpstat_read; bpftune_netns_init_all; bpftune_netns_set; bpftune_netns_info; diff --git a/src/tcp_buffer_tuner.bpf.c b/src/tcp_buffer_tuner.bpf.c index 73cbfba..2c0202f 100644 --- a/src/tcp_buffer_tuner.bpf.c +++ b/src/tcp_buffer_tuner.bpf.c @@ -23,8 +23,6 @@ #define TCP_BUFFER_MAX 2147483647 -BPF_MAP_DEF(corr_map, BPF_MAP_TYPE_HASH, struct corr_key, struct corr, 1024, 0); - bool under_memory_pressure = false; bool near_memory_pressure = false; bool near_memory_exhaustion = false;