17 changes: 17 additions & 0 deletions docs/bpftune-net-buffer.rst
@@ -22,9 +22,26 @@ DESCRIPTION
appropriate bit is set in the CPU bitmask to prioritize small
flows for drop avoidance.

When NAPI polls to handle multiple packets, the number of packets
processed is limited by net.core.netdev_budget, while the time spent
is limited by net.core.netdev_budget_usecs. If we hit the packet
limit without exhausting the usecs budget, the time_squeeze softnet
stat is bumped; if we see increases in time_squeeze, we bump
netdev_budget/netdev_budget_usecs.

However, we want to limit such increases if they lead to longer
task scheduling wait times, so we monitor the ratio of time tasks
spend waiting to run versus time spent running across all processors;
if we see a correlation between netdev budget increases and wait/run
ratio increases, the netdev budget is tuned back down.

Tunables:

- net.core.netdev_max_backlog: maximum per-cpu backlog queue length;
default 1024.
- net.core.flow_limit_cpu_bitmap: avoid drops for small flows on
a per-cpu basis; default 0.
- net.core.netdev_budget: maximum number of packets processed in
a NAPI cycle.
- net.core.netdev_budget_usecs: maximum amount of time in microseconds
for a NAPI cycle.
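
As background, the time_squeeze counter the new text refers to can be read from userspace via /proc/net/softnet_stat. Below is a minimal sketch (not part of this patch), assuming the usual layout where each CPU row's third hex column is time_squeeze:

#include <stdio.h>

/* Sum time_squeeze across CPUs from /proc/net/softnet_stat.
 * Assumes the conventional layout: hex columns where the first
 * three fields are processed, dropped and time_squeeze.
 */
int main(void)
{
	unsigned int processed, dropped, squeezed, total = 0;
	char line[512];
	FILE *fp = fopen("/proc/net/softnet_stat", "r");

	if (!fp)
		return 1;
	while (fgets(line, sizeof(line), fp) != NULL) {
		if (sscanf(line, "%x %x %x", &processed, &dropped, &squeezed) == 3)
			total += squeezed;
	}
	fclose(fp);
	printf("total time_squeeze across cpus: %u\n", total);
	return 0;
}

If this total keeps climbing under load, the netdev_budget tuning described above becomes relevant.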
4 changes: 4 additions & 0 deletions include/bpftune/bpftune.bpf.h
@@ -273,6 +273,10 @@ unsigned long bpftune_init_net;

bool debug;

unsigned int bpftune_sample_rate = 4;

#define bpftune_skip_sample(count) ((++count % bpftune_sample_rate) != 0)

#define __barrier asm volatile("" ::: "memory")

#define bpftune_log(...) __bpf_printk(__VA_ARGS__)
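
The bpftune_skip_sample() helper added here is a simple modulo filter over a caller-supplied counter. A small user-space analogue (names are illustrative, not the BPF header itself) showing that with a rate of 4 only every fourth event is processed:

#include <stdio.h>

/* Illustrative stand-in for the header's macro: skip an event unless
 * the counter has advanced by a full sample interval.
 */
static unsigned int sample_rate = 4;
static unsigned long count;

#define skip_sample(c) ((++(c) % sample_rate) != 0)

int main(void)
{
	for (int i = 0; i < 12; i++) {
		if (skip_sample(count))
			continue;
		/* with a rate of 4 and count starting at 0, this prints i = 3, 7, 11 */
		printf("processing event %d\n", i);
	}
	return 0;
}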
1 change: 1 addition & 0 deletions include/bpftune/libbpftune.h
@@ -292,6 +292,7 @@ void bpftune_sysctl_name_to_path(const char *name, char *path, size_t path_sz);
int bpftune_sysctl_read(int netns_fd, const char *name, long *values);
int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *values);
int bpftune_snmpstat_read(unsigned long netns_cookie, int family, const char *name, long *value);
int bpftune_sched_wait_run_percent_read(void);
bool bpftune_netns_cookie_supported(void);
int bpftune_netns_set(int fd, int *orig_fd, bool quiet);
int bpftune_netns_info(int pid, int *fd, unsigned long *cookie);
2 changes: 1 addition & 1 deletion src/ip_frag_tuner.c
@@ -92,7 +92,7 @@ void event_handler(struct bpftuner *tuner,
corr = corr_compute(&c);
bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF ; corr %LF\n",
tunable, key.netns_cookie, new, covar_compute(&c), corr);
if (corr > CORR_HIGH_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
if (corr > CORR_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
scenario = IP_FRAG_THRESHOLD_DECREASE;
new = BPFTUNE_SHRINK_BY_DELTA(old);
}
38 changes: 38 additions & 0 deletions src/libbpftune.c
@@ -1095,6 +1095,44 @@ int bpftune_snmpstat_read(unsigned long netns_cookie, int family,
return err;
}

/* return % of overall wait/run time on all cpus gathered from
* /proc/schedstat ; see https://docs.kernel.org/scheduler/sched-stats.html
* Usually > 100%.
*/
int bpftune_sched_wait_run_percent_read(void)
{
long running = 0, waiting = 0;
FILE *fp = NULL;
char line[1024];
int err = 0;

err = bpftune_cap_add();
if (err)
return err;
fp = fopen("/proc/schedstat", "r");
if (!fp) {
err = -errno;
goto out;
}
while (fgets(line, sizeof(line) - 1, fp) != NULL) {
long cpurunning = 0, cpuwaiting = 0, cputimeslices;

if (sscanf(line, "cpu%*d %*d %*d %*d %*d %*d %*d %ld %ld %ld",
&cpurunning, &cpuwaiting, &cputimeslices) == 3) {
running += cpurunning;
waiting += cpuwaiting;
}
}
bpftune_log(LOG_DEBUG, "sched waiting %ld, running %ld\n", waiting, running);
if (running > 0)
err = (int)((waiting*100)/running);
out:
if (fp)
fclose(fp);
bpftune_cap_drop();
return err;
}

int bpftuner_tunables_init(struct bpftuner *tuner, unsigned int num_descs,
struct bpftunable_desc *descs,
unsigned int num_scenarios,
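
A hedged usage sketch of the new helper (assuming the bpftune headers are installed and the program is linked against libbpftune); it treats values above 100 as a sign that tasks wait longer than they run:

#include <bpftune/libbpftune.h>
#include <stdio.h>

int main(void)
{
	/* Returns waiting time as a percentage of running time, summed
	 * over all CPUs from /proc/schedstat, or a negative errno.
	 */
	int pct = bpftune_sched_wait_run_percent_read();

	if (pct < 0) {
		fprintf(stderr, "could not read schedstat: %d\n", pct);
		return 1;
	}
	/* > 100 means tasks spend more time waiting to run than running */
	printf("wait/run percent: %d%%\n", pct);
	return pct > 100 ? 2 : 0;
}

Build with something like: cc -o waitrun waitrun.c -lbpftune (link name assumed).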
1 change: 1 addition & 0 deletions src/libbpftune.map
@@ -51,6 +51,7 @@ LIBBPFTUNE_0.1.1 {
bpftune_sysctl_read;
bpftune_sysctl_write;
bpftune_snmpstat_read;
bpftune_sched_wait_run_percent_read;
bpftune_netns_init_all;
bpftune_netns_set;
bpftune_netns_info;
85 changes: 66 additions & 19 deletions src/net_buffer_tuner.bpf.c
@@ -30,6 +30,8 @@ __u64 drop_interval_start = 0;
__u64 flow_limit_cpu_bitmap = 0;

int netdev_max_backlog = 0;
int netdev_budget = 0;
int netdev_budget_usecs = 0;

#ifdef BPFTUNE_LEGACY
SEC("kretprobe/enqueue_to_backlog")
@@ -52,7 +54,7 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
drop_count++;

/* only sample subset of drops to reduce overhead. */
if ((drop_count % 4) != 0)
if (bpftune_skip_sample(drop_count))
return 0;

/* if we drop more than 1/16 of the backlog queue size/min,
@@ -64,28 +66,73 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
drop_count = 1;
drop_interval_start = time;
}
if (drop_count < (max_backlog >> 4))
return 0;

old[0] = max_backlog;
new[0] = BPFTUNE_GROW_BY_DELTA(max_backlog);
send_net_sysctl_event(NULL, NETDEV_MAX_BACKLOG_INCREASE,
NETDEV_MAX_BACKLOG, old, new, &event);
if (drop_count >= (max_backlog >> 4)) {
old[0] = max_backlog;
new[0] = BPFTUNE_GROW_BY_DELTA(max_backlog);
send_net_sysctl_event(NULL, NETDEV_MAX_BACKLOG_INCREASE,
NETDEV_MAX_BACKLOG, old, new, &event);

#ifdef BPFTUNE_LEGACY
int cpu = bpf_get_smp_processor_id();
int cpu = bpf_get_smp_processor_id();
#endif
/* ensure flow limits prioritize small flows on this cpu */
if (cpu < 64) {
cpubit = 1 << cpu;
if (!(flow_limit_cpu_bitmap & cpubit)) {
old[0] = flow_limit_cpu_bitmap;
new[0] = flow_limit_cpu_bitmap |= cpubit;
if (!send_net_sysctl_event(NULL, FLOW_LIMIT_CPU_SET,
FLOW_LIMIT_CPU_BITMAP,
old, new, &event))
flow_limit_cpu_bitmap = new[0];
/* ensure flow limits prioritize small flows on this cpu */
if (cpu < 64) {
cpubit = 1 << cpu;
if (!(flow_limit_cpu_bitmap & cpubit)) {
old[0] = flow_limit_cpu_bitmap;
new[0] = flow_limit_cpu_bitmap |= cpubit;
if (!send_net_sysctl_event(NULL, FLOW_LIMIT_CPU_SET,
FLOW_LIMIT_CPU_BITMAP,
old, new, &event))
flow_limit_cpu_bitmap = new[0];
}
}
}
return 0;
}

#ifndef BPFTUNE_LEGACY

BPF_MAP_DEF(time_squeeze_map, BPF_MAP_TYPE_PERCPU_ARRAY, unsigned int, unsigned int, 1, 0);

extern const struct softnet_data softnet_data __ksym;

__u64 rx_count = 0;

SEC("fexit/net_rx_action")
int BPF_PROG(net_rx_action)
{
struct bpftune_event event = { 0 };
long old[3], new[3];
struct softnet_data *sd;
unsigned int time_squeeze, *last_time_squeezep, last_time_squeeze;
unsigned int zero = 0;

if (bpftune_skip_sample(rx_count))
return 0;
sd = (struct softnet_data *)bpf_this_cpu_ptr(&softnet_data);
if (!sd)
return 0;
time_squeeze = BPFTUNE_CORE_READ(sd, time_squeeze);
if (!time_squeeze)
return 0;
last_time_squeezep = bpf_map_lookup_elem(&time_squeeze_map, &zero);
if (!last_time_squeezep)
return 0;
last_time_squeeze = *last_time_squeezep;
/* if time squeeze increased for every instance of
* net_rx_action() since last sample, we increase.
*/
if (time_squeeze <= (last_time_squeeze + bpftune_sample_rate))
return 0;
*last_time_squeezep = time_squeeze;
/* did not have previous time_squeeze value for comparison, bail. */
if (!(last_time_squeeze))
return 0;
old[0] = (long)netdev_budget;
new[0] = BPFTUNE_GROW_BY_DELTA((long)netdev_budget);
send_net_sysctl_event(NULL, NETDEV_BUDGET_INCREASE,
NETDEV_BUDGET, old, new, &event);
return 0;
}
#endif
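
For readers unfamiliar with the wrappers used above (BPF_MAP_DEF, BPFTUNE_CORE_READ), here is a standalone sketch of the same pattern in plain libbpf style: an fexit hook on net_rx_action that reads the per-CPU softnet_data time_squeeze counter and remembers the last value in a one-element per-CPU array. Program and map names are illustrative, not bpftune's.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

extern const struct softnet_data softnet_data __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);
} last_squeeze SEC(".maps");

SEC("fexit/net_rx_action")
int BPF_PROG(watch_time_squeeze)
{
	__u32 zero = 0, squeeze, *lastp;
	struct softnet_data *sd;

	/* per-cpu softnet stats for the CPU this poll ran on */
	sd = (struct softnet_data *)bpf_this_cpu_ptr(&softnet_data);
	if (!sd)
		return 0;
	squeeze = BPF_CORE_READ(sd, time_squeeze);
	lastp = bpf_map_lookup_elem(&last_squeeze, &zero);
	if (!lastp)
		return 0;
	if (*lastp && squeeze > *lastp)
		bpf_printk("time_squeeze grew: %u -> %u", *lastp, squeeze);
	*lastp = squeeze;
	return 0;
}

char _license[] SEC("license") = "GPL";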
106 changes: 95 additions & 11 deletions src/net_buffer_tuner.c
@@ -2,11 +2,13 @@
/* Copyright (c) 2023, Oracle and/or its affiliates. */

#include <bpftune/libbpftune.h>
#include <bpftune/corr.h>
#include "net_buffer_tuner.h"
#include "net_buffer_tuner.skel.h"
#include "net_buffer_tuner.skel.legacy.h"
#include "net_buffer_tuner.skel.nobtf.h"

#include <limits.h>
#include <unistd.h>

struct tcp_buffer_tuner_bpf *skel;
@@ -17,23 +19,35 @@ static struct bpftunable_desc descs[] = {
{ FLOW_LIMIT_CPU_BITMAP,
BPFTUNABLE_SYSCTL, "net.core.flow_limit_cpu_bitmap",
0, 1 },
{ NETDEV_BUDGET, BPFTUNABLE_SYSCTL, "net.core.netdev_budget",
0, 1 },
{ NETDEV_BUDGET_USECS, BPFTUNABLE_SYSCTL, "net.core.netdev_budget_usecs",
0, 1 },
};

static struct bpftunable_scenario scenarios[] = {
{ NETDEV_MAX_BACKLOG_INCREASE, "need to increase max backlog size",
"Need to increase backlog size to prevent drops for faster connection" },
{ FLOW_LIMIT_CPU_SET, "need to set per-cpu bitmap value",
"Need to set flow limit per-cpu to prioritize small flows" }
"Need to set flow limit per-cpu to prioritize small flows" },
{ NETDEV_BUDGET_INCREASE, "need to increase # of packets processed per NAPI poll",
"Need to increase number of packets processed across network devices during NAPI poll to use all of net.core.netdev_budget_usecs" },
{ NETDEV_BUDGET_DECREASE, "need to decrease # of packets processed per NAPI poll",
"Need to decrease netdev_budget[_usecs] since the ratio of time spent waiting to run versus time spent running for tasks has increased as we have increased netdev budget. This indicates either our budget increases directly let to increased wait times for other tasks, or that general load has increased; either way spending too much time in NAPI processing will hurt system performance." }
};

int init(struct bpftuner *tuner)
{
long cpu_bitmap = 0;
long max_backlog = 0;
long budget = 0;
long budget_usecs = 0;
int err;

bpftune_sysctl_read(0, "net.core.flow_limit_cpu_bitmap", &cpu_bitmap);
bpftune_sysctl_read(0, "net.core.netdev_max_backlog", &max_backlog);
bpftune_sysctl_read(0, "net.core.netdev_budget", &budget);
bpftune_sysctl_read(0, "net.core.netdev_budget_usecs", &budget_usecs);
err = bpftuner_bpf_open(net_buffer, tuner);
if (err)
return err;
@@ -44,6 +58,10 @@ int init(struct bpftuner *tuner)
cpu_bitmap);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
max_backlog);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget,
budget);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget_usecs,
budget_usecs);
err = bpftuner_bpf_attach(net_buffer, tuner);
if (err)
return err;
@@ -62,9 +80,13 @@ void event_handler(struct bpftuner *tuner,
struct bpftune_event *event,
__attribute__((unused))void *ctx)
{
long new, budget_usecs, budget_usecs_new;
int scenario = event->scenario_id;
struct corr c = { 0 };
long double corr = 0;
const char *tunable;
int id;
struct corr_key key;
int id, ret;

/* netns cookie not supported; ignore */
if (event->netns_cookie == (unsigned long)-1)
@@ -73,21 +95,23 @@
id = event->update[0].id;
tunable = bpftuner_tunable_name(tuner, id);
if (!tunable) {
bpftune_log(LOG_DEBUG, "unknown tunable [%d] for tcp_buffer_tuner\n", id);
bpftune_log(LOG_DEBUG, "unknown tunable [%d] for net_buffer_tuner\n", id);
return;
}
switch (id) {
case NETDEV_MAX_BACKLOG:
bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)event->update[0].new,
ret = bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)event->update[0].new,
"Due to excessive drops, change %s from (%ld) -> (%ld)\n",
tunable,
event->update[0].old[0],
tunable,
event->update[0].old[0],
event->update[0].new[0]);
if (!ret) {
/* update value of netdev_max_backlog for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
event->update[0].new[0]);
/* update value of netdev_max_backlog for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
event->update[0].new[0]);
}
break;
case FLOW_LIMIT_CPU_BITMAP:
bpftuner_tunable_sysctl_write(tuner, id, scenario,
@@ -97,6 +121,66 @@
tunable,
event->update[0].old[0],
event->update[0].new[0]);
break;
case NETDEV_BUDGET:
new = event->update[0].new[0];
if (new > INT_MAX)
break;
budget_usecs = bpftuner_bpf_var_get(net_buffer, tuner,
netdev_budget_usecs);
budget_usecs_new = BPFTUNE_GROW_BY_DELTA(budget_usecs);

ret = bpftune_sched_wait_run_percent_read();
bpftune_log(LOG_DEBUG, "sched wait-run percent : %d\n", ret);
if (ret > 0) {
key.id = (__u64)id;
key.netns_cookie = event->netns_cookie;
if (corr_update_user(tuner->corr_map_fd, key.id,
key.netns_cookie,
(__u64)new, (__u64)ret))
bpftune_log(LOG_DEBUG, "corr map fd %d update failed %d\n",
tuner->corr_map_fd, errno);
}
if (!bpf_map_lookup_elem(tuner->corr_map_fd, &key, &c)) {
corr = corr_compute(&c);
bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF; corr %LF\n",
tunable, key.netns_cookie, new,
covar_compute(&c), corr);
if (corr > CORR_THRESHOLD) {
new = BPFTUNE_SHRINK_BY_DELTA(event->update[0].old[0]);
budget_usecs_new = BPFTUNE_SHRINK_BY_DELTA(budget_usecs);
scenario = NETDEV_BUDGET_DECREASE;
}
}
ret = bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)&new,
"To maximize # packets processed per NAPI cycle, change %s from (%ld) -> (%ld)\n",
tunable,
event->update[0].old[0],
new);
if (!ret) {
/* update value of netdev_budget for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget,
new);
/* need to also update budget_usecs since both
* limit netdev budget and reaching either limit
* triggers time_squeeze.
*/
ret = bpftuner_tunable_sysctl_write(tuner,
NETDEV_BUDGET_USECS,
scenario,
event->netns_cookie,
1,
&budget_usecs_new,
"To maximize # packets processed per NAPI cycle, change netdev_budget_usecs from (%ld) -> (%ld)\n",
budget_usecs,
budget_usecs_new);
if (!ret)
bpftuner_bpf_var_set(net_buffer, tuner,
netdev_budget_usecs,
budget_usecs_new);
}
break;
}
}
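
To summarize the NETDEV_BUDGET branch above: the handler records (budget, wait/run %) pairs in the correlation map and, when the correlation is strong, flips the scenario from increase to decrease. A self-contained sketch of that decision using a plain Pearson correlation over sample arrays follows; bpftune maintains these statistics incrementally via its corr helpers, and the threshold and sample values here are assumptions for illustration only.

#include <math.h>
#include <stdio.h>

/* If increases in netdev_budget correlate strongly with increases in
 * the scheduler wait/run percentage, back the budget off instead of
 * growing it further.
 */
#define CORR_THRESHOLD_SKETCH 0.75 /* assumed value, not bpftune's constant */

static double pearson(const double *x, const double *y, int n)
{
	double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;

	for (int i = 0; i < n; i++) {
		sx += x[i]; sy += y[i];
		sxx += x[i] * x[i]; syy += y[i] * y[i];
		sxy += x[i] * y[i];
	}
	double cov = sxy - sx * sy / n;
	double varx = sxx - sx * sx / n, vary = syy - sy * sy / n;

	return (varx > 0 && vary > 0) ? cov / sqrt(varx * vary) : 0;
}

int main(void)
{
	/* illustrative netdev_budget values and wait/run % observed after each change */
	double budget[] = { 300, 375, 468, 585, 731 };
	double waitrun[] = { 80, 95, 120, 150, 190 };
	int n = 5;
	double corr = pearson(budget, waitrun, n);

	printf("corr = %.2f -> %s netdev_budget\n", corr,
	       corr > CORR_THRESHOLD_SKETCH ? "shrink" : "grow");
	return 0;
}

Compile with -lm; on the sample data the correlation is close to 1, so the sketch chooses to shrink the budget, mirroring the switch to NETDEV_BUDGET_DECREASE above.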