17 changes: 17 additions & 0 deletions docs/bpftune-net-buffer.rst
@@ -22,9 +22,26 @@ DESCRIPTION
appropriate bit is set in the CPU bitmask to prioritize small
flows for drop avoidance.

When NAPI polls to handle multiple packets, the number of packets
processed is limited by net.core.netdev_budget, while the time spent
is limited by net.core.netdev_budget_usecs. If we hit the packet
limit without exhausting the usecs budget, the time_squeeze softnet
stat is bumped; if we see increases in time_squeeze, we bump
netdev_budget/netdev_budget_usecs.

However, we want to limit such increases if they lead to longer
task scheduling wait times, so we monitor the ratio of time tasks
spend waiting to run versus time spent running across all processors;
if we see a correlation between netdev budget increases and wait/run
ratio increases, the netdev budget is tuned back down.

Tunables:

- net.core.netdev_max_backlog: maximum per-cpu backlog queue length;
default 1024.
- net.core.flow_limit_cpu_bitmap: avoid drops for small flows on
a per-cpu basis; default 0.
- net.core.netdev_budget: maximum number of packets processed in
a NAPI cycle.
- net.core.netdev_budget_usecs: maximum amount of time in microseconds
for a NAPI cycle.
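
As background, the time_squeeze counter the new text refers to can be read from userspace via /proc/net/softnet_stat. Below is a minimal sketch (not part of this patch), assuming the usual layout where each CPU row's third hex column is time_squeeze:

#include <stdio.h>

/* Sum time_squeeze across CPUs from /proc/net/softnet_stat.
 * Assumes the conventional layout: hex columns where the first
 * three fields are processed, dropped and time_squeeze.
 */
int main(void)
{
	unsigned int processed, dropped, squeezed, total = 0;
	char line[512];
	FILE *fp = fopen("/proc/net/softnet_stat", "r");

	if (!fp)
		return 1;
	while (fgets(line, sizeof(line), fp) != NULL) {
		if (sscanf(line, "%x %x %x", &processed, &dropped, &squeezed) == 3)
			total += squeezed;
	}
	fclose(fp);
	printf("total time_squeeze across cpus: %u\n", total);
	return 0;
}

If this total keeps climbing under load, the netdev_budget tuning described above becomes relevant.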
4 changes: 4 additions & 0 deletions include/bpftune/bpftune.bpf.h
@@ -273,6 +273,10 @@ unsigned long bpftune_init_net;

bool debug;

unsigned int bpftune_sample_rate = 4;

#define bpftune_skip_sample(count) ((++count % bpftune_sample_rate) != 0)

#define __barrier asm volatile("" ::: "memory")

#define bpftune_log(...) __bpf_printk(__VA_ARGS__)
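
The bpftune_skip_sample() helper added here is a simple modulo filter over a caller-supplied counter. A small user-space analogue (names are illustrative, not the BPF header itself) showing that with a rate of 4 only every fourth event is processed:

#include <stdio.h>

/* Illustrative stand-in for the header's macro: skip an event unless
 * the counter has advanced by a full sample interval.
 */
static unsigned int sample_rate = 4;
static unsigned long count;

#define skip_sample(c) ((++(c) % sample_rate) != 0)

int main(void)
{
	for (int i = 0; i < 12; i++) {
		if (skip_sample(count))
			continue;
		/* with a rate of 4 and count starting at 0, this prints i = 3, 7, 11 */
		printf("processing event %d\n", i);
	}
	return 0;
}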
1 change: 1 addition & 0 deletions include/bpftune/libbpftune.h
@@ -292,6 +292,7 @@ void bpftune_sysctl_name_to_path(const char *name, char *path, size_t path_sz);
int bpftune_sysctl_read(int netns_fd, const char *name, long *values);
int bpftune_sysctl_write(int netns_fd, const char *name, __u8 num_values, long *values);
int bpftune_snmpstat_read(unsigned long netns_cookie, int family, const char *name, long *value);
int bpftune_sched_wait_run_percent_read(void);
bool bpftune_netns_cookie_supported(void);
int bpftune_netns_set(int fd, int *orig_fd, bool quiet);
int bpftune_netns_info(int pid, int *fd, unsigned long *cookie);
2 changes: 1 addition & 1 deletion src/ip_frag_tuner.c
@@ -92,7 +92,7 @@ void event_handler(struct bpftuner *tuner,
corr = corr_compute(&c);
bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF ; corr %LF\n",
tunable, key.netns_cookie, new, covar_compute(&c), corr);
if (corr > CORR_HIGH_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
if (corr > CORR_THRESHOLD && scenario == IP_FRAG_THRESHOLD_INCREASE) {
scenario = IP_FRAG_THRESHOLD_DECREASE;
new = BPFTUNE_SHRINK_BY_DELTA(old);
}
38 changes: 38 additions & 0 deletions src/libbpftune.c
@@ -1095,6 +1095,44 @@ int bpftune_snmpstat_read(unsigned long netns_cookie, int family,
return err;
}

/* return % of overall wait/run time on all cpus gathered from
* /proc/schedstat ; see https://docs.kernel.org/scheduler/sched-stats.html
* Usually > 100%.
*/
int bpftune_sched_wait_run_percent_read(void)
{
long running = 0, waiting = 0;
FILE *fp = NULL;
char line[1024];
int err = 0;

err = bpftune_cap_add();
if (err)
return err;
fp = fopen("/proc/schedstat", "r");
if (!fp) {
err = -errno;
goto out;
}
while (fgets(line, sizeof(line) - 1, fp) != NULL) {
long cpurunning = 0, cpuwaiting = 0, cputimeslices;

if (sscanf(line, "cpu%*d %*d %*d %*d %*d %*d %*d %ld %ld %ld",
&cpurunning, &cpuwaiting, &cputimeslices) == 3) {
running += cpurunning;
waiting += cpuwaiting;
}
}
bpftune_log(LOG_DEBUG, "sched waiting %ld, running %ld\n", waiting, running);
if (running > 0)
err = (int)((waiting*100)/running);
out:
if (fp)
fclose(fp);
bpftune_cap_drop();
return err;
}

int bpftuner_tunables_init(struct bpftuner *tuner, unsigned int num_descs,
struct bpftunable_desc *descs,
unsigned int num_scenarios,
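
A hedged usage sketch of the new helper (assuming the bpftune headers are installed and the program is linked against libbpftune); it treats values above 100 as a sign that tasks wait longer than they run:

#include <bpftune/libbpftune.h>
#include <stdio.h>

int main(void)
{
	/* Returns waiting time as a percentage of running time, summed
	 * over all CPUs from /proc/schedstat, or a negative errno.
	 */
	int pct = bpftune_sched_wait_run_percent_read();

	if (pct < 0) {
		fprintf(stderr, "could not read schedstat: %d\n", pct);
		return 1;
	}
	/* > 100 means tasks spend more time waiting to run than running */
	printf("wait/run percent: %d%%\n", pct);
	return pct > 100 ? 2 : 0;
}

Build with something like: cc -o waitrun waitrun.c -lbpftune (link name assumed).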
1 change: 1 addition & 0 deletions src/libbpftune.map
@@ -51,6 +51,7 @@ LIBBPFTUNE_0.1.1 {
bpftune_sysctl_read;
bpftune_sysctl_write;
bpftune_snmpstat_read;
bpftune_sched_wait_run_percent_read;
bpftune_netns_init_all;
bpftune_netns_set;
bpftune_netns_info;
85 changes: 66 additions & 19 deletions src/net_buffer_tuner.bpf.c
@@ -30,6 +30,8 @@ __u64 drop_interval_start = 0;
__u64 flow_limit_cpu_bitmap = 0;

int netdev_max_backlog = 0;
int netdev_budget = 0;
int netdev_budget_usecs = 0;

#ifdef BPFTUNE_LEGACY
SEC("kretprobe/enqueue_to_backlog")
@@ -52,7 +54,7 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
drop_count++;

/* only sample subset of drops to reduce overhead. */
if ((drop_count % 4) != 0)
if (bpftune_skip_sample(drop_count))
return 0;

/* if we drop more than 1/16 of the backlog queue size/min,
@@ -64,28 +66,73 @@ int BPF_PROG(bpftune_enqueue_to_backlog, struct sk_buff *skb, int cpu,
drop_count = 1;
drop_interval_start = time;
}
if (drop_count < (max_backlog >> 4))
return 0;

old[0] = max_backlog;
new[0] = BPFTUNE_GROW_BY_DELTA(max_backlog);
send_net_sysctl_event(NULL, NETDEV_MAX_BACKLOG_INCREASE,
NETDEV_MAX_BACKLOG, old, new, &event);
if (drop_count >= (max_backlog >> 4)) {
old[0] = max_backlog;
new[0] = BPFTUNE_GROW_BY_DELTA(max_backlog);
send_net_sysctl_event(NULL, NETDEV_MAX_BACKLOG_INCREASE,
NETDEV_MAX_BACKLOG, old, new, &event);

#ifdef BPFTUNE_LEGACY
int cpu = bpf_get_smp_processor_id();
int cpu = bpf_get_smp_processor_id();
#endif
/* ensure flow limits prioritize small flows on this cpu */
if (cpu < 64) {
cpubit = 1 << cpu;
if (!(flow_limit_cpu_bitmap & cpubit)) {
old[0] = flow_limit_cpu_bitmap;
new[0] = flow_limit_cpu_bitmap |= cpubit;
if (!send_net_sysctl_event(NULL, FLOW_LIMIT_CPU_SET,
FLOW_LIMIT_CPU_BITMAP,
old, new, &event))
flow_limit_cpu_bitmap = new[0];
/* ensure flow limits prioritize small flows on this cpu */
if (cpu < 64) {
cpubit = 1 << cpu;
if (!(flow_limit_cpu_bitmap & cpubit)) {
old[0] = flow_limit_cpu_bitmap;
new[0] = flow_limit_cpu_bitmap |= cpubit;
if (!send_net_sysctl_event(NULL, FLOW_LIMIT_CPU_SET,
FLOW_LIMIT_CPU_BITMAP,
old, new, &event))
flow_limit_cpu_bitmap = new[0];
}
}
}
return 0;
}

#ifndef BPFTUNE_LEGACY

BPF_MAP_DEF(time_squeeze_map, BPF_MAP_TYPE_PERCPU_ARRAY, unsigned int, unsigned int, 1, 0);

extern const struct softnet_data softnet_data __ksym;

__u64 rx_count = 0;

SEC("fexit/net_rx_action")
int BPF_PROG(net_rx_action)
{
struct bpftune_event event = { 0 };
long old[3], new[3];
struct softnet_data *sd;
unsigned int time_squeeze, *last_time_squeezep, last_time_squeeze;
unsigned int zero = 0;

if (bpftune_skip_sample(rx_count))
return 0;
sd = (struct softnet_data *)bpf_this_cpu_ptr(&softnet_data);
if (!sd)
return 0;
time_squeeze = BPFTUNE_CORE_READ(sd, time_squeeze);
if (!time_squeeze)
return 0;
last_time_squeezep = bpf_map_lookup_elem(&time_squeeze_map, &zero);
if (!last_time_squeezep)
return 0;
last_time_squeeze = *last_time_squeezep;
/* if time squeeze increased for every instance of
* net_rx_action() since last sample, we increase.
*/
if (time_squeeze <= (last_time_squeeze + bpftune_sample_rate))
return 0;
*last_time_squeezep = time_squeeze;
/* did not have previous time_squeeze value for comparison, bail. */
if (!(last_time_squeeze))
return 0;
old[0] = (long)netdev_budget;
new[0] = BPFTUNE_GROW_BY_DELTA((long)netdev_budget);
send_net_sysctl_event(NULL, NETDEV_BUDGET_INCREASE,
NETDEV_BUDGET, old, new, &event);
return 0;
}
#endif
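
For readers unfamiliar with the wrappers used above (BPF_MAP_DEF, BPFTUNE_CORE_READ), here is a standalone sketch of the same pattern in plain libbpf style: an fexit hook on net_rx_action that reads the per-CPU softnet_data time_squeeze counter and remembers the last value in a one-element per-CPU array. Program and map names are illustrative, not bpftune's.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

extern const struct softnet_data softnet_data __ksym;

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);
} last_squeeze SEC(".maps");

SEC("fexit/net_rx_action")
int BPF_PROG(watch_time_squeeze)
{
	__u32 zero = 0, squeeze, *lastp;
	struct softnet_data *sd;

	/* per-cpu softnet stats for the CPU this poll ran on */
	sd = (struct softnet_data *)bpf_this_cpu_ptr(&softnet_data);
	if (!sd)
		return 0;
	squeeze = BPF_CORE_READ(sd, time_squeeze);
	lastp = bpf_map_lookup_elem(&last_squeeze, &zero);
	if (!lastp)
		return 0;
	if (*lastp && squeeze > *lastp)
		bpf_printk("time_squeeze grew: %u -> %u", *lastp, squeeze);
	*lastp = squeeze;
	return 0;
}

char _license[] SEC("license") = "GPL";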
106 changes: 95 additions & 11 deletions src/net_buffer_tuner.c
@@ -2,11 +2,13 @@
/* Copyright (c) 2023, Oracle and/or its affiliates. */

#include <bpftune/libbpftune.h>
#include <bpftune/corr.h>
#include "net_buffer_tuner.h"
#include "net_buffer_tuner.skel.h"
#include "net_buffer_tuner.skel.legacy.h"
#include "net_buffer_tuner.skel.nobtf.h"

#include <limits.h>
#include <unistd.h>

struct tcp_buffer_tuner_bpf *skel;
@@ -17,23 +19,35 @@ static struct bpftunable_desc descs[] = {
{ FLOW_LIMIT_CPU_BITMAP,
BPFTUNABLE_SYSCTL, "net.core.flow_limit_cpu_bitmap",
0, 1 },
{ NETDEV_BUDGET, BPFTUNABLE_SYSCTL, "net.core.netdev_budget",
0, 1 },
{ NETDEV_BUDGET_USECS, BPFTUNABLE_SYSCTL, "net.core.netdev_budget_usecs",
0, 1 },
};

static struct bpftunable_scenario scenarios[] = {
{ NETDEV_MAX_BACKLOG_INCREASE, "need to increase max backlog size",
"Need to increase backlog size to prevent drops for faster connection" },
{ FLOW_LIMIT_CPU_SET, "need to set per-cpu bitmap value",
"Need to set flow limit per-cpu to prioritize small flows" }
"Need to set flow limit per-cpu to prioritize small flows" },
{ NETDEV_BUDGET_INCREASE, "need to increase # of packets processed per NAPI poll",
"Need to increase number of packets processed across network devices during NAPI poll to use all of net.core.netdev_budget_usecs" },
{ NETDEV_BUDGET_DECREASE, "need to decrease # of packets processed per NAPI poll",
"Need to decrease netdev_budget[_usecs] since the ratio of time spent waiting to run versus time spent running for tasks has increased as we have increased netdev budget. This indicates either our budget increases directly let to increased wait times for other tasks, or that general load has increased; either way spending too much time in NAPI processing will hurt system performance." }
};

int init(struct bpftuner *tuner)
{
long cpu_bitmap = 0;
long max_backlog = 0;
long budget = 0;
long budget_usecs = 0;
int err;

bpftune_sysctl_read(0, "net.core.flow_limit_cpu_bitmap", &cpu_bitmap);
bpftune_sysctl_read(0, "net.core.netdev_max_backlog", &max_backlog);
bpftune_sysctl_read(0, "net.core.netdev_budget", &budget);
bpftune_sysctl_read(0, "net.core.netdev_budget_usecs", &budget_usecs);
err = bpftuner_bpf_open(net_buffer, tuner);
if (err)
return err;
@@ -44,6 +58,10 @@ int init(struct bpftuner *tuner)
cpu_bitmap);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
max_backlog);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget,
budget);
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget_usecs,
budget_usecs);
err = bpftuner_bpf_attach(net_buffer, tuner);
if (err)
return err;
@@ -62,9 +80,13 @@ void event_handler(struct bpftuner *tuner,
struct bpftune_event *event,
__attribute__((unused))void *ctx)
{
long new, budget_usecs, budget_usecs_new;
int scenario = event->scenario_id;
struct corr c = { 0 };
long double corr = 0;
const char *tunable;
int id;
struct corr_key key;
int id, ret;

/* netns cookie not supported; ignore */
if (event->netns_cookie == (unsigned long)-1)
@@ -73,21 +95,23 @@
id = event->update[0].id;
tunable = bpftuner_tunable_name(tuner, id);
if (!tunable) {
bpftune_log(LOG_DEBUG, "unknown tunable [%d] for tcp_buffer_tuner\n", id);
bpftune_log(LOG_DEBUG, "unknown tunable [%d] for net_buffer_tuner\n", id);
return;
}
switch (id) {
case NETDEV_MAX_BACKLOG:
bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)event->update[0].new,
ret = bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)event->update[0].new,
"Due to excessive drops, change %s from (%ld) -> (%ld)\n",
tunable,
event->update[0].old[0],
tunable,
event->update[0].old[0],
event->update[0].new[0]);
if (!ret) {
/* update value of netdev_max_backlog for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
event->update[0].new[0]);
/* update value of netdev_max_backlog for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_max_backlog,
event->update[0].new[0]);
}
break;
case FLOW_LIMIT_CPU_BITMAP:
bpftuner_tunable_sysctl_write(tuner, id, scenario,
@@ -97,6 +121,66 @@
tunable,
event->update[0].old[0],
event->update[0].new[0]);
break;
case NETDEV_BUDGET:
new = event->update[0].new[0];
if (new > INT_MAX)
break;
budget_usecs = bpftuner_bpf_var_get(net_buffer, tuner,
netdev_budget_usecs);
budget_usecs_new = BPFTUNE_GROW_BY_DELTA(budget_usecs);

ret = bpftune_sched_wait_run_percent_read();
bpftune_log(LOG_DEBUG, "sched wait-run percent : %d\n", ret);
if (ret > 0) {
key.id = (__u64)id;
key.netns_cookie = event->netns_cookie;
if (corr_update_user(tuner->corr_map_fd, key.id,
key.netns_cookie,
(__u64)new, (__u64)ret))
bpftune_log(LOG_DEBUG, "corr map fd %d update failed %d\n",
tuner->corr_map_fd, errno);
}
if (!bpf_map_lookup_elem(tuner->corr_map_fd, &key, &c)) {
corr = corr_compute(&c);
bpftune_log(LOG_DEBUG, "covar for '%s' netns %ld (new %ld): %LF; corr %LF\n",
tunable, key.netns_cookie, new,
covar_compute(&c), corr);
if (corr > CORR_THRESHOLD) {
new = BPFTUNE_SHRINK_BY_DELTA(event->update[0].old[0]);
budget_usecs_new = BPFTUNE_SHRINK_BY_DELTA(budget_usecs);
scenario = NETDEV_BUDGET_DECREASE;
}
}
ret = bpftuner_tunable_sysctl_write(tuner, id, scenario,
event->netns_cookie, 1,
(long int *)&new,
"To maximize # packets processed per NAPI cycle, change %s from (%ld) -> (%ld)\n",
tunable,
event->update[0].old[0],
new);
if (!ret) {
/* update value of netdev_budget for BPF program */
bpftuner_bpf_var_set(net_buffer, tuner, netdev_budget,
new);
/* need to also update budget_usecs since both
* limit netdev budget and reaching either limit
* triggers time_squeeze.
*/
ret = bpftuner_tunable_sysctl_write(tuner,
NETDEV_BUDGET_USECS,
scenario,
event->netns_cookie,
1,
&budget_usecs_new,
"To maximize # packets processed per NAPI cycle, change netdev_budget_usecs from (%ld) -> (%ld)\n",
budget_usecs,
budget_usecs_new);
if (!ret)
bpftuner_bpf_var_set(net_buffer, tuner,
netdev_budget_usecs,
budget_usecs_new);
}
break;
}
}
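
To summarize the NETDEV_BUDGET branch above: the handler records (budget, wait/run %) pairs in the correlation map and, when the correlation is strong, flips the scenario from increase to decrease. A self-contained sketch of that decision using a plain Pearson correlation over sample arrays follows; bpftune maintains these statistics incrementally via its corr helpers, and the threshold and sample values here are assumptions for illustration only.

#include <math.h>
#include <stdio.h>

/* If increases in netdev_budget correlate strongly with increases in
 * the scheduler wait/run percentage, back the budget off instead of
 * growing it further.
 */
#define CORR_THRESHOLD_SKETCH 0.75 /* assumed value, not bpftune's constant */

static double pearson(const double *x, const double *y, int n)
{
	double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;

	for (int i = 0; i < n; i++) {
		sx += x[i]; sy += y[i];
		sxx += x[i] * x[i]; syy += y[i] * y[i];
		sxy += x[i] * y[i];
	}
	double cov = sxy - sx * sy / n;
	double varx = sxx - sx * sx / n, vary = syy - sy * sy / n;

	return (varx > 0 && vary > 0) ? cov / sqrt(varx * vary) : 0;
}

int main(void)
{
	/* illustrative netdev_budget values and wait/run % observed after each change */
	double budget[] = { 300, 375, 468, 585, 731 };
	double waitrun[] = { 80, 95, 120, 150, 190 };
	int n = 5;
	double corr = pearson(budget, waitrun, n);

	printf("corr = %.2f -> %s netdev_budget\n", corr,
	       corr > CORR_THRESHOLD_SKETCH ? "shrink" : "grow");
	return 0;
}

Compile with -lm; on the sample data the correlation is close to 1, so the sketch chooses to shrink the budget, mirroring the switch to NETDEV_BUDGET_DECREASE above.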