From aadb5acd3793b395e59b32061a0bf5e6e8a78d80 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 12:58:13 +0200 Subject: [PATCH 01/46] Add netstacklat example This is a direct copy from bpf-examples but from Simon's devel branch 'netstacklat-groupby' https://github.com/simosund/bpf-examples/tree/netstacklat-groupby/netstacklat Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 400 +++++++++++++++++++++++++++++++++++++ examples/netstacklat.h | 79 ++++++++ examples/netstacklat.yaml | 45 +++++ 3 files changed, 524 insertions(+) create mode 100644 examples/netstacklat.bpf.c create mode 100644 examples/netstacklat.h create mode 100644 examples/netstacklat.yaml diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c new file mode 100644 index 00000000..574cdbd4 --- /dev/null +++ b/examples/netstacklat.bpf.c @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include "vmlinux_local.h" +#include + +#include +#include +#include + +#include "netstacklat.h" +#include "bits.bpf.h" + +char LICENSE[] SEC("license") = "GPL"; + + +volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); +volatile const struct netstacklat_bpf_config user_config = { + .network_ns = 0, + .filter_pid = false, + .filter_ifindex = false, + .filter_cgroup = false, + .filter_nonempty_sockqueue = false, + .groupby_ifindex = false, + .groupby_cgroup = false, +}; + +/* + * Alternative definition of sk_buff to handle renaming of the field + * mono_delivery_time to tstamp_type. See + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +struct sk_buff___old { + union { + ktime_t tstamp; + u64 skb_mstamp_ns; + }; + __u8 mono_delivery_time: 1; +} __attribute__((preserve_access_index)); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); + __type(key, struct hist_key); + __type(value, u64); +} netstack_latency_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, PID_MAX_LIMIT); + __type(key, u32); + __type(value, u64); +} netstack_pidfilter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, IFINDEX_MAX); + __type(key, u32); + __type(value, u64); +} netstack_ifindexfilter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_TRACKED_CGROUPS); + __type(key, u64); + __type(value, u64); +} netstack_cgroupfilter SEC(".maps"); + +static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) +{ + u64 zero = 0; + u64 *val; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + // Key not in map - try insert it and lookup again + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + return bpf_map_lookup_elem(map, key); +} + +static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket) +{ + u32 bucket = log2l(value); + + // Right-inclusive histogram, so "round up" the log value + if (bucket > 0 && 1ULL << bucket < value) + bucket++; + + if (bucket > max_bucket) + bucket = max_bucket; + + return bucket; +} + +/* + * Same call signature as the increment_exp2_histogram_nosync macro from + * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h + * but provided as a function. + * + * Unlike the macro, only works with keys of type struct hist_key. 
The hist_key + * struct must be provided by value (rather than as a pointer) to keep the same + * call signature as the ebpf-exporter macro, although this will get inefficent + * if struct hist_key grows large. + */ +static void increment_exp2_histogram_nosync(void *map, struct hist_key key, + u64 value, u32 max_bucket) +{ + u64 *bucket_count; + + // Increment histogram + key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket); + bucket_count = lookup_or_zeroinit_histentry(map, &key); + if (bucket_count) + (*bucket_count)++; + + // Increment sum at end of histogram + if (value == 0) + return; + + key.bucket = max_bucket + 1; + bucket_count = lookup_or_zeroinit_histentry(map, &key); + if (bucket_count) + *bucket_count += value; +} + +static ktime_t time_since(ktime_t tstamp) +{ + ktime_t now; + + if (tstamp <= 0) + return -1; + + now = bpf_ktime_get_tai_ns() - TAI_OFFSET; + if (tstamp > now) + return -1; + + return now - tstamp; +} + +static void record_latency(ktime_t latency, const struct hist_key *key) +{ + increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency, + HIST_MAX_LATENCY_SLOT); +} + +static void record_latency_since(ktime_t tstamp, const struct hist_key *key) +{ + ktime_t latency = time_since(tstamp); + if (latency >= 0) + record_latency(latency, key); +} + +static bool filter_ifindex(u32 ifindex) +{ + u64 *ifindex_ok; + + if (!user_config.filter_ifindex) + // No ifindex filter - all ok + return true; + + ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex); + if (!ifindex_ok) + return false; + + return *ifindex_ok > 0; +} + +static bool filter_network_ns(u32 ns) +{ + if (user_config.network_ns == 0) + return true; + + return ns == user_config.network_ns; +} + +static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) +{ + /* + * Favor reading from sk due to less redirection (fewer probe reads) + * and skb->dev is not always set. + */ + if (sk) + return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum); + else if (skb) + return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum); + return 0; +} + +static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) +{ + struct hist_key key = { .hook = hook }; + u32 ifindex; + + if (bpf_core_field_exists(skb->tstamp_type)) { + /* + * For kernels >= v6.11 the tstamp_type being non-zero + * (SKB_CLOCK_REALTIME) implies that skb->tstamp holds a + * preserved TX timestamp rather than a RX timestamp. 
See + * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/ + */ + if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0) + return; + + } else { + /* + * For kernels < v6.11, the field was called mono_delivery_time + * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/ + * Kernels < v5.18 do not have the mono_delivery_field either, + * but we do not support those anyways (as they lack the + * bpf_ktime_get_tai_ns helper) + */ + struct sk_buff___old *skb_old = (void *)skb; + if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0) + return; + } + + ifindex = skb->skb_iif; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + + record_latency_since(skb->tstamp, &key); +} + +static bool filter_pid(u32 pid) +{ + u64 *pid_ok; + + if (!user_config.filter_pid) + // No PID filter - all PIDs ok + return true; + + pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid); + if (!pid_ok) + return false; + + return *pid_ok > 0; +} + +static bool filter_cgroup(u64 cgroup_id) +{ + if (!user_config.filter_cgroup) + // No cgroup filter - all cgroups ok + return true; + + return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; +} + +static bool filter_current_task(u64 cgroup) +{ + bool ok = true; + __u32 tgid; + + if (user_config.filter_pid) { + tgid = bpf_get_current_pid_tgid() >> 32; + ok = ok && filter_pid(tgid); + } + + if (user_config.filter_cgroup) + ok = ok && filter_cgroup(cgroup); + + return ok; +} + +/** + * skb_queue_empty - check if a queue is empty + * @list: queue head + * + * Returns true if the queue is empty, false otherwise. + * + * Copied from /include/linux/skbuff.h + */ +static inline int skb_queue_empty(const struct sk_buff_head *list) +{ + return list->next == (const struct sk_buff *)list; +} + +static bool filter_nonempty_sockqueue(struct sock *sk) +{ + if (!user_config.filter_nonempty_sockqueue) + return true; + + return !skb_queue_empty(&sk->sk_receive_queue); +} + +static void record_socket_latency(struct sock *sk, struct sk_buff *skb, + ktime_t tstamp, enum netstacklat_hook hook) +{ + struct hist_key key = { .hook = hook }; + u64 cgroup = 0; + u32 ifindex; + + if (!filter_nonempty_sockqueue(sk)) + return; + + if (user_config.filter_cgroup || user_config.groupby_cgroup) + cgroup = bpf_get_current_cgroup_id(); + + if (!filter_current_task(cgroup)) + return; + + ifindex = skb ? 
skb->skb_iif : sk->sk_rx_dst_ifindex; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + if (user_config.groupby_cgroup) + key.cgroup = cgroup; + + record_latency_since(tstamp, &key); +} + +SEC("fentry/ip_rcv_core") +int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/ip6_rcv_core") +int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/tcp_v4_rcv") +int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/tcp_v6_rcv") +int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/udp_rcv") +int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fentry/udpv6_rcv") +int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fexit/tcp_queue_rcv") +int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) +{ + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); + return 0; +} + +SEC("fexit/__udp_enqueue_schedule_skb") +int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, + struct sk_buff *skb, int retval) +{ + if (retval == 0) + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + return 0; +} + +SEC("fentry/tcp_recv_timestamp") +int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, + struct scm_timestamping_internal *tss) +{ + struct timespec64 *ts = &tss->ts[0]; + record_socket_latency(sk, NULL, + (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, + NETSTACKLAT_HOOK_TCP_SOCK_READ); + return 0; +} + +SEC("fentry/skb_consume_udp") +int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, + int len) +{ + record_socket_latency(sk, skb, skb->tstamp, + NETSTACKLAT_HOOK_UDP_SOCK_READ); + return 0; +} diff --git a/examples/netstacklat.h b/examples/netstacklat.h new file mode 100644 index 00000000..4811da4c --- /dev/null +++ b/examples/netstacklat.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef NETSTACKLAT_H +#define NETSTACKLAT_H + +#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s +/* + * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key" + * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) + * that ebpf_exporter expects for exp2 hists (see how it's used in the + * increment_exp2_histogram_nosync() function) + */ +#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2) + +#define NS_PER_S 1000000000 + +// The highest possible PID on a Linux system (from /include/linux/threads.h) +#define PID_MAX_LIMIT (4 * 1024 * 1024) +// The highest ifindex we expect to encounter +#define IFINDEX_MAX 16384 +// The maximum number of different cgroups we can filter for +#define MAX_TRACKED_CGROUPS 4096 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a > _b 
? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +enum netstacklat_hook { + NETSTACKLAT_HOOK_INVALID = 0, + NETSTACKLAT_HOOK_IP_RCV, + NETSTACKLAT_HOOK_TCP_START, + NETSTACKLAT_HOOK_UDP_START, + NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_TCP_SOCK_READ, + NETSTACKLAT_HOOK_UDP_SOCK_READ, + NETSTACKLAT_N_HOOKS, +}; + +/* + * Key used for the histogram map + * To be compatible with ebpf-exporter, all histograms need a key struct whose final + * member is named "bucket" and is the histogram bucket index. + */ +struct hist_key { + __u64 cgroup; + __u32 ifindex; + __u16 hook; // need well defined size for ebpf-exporter to decode + __u16 bucket; // needs to be last to be compatible with ebpf-exporter +}; + +struct netstacklat_bpf_config { + __u32 network_ns; + bool filter_pid; + bool filter_ifindex; + bool filter_cgroup; + bool filter_nonempty_sockqueue; + bool groupby_ifindex; + bool groupby_cgroup; +}; + +#endif diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml new file mode 100644 index 00000000..3b6e5dc8 --- /dev/null +++ b/examples/netstacklat.yaml @@ -0,0 +1,45 @@ +metrics: + histograms: + - name: netstack_latency_seconds + help: Latency for packets (skbs) to reach various points in the kernel network stack + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: cgroup + size: 8 + decoders: + - name: uint + - name: cgroup + - name: iface + size: 4 + decoders: + # If including output from a different network namespace than ebpf-exporter + # you probably just want to decode as a uint (ifindex) instead + # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others + - name: ifname + - name: hook + size: 2 + decoders: + - name: uint + - name: static_map + static_map: + 1: "ip-start" + 2: "tcp-start" + 3: "udp-start" + 4: "tcp-socket-enqueued" + 5: "udp-socket-enqueued" + 6: "tcp-socket-read" + 7: "udp-socket-read" + - name: bucket + size: 2 + decoders: + - name: uint + +cgroup_id_map: + name: netstack_cgroupfilter + type: hash + regexps: + - ^.*(system.slice/.*)$ + From c891328833f954f082e92067f4df8758de4816dc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 13:11:15 +0200 Subject: [PATCH 02/46] Adjust netstacklat example to compile Keeping this as seperate commit to track what needed to change Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 574cdbd4..c5ce0564 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -#include "vmlinux_local.h" -#include +#include +//#include #include #include From d466045d12e928e7ae9f6459e7f77edc752af874 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 16:29:59 +0200 Subject: [PATCH 03/46] netstacklat: adjustments to make it run Gotcha#1: My devel laptop have TAI offset zero - Other systems (incl prod) all have 37 sec Gotcha#2: RX-timestamping need to be enabled maually - something else have to enable RX-timestamping Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 18 +++++++++++++----- examples/netstacklat.yaml | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) diff 
--git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index c5ce0564..d79eeb7d 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,16 +11,16 @@ char LICENSE[] SEC("license") = "GPL"; - -volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); +// Strange: TAI offset is zero on my test system +volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = false, .filter_cgroup = false, .filter_nonempty_sockqueue = false, - .groupby_ifindex = false, - .groupby_cgroup = false, + .groupby_ifindex = true, + .groupby_cgroup = true, }; /* @@ -36,9 +36,16 @@ struct sk_buff___old { __u8 mono_delivery_time: 1; } __attribute__((preserve_access_index)); +/* NOTICE: max_entries need to be adjusted based on maximum + * number of cgroups and ifindex (that are "groupby" collecting) + * and "enabled" hooks (as we want to disable some) + */ +#define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ +#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ +#define N_IFACES 64 /* On prod only interested in ext0 and vlan100@ext0 */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); + __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64); __type(key, struct hist_key); __type(value, u64); } netstack_latency_seconds SEC(".maps"); @@ -295,6 +302,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, u64 cgroup = 0; u32 ifindex; + // XXX: TODO evaluate if this feature can make overhead acceptable if (!filter_nonempty_sockqueue(sk)) return; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 3b6e5dc8..15082870 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -42,4 +42,5 @@ cgroup_id_map: type: hash regexps: - ^.*(system.slice/.*)$ + - ^.*(user.slice/.*)$ From 22b929e43338a0769758d14611c82161684a1a77 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 18:13:15 +0200 Subject: [PATCH 04/46] netstacklat: enable filter_cgroup and disable other hooks For ebpf_exporter we cannot control which BPF sections gets loaded. Instead we compile time disable some of the hooks via define/ifdef's. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d79eeb7d..00a534cf 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -17,12 +17,18 @@ volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = false, - .filter_cgroup = false, + .filter_cgroup = true, .filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; +/* This provide easy way compile-time to disable some hooks */ +//#define CONFIG_HOOKS_EARLY_RCV 1 +#undef CONFIG_HOOKS_EARLY_RCV +//#define CONFIG_HOOKS_ENQUEUE 1 +#undef CONFIG_HOOKS_ENQUEUE +#define CONFIG_HOOKS_DEQUEUE 1 /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -193,6 +199,7 @@ static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) return 0; } +#if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { struct hist_key key = { .hook = hook }; @@ -233,6 +240,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta record_latency_since(skb->tstamp, &key); } +#endif static bool filter_pid(u32 pid) { @@ -327,6 +335,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, record_latency_since(tstamp, &key); } +#ifdef CONFIG_HOOKS_EARLY_RCV SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) @@ -370,7 +379,9 @@ int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } +#endif /* CONFIG_HOOKS_EARLY_RCV */ +#ifdef CONFIG_HOOKS_ENQUEUE SEC("fexit/tcp_queue_rcv") int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { @@ -386,7 +397,9 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } +#endif /* CONFIG_HOOKS_ENQUEUE */ +#ifdef CONFIG_HOOKS_DEQUEUE SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) @@ -406,3 +419,4 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; } +#endif /* CONFIG_HOOKS_DEQUEUE */ From 50fafda8eca7a6be9a53eb798aad4aaed6aa792b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 18:58:16 +0200 Subject: [PATCH 05/46] netstacklat: disable ifindex filter Instead hardcode ifindex limits based on prod setup. As we don't have a way to configure this via YAML. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 00a534cf..edba867f 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -16,7 +16,7 @@ volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, - .filter_ifindex = false, + .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = false, .groupby_ifindex = true, @@ -29,6 +29,11 @@ volatile const struct netstacklat_bpf_config user_config = { //#define CONFIG_HOOKS_ENQUEUE 1 #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 + +/* Allows to compile-time disable ifindex map as it is large */ +//#define CONFIG_IFINDEX_FILTER_MAP 1 +#undef CONFIG_IFINDEX_FILTER_MAP + /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -63,12 +68,14 @@ struct { __type(value, u64); } netstack_pidfilter SEC(".maps"); +#ifdef CONFIG_IFINDEX_FILTER_MAP struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, IFINDEX_MAX); __type(key, u32); __type(value, u64); } netstack_ifindexfilter SEC(".maps"); +#endif struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -165,17 +172,28 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) static bool filter_ifindex(u32 ifindex) { - u64 *ifindex_ok; - if (!user_config.filter_ifindex) // No ifindex filter - all ok return true; +#ifdef CONFIG_IFINDEX_FILTER_MAP + u64 *ifindex_ok; + ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex); if (!ifindex_ok) return false; return *ifindex_ok > 0; +#else + /* Hack for production: + * - We want to exclude 'lo' which have ifindex==1. + * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5) + */ + if (ifindex > 1 && ifindex < 6) + return true; + + return false; +#endif } static bool filter_network_ns(u32 ns) From 2efbb9092a282fd7eec521c5b3bb754c508e83f9 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 19:20:56 +0200 Subject: [PATCH 06/46] netstacklat: disable PID filtering Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index edba867f..d1984e8b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -30,10 +30,14 @@ volatile const struct netstacklat_bpf_config user_config = { #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 -/* Allows to compile-time disable ifindex map as it is large */ +/* Allows to compile-time disable ifindex map as YAML cannot conf this */ //#define CONFIG_IFINDEX_FILTER_MAP 1 #undef CONFIG_IFINDEX_FILTER_MAP +/* Allows to compile-time disable PID filter map as it is very large */ +//#define CONFIG_PID_FILTER_MAP 1 +#undef CONFIG_PID_FILTER_MAP + /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -61,12 +65,14 @@ struct { __type(value, u64); } netstack_latency_seconds SEC(".maps"); +#ifdef CONFIG_PID_FILTER_MAP struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, PID_MAX_LIMIT); __type(key, u32); __type(value, u64); } netstack_pidfilter SEC(".maps"); +#endif #ifdef CONFIG_IFINDEX_FILTER_MAP struct { @@ -260,6 +266,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta } #endif +#ifdef CONFIG_PID_FILTER_MAP static bool filter_pid(u32 pid) { u64 *pid_ok; @@ -273,7 +280,9 @@ static bool filter_pid(u32 pid) return false; return *pid_ok > 0; + } +#endif /* CONFIG_PID_FILTER_MAP */ static bool filter_cgroup(u64 cgroup_id) { @@ -287,13 +296,15 @@ static bool filter_cgroup(u64 cgroup_id) static bool filter_current_task(u64 cgroup) { bool ok = true; + +#ifdef CONFIG_PID_FILTER_MAP __u32 tgid; if (user_config.filter_pid) { tgid = bpf_get_current_pid_tgid() >> 32; ok = ok && filter_pid(tgid); } - +#endif if (user_config.filter_cgroup) ok = ok && filter_cgroup(cgroup); From 43da77b0b318c38d5f45787a590c94016d386594 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 10:22:14 +0200 Subject: [PATCH 07/46] netstacklat: an idea as a code comment Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d1984e8b..6528a0c1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -324,6 +324,18 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return list->next == (const struct sk_buff *)list; } +/* IDEA: To lower runtime overhead, we could skip recording timestamps for + * sockets with very few packets. + * + * sk_buff_head->qlen could be used to see if e.g. 
queue have more than 2 elements + * + * +static inline __u32 skb_queue_len(const struct sk_buff_head *list_) +{ + return list_->qlen; +} +*/ + static bool filter_nonempty_sockqueue(struct sock *sk) { if (!user_config.filter_nonempty_sockqueue) From dfff4ae0998fba094a7e95b50f411795a7495666 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 10:23:51 +0200 Subject: [PATCH 08/46] netstacklat: limit cgroups to nginx for a test deployment Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 18885872401 run_cnt 71,443,820 = 264.34 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 6124797061 run_cnt 16,324,309 = 375.19 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 15082870..d59d3b6b 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -41,6 +41,9 @@ cgroup_id_map: name: netstack_cgroupfilter type: hash regexps: - - ^.*(system.slice/.*)$ - - ^.*(user.slice/.*)$ - + - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ + - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ +# - ^(/sys/fs/cgroup/production.slice/.*/pingora-backend-router.service).*$ +# - ^(/sys/fs/cgroup/production.slice/.*/pingora-origin.service).*$ +# - ^.*(system.slice/.*)$ +# - ^.*(user.slice/.*)$ From 48c7f20fc5ec68b73bbfed6796cd8b620e645e8d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 11:25:52 +0200 Subject: [PATCH 09/46] netstacklat: evaluate filter_nonempty_sockqueue Something is buggy with this filter - All latency records is on max bucket The READ_ONCE change doesn't fix the issue Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6528a0c1..19ab004b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,7 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, + .filter_nonempty_sockqueue = true, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -311,6 +311,8 @@ static bool filter_current_task(u64 cgroup) return ok; } +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -321,7 +323,7 @@ static bool filter_current_task(u64 cgroup) */ static inline int skb_queue_empty(const struct sk_buff_head *list) { - return list->next == (const struct sk_buff *)list; + return READ_ONCE(list->next) == (const struct sk_buff *)list; } /* IDEA: To lower runtime overhead, we could skip recording timestamps for From a0776a2d44a645c0545bac7571f57330d318722d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 11:45:50 +0200 Subject: [PATCH 10/46] netstacklat: disable filter_nonempty_sockqueue Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 19ab004b..a29dbeda 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,7 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = true, + 
.filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; From de778bc569f0c49ee9e44aa52c9ee89dc0959cfc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 12:08:50 +0200 Subject: [PATCH 11/46] netstacklat: restore TAI offset to 37 sec This was the real reason for seeing wrong numbers in prod. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index a29dbeda..605fbe67 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,8 +11,7 @@ char LICENSE[] SEC("license") = "GPL"; -// Strange: TAI offset is zero on my test system -volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); +volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, From 33965b2c09ad5b9506f8bb9226270a0f6f395c97 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 12:16:08 +0200 Subject: [PATCH 12/46] netstacklat: use ebpf_exporter macros for increment_exp2_histogram_nosync Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 32164560127 run_cnt 116,590,498 = 275.88 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 10490230543 run_cnt 23,993,428 = 437.21 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 48 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 605fbe67..30d24cbe 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -89,6 +89,34 @@ struct { __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +static ktime_t time_since(ktime_t tstamp) +{ + ktime_t now; + + if (tstamp <= 0) + return -1; + + now = bpf_ktime_get_tai_ns() - TAI_OFFSET; + if (tstamp > now) + return -1; + + return now - tstamp; +} + +/* Determine if ebpf_exporter macro or local C implementation is used */ +#define CONFIG_MAP_MACROS 1 +#ifdef CONFIG_MAP_MACROS +#include "maps.bpf.h" +#define _record_latency_since(tstamp, key) \ + ktime_t latency = time_since(tstamp); \ + if (latency >= 0) \ + increment_exp2_histogram_nosync(&netstack_latency_seconds, \ + key, latency, \ + HIST_MAX_LATENCY_SLOT); +#else /* !CONFIG_MAP_MACROS */ +#define _record_latency_since(tstamp, key) \ + record_latency_since(tstamp, &key) + static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) { u64 zero = 0; @@ -148,32 +176,18 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, *bucket_count += value; } -static ktime_t time_since(ktime_t tstamp) -{ - ktime_t now; - - if (tstamp <= 0) - return -1; - - now = bpf_ktime_get_tai_ns() - TAI_OFFSET; - if (tstamp > now) - return -1; - - return now - tstamp; -} - static void record_latency(ktime_t latency, const struct hist_key *key) { increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency, HIST_MAX_LATENCY_SLOT); } - static void record_latency_since(ktime_t tstamp, const struct hist_key *key) { ktime_t latency = time_since(tstamp); if (latency >= 0) record_latency(latency, key); } +#endif /* !CONFIG_MAP_MACROS */ static bool filter_ifindex(u32 ifindex) { @@ -261,7 +275,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (user_config.groupby_ifindex) key.ifindex = ifindex; - record_latency_since(skb->tstamp, &key); + 
_record_latency_since(skb->tstamp, key); } #endif @@ -374,7 +388,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.groupby_cgroup) key.cgroup = cgroup; - record_latency_since(tstamp, &key); + _record_latency_since(tstamp, key); } #ifdef CONFIG_HOOKS_EARLY_RCV From e9ac4cddf1e2e89b3545c3d170a6e083570919c1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 17:20:41 +0200 Subject: [PATCH 13/46] netstacklat: re-evaluate filter_nonempty_sockqueue Moved filter_nonempty_sockqueue to callers - because record_socket_latency() becomes a BPF function-call - perf e.g. shows bpf_prog_fb69587c6ea462b7_record_socket_latency Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 181391511590 run_cnt 788663546 = 229.99 sn Execution runtime: netstacklat_skb_consume_udp - run_time_ns 16212598612 run_cnt 137812779 = 117.64 ns This clearly have a huge improvement for UDP packets. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 30d24cbe..639d090d 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -17,7 +17,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, + .filter_nonempty_sockqueue = true, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -366,10 +366,6 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, u64 cgroup = 0; u32 ifindex; - // XXX: TODO evaluate if this feature can make overhead acceptable - if (!filter_nonempty_sockqueue(sk)) - return; - if (user_config.filter_cgroup || user_config.groupby_cgroup) cgroup = bpf_get_current_cgroup_id(); @@ -460,6 +456,9 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + if (!filter_nonempty_sockqueue(sk)) + return 0; + struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, @@ -471,6 +470,9 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + if (!filter_nonempty_sockqueue(sk)) + return 0; + record_socket_latency(sk, skb, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; From 925debfcf23e86b661a042bf46195dd6c122afbc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 18:06:47 +0200 Subject: [PATCH 14/46] netstacklat: compile time enable filter_nonempty_sockqueue Enable filter_nonempty_sockqueue compile time to make sure that this config setting doesn't influence performance. I'm only seeing a small effect. 
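
For reference, the run_time_ns / run_cnt figures quoted throughout this series come from
the kernel's per-program BPF statistics (accounting is enabled via the
kernel.bpf_stats_enabled sysctl and the counters are shown by bpftool prog show); the
per-call averages in the commit messages are simply their ratio. A minimal helper
illustrating the arithmetic (not part of any patch, names are illustrative):

        /* Average per-invocation runtime from the kernel's BPF stats counters.
         * e.g. patch 13: 181391511590 / 788663546 ~= 229.99 ns
         */
        static double bpf_prog_avg_ns(unsigned long long run_time_ns,
                                      unsigned long long run_cnt)
        {
                return run_cnt ? (double)run_time_ns / (double)run_cnt : 0.0;
        }
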
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 639d090d..4efd64d3 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,6 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = true, +#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1 .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -353,8 +354,10 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) static bool filter_nonempty_sockqueue(struct sock *sk) { +#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE if (!user_config.filter_nonempty_sockqueue) return true; +#endif return !skb_queue_empty(&sk->sk_receive_queue); } From 28c37132c25197930da39dccd7ea49956e481355 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 19:08:59 +0200 Subject: [PATCH 15/46] netstacklat: filter on socket queue length The filter_nonempty_sockqueue() is effecient for UDP packets, but doesn't work well for TCP packets. Add filtering on socket queue lenght. Try filtering if qlen is not above 3 packets for TCP. Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 10690540076 run_cnt 117852699 = 90.71 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 2206621632 run_cnt 20004338 = 110.30 ns This have a HUGE improvement for TCP case. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 4efd64d3..17c69d2c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -340,18 +340,6 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return READ_ONCE(list->next) == (const struct sk_buff *)list; } -/* IDEA: To lower runtime overhead, we could skip recording timestamps for - * sockets with very few packets. - * - * sk_buff_head->qlen could be used to see if e.g. queue have more than 2 elements - * - * -static inline __u32 skb_queue_len(const struct sk_buff_head *list_) -{ - return list_->qlen; -} -*/ - static bool filter_nonempty_sockqueue(struct sock *sk) { #ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE @@ -362,6 +350,22 @@ static bool filter_nonempty_sockqueue(struct sock *sk) return !skb_queue_empty(&sk->sk_receive_queue); } +/* To lower runtime overhead, skip recording timestamps for sockets with very + * few packets. Use sk_buff_head->qlen to see if e.g. 
queue have more than 2 + * elements + */ +static inline __u32 sk_queue_len(const struct sk_buff_head *list_) +{ + return READ_ONCE(list_->qlen); +} + +static bool filter_queue_len(struct sock *sk, const __u32 above_len) +{ + if (sk_queue_len(&sk->sk_receive_queue) > above_len) + return true; + return false; +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook) { @@ -462,6 +466,9 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, if (!filter_nonempty_sockqueue(sk)) return 0; + if (!filter_queue_len(sk, 3)) + return 0; + struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, From d09463e2128dad7ca56c4d4b7a31f3769d56f6d5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 11:41:59 +0200 Subject: [PATCH 16/46] netstacklat: test with cgroup_id_map type to cgrp_storage Leverage BPF_MAP_TYPE_CGRP_STORAGE for our cgroup filter. To evaluate the two different cgroup_id_map types code macro CONFIG_CGRP_STORAGE is introduced, to allow us to switch implementation compile time. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 37 +++++++++++++++++++++++++++++++------ examples/netstacklat.yaml | 2 +- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 17c69d2c..fae71461 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -83,12 +83,23 @@ struct { } netstack_ifindexfilter SEC(".maps"); #endif +/* Eval two different cgroup_id_map types*/ +#define CONFIG_CGRP_STORAGE 1 +#ifdef CONFIG_CGRP_STORAGE struct { - __uint(type, BPF_MAP_TYPE_HASH); + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, u32); + __type(value, u64); +} netstack_cgroupfilter SEC(".maps"); +#else +struct { + __uint(type, BPF_MAP_TYPE_HASH); /* type: hash */ __uint(max_entries, MAX_TRACKED_CGROUPS); __type(key, u64); __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +#endif static ktime_t time_since(ktime_t tstamp) { @@ -298,6 +309,19 @@ static bool filter_pid(u32 pid) } #endif /* CONFIG_PID_FILTER_MAP */ +#ifdef CONFIG_CGRP_STORAGE +static bool filter_cgroup(u64 unused) +{ + if (!user_config.filter_cgroup) + // No cgroup filter - all cgroups ok + return true; + + struct task_struct *task = bpf_get_current_task_btf(); + struct cgroup *cgrp = task->cgroups->dfl_cgrp; + + return bpf_cgrp_storage_get(&netstack_cgroupfilter, cgrp, 0, 0) != NULL; +} +#else /* !CONFIG_CGRP_STORAGE */ static bool filter_cgroup(u64 cgroup_id) { if (!user_config.filter_cgroup) @@ -306,8 +330,9 @@ static bool filter_cgroup(u64 cgroup_id) return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; } +#endif /* !CONFIG_CGRP_STORAGE */ -static bool filter_current_task(u64 cgroup) +static bool filter_current_task() { bool ok = true; @@ -319,9 +344,6 @@ static bool filter_current_task(u64 cgroup) ok = ok && filter_pid(tgid); } #endif - if (user_config.filter_cgroup) - ok = ok && filter_cgroup(cgroup); - return ok; } @@ -376,7 +398,10 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.filter_cgroup || user_config.groupby_cgroup) cgroup = bpf_get_current_cgroup_id(); - if (!filter_current_task(cgroup)) + if (!filter_cgroup(cgroup)) + return; + + if (!filter_current_task()) return; ifindex = skb ? 
skb->skb_iif : sk->sk_rx_dst_ifindex; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index d59d3b6b..dda785e3 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -39,7 +39,7 @@ metrics: cgroup_id_map: name: netstack_cgroupfilter - type: hash + type: cgrp_storage regexps: - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ From 8370c7321d66a5dde535c06654d788fbe18e62a5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 14:05:26 +0200 Subject: [PATCH 17/46] netstacklat: constify user_config and disable filter_nonempty_sockqueue The ebpf_exporter variant of netstacklat is not runtime configurable at BPF-load time. Thus, below user_config isn't define as 'volatile', instead the 'const' allows the compiler to do dead-code elimination. We also disable user_config.filter_nonempty_sockqueue as we want to stress the cgroup lookup types some more. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index fae71461..300632ff 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,14 +11,17 @@ char LICENSE[] SEC("license") = "GPL"; -volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); -volatile const struct netstacklat_bpf_config user_config = { +/* The ebpf_exporter variant of netstacklat is not runtime configurable at + * BPF-load time. Thus, below user_config isn't define as 'volatile', instead + * the 'const' allows the compiler to do dead-code elimination. + */ +const __s64 TAI_OFFSET = (37LL * NS_PER_S); +const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = true, -#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1 + .filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -364,10 +367,8 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) static bool filter_nonempty_sockqueue(struct sock *sk) { -#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE if (!user_config.filter_nonempty_sockqueue) return true; -#endif return !skb_queue_empty(&sk->sk_receive_queue); } @@ -383,6 +384,9 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) static bool filter_queue_len(struct sock *sk, const __u32 above_len) { + if (!user_config.filter_nonempty_sockqueue) + return true; + if (sk_queue_len(&sk->sk_receive_queue) > above_len) return true; return false; From 15525743ad81f48f12d23d0af899d852e2712312 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 14:41:16 +0200 Subject: [PATCH 18/46] netstacklat: move filter_cgroup before record_socket_latency This required changing call signature of filter_cgroup() to also populate the cgroup_id as that is used for groupby_cgroup. We are still using the CONFIG_CGRP_STORAGE that leverages the BPF_MAP_TYPE_CGRP_STORAGE. Without the queue length filters (filter_nonempty_sockqueue) we are getting more calls. This is on purpose to evaluate cgroup_id_map types. 
name netstacklat_tcp_recv_timestamp - run_time_ns 17953390952 run_cnt 55079620 = 325.95 ns netstacklat_skb_consume_udp - run_time_ns 5779869863 run_cnt 11650472 = 496.10 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 47 +++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 300632ff..1b7fb3f7 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -313,25 +313,36 @@ static bool filter_pid(u32 pid) #endif /* CONFIG_PID_FILTER_MAP */ #ifdef CONFIG_CGRP_STORAGE -static bool filter_cgroup(u64 unused) +static bool filter_cgroup(u64 *cgroup_id) { - if (!user_config.filter_cgroup) + if (!user_config.filter_cgroup) { + if (user_config.groupby_cgroup) + *cgroup_id = bpf_get_current_cgroup_id(); // No cgroup filter - all cgroups ok return true; + } struct task_struct *task = bpf_get_current_task_btf(); struct cgroup *cgrp = task->cgroups->dfl_cgrp; + if (user_config.groupby_cgroup) + /* no need to call bpf_get_current_cgroup_id() */ + *cgroup_id = BPF_CORE_READ(cgrp, kn, id); + return bpf_cgrp_storage_get(&netstack_cgroupfilter, cgrp, 0, 0) != NULL; } #else /* !CONFIG_CGRP_STORAGE */ -static bool filter_cgroup(u64 cgroup_id) +static bool filter_cgroup(u64 *cgroup_id) { - if (!user_config.filter_cgroup) + if (!user_config.filter_cgroup) { + if (user_config.groupby_cgroup) + *cgroup_id = bpf_get_current_cgroup_id(); // No cgroup filter - all cgroups ok return true; + } + *cgroup_id = bpf_get_current_cgroup_id(); - return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; + return bpf_map_lookup_elem(&netstack_cgroupfilter, cgroup_id) != NULL; } #endif /* !CONFIG_CGRP_STORAGE */ @@ -393,18 +404,12 @@ static bool filter_queue_len(struct sock *sk, const __u32 above_len) } static void record_socket_latency(struct sock *sk, struct sk_buff *skb, - ktime_t tstamp, enum netstacklat_hook hook) + ktime_t tstamp, enum netstacklat_hook hook, + u64 cgroup_id) { struct hist_key key = { .hook = hook }; - u64 cgroup = 0; u32 ifindex; - if (user_config.filter_cgroup || user_config.groupby_cgroup) - cgroup = bpf_get_current_cgroup_id(); - - if (!filter_cgroup(cgroup)) - return; - if (!filter_current_task()) return; @@ -418,7 +423,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.groupby_ifindex) key.ifindex = ifindex; if (user_config.groupby_cgroup) - key.cgroup = cgroup; + key.cgroup = cgroup_id; _record_latency_since(tstamp, key); } @@ -492,6 +497,11 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + u64 cgroup_id = 0; + + if (!filter_cgroup(&cgroup_id)) + return 0; + if (!filter_nonempty_sockqueue(sk)) return 0; @@ -501,7 +511,7 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, - NETSTACKLAT_HOOK_TCP_SOCK_READ); + NETSTACKLAT_HOOK_TCP_SOCK_READ, cgroup_id); return 0; } @@ -509,11 +519,16 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + u64 cgroup_id = 0; + + if (!filter_cgroup(&cgroup_id)) + return 0; + if (!filter_nonempty_sockqueue(sk)) return 0; record_socket_latency(sk, skb, skb->tstamp, - NETSTACKLAT_HOOK_UDP_SOCK_READ); + NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); return 0; } #endif /* 
CONFIG_HOOKS_DEQUEUE */ From 6c6529f408bb470cc70e2d9fd36ae12e8e9cc6e8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 15:09:18 +0200 Subject: [PATCH 19/46] netstacklat: test cgroup_id_map type hash Changing the filter_cgroup() to use the cgroup_id_map type hash. Surprisingly this type seems to be faster than type cgrp_storage. name netstacklat_tcp_recv_timestamp - run_time_ns 10705407914 run_cnt 41576592 = 257.48 ns - diff: 325.95 - 257.48 = 68.47 ns better name netstacklat_skb_consume_udp - run_time_ns 3716653454 run_cnt 8499677 = 437.27 ns - diff: 496.10 - 437.27 = 58.83 ns better On this AMD CPU with SRSO enabled, we have extra overheads on BPF helper calls. The filter_cgroup() for type cgrp_storage has two extra helper calls, bpf_get_current_task_btf() and bpf_cgrp_storage_get(), but we eliminated the bpf_get_current_cgroup_id() helper call. Still this doesn't fully account for diff. That said, if a BPF-prog already have the struct_task available then the bpf_get_current_task_btf() can also be eliminated, so type cgrp_storage might still be useful in that case Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- examples/netstacklat.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 1b7fb3f7..fedc3243 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -87,7 +87,7 @@ struct { #endif /* Eval two different cgroup_id_map types*/ -#define CONFIG_CGRP_STORAGE 1 +//#define CONFIG_CGRP_STORAGE 1 #ifdef CONFIG_CGRP_STORAGE struct { __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index dda785e3..d59d3b6b 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -39,7 +39,7 @@ metrics: cgroup_id_map: name: netstack_cgroupfilter - type: cgrp_storage + type: hash regexps: - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ From b9c4570e74a5d4cd6608ab73aeebc15cfa533235 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 15:35:14 +0200 Subject: [PATCH 20/46] netstacklat: hide call to get_network_ns() If we have disabled network_ns filtering, the code still does a lookup of the current name space via calling get_network_ns(). Reorg the code to avoid this call if feature is disabled. 
name netstacklat_tcp_recv_timestamp - run_time_ns 10623365578 run_cnt 44842812 = 236.90 ns - diff: 257.48 - 236.90 = 20.58 ns better name netstacklat_skb_consume_udp - run_time_ns 3718153230 run_cnt 9902613 = 375.47 ns - diff: 437.27 - 375.47 = 61.80 ns better Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index fedc3243..07459885 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -230,14 +230,6 @@ static bool filter_ifindex(u32 ifindex) #endif } -static bool filter_network_ns(u32 ns) -{ - if (user_config.network_ns == 0) - return true; - - return ns == user_config.network_ns; -} - static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) { /* @@ -251,6 +243,16 @@ static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) return 0; } +static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) +{ + if (user_config.network_ns == 0) + return true; + + u32 ns = get_network_ns(skb, sk); + + return ns == user_config.network_ns; +} + #if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { @@ -284,7 +286,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_ifindex(ifindex)) return; - if (!filter_network_ns(get_network_ns(skb, sk))) + if (!filter_network_ns(skb, sk)) return; if (user_config.groupby_ifindex) @@ -417,7 +419,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (!filter_ifindex(ifindex)) return; - if (!filter_network_ns(get_network_ns(skb, sk))) + if (!filter_network_ns(skb, sk)) return; if (user_config.groupby_ifindex) From d047606b9930c4c6e6a3929b865aa99b75a11415 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 16:18:14 +0200 Subject: [PATCH 21/46] netstacklat: new user_config for filter_queue_len Let us control the filter_queue_len() via seperate user_config. Also enable this for UDP sockets. Notice the filter_cgroup() is still running first, so this is the primary filter. And filter_nonempty_sockqueue is still false. 
name netstacklat_tcp_recv_timestamp - run_time_ns 15661530364 run_cnt 94922963 = 164.99 ns name netstacklat_skb_consume_udp - run_time_ns 3255451250 run_cnt 14532586 = 224.01 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 +++++++++--- examples/netstacklat.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 07459885..0692da36 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,6 +18,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, + .filter_queue_len = 3, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -395,9 +396,11 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_queue_len(struct sock *sk, const __u32 above_len) +static bool filter_queue_len(struct sock *sk) { - if (!user_config.filter_nonempty_sockqueue) + const u32 above_len = user_config.filter_queue_len; + + if (above_len == 0) return true; if (sk_queue_len(&sk->sk_receive_queue) > above_len) @@ -507,7 +510,7 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, if (!filter_nonempty_sockqueue(sk)) return 0; - if (!filter_queue_len(sk, 3)) + if (!filter_queue_len(sk)) return 0; struct timespec64 *ts = &tss->ts[0]; @@ -529,6 +532,9 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return 0; + if (!filter_queue_len(sk)) + return 0; + record_socket_latency(sk, skb, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); return 0; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 4811da4c..f713f726 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -68,6 +68,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; + __u32 filter_queue_len; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From d9039e5ba71e516bb0b5431f3be1a08e7923a636 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 16:50:50 +0200 Subject: [PATCH 22/46] netstacklat: make filter_socket a common function Verified performance is same as previous patch. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 0692da36..772af789 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -254,7 +254,7 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) return ns == user_config.network_ns; } -#if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) +#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { struct hist_key key = { .hook = hook }; @@ -408,6 +408,23 @@ static bool filter_queue_len(struct sock *sk) return false; } +#if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) +static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, + u64 *cgroup_id) +{ + if (!filter_cgroup(cgroup_id)) + return false; + + if (!filter_nonempty_sockqueue(sk)) + return false; + + if (!filter_queue_len(sk)) + return false; + + return true; +} +#endif + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook, u64 cgroup_id) @@ -498,19 +515,14 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE + SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { u64 cgroup_id = 0; - if (!filter_cgroup(&cgroup_id)) - return 0; - - if (!filter_nonempty_sockqueue(sk)) - return 0; - - if (!filter_queue_len(sk)) + if (!filter_socket(sk, NULL, &cgroup_id)) return 0; struct timespec64 *ts = &tss->ts[0]; @@ -526,13 +538,7 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, { u64 cgroup_id = 0; - if (!filter_cgroup(&cgroup_id)) - return 0; - - if (!filter_nonempty_sockqueue(sk)) - return 0; - - if (!filter_queue_len(sk)) + if (!filter_socket(sk, skb, &cgroup_id)) return 0; record_socket_latency(sk, skb, skb->tstamp, From a2f10c73fa4e9028a308a8bd4ce263612e8f06d5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 17:04:36 +0200 Subject: [PATCH 23/46] netstacklat: first filter for sockets having a queue It is more efficient to first filter out sockets with empty or small queues. This is a tradeoff as our production use-case is to capture when the system becomes overloaded. name netstacklat_tcp_recv_timestamp - run_time_ns 15347880145 run_cnt 177786472 = 86.32 ns name netstacklat_skb_consume_udp - run_time_ns 3096529442 run_cnt 33903931 = 91.33 ns The performance gain is huge. Do remember that this is the average runtime cost that is reduced, because we can skip recording many of these events. 
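A rough way to see why the ordering matters: with the cheap queue-length test first, the
expensive part (cgroup id lookup plus histogram update) only runs for the fraction of
calls that actually have packets queued. A purely illustrative cost model (parameters
are made up, not separately measured):

        /* Average cost per hook invocation, illustrative only: every call pays
         * the queue check, but only pass_rate of them continue into the cgroup
         * lookup + histogram update path.
         */
        static double avg_hook_cost_ns(double queue_check_ns,
                                       double record_path_ns,
                                       double pass_rate)
        {
                return queue_check_ns + pass_rate * record_path_ns;
        }

With a low pass rate this converges towards the cost of the queue check alone, which is
consistent with the TCP hook dropping from ~165 ns to ~86 ns per call.
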
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 772af789..b6ee0ae3 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -412,15 +412,15 @@ static bool filter_queue_len(struct sock *sk) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id) { - if (!filter_cgroup(cgroup_id)) - return false; - if (!filter_nonempty_sockqueue(sk)) return false; if (!filter_queue_len(sk)) return false; + if (!filter_cgroup(cgroup_id)) + return false; + return true; } #endif From 593f76b76f4ac1ec11785fcf1a323827d20084b3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 20:39:33 +0200 Subject: [PATCH 24/46] netstacklat: give credit Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index b6ee0ae3..6d727172 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,4 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a ebpf_exporter variant of the netstacklat tool + * + * Netstacklat - is a tool that "Monitor RX latency within the network stack" + * - https://github.com/xdp-project/bpf-examples/tree/main/netstacklat + * - Developed by Simon Sundberg + * + * This variant have been code optimized heavily towards Cloudflare's use-case. + * Many hooks and features have been disabled, via constructs that lets both the + * compiler and BPF verifier do dead-code elimination. + */ #include //#include From 780984dc5db38854f1ef7c05bcad39f161216794 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 11:30:03 +0200 Subject: [PATCH 25/46] netstacklat: take sk_backlog into account Our checks for empty or almost empty sockets were wrong, because sockets also have a backlog queue. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6d727172..84ff2c8c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This is a ebpf_exporter variant of the netstacklat tool + * This is an ebpf_exporter variant of the netstacklat tool * * Netstacklat - is a tool that "Monitor RX latency within the network stack" * - https://github.com/xdp-project/bpf-examples/tree/main/netstacklat @@ -29,7 +29,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_queue_len = 3, /* zero means filter is inactive */ + .filter_queue_len = 1, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -390,12 +390,24 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return READ_ONCE(list->next) == (const struct sk_buff *)list; } +static inline bool sk_backlog_empty(const struct sock *sk) +{ + return READ_ONCE(sk->sk_backlog.tail) == NULL; +} + static bool filter_nonempty_sockqueue(struct sock *sk) { if (!user_config.filter_nonempty_sockqueue) return true; - return !skb_queue_empty(&sk->sk_receive_queue); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return true; + + /* Packets can also be on the sk_backlog */ + if (!sk_backlog_empty(sk)) + return true; + + return false; } /* To lower runtime overhead, skip recording timestamps for sockets with very @@ -416,6 +428,14 @@ static bool filter_queue_len(struct sock *sk) if (sk_queue_len(&sk->sk_receive_queue) > above_len) return true; + + /* Packets can also be on the sk_backlog, but we don't know the number + * of SKBs on the queue, because sk_backlog.len is in bytes (based on + * skb->truesize). Thus, if any backlog exists we don't filter. + */ + if (!sk_backlog_empty(sk)) + return true; + return false; } @@ -526,7 +546,6 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE - SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) From f8671e35e40c394f75502d4c852321abd2dc1702 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 12:51:51 +0200 Subject: [PATCH 26/46] netstacklat: rename filter_queue_len to filter_min_queue_len This change test to be 'ge' greater-than-or-equal. We want the ability specify 1, meaning a queue size of one and above. Before 1 meant queue size 2 and above. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 ++++++------ examples/netstacklat.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 84ff2c8c..6858f93c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_queue_len = 1, /* zero means filter is inactive */ + .filter_min_queue_len = 1, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -419,14 +419,14 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_queue_len(struct sock *sk) +static bool filter_min_queue_len(struct sock *sk) { - const u32 above_len = user_config.filter_queue_len; + const u32 min_qlen = user_config.filter_min_queue_len; - if (above_len == 0) + if (min_qlen == 0) return true; - if (sk_queue_len(&sk->sk_receive_queue) > above_len) + if (sk_queue_len(&sk->sk_receive_queue) >= min_qlen) return true; /* Packets can also be on the sk_backlog, but we don't know the number @@ -446,7 +446,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return false; - if (!filter_queue_len(sk)) + if (!filter_min_queue_len(sk)) return false; if (!filter_cgroup(cgroup_id)) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index f713f726..ee362924 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -68,7 +68,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; - __u32 filter_queue_len; + __u32 filter_min_queue_len; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From 3c57889e7c2ef5dc2fff94370c1994b4b6432b9d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 14:16:23 +0200 Subject: [PATCH 27/46] netstacklat: add filter for every nth packet Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 35 ++++++++++++++++++++++++++++++++++- examples/netstacklat.h | 1 + 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6858f93c..407b7803 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,8 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_min_queue_len = 1, /* zero means filter is inactive */ + .filter_min_queue_len = 0, /* zero means filter is inactive */ + .filter_nth_packet = 10, /* reduce recorded event to every nth packet */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -116,6 +117,15 @@ struct { } netstack_cgroupfilter SEC(".maps"); #endif +/* Down sample the recorded events to every nth event */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} netstack_nth_filter SEC(".maps"); + + static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -439,6 +449,26 @@ static bool filter_min_queue_len(struct sock *sk) return false; } +static inline bool filter_nth_packet() +{ + u32 key = 0; + u64 *nth; + + /* Zero and one means disabled */ + if (user_config.filter_nth_packet <= 1) + return true; + + nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); 
+ if (!nth) + return false; + + *nth += 1; + if ((*nth % user_config.filter_nth_packet) == 0) { + return true; + } + return false; +} + #if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id) @@ -452,6 +482,9 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_cgroup(cgroup_id)) return false; + if (!filter_nth_packet()) + return false; + return true; } #endif diff --git a/examples/netstacklat.h b/examples/netstacklat.h index ee362924..f4009a44 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -69,6 +69,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; __u32 filter_min_queue_len; + __u64 filter_nth_packet; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From 0127c2c87cf3b9bbd72607151e59e3e5420e2f16 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 14:51:19 +0200 Subject: [PATCH 28/46] netstacklat: let nth filter counter be per hook Let also record_skb_latency() use it. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 65 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 407b7803..03dd51fb 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -30,7 +30,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 10, /* reduce recorded event to every nth packet */ + .filter_nth_packet = 32, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -117,15 +117,14 @@ struct { } netstack_cgroupfilter SEC(".maps"); #endif -/* Down sample the recorded events to every nth event */ +/* Per-CPU counter for down sampling the recorded events to every nth event */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, NETSTACKLAT_N_HOOKS); __type(key, u32); __type(value, u64); } netstack_nth_filter SEC(".maps"); - static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -226,6 +225,26 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) } #endif /* !CONFIG_MAP_MACROS */ +static inline bool filter_nth_packet(const enum netstacklat_hook hook) +{ + u32 key = hook; + u64 *nth; + + /* Zero and one means disabled */ + if (user_config.filter_nth_packet <= 1) + return true; + + nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); + if (!nth) + return false; + + *nth += 1; + if ((*nth % user_config.filter_nth_packet) == 0) { + return true; + } + return false; +} + static bool filter_ifindex(u32 ifindex) { if (!user_config.filter_ifindex) @@ -311,6 +330,9 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_network_ns(skb, sk)) return; + if (!filter_nth_packet(hook)) + return; + if (user_config.groupby_ifindex) key.ifindex = ifindex; @@ -449,29 +471,9 @@ static bool filter_min_queue_len(struct sock *sk) return false; } -static inline bool filter_nth_packet() -{ - u32 key = 0; - u64 *nth; - - /* Zero and one means disabled */ - if (user_config.filter_nth_packet <= 1) - return true; - - nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); - if (!nth) - return false; - - *nth += 1; - if ((*nth % 
user_config.filter_nth_packet) == 0) { - return true; - } - return false; -} - #if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, - u64 *cgroup_id) + u64 *cgroup_id, const enum netstacklat_hook hook) { if (!filter_nonempty_sockqueue(sk)) return false; @@ -482,7 +484,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_cgroup(cgroup_id)) return false; - if (!filter_nth_packet()) + if (!filter_nth_packet(hook)) return false; return true; @@ -583,15 +585,16 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + const enum netstacklat_hook hook = NETSTACKLAT_HOOK_TCP_SOCK_READ; u64 cgroup_id = 0; - if (!filter_socket(sk, NULL, &cgroup_id)) + if (!filter_socket(sk, NULL, &cgroup_id, hook)) return 0; struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, - NETSTACKLAT_HOOK_TCP_SOCK_READ, cgroup_id); + hook, cgroup_id); return 0; } @@ -599,13 +602,13 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + const enum netstacklat_hook hook = NETSTACKLAT_HOOK_UDP_SOCK_READ; u64 cgroup_id = 0; - if (!filter_socket(sk, skb, &cgroup_id)) + if (!filter_socket(sk, skb, &cgroup_id, hook)) return 0; - record_socket_latency(sk, skb, skb->tstamp, - NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); + record_socket_latency(sk, skb, skb->tstamp, hook, cgroup_id); return 0; } #endif /* CONFIG_HOOKS_DEQUEUE */ From 45cd60737ada129d7bf3a708dcb8f65fb9d8046f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 16:05:50 +0200 Subject: [PATCH 29/46] netstacklat: disable UDP hooks via ugly ifdefs For production usage, we want the ability to disable the UDP hooks. Introduce CONFIG_xxx_HOOKS for IP, UDP and TCP. 
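The nth-packet down-sampling filter added a couple of patches above recommends a power-of-two value. A small sketch (ordinary userspace C, values illustrative) of why that matters: for a power-of-two divisor the modulo test is equivalent to a bit-mask, which the compiler can emit instead of a division in the per-packet path when the divisor is a compile-time constant.

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                const uint64_t nth = 32;        /* power of two, as the config comment advises */
                unsigned int sampled = 0;
                uint64_t cnt;

                for (cnt = 1; cnt <= 1024; cnt++) {
                        /* for a power-of-two nth, cnt % nth equals cnt & (nth - 1),
                         * so a mask can replace the division */
                        assert((cnt % nth) == (cnt & (nth - 1)));
                        if ((cnt % nth) == 0)
                                sampled++;
                }
                /* every 32nd event recorded, i.e. a 1/32 = 3.125% sampling rate */
                printf("sampled %u of 1024 events\n", sampled);
                return 0;
        }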
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 03dd51fb..411fab40 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -45,6 +45,10 @@ const struct netstacklat_bpf_config user_config = { //#define CONFIG_HOOKS_ENQUEUE 1 #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 +#define CONFIG_ENABLE_IP_HOOKS 1 +#define CONFIG_ENABLE_TCP_HOOKS 1 +//#define CONFIG_ENABLE_UDP_HOOKS 1 + /* Allows to compile-time disable ifindex map as YAML cannot conf this */ //#define CONFIG_IFINDEX_FILTER_MAP 1 @@ -517,6 +521,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, } #ifdef CONFIG_HOOKS_EARLY_RCV +# ifdef CONFIG_ENABLE_IP_HOOKS SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) @@ -532,7 +537,9 @@ int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } +# endif /* CONFIG_ENABLE_IP_HOOKS */ +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fentry/tcp_v4_rcv") int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) { @@ -546,7 +553,9 @@ int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fentry/udp_rcv") int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) { @@ -560,16 +569,20 @@ int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_EARLY_RCV */ #ifdef CONFIG_HOOKS_ENQUEUE +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fexit/tcp_queue_rcv") int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fexit/__udp_enqueue_schedule_skb") int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, struct sk_buff *skb, int retval) @@ -578,9 +591,11 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) @@ -597,7 +612,9 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, hook, cgroup_id); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) @@ -611,4 +628,5 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, record_socket_latency(sk, skb, skb->tstamp, hook, cgroup_id); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_DEQUEUE */ From 189c61f7c953f3952fe34e1519060e1c43b13bca Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 17:14:38 +0200 Subject: [PATCH 30/46] netstacklat: change time resoultion to usecs For production we need to reduce the number of Prometheus buckets metric. 
As the dequeue hooks are unlikely to see below usecs latencies we reduce the resolution to usecs. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- examples/netstacklat.h | 7 ++++++- examples/netstacklat.yaml | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 411fab40..785266d8 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -140,7 +140,7 @@ static ktime_t time_since(ktime_t tstamp) if (tstamp > now) return -1; - return now - tstamp; + return (now - tstamp) / LATENCY_SCALE; } /* Determine if ebpf_exporter macro or local C implementation is used */ diff --git a/examples/netstacklat.h b/examples/netstacklat.h index f4009a44..0e30da60 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -2,7 +2,12 @@ #ifndef NETSTACKLAT_H #define NETSTACKLAT_H -#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s +/* To reduce Prometheus buckets metric reduce/scale latency time resolution. + * This LATENCY_SCALE is connected to the YAML bucket_multiplier config. + */ +#define LATENCY_SCALE 1000UL + +#define HIST_MAX_LATENCY_SLOT 24 // ( 2^24 ns / 1000) usecs -> ~16.7s /* * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key" * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index d59d3b6b..4cc3cfab 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -4,8 +4,8 @@ metrics: help: Latency for packets (skbs) to reach various points in the kernel network stack bucket_type: exp2 bucket_min: 0 - bucket_max: 34 - bucket_multiplier: 0.000000001 # nanoseconds to seconds + bucket_max: 24 + bucket_multiplier: 0.000001 # microseconds to seconds labels: - name: cgroup size: 8 From f0a947de4a7883f68e40ae447db05f4d4661c559 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 18:52:30 +0200 Subject: [PATCH 31/46] netstacklat: fix hash size for netstack_latency_seconds Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 785266d8..f7003857 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -77,10 +77,10 @@ struct sk_buff___old { */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ #define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ -#define N_IFACES 64 /* On prod only interested in ext0 and vlan100@ext0 */ +#define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64); + __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); __type(key, struct hist_key); __type(value, u64); } netstack_latency_seconds SEC(".maps"); From 0ff2c4160036c792aed581b98c72f4e7a5a8b810 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 19:05:03 +0200 Subject: [PATCH 32/46] netstacklat: remove some tabs and change comment style When importing this into ebpf_exporter we need to reformat the C-code, which is done via command line: clang-format -i configs/netstacklat.{h,bpf.c} But it doesn't convert these macros and comments. 
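To make the earlier time-resolution change concrete, a small sketch (plain C, not the program's own bucketing code) of what dividing by LATENCY_SCALE does: samples are bucketed in microseconds, the YAML bucket_multiplier of 0.000001 converts bucket boundaries back to seconds, and slot 24 tops out at 2^24 us, the ~16.7s noted in the header comment. The floor-log2 helper here is a simplification; the exporter's own bucketing may round differently.

        #include <stdio.h>

        #define LATENCY_SCALE 1000UL
        #define HIST_MAX_LATENCY_SLOT 24

        /* floor(log2()) via a plain loop; illustrative only */
        static unsigned int log2_u64(unsigned long long v)
        {
                unsigned int b = 0;

                while (v >>= 1)
                        b++;
                return b;
        }

        int main(void)
        {
                unsigned long long latency_ns = 250000;                 /* 250 us */
                unsigned long long scaled = latency_ns / LATENCY_SCALE; /* 250 */

                printf("scaled sample: %llu us -> around bucket %u\n",
                       scaled, log2_u64(scaled));
                printf("max measurable: %.2f s\n",
                       (double)(1ULL << HIST_MAX_LATENCY_SLOT) / 1e6);
                return 0;
        }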
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index f7003857..98fa547b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,7 +11,6 @@ * compiler and BPF verifier do dead-code elimination. */ #include -//#include #include #include @@ -40,23 +39,23 @@ const struct netstacklat_bpf_config user_config = { }; /* This provide easy way compile-time to disable some hooks */ -//#define CONFIG_HOOKS_EARLY_RCV 1 -#undef CONFIG_HOOKS_EARLY_RCV -//#define CONFIG_HOOKS_ENQUEUE 1 -#undef CONFIG_HOOKS_ENQUEUE -#define CONFIG_HOOKS_DEQUEUE 1 -#define CONFIG_ENABLE_IP_HOOKS 1 -#define CONFIG_ENABLE_TCP_HOOKS 1 -//#define CONFIG_ENABLE_UDP_HOOKS 1 +/* #define CONFIG_HOOKS_EARLY_RCV 1 */ +#undef CONFIG_HOOKS_EARLY_RCV +/* #define CONFIG_HOOKS_ENQUEUE 1 */ +#undef CONFIG_HOOKS_ENQUEUE +#define CONFIG_HOOKS_DEQUEUE 1 +#define CONFIG_ENABLE_IP_HOOKS 1 +#define CONFIG_ENABLE_TCP_HOOKS 1 +/* #define CONFIG_ENABLE_UDP_HOOKS 1 */ /* Allows to compile-time disable ifindex map as YAML cannot conf this */ -//#define CONFIG_IFINDEX_FILTER_MAP 1 -#undef CONFIG_IFINDEX_FILTER_MAP +/* #define CONFIG_IFINDEX_FILTER_MAP 1 */ +#undef CONFIG_IFINDEX_FILTER_MAP /* Allows to compile-time disable PID filter map as it is very large */ -//#define CONFIG_PID_FILTER_MAP 1 -#undef CONFIG_PID_FILTER_MAP +/* #define CONFIG_PID_FILTER_MAP 1 */ +#undef CONFIG_PID_FILTER_MAP /* * Alternative definition of sk_buff to handle renaming of the field @@ -104,7 +103,7 @@ struct { #endif /* Eval two different cgroup_id_map types*/ -//#define CONFIG_CGRP_STORAGE 1 +/* #define CONFIG_CGRP_STORAGE 1 */ #ifdef CONFIG_CGRP_STORAGE struct { __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ From b54fdbf1b5c5e5d4fabe9bdd058ee665d83e39fb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 28 Aug 2025 09:44:15 +0200 Subject: [PATCH 33/46] netstacklat: add reminder to update N_CGROUPS Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 4cc3cfab..657230b5 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -37,6 +37,7 @@ metrics: decoders: - name: uint +# Remember to update #define N_CGROUPS in code when adding more matches cgroup_id_map: name: netstack_cgroupfilter type: hash From e813266b4e522c606d58a530b4c509ebf1516bf5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 28 Aug 2025 09:59:31 +0200 Subject: [PATCH 34/46] netstacklat: reduce hash size for netstack_latency_seconds We are currently only using a single hook so change N_HOOKS that is used in the max_entries calc of netstack_latency_seconds. Detect if other hooks gets enabled and make compile fail. 
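For reference, the max_entries arithmetic behind the two sizing patches above, assuming HIST_NBUCKETS is HIST_MAX_LATENCY_SLOT + 2 (one slot per power-of-two bucket plus the sum bucket), as the header comment suggests; that assumption is the only value not visible in the diffs. The map is a PERCPU_HASH, so this entry count is replicated per CPU.

        #include <stdio.h>

        #define HIST_MAX_LATENCY_SLOT 24
        #define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2)  /* assumption, see note above */
        #define N_HOOKS 1       /* only tcp-socket-read left enabled */
        #define N_CGROUPS 2
        #define N_IFACES 6

        int main(void)
        {
                printf("max_entries = %d\n",
                       HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); /* 312 */
                return 0;
        }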
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 98fa547b..5a07ab87 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -48,7 +48,6 @@ const struct netstacklat_bpf_config user_config = { #define CONFIG_ENABLE_TCP_HOOKS 1 /* #define CONFIG_ENABLE_UDP_HOOKS 1 */ - /* Allows to compile-time disable ifindex map as YAML cannot conf this */ /* #define CONFIG_IFINDEX_FILTER_MAP 1 */ #undef CONFIG_IFINDEX_FILTER_MAP @@ -72,11 +71,15 @@ struct sk_buff___old { /* NOTICE: max_entries need to be adjusted based on maximum * number of cgroups and ifindex (that are "groupby" collecting) - * and "enabled" hooks (as we want to disable some) + * and "enabled" hooks. */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ -#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ #define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ +#define N_HOOKS 1 +#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE || CONFIG_ENABLE_UDP_HOOKS) +#err "Please update N_HOOKS" +#endif + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); From da2d60942ce7b834daa814ac8482c6765e87c338 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 10:53:22 +0200 Subject: [PATCH 35/46] netstacklat: atomic counter for filter_nth_packet We unfortunately need atomic counter update for the nth-filter. This is because hooks like tcp-socket-read runs outside the socket lock in a preempt/migrate-able user context. We don't need accurate nth-counter across CPU, as this is just a down-sampling mechanism. Thus, we keep the PERCPU array map and have nth-counter on a per CPU basis. The trick here is that in most cases the counter is only used by the current running CPU, and the cache-line will mostly be in a cache coherency Exclusive/Modified (MOESI) state, which will cost less when doing atomic updates. Manually testing on production showed 7ns runtime increase (before 150.88 ns, after 157.67 ns). Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 5a07ab87..f5c97063 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -234,6 +234,7 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) static inline bool filter_nth_packet(const enum netstacklat_hook hook) { u32 key = hook; + u64 pkt_cnt; u64 *nth; /* Zero and one means disabled */ @@ -244,8 +245,12 @@ static inline bool filter_nth_packet(const enum netstacklat_hook hook) if (!nth) return false; - *nth += 1; - if ((*nth % user_config.filter_nth_packet) == 0) { + /* The hooks (like tcp-socket-read) runs outside the socket lock in a + * preempt/migrate-able user context. Thus, atomic updates are needed + * for correctness, but keep PERCPU map to limit cache-line bouncing. 
+ */ + pkt_cnt = __sync_fetch_and_add(nth, 1); + if ((pkt_cnt % user_config.filter_nth_packet) == 0) { return true; } return false; From e3a5f589fa9227792081d4695ec297b5b05bc5aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:03:47 +0200 Subject: [PATCH 36/46] netstacklat: Experiment with 2 nth packet sampling Sample every 2-nth packet (50%) - Overhead: 6039419756 / 32219314 = 187.44 ns Compared to local atomic-nth overead: 157.67 ns - approx 30 ns extra cost to sample 50% vs 3.12% Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index f5c97063..3cf786ea 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 32, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 2, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, From 2e7b39f2cc0cca3701f009455df46d03fd44c6e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:31:57 +0200 Subject: [PATCH 37/46] netstacklat: Experiment with 4 nth packet sampling name netstacklat_tcp_recv_timestamp - run_time_ns 17510185954 run_cnt 101083454 = 173.23 ns Sample every 4-nth packet (25%) - Overhead: 173.23 ns - Compared to nth-2 (187.44 ns) saved 14.21 ns (187.44-173.23) - Compared to nth-32 (157.67 ns) cost 15.56 ns more (173.23-157.67) Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 3cf786ea..21791d52 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 2, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 4, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, From f5e1baa751444cb43729efec4732e71228fece4c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:08:38 +0200 Subject: [PATCH 38/46] netstacklat: experiment with disabling nth-filter name netstacklat_tcp_recv_timestamp - run_time_ns 24383044912 run_cnt 121125888 = 201.30 ns Compared to - nth-2 : 186 ns -> +15 ns - nth-4 : 173 ns -> +28 ns - nth-32 : 157 ns -> +44 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 21791d52..efa0c8b1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 4, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 0, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, 
.filter_ifindex = true, .filter_cgroup = true, From febd9b7225bab56d058cf841e6d97a5224a5b47d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 14:10:39 +0200 Subject: [PATCH 39/46] netstacklat: disable user_config.groupby_ifindex Because production have too little traffic on the internal interface that latency stats becomes unusable. Via CONFIG_GROUPBY_IFINDEX also remove this from the hist_key and note that YAML file also needs adjustments. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 11 ++++++++++- examples/netstacklat.h | 9 ++++++++- examples/netstacklat.yaml | 15 ++++++++------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index efa0c8b1..a58b771a 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -34,7 +34,7 @@ const struct netstacklat_bpf_config user_config = { .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = false, - .groupby_ifindex = true, + .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, }; @@ -74,8 +74,13 @@ struct sk_buff___old { * and "enabled" hooks. */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ +#ifdef CONFIG_GROUPBY_IFINDEX #define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ +#else +#define N_IFACES 1 /* With groupby_ifindex==false */ +#endif #define N_HOOKS 1 + #if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE || CONFIG_ENABLE_UDP_HOOKS) #err "Please update N_HOOKS" #endif @@ -344,8 +349,10 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_nth_packet(hook)) return; +#ifdef CONFIG_GROUPBY_IFINDEX if (user_config.groupby_ifindex) key.ifindex = ifindex; +#endif _record_latency_since(skb->tstamp, key); } @@ -519,8 +526,10 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (!filter_network_ns(skb, sk)) return; +#ifdef CONFIG_GROUPBY_IFINDEX if (user_config.groupby_ifindex) key.ifindex = ifindex; +#endif if (user_config.groupby_cgroup) key.cgroup = cgroup_id; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 0e30da60..bbe8ad7d 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -59,6 +59,11 @@ enum netstacklat_hook { NETSTACKLAT_N_HOOKS, }; +/* Disabling user_config.groupby_ifindex requires modifying hist_key and YAML + */ +//#define CONFIG_GROUPBY_IFINDEX 1 +#undef CONFIG_GROUPBY_IFINDEX + /* * Key used for the histogram map * To be compatible with ebpf-exporter, all histograms need a key struct whose final @@ -66,10 +71,12 @@ enum netstacklat_hook { */ struct hist_key { __u64 cgroup; +#ifdef CONFIG_GROUPBY_IFINDEX __u32 ifindex; +#endif __u16 hook; // need well defined size for ebpf-exporter to decode __u16 bucket; // needs to be last to be compatible with ebpf-exporter -}; +} __attribute__((packed)); struct netstacklat_bpf_config { __u32 network_ns; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 657230b5..ec98efc6 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -12,13 +12,14 @@ metrics: decoders: - name: uint - name: cgroup - - name: iface - size: 4 - decoders: - # If including output from a different network namespace than ebpf-exporter - # you probably just want to decode as a uint (ifindex) instead - # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others - - name: ifname +# See: 
CONFIG_GROUPBY_IFINDEX +# - name: iface +# size: 4 +# decoders: +# # If including output from a different network namespace than ebpf-exporter +# # you probably just want to decode as a uint (ifindex) instead +# # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others +# - name: ifname - name: hook size: 2 decoders: From 10a331e62db90a22fe5437a90676e3d53e56d142 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 14:41:42 +0200 Subject: [PATCH 40/46] netstacklat: minor sync with upstream bpf-examples version The upstream version of netstacklat that we are based on got merged see PR#129. https://github.com/xdp-project/bpf-examples/pull/129 Some adjustments were made, so lets sync with these to avoid diverting too much from upstream. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 19 ++++++++----------- examples/netstacklat.h | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index a58b771a..5199f652 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -19,6 +19,8 @@ #include "netstacklat.h" #include "bits.bpf.h" +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + char LICENSE[] SEC("license") = "GPL"; /* The ebpf_exporter variant of netstacklat is not runtime configurable at @@ -28,7 +30,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_min_queue_len = 0, /* zero means filter is inactive */ + .filter_min_sockqueue_len = 0, /* zero means filter is inactive */ .filter_nth_packet = 0, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, @@ -122,7 +124,7 @@ struct { #else struct { __uint(type, BPF_MAP_TYPE_HASH); /* type: hash */ - __uint(max_entries, MAX_TRACKED_CGROUPS); + __uint(max_entries, MAX_PARSED_CGROUPS); __type(key, u64); __type(value, u64); } netstack_cgroupfilter SEC(".maps"); @@ -305,9 +307,7 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) if (user_config.network_ns == 0) return true; - u32 ns = get_network_ns(skb, sk); - - return ns == user_config.network_ns; + return get_network_ns(skb, sk) == user_config.network_ns; } #if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE) @@ -372,7 +372,6 @@ static bool filter_pid(u32 pid) return false; return *pid_ok > 0; - } #endif /* CONFIG_PID_FILTER_MAP */ @@ -425,8 +424,6 @@ static bool filter_current_task() return ok; } -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -469,9 +466,9 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_min_queue_len(struct sock *sk) +static bool filter_min_sockqueue_len(struct sock *sk) { - const u32 min_qlen = user_config.filter_min_queue_len; + const u32 min_qlen = user_config.filter_min_sockqueue_len; if (min_qlen == 0) return true; @@ -496,7 +493,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return false; - if (!filter_min_queue_len(sk)) + if (!filter_min_sockqueue_len(sk)) return false; if (!filter_cgroup(cgroup_id)) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index bbe8ad7d..bfe4f7ea 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -23,7 +23,7 @@ // The highest 
ifindex we expect to encounter #define IFINDEX_MAX 16384 // The maximum number of different cgroups we can filter for -#define MAX_TRACKED_CGROUPS 4096 +#define MAX_PARSED_CGROUPS 4096 #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -80,7 +80,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; - __u32 filter_min_queue_len; + __u32 filter_min_sockqueue_len; __u64 filter_nth_packet; bool filter_pid; bool filter_ifindex; From 7e2c734b2587ba555d2d183be3ea5c4968b98302 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 14:49:19 +0200 Subject: [PATCH 41/46] netstacklat: upstream removed filter_nonempty_sockqueue As the filter_min_sockqueue_len can replaced it. This was also part of PR#129 merge, but it makes it easier to review, to keep this in a seperate commit. https://github.com/xdp-project/bpf-examples/pull/129 Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 32 -------------------------------- examples/netstacklat.h | 1 - 2 files changed, 33 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 5199f652..51200479 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -35,7 +35,6 @@ const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, }; @@ -424,39 +423,11 @@ static bool filter_current_task() return ok; } -/** - * skb_queue_empty - check if a queue is empty - * @list: queue head - * - * Returns true if the queue is empty, false otherwise. - * - * Copied from /include/linux/skbuff.h - */ -static inline int skb_queue_empty(const struct sk_buff_head *list) -{ - return READ_ONCE(list->next) == (const struct sk_buff *)list; -} - static inline bool sk_backlog_empty(const struct sock *sk) { return READ_ONCE(sk->sk_backlog.tail) == NULL; } -static bool filter_nonempty_sockqueue(struct sock *sk) -{ - if (!user_config.filter_nonempty_sockqueue) - return true; - - if (!skb_queue_empty(&sk->sk_receive_queue)) - return true; - - /* Packets can also be on the sk_backlog */ - if (!sk_backlog_empty(sk)) - return true; - - return false; -} - /* To lower runtime overhead, skip recording timestamps for sockets with very * few packets. Use sk_buff_head->qlen to see if e.g. 
queue have more than 2 * elements @@ -490,9 +461,6 @@ static bool filter_min_sockqueue_len(struct sock *sk) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id, const enum netstacklat_hook hook) { - if (!filter_nonempty_sockqueue(sk)) - return false; - if (!filter_min_sockqueue_len(sk)) return false; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index bfe4f7ea..d4a40f7a 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -85,7 +85,6 @@ struct netstacklat_bpf_config { bool filter_pid; bool filter_ifindex; bool filter_cgroup; - bool filter_nonempty_sockqueue; bool groupby_ifindex; bool groupby_cgroup; }; From b9cbe809d077936bea3d2924fc004b7fd16adcfb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 15:39:55 +0200 Subject: [PATCH 42/46] netstacklat: comply with clang format requirements Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index d4a40f7a..88b9b350 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -61,7 +61,7 @@ enum netstacklat_hook { /* Disabling user_config.groupby_ifindex requires modifying hist_key and YAML */ -//#define CONFIG_GROUPBY_IFINDEX 1 +/* #define CONFIG_GROUPBY_IFINDEX 1 */ #undef CONFIG_GROUPBY_IFINDEX /* From ba856d8416fbf40fcb77761eb08b0a2203270e8e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 16 Sep 2025 16:29:15 +0200 Subject: [PATCH 43/46] netstacklat: relax ifindex filter due to production Lacking a YAML ebpf_exporter config for selecting iface names we hard-coded ifindex, but some production servers have higher ifindex for vlan100. Relax ifindex range as a workaround. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 51200479..d99b9f2f 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -280,8 +280,10 @@ static bool filter_ifindex(u32 ifindex) /* Hack for production: * - We want to exclude 'lo' which have ifindex==1. * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5) + * unfortunately ifindex'es are not stable, some production metals have + * ifindex==6 for vlan100@link0. Relax filter until adding YAML config. */ - if (ifindex > 1 && ifindex < 6) + if (ifindex > 1 && ifindex < 12) return true; return false; From 885f1016902113ebefc485130af5e44b32060704 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Wed, 29 Oct 2025 16:09:44 +0100 Subject: [PATCH 44/46] netstacklat: Exclude TCP reads for HOL blocked segments The 'tcp-socket-read' currently reports the latency for the skb containing the last TCP segment read from the socket. However, this segment might have been head of line (HOL) blocked by a previous segment missing. In this case, netstacklat's reported latency will include HOL blocking periods that is dependent on external factors (such as network packet loss, and network latency impacts retransmission time). As netstacklat is primarily intended to identify issues within the local host (in the network stack or receiving applications), by default filter out any socket reads were the last read SKB might have experienced HOL-blocking. Add the new -y/--include-tcp-hol-delay option to retain the old behavior of reporting latency for all reads, including those that are HOL-blocked. 
This may be useful in some scenarios where you still want to be aware of latency issues caused by HOL-blocking, even though it is caused by external components. For example, in a data center context where you have full control over the network, it may still be relevant to monitor HOL-blocking caused by the network.

To exclude HOL-blocked reads, detect if any new ooo-segments have arrived by checking for differences in the number of ooo-packets in tcp_sock->rcv_ooopack. If any new ooo-segments have arrived, exclude the latency sample from the current read and set a limit for the next safe sequence number to read, where the current ooo-packets must have been passed so segments can no longer be HOL-blocked. If there are skbs in the ooo-queue, set the limit to the end of the ooo-queue. Otherwise, set the limit to the current rcv_nxt (as, if the ooo-queue is empty, the detected ooo-segments must already have been merged into the receive queue and rcv_nxt must have advanced past them). If the read is past the safe sequence limit and no new ooo-segments have arrived, it is safe to start including the latency samples again.

For sockets where some ooo-segments have been observed, keep the ooo-range state in socket storage (BPF_MAP_TYPE_SK_STORAGE). Skip protecting this state with a spin-lock, as it should only be concurrently accessed if there are concurrent reads on the same TCP socket, which is assumed to be very rare, as applications attempting that cannot know which part of the data each of their concurrent reads will get.

There are some scenarios that may cause this ooo-filtering to fail.

- If multiple reads are done on the socket concurrently, we may not correctly track the last read byte. The kernel does not hold the socket lock at the time our hooked function tcp_recv_timestamp() runs. If two reads are done in parallel, it is therefore possible that for both reads we check the last read byte (tcp_sock.copied_seq) after the second read has updated it. We may then incorrectly conclude that the first read was ahead of the ooo-range when it was not, and record its latency when we should have excluded it. In practice I believe this issue should be quite rare, as most applications will probably not attempt to perform multiple concurrent reads on a single connected TCP socket in parallel (as then you cannot know which part of the payload the parallel reads will return).

- As tcp_recv_timestamp() runs outside of the socket lock, the various state members we access may be updated concurrently while we are attempting to read them. An especially problematic one is tcp_sock.ooo_last_skb, which keeps a pointer to an SKB that is only valid while the ooo-queue is non-empty. It is possible that between our check that the ooo-queue is non-empty and our following of the ooo_last_skb pointer, the ooo-queue is cleared and the ooo_last_skb pointer ends up pointing towards a freed SKB. If the socket members we access are updated before or while we read them, it can break the filtering in numerous ways, e.g. including samples that should have been excluded (due to e.g. copied_seq being updated before our read) or excluding a large amount of valid samples (due to e.g. setting a sequence limit based on garbage in a freed SKB).
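The diff below compares TCP sequence numbers with a u32_lt() helper based on the kernel's before(). As a quick sanity check of why the signed-difference cast handles 32-bit wrap-around, a standalone sketch (illustrative test values only):

        #include <assert.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* same signed-difference trick as the helper added below */
        static bool u32_lt(uint32_t a, uint32_t b)
        {
                return (int32_t)(a - b) < 0;
        }

        int main(void)
        {
                /* near a wrap: 0xfffffff0 is "before" 0x10 in sequence space,
                 * even though a plain unsigned comparison says otherwise */
                assert(u32_lt(0xfffffff0u, 0x10u));

                /* the ordinary cases still behave as expected */
                assert(u32_lt(100, 200));
                assert(!u32_lt(200, 100));

                printf("u32 wrap-around comparisons behave as expected\n");
                return 0;
        }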
Signed-off-by: Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 128 +++++++++++++++++++++++++++++++++++++ examples/netstacklat.h | 1 + 2 files changed, 129 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d99b9f2f..58eb88a1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -21,6 +21,10 @@ #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +// Mimic macros from /include/net/tcp.h +#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + char LICENSE[] SEC("license") = "GPL"; /* The ebpf_exporter variant of netstacklat is not runtime configurable at @@ -37,6 +41,7 @@ const struct netstacklat_bpf_config user_config = { .filter_cgroup = true, .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, + .include_hol_blocked = false, }; /* This provide easy way compile-time to disable some hooks */ @@ -86,6 +91,13 @@ struct sk_buff___old { #err "Please update N_HOOKS" #endif +struct tcp_sock_ooo_range { + u32 prev_n_ooopkts; + u32 ooo_seq_end; + /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ + bool active; +}; + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); @@ -151,6 +163,22 @@ static ktime_t time_since(ktime_t tstamp) return (now - tstamp) / LATENCY_SCALE; } +/* + * Is a < b considering u32 wrap around? + * Based on the before() function in /include/net/tcp.h + */ +static bool u32_lt(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tcp_sock_ooo_range); +} netstack_tcp_ooo_range SEC(".maps"); + /* Determine if ebpf_exporter macro or local C implementation is used */ #define CONFIG_MAP_MACROS 1 #ifdef CONFIG_MAP_MACROS @@ -476,6 +504,102 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, } #endif +static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) +{ + struct tcp_skb_cb cb; + u32 max_seq = 0; + int err = 0; + + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { + /* No ooo-segments currently in ooo-queue + * Any ooo-segments must already have been merged to the + * receive queue. Current rcv_nxt must therefore be ahead + * of all ooo-segments that have arrived until now. + */ + err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); + if (err) + bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", + err); + } else { + /* + * Some ooo-segments currently in ooo-queue + * Max out-of-order seq is given by the seq_end of the tail + * skb in the ooo-queue. 
+ */ + err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); + if (err) + bpf_printk( + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", + err); + max_seq = cb.end_seq; + } + + *seq = max_seq; + return err; +} + +static bool tcp_read_in_ooo_range(struct tcp_sock *tp, + struct tcp_sock_ooo_range *ooo_range) +{ + u32 read_seq; + int err; + + if (!ooo_range->active) + return false; + + err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); + if (err) { + bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + return true; // Assume we may be in ooo-range + } + + if (u32_lt(ooo_range->ooo_seq_end, read_seq)) { + ooo_range->active = false; + return false; + } else { + return true; + } +} + +static bool tcp_read_maybe_holblocked(struct sock *sk) +{ + struct tcp_sock_ooo_range *ooo_range; + struct tcp_sock *tp = tcp_sk(sk); + u32 n_ooopkts, nxt_seq; + int err; + + err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", + err); + return true; // Assume we may be in ooo-range + } + + if (n_ooopkts == 0) + return false; + + ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!ooo_range) { + bpf_printk( + "failed getting ooo-range socket storage for tcp socket"); + return true; // Assume we may be in ooo-range + } + + // Increase in ooo-packets since last - figure out next safe seq + if (n_ooopkts > ooo_range->prev_n_ooopkts) { + ooo_range->prev_n_ooopkts = n_ooopkts; + err = current_max_possible_ooo_seq(tp, &nxt_seq); + if (!err) { + ooo_range->ooo_seq_end = nxt_seq; + ooo_range->active = true; + } + return true; + } + + return tcp_read_in_ooo_range(tp, ooo_range); +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook, u64 cgroup_id) @@ -590,6 +714,10 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, return 0; struct timespec64 *ts = &tss->ts[0]; + + if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk)) + return 0; + record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, hook, cgroup_id); diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 88b9b350..019d0fef 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -87,6 +87,7 @@ struct netstacklat_bpf_config { bool filter_cgroup; bool groupby_ifindex; bool groupby_cgroup; + bool include_hol_blocked; }; #endif From 1443f7bf1825cf729f05b3802b880c04daa08bee Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Wed, 29 Oct 2025 16:17:02 +0100 Subject: [PATCH 45/46] netstacklat: Add sanity check for out-of-order sequence The logic for excluding samples from TCP reads that may have been delayed by HOL blocking relies on reading a number of fields from the TCP socket outside of the socket lock. This may be prone to errors due to the socket state being updated at another place in the kernel while our eBPF program is running. To reduce the risk that a data race causes the filter to fail, add a sanity check for the maximum out of order sequence used to exclude future TCP reads from monitoring. The most problematic of the read fields in the tcp_sock is ooo_last_skb, as that is a pointer to another SKB rather than a direct value. This pointer is only valid as long as the out_of_order_queue is non-empty. 
Due to a data race, we may check that the ooo-queue is non-empty while there are still SKBs in it, then have the kernel clear out the ooo-queue, and finally attempt to read the ooo_last_skb pointer later when it is no longer valid (and may now point to a freed/recycled SKB). This may result in incorrect values being used for the sequence limit used to exclude future reads of ooo-segments. The faulty sequence limit may both cause reads of HOL-blocked segments to be included or the exclusion of an unnecessarily large amount of future reads (up to 2 GB). To reduce the risk that the garbage data from an invalid SKB is used, introduce two sanity checks for end_seq in the ooo_last_skb. First check if the sequence number is zero, if so assume it is invalid (even though it can be a valid sequence number). Even though we will get an error code if reading the data from this SKB fails altogether, we may still succeed reading from a no longer valid SKB, in which case there is a high risk the data will have been zeroed. If it's non-zero, also check that it is within the current receive window (if not, clamp it to the receive window). Signed-off-by: Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 63 +++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 58eb88a1..baf58bd8 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -504,22 +504,56 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, } #endif +/* Get the current receive window end sequence for tp + * In the kernel receive window checks are done against + * tp->rcv_nxt + tcp_receive_window(tp). This function should give a compareable + * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher + */ +static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq) +{ + u32 rcv_wup, rcv_wnd, window = 0; + int err; + + err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err); + goto exit; + } + + err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err); + goto exit; + } + + window = rcv_wup + rcv_wnd; + if (u32_lt(window, rcv_nxt)) + window = rcv_nxt; + +exit: + *seq = window; + return err; +} + static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) { + u32 rcv_nxt, cur_rcv_window, max_seq = 0; struct tcp_skb_cb cb; - u32 max_seq = 0; int err = 0; + err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt); + if (err) { + bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err); + goto exit; + } + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { /* No ooo-segments currently in ooo-queue * Any ooo-segments must already have been merged to the * receive queue. Current rcv_nxt must therefore be ahead * of all ooo-segments that have arrived until now. */ - err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); - if (err) - bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", - err); + max_seq = rcv_nxt; } else { /* * Some ooo-segments currently in ooo-queue @@ -527,13 +561,28 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) * skb in the ooo-queue. 
*/ err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); - if (err) + if (err) { bpf_printk( "failed to read tcp_sock->ooo_last_skb->cb, err=%d", err); - max_seq = cb.end_seq; + goto exit; + } + + // Sanity check - ooo_last_skb->cb.end_seq within the receive window? + err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window); + if (err) + goto exit; + + /* While seq 0 can be a valid seq, consider it more likely to + * be the result of reading from an invalid SKB pointer + */ + if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq)) + max_seq = cur_rcv_window; + else + max_seq = cb.end_seq; } +exit: *seq = max_seq; return err; } From 347abc5be218a006378f35af81cee3b62152c875 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Nov 2025 17:42:10 +0100 Subject: [PATCH 46/46] netstacklat: convert bpf_printk to optional debug feature For production we need a way to disable any use of bpf_printk. To track errors in production introduce a map for counting these errors, as that will be exposed as a Prometheus counter naming it netstacklat_errors_total. The new "dbg" macro handled/hides if bpf_printk or counters are enabled. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 76 ++++++++++++++++++++++++++++++++------ examples/netstacklat.yaml | 18 +++++++++ 2 files changed, 83 insertions(+), 11 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index baf58bd8..23016e0b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -265,6 +265,57 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) } #endif /* !CONFIG_MAP_MACROS */ +/* Debug facility to count errors */ +#define MAX_ERROR_TYPES 8 +enum error_types { + ERR_UNKNOWN = 0, + ERR_sk_storage = 1, + ERR_READ_TCP_rcv_wup = 2, + ERR_READ_TCP_rcv_wnd = 3, + ERR_READ_TCP_rcv_nxt = 4, + ERR_READ_TCP_last_skb_cb = 5, + ERR_READ_TCP_cp_seq = 6, + ERR_READ_TCP_rcv_ooopack = 7, +}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_ERROR_TYPES); + __type(key, u32); + __type(value, u64); +} netstacklat_errors_total SEC(".maps"); + +/* This provide easy way to disable debug feature for errors. + * Disabling this reduces BPF code size. + */ +#define CONFIG_TRACK_ERRORS 1 +/* #define CONFIG_PRINT_ERRORS 1 */ +#undef CONFIG_PRINT_ERRORS + +void record_errors(u32 err) +{ +#ifdef CONFIG_TRACK_ERRORS + u32 key = ERR_UNKNOWN; + + if (err < MAX_ERROR_TYPES) + key = err; + + increment_map_nosync(&netstacklat_errors_total, &key, 1); +#endif /* CONFIG_TRACK_ERRORS */ +} + +#ifdef CONFIG_PRINT_ERRORS +#define my_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__) +#else /* !CONFIG_PRINT_ERRORS */ +#define my_printk(fmt, ...) +#endif + +/* Debug macro that can be disabled compile time */ +#define dbg(__ERR_NR, fmt, ...) 
\ + ({ \ + record_errors(__ERR_NR); \ + my_printk(fmt, ##__VA_ARGS__); \ + }) + static inline bool filter_nth_packet(const enum netstacklat_hook hook) { u32 key = hook; @@ -516,13 +567,15 @@ static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq) err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup); if (err) { - bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err); + dbg(ERR_READ_TCP_rcv_wup, + "failed to read tcp_sock->rcv_wup, err=%d", err); goto exit; } err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd); if (err) { - bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err); + dbg(ERR_READ_TCP_rcv_wnd, + "failed to read tcp_sock->rcv_wnd, err=%d", err); goto exit; } @@ -543,7 +596,8 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt); if (err) { - bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err); + dbg(ERR_READ_TCP_rcv_nxt, + "failed reading tcp_sock->rcv_nxt, err=%d", err); goto exit; } @@ -562,9 +616,8 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) */ err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); if (err) { - bpf_printk( - "failed to read tcp_sock->ooo_last_skb->cb, err=%d", - err); + dbg(ERR_READ_TCP_last_skb_cb, + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", err); goto exit; } @@ -598,7 +651,8 @@ static bool tcp_read_in_ooo_range(struct tcp_sock *tp, err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); if (err) { - bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + dbg(ERR_READ_TCP_cp_seq, + "failed to read tcp_sock->copied_seq, err=%d", err); return true; // Assume we may be in ooo-range } @@ -619,8 +673,8 @@ static bool tcp_read_maybe_holblocked(struct sock *sk) err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); if (err) { - bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", - err); + dbg(ERR_READ_TCP_rcv_ooopack, + "failed to read tcp_sock->rcv_ooopack, err=%d\n", err); return true; // Assume we may be in ooo-range } @@ -630,8 +684,8 @@ static bool tcp_read_maybe_holblocked(struct sock *sk) ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, BPF_SK_STORAGE_GET_F_CREATE); if (!ooo_range) { - bpf_printk( - "failed getting ooo-range socket storage for tcp socket"); + dbg(ERR_sk_storage, + "failed getting ooo-range socket storage for tcp socket"); return true; // Assume we may be in ooo-range } diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index ec98efc6..45fe845f 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -37,6 +37,24 @@ metrics: size: 2 decoders: - name: uint + counters: + - name: netstacklat_errors_total + help: Counter for bpf_core_read errors in code (can be disabled in code) + labels: + - name: type + size: 4 + decoders: + - name: uint + - name: static_map + static_map: + 0: unknown + 1: err_sk_storage + 2: err_read_tcp_rcv_wup + 3: err_read_tcp_rcv_wnd + 4: err_read_tcp_rcv_nxt + 5: err_read_tcp_last_skb_cb + 6: err_read_tcp_cp_seq + 7: err_read_tcp_rcv_ooopack # Remember to update #define N_CGROUPS in code when adding more matches cgroup_id_map: