From aadb5acd3793b395e59b32061a0bf5e6e8a78d80 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 12:58:13 +0200 Subject: [PATCH 01/46] Add netstacklat example This is a direct copy from bpf-examples but from Simon's devel branch 'netstacklat-groupby' https://github.com/simosund/bpf-examples/tree/netstacklat-groupby/netstacklat Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 400 +++++++++++++++++++++++++++++++++++++ examples/netstacklat.h | 79 ++++++++ examples/netstacklat.yaml | 45 +++++ 3 files changed, 524 insertions(+) create mode 100644 examples/netstacklat.bpf.c create mode 100644 examples/netstacklat.h create mode 100644 examples/netstacklat.yaml diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c new file mode 100644 index 00000000..574cdbd4 --- /dev/null +++ b/examples/netstacklat.bpf.c @@ -0,0 +1,400 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include "vmlinux_local.h" +#include + +#include +#include +#include + +#include "netstacklat.h" +#include "bits.bpf.h" + +char LICENSE[] SEC("license") = "GPL"; + + +volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); +volatile const struct netstacklat_bpf_config user_config = { + .network_ns = 0, + .filter_pid = false, + .filter_ifindex = false, + .filter_cgroup = false, + .filter_nonempty_sockqueue = false, + .groupby_ifindex = false, + .groupby_cgroup = false, +}; + +/* + * Alternative definition of sk_buff to handle renaming of the field + * mono_delivery_time to tstamp_type. See + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +struct sk_buff___old { + union { + ktime_t tstamp; + u64 skb_mstamp_ns; + }; + __u8 mono_delivery_time: 1; +} __attribute__((preserve_access_index)); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); + __type(key, struct hist_key); + __type(value, u64); +} netstack_latency_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, PID_MAX_LIMIT); + __type(key, u32); + __type(value, u64); +} netstack_pidfilter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, IFINDEX_MAX); + __type(key, u32); + __type(value, u64); +} netstack_ifindexfilter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_TRACKED_CGROUPS); + __type(key, u64); + __type(value, u64); +} netstack_cgroupfilter SEC(".maps"); + +static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) +{ + u64 zero = 0; + u64 *val; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + // Key not in map - try insert it and lookup again + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + return bpf_map_lookup_elem(map, key); +} + +static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket) +{ + u32 bucket = log2l(value); + + // Right-inclusive histogram, so "round up" the log value + if (bucket > 0 && 1ULL << bucket < value) + bucket++; + + if (bucket > max_bucket) + bucket = max_bucket; + + return bucket; +} + +/* + * Same call signature as the increment_exp2_histogram_nosync macro from + * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h + * but provided as a function. + * + * Unlike the macro, only works with keys of type struct hist_key. 
The hist_key + * struct must be provided by value (rather than as a pointer) to keep the same + * call signature as the ebpf-exporter macro, although this will get inefficent + * if struct hist_key grows large. + */ +static void increment_exp2_histogram_nosync(void *map, struct hist_key key, + u64 value, u32 max_bucket) +{ + u64 *bucket_count; + + // Increment histogram + key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket); + bucket_count = lookup_or_zeroinit_histentry(map, &key); + if (bucket_count) + (*bucket_count)++; + + // Increment sum at end of histogram + if (value == 0) + return; + + key.bucket = max_bucket + 1; + bucket_count = lookup_or_zeroinit_histentry(map, &key); + if (bucket_count) + *bucket_count += value; +} + +static ktime_t time_since(ktime_t tstamp) +{ + ktime_t now; + + if (tstamp <= 0) + return -1; + + now = bpf_ktime_get_tai_ns() - TAI_OFFSET; + if (tstamp > now) + return -1; + + return now - tstamp; +} + +static void record_latency(ktime_t latency, const struct hist_key *key) +{ + increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency, + HIST_MAX_LATENCY_SLOT); +} + +static void record_latency_since(ktime_t tstamp, const struct hist_key *key) +{ + ktime_t latency = time_since(tstamp); + if (latency >= 0) + record_latency(latency, key); +} + +static bool filter_ifindex(u32 ifindex) +{ + u64 *ifindex_ok; + + if (!user_config.filter_ifindex) + // No ifindex filter - all ok + return true; + + ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex); + if (!ifindex_ok) + return false; + + return *ifindex_ok > 0; +} + +static bool filter_network_ns(u32 ns) +{ + if (user_config.network_ns == 0) + return true; + + return ns == user_config.network_ns; +} + +static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) +{ + /* + * Favor reading from sk due to less redirection (fewer probe reads) + * and skb->dev is not always set. + */ + if (sk) + return BPF_CORE_READ(sk->__sk_common.skc_net.net, ns.inum); + else if (skb) + return BPF_CORE_READ(skb->dev, nd_net.net, ns.inum); + return 0; +} + +static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) +{ + struct hist_key key = { .hook = hook }; + u32 ifindex; + + if (bpf_core_field_exists(skb->tstamp_type)) { + /* + * For kernels >= v6.11 the tstamp_type being non-zero + * (SKB_CLOCK_REALTIME) implies that skb->tstamp holds a + * preserved TX timestamp rather than a RX timestamp. 
See + * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/ + */ + if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0) + return; + + } else { + /* + * For kernels < v6.11, the field was called mono_delivery_time + * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/ + * Kernels < v5.18 do not have the mono_delivery_field either, + * but we do not support those anyways (as they lack the + * bpf_ktime_get_tai_ns helper) + */ + struct sk_buff___old *skb_old = (void *)skb; + if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0) + return; + } + + ifindex = skb->skb_iif; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + + record_latency_since(skb->tstamp, &key); +} + +static bool filter_pid(u32 pid) +{ + u64 *pid_ok; + + if (!user_config.filter_pid) + // No PID filter - all PIDs ok + return true; + + pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid); + if (!pid_ok) + return false; + + return *pid_ok > 0; +} + +static bool filter_cgroup(u64 cgroup_id) +{ + if (!user_config.filter_cgroup) + // No cgroup filter - all cgroups ok + return true; + + return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; +} + +static bool filter_current_task(u64 cgroup) +{ + bool ok = true; + __u32 tgid; + + if (user_config.filter_pid) { + tgid = bpf_get_current_pid_tgid() >> 32; + ok = ok && filter_pid(tgid); + } + + if (user_config.filter_cgroup) + ok = ok && filter_cgroup(cgroup); + + return ok; +} + +/** + * skb_queue_empty - check if a queue is empty + * @list: queue head + * + * Returns true if the queue is empty, false otherwise. + * + * Copied from /include/linux/skbuff.h + */ +static inline int skb_queue_empty(const struct sk_buff_head *list) +{ + return list->next == (const struct sk_buff *)list; +} + +static bool filter_nonempty_sockqueue(struct sock *sk) +{ + if (!user_config.filter_nonempty_sockqueue) + return true; + + return !skb_queue_empty(&sk->sk_receive_queue); +} + +static void record_socket_latency(struct sock *sk, struct sk_buff *skb, + ktime_t tstamp, enum netstacklat_hook hook) +{ + struct hist_key key = { .hook = hook }; + u64 cgroup = 0; + u32 ifindex; + + if (!filter_nonempty_sockqueue(sk)) + return; + + if (user_config.filter_cgroup || user_config.groupby_cgroup) + cgroup = bpf_get_current_cgroup_id(); + + if (!filter_current_task(cgroup)) + return; + + ifindex = skb ? 
skb->skb_iif : sk->sk_rx_dst_ifindex; + if (!filter_ifindex(ifindex)) + return; + + if (!filter_network_ns(get_network_ns(skb, sk))) + return; + + if (user_config.groupby_ifindex) + key.ifindex = ifindex; + if (user_config.groupby_cgroup) + key.cgroup = cgroup; + + record_latency_since(tstamp, &key); +} + +SEC("fentry/ip_rcv_core") +int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/ip6_rcv_core") +int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/tcp_v4_rcv") +int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/tcp_v6_rcv") +int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/udp_rcv") +int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fentry/udpv6_rcv") +int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fexit/tcp_queue_rcv") +int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) +{ + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); + return 0; +} + +SEC("fexit/__udp_enqueue_schedule_skb") +int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, + struct sk_buff *skb, int retval) +{ + if (retval == 0) + record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + return 0; +} + +SEC("fentry/tcp_recv_timestamp") +int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, + struct scm_timestamping_internal *tss) +{ + struct timespec64 *ts = &tss->ts[0]; + record_socket_latency(sk, NULL, + (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, + NETSTACKLAT_HOOK_TCP_SOCK_READ); + return 0; +} + +SEC("fentry/skb_consume_udp") +int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, + int len) +{ + record_socket_latency(sk, skb, skb->tstamp, + NETSTACKLAT_HOOK_UDP_SOCK_READ); + return 0; +} diff --git a/examples/netstacklat.h b/examples/netstacklat.h new file mode 100644 index 00000000..4811da4c --- /dev/null +++ b/examples/netstacklat.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef NETSTACKLAT_H +#define NETSTACKLAT_H + +#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s +/* + * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key" + * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) + * that ebpf_exporter expects for exp2 hists (see how it's used in the + * increment_exp2_histogram_nosync() function) + */ +#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2) + +#define NS_PER_S 1000000000 + +// The highest possible PID on a Linux system (from /include/linux/threads.h) +#define PID_MAX_LIMIT (4 * 1024 * 1024) +// The highest ifindex we expect to encounter +#define IFINDEX_MAX 16384 +// The maximum number of different cgroups we can filter for +#define MAX_TRACKED_CGROUPS 4096 + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a > _b 
? _a : _b; \ + }) +#endif + +#ifndef min +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#endif + +enum netstacklat_hook { + NETSTACKLAT_HOOK_INVALID = 0, + NETSTACKLAT_HOOK_IP_RCV, + NETSTACKLAT_HOOK_TCP_START, + NETSTACKLAT_HOOK_UDP_START, + NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_TCP_SOCK_READ, + NETSTACKLAT_HOOK_UDP_SOCK_READ, + NETSTACKLAT_N_HOOKS, +}; + +/* + * Key used for the histogram map + * To be compatible with ebpf-exporter, all histograms need a key struct whose final + * member is named "bucket" and is the histogram bucket index. + */ +struct hist_key { + __u64 cgroup; + __u32 ifindex; + __u16 hook; // need well defined size for ebpf-exporter to decode + __u16 bucket; // needs to be last to be compatible with ebpf-exporter +}; + +struct netstacklat_bpf_config { + __u32 network_ns; + bool filter_pid; + bool filter_ifindex; + bool filter_cgroup; + bool filter_nonempty_sockqueue; + bool groupby_ifindex; + bool groupby_cgroup; +}; + +#endif diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml new file mode 100644 index 00000000..3b6e5dc8 --- /dev/null +++ b/examples/netstacklat.yaml @@ -0,0 +1,45 @@ +metrics: + histograms: + - name: netstack_latency_seconds + help: Latency for packets (skbs) to reach various points in the kernel network stack + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: cgroup + size: 8 + decoders: + - name: uint + - name: cgroup + - name: iface + size: 4 + decoders: + # If including output from a different network namespace than ebpf-exporter + # you probably just want to decode as a uint (ifindex) instead + # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others + - name: ifname + - name: hook + size: 2 + decoders: + - name: uint + - name: static_map + static_map: + 1: "ip-start" + 2: "tcp-start" + 3: "udp-start" + 4: "tcp-socket-enqueued" + 5: "udp-socket-enqueued" + 6: "tcp-socket-read" + 7: "udp-socket-read" + - name: bucket + size: 2 + decoders: + - name: uint + +cgroup_id_map: + name: netstack_cgroupfilter + type: hash + regexps: + - ^.*(system.slice/.*)$ + From c891328833f954f082e92067f4df8758de4816dc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 13:11:15 +0200 Subject: [PATCH 02/46] Adjust netstacklat example to compile Keeping this as seperate commit to track what needed to change Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 574cdbd4..c5ce0564 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -#include "vmlinux_local.h" -#include +#include +//#include #include #include From d466045d12e928e7ae9f6459e7f77edc752af874 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 16:29:59 +0200 Subject: [PATCH 03/46] netstacklat: adjustments to make it run Gotcha#1: My devel laptop have TAI offset zero - Other systems (incl prod) all have 37 sec Gotcha#2: RX-timestamping need to be enabled maually - something else have to enable RX-timestamping Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 18 +++++++++++++----- examples/netstacklat.yaml | 1 + 2 files changed, 14 insertions(+), 5 deletions(-) diff 
--git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index c5ce0564..d79eeb7d 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,16 +11,16 @@ char LICENSE[] SEC("license") = "GPL"; - -volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); +// Strange: TAI offset is zero on my test system +volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = false, .filter_cgroup = false, .filter_nonempty_sockqueue = false, - .groupby_ifindex = false, - .groupby_cgroup = false, + .groupby_ifindex = true, + .groupby_cgroup = true, }; /* @@ -36,9 +36,16 @@ struct sk_buff___old { __u8 mono_delivery_time: 1; } __attribute__((preserve_access_index)); +/* NOTICE: max_entries need to be adjusted based on maximum + * number of cgroups and ifindex (that are "groupby" collecting) + * and "enabled" hooks (as we want to disable some) + */ +#define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ +#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ +#define N_IFACES 64 /* On prod only interested in ext0 and vlan100@ext0 */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); + __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64); __type(key, struct hist_key); __type(value, u64); } netstack_latency_seconds SEC(".maps"); @@ -295,6 +302,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, u64 cgroup = 0; u32 ifindex; + // XXX: TODO evaluate if this feature can make overhead acceptable if (!filter_nonempty_sockqueue(sk)) return; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 3b6e5dc8..15082870 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -42,4 +42,5 @@ cgroup_id_map: type: hash regexps: - ^.*(system.slice/.*)$ + - ^.*(user.slice/.*)$ From 22b929e43338a0769758d14611c82161684a1a77 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 18:13:15 +0200 Subject: [PATCH 04/46] netstacklat: enable filter_cgroup and disable other hooks For ebpf_exporter we cannot control which BPF sections gets loaded. Instead we compile time disable some of the hooks via define/ifdef's. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d79eeb7d..00a534cf 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -17,12 +17,18 @@ volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = false, - .filter_cgroup = false, + .filter_cgroup = true, .filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; +/* This provide easy way compile-time to disable some hooks */ +//#define CONFIG_HOOKS_EARLY_RCV 1 +#undef CONFIG_HOOKS_EARLY_RCV +//#define CONFIG_HOOKS_ENQUEUE 1 +#undef CONFIG_HOOKS_ENQUEUE +#define CONFIG_HOOKS_DEQUEUE 1 /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -193,6 +199,7 @@ static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) return 0; } +#if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { struct hist_key key = { .hook = hook }; @@ -233,6 +240,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta record_latency_since(skb->tstamp, &key); } +#endif static bool filter_pid(u32 pid) { @@ -327,6 +335,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, record_latency_since(tstamp, &key); } +#ifdef CONFIG_HOOKS_EARLY_RCV SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) @@ -370,7 +379,9 @@ int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } +#endif /* CONFIG_HOOKS_EARLY_RCV */ +#ifdef CONFIG_HOOKS_ENQUEUE SEC("fexit/tcp_queue_rcv") int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { @@ -386,7 +397,9 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } +#endif /* CONFIG_HOOKS_ENQUEUE */ +#ifdef CONFIG_HOOKS_DEQUEUE SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) @@ -406,3 +419,4 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; } +#endif /* CONFIG_HOOKS_DEQUEUE */ From 50fafda8eca7a6be9a53eb798aad4aaed6aa792b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 18:58:16 +0200 Subject: [PATCH 05/46] netstacklat: disable ifindex filter Instead hardcode ifindex limits based on prod setup. As we don't have a way to configure this via YAML. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 00a534cf..edba867f 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -16,7 +16,7 @@ volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, - .filter_ifindex = false, + .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = false, .groupby_ifindex = true, @@ -29,6 +29,11 @@ volatile const struct netstacklat_bpf_config user_config = { //#define CONFIG_HOOKS_ENQUEUE 1 #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 + +/* Allows to compile-time disable ifindex map as it is large */ +//#define CONFIG_IFINDEX_FILTER_MAP 1 +#undef CONFIG_IFINDEX_FILTER_MAP + /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -63,12 +68,14 @@ struct { __type(value, u64); } netstack_pidfilter SEC(".maps"); +#ifdef CONFIG_IFINDEX_FILTER_MAP struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, IFINDEX_MAX); __type(key, u32); __type(value, u64); } netstack_ifindexfilter SEC(".maps"); +#endif struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -165,17 +172,28 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) static bool filter_ifindex(u32 ifindex) { - u64 *ifindex_ok; - if (!user_config.filter_ifindex) // No ifindex filter - all ok return true; +#ifdef CONFIG_IFINDEX_FILTER_MAP + u64 *ifindex_ok; + ifindex_ok = bpf_map_lookup_elem(&netstack_ifindexfilter, &ifindex); if (!ifindex_ok) return false; return *ifindex_ok > 0; +#else + /* Hack for production: + * - We want to exclude 'lo' which have ifindex==1. + * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5) + */ + if (ifindex > 1 && ifindex < 6) + return true; + + return false; +#endif } static bool filter_network_ns(u32 ns) From 2efbb9092a282fd7eec521c5b3bb754c508e83f9 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 20 Aug 2025 19:20:56 +0200 Subject: [PATCH 06/46] netstacklat: disable PID filtering Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index edba867f..d1984e8b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -30,10 +30,14 @@ volatile const struct netstacklat_bpf_config user_config = { #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 -/* Allows to compile-time disable ifindex map as it is large */ +/* Allows to compile-time disable ifindex map as YAML cannot conf this */ //#define CONFIG_IFINDEX_FILTER_MAP 1 #undef CONFIG_IFINDEX_FILTER_MAP +/* Allows to compile-time disable PID filter map as it is very large */ +//#define CONFIG_PID_FILTER_MAP 1 +#undef CONFIG_PID_FILTER_MAP + /* * Alternative definition of sk_buff to handle renaming of the field * mono_delivery_time to tstamp_type. 
See @@ -61,12 +65,14 @@ struct { __type(value, u64); } netstack_latency_seconds SEC(".maps"); +#ifdef CONFIG_PID_FILTER_MAP struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, PID_MAX_LIMIT); __type(key, u32); __type(value, u64); } netstack_pidfilter SEC(".maps"); +#endif #ifdef CONFIG_IFINDEX_FILTER_MAP struct { @@ -260,6 +266,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta } #endif +#ifdef CONFIG_PID_FILTER_MAP static bool filter_pid(u32 pid) { u64 *pid_ok; @@ -273,7 +280,9 @@ static bool filter_pid(u32 pid) return false; return *pid_ok > 0; + } +#endif /* CONFIG_PID_FILTER_MAP */ static bool filter_cgroup(u64 cgroup_id) { @@ -287,13 +296,15 @@ static bool filter_cgroup(u64 cgroup_id) static bool filter_current_task(u64 cgroup) { bool ok = true; + +#ifdef CONFIG_PID_FILTER_MAP __u32 tgid; if (user_config.filter_pid) { tgid = bpf_get_current_pid_tgid() >> 32; ok = ok && filter_pid(tgid); } - +#endif if (user_config.filter_cgroup) ok = ok && filter_cgroup(cgroup); From 43da77b0b318c38d5f45787a590c94016d386594 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 10:22:14 +0200 Subject: [PATCH 07/46] netstacklat: an idea as a code comment Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d1984e8b..6528a0c1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -324,6 +324,18 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return list->next == (const struct sk_buff *)list; } +/* IDEA: To lower runtime overhead, we could skip recording timestamps for + * sockets with very few packets. + * + * sk_buff_head->qlen could be used to see if e.g. 
queue have more than 2 elements + * + * +static inline __u32 skb_queue_len(const struct sk_buff_head *list_) +{ + return list_->qlen; +} +*/ + static bool filter_nonempty_sockqueue(struct sock *sk) { if (!user_config.filter_nonempty_sockqueue) From dfff4ae0998fba094a7e95b50f411795a7495666 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 10:23:51 +0200 Subject: [PATCH 08/46] netstacklat: limit cgroups to nginx for a test deployment Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 18885872401 run_cnt 71,443,820 = 264.34 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 6124797061 run_cnt 16,324,309 = 375.19 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 15082870..d59d3b6b 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -41,6 +41,9 @@ cgroup_id_map: name: netstack_cgroupfilter type: hash regexps: - - ^.*(system.slice/.*)$ - - ^.*(user.slice/.*)$ - + - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ + - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ +# - ^(/sys/fs/cgroup/production.slice/.*/pingora-backend-router.service).*$ +# - ^(/sys/fs/cgroup/production.slice/.*/pingora-origin.service).*$ +# - ^.*(system.slice/.*)$ +# - ^.*(user.slice/.*)$ From 48c7f20fc5ec68b73bbfed6796cd8b620e645e8d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 11:25:52 +0200 Subject: [PATCH 09/46] netstacklat: evaluate filter_nonempty_sockqueue Something is buggy with this filter - All latency records is on max bucket The READ_ONCE change doesn't fix the issue Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6528a0c1..19ab004b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,7 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, + .filter_nonempty_sockqueue = true, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -311,6 +311,8 @@ static bool filter_current_task(u64 cgroup) return ok; } +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -321,7 +323,7 @@ static bool filter_current_task(u64 cgroup) */ static inline int skb_queue_empty(const struct sk_buff_head *list) { - return list->next == (const struct sk_buff *)list; + return READ_ONCE(list->next) == (const struct sk_buff *)list; } /* IDEA: To lower runtime overhead, we could skip recording timestamps for From a0776a2d44a645c0545bac7571f57330d318722d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 11:45:50 +0200 Subject: [PATCH 10/46] netstacklat: disable filter_nonempty_sockqueue Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 19ab004b..a29dbeda 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,7 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = true, + 
.filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; From de778bc569f0c49ee9e44aa52c9ee89dc0959cfc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 12:08:50 +0200 Subject: [PATCH 11/46] netstacklat: restore TAI offset to 37 sec This was the real reason for seeing wrong numbers in prod. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index a29dbeda..605fbe67 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,8 +11,7 @@ char LICENSE[] SEC("license") = "GPL"; -// Strange: TAI offset is zero on my test system -volatile const __s64 TAI_OFFSET = (0LL * NS_PER_S); +volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); volatile const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, From 33965b2c09ad5b9506f8bb9226270a0f6f395c97 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 12:16:08 +0200 Subject: [PATCH 12/46] netstacklat: use ebpf_exporter macros for increment_exp2_histogram_nosync Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 32164560127 run_cnt 116,590,498 = 275.88 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 10490230543 run_cnt 23,993,428 = 437.21 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 48 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 605fbe67..30d24cbe 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -89,6 +89,34 @@ struct { __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +static ktime_t time_since(ktime_t tstamp) +{ + ktime_t now; + + if (tstamp <= 0) + return -1; + + now = bpf_ktime_get_tai_ns() - TAI_OFFSET; + if (tstamp > now) + return -1; + + return now - tstamp; +} + +/* Determine if ebpf_exporter macro or local C implementation is used */ +#define CONFIG_MAP_MACROS 1 +#ifdef CONFIG_MAP_MACROS +#include "maps.bpf.h" +#define _record_latency_since(tstamp, key) \ + ktime_t latency = time_since(tstamp); \ + if (latency >= 0) \ + increment_exp2_histogram_nosync(&netstack_latency_seconds, \ + key, latency, \ + HIST_MAX_LATENCY_SLOT); +#else /* !CONFIG_MAP_MACROS */ +#define _record_latency_since(tstamp, key) \ + record_latency_since(tstamp, &key) + static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) { u64 zero = 0; @@ -148,32 +176,18 @@ static void increment_exp2_histogram_nosync(void *map, struct hist_key key, *bucket_count += value; } -static ktime_t time_since(ktime_t tstamp) -{ - ktime_t now; - - if (tstamp <= 0) - return -1; - - now = bpf_ktime_get_tai_ns() - TAI_OFFSET; - if (tstamp > now) - return -1; - - return now - tstamp; -} - static void record_latency(ktime_t latency, const struct hist_key *key) { increment_exp2_histogram_nosync(&netstack_latency_seconds, *key, latency, HIST_MAX_LATENCY_SLOT); } - static void record_latency_since(ktime_t tstamp, const struct hist_key *key) { ktime_t latency = time_since(tstamp); if (latency >= 0) record_latency(latency, key); } +#endif /* !CONFIG_MAP_MACROS */ static bool filter_ifindex(u32 ifindex) { @@ -261,7 +275,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (user_config.groupby_ifindex) key.ifindex = ifindex; - record_latency_since(skb->tstamp, &key); + 
_record_latency_since(skb->tstamp, key); } #endif @@ -374,7 +388,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.groupby_cgroup) key.cgroup = cgroup; - record_latency_since(tstamp, &key); + _record_latency_since(tstamp, key); } #ifdef CONFIG_HOOKS_EARLY_RCV From e9ac4cddf1e2e89b3545c3d170a6e083570919c1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 17:20:41 +0200 Subject: [PATCH 13/46] netstacklat: re-evaluate filter_nonempty_sockqueue Moved filter_nonempty_sockqueue to callers - because record_socket_latency() becomes a BPF function-call - perf e.g. shows bpf_prog_fb69587c6ea462b7_record_socket_latency Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 181391511590 run_cnt 788663546 = 229.99 sn Execution runtime: netstacklat_skb_consume_udp - run_time_ns 16212598612 run_cnt 137812779 = 117.64 ns This clearly have a huge improvement for UDP packets. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 30d24cbe..639d090d 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -17,7 +17,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, + .filter_nonempty_sockqueue = true, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -366,10 +366,6 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, u64 cgroup = 0; u32 ifindex; - // XXX: TODO evaluate if this feature can make overhead acceptable - if (!filter_nonempty_sockqueue(sk)) - return; - if (user_config.filter_cgroup || user_config.groupby_cgroup) cgroup = bpf_get_current_cgroup_id(); @@ -460,6 +456,9 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + if (!filter_nonempty_sockqueue(sk)) + return 0; + struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, @@ -471,6 +470,9 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + if (!filter_nonempty_sockqueue(sk)) + return 0; + record_socket_latency(sk, skb, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ); return 0; From 925debfcf23e86b661a042bf46195dd6c122afbc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 18:06:47 +0200 Subject: [PATCH 14/46] netstacklat: compile time enable filter_nonempty_sockqueue Enable filter_nonempty_sockqueue compile time to make sure that this config setting doesn't influence performance. I'm only seeing a small effect. 
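
For reference, the run_time_ns / run_cnt figures quoted throughout this series come from
the kernel's per-program BPF statistics (accounting is enabled via the
kernel.bpf_stats_enabled sysctl and the counters are shown by bpftool prog show); the
per-call averages in the commit messages are simply their ratio. A minimal helper
illustrating the arithmetic (not part of any patch, names are illustrative):

        /* Average per-invocation runtime from the kernel's BPF stats counters.
         * e.g. patch 13: 181391511590 / 788663546 ~= 229.99 ns
         */
        static double bpf_prog_avg_ns(unsigned long long run_time_ns,
                                      unsigned long long run_cnt)
        {
                return run_cnt ? (double)run_time_ns / (double)run_cnt : 0.0;
        }
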
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 639d090d..4efd64d3 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,6 +18,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = true, +#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1 .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -353,8 +354,10 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_) static bool filter_nonempty_sockqueue(struct sock *sk) { +#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE if (!user_config.filter_nonempty_sockqueue) return true; +#endif return !skb_queue_empty(&sk->sk_receive_queue); } From 28c37132c25197930da39dccd7ea49956e481355 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 Aug 2025 19:08:59 +0200 Subject: [PATCH 15/46] netstacklat: filter on socket queue length The filter_nonempty_sockqueue() is effecient for UDP packets, but doesn't work well for TCP packets. Add filtering on socket queue lenght. Try filtering if qlen is not above 3 packets for TCP. Execution runtime: netstacklat_tcp_recv_timestamp - run_time_ns 10690540076 run_cnt 117852699 = 90.71 ns Execution runtime: netstacklat_skb_consume_udp - run_time_ns 2206621632 run_cnt 20004338 = 110.30 ns This have a HUGE improvement for TCP case. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 4efd64d3..17c69d2c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -340,18 +340,6 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return READ_ONCE(list->next) == (const struct sk_buff *)list; } -/* IDEA: To lower runtime overhead, we could skip recording timestamps for - * sockets with very few packets. - * - * sk_buff_head->qlen could be used to see if e.g. queue have more than 2 elements - * - * -static inline __u32 skb_queue_len(const struct sk_buff_head *list_) -{ - return list_->qlen; -} -*/ - static bool filter_nonempty_sockqueue(struct sock *sk) { #ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE @@ -362,6 +350,22 @@ static bool filter_nonempty_sockqueue(struct sock *sk) return !skb_queue_empty(&sk->sk_receive_queue); } +/* To lower runtime overhead, skip recording timestamps for sockets with very + * few packets. Use sk_buff_head->qlen to see if e.g. 
queue have more than 2 + * elements + */ +static inline __u32 sk_queue_len(const struct sk_buff_head *list_) +{ + return READ_ONCE(list_->qlen); +} + +static bool filter_queue_len(struct sock *sk, const __u32 above_len) +{ + if (sk_queue_len(&sk->sk_receive_queue) > above_len) + return true; + return false; +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook) { @@ -462,6 +466,9 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, if (!filter_nonempty_sockqueue(sk)) return 0; + if (!filter_queue_len(sk, 3)) + return 0; + struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, From d09463e2128dad7ca56c4d4b7a31f3769d56f6d5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 11:41:59 +0200 Subject: [PATCH 16/46] netstacklat: test with cgroup_id_map type to cgrp_storage Leverage BPF_MAP_TYPE_CGRP_STORAGE for our cgroup filter. To evaluate the two different cgroup_id_map types code macro CONFIG_CGRP_STORAGE is introduced, to allow us to switch implementation compile time. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 37 +++++++++++++++++++++++++++++++------ examples/netstacklat.yaml | 2 +- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 17c69d2c..fae71461 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -83,12 +83,23 @@ struct { } netstack_ifindexfilter SEC(".maps"); #endif +/* Eval two different cgroup_id_map types*/ +#define CONFIG_CGRP_STORAGE 1 +#ifdef CONFIG_CGRP_STORAGE struct { - __uint(type, BPF_MAP_TYPE_HASH); + __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, u32); + __type(value, u64); +} netstack_cgroupfilter SEC(".maps"); +#else +struct { + __uint(type, BPF_MAP_TYPE_HASH); /* type: hash */ __uint(max_entries, MAX_TRACKED_CGROUPS); __type(key, u64); __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +#endif static ktime_t time_since(ktime_t tstamp) { @@ -298,6 +309,19 @@ static bool filter_pid(u32 pid) } #endif /* CONFIG_PID_FILTER_MAP */ +#ifdef CONFIG_CGRP_STORAGE +static bool filter_cgroup(u64 unused) +{ + if (!user_config.filter_cgroup) + // No cgroup filter - all cgroups ok + return true; + + struct task_struct *task = bpf_get_current_task_btf(); + struct cgroup *cgrp = task->cgroups->dfl_cgrp; + + return bpf_cgrp_storage_get(&netstack_cgroupfilter, cgrp, 0, 0) != NULL; +} +#else /* !CONFIG_CGRP_STORAGE */ static bool filter_cgroup(u64 cgroup_id) { if (!user_config.filter_cgroup) @@ -306,8 +330,9 @@ static bool filter_cgroup(u64 cgroup_id) return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; } +#endif /* !CONFIG_CGRP_STORAGE */ -static bool filter_current_task(u64 cgroup) +static bool filter_current_task() { bool ok = true; @@ -319,9 +344,6 @@ static bool filter_current_task(u64 cgroup) ok = ok && filter_pid(tgid); } #endif - if (user_config.filter_cgroup) - ok = ok && filter_cgroup(cgroup); - return ok; } @@ -376,7 +398,10 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.filter_cgroup || user_config.groupby_cgroup) cgroup = bpf_get_current_cgroup_id(); - if (!filter_current_task(cgroup)) + if (!filter_cgroup(cgroup)) + return; + + if (!filter_current_task()) return; ifindex = skb ? 
skb->skb_iif : sk->sk_rx_dst_ifindex; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index d59d3b6b..dda785e3 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -39,7 +39,7 @@ metrics: cgroup_id_map: name: netstack_cgroupfilter - type: hash + type: cgrp_storage regexps: - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ From 8370c7321d66a5dde535c06654d788fbe18e62a5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 14:05:26 +0200 Subject: [PATCH 17/46] netstacklat: constify user_config and disable filter_nonempty_sockqueue The ebpf_exporter variant of netstacklat is not runtime configurable at BPF-load time. Thus, below user_config isn't define as 'volatile', instead the 'const' allows the compiler to do dead-code elimination. We also disable user_config.filter_nonempty_sockqueue as we want to stress the cgroup lookup types some more. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index fae71461..300632ff 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,14 +11,17 @@ char LICENSE[] SEC("license") = "GPL"; -volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); -volatile const struct netstacklat_bpf_config user_config = { +/* The ebpf_exporter variant of netstacklat is not runtime configurable at + * BPF-load time. Thus, below user_config isn't define as 'volatile', instead + * the 'const' allows the compiler to do dead-code elimination. + */ +const __s64 TAI_OFFSET = (37LL * NS_PER_S); +const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = true, -#define CONFIG_FILTER_NONEMPTY_SOCKQUEUE 1 + .filter_nonempty_sockqueue = false, .groupby_ifindex = true, .groupby_cgroup = true, }; @@ -364,10 +367,8 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) static bool filter_nonempty_sockqueue(struct sock *sk) { -#ifndef CONFIG_FILTER_NONEMPTY_SOCKQUEUE if (!user_config.filter_nonempty_sockqueue) return true; -#endif return !skb_queue_empty(&sk->sk_receive_queue); } @@ -383,6 +384,9 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) static bool filter_queue_len(struct sock *sk, const __u32 above_len) { + if (!user_config.filter_nonempty_sockqueue) + return true; + if (sk_queue_len(&sk->sk_receive_queue) > above_len) return true; return false; From 15525743ad81f48f12d23d0af899d852e2712312 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 14:41:16 +0200 Subject: [PATCH 18/46] netstacklat: move filter_cgroup before record_socket_latency This required changing call signature of filter_cgroup() to also populate the cgroup_id as that is used for groupby_cgroup. We are still using the CONFIG_CGRP_STORAGE that leverages the BPF_MAP_TYPE_CGRP_STORAGE. Without the queue length filters (filter_nonempty_sockqueue) we are getting more calls. This is on purpose to evaluate cgroup_id_map types. 
name netstacklat_tcp_recv_timestamp - run_time_ns 17953390952 run_cnt 55079620 = 325.95 ns netstacklat_skb_consume_udp - run_time_ns 5779869863 run_cnt 11650472 = 496.10 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 47 +++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 300632ff..1b7fb3f7 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -313,25 +313,36 @@ static bool filter_pid(u32 pid) #endif /* CONFIG_PID_FILTER_MAP */ #ifdef CONFIG_CGRP_STORAGE -static bool filter_cgroup(u64 unused) +static bool filter_cgroup(u64 *cgroup_id) { - if (!user_config.filter_cgroup) + if (!user_config.filter_cgroup) { + if (user_config.groupby_cgroup) + *cgroup_id = bpf_get_current_cgroup_id(); // No cgroup filter - all cgroups ok return true; + } struct task_struct *task = bpf_get_current_task_btf(); struct cgroup *cgrp = task->cgroups->dfl_cgrp; + if (user_config.groupby_cgroup) + /* no need to call bpf_get_current_cgroup_id() */ + *cgroup_id = BPF_CORE_READ(cgrp, kn, id); + return bpf_cgrp_storage_get(&netstack_cgroupfilter, cgrp, 0, 0) != NULL; } #else /* !CONFIG_CGRP_STORAGE */ -static bool filter_cgroup(u64 cgroup_id) +static bool filter_cgroup(u64 *cgroup_id) { - if (!user_config.filter_cgroup) + if (!user_config.filter_cgroup) { + if (user_config.groupby_cgroup) + *cgroup_id = bpf_get_current_cgroup_id(); // No cgroup filter - all cgroups ok return true; + } + *cgroup_id = bpf_get_current_cgroup_id(); - return bpf_map_lookup_elem(&netstack_cgroupfilter, &cgroup_id) != NULL; + return bpf_map_lookup_elem(&netstack_cgroupfilter, cgroup_id) != NULL; } #endif /* !CONFIG_CGRP_STORAGE */ @@ -393,18 +404,12 @@ static bool filter_queue_len(struct sock *sk, const __u32 above_len) } static void record_socket_latency(struct sock *sk, struct sk_buff *skb, - ktime_t tstamp, enum netstacklat_hook hook) + ktime_t tstamp, enum netstacklat_hook hook, + u64 cgroup_id) { struct hist_key key = { .hook = hook }; - u64 cgroup = 0; u32 ifindex; - if (user_config.filter_cgroup || user_config.groupby_cgroup) - cgroup = bpf_get_current_cgroup_id(); - - if (!filter_cgroup(cgroup)) - return; - if (!filter_current_task()) return; @@ -418,7 +423,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (user_config.groupby_ifindex) key.ifindex = ifindex; if (user_config.groupby_cgroup) - key.cgroup = cgroup; + key.cgroup = cgroup_id; _record_latency_since(tstamp, key); } @@ -492,6 +497,11 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + u64 cgroup_id = 0; + + if (!filter_cgroup(&cgroup_id)) + return 0; + if (!filter_nonempty_sockqueue(sk)) return 0; @@ -501,7 +511,7 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, - NETSTACKLAT_HOOK_TCP_SOCK_READ); + NETSTACKLAT_HOOK_TCP_SOCK_READ, cgroup_id); return 0; } @@ -509,11 +519,16 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + u64 cgroup_id = 0; + + if (!filter_cgroup(&cgroup_id)) + return 0; + if (!filter_nonempty_sockqueue(sk)) return 0; record_socket_latency(sk, skb, skb->tstamp, - NETSTACKLAT_HOOK_UDP_SOCK_READ); + NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); return 0; } #endif /* 
CONFIG_HOOKS_DEQUEUE */ From 6c6529f408bb470cc70e2d9fd36ae12e8e9cc6e8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 15:09:18 +0200 Subject: [PATCH 19/46] netstacklat: test cgroup_id_map type hash Changing the filter_cgroup() to use the cgroup_id_map type hash. Surprisingly this type seems to be faster than type cgrp_storage. name netstacklat_tcp_recv_timestamp - run_time_ns 10705407914 run_cnt 41576592 = 257.48 ns - diff: 325.95 - 257.48 = 68.47 ns better name netstacklat_skb_consume_udp - run_time_ns 3716653454 run_cnt 8499677 = 437.27 ns - diff: 496.10 - 437.27 = 58.83 ns better On this AMD CPU with SRSO enabled, we have extra overheads on BPF helper calls. The filter_cgroup() for type cgrp_storage has two extra helper calls, bpf_get_current_task_btf() and bpf_cgrp_storage_get(), but we eliminated the bpf_get_current_cgroup_id() helper call. Still this doesn't fully account for diff. That said, if a BPF-prog already have the struct_task available then the bpf_get_current_task_btf() can also be eliminated, so type cgrp_storage might still be useful in that case Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- examples/netstacklat.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 1b7fb3f7..fedc3243 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -87,7 +87,7 @@ struct { #endif /* Eval two different cgroup_id_map types*/ -#define CONFIG_CGRP_STORAGE 1 +//#define CONFIG_CGRP_STORAGE 1 #ifdef CONFIG_CGRP_STORAGE struct { __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index dda785e3..d59d3b6b 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -39,7 +39,7 @@ metrics: cgroup_id_map: name: netstack_cgroupfilter - type: cgrp_storage + type: hash regexps: - ^(/sys/fs/cgroup/production.slice/.*/nginx-cache.service).*$ - ^(/sys/fs/cgroup/production.slice/.*/nginx-ssl.service).*$ From b9c4570e74a5d4cd6608ab73aeebc15cfa533235 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 15:35:14 +0200 Subject: [PATCH 20/46] netstacklat: hide call to get_network_ns() If we have disabled network_ns filtering, the code still does a lookup of the current name space via calling get_network_ns(). Reorg the code to avoid this call if feature is disabled. 
name netstacklat_tcp_recv_timestamp - run_time_ns 10623365578 run_cnt 44842812 = 236.90 ns - diff: 257.48 - 236.90 = 20.58 ns better name netstacklat_skb_consume_udp - run_time_ns 3718153230 run_cnt 9902613 = 375.47 ns - diff: 437.27 - 375.47 = 61.80 ns better Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index fedc3243..07459885 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -230,14 +230,6 @@ static bool filter_ifindex(u32 ifindex) #endif } -static bool filter_network_ns(u32 ns) -{ - if (user_config.network_ns == 0) - return true; - - return ns == user_config.network_ns; -} - static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) { /* @@ -251,6 +243,16 @@ static __u64 get_network_ns(struct sk_buff *skb, struct sock *sk) return 0; } +static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) +{ + if (user_config.network_ns == 0) + return true; + + u32 ns = get_network_ns(skb, sk); + + return ns == user_config.network_ns; +} + #if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { @@ -284,7 +286,7 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_ifindex(ifindex)) return; - if (!filter_network_ns(get_network_ns(skb, sk))) + if (!filter_network_ns(skb, sk)) return; if (user_config.groupby_ifindex) @@ -417,7 +419,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (!filter_ifindex(ifindex)) return; - if (!filter_network_ns(get_network_ns(skb, sk))) + if (!filter_network_ns(skb, sk)) return; if (user_config.groupby_ifindex) From d047606b9930c4c6e6a3929b865aa99b75a11415 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 16:18:14 +0200 Subject: [PATCH 21/46] netstacklat: new user_config for filter_queue_len Let us control the filter_queue_len() via seperate user_config. Also enable this for UDP sockets. Notice the filter_cgroup() is still running first, so this is the primary filter. And filter_nonempty_sockqueue is still false. 
name netstacklat_tcp_recv_timestamp - run_time_ns 15661530364 run_cnt 94922963 = 164.99 ns name netstacklat_skb_consume_udp - run_time_ns 3255451250 run_cnt 14532586 = 224.01 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 +++++++++--- examples/netstacklat.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 07459885..0692da36 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -18,6 +18,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, + .filter_queue_len = 3, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -395,9 +396,11 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_queue_len(struct sock *sk, const __u32 above_len) +static bool filter_queue_len(struct sock *sk) { - if (!user_config.filter_nonempty_sockqueue) + const u32 above_len = user_config.filter_queue_len; + + if (above_len == 0) return true; if (sk_queue_len(&sk->sk_receive_queue) > above_len) @@ -507,7 +510,7 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, if (!filter_nonempty_sockqueue(sk)) return 0; - if (!filter_queue_len(sk, 3)) + if (!filter_queue_len(sk)) return 0; struct timespec64 *ts = &tss->ts[0]; @@ -529,6 +532,9 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return 0; + if (!filter_queue_len(sk)) + return 0; + record_socket_latency(sk, skb, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); return 0; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 4811da4c..f713f726 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -68,6 +68,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; + __u32 filter_queue_len; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From d9039e5ba71e516bb0b5431f3be1a08e7923a636 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 16:50:50 +0200 Subject: [PATCH 22/46] netstacklat: make filter_socket a common function Verified performance is same as previous patch. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 0692da36..772af789 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -254,7 +254,7 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) return ns == user_config.network_ns; } -#if (CONFIG_HOOKS_EARLY_RCV && CONFIG_HOOKS_ENQUEUE) +#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE) static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook) { struct hist_key key = { .hook = hook }; @@ -408,6 +408,23 @@ static bool filter_queue_len(struct sock *sk) return false; } +#if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) +static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, + u64 *cgroup_id) +{ + if (!filter_cgroup(cgroup_id)) + return false; + + if (!filter_nonempty_sockqueue(sk)) + return false; + + if (!filter_queue_len(sk)) + return false; + + return true; +} +#endif + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook, u64 cgroup_id) @@ -498,19 +515,14 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE + SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { u64 cgroup_id = 0; - if (!filter_cgroup(&cgroup_id)) - return 0; - - if (!filter_nonempty_sockqueue(sk)) - return 0; - - if (!filter_queue_len(sk)) + if (!filter_socket(sk, NULL, &cgroup_id)) return 0; struct timespec64 *ts = &tss->ts[0]; @@ -526,13 +538,7 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, { u64 cgroup_id = 0; - if (!filter_cgroup(&cgroup_id)) - return 0; - - if (!filter_nonempty_sockqueue(sk)) - return 0; - - if (!filter_queue_len(sk)) + if (!filter_socket(sk, skb, &cgroup_id)) return 0; record_socket_latency(sk, skb, skb->tstamp, From a2f10c73fa4e9028a308a8bd4ce263612e8f06d5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 17:04:36 +0200 Subject: [PATCH 23/46] netstacklat: first filter for sockets having a queue It is more efficient to first filter out sockets with empty or small queues. This is a tradeoff as our production use-case is to capture when the system becomes overloaded. name netstacklat_tcp_recv_timestamp - run_time_ns 15347880145 run_cnt 177786472 = 86.32 ns name netstacklat_skb_consume_udp - run_time_ns 3096529442 run_cnt 33903931 = 91.33 ns The performance gain is huge. Do remember that this is the average runtime cost that is reduced, because we can skip recording many of these events. 
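A rough way to see why the ordering matters: with the cheap queue-length test first, the
expensive part (cgroup id lookup plus histogram update) only runs for the fraction of
calls that actually have packets queued. A purely illustrative cost model (parameters
are made up, not separately measured):

        /* Average cost per hook invocation, illustrative only: every call pays
         * the queue check, but only pass_rate of them continue into the cgroup
         * lookup + histogram update path.
         */
        static double avg_hook_cost_ns(double queue_check_ns,
                                       double record_path_ns,
                                       double pass_rate)
        {
                return queue_check_ns + pass_rate * record_path_ns;
        }

With a low pass rate this converges towards the cost of the queue check alone, which is
consistent with the TCP hook dropping from ~165 ns to ~86 ns per call.
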
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 772af789..b6ee0ae3 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -412,15 +412,15 @@ static bool filter_queue_len(struct sock *sk) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id) { - if (!filter_cgroup(cgroup_id)) - return false; - if (!filter_nonempty_sockqueue(sk)) return false; if (!filter_queue_len(sk)) return false; + if (!filter_cgroup(cgroup_id)) + return false; + return true; } #endif From 593f76b76f4ac1ec11785fcf1a323827d20084b3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 26 Aug 2025 20:39:33 +0200 Subject: [PATCH 24/46] netstacklat: give credit Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index b6ee0ae3..6d727172 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,4 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This is a ebpf_exporter variant of the netstacklat tool + * + * Netstacklat - is a tool that "Monitor RX latency within the network stack" + * - https://github.com/xdp-project/bpf-examples/tree/main/netstacklat + * - Developed by Simon Sundberg + * + * This variant have been code optimized heavily towards Cloudflare's use-case. + * Many hooks and features have been disabled, via constructs that lets both the + * compiler and BPF verifier do dead-code elimination. + */ #include //#include From 780984dc5db38854f1ef7c05bcad39f161216794 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 11:30:03 +0200 Subject: [PATCH 25/46] netstacklat: take sk_backlog into account Our checks for empty or almost empty sockets were wrong, because sockets also have a backlog queue. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6d727172..84ff2c8c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This is a ebpf_exporter variant of the netstacklat tool + * This is an ebpf_exporter variant of the netstacklat tool * * Netstacklat - is a tool that "Monitor RX latency within the network stack" * - https://github.com/xdp-project/bpf-examples/tree/main/netstacklat @@ -29,7 +29,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_queue_len = 3, /* zero means filter is inactive */ + .filter_queue_len = 1, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -390,12 +390,24 @@ static inline int skb_queue_empty(const struct sk_buff_head *list) return READ_ONCE(list->next) == (const struct sk_buff *)list; } +static inline bool sk_backlog_empty(const struct sock *sk) +{ + return READ_ONCE(sk->sk_backlog.tail) == NULL; +} + static bool filter_nonempty_sockqueue(struct sock *sk) { if (!user_config.filter_nonempty_sockqueue) return true; - return !skb_queue_empty(&sk->sk_receive_queue); + if (!skb_queue_empty(&sk->sk_receive_queue)) + return true; + + /* Packets can also be on the sk_backlog */ + if (!sk_backlog_empty(sk)) + return true; + + return false; } /* To lower runtime overhead, skip recording timestamps for sockets with very @@ -416,6 +428,14 @@ static bool filter_queue_len(struct sock *sk) if (sk_queue_len(&sk->sk_receive_queue) > above_len) return true; + + /* Packets can also be on the sk_backlog, but we don't know the number + * of SKBs on the queue, because sk_backlog.len is in bytes (based on + * skb->truesize). Thus, if any backlog exists we don't filter. + */ + if (!sk_backlog_empty(sk)) + return true; + return false; } @@ -526,7 +546,6 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE - SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) From f8671e35e40c394f75502d4c852321abd2dc1702 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 12:51:51 +0200 Subject: [PATCH 26/46] netstacklat: rename filter_queue_len to filter_min_queue_len This change test to be 'ge' greater-than-or-equal. We want the ability specify 1, meaning a queue size of one and above. Before 1 meant queue size 2 and above. 
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 12 ++++++------ examples/netstacklat.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 84ff2c8c..6858f93c 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_queue_len = 1, /* zero means filter is inactive */ + .filter_min_queue_len = 1, /* zero means filter is inactive */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -419,14 +419,14 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_queue_len(struct sock *sk) +static bool filter_min_queue_len(struct sock *sk) { - const u32 above_len = user_config.filter_queue_len; + const u32 min_qlen = user_config.filter_min_queue_len; - if (above_len == 0) + if (min_qlen == 0) return true; - if (sk_queue_len(&sk->sk_receive_queue) > above_len) + if (sk_queue_len(&sk->sk_receive_queue) >= min_qlen) return true; /* Packets can also be on the sk_backlog, but we don't know the number @@ -446,7 +446,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return false; - if (!filter_queue_len(sk)) + if (!filter_min_queue_len(sk)) return false; if (!filter_cgroup(cgroup_id)) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index f713f726..ee362924 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -68,7 +68,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; - __u32 filter_queue_len; + __u32 filter_min_queue_len; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From 3c57889e7c2ef5dc2fff94370c1994b4b6432b9d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 14:16:23 +0200 Subject: [PATCH 27/46] netstacklat: add filter for every nth packet Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 35 ++++++++++++++++++++++++++++++++++- examples/netstacklat.h | 1 + 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 6858f93c..407b7803 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,8 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_min_queue_len = 1, /* zero means filter is inactive */ + .filter_min_queue_len = 0, /* zero means filter is inactive */ + .filter_nth_packet = 10, /* reduce recorded event to every nth packet */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -116,6 +117,15 @@ struct { } netstack_cgroupfilter SEC(".maps"); #endif +/* Down sample the recorded events to every nth event */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} netstack_nth_filter SEC(".maps"); + + static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -439,6 +449,26 @@ static bool filter_min_queue_len(struct sock *sk) return false; } +static inline bool filter_nth_packet() +{ + u32 key = 0; + u64 *nth; + + /* Zero and one means disabled */ + if (user_config.filter_nth_packet <= 1) + return true; + + nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); 
+ if (!nth) + return false; + + *nth += 1; + if ((*nth % user_config.filter_nth_packet) == 0) { + return true; + } + return false; +} + #if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id) @@ -452,6 +482,9 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_cgroup(cgroup_id)) return false; + if (!filter_nth_packet()) + return false; + return true; } #endif diff --git a/examples/netstacklat.h b/examples/netstacklat.h index ee362924..f4009a44 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -69,6 +69,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; __u32 filter_min_queue_len; + __u64 filter_nth_packet; bool filter_pid; bool filter_ifindex; bool filter_cgroup; From 0127c2c87cf3b9bbd72607151e59e3e5420e2f16 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 14:51:19 +0200 Subject: [PATCH 28/46] netstacklat: let nth filter counter be per hook Let also record_skb_latency() use it. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 65 ++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 407b7803..03dd51fb 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -30,7 +30,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 10, /* reduce recorded event to every nth packet */ + .filter_nth_packet = 32, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, @@ -117,15 +117,14 @@ struct { } netstack_cgroupfilter SEC(".maps"); #endif -/* Down sample the recorded events to every nth event */ +/* Per-CPU counter for down sampling the recorded events to every nth event */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 1); + __uint(max_entries, NETSTACKLAT_N_HOOKS); __type(key, u32); __type(value, u64); } netstack_nth_filter SEC(".maps"); - static ktime_t time_since(ktime_t tstamp) { ktime_t now; @@ -226,6 +225,26 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) } #endif /* !CONFIG_MAP_MACROS */ +static inline bool filter_nth_packet(const enum netstacklat_hook hook) +{ + u32 key = hook; + u64 *nth; + + /* Zero and one means disabled */ + if (user_config.filter_nth_packet <= 1) + return true; + + nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); + if (!nth) + return false; + + *nth += 1; + if ((*nth % user_config.filter_nth_packet) == 0) { + return true; + } + return false; +} + static bool filter_ifindex(u32 ifindex) { if (!user_config.filter_ifindex) @@ -311,6 +330,9 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_network_ns(skb, sk)) return; + if (!filter_nth_packet(hook)) + return; + if (user_config.groupby_ifindex) key.ifindex = ifindex; @@ -449,29 +471,9 @@ static bool filter_min_queue_len(struct sock *sk) return false; } -static inline bool filter_nth_packet() -{ - u32 key = 0; - u64 *nth; - - /* Zero and one means disabled */ - if (user_config.filter_nth_packet <= 1) - return true; - - nth = bpf_map_lookup_elem(&netstack_nth_filter, &key); - if (!nth) - return false; - - *nth += 1; - if ((*nth % 
user_config.filter_nth_packet) == 0) { - return true; - } - return false; -} - #if (CONFIG_HOOKS_DEQUEUE || CONFIG_HOOKS_ENQUEUE) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, - u64 *cgroup_id) + u64 *cgroup_id, const enum netstacklat_hook hook) { if (!filter_nonempty_sockqueue(sk)) return false; @@ -482,7 +484,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_cgroup(cgroup_id)) return false; - if (!filter_nth_packet()) + if (!filter_nth_packet(hook)) return false; return true; @@ -583,15 +585,16 @@ SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { + const enum netstacklat_hook hook = NETSTACKLAT_HOOK_TCP_SOCK_READ; u64 cgroup_id = 0; - if (!filter_socket(sk, NULL, &cgroup_id)) + if (!filter_socket(sk, NULL, &cgroup_id, hook)) return 0; struct timespec64 *ts = &tss->ts[0]; record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, - NETSTACKLAT_HOOK_TCP_SOCK_READ, cgroup_id); + hook, cgroup_id); return 0; } @@ -599,13 +602,13 @@ SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) { + const enum netstacklat_hook hook = NETSTACKLAT_HOOK_UDP_SOCK_READ; u64 cgroup_id = 0; - if (!filter_socket(sk, skb, &cgroup_id)) + if (!filter_socket(sk, skb, &cgroup_id, hook)) return 0; - record_socket_latency(sk, skb, skb->tstamp, - NETSTACKLAT_HOOK_UDP_SOCK_READ, cgroup_id); + record_socket_latency(sk, skb, skb->tstamp, hook, cgroup_id); return 0; } #endif /* CONFIG_HOOKS_DEQUEUE */ From 45cd60737ada129d7bf3a708dcb8f65fb9d8046f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 16:05:50 +0200 Subject: [PATCH 29/46] netstacklat: disable UDP hooks via ugly ifdefs For production usage, we want the ability to disable the UDP hooks. Introduce CONFIG_xxx_HOOKS for IP, UDP and TCP. 
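The nth-packet down-sampling filter added a couple of patches above recommends a power-of-two value. A small sketch (ordinary userspace C, values illustrative) of why that matters: for a power-of-two divisor the modulo test is equivalent to a bit-mask, which the compiler can emit instead of a division in the per-packet path when the divisor is a compile-time constant.

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                const uint64_t nth = 32;        /* power of two, as the config comment advises */
                unsigned int sampled = 0;
                uint64_t cnt;

                for (cnt = 1; cnt <= 1024; cnt++) {
                        /* for a power-of-two nth, cnt % nth equals cnt & (nth - 1),
                         * so a mask can replace the division */
                        assert((cnt % nth) == (cnt & (nth - 1)));
                        if ((cnt % nth) == 0)
                                sampled++;
                }
                /* every 32nd event recorded, i.e. a 1/32 = 3.125% sampling rate */
                printf("sampled %u of 1024 events\n", sampled);
                return 0;
        }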
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 03dd51fb..411fab40 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -45,6 +45,10 @@ const struct netstacklat_bpf_config user_config = { //#define CONFIG_HOOKS_ENQUEUE 1 #undef CONFIG_HOOKS_ENQUEUE #define CONFIG_HOOKS_DEQUEUE 1 +#define CONFIG_ENABLE_IP_HOOKS 1 +#define CONFIG_ENABLE_TCP_HOOKS 1 +//#define CONFIG_ENABLE_UDP_HOOKS 1 + /* Allows to compile-time disable ifindex map as YAML cannot conf this */ //#define CONFIG_IFINDEX_FILTER_MAP 1 @@ -517,6 +521,7 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, } #ifdef CONFIG_HOOKS_EARLY_RCV +# ifdef CONFIG_ENABLE_IP_HOOKS SEC("fentry/ip_rcv_core") int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, void *tp, void *res, bool compat_mode) @@ -532,7 +537,9 @@ int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_IP_RCV); return 0; } +# endif /* CONFIG_ENABLE_IP_HOOKS */ +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fentry/tcp_v4_rcv") int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) { @@ -546,7 +553,9 @@ int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_TCP_START); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fentry/udp_rcv") int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) { @@ -560,16 +569,20 @@ int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) record_skb_latency(skb, NULL, NETSTACKLAT_HOOK_UDP_START); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_EARLY_RCV */ #ifdef CONFIG_HOOKS_ENQUEUE +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fexit/tcp_queue_rcv") int BPF_PROG(netstacklat_tcp_queue_rcv, struct sock *sk, struct sk_buff *skb) { record_skb_latency(skb, sk, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fexit/__udp_enqueue_schedule_skb") int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, struct sk_buff *skb, int retval) @@ -578,9 +591,11 @@ int BPF_PROG(netstacklat_udp_enqueue_schedule_skb, struct sock *sk, record_skb_latency(skb, sk, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_ENQUEUE */ #ifdef CONFIG_HOOKS_DEQUEUE +# ifdef CONFIG_ENABLE_TCP_HOOKS SEC("fentry/tcp_recv_timestamp") int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) @@ -597,7 +612,9 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, hook, cgroup_id); return 0; } +# endif /* CONFIG_ENABLE_TCP_HOOKS */ +# ifdef CONFIG_ENABLE_UDP_HOOKS SEC("fentry/skb_consume_udp") int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, int len) @@ -611,4 +628,5 @@ int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, record_socket_latency(sk, skb, skb->tstamp, hook, cgroup_id); return 0; } +# endif /* CONFIG_ENABLE_UDP_HOOKS */ #endif /* CONFIG_HOOKS_DEQUEUE */ From 189c61f7c953f3952fe34e1519060e1c43b13bca Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 17:14:38 +0200 Subject: [PATCH 30/46] netstacklat: change time resoultion to usecs For production we need to reduce the number of Prometheus buckets metric. 
As the dequeue hooks are unlikely to see below usecs latencies we reduce the resolution to usecs. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- examples/netstacklat.h | 7 ++++++- examples/netstacklat.yaml | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 411fab40..785266d8 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -140,7 +140,7 @@ static ktime_t time_since(ktime_t tstamp) if (tstamp > now) return -1; - return now - tstamp; + return (now - tstamp) / LATENCY_SCALE; } /* Determine if ebpf_exporter macro or local C implementation is used */ diff --git a/examples/netstacklat.h b/examples/netstacklat.h index f4009a44..0e30da60 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -2,7 +2,12 @@ #ifndef NETSTACKLAT_H #define NETSTACKLAT_H -#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s +/* To reduce Prometheus buckets metric reduce/scale latency time resolution. + * This LATENCY_SCALE is connected to the YAML bucket_multiplier config. + */ +#define LATENCY_SCALE 1000UL + +#define HIST_MAX_LATENCY_SLOT 24 // ( 2^24 ns / 1000) usecs -> ~16.7s /* * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key" * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index d59d3b6b..4cc3cfab 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -4,8 +4,8 @@ metrics: help: Latency for packets (skbs) to reach various points in the kernel network stack bucket_type: exp2 bucket_min: 0 - bucket_max: 34 - bucket_multiplier: 0.000000001 # nanoseconds to seconds + bucket_max: 24 + bucket_multiplier: 0.000001 # microseconds to seconds labels: - name: cgroup size: 8 From f0a947de4a7883f68e40ae447db05f4d4661c559 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 18:52:30 +0200 Subject: [PATCH 31/46] netstacklat: fix hash size for netstack_latency_seconds Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 785266d8..f7003857 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -77,10 +77,10 @@ struct sk_buff___old { */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ #define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ -#define N_IFACES 64 /* On prod only interested in ext0 and vlan100@ext0 */ +#define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); - __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES * 64); + __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); __type(key, struct hist_key); __type(value, u64); } netstack_latency_seconds SEC(".maps"); From 0ff2c4160036c792aed581b98c72f4e7a5a8b810 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 27 Aug 2025 19:05:03 +0200 Subject: [PATCH 32/46] netstacklat: remove some tabs and change comment style When importing this into ebpf_exporter we need to reformat the C-code, which is done via command line: clang-format -i configs/netstacklat.{h,bpf.c} But it doesn't convert these macros and comments. 
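To make the earlier time-resolution change concrete, a small sketch (plain C, not the program's own bucketing code) of what dividing by LATENCY_SCALE does: samples are bucketed in microseconds, the YAML bucket_multiplier of 0.000001 converts bucket boundaries back to seconds, and slot 24 tops out at 2^24 us, the ~16.7s noted in the header comment. The floor-log2 helper here is a simplification; the exporter's own bucketing may round differently.

        #include <stdio.h>

        #define LATENCY_SCALE 1000UL
        #define HIST_MAX_LATENCY_SLOT 24

        /* floor(log2()) via a plain loop; illustrative only */
        static unsigned int log2_u64(unsigned long long v)
        {
                unsigned int b = 0;

                while (v >>= 1)
                        b++;
                return b;
        }

        int main(void)
        {
                unsigned long long latency_ns = 250000;                 /* 250 us */
                unsigned long long scaled = latency_ns / LATENCY_SCALE; /* 250 */

                printf("scaled sample: %llu us -> around bucket %u\n",
                       scaled, log2_u64(scaled));
                printf("max measurable: %.2f s\n",
                       (double)(1ULL << HIST_MAX_LATENCY_SLOT) / 1e6);
                return 0;
        }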
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index f7003857..98fa547b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -11,7 +11,6 @@ * compiler and BPF verifier do dead-code elimination. */ #include -//#include #include #include @@ -40,23 +39,23 @@ const struct netstacklat_bpf_config user_config = { }; /* This provide easy way compile-time to disable some hooks */ -//#define CONFIG_HOOKS_EARLY_RCV 1 -#undef CONFIG_HOOKS_EARLY_RCV -//#define CONFIG_HOOKS_ENQUEUE 1 -#undef CONFIG_HOOKS_ENQUEUE -#define CONFIG_HOOKS_DEQUEUE 1 -#define CONFIG_ENABLE_IP_HOOKS 1 -#define CONFIG_ENABLE_TCP_HOOKS 1 -//#define CONFIG_ENABLE_UDP_HOOKS 1 +/* #define CONFIG_HOOKS_EARLY_RCV 1 */ +#undef CONFIG_HOOKS_EARLY_RCV +/* #define CONFIG_HOOKS_ENQUEUE 1 */ +#undef CONFIG_HOOKS_ENQUEUE +#define CONFIG_HOOKS_DEQUEUE 1 +#define CONFIG_ENABLE_IP_HOOKS 1 +#define CONFIG_ENABLE_TCP_HOOKS 1 +/* #define CONFIG_ENABLE_UDP_HOOKS 1 */ /* Allows to compile-time disable ifindex map as YAML cannot conf this */ -//#define CONFIG_IFINDEX_FILTER_MAP 1 -#undef CONFIG_IFINDEX_FILTER_MAP +/* #define CONFIG_IFINDEX_FILTER_MAP 1 */ +#undef CONFIG_IFINDEX_FILTER_MAP /* Allows to compile-time disable PID filter map as it is very large */ -//#define CONFIG_PID_FILTER_MAP 1 -#undef CONFIG_PID_FILTER_MAP +/* #define CONFIG_PID_FILTER_MAP 1 */ +#undef CONFIG_PID_FILTER_MAP /* * Alternative definition of sk_buff to handle renaming of the field @@ -104,7 +103,7 @@ struct { #endif /* Eval two different cgroup_id_map types*/ -//#define CONFIG_CGRP_STORAGE 1 +/* #define CONFIG_CGRP_STORAGE 1 */ #ifdef CONFIG_CGRP_STORAGE struct { __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); /* type: cgrp_storage */ From b54fdbf1b5c5e5d4fabe9bdd058ee665d83e39fb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 28 Aug 2025 09:44:15 +0200 Subject: [PATCH 33/46] netstacklat: add reminder to update N_CGROUPS Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 4cc3cfab..657230b5 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -37,6 +37,7 @@ metrics: decoders: - name: uint +# Remember to update #define N_CGROUPS in code when adding more matches cgroup_id_map: name: netstack_cgroupfilter type: hash From e813266b4e522c606d58a530b4c509ebf1516bf5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 28 Aug 2025 09:59:31 +0200 Subject: [PATCH 34/46] netstacklat: reduce hash size for netstack_latency_seconds We are currently only using a single hook so change N_HOOKS that is used in the max_entries calc of netstack_latency_seconds. Detect if other hooks gets enabled and make compile fail. 
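For reference, the max_entries arithmetic behind the two sizing patches above, assuming HIST_NBUCKETS is HIST_MAX_LATENCY_SLOT + 2 (one slot per power-of-two bucket plus the sum bucket), as the header comment suggests; that assumption is the only value not visible in the diffs. The map is a PERCPU_HASH, so this entry count is replicated per CPU.

        #include <stdio.h>

        #define HIST_MAX_LATENCY_SLOT 24
        #define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2)  /* assumption, see note above */
        #define N_HOOKS 1       /* only tcp-socket-read left enabled */
        #define N_CGROUPS 2
        #define N_IFACES 6

        int main(void)
        {
                printf("max_entries = %d\n",
                       HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); /* 312 */
                return 0;
        }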
Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 98fa547b..5a07ab87 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -48,7 +48,6 @@ const struct netstacklat_bpf_config user_config = { #define CONFIG_ENABLE_TCP_HOOKS 1 /* #define CONFIG_ENABLE_UDP_HOOKS 1 */ - /* Allows to compile-time disable ifindex map as YAML cannot conf this */ /* #define CONFIG_IFINDEX_FILTER_MAP 1 */ #undef CONFIG_IFINDEX_FILTER_MAP @@ -72,11 +71,15 @@ struct sk_buff___old { /* NOTICE: max_entries need to be adjusted based on maximum * number of cgroups and ifindex (that are "groupby" collecting) - * and "enabled" hooks (as we want to disable some) + * and "enabled" hooks. */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ -#define N_HOOKS NETSTACKLAT_N_HOOKS /* Keep it same until we disable some */ #define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ +#define N_HOOKS 1 +#if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE || CONFIG_ENABLE_UDP_HOOKS) +#err "Please update N_HOOKS" +#endif + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); From da2d60942ce7b834daa814ac8482c6765e87c338 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 10:53:22 +0200 Subject: [PATCH 35/46] netstacklat: atomic counter for filter_nth_packet We unfortunately need atomic counter update for the nth-filter. This is because hooks like tcp-socket-read runs outside the socket lock in a preempt/migrate-able user context. We don't need accurate nth-counter across CPU, as this is just a down-sampling mechanism. Thus, we keep the PERCPU array map and have nth-counter on a per CPU basis. The trick here is that in most cases the counter is only used by the current running CPU, and the cache-line will mostly be in a cache coherency Exclusive/Modified (MOESI) state, which will cost less when doing atomic updates. Manually testing on production showed 7ns runtime increase (before 150.88 ns, after 157.67 ns). Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 5a07ab87..f5c97063 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -234,6 +234,7 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) static inline bool filter_nth_packet(const enum netstacklat_hook hook) { u32 key = hook; + u64 pkt_cnt; u64 *nth; /* Zero and one means disabled */ @@ -244,8 +245,12 @@ static inline bool filter_nth_packet(const enum netstacklat_hook hook) if (!nth) return false; - *nth += 1; - if ((*nth % user_config.filter_nth_packet) == 0) { + /* The hooks (like tcp-socket-read) runs outside the socket lock in a + * preempt/migrate-able user context. Thus, atomic updates are needed + * for correctness, but keep PERCPU map to limit cache-line bouncing. 
+ */ + pkt_cnt = __sync_fetch_and_add(nth, 1); + if ((pkt_cnt % user_config.filter_nth_packet) == 0) { return true; } return false; From e3a5f589fa9227792081d4695ec297b5b05bc5aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:03:47 +0200 Subject: [PATCH 36/46] netstacklat: Experiment with 2 nth packet sampling Sample every 2-nth packet (50%) - Overhead: 6039419756 / 32219314 = 187.44 ns Compared to local atomic-nth overead: 157.67 ns - approx 30 ns extra cost to sample 50% vs 3.12% Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index f5c97063..3cf786ea 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 32, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 2, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, From 2e7b39f2cc0cca3701f009455df46d03fd44c6e3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:31:57 +0200 Subject: [PATCH 37/46] netstacklat: Experiment with 4 nth packet sampling name netstacklat_tcp_recv_timestamp - run_time_ns 17510185954 run_cnt 101083454 = 173.23 ns Sample every 4-nth packet (25%) - Overhead: 173.23 ns - Compared to nth-2 (187.44 ns) saved 14.21 ns (187.44-173.23) - Compared to nth-32 (157.67 ns) cost 15.56 ns more (173.23-157.67) Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 3cf786ea..21791d52 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 2, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 4, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, From f5e1baa751444cb43729efec4732e71228fece4c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 13:08:38 +0200 Subject: [PATCH 38/46] netstacklat: experiment with disabling nth-filter name netstacklat_tcp_recv_timestamp - run_time_ns 24383044912 run_cnt 121125888 = 201.30 ns Compared to - nth-2 : 186 ns -> +15 ns - nth-4 : 173 ns -> +28 ns - nth-32 : 157 ns -> +44 ns Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 21791d52..efa0c8b1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -29,7 +29,7 @@ const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, .filter_min_queue_len = 0, /* zero means filter is inactive */ - .filter_nth_packet = 4, /* reduce recorded event to every nth packet, use power-of-2 */ + .filter_nth_packet = 0, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, 
.filter_ifindex = true, .filter_cgroup = true, From febd9b7225bab56d058cf841e6d97a5224a5b47d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 4 Sep 2025 14:10:39 +0200 Subject: [PATCH 39/46] netstacklat: disable user_config.groupby_ifindex Because production have too little traffic on the internal interface that latency stats becomes unusable. Via CONFIG_GROUPBY_IFINDEX also remove this from the hist_key and note that YAML file also needs adjustments. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 11 ++++++++++- examples/netstacklat.h | 9 ++++++++- examples/netstacklat.yaml | 15 ++++++++------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index efa0c8b1..a58b771a 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -34,7 +34,7 @@ const struct netstacklat_bpf_config user_config = { .filter_ifindex = true, .filter_cgroup = true, .filter_nonempty_sockqueue = false, - .groupby_ifindex = true, + .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, }; @@ -74,8 +74,13 @@ struct sk_buff___old { * and "enabled" hooks. */ #define N_CGROUPS 2 /* depend on cgroup_id_map matches in YAML config*/ +#ifdef CONFIG_GROUPBY_IFINDEX #define N_IFACES 6 /* On prod only interested in ext0 and vlan100@ext0 */ +#else +#define N_IFACES 1 /* With groupby_ifindex==false */ +#endif #define N_HOOKS 1 + #if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE || CONFIG_ENABLE_UDP_HOOKS) #err "Please update N_HOOKS" #endif @@ -344,8 +349,10 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta if (!filter_nth_packet(hook)) return; +#ifdef CONFIG_GROUPBY_IFINDEX if (user_config.groupby_ifindex) key.ifindex = ifindex; +#endif _record_latency_since(skb->tstamp, key); } @@ -519,8 +526,10 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb, if (!filter_network_ns(skb, sk)) return; +#ifdef CONFIG_GROUPBY_IFINDEX if (user_config.groupby_ifindex) key.ifindex = ifindex; +#endif if (user_config.groupby_cgroup) key.cgroup = cgroup_id; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 0e30da60..bbe8ad7d 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -59,6 +59,11 @@ enum netstacklat_hook { NETSTACKLAT_N_HOOKS, }; +/* Disabling user_config.groupby_ifindex requires modifying hist_key and YAML + */ +//#define CONFIG_GROUPBY_IFINDEX 1 +#undef CONFIG_GROUPBY_IFINDEX + /* * Key used for the histogram map * To be compatible with ebpf-exporter, all histograms need a key struct whose final @@ -66,10 +71,12 @@ enum netstacklat_hook { */ struct hist_key { __u64 cgroup; +#ifdef CONFIG_GROUPBY_IFINDEX __u32 ifindex; +#endif __u16 hook; // need well defined size for ebpf-exporter to decode __u16 bucket; // needs to be last to be compatible with ebpf-exporter -}; +} __attribute__((packed)); struct netstacklat_bpf_config { __u32 network_ns; diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index 657230b5..ec98efc6 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -12,13 +12,14 @@ metrics: decoders: - name: uint - name: cgroup - - name: iface - size: 4 - decoders: - # If including output from a different network namespace than ebpf-exporter - # you probably just want to decode as a uint (ifindex) instead - # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others - - name: ifname +# See: 
CONFIG_GROUPBY_IFINDEX +# - name: iface +# size: 4 +# decoders: +# # If including output from a different network namespace than ebpf-exporter +# # you probably just want to decode as a uint (ifindex) instead +# # - name: uint # For the ifname decoder you apparently don't first need a uint decoder like the others +# - name: ifname - name: hook size: 2 decoders: From 10a331e62db90a22fe5437a90676e3d53e56d142 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 14:41:42 +0200 Subject: [PATCH 40/46] netstacklat: minor sync with upstream bpf-examples version The upstream version of netstacklat that we are based on got merged see PR#129. https://github.com/xdp-project/bpf-examples/pull/129 Some adjustments were made, so lets sync with these to avoid diverting too much from upstream. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 19 ++++++++----------- examples/netstacklat.h | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index a58b771a..5199f652 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -19,6 +19,8 @@ #include "netstacklat.h" #include "bits.bpf.h" +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + char LICENSE[] SEC("license") = "GPL"; /* The ebpf_exporter variant of netstacklat is not runtime configurable at @@ -28,7 +30,7 @@ char LICENSE[] SEC("license") = "GPL"; const __s64 TAI_OFFSET = (37LL * NS_PER_S); const struct netstacklat_bpf_config user_config = { .network_ns = 0, - .filter_min_queue_len = 0, /* zero means filter is inactive */ + .filter_min_sockqueue_len = 0, /* zero means filter is inactive */ .filter_nth_packet = 0, /* reduce recorded event to every nth packet, use power-of-2 */ .filter_pid = false, .filter_ifindex = true, @@ -122,7 +124,7 @@ struct { #else struct { __uint(type, BPF_MAP_TYPE_HASH); /* type: hash */ - __uint(max_entries, MAX_TRACKED_CGROUPS); + __uint(max_entries, MAX_PARSED_CGROUPS); __type(key, u64); __type(value, u64); } netstack_cgroupfilter SEC(".maps"); @@ -305,9 +307,7 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk) if (user_config.network_ns == 0) return true; - u32 ns = get_network_ns(skb, sk); - - return ns == user_config.network_ns; + return get_network_ns(skb, sk) == user_config.network_ns; } #if (CONFIG_HOOKS_EARLY_RCV || CONFIG_HOOKS_ENQUEUE) @@ -372,7 +372,6 @@ static bool filter_pid(u32 pid) return false; return *pid_ok > 0; - } #endif /* CONFIG_PID_FILTER_MAP */ @@ -425,8 +424,6 @@ static bool filter_current_task() return ok; } -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - /** * skb_queue_empty - check if a queue is empty * @list: queue head @@ -469,9 +466,9 @@ static inline __u32 sk_queue_len(const struct sk_buff_head *list_) return READ_ONCE(list_->qlen); } -static bool filter_min_queue_len(struct sock *sk) +static bool filter_min_sockqueue_len(struct sock *sk) { - const u32 min_qlen = user_config.filter_min_queue_len; + const u32 min_qlen = user_config.filter_min_sockqueue_len; if (min_qlen == 0) return true; @@ -496,7 +493,7 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, if (!filter_nonempty_sockqueue(sk)) return false; - if (!filter_min_queue_len(sk)) + if (!filter_min_sockqueue_len(sk)) return false; if (!filter_cgroup(cgroup_id)) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index bbe8ad7d..bfe4f7ea 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -23,7 +23,7 @@ // The highest 
ifindex we expect to encounter #define IFINDEX_MAX 16384 // The maximum number of different cgroups we can filter for -#define MAX_TRACKED_CGROUPS 4096 +#define MAX_PARSED_CGROUPS 4096 #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) @@ -80,7 +80,7 @@ struct hist_key { struct netstacklat_bpf_config { __u32 network_ns; - __u32 filter_min_queue_len; + __u32 filter_min_sockqueue_len; __u64 filter_nth_packet; bool filter_pid; bool filter_ifindex; From 7e2c734b2587ba555d2d183be3ea5c4968b98302 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 14:49:19 +0200 Subject: [PATCH 41/46] netstacklat: upstream removed filter_nonempty_sockqueue As the filter_min_sockqueue_len can replaced it. This was also part of PR#129 merge, but it makes it easier to review, to keep this in a seperate commit. https://github.com/xdp-project/bpf-examples/pull/129 Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 32 -------------------------------- examples/netstacklat.h | 1 - 2 files changed, 33 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 5199f652..51200479 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -35,7 +35,6 @@ const struct netstacklat_bpf_config user_config = { .filter_pid = false, .filter_ifindex = true, .filter_cgroup = true, - .filter_nonempty_sockqueue = false, .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, }; @@ -424,39 +423,11 @@ static bool filter_current_task() return ok; } -/** - * skb_queue_empty - check if a queue is empty - * @list: queue head - * - * Returns true if the queue is empty, false otherwise. - * - * Copied from /include/linux/skbuff.h - */ -static inline int skb_queue_empty(const struct sk_buff_head *list) -{ - return READ_ONCE(list->next) == (const struct sk_buff *)list; -} - static inline bool sk_backlog_empty(const struct sock *sk) { return READ_ONCE(sk->sk_backlog.tail) == NULL; } -static bool filter_nonempty_sockqueue(struct sock *sk) -{ - if (!user_config.filter_nonempty_sockqueue) - return true; - - if (!skb_queue_empty(&sk->sk_receive_queue)) - return true; - - /* Packets can also be on the sk_backlog */ - if (!sk_backlog_empty(sk)) - return true; - - return false; -} - /* To lower runtime overhead, skip recording timestamps for sockets with very * few packets. Use sk_buff_head->qlen to see if e.g. 
queue have more than 2 * elements @@ -490,9 +461,6 @@ static bool filter_min_sockqueue_len(struct sock *sk) static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, u64 *cgroup_id, const enum netstacklat_hook hook) { - if (!filter_nonempty_sockqueue(sk)) - return false; - if (!filter_min_sockqueue_len(sk)) return false; diff --git a/examples/netstacklat.h b/examples/netstacklat.h index bfe4f7ea..d4a40f7a 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -85,7 +85,6 @@ struct netstacklat_bpf_config { bool filter_pid; bool filter_ifindex; bool filter_cgroup; - bool filter_nonempty_sockqueue; bool groupby_ifindex; bool groupby_cgroup; }; From b9cbe809d077936bea3d2924fc004b7fd16adcfb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 5 Sep 2025 15:39:55 +0200 Subject: [PATCH 42/46] netstacklat: comply with clang format requirements Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/netstacklat.h b/examples/netstacklat.h index d4a40f7a..88b9b350 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -61,7 +61,7 @@ enum netstacklat_hook { /* Disabling user_config.groupby_ifindex requires modifying hist_key and YAML */ -//#define CONFIG_GROUPBY_IFINDEX 1 +/* #define CONFIG_GROUPBY_IFINDEX 1 */ #undef CONFIG_GROUPBY_IFINDEX /* From ba856d8416fbf40fcb77761eb08b0a2203270e8e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 16 Sep 2025 16:29:15 +0200 Subject: [PATCH 43/46] netstacklat: relax ifindex filter due to production Lacking a YAML ebpf_exporter config for selecting iface names we hard-coded ifindex, but some production servers have higher ifindex for vlan100. Relax ifindex range as a workaround. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 51200479..d99b9f2f 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -280,8 +280,10 @@ static bool filter_ifindex(u32 ifindex) /* Hack for production: * - We want to exclude 'lo' which have ifindex==1. * - We want to filter on ext0 (ifindex 2) and vlan100@ext0 (ifindex 5) + * unfortunately ifindex'es are not stable, some production metals have + * ifindex==6 for vlan100@link0. Relax filter until adding YAML config. */ - if (ifindex > 1 && ifindex < 6) + if (ifindex > 1 && ifindex < 12) return true; return false; From 885f1016902113ebefc485130af5e44b32060704 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Wed, 29 Oct 2025 16:09:44 +0100 Subject: [PATCH 44/46] netstacklat: Exclude TCP reads for HOL blocked segments The 'tcp-socket-read' currently reports the latency for the skb containing the last TCP segment read from the socket. However, this segment might have been head of line (HOL) blocked by a previous segment missing. In this case, netstacklat's reported latency will include HOL blocking periods that is dependent on external factors (such as network packet loss, and network latency impacts retransmission time). As netstacklat is primarily intended to identify issues within the local host (in the network stack or receiving applications), by default filter out any socket reads were the last read SKB might have experienced HOL-blocking. Add the new -y/--include-tcp-hol-delay option to retain the old behavior of reporting latency for all reads, including those that are HOL-blocked. 
This may be useful in some scenarios where you still want to be aware of latency issues caused by HOL-blocking, even though it is caused by external components. For example, in a data center context where you have full control over the network, it may still be relevant to monitor HOL-blocking caused by the network.

To exclude HOL-blocked reads, detect if any new ooo-segments have arrived by checking for differences in the number of ooo-packets in tcp_sock->rcv_ooopack. If any new ooo-segments have arrived, exclude the latency sample from the current read and set a limit for the next safe sequence number to read, where the current ooo-packets must have been passed so segments can no longer be HOL-blocked. If there are skbs in the ooo-queue, set the limit to the end of the ooo-queue. Otherwise, set the limit to the current rcv_nxt (as, if the ooo-queue is empty, the detected ooo-segments must already have been merged into the receive queue and rcv_nxt must have advanced past them). If the read is past the safe sequence limit and no new ooo-segments have arrived, it is safe to start including the latency samples again.

For sockets where some ooo-segments have been observed, keep the ooo-range state in socket storage (BPF_MAP_TYPE_SK_STORAGE). Skip protecting this state with a spin-lock, as it should only be concurrently accessed if there are concurrent reads on the same TCP socket, which is assumed to be very rare, as applications attempting that cannot know which part of the data each of their concurrent reads will get.

There are some scenarios that may cause this ooo-filtering to fail.

- If multiple reads are done on the socket concurrently, we may not correctly track the last read byte. The kernel does not hold the socket lock at the time our hooked function tcp_recv_timestamp() runs. If two reads are done in parallel, it is therefore possible that for both reads we check the last read byte (tcp_sock.copied_seq) after the second read has updated it. We may then incorrectly conclude that the first read was ahead of the ooo-range when it was not, and record its latency when we should have excluded it. In practice I believe this issue should be quite rare, as most applications will probably not attempt to perform multiple concurrent reads on a single connected TCP socket in parallel (as then you cannot know which part of the payload the parallel reads will return).

- As tcp_recv_timestamp() runs outside of the socket lock, the various state members we access may be updated concurrently while we are attempting to read them. An especially problematic one is tcp_sock.ooo_last_skb, which keeps a pointer to an SKB that is only valid while the ooo-queue is non-empty. It is possible that between our check that the ooo-queue is non-empty and our following of the ooo_last_skb pointer, the ooo-queue is cleared and the ooo_last_skb pointer ends up pointing towards a freed SKB. If the socket members we access are updated before or while we read them, it can break the filtering in numerous ways, e.g. including samples that should have been excluded (due to e.g. copied_seq being updated before our read) or excluding a large amount of valid samples (due to e.g. setting a sequence limit based on garbage in a freed SKB).
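The diff below compares TCP sequence numbers with a u32_lt() helper based on the kernel's before(). As a quick sanity check of why the signed-difference cast handles 32-bit wrap-around, a standalone sketch (illustrative test values only):

        #include <assert.h>
        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        /* same signed-difference trick as the helper added below */
        static bool u32_lt(uint32_t a, uint32_t b)
        {
                return (int32_t)(a - b) < 0;
        }

        int main(void)
        {
                /* near a wrap: 0xfffffff0 is "before" 0x10 in sequence space,
                 * even though a plain unsigned comparison says otherwise */
                assert(u32_lt(0xfffffff0u, 0x10u));

                /* the ordinary cases still behave as expected */
                assert(u32_lt(100, 200));
                assert(!u32_lt(200, 100));

                printf("u32 wrap-around comparisons behave as expected\n");
                return 0;
        }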
Signed-off-by: Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 128 +++++++++++++++++++++++++++++++++++++ examples/netstacklat.h | 1 + 2 files changed, 129 insertions(+) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index d99b9f2f..58eb88a1 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -21,6 +21,10 @@ #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +// Mimic macros from /include/net/tcp.h +#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + char LICENSE[] SEC("license") = "GPL"; /* The ebpf_exporter variant of netstacklat is not runtime configurable at @@ -37,6 +41,7 @@ const struct netstacklat_bpf_config user_config = { .filter_cgroup = true, .groupby_ifindex = false, /* If true also define CONFIG_GROUPBY_IFINDEX */ .groupby_cgroup = true, + .include_hol_blocked = false, }; /* This provide easy way compile-time to disable some hooks */ @@ -86,6 +91,13 @@ struct sk_buff___old { #err "Please update N_HOOKS" #endif +struct tcp_sock_ooo_range { + u32 prev_n_ooopkts; + u32 ooo_seq_end; + /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ + bool active; +}; + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * N_HOOKS * N_CGROUPS * N_IFACES); @@ -151,6 +163,22 @@ static ktime_t time_since(ktime_t tstamp) return (now - tstamp) / LATENCY_SCALE; } +/* + * Is a < b considering u32 wrap around? + * Based on the before() function in /include/net/tcp.h + */ +static bool u32_lt(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tcp_sock_ooo_range); +} netstack_tcp_ooo_range SEC(".maps"); + /* Determine if ebpf_exporter macro or local C implementation is used */ #define CONFIG_MAP_MACROS 1 #ifdef CONFIG_MAP_MACROS @@ -476,6 +504,102 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, } #endif +static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) +{ + struct tcp_skb_cb cb; + u32 max_seq = 0; + int err = 0; + + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { + /* No ooo-segments currently in ooo-queue + * Any ooo-segments must already have been merged to the + * receive queue. Current rcv_nxt must therefore be ahead + * of all ooo-segments that have arrived until now. + */ + err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); + if (err) + bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", + err); + } else { + /* + * Some ooo-segments currently in ooo-queue + * Max out-of-order seq is given by the seq_end of the tail + * skb in the ooo-queue. 
+ */ + err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); + if (err) + bpf_printk( + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", + err); + max_seq = cb.end_seq; + } + + *seq = max_seq; + return err; +} + +static bool tcp_read_in_ooo_range(struct tcp_sock *tp, + struct tcp_sock_ooo_range *ooo_range) +{ + u32 read_seq; + int err; + + if (!ooo_range->active) + return false; + + err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); + if (err) { + bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + return true; // Assume we may be in ooo-range + } + + if (u32_lt(ooo_range->ooo_seq_end, read_seq)) { + ooo_range->active = false; + return false; + } else { + return true; + } +} + +static bool tcp_read_maybe_holblocked(struct sock *sk) +{ + struct tcp_sock_ooo_range *ooo_range; + struct tcp_sock *tp = tcp_sk(sk); + u32 n_ooopkts, nxt_seq; + int err; + + err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", + err); + return true; // Assume we may be in ooo-range + } + + if (n_ooopkts == 0) + return false; + + ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!ooo_range) { + bpf_printk( + "failed getting ooo-range socket storage for tcp socket"); + return true; // Assume we may be in ooo-range + } + + // Increase in ooo-packets since last - figure out next safe seq + if (n_ooopkts > ooo_range->prev_n_ooopkts) { + ooo_range->prev_n_ooopkts = n_ooopkts; + err = current_max_possible_ooo_seq(tp, &nxt_seq); + if (!err) { + ooo_range->ooo_seq_end = nxt_seq; + ooo_range->active = true; + } + return true; + } + + return tcp_read_in_ooo_range(tp, ooo_range); +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook, u64 cgroup_id) @@ -590,6 +714,10 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, return 0; struct timespec64 *ts = &tss->ts[0]; + + if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk)) + return 0; + record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, hook, cgroup_id); diff --git a/examples/netstacklat.h b/examples/netstacklat.h index 88b9b350..019d0fef 100644 --- a/examples/netstacklat.h +++ b/examples/netstacklat.h @@ -87,6 +87,7 @@ struct netstacklat_bpf_config { bool filter_cgroup; bool groupby_ifindex; bool groupby_cgroup; + bool include_hol_blocked; }; #endif From 1443f7bf1825cf729f05b3802b880c04daa08bee Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Wed, 29 Oct 2025 16:17:02 +0100 Subject: [PATCH 45/46] netstacklat: Add sanity check for out-of-order sequence The logic for excluding samples from TCP reads that may have been delayed by HOL blocking relies on reading a number of fields from the TCP socket outside of the socket lock. This may be prone to errors due to the socket state being updated at another place in the kernel while our eBPF program is running. To reduce the risk that a data race causes the filter to fail, add a sanity check for the maximum out of order sequence used to exclude future TCP reads from monitoring. The most problematic of the read fields in the tcp_sock is ooo_last_skb, as that is a pointer to another SKB rather than a direct value. This pointer is only valid as long as the out_of_order_queue is non-empty. 
Due to a data race, we may check that the ooo-queue is non-empty while there are still SKBs in it, then have the kernel clear out the ooo-queue, and finally attempt to read the ooo_last_skb pointer later when it is no longer valid (and may now point to a freed/recycled SKB). This may result in incorrect values being used for the sequence limit used to exclude future reads of ooo-segments. The faulty sequence limit may both cause reads of HOL-blocked segments to be included or the exclusion of an unnecessarily large amount of future reads (up to 2 GB). To reduce the risk that the garbage data from an invalid SKB is used, introduce two sanity checks for end_seq in the ooo_last_skb. First check if the sequence number is zero, if so assume it is invalid (even though it can be a valid sequence number). Even though we will get an error code if reading the data from this SKB fails altogether, we may still succeed reading from a no longer valid SKB, in which case there is a high risk the data will have been zeroed. If it's non-zero, also check that it is within the current receive window (if not, clamp it to the receive window). Signed-off-by: Simon Sundberg Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 63 +++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index 58eb88a1..baf58bd8 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -504,22 +504,56 @@ static __always_inline bool filter_socket(struct sock *sk, struct sk_buff *skb, } #endif +/* Get the current receive window end sequence for tp + * In the kernel receive window checks are done against + * tp->rcv_nxt + tcp_receive_window(tp). This function should give a compareable + * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher + */ +static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq) +{ + u32 rcv_wup, rcv_wnd, window = 0; + int err; + + err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err); + goto exit; + } + + err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err); + goto exit; + } + + window = rcv_wup + rcv_wnd; + if (u32_lt(window, rcv_nxt)) + window = rcv_nxt; + +exit: + *seq = window; + return err; +} + static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) { + u32 rcv_nxt, cur_rcv_window, max_seq = 0; struct tcp_skb_cb cb; - u32 max_seq = 0; int err = 0; + err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt); + if (err) { + bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err); + goto exit; + } + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { /* No ooo-segments currently in ooo-queue * Any ooo-segments must already have been merged to the * receive queue. Current rcv_nxt must therefore be ahead * of all ooo-segments that have arrived until now. */ - err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); - if (err) - bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", - err); + max_seq = rcv_nxt; } else { /* * Some ooo-segments currently in ooo-queue @@ -527,13 +561,28 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) * skb in the ooo-queue. 
*/ err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); - if (err) + if (err) { bpf_printk( "failed to read tcp_sock->ooo_last_skb->cb, err=%d", err); - max_seq = cb.end_seq; + goto exit; + } + + // Sanity check - ooo_last_skb->cb.end_seq within the receive window? + err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window); + if (err) + goto exit; + + /* While seq 0 can be a valid seq, consider it more likely to + * be the result of reading from an invalid SKB pointer + */ + if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq)) + max_seq = cur_rcv_window; + else + max_seq = cb.end_seq; } +exit: *seq = max_seq; return err; } From 347abc5be218a006378f35af81cee3b62152c875 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Nov 2025 17:42:10 +0100 Subject: [PATCH 46/46] netstacklat: convert bpf_printk to optional debug feature For production we need a way to disable any use of bpf_printk. To track errors in production introduce a map for counting these errors, as that will be exposed as a Prometheus counter naming it netstacklat_errors_total. The new "dbg" macro handled/hides if bpf_printk or counters are enabled. Signed-off-by: Jesper Dangaard Brouer --- examples/netstacklat.bpf.c | 76 ++++++++++++++++++++++++++++++++------ examples/netstacklat.yaml | 18 +++++++++ 2 files changed, 83 insertions(+), 11 deletions(-) diff --git a/examples/netstacklat.bpf.c b/examples/netstacklat.bpf.c index baf58bd8..23016e0b 100644 --- a/examples/netstacklat.bpf.c +++ b/examples/netstacklat.bpf.c @@ -265,6 +265,57 @@ static void record_latency_since(ktime_t tstamp, const struct hist_key *key) } #endif /* !CONFIG_MAP_MACROS */ +/* Debug facility to count errors */ +#define MAX_ERROR_TYPES 8 +enum error_types { + ERR_UNKNOWN = 0, + ERR_sk_storage = 1, + ERR_READ_TCP_rcv_wup = 2, + ERR_READ_TCP_rcv_wnd = 3, + ERR_READ_TCP_rcv_nxt = 4, + ERR_READ_TCP_last_skb_cb = 5, + ERR_READ_TCP_cp_seq = 6, + ERR_READ_TCP_rcv_ooopack = 7, +}; +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, MAX_ERROR_TYPES); + __type(key, u32); + __type(value, u64); +} netstacklat_errors_total SEC(".maps"); + +/* This provide easy way to disable debug feature for errors. + * Disabling this reduces BPF code size. + */ +#define CONFIG_TRACK_ERRORS 1 +/* #define CONFIG_PRINT_ERRORS 1 */ +#undef CONFIG_PRINT_ERRORS + +void record_errors(u32 err) +{ +#ifdef CONFIG_TRACK_ERRORS + u32 key = ERR_UNKNOWN; + + if (err < MAX_ERROR_TYPES) + key = err; + + increment_map_nosync(&netstacklat_errors_total, &key, 1); +#endif /* CONFIG_TRACK_ERRORS */ +} + +#ifdef CONFIG_PRINT_ERRORS +#define my_printk(fmt, ...) bpf_printk(fmt, ##__VA_ARGS__) +#else /* !CONFIG_PRINT_ERRORS */ +#define my_printk(fmt, ...) +#endif + +/* Debug macro that can be disabled compile time */ +#define dbg(__ERR_NR, fmt, ...) 
\ + ({ \ + record_errors(__ERR_NR); \ + my_printk(fmt, ##__VA_ARGS__); \ + }) + static inline bool filter_nth_packet(const enum netstacklat_hook hook) { u32 key = hook; @@ -516,13 +567,15 @@ static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq) err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup); if (err) { - bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err); + dbg(ERR_READ_TCP_rcv_wup, + "failed to read tcp_sock->rcv_wup, err=%d", err); goto exit; } err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd); if (err) { - bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err); + dbg(ERR_READ_TCP_rcv_wnd, + "failed to read tcp_sock->rcv_wnd, err=%d", err); goto exit; } @@ -543,7 +596,8 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt); if (err) { - bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err); + dbg(ERR_READ_TCP_rcv_nxt, + "failed reading tcp_sock->rcv_nxt, err=%d", err); goto exit; } @@ -562,9 +616,8 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) */ err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); if (err) { - bpf_printk( - "failed to read tcp_sock->ooo_last_skb->cb, err=%d", - err); + dbg(ERR_READ_TCP_last_skb_cb, + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", err); goto exit; } @@ -598,7 +651,8 @@ static bool tcp_read_in_ooo_range(struct tcp_sock *tp, err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); if (err) { - bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + dbg(ERR_READ_TCP_cp_seq, + "failed to read tcp_sock->copied_seq, err=%d", err); return true; // Assume we may be in ooo-range } @@ -619,8 +673,8 @@ static bool tcp_read_maybe_holblocked(struct sock *sk) err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); if (err) { - bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", - err); + dbg(ERR_READ_TCP_rcv_ooopack, + "failed to read tcp_sock->rcv_ooopack, err=%d\n", err); return true; // Assume we may be in ooo-range } @@ -630,8 +684,8 @@ static bool tcp_read_maybe_holblocked(struct sock *sk) ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, BPF_SK_STORAGE_GET_F_CREATE); if (!ooo_range) { - bpf_printk( - "failed getting ooo-range socket storage for tcp socket"); + dbg(ERR_sk_storage, + "failed getting ooo-range socket storage for tcp socket"); return true; // Assume we may be in ooo-range } diff --git a/examples/netstacklat.yaml b/examples/netstacklat.yaml index ec98efc6..45fe845f 100644 --- a/examples/netstacklat.yaml +++ b/examples/netstacklat.yaml @@ -37,6 +37,24 @@ metrics: size: 2 decoders: - name: uint + counters: + - name: netstacklat_errors_total + help: Counter for bpf_core_read errors in code (can be disabled in code) + labels: + - name: type + size: 4 + decoders: + - name: uint + - name: static_map + static_map: + 0: unknown + 1: err_sk_storage + 2: err_read_tcp_rcv_wup + 3: err_read_tcp_rcv_wnd + 4: err_read_tcp_rcv_nxt + 5: err_read_tcp_last_skb_cb + 6: err_read_tcp_cp_seq + 7: err_read_tcp_rcv_ooopack # Remember to update #define N_CGROUPS in code when adding more matches cgroup_id_map: