bpf: clat: rework to avoid pointer arithmetic

Avoid using pointer arithmetic in the BPF program, so that it requires
only CAP_BPF and not CAP_PERFMON. In this context "pointer arithmetic"
means adding a variable value to a packet pointer. This means that the
program no longer tries to parse variable-size headers (IPv4 options,
IPv6 extension headers). Those were already not supported before. It
also doesn't parse VLAN tags, but there should be no need for that. If
we use fixed offset, we can avoid using the parsing helpers from
libxdp.
This commit is contained in:
Beniamino Galvani 2025-10-25 10:45:46 +02:00
parent 161dc7774e
commit 2e751865ea
7 changed files with 93 additions and 140 deletions

View file

@ -60,11 +60,11 @@ variables:
#
# This is done by running `ci-fairy generate-template` and possibly bumping
# ".default_tag".
ALPINE_TAG: 'tag-9048a8c683b9'
CENTOS_TAG: 'tag-026f017b5a4a'
DEBIAN_TAG: 'tag-27883e6c662f'
FEDORA_TAG: 'tag-026f017b5a4a'
UBUNTU_TAG: 'tag-27883e6c662f'
ALPINE_TAG: 'tag-8e4bbc59695b'
CENTOS_TAG: 'tag-caf6673db1a7'
DEBIAN_TAG: 'tag-e394e8e726e1'
FEDORA_TAG: 'tag-caf6673db1a7'
UBUNTU_TAG: 'tag-e394e8e726e1'
ALPINE_EXEC: 'bash .gitlab-ci/alpine-install.sh'
CENTOS_EXEC: 'bash .gitlab-ci/fedora-install.sh'

View file

@ -32,7 +32,6 @@ apk add \
'libpsl-dev' \
'libsoup-dev' \
'libteam-dev' \
'libxdp-dev' \
'linux-headers' \
'meson' \
'mobile-broadband-provider-info' \

View file

@ -65,7 +65,6 @@ install \
libsystemd-dev \
libteam-dev \
libudev-dev \
libxdp-dev \
locales \
meson \
mobile-broadband-provider-info \

View file

@ -71,7 +71,6 @@ install \
libnvme-devel \
libselinux-devel \
libuuid-devel \
libxdp-devel \
meson \
mobile-broadband-provider-info-devel \
newt-devel \

View file

@ -303,7 +303,6 @@ BuildRequires: iproute
BuildRequires: iproute-tc
BuildRequires: libnvme-devel >= 1.5
BuildRequires: libbpf-devel
BuildRequires: libxdp-devel
BuildRequires: bpftool
Provides: %{name}-dispatcher%{?_isa} = %{epoch}:%{version}-%{release}

View file

@ -494,7 +494,6 @@ enable_clat = get_option('clat')
if enable_clat
libbpf = dependency('libbpf', version: '>= 0.1.0', required: false)
assert(libbpf.found(), 'You must have libbpf installed to build. Use -Dclat=false to disable use of it')
libxdp = dependency('libxdp', version: '>= 0.1.0', required: false)
endif
config_h.set10('HAVE_CLAT', enable_clat)

View file

@ -26,7 +26,6 @@
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>
#include "clat.h"
@ -89,7 +88,6 @@ update_l4_checksum(struct __sk_buff *skb,
int ip_type,
bool v4to6)
{
void *data = SKB_DATA(skb);
int flags = BPF_F_PSEUDO_HDR;
__u16 offset;
__u32 csum;
@ -99,13 +97,13 @@ update_l4_checksum(struct __sk_buff *skb,
void *to_ptr = &ip6h->saddr;
csum = bpf_csum_diff(from_ptr, 2 * sizeof(__u32), to_ptr, 2 * sizeof(struct in6_addr), 0);
offset = (void *) (iph + 1) - data;
offset = sizeof(struct ethhdr) + sizeof(struct iphdr);
} else {
void *from_ptr = &ip6h->saddr;
void *to_ptr = &iph->saddr;
csum = bpf_csum_diff(from_ptr, 2 * sizeof(struct in6_addr), to_ptr, 2 * sizeof(__u32), 0);
offset = (void *) (ip6h + 1) - data;
offset = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
}
switch (ip_type) {
@ -130,8 +128,7 @@ update_icmp_checksum(struct __sk_buff *skb,
void *icmp_after,
bool add)
{
void *data = SKB_DATA(skb);
struct icmpv6_pseudo ph = {.nh = IPPROTO_ICMPV6, .len = ip6h->payload_len};
struct icmpv6_pseudo ph = {.nh = IPPROTO_ICMPV6, .len = ip6h->payload_len};
__u16 h_before, h_after, offset;
__u32 csum, u_before, u_after;
@ -153,7 +150,8 @@ update_icmp_checksum(struct __sk_buff *skb,
add ? sizeof(ph) : 0,
0);
offset = ((void *) icmp_after - data) + 2;
offset = sizeof(struct ethhdr) + (add ? sizeof(struct iphdr) : sizeof(struct ipv6hdr)) + 2;
/* first two bytes of ICMP header, type and code */
h_before = *(__u16 *) icmp_before;
h_after = *(__u16 *) icmp_after;
@ -289,34 +287,27 @@ rewrite_icmp(struct iphdr *iph, struct ipv6hdr *ip6h, struct __sk_buff *skb)
/* ipv4 traffic in from application on this device, needs to be translated to v6 and sent to PLAT */
static int
clat_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
clat_handle_v4(struct __sk_buff *skb)
{
int ret = TC_ACT_OK;
void *data_end = SKB_DATA_END(skb);
void *data = SKB_DATA(skb);
int ip_type, iphdr_len, ip_offset;
struct in6_addr *dst_v6;
struct ipv6hdr *ip6h;
struct ipv6hdr dst_hdr = {
.version = 6,
int ret = TC_ACT_OK;
void *data_end = SKB_DATA_END(skb);
void *data = SKB_DATA(skb);
struct ipv6hdr *ip6h;
struct ipv6hdr dst_hdr = {
.version = 6,
};
struct iphdr *iph;
struct ethhdr *eth;
struct in_addr src_v4;
struct clat_v4_config_value *v4_config;
struct clat_v4_config_key v4_config_key;
ip_offset = (nh->pos - data) & 0x1fff;
ip_type = parse_iphdr(nh, data_end, &iph);
if (ip_type < 0)
iph = data + sizeof(struct ethhdr);
if (iph + 1 > data_end)
goto out;
src_v4.s_addr = iph->saddr;
v4_config_key.ifindex = skb->ifindex;
v4_config_key.local_v4 = src_v4;
v4_config_key.ifindex = skb->ifindex;
v4_config_key.local_v4.s_addr = iph->saddr;
v4_config = bpf_map_lookup_elem(&v4_config_map, &v4_config_key);
if (!v4_config) {
@ -337,28 +328,26 @@ clat_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
* out. The DF is the second-most-significant bit (as bit 0 is
* reserved).
*/
iphdr_len = iph->ihl * 4;
if (iphdr_len != sizeof(struct iphdr) || (iph->frag_off & ~bpf_htons(1 << 14))) {
if (iph->ihl != 5 || (iph->frag_off & ~bpf_htons(1 << 14))) {
DBG("v4: pkt src/dst %pI4/%pI4 has IP options or is fragmented, dropping\n",
&iph->daddr,
&iph->saddr);
goto out;
}
dst_v6 = &v4_config->pref64;
dst_v6->s6_addr32[3] = iph->daddr;
DBG("v4: Found mapping for dst %pI4 to %pI6c\n", &iph->daddr, dst_v6);
/* src v4 as last octet of clat address */
dst_hdr.saddr = v4_config->local_v6;
dst_hdr.daddr = *dst_v6;
dst_hdr.nexthdr = iph->protocol;
dst_hdr.hop_limit = iph->ttl;
dst_hdr.saddr = v4_config->local_v6;
dst_hdr.daddr = v4_config->pref64;
dst_hdr.daddr.s6_addr32[3] = iph->daddr;
dst_hdr.nexthdr = iph->protocol;
dst_hdr.hop_limit = iph->ttl;
/* weird definition in ipv6hdr */
dst_hdr.priority = (iph->tos & 0x70) >> 4;
dst_hdr.flow_lbl[0] = iph->tos << 4;
dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - iphdr_len);
dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - sizeof(struct iphdr));
DBG("v4: Found mapping for dst %pI4 to %pI6c\n", &iph->daddr, &dst_hdr.daddr);
switch (dst_hdr.nexthdr) {
case IPPROTO_ICMP:
@ -380,9 +369,12 @@ clat_handle_v4(struct __sk_buff *skb, struct hdr_cursor *nh)
data = SKB_DATA(skb);
data_end = SKB_DATA_END(skb);
eth = data;
ip6h = data + ip_offset;
if ((void *) (eth + 1) > data_end || (void *) (ip6h + 1) > data_end)
eth = data;
if (eth + 1 > data_end)
goto out;
ip6h = (void *) (eth + 1);
if (ip6h + 1 > data_end)
goto out;
eth->h_proto = bpf_htons(ETH_P_IPV6);
@ -402,24 +394,18 @@ csum_fold_helper(__u32 csum)
return ~sum;
}
static int clat_translate_v6(struct __sk_buff *skb,
struct hdr_cursor *nh,
void *data_end,
struct iphdr *dst_hdr_out,
bool depth);
static int
rewrite_icmpv6(struct ipv6hdr *ip6h,
struct __sk_buff *skb,
struct icmphdr **new_icmp_out,
struct hdr_cursor *nh)
rewrite_icmpv6(struct ipv6hdr *ip6h, struct __sk_buff *skb)
{
void *data_end = SKB_DATA_END(skb);
struct icmp6hdr *icmp6 = (void *) (ip6h + 1);
struct icmphdr icmp, *new_icmp;
__u32 mtu, ptr;
struct icmp6hdr *icmp6;
struct icmphdr icmp;
struct icmphdr *new_icmp;
__u32 mtu;
__u32 ptr;
if ((void *) (icmp6 + 1) > data_end)
icmp6 = (void *) (ip6h + 1);
if (icmp6 + 1 > data_end)
return -1;
new_icmp = (void *) icmp6;
@ -500,40 +486,33 @@ rewrite_icmpv6(struct ipv6hdr *ip6h,
return -1;
}
*new_icmp = icmp;
*new_icmp_out = new_icmp;
*new_icmp = icmp;
return 0;
}
/* ipv6 traffic from the PLAT, to be translated into ipv4 and sent to an application */
static int
clat_translate_v6(struct __sk_buff *skb,
struct hdr_cursor *nh,
void *data_end,
struct iphdr *dst_hdr_out,
bool depth)
clat_handle_v6(struct __sk_buff *skb)
{
struct in_addr src_v4;
int ip_type;
int ret = TC_ACT_OK;
void *data_end = SKB_DATA_END(skb);
void *data = SKB_DATA(skb);
struct ethhdr *eth;
struct iphdr *iph;
struct ipv6hdr *ip6h;
int ret = TC_ACT_OK;
struct icmphdr *new_icmp;
struct icmp6hdr old_icmp6;
struct iphdr dst_hdr = {
.version = 4,
.ihl = 5,
.frag_off = bpf_htons(1 << 14), /* set Don't Fragment bit */
};
struct clat_v6_config_value *v6_config;
struct clat_v6_config_key v6_config_key;
struct iphdr dst_hdr = {
.version = 4,
.ihl = 5,
.frag_off = bpf_htons(1 << 14), /* set Don't Fragment bit */
};
ip_type = parse_ip6hdr(nh, data_end, &ip6h);
if (ip_type < 0)
ip6h = data + sizeof(struct ethhdr);
if (ip6h + 1 > data_end)
goto out;
src_v4.s_addr = ip6h->saddr.s6_addr32[3];
v6_config_key.local_v6 = ip6h->daddr;
v6_config_key.pref64 = ip6h->saddr;
/* v6 pxlen is always 96 */
@ -554,35 +533,34 @@ clat_translate_v6(struct __sk_buff *skb,
*/
ret = TC_ACT_SHOT;
/* drop packets with IP options - parser skips options */
if (ip_type != ip6h->nexthdr) {
DBG("v6: dropping packet with IP options from %pI6c\n", &ip6h->saddr);
/* drop packets with extension headers */
if (ip6h->nexthdr != IPPROTO_TCP && ip6h->nexthdr != IPPROTO_UDP
&& ip6h->nexthdr != IPPROTO_ICMPV6)
goto out;
}
dst_hdr.daddr = v6_config->local_v4.s_addr;
dst_hdr.saddr = src_v4.s_addr;
dst_hdr.saddr = ip6h->saddr.s6_addr32[3];
dst_hdr.protocol = ip6h->nexthdr;
dst_hdr.ttl = ip6h->hop_limit;
dst_hdr.tos = ip6h->priority << 4 | (ip6h->flow_lbl[0] >> 4);
dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(dst_hdr));
dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + sizeof(struct iphdr));
switch (dst_hdr.protocol) {
case IPPROTO_ICMPV6:
{
struct icmphdr *new_icmp;
struct icmp6hdr old_icmp6;
new_icmp = (void *) (ip6h + 1);
if ((void *) (new_icmp + 1) > data_end)
if (new_icmp + 1 > data_end)
goto out;
old_icmp6 = *((struct icmp6hdr *) (void *) new_icmp);
if (rewrite_icmpv6(ip6h, skb, &new_icmp, nh))
if (rewrite_icmpv6(ip6h, skb))
goto out;
/* FIXME: also need to rewrite IP header embedded in ICMP error */
if (depth)
goto icmp_out;
if (!new_icmp)
goto icmp_out;
/* TODO: also need to rewrite IP header embedded in ICMP error */
if ((void *) (new_icmp + 1) > data_end)
goto icmp_out;
@ -591,6 +569,7 @@ clat_translate_v6(struct __sk_buff *skb,
icmp_out:
dst_hdr.protocol = IPPROTO_ICMP;
break;
}
case IPPROTO_TCP:
case IPPROTO_UDP:
update_l4_checksum(skb, ip6h, &dst_hdr, dst_hdr.protocol, false);
@ -602,34 +581,6 @@ icmp_out:
dst_hdr.check = csum_fold_helper(
bpf_csum_diff((__be32 *) &dst_hdr, 0, (__be32 *) &dst_hdr, sizeof(dst_hdr), 0));
*dst_hdr_out = dst_hdr;
ret = TC_ACT_UNSPEC;
out:
return ret;
}
/* ipv6 traffic from the PLAT, to be translated into ipv4 and sent to an application */
static int
clat_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
{
int ret = TC_ACT_OK;
void *data_end = SKB_DATA_END(skb);
void *data = SKB_DATA(skb);
struct ethhdr *eth;
struct iphdr *iph;
struct iphdr dst_hdr;
int ip_offset = (nh->pos - data) & 0x1fff;
ret = clat_translate_v6(skb, nh, data_end, &dst_hdr, 0);
if (ret != TC_ACT_UNSPEC) {
goto out;
}
ret = TC_ACT_SHOT;
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
goto out;
@ -637,8 +588,11 @@ clat_handle_v6(struct __sk_buff *skb, struct hdr_cursor *nh)
data_end = SKB_DATA_END(skb);
eth = data;
iph = data + ip_offset;
if ((void *) (eth + 1) > data_end || (void *) (iph + 1) > data_end)
if (eth + 1 > data_end)
goto out;
iph = (void *) (eth + 1);
if (iph + 1 > data_end)
goto out;
eth->h_proto = bpf_htons(ETH_P_IP);
@ -652,18 +606,22 @@ out:
static int
clat_handler(struct __sk_buff *skb, bool egress)
{
void *data_end = SKB_DATA_END(skb);
void *data = SKB_DATA(skb);
struct hdr_cursor nh = {.pos = data};
struct ethhdr *eth;
int eth_type;
void *data = SKB_DATA(skb);
void *data_end = SKB_DATA_END(skb);
struct ethhdr *eth;
/* Parse Ethernet and IP/IPv6 headers */
eth_type = parse_ethhdr(&nh, data_end, &eth);
if (eth_type == bpf_htons(ETH_P_IP) && egress)
return clat_handle_v4(skb, &nh);
else if (eth_type == bpf_htons(ETH_P_IPV6) && !egress)
return clat_handle_v6(skb, &nh);
eth = data;
if (eth + 1 > data_end)
return TC_ACT_OK;
/* Don't explicitly handle Ethernet types 8021Q and 8021AD
* because we don't expect to receive VLAN-tagged packets
* on the interface. */
if (eth->h_proto == bpf_htons(ETH_P_IP) && egress)
return clat_handle_v4(skb);
else if (eth->h_proto == bpf_htons(ETH_P_IPV6) && !egress)
return clat_handle_v6(skb);
return TC_ACT_OK;
}