mirror of
https://gitlab.freedesktop.org/NetworkManager/NetworkManager.git
synced 2026-04-30 23:48:05 +02:00
Print some statistics about the translation when the connection goes down: clat: stats: egress (v4 to v6): tcp 1275, udp 191, icmp 9, other 0, dropped 2; ingress (v6 to v4): tcp 1669, udp 272, icmp 0, other 0, fragment 136, dropped 0 Those counters can be used to better understand what's going wrong in case of problems; for example, if the packets are being dropped in the ingress path or in the egress one.
1203 lines
38 KiB
C
1203 lines
38 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/* Copyright 2021 Toke Høiland-Jørgensen <toke@toke.dk> */
|
|
/* Copyright 2025 Mary Strodl <mstrodl@csh.rit.edu> */
|
|
/* Copyright 2026 Beniamino Galvani <bgalvani@redhat.com> */
|
|
|
|
/**
|
|
* This is an implementation of a CLAT in eBPF. BPF is a different environment
|
|
* than the rest of NetworkManager, and we don't have access to most of the
|
|
* C standard library, so some things might look a little different from what
|
|
* you're used to.
|
|
*
|
|
* Check out src/core/bpf/meson.build to see how this gets built.
|
|
**/
|
|
|
|
#include <linux/bpf.h>
|
|
#include <linux/icmp.h>
|
|
#include <linux/icmpv6.h>
|
|
#include <linux/ip.h>
|
|
#include <linux/ipv6.h>
|
|
#include <linux/in.h>
|
|
#include <linux/in6.h>
|
|
#include <linux/pkt_cls.h>
|
|
#include <linux/tcp.h>
|
|
#include <linux/udp.h>
|
|
#include <linux/if_ether.h>
|
|
#include <stdbool.h>
|
|
|
|
#include <bpf/bpf_endian.h>
|
|
#include <bpf/bpf_helpers.h>
|
|
|
|
#include "clat.h"
|
|
|
|
char _license[] SEC("license") = "GPL";
|
|
|
|
struct clat_config config;
|
|
struct clat_stats stats;
|
|
|
|
#ifdef DEBUG
|
|
/* Note: when enabling debugging, you also need to add CAP_PERFMON
|
|
* to the CapabilityBoundingSet of the NM systemd unit. The messages
|
|
* will be printed to /sys/kernel/debug/tracing/trace_pipe */
|
|
#define DBG(fmt, ...) \
|
|
({ \
|
|
char ____fmt[] = "clat: " fmt; \
|
|
bpf_trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__); \
|
|
})
|
|
#else
|
|
#define DBG(fmt, ...)
|
|
#endif
|
|
|
|
/* Macros to read the sk_buff data* pointers, preventing the compiler
|
|
* from generating a 32-bit register spill. */
|
|
#define SKB_ACCESS_MEMBER_32(_skb, member) \
|
|
({ \
|
|
void *ptr; \
|
|
\
|
|
asm volatile("%0 = *(u32 *)(%1 + %2)" \
|
|
: "=r"(ptr) \
|
|
: "r"(_skb), "i"(offsetof(struct __sk_buff, member))); \
|
|
\
|
|
ptr; \
|
|
})
|
|
|
|
#define SKB_DATA(_skb) SKB_ACCESS_MEMBER_32(_skb, data)
|
|
#define SKB_DATA_END(_skb) SKB_ACCESS_MEMBER_32(_skb, data_end)
|
|
|
|
struct icmpv6_pseudo {
|
|
struct in6_addr saddr;
|
|
struct in6_addr daddr;
|
|
__u32 len;
|
|
__u8 padding[3];
|
|
__u8 nh;
|
|
} __attribute__((packed));
|
|
|
|
struct ip6_frag {
|
|
__u8 nexthdr;
|
|
__u8 reserved;
|
|
__u16 offset;
|
|
__u32 identification;
|
|
} __attribute__((packed));
|
|
|
|
#define L2_H_LEN(has_eth) (has_eth ? sizeof(struct ethhdr) : 0)
|
|
#define IP_H_LEN (sizeof(struct iphdr))
|
|
#define IP6_H_LEN (sizeof(struct ipv6hdr))
|
|
#define IP6_FRAG_H_LEN (sizeof(struct ip6_frag))
|
|
#define ICMP_H_LEN (sizeof(struct icmphdr))
|
|
#define ICMP6_H_LEN (sizeof(struct icmp6hdr))
|
|
|
|
#define ensure_header(header, skb, data, data_end, offset) \
|
|
_ensure_header((void **) header, (skb), (data), (data_end), sizeof(**(header)), (offset))
|
|
|
|
/*
|
|
* Verifies that the header at offset @offset and with size @size can
|
|
* be accessed, and assigns the pointer to @header. In case the data
|
|
* is not available, the function tries to pull it. Note that all packet
|
|
* pointers must be refreshed after calling this function.
|
|
*/
|
|
static __always_inline bool
|
|
_ensure_header(void **header,
|
|
struct __sk_buff *skb,
|
|
void **data,
|
|
void **data_end,
|
|
unsigned size,
|
|
unsigned offset)
|
|
{
|
|
if (*data + offset + size > *data_end) {
|
|
bpf_skb_pull_data(skb, offset + size);
|
|
*data = SKB_DATA(skb);
|
|
*data_end = SKB_DATA_END(skb);
|
|
}
|
|
|
|
if (*data + offset + size > *data_end)
|
|
return false;
|
|
|
|
*header = *data + offset;
|
|
return true;
|
|
}
|
|
|
|
/* This function must be declared as inline because the BPF calling
|
|
* convention only supports up to 5 function arguments. */
|
|
static __always_inline void
|
|
update_l4_checksum(struct __sk_buff *skb,
|
|
struct ipv6hdr *ip6h,
|
|
struct iphdr *iph,
|
|
bool has_eth,
|
|
bool v4to6,
|
|
bool is_inner,
|
|
bool is_v6_fragment,
|
|
__u32 *csum_diff)
|
|
{
|
|
int flags = BPF_F_PSEUDO_HDR;
|
|
__u16 offset;
|
|
__u32 csum;
|
|
int ip_type;
|
|
|
|
if (v4to6) {
|
|
void *from_ptr = &iph->saddr;
|
|
void *to_ptr = &ip6h->saddr;
|
|
|
|
csum = bpf_csum_diff(from_ptr, 2 * sizeof(__u32), to_ptr, 2 * sizeof(struct in6_addr), 0);
|
|
offset = L2_H_LEN(has_eth) + IP_H_LEN;
|
|
ip_type = ip6h->nexthdr;
|
|
} else {
|
|
void *from_ptr = &ip6h->saddr;
|
|
void *to_ptr = &iph->saddr;
|
|
|
|
csum = bpf_csum_diff(from_ptr, 2 * sizeof(struct in6_addr), to_ptr, 2 * sizeof(__u32), 0);
|
|
offset = L2_H_LEN(has_eth) + IP6_H_LEN;
|
|
ip_type = iph->protocol;
|
|
|
|
if (is_inner) {
|
|
offset = offset + ICMP6_H_LEN + IP6_H_LEN;
|
|
}
|
|
}
|
|
|
|
if (is_v6_fragment) {
|
|
offset += IP6_FRAG_H_LEN;
|
|
}
|
|
|
|
switch (ip_type) {
|
|
case IPPROTO_TCP:
|
|
offset += offsetof(struct tcphdr, check);
|
|
break;
|
|
case IPPROTO_UDP:
|
|
offset += offsetof(struct udphdr, check);
|
|
flags |= BPF_F_MARK_MANGLED_0;
|
|
break;
|
|
default:
|
|
return;
|
|
}
|
|
|
|
bpf_l4_csum_replace(skb, offset, 0, csum, flags);
|
|
|
|
if (csum_diff) {
|
|
*csum_diff = bpf_csum_diff((__be32 *) &csum, sizeof(csum), 0, 0, *csum_diff);
|
|
}
|
|
}
|
|
|
|
static __always_inline void
|
|
update_icmp_checksum(struct __sk_buff *skb,
|
|
const struct ipv6hdr *ip6h,
|
|
void *icmp_before,
|
|
void *icmp_after,
|
|
bool has_eth,
|
|
bool v4to6,
|
|
bool is_inner,
|
|
__u32 seed)
|
|
{
|
|
struct icmpv6_pseudo ph = {.nh = IPPROTO_ICMPV6, .len = ip6h->payload_len};
|
|
__u16 h_before;
|
|
__u16 h_after;
|
|
__u16 offset;
|
|
__u32 csum;
|
|
__u32 u_before;
|
|
__u32 u_after;
|
|
|
|
__builtin_memcpy(&ph.saddr, &ip6h->saddr, sizeof(struct in6_addr));
|
|
__builtin_memcpy(&ph.daddr, &ip6h->daddr, sizeof(struct in6_addr));
|
|
|
|
/* Do checksum update in two passes: first compute the incremental
|
|
* checksum update of the ICMPv6 pseudo header, update the checksum
|
|
* using bpf_l4_csum_replace(), and then do a separate update for the
|
|
* ICMP type and code (which is two consecutive bytes, so cast them to
|
|
* u16). The bpf_csum_diff() helper can be used to compute the
|
|
* incremental update of the full block, whereas the
|
|
* bpf_l4_csum_replace() helper can do the two-byte diff and update by
|
|
* itself.
|
|
*/
|
|
csum = bpf_csum_diff((__be32 *) &ph,
|
|
v4to6 ? 0 : sizeof(ph),
|
|
(__be32 *) &ph,
|
|
v4to6 ? sizeof(ph) : 0,
|
|
seed);
|
|
|
|
if (v4to6) {
|
|
offset = L2_H_LEN(has_eth) + IP_H_LEN + 2;
|
|
} else {
|
|
offset = L2_H_LEN(has_eth) + IP6_H_LEN + 2;
|
|
if (is_inner)
|
|
offset += ICMP6_H_LEN + IP6_H_LEN;
|
|
}
|
|
|
|
/* first two bytes of ICMP header, type and code */
|
|
h_before = *(__u16 *) icmp_before;
|
|
h_after = *(__u16 *) icmp_after;
|
|
|
|
/* last four bytes of ICMP header, the data union */
|
|
u_before = *(__u32 *) (icmp_before + 4);
|
|
u_after = *(__u32 *) (icmp_after + 4);
|
|
|
|
bpf_l4_csum_replace(skb, offset, 0, csum, BPF_F_PSEUDO_HDR);
|
|
bpf_l4_csum_replace(skb, offset, h_before, h_after, 2);
|
|
|
|
if (u_before != u_after)
|
|
bpf_l4_csum_replace(skb, offset, u_before, u_after, 4);
|
|
}
|
|
|
|
static __always_inline int
|
|
rewrite_icmp(struct __sk_buff *skb, const struct ipv6hdr *ip6h, bool has_eth)
|
|
{
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct icmphdr icmp_buf; /* copy of the old ICMPv4 header */
|
|
struct icmp6hdr icmp6_buf; /* buffer for the new ICMPv6 header */
|
|
struct icmphdr *icmp;
|
|
struct icmp6hdr *icmp6;
|
|
__u32 mtu;
|
|
|
|
if (!ensure_header(&icmp, skb, &data, &data_end, L2_H_LEN(has_eth) + IP_H_LEN))
|
|
return -1;
|
|
|
|
icmp_buf = *icmp;
|
|
icmp6 = (void *) icmp;
|
|
icmp6_buf = *icmp6;
|
|
|
|
/* These translations are defined in RFC6145 section 4.2 */
|
|
switch (icmp->type) {
|
|
case ICMP_ECHO:
|
|
icmp6_buf.icmp6_type = ICMPV6_ECHO_REQUEST;
|
|
break;
|
|
case ICMP_ECHOREPLY:
|
|
icmp6_buf.icmp6_type = ICMPV6_ECHO_REPLY;
|
|
break;
|
|
case ICMP_DEST_UNREACH:
|
|
icmp6_buf.icmp6_type = ICMPV6_DEST_UNREACH;
|
|
switch (icmp->code) {
|
|
case ICMP_NET_UNREACH:
|
|
case ICMP_HOST_UNREACH:
|
|
case ICMP_SR_FAILED:
|
|
case ICMP_NET_UNKNOWN:
|
|
case ICMP_HOST_UNKNOWN:
|
|
case ICMP_HOST_ISOLATED:
|
|
case ICMP_NET_UNR_TOS:
|
|
case ICMP_HOST_UNR_TOS:
|
|
icmp6_buf.icmp6_code = ICMPV6_NOROUTE;
|
|
break;
|
|
case ICMP_PROT_UNREACH:
|
|
icmp6_buf.icmp6_type = ICMPV6_PARAMPROB;
|
|
icmp6_buf.icmp6_code = ICMPV6_UNK_NEXTHDR;
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(offsetof(struct ipv6hdr, nexthdr));
|
|
break;
|
|
case ICMP_PORT_UNREACH:
|
|
icmp6_buf.icmp6_code = ICMPV6_PORT_UNREACH;
|
|
break;
|
|
case ICMP_FRAG_NEEDED:
|
|
icmp6_buf.icmp6_type = ICMPV6_PKT_TOOBIG;
|
|
icmp6_buf.icmp6_code = 0;
|
|
mtu = bpf_ntohs(icmp->un.frag.mtu) + 20;
|
|
/* RFC6145 section 6, "second approach" - should not be
|
|
* necessary, but might as well do this
|
|
*/
|
|
if (mtu < 1280)
|
|
mtu = 1280;
|
|
icmp6_buf.icmp6_mtu = bpf_htonl(mtu);
|
|
break;
|
|
case ICMP_NET_ANO:
|
|
case ICMP_HOST_ANO:
|
|
case ICMP_PKT_FILTERED:
|
|
case ICMP_PREC_CUTOFF:
|
|
icmp6_buf.icmp6_code = ICMPV6_ADM_PROHIBITED;
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
break;
|
|
case ICMP_PARAMETERPROB:
|
|
if (icmp->code == 1)
|
|
return -1;
|
|
icmp6_buf.icmp6_type = ICMPV6_PARAMPROB;
|
|
icmp6_buf.icmp6_code = ICMPV6_HDR_FIELD;
|
|
/* The pointer field not defined in the Linux header. This
|
|
* translation is from Figure 3 of RFC6145.
|
|
*/
|
|
switch (icmp->un.reserved[0]) {
|
|
case 0: /* version/IHL */
|
|
icmp6_buf.icmp6_pointer = 0;
|
|
break;
|
|
case 1: /* Type of Service */
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(1);
|
|
break;
|
|
case 2: /* Total length */
|
|
case 3:
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(4);
|
|
break;
|
|
case 8: /* Time to Live */
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(7);
|
|
break;
|
|
case 9: /* Protocol */
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(6);
|
|
break;
|
|
case 12: /* Source address */
|
|
case 13:
|
|
case 14:
|
|
case 15:
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(8);
|
|
break;
|
|
case 16: /* Destination address */
|
|
case 17:
|
|
case 18:
|
|
case 19:
|
|
icmp6_buf.icmp6_pointer = bpf_htonl(24);
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
|
|
*icmp6 = icmp6_buf;
|
|
update_icmp_checksum(skb, ip6h, &icmp_buf, icmp6, has_eth, true, false, 0);
|
|
|
|
/* FIXME: also need to rewrite IP header embedded in ICMP error */
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Convert an IPv4 address to the corresponding "IPv4-Embedded IPv6 Address"
|
|
* according to RFC 6052 2.2.
|
|
*
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |PL| 0-------------32--40--48--56--64--72--80--88--96--104---------|
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |32| prefix |v4(32) | u | suffix |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |40| prefix |v4(24) | u |(8)| suffix |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |48| prefix |v4(16) | u | (16) | suffix |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |56| prefix |(8)| u | v4(24) | suffix |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |64| prefix | u | v4(32) | suffix |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
* |96| prefix | v4(32) |
|
|
* +--+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|
|
*
|
|
*/
|
|
static __always_inline bool
|
|
v4addr_to_v6(__be32 addr4, struct in6_addr *addr6, const struct in6_addr *pref64, int pref64_len)
|
|
{
|
|
union {
|
|
__be32 a32;
|
|
__u8 a8[4];
|
|
} u;
|
|
|
|
u.a32 = addr4;
|
|
|
|
addr6->s6_addr32[0] = 0;
|
|
addr6->s6_addr32[1] = 0;
|
|
addr6->s6_addr32[2] = 0;
|
|
addr6->s6_addr32[3] = 0;
|
|
|
|
switch (pref64_len) {
|
|
case 96:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr32[1] = pref64->s6_addr32[1];
|
|
addr6->s6_addr32[2] = pref64->s6_addr32[2];
|
|
addr6->s6_addr32[3] = addr4;
|
|
break;
|
|
case 64:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr32[1] = pref64->s6_addr32[1];
|
|
addr6->s6_addr[9] = u.a8[0];
|
|
addr6->s6_addr[10] = u.a8[1];
|
|
addr6->s6_addr[11] = u.a8[2];
|
|
addr6->s6_addr[12] = u.a8[3];
|
|
break;
|
|
case 56:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr32[1] = pref64->s6_addr32[1];
|
|
addr6->s6_addr[7] = u.a8[0];
|
|
addr6->s6_addr[9] = u.a8[1];
|
|
addr6->s6_addr[10] = u.a8[2];
|
|
addr6->s6_addr[11] = u.a8[3];
|
|
break;
|
|
case 48:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr16[2] = pref64->s6_addr16[2];
|
|
addr6->s6_addr[6] = u.a8[0];
|
|
addr6->s6_addr[7] = u.a8[1];
|
|
addr6->s6_addr[9] = u.a8[2];
|
|
addr6->s6_addr[10] = u.a8[3];
|
|
break;
|
|
case 40:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr[4] = pref64->s6_addr[4];
|
|
addr6->s6_addr[5] = u.a8[0];
|
|
addr6->s6_addr[6] = u.a8[1];
|
|
addr6->s6_addr[7] = u.a8[2];
|
|
addr6->s6_addr[9] = u.a8[3];
|
|
break;
|
|
case 32:
|
|
addr6->s6_addr32[0] = pref64->s6_addr32[0];
|
|
addr6->s6_addr32[1] = addr4;
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Extract the IPv4 address @addr4 and the NAT64 prefix @pref64 from an IPv6 address,
|
|
* given the known prefix length @pref64_len. See the table above.
|
|
*/
|
|
static __always_inline bool
|
|
v6addr_to_v4(const struct in6_addr *addr6, int pref64_len, __be32 *addr4, struct in6_addr *pref64)
|
|
{
|
|
union {
|
|
__be32 a32;
|
|
__u8 a8[4];
|
|
} u;
|
|
|
|
pref64->s6_addr32[0] = 0;
|
|
pref64->s6_addr32[1] = 0;
|
|
pref64->s6_addr32[2] = 0;
|
|
pref64->s6_addr32[3] = 0;
|
|
|
|
switch (pref64_len) {
|
|
case 96:
|
|
u.a32 = addr6->s6_addr32[3];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
pref64->s6_addr32[1] = addr6->s6_addr32[1];
|
|
pref64->s6_addr32[2] = addr6->s6_addr32[2];
|
|
break;
|
|
case 64:
|
|
u.a8[0] = addr6->s6_addr[9];
|
|
u.a8[1] = addr6->s6_addr[10];
|
|
u.a8[2] = addr6->s6_addr[11];
|
|
u.a8[3] = addr6->s6_addr[12];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
pref64->s6_addr32[1] = addr6->s6_addr32[1];
|
|
break;
|
|
case 56:
|
|
u.a8[0] = addr6->s6_addr[7];
|
|
u.a8[1] = addr6->s6_addr[9];
|
|
u.a8[2] = addr6->s6_addr[10];
|
|
u.a8[3] = addr6->s6_addr[11];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
pref64->s6_addr32[1] = addr6->s6_addr32[1];
|
|
pref64->s6_addr[7] = 0;
|
|
break;
|
|
case 48:
|
|
u.a8[0] = addr6->s6_addr[6];
|
|
u.a8[1] = addr6->s6_addr[7];
|
|
u.a8[2] = addr6->s6_addr[9];
|
|
u.a8[3] = addr6->s6_addr[10];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
pref64->s6_addr32[1] = addr6->s6_addr32[1];
|
|
pref64->s6_addr16[3] = 0;
|
|
break;
|
|
case 40:
|
|
u.a8[0] = addr6->s6_addr[5];
|
|
u.a8[1] = addr6->s6_addr[6];
|
|
u.a8[2] = addr6->s6_addr[7];
|
|
u.a8[3] = addr6->s6_addr[9];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
pref64->s6_addr32[1] = addr6->s6_addr32[1];
|
|
pref64->s6_addr16[3] = 0;
|
|
pref64->s6_addr[5] = 0;
|
|
|
|
break;
|
|
case 32:
|
|
u.a32 = addr6->s6_addr32[1];
|
|
|
|
pref64->s6_addr32[0] = addr6->s6_addr32[0];
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
*addr4 = u.a32;
|
|
return true;
|
|
}
|
|
|
|
/* ipv4 traffic in from application on this device, needs to be translated to v6 and sent to PLAT */
|
|
static __always_inline int
|
|
clat_handle_v4(struct __sk_buff *skb, bool has_eth)
|
|
{
|
|
int ret = TC_ACT_OK;
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct ipv6hdr *ip6h;
|
|
struct ipv6hdr dst_hdr = {
|
|
.version = 6,
|
|
};
|
|
struct iphdr *iph;
|
|
struct ethhdr *eth;
|
|
|
|
if (!ensure_header(&iph, skb, &data, &data_end, L2_H_LEN(has_eth)))
|
|
goto out;
|
|
|
|
if (has_eth) {
|
|
eth = data;
|
|
if (eth->h_proto != bpf_htons(ETH_P_IP))
|
|
goto out;
|
|
}
|
|
|
|
if (iph->version != 4)
|
|
goto out;
|
|
|
|
if (iph->saddr != config.local_v4.s_addr)
|
|
goto out;
|
|
|
|
/* At this point we know the packet needs translation. If we can't
|
|
* rewrite it, it should be dropped.
|
|
*/
|
|
ret = TC_ACT_SHOT;
|
|
|
|
/* we don't bother dealing with IP options or fragmented packets. The
|
|
* latter are identified by the 'frag_off' field having a value (either
|
|
* the MF bit, or the fragment offset, or both). However, this field also
|
|
* contains the "don't fragment" (DF) bit, which we ignore, so mask that
|
|
* out. The DF is the second-most-significant bit (as bit 0 is
|
|
* reserved).
|
|
*/
|
|
|
|
if (iph->ihl != 5 || (iph->frag_off & ~bpf_htons(1 << 14))) {
|
|
DBG("v4: pkt src/dst %pI4/%pI4 has IP options or is fragmented, dropping\n",
|
|
&iph->saddr,
|
|
&iph->daddr);
|
|
goto out;
|
|
}
|
|
|
|
if (!v4addr_to_v6(iph->daddr, &dst_hdr.daddr, &config.pref64, config.pref64_len))
|
|
goto out;
|
|
|
|
dst_hdr.saddr = config.local_v6;
|
|
dst_hdr.nexthdr = iph->protocol;
|
|
dst_hdr.hop_limit = iph->ttl;
|
|
/* weird definition in ipv6hdr */
|
|
dst_hdr.priority = (iph->tos & 0x70) >> 4;
|
|
dst_hdr.flow_lbl[0] = iph->tos << 4;
|
|
dst_hdr.payload_len = bpf_htons(bpf_ntohs(iph->tot_len) - IP_H_LEN);
|
|
|
|
DBG("v4: outgoing pkt to dst %pI4 (%pI6c)\n", &iph->daddr, &dst_hdr.daddr);
|
|
|
|
switch (dst_hdr.nexthdr) {
|
|
case IPPROTO_ICMP:
|
|
if (rewrite_icmp(skb, &dst_hdr, has_eth))
|
|
goto out;
|
|
dst_hdr.nexthdr = IPPROTO_ICMPV6;
|
|
break;
|
|
case IPPROTO_TCP:
|
|
case IPPROTO_UDP:
|
|
update_l4_checksum(skb, &dst_hdr, iph, has_eth, true, false, false, NULL);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
|
|
goto out;
|
|
|
|
data = SKB_DATA(skb);
|
|
data_end = SKB_DATA_END(skb);
|
|
|
|
if (!ensure_header(&ip6h, skb, &data, &data_end, L2_H_LEN(has_eth)))
|
|
goto out;
|
|
|
|
if (has_eth) {
|
|
eth = data;
|
|
eth->h_proto = bpf_htons(ETH_P_IPV6);
|
|
}
|
|
|
|
*ip6h = dst_hdr;
|
|
|
|
switch (dst_hdr.nexthdr) {
|
|
case IPPROTO_ICMPV6:
|
|
__sync_fetch_and_add(&stats.egress_icmp, 1);
|
|
break;
|
|
case IPPROTO_TCP:
|
|
__sync_fetch_and_add(&stats.egress_tcp, 1);
|
|
break;
|
|
case IPPROTO_UDP:
|
|
__sync_fetch_and_add(&stats.egress_udp, 1);
|
|
break;
|
|
default:
|
|
__sync_fetch_and_add(&stats.egress_other, 1);
|
|
break;
|
|
}
|
|
|
|
ret = bpf_redirect(skb->ifindex, 0);
|
|
out:
|
|
if (ret == TC_ACT_SHOT)
|
|
__sync_fetch_and_add(&stats.egress_dropped, 1);
|
|
return ret;
|
|
}
|
|
|
|
static __always_inline __u16
|
|
csum_fold_helper(__u32 csum)
|
|
{
|
|
__u32 sum;
|
|
sum = (csum >> 16) + (csum & 0xffff);
|
|
sum += (sum >> 16);
|
|
return ~sum;
|
|
}
|
|
|
|
static __always_inline bool
|
|
v6addr_equal(const struct in6_addr *a, const struct in6_addr *b)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
if (a->s6_addr32[i] != b->s6_addr32[i])
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static __always_inline void
|
|
translate_ipv6_header(const struct ipv6hdr *ip6, struct iphdr *ip, __be32 saddr, __be32 daddr)
|
|
{
|
|
*ip = (struct iphdr) {
|
|
.version = 4,
|
|
.ihl = 5,
|
|
.tos = ip6->priority << 4 | (ip6->flow_lbl[0] >> 4),
|
|
.frag_off = bpf_htons(1 << 14),
|
|
.ttl = ip6->hop_limit,
|
|
.protocol = ip6->nexthdr == IPPROTO_ICMPV6 ? IPPROTO_ICMP : ip6->nexthdr,
|
|
.saddr = saddr,
|
|
.daddr = daddr,
|
|
.tot_len = bpf_htons(bpf_ntohs(ip6->payload_len) + IP_H_LEN),
|
|
};
|
|
|
|
ip->check = csum_fold_helper(bpf_csum_diff((__be32 *) ip, 0, (__be32 *) ip, IP_H_LEN, 0));
|
|
}
|
|
|
|
static __always_inline int
|
|
translate_icmpv6_header(const struct icmp6hdr *icmp6, struct icmphdr *icmp)
|
|
{
|
|
/* These translations are defined in RFC6145 section 5.2 */
|
|
switch (icmp6->icmp6_type) {
|
|
case ICMPV6_ECHO_REQUEST:
|
|
icmp->type = ICMP_ECHO;
|
|
break;
|
|
case ICMPV6_ECHO_REPLY:
|
|
icmp->type = ICMP_ECHOREPLY;
|
|
break;
|
|
case ICMPV6_DEST_UNREACH:
|
|
icmp->type = ICMP_DEST_UNREACH;
|
|
switch (icmp6->icmp6_code) {
|
|
case ICMPV6_NOROUTE:
|
|
case ICMPV6_NOT_NEIGHBOUR:
|
|
case ICMPV6_ADDR_UNREACH:
|
|
icmp->code = ICMP_HOST_UNREACH;
|
|
break;
|
|
case ICMPV6_ADM_PROHIBITED:
|
|
icmp->code = ICMP_HOST_ANO;
|
|
break;
|
|
case ICMPV6_PORT_UNREACH:
|
|
icmp->code = ICMP_PORT_UNREACH;
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
break;
|
|
case ICMPV6_PKT_TOOBIG:
|
|
{
|
|
__u32 mtu;
|
|
|
|
icmp->type = ICMP_DEST_UNREACH;
|
|
icmp->code = ICMP_FRAG_NEEDED;
|
|
|
|
mtu = bpf_ntohl(icmp6->icmp6_mtu) - 20;
|
|
if (mtu > 0xffff)
|
|
return -1;
|
|
icmp->un.frag.mtu = bpf_htons(mtu);
|
|
break;
|
|
}
|
|
case ICMPV6_TIME_EXCEED:
|
|
icmp->type = ICMP_TIME_EXCEEDED;
|
|
break;
|
|
case ICMPV6_PARAMPROB:
|
|
switch (icmp6->icmp6_code) {
|
|
case 0:
|
|
{
|
|
__u32 ptr;
|
|
|
|
icmp->type = ICMP_PARAMETERPROB;
|
|
icmp->code = 0;
|
|
|
|
ptr = bpf_ntohl(icmp6->icmp6_pointer);
|
|
/* Figure 6 in RFC6145 - using if statements b/c of
|
|
* range at the bottom
|
|
*/
|
|
if (ptr == 0 || ptr == 1)
|
|
icmp->un.reserved[0] = ptr;
|
|
else if (ptr == 4 || ptr == 5)
|
|
icmp->un.reserved[0] = 2;
|
|
else if (ptr == 6)
|
|
icmp->un.reserved[0] = 9;
|
|
else if (ptr == 7)
|
|
icmp->un.reserved[0] = 8;
|
|
else if (ptr >= 8 && ptr <= 23)
|
|
icmp->un.reserved[0] = 12;
|
|
else if (ptr >= 24 && ptr <= 39)
|
|
icmp->un.reserved[0] = 16;
|
|
else
|
|
return -1;
|
|
break;
|
|
}
|
|
case 1:
|
|
icmp->type = ICMP_DEST_UNREACH;
|
|
icmp->code = ICMP_PROT_UNREACH;
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static __always_inline int
|
|
rewrite_icmpv6_inner(struct __sk_buff *skb, __u32 *csum_diff, bool has_eth)
|
|
{
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct icmphdr *icmp;
|
|
struct icmp6hdr *icmp6;
|
|
struct icmphdr icmp_buf; /* buffer for the new ICMPv4 header */
|
|
struct icmp6hdr icmp6_buf; /* copy of the old ICMPv6 header */
|
|
|
|
/*
|
|
* icmp6: v
|
|
* -------------------------------------------------------------------------
|
|
* | Ethernet | IPv6 | ICMPv6 | IPv6 | ICMPv6 | ...
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
|
|
if (!ensure_header(&icmp6,
|
|
skb,
|
|
&data,
|
|
&data_end,
|
|
L2_H_LEN(has_eth) + 2 * IP6_H_LEN + ICMP6_H_LEN))
|
|
return -1;
|
|
|
|
icmp6_buf = *icmp6;
|
|
icmp = (void *) icmp6;
|
|
icmp_buf = *icmp;
|
|
|
|
if (translate_icmpv6_header(icmp6, &icmp_buf))
|
|
return -1;
|
|
|
|
*icmp = icmp_buf;
|
|
update_icmp_checksum(skb,
|
|
(struct ipv6hdr *) (data + L2_H_LEN(has_eth)),
|
|
&icmp6_buf,
|
|
icmp,
|
|
has_eth,
|
|
false,
|
|
true,
|
|
0);
|
|
|
|
if (csum_diff) {
|
|
data_end = SKB_DATA_END(skb);
|
|
data = SKB_DATA(skb);
|
|
|
|
if (!ensure_header(&icmp,
|
|
skb,
|
|
&data,
|
|
&data_end,
|
|
L2_H_LEN(has_eth) + 2 * IP6_H_LEN + ICMP6_H_LEN))
|
|
return -1;
|
|
|
|
/* Compute the checksum difference between the old ICMPv6 header and the new ICMPv4 one */
|
|
*csum_diff =
|
|
bpf_csum_diff((__be32 *) &icmp6_buf, ICMP6_H_LEN, (__be32 *) &icmp6_buf, 0, *csum_diff);
|
|
*csum_diff = bpf_csum_diff((__be32 *) icmp, 0, (__be32 *) icmp, ICMP_H_LEN, *csum_diff);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static __always_inline int
|
|
rewrite_ipv6_inner(struct __sk_buff *skb, struct iphdr *dst_hdr, __u32 *csum_diff, bool has_eth)
|
|
{
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct ipv6hdr *ip6h;
|
|
__be32 addr4;
|
|
struct in6_addr subnet_v6;
|
|
|
|
/*
|
|
* ip6h: v
|
|
* ----------------------------------------------------------------
|
|
* | Ethernet | IPv6 | ICMPv6 | IPv6 | ...
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
|
|
if (!ensure_header(&ip6h, skb, &data, &data_end, L2_H_LEN(has_eth) + IP6_H_LEN + ICMP6_H_LEN))
|
|
return -1;
|
|
|
|
if (!v6addr_equal(&ip6h->saddr, &config.local_v6))
|
|
return -1;
|
|
if (!v6addr_to_v4(&ip6h->daddr, config.pref64_len, &addr4, &subnet_v6))
|
|
return -1;
|
|
if (!v6addr_equal(&subnet_v6, &config.pref64))
|
|
return -1;
|
|
|
|
translate_ipv6_header(ip6h, dst_hdr, config.local_v4.s_addr, addr4);
|
|
|
|
if (csum_diff) {
|
|
/* Checksum difference between the old IPv6 header and the new IPv4 one */
|
|
*csum_diff = bpf_csum_diff((__be32 *) ip6h, IP6_H_LEN, (__be32 *) ip6h, 0, *csum_diff);
|
|
|
|
*csum_diff = bpf_csum_diff((__be32 *) dst_hdr, 0, (__be32 *) dst_hdr, IP_H_LEN, *csum_diff);
|
|
}
|
|
|
|
switch (dst_hdr->protocol) {
|
|
case IPPROTO_ICMP:
|
|
if (rewrite_icmpv6_inner(skb, csum_diff, has_eth))
|
|
return -1;
|
|
break;
|
|
case IPPROTO_TCP:
|
|
case IPPROTO_UDP:
|
|
update_l4_checksum(skb, ip6h, dst_hdr, has_eth, false, true, false, csum_diff);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static __always_inline int
|
|
rewrite_icmpv6(struct __sk_buff *skb, int *out_length_diff, bool has_eth)
|
|
{
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct iphdr *ip;
|
|
struct icmp6hdr *icmp6;
|
|
struct icmphdr *icmp;
|
|
struct icmphdr icmp_buf; /* buffer for the new ICMPv4 header */
|
|
struct icmp6hdr icmp6_buf; /* copy of the old ICMPv6 header */
|
|
struct iphdr ip_in_buf; /* buffer for the new inner IPv4 header */
|
|
__u32 csum_diff = 0;
|
|
|
|
/*
|
|
* icmp6: v
|
|
* ---------------------------------------------
|
|
* | Ethernet | IPv6 | ICMPv6 | ...
|
|
* ---------------------------------------------
|
|
*/
|
|
|
|
if (!ensure_header(&icmp6, skb, &data, &data_end, L2_H_LEN(has_eth) + IP6_H_LEN))
|
|
return -1;
|
|
|
|
icmp6_buf = *icmp6;
|
|
icmp = (void *) icmp6;
|
|
icmp_buf = *icmp;
|
|
|
|
if (translate_icmpv6_header(icmp6, &icmp_buf))
|
|
return -1;
|
|
|
|
if (icmp6->icmp6_type >= 128) {
|
|
/* ICMPv6 non-error message: only translate the header */
|
|
*icmp = icmp_buf;
|
|
update_icmp_checksum(skb,
|
|
(struct ipv6hdr *) (data + L2_H_LEN(has_eth)),
|
|
&icmp6_buf,
|
|
icmp,
|
|
has_eth,
|
|
false,
|
|
false,
|
|
0);
|
|
return 0;
|
|
}
|
|
|
|
/* ICMPv6 error messages: we need to rewrite the headers in the inner packet.
|
|
* Track in csum_diff the incremental changes to the checksum for the ICMPv4
|
|
* header. */
|
|
|
|
if (rewrite_ipv6_inner(skb, &ip_in_buf, &csum_diff, has_eth))
|
|
return -1;
|
|
|
|
/* The inner IP header shrinks from 40 (IPv6) to 20 (IPv4) bytes; we need to move
|
|
* the L4 header and payload. BPF programs don't have an easy way to move a variable
|
|
* amount of packet data; use bpf_skb_adjust_room() which can add or remove data
|
|
* inside a packet. It doesn't support arbitrary offsets, but we can use BPF_ADJ_ROOM_NET
|
|
* to remove the bytes just after the L3 header, and rewrite the ICMP and the inner
|
|
* IP headers.
|
|
*/
|
|
if (bpf_skb_adjust_room(skb, (int) IP_H_LEN - (int) IP6_H_LEN, BPF_ADJ_ROOM_NET, 0))
|
|
return -1;
|
|
|
|
*out_length_diff = (int) IP_H_LEN - (int) IP6_H_LEN;
|
|
|
|
data_end = SKB_DATA_END(skb);
|
|
data = SKB_DATA(skb);
|
|
|
|
if (!ensure_header(&ip, skb, &data, &data_end, L2_H_LEN(has_eth) + IP6_H_LEN + ICMP_H_LEN))
|
|
return -1;
|
|
|
|
icmp = data + L2_H_LEN(has_eth) + IP6_H_LEN;
|
|
|
|
/* Rewrite the ICMPv6 header with the translated ICMPv4 one */
|
|
*icmp = icmp_buf;
|
|
/* Rewrite the inner IPv6 header with the translated IPv4 one */
|
|
*ip = ip_in_buf;
|
|
|
|
/* Update the ICMPv4 checksum according to all the changes in headers */
|
|
update_icmp_checksum(skb,
|
|
(struct ipv6hdr *) (data + L2_H_LEN(has_eth)),
|
|
&icmp6_buf,
|
|
icmp,
|
|
has_eth,
|
|
false,
|
|
false,
|
|
csum_diff);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* ipv6 traffic from the PLAT, to be translated into ipv4 and sent to an application */
|
|
static __always_inline int
|
|
clat_handle_v6(struct __sk_buff *skb, bool has_eth)
|
|
{
|
|
int ret = TC_ACT_OK;
|
|
void *data_end = SKB_DATA_END(skb);
|
|
void *data = SKB_DATA(skb);
|
|
struct ethhdr *eth;
|
|
struct ipv6hdr *ip6h;
|
|
struct iphdr *iph;
|
|
struct iphdr dst_hdr;
|
|
struct in6_addr subnet_v6;
|
|
__be32 addr4;
|
|
int length_diff = 0;
|
|
bool fragmented = false;
|
|
|
|
if (!ensure_header(&ip6h, skb, &data, &data_end, L2_H_LEN(has_eth)))
|
|
goto out;
|
|
|
|
if (has_eth) {
|
|
eth = data;
|
|
if (eth->h_proto != bpf_htons(ETH_P_IPV6))
|
|
goto out;
|
|
}
|
|
|
|
if (ip6h->version != 6)
|
|
goto out;
|
|
|
|
if (!v6addr_equal(&ip6h->daddr, &config.local_v6))
|
|
goto out;
|
|
if (!v6addr_to_v4(&ip6h->saddr, config.pref64_len, &addr4, &subnet_v6))
|
|
goto out;
|
|
if (!v6addr_equal(&subnet_v6, &config.pref64)) {
|
|
struct icmp6hdr *icmp6;
|
|
|
|
/* Follow draft-ietf-v6ops-icmpext-xlat-v6only-source-01:
|
|
*
|
|
* "Whenever a translator translates an ICMPv6 Destination Unreachable,
|
|
* ICMPv6 Time Exceeded or ICMPv6 Packet Too Big ([RFC4443]) to the
|
|
* corresponding ICMPv4 ([RFC0792]) message, and the IPv6 source
|
|
* address in the outermost IPv6 header is untranslatable, the
|
|
* translator SHOULD use the dummy IPv4 address (192.0.0.8) as the IPv4
|
|
* source address for the translated packet."
|
|
*/
|
|
if (ip6h->nexthdr != IPPROTO_ICMPV6)
|
|
goto out;
|
|
|
|
if (!ensure_header(&icmp6, skb, &data, &data_end, L2_H_LEN(has_eth) + IP6_H_LEN))
|
|
goto out;
|
|
|
|
ip6h = data + L2_H_LEN(has_eth);
|
|
|
|
if (icmp6->icmp6_type != ICMPV6_DEST_UNREACH && icmp6->icmp6_type != ICMPV6_TIME_EXCEED
|
|
&& icmp6->icmp6_type != ICMPV6_PKT_TOOBIG)
|
|
goto out;
|
|
|
|
DBG("v6: icmpv6 type %u from native address %pI6c, translating src to dummy ipv4\n",
|
|
icmp6->icmp6_type,
|
|
&ip6h->saddr);
|
|
|
|
addr4 = __cpu_to_be32(INADDR_DUMMY);
|
|
}
|
|
|
|
/* At this point we know the packet needs translation. If we can't
|
|
* rewrite it, it should be dropped.
|
|
*/
|
|
ret = TC_ACT_SHOT;
|
|
|
|
if (ip6h->nexthdr == IPPROTO_TCP || ip6h->nexthdr == IPPROTO_UDP
|
|
|| ip6h->nexthdr == IPPROTO_ICMPV6) {
|
|
translate_ipv6_header(ip6h, &dst_hdr, addr4, config.local_v4.s_addr);
|
|
DBG("v6: incoming pkt from src %pI6c (%pI4)\n", &ip6h->saddr, &addr4);
|
|
} else if (ip6h->nexthdr == IPPROTO_FRAGMENT) {
|
|
struct ip6_frag *frag;
|
|
int tot_len;
|
|
__u16 offset;
|
|
|
|
if (!ensure_header(&frag, skb, &data, &data_end, L2_H_LEN(has_eth) + IP6_H_LEN))
|
|
goto out;
|
|
|
|
ip6h = data + L2_H_LEN(has_eth);
|
|
|
|
/* Translate into an IPv4 fragmented packet, RFC 6145 5.1.1 */
|
|
|
|
tot_len = bpf_ntohs(ip6h->payload_len) + IP_H_LEN - IP6_FRAG_H_LEN;
|
|
|
|
offset = bpf_ntohs(frag->offset);
|
|
offset = ((offset & 1) << 13) | /* More Fragments flag */
|
|
(offset >> 3); /* Offset in 8-octet units */
|
|
|
|
dst_hdr = (struct iphdr) {
|
|
.version = 4,
|
|
.ihl = 5,
|
|
.id = bpf_htons(bpf_ntohl(frag->identification) & 0xffff),
|
|
.tos = ip6h->priority << 4 | (ip6h->flow_lbl[0] >> 4),
|
|
.frag_off = bpf_htons(offset),
|
|
.ttl = ip6h->hop_limit,
|
|
.protocol = frag->nexthdr == IPPROTO_ICMPV6 ? IPPROTO_ICMP : frag->nexthdr,
|
|
.saddr = addr4,
|
|
.daddr = config.local_v4.s_addr,
|
|
.tot_len = bpf_htons(tot_len),
|
|
};
|
|
|
|
dst_hdr.check = csum_fold_helper(
|
|
bpf_csum_diff((__be32 *) &dst_hdr, 0, (__be32 *) &dst_hdr, IP_H_LEN, 0));
|
|
|
|
fragmented = true;
|
|
|
|
DBG("v6: incoming fragmented pkt from src %pI6c (%pI4), id 0x%x\n",
|
|
&ip6h->saddr,
|
|
&addr4,
|
|
bpf_ntohs(dst_hdr.id));
|
|
} else {
|
|
DBG("v6: pkt src/dst %pI6c/%pI6c has nexthdr %u, dropping\n", &ip6h->saddr, &ip6h->daddr);
|
|
goto out;
|
|
}
|
|
|
|
switch (dst_hdr.protocol) {
|
|
case IPPROTO_ICMP:
|
|
/* We can't update the checksum of ICMP fragmented packets: ICMPv4 doesn't use
|
|
* a pseudo header, while the ICMPv6 pseudo-header includes the total payload
|
|
* length, which is not known when parsing the first fragment. This makes it
|
|
* impossible for a stateless translator to compute the checksum delta. TCP and
|
|
* UDP don't have this problem because both the v4 and v6 pseudo-headers include
|
|
* the total length. */
|
|
if (fragmented)
|
|
goto out;
|
|
|
|
if (rewrite_icmpv6(skb, &length_diff, has_eth))
|
|
goto out;
|
|
break;
|
|
case IPPROTO_TCP:
|
|
case IPPROTO_UDP:
|
|
/* Update the L4 headers only for non-fragmented packets or for the first
|
|
* fragment, which contains the L4 header. */
|
|
if (!fragmented || (bpf_ntohs(dst_hdr.frag_off) & 0x1FFF) == 0) {
|
|
update_l4_checksum(skb, ip6h, &dst_hdr, has_eth, false, false, fragmented, NULL);
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* rewrite_icmpv6() can change the payload length when it rewrites the content of
|
|
* an ICMPv6 error packet. Update the length and the checksum. */
|
|
if (length_diff != 0) {
|
|
data = SKB_DATA(skb);
|
|
data_end = SKB_DATA_END(skb);
|
|
|
|
if (!ensure_header(&ip6h, skb, &data, &data_end, L2_H_LEN(has_eth)))
|
|
goto out;
|
|
|
|
dst_hdr.tot_len = bpf_htons(bpf_ntohs(ip6h->payload_len) + length_diff + IP_H_LEN);
|
|
|
|
dst_hdr.check = 0;
|
|
dst_hdr.check = csum_fold_helper(
|
|
bpf_csum_diff((__be32 *) &dst_hdr, 0, (__be32 *) &dst_hdr, IP_H_LEN, 0));
|
|
}
|
|
|
|
if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IP), 0))
|
|
goto out;
|
|
|
|
if (fragmented) {
|
|
/* Remove the IPv6 fragment header */
|
|
if (bpf_skb_adjust_room(skb, -(__s32) IP6_FRAG_H_LEN, BPF_ADJ_ROOM_NET, 0))
|
|
goto out;
|
|
}
|
|
|
|
data = SKB_DATA(skb);
|
|
data_end = SKB_DATA_END(skb);
|
|
|
|
if (!ensure_header(&iph, skb, &data, &data_end, L2_H_LEN(has_eth)))
|
|
goto out;
|
|
|
|
if (has_eth) {
|
|
eth = data;
|
|
eth->h_proto = bpf_htons(ETH_P_IP);
|
|
}
|
|
|
|
*iph = dst_hdr;
|
|
|
|
if (fragmented)
|
|
__sync_fetch_and_add(&stats.ingress_fragment, 1);
|
|
switch (dst_hdr.protocol) {
|
|
case IPPROTO_ICMP:
|
|
__sync_fetch_and_add(&stats.ingress_icmp, 1);
|
|
break;
|
|
case IPPROTO_TCP:
|
|
__sync_fetch_and_add(&stats.ingress_tcp, 1);
|
|
break;
|
|
case IPPROTO_UDP:
|
|
__sync_fetch_and_add(&stats.ingress_udp, 1);
|
|
break;
|
|
default:
|
|
__sync_fetch_and_add(&stats.ingress_other, 1);
|
|
break;
|
|
}
|
|
|
|
ret = bpf_redirect(skb->ifindex, BPF_F_INGRESS);
|
|
out:
|
|
if (ret == TC_ACT_SHOT)
|
|
__sync_fetch_and_add(&stats.ingress_dropped, 1);
|
|
return ret;
|
|
}
|
|
|
|
/* Use separate entry points for interfaces with and without an
|
|
* Ethernet header. Since all functions are now marked as inline,
|
|
* the compiler is able to replace the value of the parametric
|
|
* L2_H_LEN() macros with an immediate constant. This avoids
|
|
* pointer arithmetic which is forbidden because we don't run with
|
|
* CAP_PERFMON. The loader attaches the right program pair based
|
|
* on the interface type. */
|
|
SEC("tcx/egress")
|
|
int
|
|
nm_clat_egress_eth(struct __sk_buff *skb)
|
|
{
|
|
return clat_handle_v4(skb, true);
|
|
}
|
|
|
|
SEC("tcx/egress")
|
|
int
|
|
nm_clat_egress_rawip(struct __sk_buff *skb)
|
|
{
|
|
return clat_handle_v4(skb, false);
|
|
}
|
|
|
|
SEC("tcx/ingress")
|
|
int
|
|
nm_clat_ingress_eth(struct __sk_buff *skb)
|
|
{
|
|
return clat_handle_v6(skb, true);
|
|
}
|
|
|
|
SEC("tcx/ingress")
|
|
int
|
|
nm_clat_ingress_rawip(struct __sk_buff *skb)
|
|
{
|
|
return clat_handle_v6(skb, false);
|
|
}
|