ndisc: generate IPv6 routes with nexthops to avoid merging

Previously the NDisc code would generate multiple IPv6 default routes
with the same metric. The kernel then would merge them into a single
ECMP route, which is forbidden by RFCs as it breaks NUD and other use
cases.

When the kernel is managing IPv6 on the interface it is able to add
multiple independent default routes because it sets the internal flag
RTF_ADDRCONF on them, which prevents ECMP. This flag can't be set from
userspace, and proposals to expose it were rejected:

  https://lore.kernel.org/netdev/20241105031841.10730-2-Matt.Muggeridge@hpe.com/

The only way to avoid the merge is to use the nexthop API.

Now when there are multiple default routes, the NDisc uses a nexthop
for them.

The nexthop id is generated by hashing the route destination (::/0 for
default routes), the interface name and the gateway, so that the id
remains stable across restarts. In case of collisions with other
nexthops created by NetworkManager, or with nexthops configured
externally, a different id is chosen. To mitigate the chance of
collisions with external nexthops, we only choose ids with the high
bit set.
This commit is contained in:
Beniamino Galvani 2025-12-10 14:19:57 +01:00
parent 13f83149e1
commit 359762b2c9
2 changed files with 153 additions and 22 deletions

View file

@ -18,6 +18,7 @@
#include "nm-l3-config-data.h"
#include "nm-l3cfg.h"
#include "nm-ndisc-private.h"
#include "nm-netns.h"
#include "nm-setting-ip6-config.h"
#include "nm-utils.h"
@ -105,6 +106,77 @@ NM_UTILS_LOOKUP_STR_DEFINE(nm_ndisc_dhcp_level_to_string,
/*****************************************************************************/
#define NEXTHOP_ID_RETRIES 400
static guint32
nexthop_id_alloc(NMNDisc *ndisc,
const struct in6_addr *dest,
guint plen,
const struct in6_addr *gateway)
{
NMNDiscPrivate *priv = NM_NDISC_GET_PRIVATE(ndisc);
NMNetns *netns = nm_l3cfg_get_netns(priv->config.l3cfg);
const char *ifname = nm_l3cfg_get_ifname(priv->config.l3cfg, FALSE);
NMPlatform *platform = nm_l3cfg_get_platform(priv->config.l3cfg);
int ifindex = nm_l3cfg_get_ifindex(priv->config.l3cfg);
nm_auto_nmpobj NMPObject *obj = NULL;
CSipHash state;
guint64 id64;
guint32 id;
guint i;
/* Determine a stable nexthop ID by hashing the interface name, the destination
* and the gateway. We set the high bit to decrease the chance of collisions with
* external (manually added) nexthops. */
c_siphash_init(&state, NM_HASH_SEED_16_U64(725697701u));
c_siphash_append(&state, (const uint8_t *) ifname, strlen(ifname) + 1);
c_siphash_append(&state, (const uint8_t *) dest, sizeof(struct in6_addr));
c_siphash_append(&state, (const uint8_t *) &plen, sizeof(guint));
c_siphash_append(&state, (const uint8_t *) gateway, sizeof(struct in6_addr));
id64 = c_siphash_finalize(&state);
id = ((guint32) (id64 >> 32u)) | (1u << 31);
for (i = 0; i < NEXTHOP_ID_RETRIES; i++, id++) {
if (i < NEXTHOP_ID_RETRIES * 3 / 4) {
id |= (1u << 31);
} else {
/* After many collisions, start probing random ids */
id = (guint32) nm_random_u64_range(1u << 31, G_MAXUINT32);
}
if (nm_netns_nexthop_id_is_reserved(netns, id))
continue;
if (nm_platform_ip_nexthop_get(platform, id, &obj)) {
/* The id already exists in platform. We can reuse it only
* if it's an IPv6 RA nexthop on the same interface. */
if (NMP_OBJECT_GET_TYPE(obj) != NMP_OBJECT_TYPE_IP6_NEXTHOP
|| obj->ip6_nexthop.nh_source != NM_IP_CONFIG_SOURCE_RTPROT_RA
|| obj->ip6_nexthop.ifindex != ifindex)
continue;
}
nm_netns_nexthop_id_reserve(netns, id, ndisc);
return id;
}
return 0;
}
static void
_nexthop_id_release_one(NMNDisc *ndisc, guint32 nexthop_id)
{
NMNDiscPrivate *priv;
if (nexthop_id == 0)
return;
priv = NM_NDISC_GET_PRIVATE(ndisc);
nm_netns_nexthop_id_release(nm_l3cfg_get_netns(priv->config.l3cfg), nexthop_id);
}
/*****************************************************************************/
NML3ConfigData *
nm_ndisc_data_to_l3cd(NMDedupMultiIndex *multi_idx,
int ifindex,
@ -172,9 +244,6 @@ nm_ndisc_data_to_l3cd(NMDedupMultiIndex *multi_idx,
}
if (rdata->gateways_n > 0) {
guint metric_offset = 0;
NMIcmpv6RouterPref prev_pref = NM_ICMPV6_ROUTER_PREF_INVALID;
NMPlatformIP6Route r = {
.rt_source = NM_IP_CONFIG_SOURCE_NDISC,
.ifindex = ifindex,
@ -185,24 +254,38 @@ nm_ndisc_data_to_l3cd(NMDedupMultiIndex *multi_idx,
};
for (i = 0; i < rdata->gateways_n; i++) {
/* If we add multiple default routes with the same metric and
* different preferences, kernel merges them into a single ECMP
* route, with overall preference equal to the preference of the
* first route added. Therefore, the preference of individual routes
* is not respected.
* To avoid that, add routes with different metrics if they have
* different preferences, so that they are not merged together. Here
* the gateways are already ordered by increasing preference. */
if (i != 0 && rdata->gateways[i].preference != prev_pref) {
metric_offset++;
}
NMPlatformIP6NextHop nh;
prev_pref = rdata->gateways[i].preference;
r.metric = metric_offset;
r.gateway = rdata->gateways[i].address;
r.rt_pref = rdata->gateways[i].preference;
nm_assert((NMIcmpv6RouterPref) r.rt_pref == rdata->gateways[i].preference);
/* If we add multiple routes with the same destination (in this case, the
* default route) and the same metric, the kernel merges them into a single
* ECMP route, which is forbidden by RFCs as it breaks NUD and other use cases.
* Use nexthop objects to avoid this merging behavior.
*
* We could use nexthops only when there are multiple default routes on this
* interface. But that is not enough, because there can be multiple profiles
* with the same ipv6.route-metric value, and their default routes would still
* be merged. We need to always use nexthops.
*/
if (rdata->gateways[i].nexthop_id == 0) {
/* The nexthop id could not be reserved and we already emitted a warning */
continue;
}
nh = (NMPlatformIP6NextHop) {
.ifindex = ifindex,
.nh_source = NM_IP_CONFIG_SOURCE_NDISC,
.gateway = rdata->gateways[i].address,
.id = rdata->gateways[i].nexthop_id,
};
r.nhid = nh.id;
r.gateway = nh.gateway;
nm_l3_config_data_add_route_6(l3cd, &r);
nm_l3_config_data_add_nexthop(l3cd, AF_INET6, NULL, (const NMPlatformIPNextHop *) &nh);
}
}
@ -465,23 +548,38 @@ nm_ndisc_add_gateway(NMNDisc *ndisc, const NMNDiscGateway *new_item, gint64 now_
{
NMNDiscDataInternal *rdata = &NM_NDISC_GET_PRIVATE(ndisc)->rdata;
guint i;
guint insert_idx = G_MAXUINT;
guint insert_idx = G_MAXUINT;
guint32 old_nexthop_id = 0;
NMNDiscGateway gw;
for (i = 0; i < rdata->gateways->len;) {
NMNDiscGateway *item = &nm_g_array_index(rdata->gateways, NMNDiscGateway, i);
if (IN6_ARE_ADDR_EQUAL(&item->address, &new_item->address)) {
if (new_item->expiry_msec <= now_msec) {
_nexthop_id_release_one(ndisc, item->nexthop_id);
g_array_remove_index(rdata->gateways, i);
_ASSERT_data_gateways(rdata);
return TRUE;
}
if (item->preference != new_item->preference) {
/* Preference changed: save the nexthop ID so that we can
* reuse it when re-inserting at the correct position. */
old_nexthop_id = item->nexthop_id;
g_array_remove_index(rdata->gateways, i);
continue;
}
if (item->nexthop_id == 0) {
/* We failed to allocate the nexthop id previously; retry. */
item->nexthop_id = nexthop_id_alloc(ndisc, &in6addr_any, 0, &new_item->address);
if (item->nexthop_id > 0) {
item->expiry_msec = new_item->expiry_msec;
return TRUE;
}
}
if (item->expiry_msec == new_item->expiry_msec)
return FALSE;
@ -505,9 +603,26 @@ nm_ndisc_add_gateway(NMNDisc *ndisc, const NMNDiscGateway *new_item, gint64 now_
if (new_item->expiry_msec <= now_msec)
return FALSE;
/* Make a copy of the gateway and assign a nexthop id, reusing the existing
* one if possible */
gw = *new_item;
if (old_nexthop_id != 0) {
gw.nexthop_id = old_nexthop_id;
} else {
gw.nexthop_id = nexthop_id_alloc(ndisc, &in6addr_any, 0, &new_item->address);
if (gw.nexthop_id == 0) {
char buf[INET6_ADDRSTRLEN];
_LOGW("failed to find a free nexthop id for gateway %s",
nm_inet6_ntop(&new_item->address, buf));
return FALSE;
}
}
g_array_insert_val(rdata->gateways,
insert_idx == G_MAXUINT ? rdata->gateways->len : insert_idx,
*new_item);
gw);
_ASSERT_data_gateways(rdata);
return TRUE;
}
@ -1330,6 +1445,9 @@ nm_ndisc_stop(NMNDisc *ndisc)
NM_NDISC_GET_CLASS(ndisc)->stop(ndisc);
/* Release all nexthop IDs reserved by this ndisc instance. */
nm_netns_nexthop_id_release_all(nm_l3cfg_get_netns(priv->config.l3cfg), ndisc);
rdata = &priv->rdata;
g_array_set_size(rdata->gateways, 0);
@ -1442,9 +1560,10 @@ _config_changed_log(NMNDisc *ndisc, NMNDiscConfigMap changed)
for (i = 0; i < rdata->gateways->len; i++) {
const NMNDiscGateway *gateway = &nm_g_array_index(rdata->gateways, NMNDiscGateway, i);
_LOGD(" gateway %s pref %s exp %s",
_LOGD(" gateway %s pref %s nhid %u exp %s",
nm_inet6_ntop(&gateway->address, addrstr),
nm_icmpv6_router_pref_to_string(gateway->preference, str_pref, sizeof(str_pref)),
gateway->nexthop_id,
get_exp(str_exp, now_msec, gateway));
}
for (i = 0; i < rdata->addresses->len; i++) {
@ -1521,8 +1640,11 @@ clean_gateways(NMNDisc *ndisc, gint64 now_msec, NMNDiscConfigMap *changed, gint6
arr = &nm_g_array_first(rdata->gateways, NMNDiscGateway);
for (i = 0, j = 0; i < rdata->gateways->len; i++) {
if (!expiry_next(now_msec, arr[i].expiry_msec, next_msec))
if (!expiry_next(now_msec, arr[i].expiry_msec, next_msec)) {
/* Gateway expired. Release its nexthop ID. */
_nexthop_id_release_one(ndisc, arr[i].nexthop_id);
continue;
}
if (i != j)
arr[j] = arr[i];
j++;
@ -1533,8 +1655,14 @@ clean_gateways(NMNDisc *ndisc, gint64 now_msec, NMNDiscConfigMap *changed, gint6
g_array_set_size(rdata->gateways, j);
}
if (_array_set_size_max(rdata->gateways, _SIZE_MAX_GATEWAYS))
if (rdata->gateways->len > _SIZE_MAX_GATEWAYS) {
for (i = _SIZE_MAX_GATEWAYS; i < rdata->gateways->len; i++)
_nexthop_id_release_one(
ndisc,
nm_g_array_index(rdata->gateways, NMNDiscGateway, i).nexthop_id);
g_array_set_size(rdata->gateways, _SIZE_MAX_GATEWAYS);
*changed |= NM_NDISC_CONFIG_GATEWAYS;
}
_ASSERT_data_gateways(rdata);
}
@ -2065,6 +2193,8 @@ finalize(GObject *object)
NMNDiscPrivate *priv = NM_NDISC_GET_PRIVATE(ndisc);
NMNDiscDataInternal *rdata = &priv->rdata;
nm_netns_nexthop_id_release_all(nm_l3cfg_get_netns(priv->config.l3cfg), ndisc);
g_array_unref(rdata->gateways);
g_array_unref(rdata->addresses);
g_array_unref(rdata->routes);

View file

@ -100,6 +100,7 @@ typedef struct _NMNDiscGateway {
struct in6_addr address;
gint64 expiry_msec;
NMIcmpv6RouterPref preference;
guint32 nexthop_id;
} NMNDiscGateway;
typedef struct _NMNDiscAddress {