platform: rework handling of failed routes during nm_platform_ip_route_sync()

Previously, there was "temporary-not-available" mechanism in NML3Cfg,
which aimed to handle IPv6 routes with prefsrc. Theoretically, that
mechanism may have been extended to other use-cases, like IPv4 routes
with prefsrc. What it attempted to handle, is the inability to configure
such routes, unless the respective prefsrc address is configured and
non-tentative.  However, the address that we are waiting for, could also
be on another interface, so that mechanism wasn't applicable. This is
now replaced by _routes_watch_ip_addrs(). It seems there isn't anything
useful left for the "temporary-not-available" mechanism and it can go,
except...

We want to log a warning when we are unable to configure a route. Also,
in the future we might want to know when the IP configuration is
degradated due to inability to configure the desired routes (a condition
that  we might want to expose to the user, not only via logging; or we
may want to react on that).

However, with prefsrc routes we don't know right away whether the
inability to configure the route right away indicates an actual problem,
or whether that will resolve itself (e.g. after the address passes
DAD/ACD, after we received an DHCP lease or after the address was
configured on another interface).  Consequently, to know whether the
current inability to configure such a route is a problem, we need to
know the larger context.  nm_platform_ip_route_sync() does not have that
context.

Instead, nm_platform_ip_route_sync() needs only do debug log about
failure to configure routes. It  will now also  return all the failed
routes to NML3Cfg, which can decide whether that is a problem.

This reworks the previous "temporary-not-available" mechanism to track
the state of the failed routes, to eventually decide whether there is an
actual problem (and log about it).

Another problem this solves is that since commit ('platform: always
reconfigure IP routes even if removed externally'), we will eagerly
re-try to configure the same route over and over. We cannot just spam
the log with warnings about the same failure on every commit. We need to
remember that we already logged about the problem and rate limit
warnings otherwise. This is what the new mechanism also achieves.

Indeed, all this is mostly for the sole benefit of logging better
warnings (and not duplicated).
This commit is contained in:
Thomas Haller 2023-03-08 16:08:23 +01:00
parent b8dba58892
commit 1feaf427d2
No known key found for this signature in database
GPG key ID: 29C2366E4DFC5728
5 changed files with 303 additions and 313 deletions

View file

@ -3522,7 +3522,7 @@ _dev_ip_state_check(NMDevice *self, int addr_family)
&s_is_pending,
&s_is_failed);
has_tna = priv->l3cfg && nm_l3cfg_has_temp_not_available_obj(priv->l3cfg, addr_family);
has_tna = priv->l3cfg && nm_l3cfg_has_failedobj_pending(priv->l3cfg, addr_family);
if (has_tna)
s_is_pending = TRUE;
@ -4254,6 +4254,7 @@ _dev_l3_cfg_notify_cb(NML3Cfg *l3cfg, const NML3ConfigNotifyData *notify_data, N
_dev_ipshared4_spawn_dnsmasq(self);
nm_clear_l3cd(&priv->ipshared_data_4.v4.l3cd);
}
_dev_ip_state_check_async(self, AF_UNSPEC);
_dev_ipmanual_check_ready(self);
return;
case NM_L3_CONFIG_NOTIFY_TYPE_IPV4LL_EVENT:
@ -4263,10 +4264,6 @@ _dev_l3_cfg_notify_cb(NML3Cfg *l3cfg, const NML3ConfigNotifyData *notify_data, N
return;
case NM_L3_CONFIG_NOTIFY_TYPE_PLATFORM_CHANGE:
return;
case NM_L3_CONFIG_NOTIFY_TYPE_ROUTES_TEMPORARY_NOT_AVAILABLE_EXPIRED:
/* we commit again. This way we try to configure the routes.*/
_dev_l3_cfg_commit(self, FALSE);
return;
case NM_L3_CONFIG_NOTIFY_TYPE_PLATFORM_CHANGE_ON_IDLE:
if (NM_FLAGS_ANY(notify_data->platform_change_on_idle.obj_type_flags,
nmp_object_type_to_flags(NMP_OBJECT_TYPE_LINK)
@ -4298,9 +4295,6 @@ _dev_l3_cfg_notify_cb(NML3Cfg *l3cfg, const NML3ConfigNotifyData *notify_data, N
* synchronously to update the current state and schedule a commit. */
nm_ndisc_dad_failed(priv->ipac6_data.ndisc, conflicts, TRUE);
} else if (ready) {
if (nm_l3cfg_has_temp_not_available_obj(priv->l3cfg, AF_INET6))
_dev_l3_cfg_commit(self, FALSE);
nm_clear_l3cd(&priv->ipac6_data.l3cd);
_dev_ipac6_set_state(self, NM_DEVICE_IP_STATE_READY);
_dev_ip_state_check_async(self, AF_INET6);
@ -10307,14 +10301,6 @@ _dev_ipmanual_check_ready(NMDevice *self)
_dev_ipmanual_set_state(self, addr_family, NM_DEVICE_IP_STATE_FAILED);
_dev_ip_state_check_async(self, AF_UNSPEC);
} else if (ready) {
if (priv->ipmanual_data.state_x[IS_IPv4] != NM_DEVICE_IP_STATE_READY
&& nm_l3cfg_has_temp_not_available_obj(priv->l3cfg, addr_family)) {
/* Addresses with pending ACD/DAD are a possible cause for the
* presence of temporarily-not-available objects. Once all addresses
* are ready, retry to commit those unavailable objects. */
_dev_l3_cfg_commit(self, FALSE);
}
_dev_ipmanual_set_state(self, addr_family, NM_DEVICE_IP_STATE_READY);
_dev_ip_state_check_async(self, AF_UNSPEC);
}

View file

@ -11,6 +11,7 @@
#include <linux/if_ether.h>
#include <linux/rtnetlink.h>
#include "libnm-glib-aux/nm-prioq.h"
#include "libnm-glib-aux/nm-time-utils.h"
#include "libnm-platform/nm-platform.h"
#include "libnm-platform/nmp-object.h"
@ -123,25 +124,34 @@ typedef struct {
CList os_lst;
/* If we have a timeout pending, we link the instance to
* self->priv.p->obj_state_temporary_not_available_lst_head. */
CList os_temporary_not_available_lst;
/* If a NMPObject is no longer to be configured (but was configured
* during a previous commit), then we need to remember it so that the
* next commit can delete the address/route in kernel. It becomes a zombie. */
CList os_zombie_lst;
/* We might want to configure "obj" in platform, but it's currently not possible.
* For example, certain IPv6 routes can only be added after the IPv6 address
* becomes non-tentative (*sigh*). In such a case, we need to remember that, and
* retry later. If this timestamp is set to a non-zero value, then it means
* we tried to configure the obj (at that timestamp) and failed, but we are
* waiting to retry.
/* Used by _handle_routes_failed() mechanism. If "os_plobj" is set, then
* this is meaningless but should be set to zero.
*
* See also self->priv.p->obj_state_temporary_not_available_lst_head
* and self->priv.p->obj_state_temporary_not_available_timeout_source. */
gint64 os_temporary_not_available_timestamp_msec;
* If set to a non-zero value, this means adding the object failed. Until
* "os_failedobj_expiry_msec" we are still waiting whether we would be able to
* configure the object. Afterwards, we consider the element failed.
*
* Depending on "os_failedobj_prioq_idx", we are currently waiting whether the
* condition can resolve itself or becomes a failure. */
gint64 os_failedobj_expiry_msec;
/* The index into the "priv->failedobj_prioq" queue for objects that are failed.
* - this field is meaningless in case "os_plobj" is set (but it should be
* set to NM_PRIOQ_IDX_NULL).
* - otherwise, if "os_failedobj_expiry_msec" is 0, no error was detected so
* far. The index should be set to NM_PRIOQ_IDX_NULL.
* - otherwise, if the index is NM_PRIOQ_IDX_NULL it means that the object
* is not tracked by the queue, no grace timer is pending, and the object
* is considered failed.
* - otherwise, the index is used for tracking the element in the queue.
* It means, we are currently waiting to decide whether this will be a
* failure or not. */
guint os_failedobj_prioq_idx;
/* When the obj is a zombie (that means, it was previously configured by NML3Cfg, but
* now no longer), it needs to be deleted from platform. This ratelimits the time
@ -240,7 +250,6 @@ typedef struct _NML3CfgPrivate {
CList obj_state_lst_head;
CList obj_state_zombie_lst_head;
CList obj_state_temporary_not_available_lst_head;
GHashTable *acd_ipv4_addresses_on_link;
@ -287,7 +296,9 @@ typedef struct _NML3CfgPrivate {
guint64 pseudo_timestamp_counter;
GSource *obj_state_temporary_not_available_timeout_source;
NMPrioq failedobj_prioq;
GSource *failedobj_timeout_source;
gint64 failedobj_timeout_expiry_msec;
NML3CfgCommitType commit_on_idle_type;
@ -420,8 +431,6 @@ static NM_UTILS_ENUM2STR_DEFINE(
NM_UTILS_ENUM2STR(NM_L3_CONFIG_NOTIFY_TYPE_PLATFORM_CHANGE_ON_IDLE, "platform-change-on-idle"),
NM_UTILS_ENUM2STR(NM_L3_CONFIG_NOTIFY_TYPE_PRE_COMMIT, "pre-commit"),
NM_UTILS_ENUM2STR(NM_L3_CONFIG_NOTIFY_TYPE_POST_COMMIT, "post-commit"),
NM_UTILS_ENUM2STR(NM_L3_CONFIG_NOTIFY_TYPE_ROUTES_TEMPORARY_NOT_AVAILABLE_EXPIRED,
"routes-temporary-not-available-expired"),
NM_UTILS_ENUM2STR_IGNORE(_NM_L3_CONFIG_NOTIFY_TYPE_NUM), );
static NM_UTILS_ENUM2STR_DEFINE(_l3_acd_defend_type_to_string,
@ -778,8 +787,12 @@ _nm_n_acd_data_probe_new(NML3Cfg *self, in_addr_t addr, guint32 timeout_msec, gp
NMP_OBJECT_TYPE_IP4_ROUTE, \
NMP_OBJECT_TYPE_IP6_ROUTE)); \
nm_assert(!_obj_state->os_plobj || _obj_state->os_was_in_platform); \
nm_assert((_obj_state->os_temporary_not_available_timestamp_msec == 0) \
== c_list_is_empty(&_obj_state->os_temporary_not_available_lst)); \
nm_assert(_obj_state->os_failedobj_expiry_msec != 0 \
|| _obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL); \
nm_assert(_obj_state->os_failedobj_expiry_msec == 0 || !_obj_state->os_plobj); \
nm_assert(_obj_state->os_failedobj_expiry_msec == 0 \
|| c_list_is_empty(&_obj_state->os_zombie_lst)); \
nm_assert(_obj_state->os_failedobj_expiry_msec == 0 || _obj_state->obj); \
if (_self) { \
if (c_list_is_empty(&_obj_state->os_zombie_lst)) { \
nm_assert(_self->priv.p->combined_l3cd_commited); \
@ -787,10 +800,6 @@ _nm_n_acd_data_probe_new(NML3Cfg *self, in_addr_t addr, guint32 timeout_msec, gp
if (NM_MORE_ASSERTS > 5) { \
nm_assert(c_list_contains(&_self->priv.p->obj_state_lst_head, \
&_obj_state->os_lst)); \
nm_assert((_obj_state->os_temporary_not_available_timestamp_msec == 0) \
|| c_list_contains( \
&_self->priv.p->obj_state_temporary_not_available_lst_head, \
&_obj_state->os_temporary_not_available_lst)); \
nm_assert(_obj_state->os_plobj \
== nm_platform_lookup_obj(_self->priv.platform, \
NMP_CACHE_ID_TYPE_OBJECT_TYPE, \
@ -823,7 +832,8 @@ _obj_state_data_new(const NMPObject *obj, const NMPObject *plobj)
.os_was_in_platform = !!plobj,
.os_nm_configured = FALSE,
.os_dirty = FALSE,
.os_temporary_not_available_lst = C_LIST_INIT(obj_state->os_temporary_not_available_lst),
.os_failedobj_expiry_msec = 0,
.os_failedobj_prioq_idx = NM_PRIOQ_IDX_NULL,
.os_zombie_lst = C_LIST_INIT(obj_state->os_zombie_lst),
};
return obj_state;
@ -834,9 +844,10 @@ _obj_state_data_free(gpointer data)
{
ObjStateData *obj_state = data;
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
c_list_unlink_stale(&obj_state->os_lst);
c_list_unlink_stale(&obj_state->os_zombie_lst);
c_list_unlink_stale(&obj_state->os_temporary_not_available_lst);
nmp_object_unref(obj_state->obj);
nmp_object_unref(obj_state->os_plobj);
nm_g_slice_free(obj_state);
@ -874,15 +885,17 @@ _obj_state_data_to_string(const ObjStateData *obj_state, char *buf, gsize buf_si
} else if (obj_state->os_was_in_platform)
nm_strbuf_append_str(&buf, &buf_size, ", was-in-platform");
if (obj_state->os_temporary_not_available_timestamp_msec > 0) {
if (obj_state->os_failedobj_expiry_msec > 0) {
nm_utils_get_monotonic_timestamp_msec_cached(&now_msec);
nm_strbuf_append(
&buf,
nm_strbuf_append(&buf,
&buf_size,
", temporary-not-available-since=%" G_GINT64_FORMAT ".%03d",
(now_msec - obj_state->os_temporary_not_available_timestamp_msec) / 1000,
(int) ((now_msec - obj_state->os_temporary_not_available_timestamp_msec) % 1000));
}
", %s-since=%" G_GINT64_FORMAT ".%03d",
(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL) ? "failed"
: "failed-wait",
(obj_state->os_failedobj_expiry_msec - now_msec) / 1000,
(int) ((obj_state->os_failedobj_expiry_msec - now_msec) % 1000));
} else
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
return buf0;
}
@ -944,6 +957,7 @@ _obj_states_externally_removed_track(NML3Cfg *self, const NMPObject *obj, gboole
if (!in_platform && !c_list_is_empty(&obj_state->os_zombie_lst)) {
/* this is a zombie. We can forget about it.*/
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
nm_clear_nmp_object(&obj_state->os_plobj);
c_list_unlink(&obj_state->os_zombie_lst);
_LOGD("obj-state: zombie gone (untrack): %s",
@ -959,8 +973,23 @@ _obj_states_externally_removed_track(NML3Cfg *self, const NMPObject *obj, gboole
if (in_platform) {
nmp_object_ref_set(&obj_state->os_plobj, obj);
obj_state->os_was_in_platform = TRUE;
if (obj_state->os_failedobj_expiry_msec != 0) {
obj_state->os_failedobj_expiry_msec = 0;
if (obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL) {
_LOGT("obj-state: failed-obj: object now configured after failed earlier: %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
} else {
nm_prioq_remove(&self->priv.p->failedobj_prioq,
obj_state,
&obj_state->os_failedobj_prioq_idx);
_LOGT("obj-state: failed-obj: object now configured after waiting: %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
}
} else {
_LOGD("obj-state: appeared in platform: %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
}
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
goto out;
}
@ -1049,6 +1078,7 @@ _obj_states_update_all(NML3Cfg *self)
continue;
if (obj_state->os_plobj && obj_state->os_nm_configured) {
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
c_list_link_tail(&self->priv.p->obj_state_zombie_lst_head,
&obj_state->os_zombie_lst);
obj_state->os_zombie_count = ZOMBIE_COUNT_START;
@ -1059,6 +1089,9 @@ _obj_states_update_all(NML3Cfg *self)
_LOGD("obj-state: untrack: %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
nm_prioq_remove(&self->priv.p->failedobj_prioq,
obj_state,
&obj_state->os_failedobj_prioq_idx);
g_hash_table_iter_remove(&h_iter);
}
}
@ -1272,6 +1305,7 @@ _obj_state_zombie_lst_get_prune_lists(NML3Cfg *self,
if (--obj_state->os_zombie_count == 0) {
_LOGD("obj-state: prune zombie (untrack): %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
g_hash_table_remove(self->priv.p->obj_state_hash, obj_state);
continue;
}
@ -1306,6 +1340,7 @@ _obj_state_zombie_lst_prune_all(NML3Cfg *self, int addr_family)
if (--obj_state->os_zombie_count == 0) {
_LOGD("obj-state: zombie pruned during reapply (untrack): %s",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)));
nm_assert(obj_state->os_failedobj_prioq_idx == NM_PRIOQ_IDX_NULL);
g_hash_table_remove(self->priv.p->obj_state_hash, obj_state);
continue;
}
@ -3043,16 +3078,14 @@ nm_l3cfg_get_acd_addr_info(NML3Cfg *self, in_addr_t addr)
/*****************************************************************************/
gboolean
nm_l3cfg_has_temp_not_available_obj(NML3Cfg *self, int addr_family)
nm_l3cfg_has_failedobj_pending(NML3Cfg *self, int addr_family)
{
ObjStateData *obj_state;
nm_assert(NM_IS_L3CFG(self));
nm_assert_addr_family(addr_family);
c_list_for_each_entry (obj_state,
&self->priv.p->obj_state_temporary_not_available_lst_head,
os_temporary_not_available_lst) {
nm_prioq_for_each (&self->priv.p->failedobj_prioq, obj_state) {
if (NMP_OBJECT_GET_ADDR_FAMILY(obj_state->obj) == addr_family)
return TRUE;
}
@ -3940,79 +3973,91 @@ out:
/*****************************************************************************/
static gboolean
_routes_temporary_not_available_timeout(gpointer user_data)
_failedobj_timeout_cb(gpointer user_data)
{
NML3Cfg *self = NM_L3CFG(user_data);
ObjStateData *obj_state;
gint64 now_msec;
gint64 expiry_msec;
nm_clear_g_source_inst(&self->priv.p->obj_state_temporary_not_available_timeout_source);
_LOGT("obj-state: failed-obj: handle timeout");
obj_state = c_list_first_entry(&self->priv.p->obj_state_temporary_not_available_lst_head,
ObjStateData,
os_temporary_not_available_lst);
nm_clear_g_source_inst(&self->priv.p->failedobj_timeout_source);
if (!obj_state)
return G_SOURCE_CONTINUE;
nm_l3cfg_commit_on_idle_schedule(self, NM_L3_CFG_COMMIT_TYPE_AUTO);
now_msec = nm_utils_get_monotonic_timestamp_msec();
expiry_msec = obj_state->os_temporary_not_available_timestamp_msec
+ ROUTES_TEMPORARY_NOT_AVAILABLE_MAX_AGE_MSEC;
if (now_msec < expiry_msec) {
/* the timeout is not yet reached. Restart the timer... */
self->priv.p->obj_state_temporary_not_available_timeout_source =
nm_g_timeout_add_source(expiry_msec - now_msec,
_routes_temporary_not_available_timeout,
self);
return G_SOURCE_CONTINUE;
}
/* One (or several) routes expired. We emit a signal, but we don't schedule it again.
* We expect the callers to commit again, which will one last time try to configure
* the route. If that again fails, we detect the timeout, log a warning and don't
* track the object as not temporary-not-available anymore. */
_nm_l3cfg_emit_signal_notify_simple(
self,
NM_L3_CONFIG_NOTIFY_TYPE_ROUTES_TEMPORARY_NOT_AVAILABLE_EXPIRED);
return G_SOURCE_CONTINUE;
}
static gboolean
_routes_temporary_not_available_update(NML3Cfg *self,
int addr_family,
GPtrArray *routes_temporary_not_available_arr)
static void
_failedobj_reschedule(NML3Cfg *self, gint64 now_msec)
{
ObjStateData *obj_state;
ObjStateData *obj_state_safe;
gint64 now_msec;
gboolean prune_all = FALSE;
gboolean success = TRUE;
guint i;
const NMPClass *klass;
klass = nmp_class_from_type(NMP_OBJECT_TYPE_IP_ROUTE(NM_IS_IPv4(addr_family)));
now_msec = nm_utils_get_monotonic_timestamp_msec();
if (nm_g_ptr_array_len(routes_temporary_not_available_arr) <= 0) {
prune_all = TRUE;
goto out_prune;
}
c_list_for_each_entry (obj_state,
&self->priv.p->obj_state_temporary_not_available_lst_head,
os_temporary_not_available_lst) {
if (NMP_OBJECT_GET_CLASS(obj_state->obj) == klass) {
nm_assert(obj_state->os_temporary_not_available_timestamp_msec > 0);
obj_state->os_tna_dirty = TRUE;
}
}
for (i = 0; i < routes_temporary_not_available_arr->len; i++) {
const NMPObject *o = routes_temporary_not_available_arr->pdata[i];
char sbuf[NM_UTILS_TO_STRING_BUFFER_SIZE];
ObjStateData *obj_state;
nm_utils_get_monotonic_timestamp_msec_cached(&now_msec);
again:
obj_state = nm_prioq_peek(&self->priv.p->failedobj_prioq);
if (obj_state && obj_state->os_failedobj_expiry_msec <= now_msec) {
/* The object is already expired... */
/* we shouldn't have a "os_plobj", because if we had, we should have
* removed "obj_state" from the queue. */
nm_assert(!obj_state->os_plobj);
/* we need to have an "obj", otherwise the "obj_state" instance
* shouldn't exist (as it also has not "os_plobj"). */
nm_assert(obj_state->obj);
/* It seems that nm_platform_ip_route_sync() signaled success and did
* not report the route as missing. Regardless, it is still not
* configured and the timeout expired. */
nm_prioq_remove(&self->priv.p->failedobj_prioq,
obj_state,
&obj_state->os_failedobj_prioq_idx);
_LOGW(
"missing IPv%c route: %s",
nm_utils_addr_family_to_char(NMP_OBJECT_GET_TYPE(obj_state->obj)),
nmp_object_to_string(obj_state->obj, NMP_OBJECT_TO_STRING_PUBLIC, sbuf, sizeof(sbuf)));
goto again;
}
if (!obj_state) {
if (nm_clear_g_source_inst(&self->priv.p->failedobj_timeout_source))
_LOGT("obj-state: failed-obj: cancel timeout");
return;
}
if (nm_g_timeout_reschedule(&self->priv.p->failedobj_timeout_source,
&self->priv.p->failedobj_timeout_expiry_msec,
obj_state->os_failedobj_expiry_msec,
_failedobj_timeout_cb,
self)) {
_LOGT(
"obj-state: failed-obj: schedule timeout in %" G_GINT64_FORMAT " msec",
NM_MAX((gint64) 0,
obj_state->os_failedobj_expiry_msec - nm_utils_get_monotonic_timestamp_msec()));
}
}
static void
_failedobj_handle_routes(NML3Cfg *self, int addr_family, GPtrArray *routes_failed)
{
const gint64 now_msec = nm_utils_get_monotonic_timestamp_msec();
char sbuf[NM_UTILS_TO_STRING_BUFFER_SIZE];
ObjStateData *obj_state;
guint i;
if (!routes_failed)
return;
for (i = 0; i < routes_failed->len; i++) {
const NMPObject *o = routes_failed->pdata[i];
const NMPlatformIPXRoute *rt = NMP_OBJECT_CAST_IPX_ROUTE(o);
gboolean just_started_to_fail = FALSE;
gboolean just_failed = FALSE;
gboolean arm_timer = FALSE;
int grace_timeout_msec;
gint64 grace_expiry_mesc;
nm_assert(NMP_OBJECT_GET_TYPE(o) == NMP_OBJECT_TYPE_IP_ROUTE(NM_IS_IPv4(addr_family)));
@ -4024,70 +4069,83 @@ _routes_temporary_not_available_update(NML3Cfg *self,
continue;
}
if (obj_state->os_temporary_not_available_timestamp_msec > 0) {
nm_assert(obj_state->os_temporary_not_available_timestamp_msec > 0
&& obj_state->os_temporary_not_available_timestamp_msec <= now_msec);
if (!obj_state->os_tna_dirty) {
/* Odd, this only can happen if routes_temporary_not_available_arr contains duplicates.
* It should not. */
nm_assert_not_reached();
if (obj_state->os_plobj) {
/* This object is apparently present in platform. Not sure what this failure report
* is about. Probably some harmless glitch. Ignore. */
continue;
}
if (now_msec > obj_state->os_temporary_not_available_timestamp_msec
+ ROUTES_TEMPORARY_NOT_AVAILABLE_MAX_AGE_MSEC) {
/* Timeout. Could not add this address.
*
* For now, keep it obj_state->os_tna_dirty and prune it below. */
_LOGW("failure to add IPv%c route: %s",
/* This route failed, but why? That determines the grace time that we
* give before considering it bad. */
if (!nm_ip_addr_is_null(addr_family,
nm_platform_ip_route_get_pref_src(addr_family, &rt->rx))) {
/* This route has a pref_src. A common cause for being unable to
* configure such routes, is that the referenced IP address is not
* configured/ready (yet). Give a longer timeout to this case. */
grace_timeout_msec = 10000;
} else {
/* Other route don't have any grace time. There is no retry/wait,
* they are a failure right away. */
grace_timeout_msec = 0;
}
grace_expiry_mesc = now_msec + grace_timeout_msec;
if (obj_state->os_failedobj_expiry_msec == 0) {
/* This is a new failure that we didn't see before... */
obj_state->os_failedobj_expiry_msec = grace_expiry_mesc;
if (grace_timeout_msec == 0)
just_failed = TRUE;
else {
arm_timer = TRUE;
just_started_to_fail = TRUE;
}
} else {
if (obj_state->os_failedobj_expiry_msec > grace_expiry_mesc) {
/* Shorten the grace timeout. We anyway rearm below... */
obj_state->os_failedobj_expiry_msec = grace_expiry_mesc;
}
if (obj_state->os_failedobj_expiry_msec <= now_msec) {
/* The grace period is (already) expired. */
if (obj_state->os_failedobj_prioq_idx != NM_PRIOQ_IDX_NULL) {
/* We are still tracking the element. It just is about to become failed. */
just_failed = TRUE;
}
} else
arm_timer = TRUE;
}
nm_prioq_update(&self->priv.p->failedobj_prioq,
obj_state,
&obj_state->os_failedobj_prioq_idx,
arm_timer);
if (just_failed) {
_LOGW("unable to configure IPv%c route: %s",
nm_utils_addr_family_to_char(addr_family),
nmp_object_to_string(o, NMP_OBJECT_TO_STRING_PUBLIC, sbuf, sizeof(sbuf)));
success = FALSE;
continue;
}
obj_state->os_tna_dirty = FALSE;
continue;
}
_LOGT("(temporarily) unable to add IPv%c route: %s",
nm_utils_addr_family_to_char(addr_family),
nmp_object_to_string(o, NMP_OBJECT_TO_STRING_PUBLIC, sbuf, sizeof(sbuf)));
obj_state->os_tna_dirty = FALSE;
obj_state->os_temporary_not_available_timestamp_msec = now_msec;
c_list_link_tail(&self->priv.p->obj_state_temporary_not_available_lst_head,
&obj_state->os_temporary_not_available_lst);
}
out_prune:
c_list_for_each_entry_safe (obj_state,
obj_state_safe,
&self->priv.p->obj_state_temporary_not_available_lst_head,
os_temporary_not_available_lst) {
if (prune_all || obj_state->os_tna_dirty) {
if (NMP_OBJECT_GET_CLASS(obj_state->obj) == klass) {
obj_state->os_temporary_not_available_timestamp_msec = 0;
c_list_unlink(&obj_state->os_temporary_not_available_lst);
} else if (just_started_to_fail) {
_LOGT("obj-state: failed-obj: unable to configure %s. Wait for %d msec",
_obj_state_data_to_string(obj_state, sbuf, sizeof(sbuf)),
grace_timeout_msec);
}
}
}
nm_clear_g_source_inst(&self->priv.p->obj_state_temporary_not_available_timeout_source);
static int
_failedobj_prioq_cmp(gconstpointer a, gconstpointer b)
{
const ObjStateData *object_state_a = a;
const ObjStateData *object_state_b = b;
obj_state = c_list_first_entry(&self->priv.p->obj_state_temporary_not_available_lst_head,
ObjStateData,
os_temporary_not_available_lst);
if (obj_state) {
self->priv.p->obj_state_temporary_not_available_timeout_source =
nm_g_timeout_add_source((obj_state->os_temporary_not_available_timestamp_msec
+ ROUTES_TEMPORARY_NOT_AVAILABLE_MAX_AGE_MSEC - now_msec),
_routes_temporary_not_available_timeout,
self);
}
nm_assert(object_state_a);
nm_assert(object_state_a->os_failedobj_expiry_msec > 0);
nm_assert(object_state_b);
nm_assert(object_state_b->os_failedobj_expiry_msec > 0);
return success;
NM_CMP_SELF(object_state_a, object_state_b);
NM_CMP_FIELD(object_state_a, object_state_b, os_failedobj_expiry_msec);
return 0;
}
/*****************************************************************************/
@ -4459,10 +4517,23 @@ _routes_watch_ip_addrs(NML3Cfg *self, int addr_family, GPtrArray *addresses, GPt
guint i;
guint j;
/* IP routes that have a pref_src, can only be configured in kernel if
* that address exists (and is non-tentative, in case of IPv6).
* That address might be on another interface. So we actually watch all
* other interfaces. */
/* IP routes that have a pref_src, can only be configured in kernel if that
* address exists (and is non-tentative, in case of IPv6). That address
* might be on another interface. So we actually watch all other
* interfaces.
*
* Note that while we track failure to configure routes via "failedobj"
* mechanism, we eagerly register watchers, even if the route is already
* successfully configured or if the route is to be configure the first
* time. Maybe that could be improved, but
* - watchers should be cheap unless they notify the event.
* - upon change we do an async commit, which is maybe not entirely cheap
* but cheap enough. More importantly, committing is something that
* *always* should be permissible -- because NML3Cfg has multiple,
* independent users, that don't know about each other and which
* independently are allowed to issue a commit when they think something
* relevant changed. If there are really too many, unnecessary commits,
* then the cause needs to be understood and addressed explicitly. */
if (!routes)
goto out;
@ -4736,9 +4807,8 @@ _l3_commit_one(NML3Cfg *self,
gs_unref_ptrarray GPtrArray *routes_nodev = NULL;
gs_unref_ptrarray GPtrArray *addresses_prune = NULL;
gs_unref_ptrarray GPtrArray *routes_prune = NULL;
gs_unref_ptrarray GPtrArray *routes_temporary_not_available_arr = NULL;
gs_unref_ptrarray GPtrArray *routes_failed = NULL;
NMIPRouteTableSyncMode route_table_sync;
gboolean final_failure_for_temporary_not_available = FALSE;
char sbuf_commit_type[50];
guint i;
@ -4861,16 +4931,9 @@ _l3_commit_one(NML3Cfg *self,
self->priv.ifindex,
routes,
routes_prune,
&routes_temporary_not_available_arr);
&routes_failed);
final_failure_for_temporary_not_available = FALSE;
if (!_routes_temporary_not_available_update(self,
addr_family,
routes_temporary_not_available_arr))
final_failure_for_temporary_not_available = TRUE;
/* FIXME(l3cfg) */
(void) final_failure_for_temporary_not_available;
_failedobj_handle_routes(self, addr_family, routes_failed);
}
static void
@ -4943,6 +5006,8 @@ _l3_commit(NML3Cfg *self, NML3CfgCommitType commit_type, gboolean is_idle)
_l3_commit_one(self, AF_INET, commit_type, changed_combined_l3cd, l3cd_old);
_l3_commit_one(self, AF_INET6, commit_type, changed_combined_l3cd, l3cd_old);
_failedobj_reschedule(self, 0);
_l3_commit_mptcp(self, commit_type);
_l3_acd_data_process_changes(self);
@ -5304,7 +5369,6 @@ nm_l3cfg_init(NML3Cfg *self)
c_list_init(&self->priv.p->acd_event_notify_lst_head);
c_list_init(&self->priv.p->commit_type_lst_head);
c_list_init(&self->priv.p->obj_state_lst_head);
c_list_init(&self->priv.p->obj_state_temporary_not_available_lst_head);
c_list_init(&self->priv.p->obj_state_zombie_lst_head);
c_list_init(&self->priv.p->blocked_lst_head_4);
c_list_init(&self->priv.p->blocked_lst_head_6);
@ -5316,6 +5380,8 @@ nm_l3cfg_init(NML3Cfg *self)
nmp_object_indirect_id_equal,
_obj_state_data_free,
NULL);
nm_prioq_init(&self->priv.p->failedobj_prioq, _failedobj_prioq_cmp);
}
static void
@ -5363,6 +5429,9 @@ finalize(GObject *object)
TRUE);
}
nm_prioq_destroy(&self->priv.p->failedobj_prioq);
nm_clear_g_source_inst(&self->priv.p->failedobj_timeout_source);
nm_assert(c_list_is_empty(&self->internal_netns.signal_pending_lst));
nm_assert(c_list_is_empty(&self->internal_netns.ecmp_track_ifindex_lst_head));
@ -5390,11 +5459,8 @@ finalize(GObject *object)
nm_clear_g_source_inst(&self->priv.p->nacd_instance_ensure_retry);
nm_clear_g_source_inst(&self->priv.p->nacd_event_down_source);
nm_clear_g_source_inst(&self->priv.p->obj_state_temporary_not_available_timeout_source);
nm_clear_pointer(&self->priv.p->obj_state_hash, g_hash_table_destroy);
nm_assert(c_list_is_empty(&self->priv.p->obj_state_lst_head));
nm_assert(c_list_is_empty(&self->priv.p->obj_state_temporary_not_available_lst_head));
nm_assert(c_list_is_empty(&self->priv.p->obj_state_zombie_lst_head));
if (_nodev_routes_untrack(self, AF_INET))

View file

@ -128,8 +128,6 @@ typedef enum {
* and neither should you call into NML3Cfg again (reentrancy). */
NM_L3_CONFIG_NOTIFY_TYPE_L3CD_CHANGED,
NM_L3_CONFIG_NOTIFY_TYPE_ROUTES_TEMPORARY_NOT_AVAILABLE_EXPIRED,
NM_L3_CONFIG_NOTIFY_TYPE_ACD_EVENT,
/* emitted before the merged l3cd is committed to platform.
@ -408,7 +406,7 @@ gboolean nm_l3cfg_check_ready(NML3Cfg *self,
NML3CfgCheckReadyFlags flags,
GArray **conflicts);
gboolean nm_l3cfg_has_temp_not_available_obj(NML3Cfg *self, int addr_family);
gboolean nm_l3cfg_has_failedobj_pending(NML3Cfg *self, int addr_family);
/*****************************************************************************/

View file

@ -4593,53 +4593,6 @@ nm_platform_ip_address_flush(NMPlatform *self, int addr_family, int ifindex)
/*****************************************************************************/
static gboolean
_route_is_temporary_not_available(NMPlatform *self,
int err,
const NMPObject *obj,
const char **out_err_reason)
{
const NMPlatformIPXRoute *rx;
const NMPlatformIP6Address *a;
nm_assert(NM_IS_PLATFORM(self));
nm_assert(NMP_OBJECT_IS_VALID(obj));
if (err != -EINVAL)
return FALSE;
rx = NMP_OBJECT_CAST_IPX_ROUTE(obj);
if (rx->rx.rt_source != NM_IP_CONFIG_SOURCE_USER) {
/* we only allow this workaround for routes added manually by the user. */
return FALSE;
}
if (NMP_OBJECT_GET_TYPE(obj) == NMP_OBJECT_TYPE_IP4_ROUTE) {
return FALSE;
} else {
const NMPlatformIP6Route *r = &rx->r6;
/* trying to add an IPv6 route with pref-src fails, if the address is
* still tentative (rh#1452684). We need to hack around that.
*
* Detect it, by guessing whether that's the case. */
if (IN6_IS_ADDR_UNSPECIFIED(&r->pref_src))
return FALSE;
a = nm_platform_ip6_address_get(self, r->ifindex, &r->pref_src);
if (!a)
return FALSE;
if (!NM_FLAGS_HAS(a->n_ifa_flags, IFA_F_TENTATIVE)
|| NM_FLAGS_HAS(a->n_ifa_flags, IFA_F_DADFAILED))
return FALSE;
*out_err_reason = "tentative IPv6 src address not ready";
return TRUE;
}
}
static guint
_ipv6_temporary_addr_prefixes_keep_hash(gconstpointer ptr)
{
@ -4968,8 +4921,8 @@ nm_platform_ip_route_get_prune_list(NMPlatform *self,
* at the end of the operation. Note that if @routes contains
* the same route, then it will not be deleted. @routes overrules
* @routes_prune list.
* @out_temporary_not_available: (allow-none) (out): routes that could
* currently not be synced. The caller shall keep them and try later again.
* @out_routes_failed: (allow-none) (out): routes that could
* not be synced/added.
*
* Returns: %TRUE on success.
*/
@ -4979,7 +4932,7 @@ nm_platform_ip_route_sync(NMPlatform *self,
int ifindex,
GPtrArray *routes,
GPtrArray *routes_prune,
GPtrArray **out_temporary_not_available)
GPtrArray **out_routes_failed)
{
const int IS_IPv4 = NM_IS_IPv4(addr_family);
const NMPlatformVTableRoute *vt;
@ -5000,7 +4953,6 @@ nm_platform_ip_route_sync(NMPlatform *self,
for (i_type = 0; routes && i_type < 2; i_type++) {
for (i = 0; i < routes->len; i++) {
gs_free char *extack_msg = NULL;
const char *err_reason = NULL;
int r;
conf_o = routes->pdata[i];
@ -5094,35 +5046,23 @@ nm_platform_ip_route_sync(NMPlatform *self,
sizeof(sbuf2)));
}
}
} else if (NMP_OBJECT_CAST_IP_ROUTE(conf_o)->rt_source < NM_IP_CONFIG_SOURCE_USER) {
_LOG3D(
"route-sync: ignore failure to add IPv%c route: %s: %s%s%s%s",
vt->is_ip4 ? '4' : '6',
nmp_object_to_string(conf_o, NMP_OBJECT_TO_STRING_PUBLIC, sbuf1, sizeof(sbuf1)),
nm_strerror(r),
NM_PRINT_FMT_QUOTED(extack_msg, " (", extack_msg, ")", ""));
} else if (out_temporary_not_available
&& _route_is_temporary_not_available(self, r, conf_o, &err_reason)) {
_LOG3D("route-sync: ignore temporary failure to add route (%s, %s%s%s%s): %s",
nm_strerror(r),
err_reason,
NM_PRINT_FMT_QUOTED(extack_msg, ", ", extack_msg, "", ""),
nmp_object_to_string(conf_o,
NMP_OBJECT_TO_STRING_PUBLIC,
sbuf1,
sizeof(sbuf1)));
if (!*out_temporary_not_available)
*out_temporary_not_available =
g_ptr_array_new_full(0, (GDestroyNotify) nmp_object_unref);
g_ptr_array_add(*out_temporary_not_available, (gpointer) nmp_object_ref(conf_o));
} else {
_LOG3W(
_LOG3D(
"route-sync: failure to add IPv%c route: %s: %s%s%s%s",
vt->is_ip4 ? '4' : '6',
nmp_object_to_string(conf_o, NMP_OBJECT_TO_STRING_PUBLIC, sbuf1, sizeof(sbuf1)),
nm_strerror(r),
NM_PRINT_FMT_QUOTED(extack_msg, " (", extack_msg, ")", ""));
success = FALSE;
if (out_routes_failed) {
if (!*out_routes_failed) {
*out_routes_failed =
g_ptr_array_new_with_free_func((GDestroyNotify) nmp_object_unref);
}
g_ptr_array_add(*out_routes_failed, (gpointer) nmp_object_ref(conf_o));
}
}
}
}

View file

@ -2332,7 +2332,7 @@ gboolean nm_platform_ip_route_sync(NMPlatform *self,
int ifindex,
GPtrArray *routes,
GPtrArray *routes_prune,
GPtrArray **out_temporary_not_available);
GPtrArray **out_routes_failed);
gboolean nm_platform_ip_route_flush(NMPlatform *self, int addr_family, int ifindex);