From 88e7ea20c3d1bfca9eb0629519655ea387384014 Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 28 Jan 2022 16:22:32 +0100 Subject: [PATCH 1/5] l3cfg: only constructing log message when needed in _l3_acd_data_state_set_full() --- src/core/nm-l3cfg.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index b42ff8cb75..0b94f8cb08 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -2071,17 +2071,19 @@ _nm_printf(5, 6) static void _l3_acd_data_state_set_full(NML3Cfg *self, else changed = FALSE; - if (format) { - gs_free char *msg = NULL; - va_list args; + if (_LOGT_ENABLED()) { + if (format) { + gs_free char *msg = NULL; + va_list args; - va_start(args, format); - msg = g_strdup_vprintf(format, args); - va_end(args); + va_start(args, format); + msg = g_strdup_vprintf(format, args); + va_end(args); - _LOGT_acd(acd_data, "set state to %s (%s)", _l3_acd_addr_state_to_string(state), msg); - } else - _LOGT_acd(acd_data, "set state to %s", _l3_acd_addr_state_to_string(state)); + _LOGT_acd(acd_data, "set state to %s (%s)", _l3_acd_addr_state_to_string(state), msg); + } else + _LOGT_acd(acd_data, "set state to %s", _l3_acd_addr_state_to_string(state)); + } if (changed && allow_commit) { /* The availability of an address just changed (and we are instructed to From 601285301041e3f59eb74473f29d1847c3a2ec82 Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 28 Jan 2022 16:38:23 +0100 Subject: [PATCH 2/5] l3cfg: use nm_g_timeout_add_source() helpers in "nm-l3cfg.c" It's shorter, and brevity lets us focus on the important things in the code. --- src/core/nm-l3cfg.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index 0b94f8cb08..48a491193b 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -1517,12 +1517,7 @@ _l3_acd_nacd_event(int fd, GIOCondition condition, gpointer user_data) "in %u msec)", timeout_msec); self->priv.p->nacd_event_down_source = - nm_g_timeout_source_new(timeout_msec, - G_PRIORITY_DEFAULT, - _l3_acd_nacd_event_down_timeout_cb, - self, - NULL); - g_source_attach(self->priv.p->nacd_event_down_source, NULL); + nm_g_timeout_add_source(timeout_msec, _l3_acd_nacd_event_down_timeout_cb, self); } break; default: @@ -1583,12 +1578,9 @@ _l3_acd_nacd_instance_reset(NML3Cfg *self, NMTernary start_timer, gboolean acd_d break; case NM_TERNARY_TRUE: self->priv.p->nacd_instance_ensure_retry = - nm_g_timeout_source_new_seconds(ACD_ENSURE_RATELIMIT_MSEC / 1000u, - G_PRIORITY_DEFAULT, + nm_g_timeout_add_seconds_source(ACD_ENSURE_RATELIMIT_MSEC / 1000u, _l3_acd_nacd_instance_ensure_retry_cb, - self, - NULL); - g_source_attach(self->priv.p->nacd_instance_ensure_retry, NULL); + self); break; case NM_TERNARY_DEFAULT: break; @@ -1939,12 +1931,9 @@ _l3_acd_data_timeout_schedule(AcdData *acd_data, gint64 timeout_msec) { nm_clear_g_source_inst(&acd_data->acd_data_timeout_source); acd_data->acd_data_timeout_source = - nm_g_timeout_source_new(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT), - G_PRIORITY_DEFAULT, + nm_g_timeout_add_source(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT), _l3_acd_data_timeout_cb, - acd_data, - NULL); - g_source_attach(acd_data->acd_data_timeout_source, NULL); + acd_data); } static void From f00980e5cae536d35e39e28e2b796de1d7f27f4c Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 28 Jan 2022 16:59:45 +0100 Subject: [PATCH 3/5] l3cfg: add assertion for scheduling timeout in _l3_acd_data_timeout_schedule() The first point is that ACD timeout is strongly tied to the current state. That is (somewhat) obvious, because _l3_acd_data_state_set_full() will clear any pending timeout. So you can only schedule a timeout *after* setting the state, and setting the next state, will clear the timeout. Likewise, note that l3_acd_data_state_change() for the event ACD_STATE_CHANGE_MODE_TIMEOUT asserts that it is only called in the few states where that is expected. See rhbz#2047788 where that assertion gets hit. The first point means that we must only schedule a timer when we are also in a state that supports that. Add an assertion for that at the point when scheduling the timeout. The assert at this point is useful, because it catches the moment when we do the wrong thing (instead of getting the assertion later during the timeout, when we no longer know where the error happened). See-also: https://bugzilla.redhat.com/show_bug.cgi?id=2047788 --- src/core/nm-l3cfg.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index 48a491193b..fc3b427894 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -1929,6 +1929,19 @@ _l3_acd_data_timeout_cb(gpointer user_data) static void _l3_acd_data_timeout_schedule(AcdData *acd_data, gint64 timeout_msec) { + /* in _l3_acd_data_state_set_full() we clear the timer. At the same time, + * in _l3_acd_data_state_change(ACD_STATE_CHANGE_MODE_TIMEOUT) we only + * expect timeouts in certain states. + * + * That means, scheduling a timeout is only correct if we are in a certain + * state, which allows to handle timeouts. This assert checks for that to + * ensure we don't call a timeout in an unexpected state. */ + nm_assert(NM_IN_SET(acd_data->info.state, + NM_L3_ACD_ADDR_STATE_PROBING, + NM_L3_ACD_ADDR_STATE_USED, + NM_L3_ACD_ADDR_STATE_DEFENDING, + NM_L3_ACD_ADDR_STATE_CONFLICT)); + nm_clear_g_source_inst(&acd_data->acd_data_timeout_source); acd_data->acd_data_timeout_source = nm_g_timeout_add_source(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT), From d7a951b947c84c94f064b763408f1bc81da9d2fd Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 28 Jan 2022 18:48:13 +0100 Subject: [PATCH 4/5] l3cfg: add comment to _acd_data_collect_tracks_data() about linear search --- src/core/nm-l3cfg.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index fc3b427894..1ac477275d 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -1334,6 +1334,15 @@ _acd_data_collect_tracks_data(const AcdData *acd_data, guint n = 0; guint i; + /* We do a simple search over all track-infos for the best, which determines + * our ACD state. That is, we prefer ACD disabled, and otherwise the + * shortest configured timeout. + * + * This linear search is probably fast enough, because we expect that each + * address/acd_data has few trackers. + * The alternative would be caching the best result, but that is more complicated, + * so not done. */ + for (i = 0; i < acd_data->info.n_track_infos; i++) { const NML3AcdAddrTrackInfo *acd_track = &acd_data->info.track_infos[i]; From db0d84f13a4c5a73403474f8b4af4b905fa60d4b Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Fri, 28 Jan 2022 17:52:34 +0100 Subject: [PATCH 5/5] l3cfg: fix handling "instance-reset" ACD event The ACD state handling is unfortunately very complicated. That is, because we obviously need to track state about how ACD is going (the acd_data, and in particular NML3AcdAddrState). Then there are various things that can happen, which are the AcdStateChangeMode enums. All these state-changes come together in one function: _l3_acd_data_state_change(), which is therefore complicated (I don't think that it would become simpler by spreading this code out to different functions, on the contrary). Anyway. So, what happens when we need to reset the n-acd instance? For example, because the MAC address of the link changed or some error. I guess, we need to restart probing. Previously, I think this was not handled properly. We already tried to fix this several times, the last was commit b3316063862b ('l3cfg: on n-acd instance-reset clear also ready ACD state'). There is still an issue ([1]). The bug [1] is, that we are in state NM_L3_ACD_ADDR_STATE_READY, during ACD_STATE_CHANGE_MODE_TIMEOUT event. That leads to an assertion failure. #5 0x00007f23be74698f in g_assertion_message_expr (domain=0x5629aca70359 "nm", file=0x5629aca62aab "src/core/nm-l3cfg.c", line=2395, func=0x5629acb26b30 <__func__.72.lto_priv.4> "_l3_acd_data_state_change", expr=) at ../glib/gtestutils.c:3091 #6 0x00005629ac937e46 in _l3_acd_data_state_change (self=0x5629add69790, acd_data=0x5629add8d520, state_change_mode=ACD_STATE_CHANGE_MODE_TIMEOUT, sender_addr=0x0, p_now_msec=0x7ffded506460) at src/core/nm-l3cfg.c:2395 #7 0x00005629ac939f4d in _l3_acd_data_timeout_cb (user_data=user_data@entry=0x5629add8d520) at src/core/nm-l3cfg.c:1933 #8 0x00007f23be71c5a1 in g_timeout_dispatch (source=0x5629addd7a80, callback=0x5629ac939ee0 <_l3_acd_data_timeout_cb>, user_data=0x5629add8d520) at ../glib/gmain.c:4889 #9 0x00007f23be71bd4f in g_main_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:3337 #10 g_main_context_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:4055 That can only happen, (I think) when we scheduled the timeout during an earlier ACD_STATE_CHANGE_MODE_INSTANCE_RESET event. Meaning, we need to handle instance-reset better. Instead, during instance-reset, switch always back to state PROBING, and let the timeout figure it out. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2047788 --- src/core/nm-l3cfg.c | 87 ++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c index 1ac477275d..aed4cf05c9 100644 --- a/src/core/nm-l3cfg.c +++ b/src/core/nm-l3cfg.c @@ -2351,22 +2351,7 @@ handle_init: /* we are already probing. There is nothing to do for this timeout. */ return; } - - nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); - - if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC - + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC - >= (*p_now_msec)) { - /* hm. We failed to create a new probe too long. Something is really wrong - * internally, but let's ignore the issue and assume the address is good. What - * else would we do? Assume the address is USED? */ - _LOGT_acd(acd_data, - "probe-good (waiting for creating probe timed out. Assume good)"); - goto handle_start_defending; - } - - log_reason = "retry probing on timeout"; - goto handle_start_probing; + /* fall-through */ case NM_L3_ACD_ADDR_STATE_USED: case NM_L3_ACD_ADDR_STATE_CONFLICT: @@ -2386,7 +2371,10 @@ handle_init: acd_data->acd_defend_type_desired = acd_defend_type; if (acd_timeout_msec <= 0) { - log_reason = "acd disabled by configuration (restart after previous conflict)"; + if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) + log_reason = "acd disabled by configuration (timeout during probing)"; + else + log_reason = "acd disabled by configuration (restart after previous conflict)"; goto handle_probing_done; } @@ -2396,6 +2384,23 @@ handle_init: } nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); + + if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) { + if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC + + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC + >= (*p_now_msec)) { + /* hm. We failed to create a new probe too long. Something is really wrong + * internally, but let's ignore the issue and assume the address is good. What + * else would we do? Assume the address is USED? */ + _LOGT_acd(acd_data, + "probe-good (waiting for creating probe timed out. Assume good)"); + goto handle_start_defending; + } + + log_reason = "retry probing on timeout"; + goto handle_start_probing; + } + acd_data->probing_timestamp_msec = (*p_now_msec); acd_data->probing_timeout_msec = acd_timeout_msec; if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_USED) @@ -2616,39 +2621,23 @@ handle_init: return; case ACD_STATE_CHANGE_MODE_INSTANCE_RESET: + nm_assert(NM_IN_SET(acd_data->info.state, + NM_L3_ACD_ADDR_STATE_PROBING, + NM_L3_ACD_ADDR_STATE_DEFENDING, + NM_L3_ACD_ADDR_STATE_READY, + NM_L3_ACD_ADDR_STATE_USED, + NM_L3_ACD_ADDR_STATE_CONFLICT, + NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED)); - switch (acd_data->info.state) { - case NM_L3_ACD_ADDR_STATE_INIT: - nm_assert_not_reached(); - return; - case NM_L3_ACD_ADDR_STATE_PROBING: - case NM_L3_ACD_ADDR_STATE_DEFENDING: - case NM_L3_ACD_ADDR_STATE_READY: - if (!acd_data->nacd_probe) { - /* we failed starting to probe before and have a timer running to - * restart. We don't do anything now, but let the timer handle it. - * This also implements some rate limiting for us. */ - _LOGT_acd(acd_data, - "n-acd instance reset. Ignore event while in state %s", - _l3_acd_addr_state_to_string(acd_data->info.state)); - return; - } - - _LOGT_acd(acd_data, - "n-acd instance reset. Trigger a restart while in state %s", - _l3_acd_addr_state_to_string(acd_data->info.state)); - acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe); - _l3_acd_data_timeout_schedule(acd_data, 0); - return; - case NM_L3_ACD_ADDR_STATE_USED: - case NM_L3_ACD_ADDR_STATE_CONFLICT: - case NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED: - /* as we have no probe, we are already handling this (e.g. by having - * a retry timer). Nothing to do. */ - nm_assert(!acd_data->nacd_probe); - return; - } - nm_assert_not_reached(); + /* an instance-reset is a dramatic event. We start over with probing. */ + _LOGT_acd(acd_data, + "n-acd instance reset. Reset to probing while in state %s", + _l3_acd_addr_state_to_string(acd_data->info.state)); + acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe); + acd_data->last_defendconflict_timestamp_msec = 0; + acd_data->probing_timestamp_msec = nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec); + _l3_acd_data_state_set(self, acd_data, NM_L3_ACD_ADDR_STATE_PROBING, FALSE); + _l3_acd_data_timeout_schedule(acd_data, 0); return; }