From 88e7ea20c3d1bfca9eb0629519655ea387384014 Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Fri, 28 Jan 2022 16:22:32 +0100
Subject: [PATCH 1/5] l3cfg: only constructing log message when needed in
 _l3_acd_data_state_set_full()

---
 src/core/nm-l3cfg.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index b42ff8cb75..0b94f8cb08 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -2071,17 +2071,19 @@ _nm_printf(5, 6) static void _l3_acd_data_state_set_full(NML3Cfg         *self,
     else
         changed = FALSE;
 
-    if (format) {
-        gs_free char *msg = NULL;
-        va_list       args;
+    if (_LOGT_ENABLED()) {
+        if (format) {
+            gs_free char *msg = NULL;
+            va_list       args;
 
-        va_start(args, format);
-        msg = g_strdup_vprintf(format, args);
-        va_end(args);
+            va_start(args, format);
+            msg = g_strdup_vprintf(format, args);
+            va_end(args);
 
-        _LOGT_acd(acd_data, "set state to %s (%s)", _l3_acd_addr_state_to_string(state), msg);
-    } else
-        _LOGT_acd(acd_data, "set state to %s", _l3_acd_addr_state_to_string(state));
+            _LOGT_acd(acd_data, "set state to %s (%s)", _l3_acd_addr_state_to_string(state), msg);
+        } else
+            _LOGT_acd(acd_data, "set state to %s", _l3_acd_addr_state_to_string(state));
+    }
 
     if (changed && allow_commit) {
         /* The availability of an address just changed (and we are instructed to

From 601285301041e3f59eb74473f29d1847c3a2ec82 Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Fri, 28 Jan 2022 16:38:23 +0100
Subject: [PATCH 2/5] l3cfg: use nm_g_timeout_add_source() helpers in
 "nm-l3cfg.c"

It's shorter, and brevity lets us focus on the important things in the
code.
---
 src/core/nm-l3cfg.c | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index 0b94f8cb08..48a491193b 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -1517,12 +1517,7 @@ _l3_acd_nacd_event(int fd, GIOCondition condition, gpointer user_data)
                       "in %u msec)",
                       timeout_msec);
                 self->priv.p->nacd_event_down_source =
-                    nm_g_timeout_source_new(timeout_msec,
-                                            G_PRIORITY_DEFAULT,
-                                            _l3_acd_nacd_event_down_timeout_cb,
-                                            self,
-                                            NULL);
-                g_source_attach(self->priv.p->nacd_event_down_source, NULL);
+                    nm_g_timeout_add_source(timeout_msec, _l3_acd_nacd_event_down_timeout_cb, self);
             }
             break;
         default:
@@ -1583,12 +1578,9 @@ _l3_acd_nacd_instance_reset(NML3Cfg *self, NMTernary start_timer, gboolean acd_d
         break;
     case NM_TERNARY_TRUE:
         self->priv.p->nacd_instance_ensure_retry =
-            nm_g_timeout_source_new_seconds(ACD_ENSURE_RATELIMIT_MSEC / 1000u,
-                                            G_PRIORITY_DEFAULT,
+            nm_g_timeout_add_seconds_source(ACD_ENSURE_RATELIMIT_MSEC / 1000u,
                                             _l3_acd_nacd_instance_ensure_retry_cb,
-                                            self,
-                                            NULL);
-        g_source_attach(self->priv.p->nacd_instance_ensure_retry, NULL);
+                                            self);
         break;
     case NM_TERNARY_DEFAULT:
         break;
@@ -1939,12 +1931,9 @@ _l3_acd_data_timeout_schedule(AcdData *acd_data, gint64 timeout_msec)
 {
     nm_clear_g_source_inst(&acd_data->acd_data_timeout_source);
     acd_data->acd_data_timeout_source =
-        nm_g_timeout_source_new(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT),
-                                G_PRIORITY_DEFAULT,
+        nm_g_timeout_add_source(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT),
                                 _l3_acd_data_timeout_cb,
-                                acd_data,
-                                NULL);
-    g_source_attach(acd_data->acd_data_timeout_source, NULL);
+                                acd_data);
 }
 
 static void

From f00980e5cae536d35e39e28e2b796de1d7f27f4c Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Fri, 28 Jan 2022 16:59:45 +0100
Subject: [PATCH 3/5] l3cfg: add assertion for scheduling timeout in
 _l3_acd_data_timeout_schedule()

The first point is that ACD timeout is strongly tied to the current state. That
is (somewhat) obvious, because _l3_acd_data_state_set_full() will clear any pending
timeout. So you can only schedule a timeout *after* setting the state,
and setting the next state, will clear the timeout.

Likewise, note that l3_acd_data_state_change() for the event
ACD_STATE_CHANGE_MODE_TIMEOUT asserts that it is only called in the few
states where that is expected. See rhbz#2047788 where that assertion
gets hit.

The first point means that we must only schedule a timer when we are
also in a state that supports that. Add an assertion for that at the
point when scheduling the timeout. The assert at this point is useful,
because it catches the moment when we do the wrong thing (instead of
getting the assertion later during the timeout, when we no longer know
where the error happened).

See-also: https://bugzilla.redhat.com/show_bug.cgi?id=2047788
---
 src/core/nm-l3cfg.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index 48a491193b..fc3b427894 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -1929,6 +1929,19 @@ _l3_acd_data_timeout_cb(gpointer user_data)
 static void
 _l3_acd_data_timeout_schedule(AcdData *acd_data, gint64 timeout_msec)
 {
+    /* in _l3_acd_data_state_set_full() we clear the timer. At the same time,
+     * in _l3_acd_data_state_change(ACD_STATE_CHANGE_MODE_TIMEOUT) we only
+     * expect timeouts in certain states.
+     *
+     * That means, scheduling a timeout is only correct if we are in a certain
+     * state, which allows to handle timeouts. This assert checks for that to
+     * ensure we don't call a timeout in an unexpected state. */
+    nm_assert(NM_IN_SET(acd_data->info.state,
+                        NM_L3_ACD_ADDR_STATE_PROBING,
+                        NM_L3_ACD_ADDR_STATE_USED,
+                        NM_L3_ACD_ADDR_STATE_DEFENDING,
+                        NM_L3_ACD_ADDR_STATE_CONFLICT));
+
     nm_clear_g_source_inst(&acd_data->acd_data_timeout_source);
     acd_data->acd_data_timeout_source =
         nm_g_timeout_add_source(NM_CLAMP((gint64) 0, timeout_msec, (gint64) G_MAXUINT),

From d7a951b947c84c94f064b763408f1bc81da9d2fd Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Fri, 28 Jan 2022 18:48:13 +0100
Subject: [PATCH 4/5] l3cfg: add comment to _acd_data_collect_tracks_data()
 about linear search

---
 src/core/nm-l3cfg.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index fc3b427894..1ac477275d 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -1334,6 +1334,15 @@ _acd_data_collect_tracks_data(const AcdData     *acd_data,
     guint             n                     = 0;
     guint             i;
 
+    /* We do a simple search over all track-infos for the best, which determines
+     * our ACD state. That is, we prefer ACD disabled, and otherwise the
+     * shortest configured timeout.
+     *
+     * This linear search is probably fast enough, because we expect that each
+     * address/acd_data has few trackers.
+     * The alternative would be caching the best result, but that is more complicated,
+     * so not done. */
+
     for (i = 0; i < acd_data->info.n_track_infos; i++) {
         const NML3AcdAddrTrackInfo *acd_track = &acd_data->info.track_infos[i];
 

From db0d84f13a4c5a73403474f8b4af4b905fa60d4b Mon Sep 17 00:00:00 2001
From: Thomas Haller <thaller@redhat.com>
Date: Fri, 28 Jan 2022 17:52:34 +0100
Subject: [PATCH 5/5] l3cfg: fix handling "instance-reset" ACD event

The ACD state handling is unfortunately very complicated. That is, because
we obviously need to track state about how ACD is going (the acd_data, and
in particular NML3AcdAddrState). Then there are various things that can
happen, which are the AcdStateChangeMode enums. All these state-changes
come together in one function: _l3_acd_data_state_change(), which is
therefore complicated (I don't think that it would become simpler by
spreading this code out to different functions, on the contrary).
Anyway.

So, what happens when we need to reset the n-acd instance? For example,
because the MAC address of the link changed or some error. I guess, we
need to restart probing.

Previously, I think this was not handled properly. We already tried to
fix this several times, the last was commit b3316063862b ('l3cfg: on
n-acd instance-reset clear also ready ACD state'). There is still an
issue ([1]).

The bug [1] is, that we are in state NM_L3_ACD_ADDR_STATE_READY, during
 ACD_STATE_CHANGE_MODE_TIMEOUT event. That leads to an assertion
failure.

  #5  0x00007f23be74698f in g_assertion_message_expr (domain=0x5629aca70359 "nm", file=0x5629aca62aab "src/core/nm-l3cfg.c", line=2395, func=0x5629acb26b30 <__func__.72.lto_priv.4> "_l3_acd_data_state_change", expr=<optimized out>) at ../glib/gtestutils.c:3091
  #6  0x00005629ac937e46 in _l3_acd_data_state_change (self=0x5629add69790, acd_data=0x5629add8d520, state_change_mode=ACD_STATE_CHANGE_MODE_TIMEOUT, sender_addr=0x0, p_now_msec=0x7ffded506460) at src/core/nm-l3cfg.c:2395
  #7  0x00005629ac939f4d in _l3_acd_data_timeout_cb (user_data=user_data@entry=0x5629add8d520) at src/core/nm-l3cfg.c:1933
  #8  0x00007f23be71c5a1 in g_timeout_dispatch (source=0x5629addd7a80, callback=0x5629ac939ee0 <_l3_acd_data_timeout_cb>, user_data=0x5629add8d520) at ../glib/gmain.c:4889
  #9  0x00007f23be71bd4f in g_main_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:3337
  #10 g_main_context_dispatch (context=0x5629adc6da00) at ../glib/gmain.c:4055

That can only happen, (I think) when we scheduled the timeout
during an earlier ACD_STATE_CHANGE_MODE_INSTANCE_RESET event. Meaning,
we need to handle instance-reset better.

Instead, during instance-reset, switch always back to state PROBING, and
let the timeout figure it out.

[1] https://bugzilla.redhat.com/show_bug.cgi?id=2047788
---
 src/core/nm-l3cfg.c | 87 ++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 49 deletions(-)

diff --git a/src/core/nm-l3cfg.c b/src/core/nm-l3cfg.c
index 1ac477275d..aed4cf05c9 100644
--- a/src/core/nm-l3cfg.c
+++ b/src/core/nm-l3cfg.c
@@ -2351,22 +2351,7 @@ handle_init:
                 /* we are already probing. There is nothing to do for this timeout. */
                 return;
             }
-
-            nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
-
-            if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC
-                    + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC
-                >= (*p_now_msec)) {
-                /* hm. We failed to create a new probe too long. Something is really wrong
-                 * internally, but let's ignore the issue and assume the address is good. What
-                 * else would we do? Assume the address is USED? */
-                _LOGT_acd(acd_data,
-                          "probe-good (waiting for creating probe timed out. Assume good)");
-                goto handle_start_defending;
-            }
-
-            log_reason = "retry probing on timeout";
-            goto handle_start_probing;
+            /* fall-through */
 
         case NM_L3_ACD_ADDR_STATE_USED:
         case NM_L3_ACD_ADDR_STATE_CONFLICT:
@@ -2386,7 +2371,10 @@ handle_init:
             acd_data->acd_defend_type_desired = acd_defend_type;
 
             if (acd_timeout_msec <= 0) {
-                log_reason = "acd disabled by configuration (restart after previous conflict)";
+                if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING)
+                    log_reason = "acd disabled by configuration (timeout during probing)";
+                else
+                    log_reason = "acd disabled by configuration (restart after previous conflict)";
                 goto handle_probing_done;
             }
 
@@ -2396,6 +2384,23 @@ handle_init:
             }
 
             nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
+
+            if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_PROBING) {
+                if (acd_data->probing_timestamp_msec + ACD_WAIT_PROBING_EXTRA_TIME_MSEC
+                        + ACD_WAIT_PROBING_EXTRA_TIME2_MSEC
+                    >= (*p_now_msec)) {
+                    /* hm. We failed to create a new probe too long. Something is really wrong
+                     * internally, but let's ignore the issue and assume the address is good. What
+                     * else would we do? Assume the address is USED? */
+                    _LOGT_acd(acd_data,
+                              "probe-good (waiting for creating probe timed out. Assume good)");
+                    goto handle_start_defending;
+                }
+
+                log_reason = "retry probing on timeout";
+                goto handle_start_probing;
+            }
+
             acd_data->probing_timestamp_msec = (*p_now_msec);
             acd_data->probing_timeout_msec   = acd_timeout_msec;
             if (acd_data->info.state == NM_L3_ACD_ADDR_STATE_USED)
@@ -2616,39 +2621,23 @@ handle_init:
         return;
 
     case ACD_STATE_CHANGE_MODE_INSTANCE_RESET:
+        nm_assert(NM_IN_SET(acd_data->info.state,
+                            NM_L3_ACD_ADDR_STATE_PROBING,
+                            NM_L3_ACD_ADDR_STATE_DEFENDING,
+                            NM_L3_ACD_ADDR_STATE_READY,
+                            NM_L3_ACD_ADDR_STATE_USED,
+                            NM_L3_ACD_ADDR_STATE_CONFLICT,
+                            NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED));
 
-        switch (acd_data->info.state) {
-        case NM_L3_ACD_ADDR_STATE_INIT:
-            nm_assert_not_reached();
-            return;
-        case NM_L3_ACD_ADDR_STATE_PROBING:
-        case NM_L3_ACD_ADDR_STATE_DEFENDING:
-        case NM_L3_ACD_ADDR_STATE_READY:
-            if (!acd_data->nacd_probe) {
-                /* we failed starting to probe before and have a timer running to
-                 * restart. We don't do anything now, but let the timer handle it.
-                 * This also implements some rate limiting for us. */
-                _LOGT_acd(acd_data,
-                          "n-acd instance reset. Ignore event while in state %s",
-                          _l3_acd_addr_state_to_string(acd_data->info.state));
-                return;
-            }
-
-            _LOGT_acd(acd_data,
-                      "n-acd instance reset. Trigger a restart while in state %s",
-                      _l3_acd_addr_state_to_string(acd_data->info.state));
-            acd_data->nacd_probe = n_acd_probe_free(acd_data->nacd_probe);
-            _l3_acd_data_timeout_schedule(acd_data, 0);
-            return;
-        case NM_L3_ACD_ADDR_STATE_USED:
-        case NM_L3_ACD_ADDR_STATE_CONFLICT:
-        case NM_L3_ACD_ADDR_STATE_EXTERNAL_REMOVED:
-            /* as we have no probe, we are already handling this (e.g. by having
-             * a retry timer). Nothing to do. */
-            nm_assert(!acd_data->nacd_probe);
-            return;
-        }
-        nm_assert_not_reached();
+        /* an instance-reset is a dramatic event. We start over with probing. */
+        _LOGT_acd(acd_data,
+                  "n-acd instance reset. Reset to probing while in state %s",
+                  _l3_acd_addr_state_to_string(acd_data->info.state));
+        acd_data->nacd_probe                         = n_acd_probe_free(acd_data->nacd_probe);
+        acd_data->last_defendconflict_timestamp_msec = 0;
+        acd_data->probing_timestamp_msec = nm_utils_get_monotonic_timestamp_msec_cached(p_now_msec);
+        _l3_acd_data_state_set(self, acd_data, NM_L3_ACD_ADDR_STATE_PROBING, FALSE);
+        _l3_acd_data_timeout_schedule(acd_data, 0);
         return;
     }