From 1e74b755ee7a1d2f8bb602c6d8b3fd4f43101624 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:06 +0100 Subject: [PATCH 1/8] platform: reset seq_result on retrying link change Shouldn't make a difference at this point. It's nevertheless a good practice to guard against accidental use of a stale result. --- src/libnm-platform/nm-linux-platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index e4b558bca5..a3b4853977 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -8233,7 +8233,7 @@ do_change_link(NMPlatform *platform, { nm_auto_pop_netns NMPNetns *netns = NULL; int nle; - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; char s_buf[256]; int result; @@ -8250,6 +8250,7 @@ do_change_link(NMPlatform *platform, } retry: + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; result = -NME_UNSPEC; log_level = LOGL_WARN; log_detail = ""; From ef6d8cf1a8f75b6f32e5b25664572b569d871590 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:16 +0100 Subject: [PATCH 2/8] platform: assert the seq_status is known to be unknown on sending a nl message This guards against accidental use of a stale result. --- src/libnm-platform/nm-linux-platform.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index a3b4853977..fe42a3b110 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -7059,6 +7059,8 @@ delayed_action_schedule_WAIT_FOR_RESPONSE(NMPlatform *pla .response.out_data = response_out_data, }; + nm_assert(!out_seq_result || *out_seq_result == WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); + delayed_action_schedule( platform, nmp_netlink_protocol_info(netlink_protocol)->delayed_action_type_wait_for_response, @@ -7423,6 +7425,8 @@ _nl_send_nlmsghdr(NMPlatform *platform, nm_assert(nlhdr); + nm_assert(out_seq_result && *out_seq_result == WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); + seq = _nlh_seq_next_get(priv, NMP_NETLINK_ROUTE); nlhdr->nlmsg_seq = seq; @@ -7481,6 +7485,8 @@ _netlink_send_nlmsg(NMPlatform *platform, guint32 seq; int nle; + nm_assert(!out_seq_result || *out_seq_result == WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); + nlhdr = nlmsg_hdr(nlmsg); seq = _nlh_seq_next_get(priv, netlink_protocol); nlhdr->nlmsg_seq = seq; From e45b27a937a1439871be904f5ec0ff1fb7e3af7c Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:17 +0100 Subject: [PATCH 3/8] platform: create a define for retry count when netlink drops data We're going to use it elsewhere. --- src/libnm-platform/nm-linux-platform.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index fe42a3b110..3845a2383c 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -328,6 +328,10 @@ struct _ifla_vf_vlan_info { /*****************************************************************************/ +#define RESYNC_RETRIES 10 + +/*****************************************************************************/ + typedef struct { guint16 family_id; } GenlFamilyData; @@ -9850,7 +9854,8 @@ ip_route_get(NMPlatform *platform, /* Retry, if we failed due to a cache resync. That can happen when the netlink * socket fills up and we lost the response. */ - } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < 10); + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); if (seq_result < 0) { /* negative seq_result is an errno from kernel. Map it to negative From fee7832bdec96cb5b0f5ceab41c876faa8bea939 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:17 +0100 Subject: [PATCH 4/8] platform: increase netlink resync retry count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With a small buffer (of 4K) and many links (100 ethernet adapters), I've seen up to ~15 retries of link change until things settled. Let's increase this. Still a »bulharská konštanta« but possibly safer and more broadly useful (so we can cap the link change retry count too). --- src/libnm-platform/nm-linux-platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index 3845a2383c..dbc3ec4b34 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -328,7 +328,7 @@ struct _ifla_vf_vlan_info { /*****************************************************************************/ -#define RESYNC_RETRIES 10 +#define RESYNC_RETRIES 50 /*****************************************************************************/ From 090ff4ae95f5a04c0bfbb636cd0d70611c7c3ee4 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:17 +0100 Subject: [PATCH 5/8] platform: limit retry count on link change This is a nice safeguard, also consistent with ip_route_get(). --- src/libnm-platform/nm-linux-platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index dbc3ec4b34..367faa9ceb 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -8251,6 +8251,7 @@ do_change_link(NMPlatform *platform, const char *log_detail; gs_free char *log_detail_free = NULL; const NMPObject *obj_cache; + int try_count = 0; if (!nm_platform_netns_push(platform, &netns)) { log_level = LOGL_ERR; @@ -8329,7 +8330,7 @@ out: wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf)), log_detail); - if (result == -EAGAIN) + if (result == -EAGAIN && ++try_count < RESYNC_RETRIES) goto retry; return result; From 0a549bfad27527ac5fadd5cdfcc47c16889767ea Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:17 +0100 Subject: [PATCH 6/8] platform: increase log level for some failures These are not expected to happen. While probably harmless, we should notice when they do. --- src/libnm-platform/nm-linux-platform.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index 367faa9ceb..d0f474195f 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -7458,7 +7458,7 @@ again: errsv = errno; if (errsv == EINTR && try_count++ < 100) goto again; - _LOGD("netlink: nl-send-nlmsghdr: failed sending message: %s (%d)", + _LOGI("netlink: nl-send-nlmsghdr: failed sending message: %s (%d)", nm_strerror_native(errsv), errsv); return -nm_errno_from_native(errsv); @@ -7497,7 +7497,7 @@ _netlink_send_nlmsg(NMPlatform *platform, nle = nl_send_auto(priv->sk_x[netlink_protocol], nlmsg); if (nle < 0) { - _LOGD("netlink: nl-send-nlmsg: failed sending message: %s (%d)", nm_strerror(nle), nle); + _LOGI("netlink: nl-send-nlmsg: failed sending message: %s (%d)", nm_strerror(nle), nle); return nle; } From da9745b9619a565accb484a65f588282aaeaa28e Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 16 Feb 2023 13:25:17 +0100 Subject: [PATCH 7/8] platform: always retry when netlink drops messages Netlink is capable of dropping not only outbout messages, but also the requests. We should always try to recover from those. --- src/libnm-platform/nm-linux-platform.c | 295 +++++++++++++++---------- 1 file changed, 175 insertions(+), 120 deletions(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index d0f474195f..c86f895275 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -459,7 +459,14 @@ typedef enum _nm_packed { WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN = 0, WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK, WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_UNKNOWN, + + /* The receive buffer of the netlink socket has a large, but limited size. + * It can fill up, and we lose messages. When that happens, we may lose a + * response that we were waiting for. This error number indicates that we + * don't know the response due to a resync. We probably should retry the + * request. */ WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC, + WAIT_FOR_NL_RESPONSE_RESULT_FAILED_POLL, WAIT_FOR_NL_RESPONSE_RESULT_FAILED_TIMEOUT, WAIT_FOR_NL_RESPONSE_RESULT_FAILED_DISPOSING, @@ -8063,35 +8070,41 @@ do_add_link_with_lookup(NMPlatform *platform, struct nl_msg *nlmsg, const NMPlatformLink **out_link) { - const NMPObject *obj = NULL; - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + const NMPObject *obj = NULL; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; - NMPCache *cache = nm_platform_get_cache(platform); + NMPCache *cache = nm_platform_get_cache(platform); + int try_count = 0; event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-add-link[%s/%s]: failed sending netlink request \"%s\" (%d)", - name, - nm_link_type_to_string(link_type), - nm_strerror(nle), - -nle); - NM_SET_OUT(out_link, NULL); - return nle; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-add-link[%s/%s]: failed sending netlink request \"%s\" (%d)", + name, + nm_link_type_to_string(link_type), + nm_strerror(nle), + -nle); + NM_SET_OUT(out_link, NULL); + return nle; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); - _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, - "do-add-link[%s/%s]: %s", - name, - nm_link_type_to_string(link_type), - wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, + "do-add-link[%s/%s]: %s", + name, + nm_link_type_to_string(link_type), + wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); if (out_link) { obj = nmp_cache_lookup_link_full(cache, 0, name, FALSE, link_type, NULL, NULL); @@ -8109,10 +8122,11 @@ do_add_addrroute(NMPlatform *platform, char **out_extack_msg) { char sbuf1[NM_UTILS_TO_STRING_BUFFER_SIZE]; - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; + int try_count = 0; nm_assert(!out_extack_msg || !*out_extack_msg); nm_assert(NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), @@ -8123,29 +8137,34 @@ do_add_addrroute(NMPlatform *platform, event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-add-%s[%s]: failure sending netlink request \"%s\" (%d)", - NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, - nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), - nm_strerror(nle), - -nle); - NM_SET_OUT(out_extack_msg, g_steal_pointer(&extack_msg)); - return -NME_PL_NETLINK; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-add-%s[%s]: failure sending netlink request \"%s\" (%d)", + NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, + nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), + nm_strerror(nle), + -nle); + NM_SET_OUT(out_extack_msg, g_steal_pointer(&extack_msg)); + return -NME_PL_NETLINK; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); - _NMLOG((seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK - || (suppress_netlink_failure && seq_result < 0)) - ? LOGL_DEBUG - : LOGL_WARN, - "do-add-%s[%s]: %s", - NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, - nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), - wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + _NMLOG((seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK + || (suppress_netlink_failure && seq_result < 0)) + ? LOGL_DEBUG + : LOGL_WARN, + "do-add-%s[%s]: %s", + NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, + nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), + wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); if (NMP_OBJECT_GET_TYPE(obj_id) == NMP_OBJECT_TYPE_IP6_ADDRESS) { /* In rare cases, the object is not yet ready as we received the ACK from @@ -8167,54 +8186,60 @@ static gboolean do_delete_object(NMPlatform *platform, const NMPObject *obj_id, struct nl_msg *nlmsg) { char sbuf1[NM_UTILS_TO_STRING_BUFFER_SIZE]; - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; gboolean success; const char *log_detail = ""; + int try_count = 0; event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-delete-%s[%s]: failure sending netlink request \"%s\" (%d)", - NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, - nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), - nm_strerror(nle), - -nle); - return FALSE; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, nlmsg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-delete-%s[%s]: failure sending netlink request \"%s\" (%d)", + NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, + nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), + nm_strerror(nle), + -nle); + return FALSE; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); - success = TRUE; - if (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK) { - /* ok */ - } else if (NM_IN_SET(-((int) seq_result), ESRCH, ENOENT)) - log_detail = ", meaning the object was already removed"; - else if (NM_IN_SET(-((int) seq_result), ENXIO) - && NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), NMP_OBJECT_TYPE_IP6_ADDRESS)) { - /* On RHEL7 kernel, deleting a non existing address fails with ENXIO */ - log_detail = ", meaning the address was already removed"; - } else if (NM_IN_SET(-((int) seq_result), ENODEV)) { - log_detail = ", meaning the device was already removed"; - } else if (NM_IN_SET(-((int) seq_result), EADDRNOTAVAIL) - && NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), - NMP_OBJECT_TYPE_IP4_ADDRESS, - NMP_OBJECT_TYPE_IP6_ADDRESS)) - log_detail = ", meaning the address was already removed"; - else - success = FALSE; + success = TRUE; + if (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK) { + /* ok */ + } else if (NM_IN_SET(-((int) seq_result), ESRCH, ENOENT)) + log_detail = ", meaning the object was already removed"; + else if (NM_IN_SET(-((int) seq_result), ENXIO) + && NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), NMP_OBJECT_TYPE_IP6_ADDRESS)) { + /* On RHEL7 kernel, deleting a non existing address fails with ENXIO */ + log_detail = ", meaning the address was already removed"; + } else if (NM_IN_SET(-((int) seq_result), ENODEV)) { + log_detail = ", meaning the device was already removed"; + } else if (NM_IN_SET(-((int) seq_result), EADDRNOTAVAIL) + && NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), + NMP_OBJECT_TYPE_IP4_ADDRESS, + NMP_OBJECT_TYPE_IP6_ADDRESS)) + log_detail = ", meaning the address was already removed"; + else + success = FALSE; - _NMLOG(success ? LOGL_DEBUG : LOGL_WARN, - "do-delete-%s[%s]: %s%s", - NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, - nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), - wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf)), - log_detail); + _NMLOG(success ? LOGL_DEBUG : LOGL_WARN, + "do-delete-%s[%s]: %s%s", + NMP_OBJECT_GET_CLASS(obj_id)->obj_type_name, + nmp_object_to_string(obj_id, NMP_OBJECT_TO_STRING_ID, sbuf1, sizeof(sbuf1)), + wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf)), + log_detail); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); if (NM_IN_SET(NMP_OBJECT_GET_TYPE(obj_id), NMP_OBJECT_TYPE_IP6_ADDRESS, @@ -9853,8 +9878,8 @@ ip_route_get(NMPlatform *platform, delayed_action_handle_all(platform); - /* Retry, if we failed due to a cache resync. That can happen when the netlink - * socket fills up and we lost the response. */ + nm_assert(seq_result); + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < RESYNC_RETRIES); @@ -9880,25 +9905,33 @@ ip_route_get(NMPlatform *platform, static int routing_rule_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformRoutingRule *routing_rule) { - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; nm_auto_nlmsg struct nl_msg *msg = NULL; gs_free char *extack_msg = NULL; char s_buf[256]; int nle; + int try_count = 0; msg = _nl_msg_new_routing_rule(RTM_NEWRULE, flags, routing_rule); event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-add-rule: failed sending netlink request \"%s\" (%d)", nm_strerror(nle), -nle); - return -NME_PL_NETLINK; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-add-rule: failed sending netlink request \"%s\" (%d)", + nm_strerror(nle), + -nle); + return -NME_PL_NETLINK; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, "do-add-rule: %s", @@ -9916,11 +9949,12 @@ routing_rule_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformRoutin static int qdisc_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformQdisc *qdisc) { - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; - nm_auto_nlmsg struct nl_msg *msg = NULL; + nm_auto_nlmsg struct nl_msg *msg = NULL; + int try_count = 0; /* Note: @qdisc must not be copied or kept alive because the lifetime of qdisc.kind * is undefined. */ @@ -9929,15 +9963,22 @@ qdisc_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformQdisc *qdisc) event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-add-qdisc: failed sending netlink request \"%s\" (%d)", nm_strerror(nle), -nle); - return -NME_PL_NETLINK; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-add-qdisc: failed sending netlink request \"%s\" (%d)", + nm_strerror(nle), + -nle); + return -NME_PL_NETLINK; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, "do-add-qdisc: %s", @@ -9957,7 +9998,7 @@ tc_delete(NMPlatform *platform, guint32 parent, gboolean log_error) { - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; @@ -9967,6 +10008,7 @@ tc_delete(NMPlatform *platform, .tcm_ifindex = ifindex, .tcm_parent = parent, }; + int try_count = 0; switch (nlmsg_type) { case RTM_DELQDISC: @@ -9987,19 +10029,24 @@ tc_delete(NMPlatform *platform, event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); - if (nle < 0) { - _NMLOG(log_error ? LOGL_ERR : LOGL_DEBUG, - "%s: failed sending netlink request \"%s\" (%d)", - log_tag, - nm_strerror(nle), - -nle); - return -NME_PL_NETLINK; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); + if (nle < 0) { + _NMLOG(log_error ? LOGL_ERR : LOGL_DEBUG, + "%s: failed sending netlink request \"%s\" (%d)", + log_tag, + nm_strerror(nle), + -nle); + return -NME_PL_NETLINK; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); _NMLOG((seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK || !log_error) ? LOGL_DEBUG : LOGL_WARN, @@ -10026,11 +10073,12 @@ qdisc_delete(NMPlatform *platform, int ifindex, guint32 parent, gboolean log_err static int tfilter_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformTfilter *tfilter) { - WaitForNlResponseResult seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + WaitForNlResponseResult seq_result; gs_free char *extack_msg = NULL; int nle; char s_buf[256]; - nm_auto_nlmsg struct nl_msg *msg = NULL; + nm_auto_nlmsg struct nl_msg *msg = NULL; + int try_count = 0; /* Note: @tfilter must not be copied or kept alive because the lifetime of tfilter.kind * and tfilter.action.kind is undefined. */ @@ -10039,19 +10087,26 @@ tfilter_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformTfilter *tf event_handler_read_netlink(platform, NMP_NETLINK_ROUTE, FALSE); - nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); - if (nle < 0) { - _LOGE("do-add-tfilter: failed sending netlink request \"%s\" (%d)", nm_strerror(nle), -nle); - return -NME_PL_NETLINK; - } + do { + seq_result = WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN; + nle = _netlink_send_nlmsg_rtnl(platform, msg, &seq_result, &extack_msg); + if (nle < 0) { + _LOGE("do-add-tfilter: failed sending netlink request \"%s\" (%d)", + nm_strerror(nle), + -nle); + return -NME_PL_NETLINK; + } - delayed_action_handle_all(platform); + delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result); - _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, - "do-add-tfilter: %s", - wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, + "do-add-tfilter: %s", + wait_for_nl_response_to_string(seq_result, extack_msg, s_buf, sizeof(s_buf))); + + } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC + && ++try_count < RESYNC_RETRIES); if (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK) return 0; From 5eb584f84b6be5a51526bf969a56df75359a2069 Mon Sep 17 00:00:00 2001 From: Thomas Haller Date: Tue, 28 Mar 2023 22:46:55 +0200 Subject: [PATCH 8/8] platform: explicitly compare seq_result number against WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN We have other places like nm_assert(!out_seq_result || *out_seq_result == WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); where we explicitly compare against WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN. Do that here too. --- src/libnm-platform/nm-linux-platform.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/libnm-platform/nm-linux-platform.c b/src/libnm-platform/nm-linux-platform.c index c86f895275..4ca0948540 100644 --- a/src/libnm-platform/nm-linux-platform.c +++ b/src/libnm-platform/nm-linux-platform.c @@ -6705,7 +6705,7 @@ delayed_action_wait_for_response_complete(NMPlatform *platform, nm_assert(NM_FLAGS_ANY(priv->delayed_action.flags, ACTION_TYPE)); nm_assert(idx < priv->delayed_action.list_wait_for_response_x[netlink_protocol]->len); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); data = delayed_action_get_list_wait_for_resonse(priv, netlink_protocol, idx); @@ -8095,7 +8095,7 @@ do_add_link_with_lookup(NMPlatform *platform, delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, "do-add-link[%s/%s]: %s", @@ -8152,7 +8152,7 @@ do_add_addrroute(NMPlatform *platform, delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); _NMLOG((seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK || (suppress_netlink_failure && seq_result < 0)) @@ -8210,7 +8210,7 @@ do_delete_object(NMPlatform *platform, const NMPObject *obj_id, struct nl_msg *n delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); success = TRUE; if (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK) { @@ -8307,7 +8307,7 @@ retry: delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); if (NM_IN_SET(seq_result, WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK, -EEXIST, -EADDRINUSE)) { log_level = LOGL_DEBUG; @@ -9878,7 +9878,7 @@ ip_route_get(NMPlatform *platform, delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < RESYNC_RETRIES); @@ -9928,7 +9928,7 @@ routing_rule_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformRoutin delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < RESYNC_RETRIES); @@ -9975,7 +9975,7 @@ qdisc_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformQdisc *qdisc) delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < RESYNC_RETRIES); @@ -10043,7 +10043,7 @@ tc_delete(NMPlatform *platform, delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); } while (seq_result == WAIT_FOR_NL_RESPONSE_RESULT_FAILED_RESYNC && ++try_count < RESYNC_RETRIES); @@ -10099,7 +10099,7 @@ tfilter_add(NMPlatform *platform, NMPNlmFlags flags, const NMPlatformTfilter *tf delayed_action_handle_all(platform); - nm_assert(seq_result); + nm_assert(seq_result != WAIT_FOR_NL_RESPONSE_RESULT_UNKNOWN); _NMLOG(seq_result == WAIT_FOR_NL_RESPONSE_RESULT_RESPONSE_OK ? LOGL_DEBUG : LOGL_WARN, "do-add-tfilter: %s",