From 57c3e8fd25cd8c488a92a8cdd8f53b2a7e39d41b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Mar 2015 16:24:38 -0600 Subject: [PATCH 1/2] team: respawn teamd when it exits instead of failing activation (rh #1145988) teamd can recover interface state on its own, so if it died unexpectedly we don't need to fail the device. Also, if for some reason a teamd is already up and running when activating the interface, we can ask for its configuration and if it has the same configuration we are about to use, just talk to the existing copy instead of killing it. --- src/devices/team/nm-device-team.c | 196 ++++++++++++++++++------------ 1 file changed, 116 insertions(+), 80 deletions(-) diff --git a/src/devices/team/nm-device-team.c b/src/devices/team/nm-device-team.c index 2d0e6e0af3..007d4fe04c 100644 --- a/src/devices/team/nm-device-team.c +++ b/src/devices/team/nm-device-team.c @@ -41,6 +41,7 @@ #include "nm-team-enum-types.h" #include "nm-posix-signals.h" #include "nm-core-internal.h" +#include "gsystem-local-alloc.h" #include "nm-device-team-glue.h" @@ -66,6 +67,8 @@ enum { LAST_PROP }; +static gboolean teamd_start (NMDevice *device, NMSettingTeam *s_team); + /******************************************************************/ static guint32 @@ -271,7 +274,7 @@ teamd_timeout_remove (NMDevice *device) } static void -teamd_cleanup (NMDevice *device, gboolean device_state_failed) +teamd_cleanup (NMDevice *device) { NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); @@ -292,12 +295,6 @@ teamd_cleanup (NMDevice *device, gboolean device_state_failed) } teamd_timeout_remove (device); - - if (device_state_failed) { - if (nm_device_is_activating (device) || - (nm_device_get_state (device) == NM_DEVICE_STATE_ACTIVATED)) - nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); - } } static gboolean @@ -310,7 +307,10 @@ teamd_timeout_cb (gpointer user_data) g_return_val_if_fail (priv->teamd_timeout, FALSE); _LOGI (LOGD_TEAM, "teamd timed out."); - teamd_cleanup (device, TRUE); + teamd_cleanup (device); + + g_warn_if_fail (nm_device_is_activating (device)); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); return FALSE; } @@ -332,24 +332,28 @@ teamd_dbus_appeared (GDBusConnection *connection, teamd_timeout_remove (device); nm_device_queue_recheck_assume (device); + /* Grab a teamd control handle even if we aren't going to use it + * immediately. But if we are, and grabbing it failed, fail the + * device activation. + */ success = ensure_teamd_connection (device); if (nm_device_get_state (device) == NM_DEVICE_STATE_PREPARE) { if (success) nm_device_activate_schedule_stage2_device_config (device); else if (!nm_device_uses_assumed_connection (device)) nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); - return; } } static void -teamd_dbus_vanished (GDBusConnection *connection, +teamd_dbus_vanished (GDBusConnection *dbus_connection, const gchar *name, gpointer user_data) { NMDeviceTeam *self = NM_DEVICE_TEAM (user_data); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMDevice *device = NM_DEVICE (self); + NMDeviceState state = nm_device_get_state (device); g_return_if_fail (priv->teamd_dbus_watch); @@ -363,7 +367,16 @@ teamd_dbus_vanished (GDBusConnection *connection, } _LOGI (LOGD_TEAM, "teamd vanished from D-Bus"); - teamd_cleanup (device, TRUE); + teamd_cleanup (device); + + /* Attempt to respawn teamd */ + if (state >= NM_DEVICE_STATE_PREPARE && state <= NM_DEVICE_STATE_ACTIVATED) { + NMConnection *connection = nm_device_get_connection (device); + + g_assert (connection); + if (!teamd_start (device, nm_connection_get_setting_team (connection))) + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } } static void @@ -378,7 +391,7 @@ teamd_process_watch_cb (GPid pid, gint status, gpointer user_data) _LOGI (LOGD_TEAM, "teamd died with status %d", status); priv->teamd_process_watch = 0; priv->teamd_pid = 0; - teamd_cleanup (device, TRUE); + teamd_cleanup (device); } static void @@ -398,23 +411,29 @@ teamd_child_setup (gpointer user_data G_GNUC_UNUSED) nm_unblock_posix_signals (NULL); } -static void -nm_device_team_watch_dbus (NMDeviceTeam *self) +static gboolean +teamd_kill (NMDeviceTeam *self, const char *teamd_binary, GError **error) { - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - const char *iface = nm_device_get_ip_iface (NM_DEVICE (self)); - char *tmp_str = NULL; + gs_unref_ptrarray GPtrArray *argv = NULL; + gs_free char *tmp_str = NULL; - /* Register D-Bus name watcher */ - tmp_str = g_strdup_printf ("org.libteam.teamd.%s", iface); - priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM, - tmp_str, - G_BUS_NAME_WATCHER_FLAGS_NONE, - teamd_dbus_appeared, - teamd_dbus_vanished, - NM_DEVICE (self), - NULL); - g_free (tmp_str); + if (!teamd_binary) { + teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); + if (!teamd_binary) { + _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: teamd binary not found"); + return FALSE; + } + } + + argv = g_ptr_array_new (); + g_ptr_array_add (argv, (gpointer) teamd_binary); + g_ptr_array_add (argv, (gpointer) "-k"); + g_ptr_array_add (argv, (gpointer) "-t"); + g_ptr_array_add (argv, (gpointer) nm_device_get_iface (NM_DEVICE (self))); + g_ptr_array_add (argv, NULL); + + _LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); + return g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, nm_unblock_posix_signals, NULL, NULL, NULL, NULL, error); } static gboolean @@ -429,17 +448,6 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) GPtrArray *argv; GError *error = NULL; gboolean ret; - int status; - - if (priv->teamd_process_watch || - priv->teamd_pid > 0 || - priv->tdc || - priv->teamd_timeout) - { - /* FIXME g_assert that this never hits. For now, be more reluctant, and try to recover. */ - g_warn_if_reached (); - teamd_cleanup (device, FALSE); - } teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); if (!teamd_binary) { @@ -447,20 +455,16 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return FALSE; } - /* Kill teamd for same named device first if it is there */ - argv = g_ptr_array_new (); - g_ptr_array_add (argv, (gpointer) teamd_binary); - g_ptr_array_add (argv, (gpointer) "-k"); - g_ptr_array_add (argv, (gpointer) "-t"); - g_ptr_array_add (argv, (gpointer) iface); - g_ptr_array_add (argv, NULL); - - _LOGD (LOGD_TEAM, "running: %s", - (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); - g_clear_pointer (&tmp_str, g_free); - - ret = g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, nm_unblock_posix_signals, NULL, NULL, NULL, &status, &error); - g_ptr_array_free (argv, TRUE); + if (priv->teamd_process_watch || + priv->teamd_pid > 0 || + priv->tdc || + priv->teamd_timeout) + { + g_warn_if_reached (); + if (!priv->teamd_pid) + teamd_kill (self, teamd_binary, NULL); + teamd_cleanup (device); + } /* Start teamd now */ argv = g_ptr_array_new (); @@ -496,7 +500,7 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) if (!ret) { _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: %s", error->message); g_clear_error (&error); - teamd_cleanup (device, FALSE); + teamd_cleanup (device); return FALSE; } @@ -509,46 +513,67 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return TRUE; } -static void -teamd_stop (NMDevice *device) -{ - NMDeviceTeam *self = NM_DEVICE_TEAM (device); - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - - if (priv->teamd_pid > 0) - _LOGI (LOGD_TEAM, "Deactivation: stopping teamd..."); - else - _LOGD (LOGD_TEAM, "Deactivation: stopping teamd (not started)..."); - teamd_cleanup (device, FALSE); -} - static NMActStageReturn act_stage1_prepare (NMDevice *device, NMDeviceStateReason *reason) { + NMDeviceTeam *self = NM_DEVICE_TEAM (device); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMActStageReturn ret = NM_ACT_STAGE_RETURN_SUCCESS; + gs_free_error GError *error = NULL; NMConnection *connection; NMSettingTeam *s_team; + const char *cfg; g_return_val_if_fail (reason != NULL, NM_ACT_STAGE_RETURN_FAILURE); ret = NM_DEVICE_CLASS (nm_device_team_parent_class)->act_stage1_prepare (device, reason); - if (ret == NM_ACT_STAGE_RETURN_SUCCESS) { - connection = nm_device_get_connection (device); - g_assert (connection); - s_team = nm_connection_get_setting_team (connection); - g_assert (s_team); - if (teamd_start (device, s_team)) - ret = NM_ACT_STAGE_RETURN_POSTPONE; - else - ret = NM_ACT_STAGE_RETURN_FAILURE; + if (ret != NM_ACT_STAGE_RETURN_SUCCESS) + return ret; + + connection = nm_device_get_connection (device); + g_assert (connection); + s_team = nm_connection_get_setting_team (connection); + g_assert (s_team); + + if (priv->tdc) { + /* If the existing teamd config is the same as we're about to use, + * then we can proceed. If it's not the same, and we have a PID, + * kill it so we can respawn it with the right config. If we don't + * have a PID, then we must fail. + */ + cfg = teamdctl_config_get_raw (priv->tdc); + if (cfg && strcmp (cfg, nm_setting_team_get_config (s_team)) == 0) { + _LOGD (LOGD_TEAM, "using existing matching teamd config"); + return NM_ACT_STAGE_RETURN_SUCCESS; + } + + if (!priv->teamd_pid) { + _LOGD (LOGD_TEAM, "existing teamd config mismatch; killing existing via teamdctl"); + if (!teamd_kill (self, NULL, &error)) { + _LOGW (LOGD_TEAM, "existing teamd config mismatch; failed to kill existing teamd: %s", error->message); + *reason = NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED; + return NM_ACT_STAGE_RETURN_FAILURE; + } + } + + _LOGD (LOGD_TEAM, "existing teamd config mismatch; respawning..."); + teamd_cleanup (device); } - return ret; + + return teamd_start (device, s_team) ? + NM_ACT_STAGE_RETURN_POSTPONE : NM_ACT_STAGE_RETURN_FAILURE; } static void deactivate (NMDevice *device) { - teamd_stop (device); + NMDeviceTeam *self = NM_DEVICE_TEAM (device); + + _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); + + if (!NM_DEVICE_TEAM_GET_PRIVATE (self)->teamd_pid) + teamd_kill (self, NULL, NULL); + teamd_cleanup (device); } static gboolean @@ -696,11 +721,22 @@ nm_device_team_init (NMDeviceTeam * self) static void constructed (GObject *object) { - NMDeviceTeam *self = NM_DEVICE_TEAM (object); + NMDevice *device = NM_DEVICE (object); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object); + char *tmp_str = NULL; G_OBJECT_CLASS (nm_device_team_parent_class)->constructed (object); - nm_device_team_watch_dbus (self); + /* Register D-Bus name watcher */ + tmp_str = g_strdup_printf ("org.libteam.teamd.%s", nm_device_get_ip_iface (device)); + priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM, + tmp_str, + G_BUS_NAME_WATCHER_FLAGS_NONE, + teamd_dbus_appeared, + teamd_dbus_vanished, + NM_DEVICE (device), + NULL); + g_free (tmp_str); } static void @@ -748,7 +784,7 @@ dispose (GObject *object) priv->teamd_dbus_watch = 0; } - teamd_cleanup (NM_DEVICE (object), FALSE); + teamd_cleanup (NM_DEVICE (object)); G_OBJECT_CLASS (nm_device_team_parent_class)->dispose (object); } From 7ad0e83b4aafc1d5a43efaeba2727595a18d9a26 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Mar 2015 08:34:42 -0500 Subject: [PATCH 2/2] team: ratelimit teamd spawning --- src/devices/team/nm-device-team.c | 136 +++++++++++++++++------------- 1 file changed, 79 insertions(+), 57 deletions(-) diff --git a/src/devices/team/nm-device-team.c b/src/devices/team/nm-device-team.c index 007d4fe04c..49202c3612 100644 --- a/src/devices/team/nm-device-team.c +++ b/src/devices/team/nm-device-team.c @@ -263,18 +263,7 @@ master_update_slave_connection (NMDevice *self, /******************************************************************/ static void -teamd_timeout_remove (NMDevice *device) -{ - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); - - if (priv->teamd_timeout) { - g_source_remove (priv->teamd_timeout); - priv->teamd_timeout = 0; - } -} - -static void -teamd_cleanup (NMDevice *device) +teamd_cleanup (NMDevice *device, gboolean free_tdc) { NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); @@ -283,18 +272,21 @@ teamd_cleanup (NMDevice *device) priv->teamd_process_watch = 0; } + if (priv->teamd_timeout) { + g_source_remove (priv->teamd_timeout); + priv->teamd_timeout = 0; + } + if (priv->teamd_pid > 0) { nm_utils_kill_child_async (priv->teamd_pid, SIGTERM, LOGD_TEAM, "teamd", 2000, NULL, NULL); priv->teamd_pid = 0; } - if (priv->tdc) { + if (priv->tdc && free_tdc) { teamdctl_disconnect (priv->tdc); teamdctl_free (priv->tdc); priv->tdc = NULL; } - - teamd_timeout_remove (device); } static gboolean @@ -305,14 +297,18 @@ teamd_timeout_cb (gpointer user_data) NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); g_return_val_if_fail (priv->teamd_timeout, FALSE); + priv->teamd_timeout = 0; - _LOGI (LOGD_TEAM, "teamd timed out."); - teamd_cleanup (device); + if (priv->teamd_pid && !priv->tdc) { + /* Timed out launching our own teamd process */ + _LOGW (LOGD_TEAM, "teamd timed out."); + teamd_cleanup (device, TRUE); - g_warn_if_fail (nm_device_is_activating (device)); - nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + g_warn_if_fail (nm_device_is_activating (device)); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } - return FALSE; + return G_SOURCE_REMOVE; } static void @@ -329,9 +325,32 @@ teamd_dbus_appeared (GDBusConnection *connection, g_return_if_fail (priv->teamd_dbus_watch); _LOGI (LOGD_TEAM, "teamd appeared on D-Bus"); - teamd_timeout_remove (device); nm_device_queue_recheck_assume (device); + /* If another teamd grabbed the bus name while our teamd was starting, + * just ignore the death of our teamd and run with the existing one. + */ + if (priv->teamd_process_watch) { + gs_unref_variant GVariant *ret = NULL; + guint32 pid; + + ret = g_dbus_connection_call_sync (connection, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionUnixProcessID", + g_variant_new ("(s)", name_owner), + NULL, + G_DBUS_CALL_FLAGS_NO_AUTO_START, + 2000, + NULL, + NULL); + g_variant_get (ret, "(u)", &pid); + + if (pid != priv->teamd_pid) + teamd_cleanup (device, FALSE); + } + /* Grab a teamd control handle even if we aren't going to use it * immediately. But if we are, and grabbing it failed, fail the * device activation. @@ -367,7 +386,7 @@ teamd_dbus_vanished (GDBusConnection *dbus_connection, } _LOGI (LOGD_TEAM, "teamd vanished from D-Bus"); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); /* Attempt to respawn teamd */ if (state >= NM_DEVICE_STATE_PREPARE && state <= NM_DEVICE_STATE_ACTIVATED) { @@ -385,13 +404,24 @@ teamd_process_watch_cb (GPid pid, gint status, gpointer user_data) NMDeviceTeam *self = NM_DEVICE_TEAM (user_data); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMDevice *device = NM_DEVICE (self); + NMDeviceState state = nm_device_get_state (device); g_return_if_fail (priv->teamd_process_watch); - _LOGI (LOGD_TEAM, "teamd died with status %d", status); - priv->teamd_process_watch = 0; + _LOGD (LOGD_TEAM, "teamd died with status %d", status); priv->teamd_pid = 0; - teamd_cleanup (device); + priv->teamd_process_watch = 0; + + /* If teamd quit within 5 seconds of starting, it's probably hosed + * and will just die again, so fail the activation. + */ + if (priv->teamd_timeout && + (state >= NM_DEVICE_STATE_PREPARE) && + (state <= NM_DEVICE_STATE_ACTIVATED)) { + _LOGW (LOGD_TEAM, "teamd process quit unexpectedly; failing activation"); + teamd_cleanup (device, TRUE); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } } static void @@ -442,12 +472,11 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) NMDeviceTeam *self = NM_DEVICE_TEAM (device); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); const char *iface = nm_device_get_ip_iface (device); - char *tmp_str = NULL; - const char *config; + gs_unref_ptrarray GPtrArray *argv = NULL; + gs_free_error GError *error = NULL; + gs_free char *tmp_str = NULL; const char *teamd_binary; - GPtrArray *argv; - GError *error = NULL; - gboolean ret; + const char *config; teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); if (!teamd_binary) { @@ -455,15 +484,11 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return FALSE; } - if (priv->teamd_process_watch || - priv->teamd_pid > 0 || - priv->tdc || - priv->teamd_timeout) - { + if (priv->teamd_process_watch || priv->teamd_pid > 0 || priv->tdc) { g_warn_if_reached (); if (!priv->teamd_pid) teamd_kill (self, teamd_binary, NULL); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } /* Start teamd now */ @@ -487,29 +512,24 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) g_ptr_array_add (argv, (gpointer) "-gg"); g_ptr_array_add (argv, NULL); - _LOGD (LOGD_TEAM, "running: %s", - (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); - g_clear_pointer (&tmp_str, g_free); - - /* Start a timeout for teamd to appear at D-Bus */ - priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device); - - ret = g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD, - &teamd_child_setup, NULL, &priv->teamd_pid, &error); - g_ptr_array_free (argv, TRUE); - if (!ret) { + _LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); + if (!g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD, + teamd_child_setup, NULL, &priv->teamd_pid, &error)) { _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: %s", error->message); - g_clear_error (&error); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); return FALSE; } + /* Start a timeout for teamd to appear at D-Bus */ + if (!priv->teamd_timeout) + priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device); + /* Monitor the child process so we know when it dies */ priv->teamd_process_watch = g_child_watch_add (priv->teamd_pid, teamd_process_watch_cb, device); - _LOGI (LOGD_TEAM, "Activation: (team) started teamd..."); + _LOGI (LOGD_TEAM, "Activation: (team) started teamd [pid %u]...", (guint) priv->teamd_pid); return TRUE; } @@ -557,7 +577,7 @@ act_stage1_prepare (NMDevice *device, NMDeviceStateReason *reason) } _LOGD (LOGD_TEAM, "existing teamd config mismatch; respawning..."); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } return teamd_start (device, s_team) ? @@ -568,12 +588,14 @@ static void deactivate (NMDevice *device) { NMDeviceTeam *self = NM_DEVICE_TEAM (device); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); + if (priv->teamd_pid || priv->tdc) + _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); - if (!NM_DEVICE_TEAM_GET_PRIVATE (self)->teamd_pid) + if (!priv->teamd_pid) teamd_kill (self, NULL, NULL); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } static gboolean @@ -776,15 +798,15 @@ set_property (GObject *object, guint prop_id, static void dispose (GObject *object) { - NMDeviceTeam *self = NM_DEVICE_TEAM (object); - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); + NMDevice *device = NM_DEVICE (object); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object); if (priv->teamd_dbus_watch) { g_bus_unwatch_name (priv->teamd_dbus_watch); priv->teamd_dbus_watch = 0; } - teamd_cleanup (NM_DEVICE (object)); + teamd_cleanup (device, TRUE); G_OBJECT_CLASS (nm_device_team_parent_class)->dispose (object); }