From 24a764e831bff27647bb5025c1368582fade21cf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Mar 2015 16:24:38 -0600 Subject: [PATCH 1/2] team: respawn teamd when it exits instead of failing activation (rh #1145988) teamd can recover interface state on its own, so if it died unexpectedly we don't need to fail the device. Also, if for some reason a teamd is already up and running when activating the interface, we can ask for its configuration and if it has the same configuration we are about to use, just talk to the existing copy instead of killing it. --- src/devices/team/nm-device-team.c | 196 ++++++++++++++++++------------ 1 file changed, 116 insertions(+), 80 deletions(-) diff --git a/src/devices/team/nm-device-team.c b/src/devices/team/nm-device-team.c index c8f510b9b5..19676c2f5e 100644 --- a/src/devices/team/nm-device-team.c +++ b/src/devices/team/nm-device-team.c @@ -40,6 +40,7 @@ #include "nm-enum-types.h" #include "nm-team-enum-types.h" #include "nm-core-internal.h" +#include "gsystem-local-alloc.h" #include "nm-device-team-glue.h" @@ -65,6 +66,8 @@ enum { LAST_PROP }; +static gboolean teamd_start (NMDevice *device, NMSettingTeam *s_team); + /******************************************************************/ static guint32 @@ -270,7 +273,7 @@ teamd_timeout_remove (NMDevice *device) } static void -teamd_cleanup (NMDevice *device, gboolean device_state_failed) +teamd_cleanup (NMDevice *device) { NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); @@ -291,12 +294,6 @@ teamd_cleanup (NMDevice *device, gboolean device_state_failed) } teamd_timeout_remove (device); - - if (device_state_failed) { - if (nm_device_is_activating (device) || - (nm_device_get_state (device) == NM_DEVICE_STATE_ACTIVATED)) - nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); - } } static gboolean @@ -309,7 +306,10 @@ teamd_timeout_cb (gpointer user_data) g_return_val_if_fail (priv->teamd_timeout, FALSE); _LOGI (LOGD_TEAM, "teamd timed out."); - teamd_cleanup (device, TRUE); + teamd_cleanup (device); + + g_warn_if_fail (nm_device_is_activating (device)); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); return FALSE; } @@ -331,24 +331,28 @@ teamd_dbus_appeared (GDBusConnection *connection, teamd_timeout_remove (device); nm_device_queue_recheck_assume (device); + /* Grab a teamd control handle even if we aren't going to use it + * immediately. But if we are, and grabbing it failed, fail the + * device activation. + */ success = ensure_teamd_connection (device); if (nm_device_get_state (device) == NM_DEVICE_STATE_PREPARE) { if (success) nm_device_activate_schedule_stage2_device_config (device); else if (!nm_device_uses_assumed_connection (device)) nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); - return; } } static void -teamd_dbus_vanished (GDBusConnection *connection, +teamd_dbus_vanished (GDBusConnection *dbus_connection, const gchar *name, gpointer user_data) { NMDeviceTeam *self = NM_DEVICE_TEAM (user_data); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMDevice *device = NM_DEVICE (self); + NMDeviceState state = nm_device_get_state (device); g_return_if_fail (priv->teamd_dbus_watch); @@ -362,7 +366,16 @@ teamd_dbus_vanished (GDBusConnection *connection, } _LOGI (LOGD_TEAM, "teamd vanished from D-Bus"); - teamd_cleanup (device, TRUE); + teamd_cleanup (device); + + /* Attempt to respawn teamd */ + if (state >= NM_DEVICE_STATE_PREPARE && state <= NM_DEVICE_STATE_ACTIVATED) { + NMConnection *connection = nm_device_get_connection (device); + + g_assert (connection); + if (!teamd_start (device, nm_connection_get_setting_team (connection))) + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } } static void @@ -377,26 +390,32 @@ teamd_process_watch_cb (GPid pid, gint status, gpointer user_data) _LOGI (LOGD_TEAM, "teamd died with status %d", status); priv->teamd_process_watch = 0; priv->teamd_pid = 0; - teamd_cleanup (device, TRUE); + teamd_cleanup (device); } -static void -nm_device_team_watch_dbus (NMDeviceTeam *self) +static gboolean +teamd_kill (NMDeviceTeam *self, const char *teamd_binary, GError **error) { - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - const char *iface = nm_device_get_ip_iface (NM_DEVICE (self)); - char *tmp_str = NULL; + gs_unref_ptrarray GPtrArray *argv = NULL; + gs_free char *tmp_str = NULL; - /* Register D-Bus name watcher */ - tmp_str = g_strdup_printf ("org.libteam.teamd.%s", iface); - priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM, - tmp_str, - G_BUS_NAME_WATCHER_FLAGS_NONE, - teamd_dbus_appeared, - teamd_dbus_vanished, - NM_DEVICE (self), - NULL); - g_free (tmp_str); + if (!teamd_binary) { + teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); + if (!teamd_binary) { + _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: teamd binary not found"); + return FALSE; + } + } + + argv = g_ptr_array_new (); + g_ptr_array_add (argv, (gpointer) teamd_binary); + g_ptr_array_add (argv, (gpointer) "-k"); + g_ptr_array_add (argv, (gpointer) "-t"); + g_ptr_array_add (argv, (gpointer) nm_device_get_iface (NM_DEVICE (self))); + g_ptr_array_add (argv, NULL); + + _LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); + return g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, NULL, NULL, NULL, NULL, NULL, error); } static gboolean @@ -411,17 +430,6 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) GPtrArray *argv; GError *error = NULL; gboolean ret; - int status; - - if (priv->teamd_process_watch || - priv->teamd_pid > 0 || - priv->tdc || - priv->teamd_timeout) - { - /* FIXME g_assert that this never hits. For now, be more reluctant, and try to recover. */ - g_warn_if_reached (); - teamd_cleanup (device, FALSE); - } teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); if (!teamd_binary) { @@ -429,20 +437,16 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return FALSE; } - /* Kill teamd for same named device first if it is there */ - argv = g_ptr_array_new (); - g_ptr_array_add (argv, (gpointer) teamd_binary); - g_ptr_array_add (argv, (gpointer) "-k"); - g_ptr_array_add (argv, (gpointer) "-t"); - g_ptr_array_add (argv, (gpointer) iface); - g_ptr_array_add (argv, NULL); - - _LOGD (LOGD_TEAM, "running: %s", - (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); - g_clear_pointer (&tmp_str, g_free); - - ret = g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, NULL, NULL, NULL, NULL, &status, &error); - g_ptr_array_free (argv, TRUE); + if (priv->teamd_process_watch || + priv->teamd_pid > 0 || + priv->tdc || + priv->teamd_timeout) + { + g_warn_if_reached (); + if (!priv->teamd_pid) + teamd_kill (self, teamd_binary, NULL); + teamd_cleanup (device); + } /* Start teamd now */ argv = g_ptr_array_new (); @@ -478,7 +482,7 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) if (!ret) { _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: %s", error->message); g_clear_error (&error); - teamd_cleanup (device, FALSE); + teamd_cleanup (device); return FALSE; } @@ -491,46 +495,67 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return TRUE; } -static void -teamd_stop (NMDevice *device) -{ - NMDeviceTeam *self = NM_DEVICE_TEAM (device); - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - - if (priv->teamd_pid > 0) - _LOGI (LOGD_TEAM, "Deactivation: stopping teamd..."); - else - _LOGD (LOGD_TEAM, "Deactivation: stopping teamd (not started)..."); - teamd_cleanup (device, FALSE); -} - static NMActStageReturn act_stage1_prepare (NMDevice *device, NMDeviceStateReason *reason) { + NMDeviceTeam *self = NM_DEVICE_TEAM (device); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMActStageReturn ret = NM_ACT_STAGE_RETURN_SUCCESS; + gs_free_error GError *error = NULL; NMConnection *connection; NMSettingTeam *s_team; + const char *cfg; g_return_val_if_fail (reason != NULL, NM_ACT_STAGE_RETURN_FAILURE); ret = NM_DEVICE_CLASS (nm_device_team_parent_class)->act_stage1_prepare (device, reason); - if (ret == NM_ACT_STAGE_RETURN_SUCCESS) { - connection = nm_device_get_connection (device); - g_assert (connection); - s_team = nm_connection_get_setting_team (connection); - g_assert (s_team); - if (teamd_start (device, s_team)) - ret = NM_ACT_STAGE_RETURN_POSTPONE; - else - ret = NM_ACT_STAGE_RETURN_FAILURE; + if (ret != NM_ACT_STAGE_RETURN_SUCCESS) + return ret; + + connection = nm_device_get_connection (device); + g_assert (connection); + s_team = nm_connection_get_setting_team (connection); + g_assert (s_team); + + if (priv->tdc) { + /* If the existing teamd config is the same as we're about to use, + * then we can proceed. If it's not the same, and we have a PID, + * kill it so we can respawn it with the right config. If we don't + * have a PID, then we must fail. + */ + cfg = teamdctl_config_get_raw (priv->tdc); + if (cfg && strcmp (cfg, nm_setting_team_get_config (s_team)) == 0) { + _LOGD (LOGD_TEAM, "using existing matching teamd config"); + return NM_ACT_STAGE_RETURN_SUCCESS; + } + + if (!priv->teamd_pid) { + _LOGD (LOGD_TEAM, "existing teamd config mismatch; killing existing via teamdctl"); + if (!teamd_kill (self, NULL, &error)) { + _LOGW (LOGD_TEAM, "existing teamd config mismatch; failed to kill existing teamd: %s", error->message); + *reason = NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED; + return NM_ACT_STAGE_RETURN_FAILURE; + } + } + + _LOGD (LOGD_TEAM, "existing teamd config mismatch; respawning..."); + teamd_cleanup (device); } - return ret; + + return teamd_start (device, s_team) ? + NM_ACT_STAGE_RETURN_POSTPONE : NM_ACT_STAGE_RETURN_FAILURE; } static void deactivate (NMDevice *device) { - teamd_stop (device); + NMDeviceTeam *self = NM_DEVICE_TEAM (device); + + _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); + + if (!NM_DEVICE_TEAM_GET_PRIVATE (self)->teamd_pid) + teamd_kill (self, NULL, NULL); + teamd_cleanup (device); } static gboolean @@ -678,11 +703,22 @@ nm_device_team_init (NMDeviceTeam * self) static void constructed (GObject *object) { - NMDeviceTeam *self = NM_DEVICE_TEAM (object); + NMDevice *device = NM_DEVICE (object); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object); + char *tmp_str = NULL; G_OBJECT_CLASS (nm_device_team_parent_class)->constructed (object); - nm_device_team_watch_dbus (self); + /* Register D-Bus name watcher */ + tmp_str = g_strdup_printf ("org.libteam.teamd.%s", nm_device_get_ip_iface (device)); + priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM, + tmp_str, + G_BUS_NAME_WATCHER_FLAGS_NONE, + teamd_dbus_appeared, + teamd_dbus_vanished, + NM_DEVICE (device), + NULL); + g_free (tmp_str); } static void @@ -730,7 +766,7 @@ dispose (GObject *object) priv->teamd_dbus_watch = 0; } - teamd_cleanup (NM_DEVICE (object), FALSE); + teamd_cleanup (NM_DEVICE (object)); G_OBJECT_CLASS (nm_device_team_parent_class)->dispose (object); } From a8299f6ae97dab6bb401527e73c1cfb3b6e4f912 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 Mar 2015 08:34:42 -0500 Subject: [PATCH 2/2] team: ratelimit teamd spawning --- src/devices/team/nm-device-team.c | 136 +++++++++++++++++------------- 1 file changed, 79 insertions(+), 57 deletions(-) diff --git a/src/devices/team/nm-device-team.c b/src/devices/team/nm-device-team.c index 19676c2f5e..de0c2ee7ed 100644 --- a/src/devices/team/nm-device-team.c +++ b/src/devices/team/nm-device-team.c @@ -262,18 +262,7 @@ master_update_slave_connection (NMDevice *self, /******************************************************************/ static void -teamd_timeout_remove (NMDevice *device) -{ - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); - - if (priv->teamd_timeout) { - g_source_remove (priv->teamd_timeout); - priv->teamd_timeout = 0; - } -} - -static void -teamd_cleanup (NMDevice *device) +teamd_cleanup (NMDevice *device, gboolean free_tdc) { NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); @@ -282,18 +271,21 @@ teamd_cleanup (NMDevice *device) priv->teamd_process_watch = 0; } + if (priv->teamd_timeout) { + g_source_remove (priv->teamd_timeout); + priv->teamd_timeout = 0; + } + if (priv->teamd_pid > 0) { nm_utils_kill_child_async (priv->teamd_pid, SIGTERM, LOGD_TEAM, "teamd", 2000, NULL, NULL); priv->teamd_pid = 0; } - if (priv->tdc) { + if (priv->tdc && free_tdc) { teamdctl_disconnect (priv->tdc); teamdctl_free (priv->tdc); priv->tdc = NULL; } - - teamd_timeout_remove (device); } static gboolean @@ -304,14 +296,18 @@ teamd_timeout_cb (gpointer user_data) NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device); g_return_val_if_fail (priv->teamd_timeout, FALSE); + priv->teamd_timeout = 0; - _LOGI (LOGD_TEAM, "teamd timed out."); - teamd_cleanup (device); + if (priv->teamd_pid && !priv->tdc) { + /* Timed out launching our own teamd process */ + _LOGW (LOGD_TEAM, "teamd timed out."); + teamd_cleanup (device, TRUE); - g_warn_if_fail (nm_device_is_activating (device)); - nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + g_warn_if_fail (nm_device_is_activating (device)); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } - return FALSE; + return G_SOURCE_REMOVE; } static void @@ -328,9 +324,32 @@ teamd_dbus_appeared (GDBusConnection *connection, g_return_if_fail (priv->teamd_dbus_watch); _LOGI (LOGD_TEAM, "teamd appeared on D-Bus"); - teamd_timeout_remove (device); nm_device_queue_recheck_assume (device); + /* If another teamd grabbed the bus name while our teamd was starting, + * just ignore the death of our teamd and run with the existing one. + */ + if (priv->teamd_process_watch) { + gs_unref_variant GVariant *ret = NULL; + guint32 pid; + + ret = g_dbus_connection_call_sync (connection, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + "GetConnectionUnixProcessID", + g_variant_new ("(s)", name_owner), + NULL, + G_DBUS_CALL_FLAGS_NO_AUTO_START, + 2000, + NULL, + NULL); + g_variant_get (ret, "(u)", &pid); + + if (pid != priv->teamd_pid) + teamd_cleanup (device, FALSE); + } + /* Grab a teamd control handle even if we aren't going to use it * immediately. But if we are, and grabbing it failed, fail the * device activation. @@ -366,7 +385,7 @@ teamd_dbus_vanished (GDBusConnection *dbus_connection, } _LOGI (LOGD_TEAM, "teamd vanished from D-Bus"); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); /* Attempt to respawn teamd */ if (state >= NM_DEVICE_STATE_PREPARE && state <= NM_DEVICE_STATE_ACTIVATED) { @@ -384,13 +403,24 @@ teamd_process_watch_cb (GPid pid, gint status, gpointer user_data) NMDeviceTeam *self = NM_DEVICE_TEAM (user_data); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); NMDevice *device = NM_DEVICE (self); + NMDeviceState state = nm_device_get_state (device); g_return_if_fail (priv->teamd_process_watch); - _LOGI (LOGD_TEAM, "teamd died with status %d", status); - priv->teamd_process_watch = 0; + _LOGD (LOGD_TEAM, "teamd died with status %d", status); priv->teamd_pid = 0; - teamd_cleanup (device); + priv->teamd_process_watch = 0; + + /* If teamd quit within 5 seconds of starting, it's probably hosed + * and will just die again, so fail the activation. + */ + if (priv->teamd_timeout && + (state >= NM_DEVICE_STATE_PREPARE) && + (state <= NM_DEVICE_STATE_ACTIVATED)) { + _LOGW (LOGD_TEAM, "teamd process quit unexpectedly; failing activation"); + teamd_cleanup (device, TRUE); + nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED); + } } static gboolean @@ -424,12 +454,11 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) NMDeviceTeam *self = NM_DEVICE_TEAM (device); NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); const char *iface = nm_device_get_ip_iface (device); - char *tmp_str = NULL; - const char *config; + gs_unref_ptrarray GPtrArray *argv = NULL; + gs_free_error GError *error = NULL; + gs_free char *tmp_str = NULL; const char *teamd_binary; - GPtrArray *argv; - GError *error = NULL; - gboolean ret; + const char *config; teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL); if (!teamd_binary) { @@ -437,15 +466,11 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) return FALSE; } - if (priv->teamd_process_watch || - priv->teamd_pid > 0 || - priv->tdc || - priv->teamd_timeout) - { + if (priv->teamd_process_watch || priv->teamd_pid > 0 || priv->tdc) { g_warn_if_reached (); if (!priv->teamd_pid) teamd_kill (self, teamd_binary, NULL); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } /* Start teamd now */ @@ -469,29 +494,24 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team) g_ptr_array_add (argv, (gpointer) "-gg"); g_ptr_array_add (argv, NULL); - _LOGD (LOGD_TEAM, "running: %s", - (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); - g_clear_pointer (&tmp_str, g_free); - - /* Start a timeout for teamd to appear at D-Bus */ - priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device); - - ret = g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD, - nm_utils_setpgid, NULL, &priv->teamd_pid, &error); - g_ptr_array_free (argv, TRUE); - if (!ret) { + _LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata))); + if (!g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD, + nm_utils_setpgid, NULL, &priv->teamd_pid, &error)) { _LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: %s", error->message); - g_clear_error (&error); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); return FALSE; } + /* Start a timeout for teamd to appear at D-Bus */ + if (!priv->teamd_timeout) + priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device); + /* Monitor the child process so we know when it dies */ priv->teamd_process_watch = g_child_watch_add (priv->teamd_pid, teamd_process_watch_cb, device); - _LOGI (LOGD_TEAM, "Activation: (team) started teamd..."); + _LOGI (LOGD_TEAM, "Activation: (team) started teamd [pid %u]...", (guint) priv->teamd_pid); return TRUE; } @@ -539,7 +559,7 @@ act_stage1_prepare (NMDevice *device, NMDeviceStateReason *reason) } _LOGD (LOGD_TEAM, "existing teamd config mismatch; respawning..."); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } return teamd_start (device, s_team) ? @@ -550,12 +570,14 @@ static void deactivate (NMDevice *device) { NMDeviceTeam *self = NM_DEVICE_TEAM (device); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); - _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); + if (priv->teamd_pid || priv->tdc) + _LOGI (LOGD_TEAM, "deactivation: stopping teamd..."); - if (!NM_DEVICE_TEAM_GET_PRIVATE (self)->teamd_pid) + if (!priv->teamd_pid) teamd_kill (self, NULL, NULL); - teamd_cleanup (device); + teamd_cleanup (device, TRUE); } static gboolean @@ -758,15 +780,15 @@ set_property (GObject *object, guint prop_id, static void dispose (GObject *object) { - NMDeviceTeam *self = NM_DEVICE_TEAM (object); - NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self); + NMDevice *device = NM_DEVICE (object); + NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object); if (priv->teamd_dbus_watch) { g_bus_unwatch_name (priv->teamd_dbus_watch); priv->teamd_dbus_watch = 0; } - teamd_cleanup (NM_DEVICE (object)); + teamd_cleanup (device, TRUE); G_OBJECT_CLASS (nm_device_team_parent_class)->dispose (object); }