team: respawn teamd instead of exiting (rh #1145988) (bgo #745903)

This commit is contained in:
Dan Williams 2015-04-02 15:03:25 -05:00
commit 4f4e570a7d

View file

@ -41,6 +41,7 @@
#include "nm-team-enum-types.h"
#include "nm-posix-signals.h"
#include "nm-core-internal.h"
#include "gsystem-local-alloc.h"
#include "nm-device-team-glue.h"
@ -66,6 +67,8 @@ enum {
LAST_PROP
};
static gboolean teamd_start (NMDevice *device, NMSettingTeam *s_team);
/******************************************************************/
static guint32
@ -260,18 +263,7 @@ master_update_slave_connection (NMDevice *self,
/******************************************************************/
static void
teamd_timeout_remove (NMDevice *device)
{
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device);
if (priv->teamd_timeout) {
g_source_remove (priv->teamd_timeout);
priv->teamd_timeout = 0;
}
}
static void
teamd_cleanup (NMDevice *device, gboolean device_state_failed)
teamd_cleanup (NMDevice *device, gboolean free_tdc)
{
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device);
@ -280,24 +272,21 @@ teamd_cleanup (NMDevice *device, gboolean device_state_failed)
priv->teamd_process_watch = 0;
}
if (priv->teamd_timeout) {
g_source_remove (priv->teamd_timeout);
priv->teamd_timeout = 0;
}
if (priv->teamd_pid > 0) {
nm_utils_kill_child_async (priv->teamd_pid, SIGTERM, LOGD_TEAM, "teamd", 2000, NULL, NULL);
priv->teamd_pid = 0;
}
if (priv->tdc) {
if (priv->tdc && free_tdc) {
teamdctl_disconnect (priv->tdc);
teamdctl_free (priv->tdc);
priv->tdc = NULL;
}
teamd_timeout_remove (device);
if (device_state_failed) {
if (nm_device_is_activating (device) ||
(nm_device_get_state (device) == NM_DEVICE_STATE_ACTIVATED))
nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED);
}
}
static gboolean
@ -308,11 +297,18 @@ teamd_timeout_cb (gpointer user_data)
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (device);
g_return_val_if_fail (priv->teamd_timeout, FALSE);
priv->teamd_timeout = 0;
_LOGI (LOGD_TEAM, "teamd timed out.");
teamd_cleanup (device, TRUE);
if (priv->teamd_pid && !priv->tdc) {
/* Timed out launching our own teamd process */
_LOGW (LOGD_TEAM, "teamd timed out.");
teamd_cleanup (device, TRUE);
return FALSE;
g_warn_if_fail (nm_device_is_activating (device));
nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED);
}
return G_SOURCE_REMOVE;
}
static void
@ -329,27 +325,54 @@ teamd_dbus_appeared (GDBusConnection *connection,
g_return_if_fail (priv->teamd_dbus_watch);
_LOGI (LOGD_TEAM, "teamd appeared on D-Bus");
teamd_timeout_remove (device);
nm_device_queue_recheck_assume (device);
/* If another teamd grabbed the bus name while our teamd was starting,
* just ignore the death of our teamd and run with the existing one.
*/
if (priv->teamd_process_watch) {
gs_unref_variant GVariant *ret = NULL;
guint32 pid;
ret = g_dbus_connection_call_sync (connection,
"org.freedesktop.DBus",
"/org/freedesktop/DBus",
"org.freedesktop.DBus",
"GetConnectionUnixProcessID",
g_variant_new ("(s)", name_owner),
NULL,
G_DBUS_CALL_FLAGS_NO_AUTO_START,
2000,
NULL,
NULL);
g_variant_get (ret, "(u)", &pid);
if (pid != priv->teamd_pid)
teamd_cleanup (device, FALSE);
}
/* Grab a teamd control handle even if we aren't going to use it
* immediately. But if we are, and grabbing it failed, fail the
* device activation.
*/
success = ensure_teamd_connection (device);
if (nm_device_get_state (device) == NM_DEVICE_STATE_PREPARE) {
if (success)
nm_device_activate_schedule_stage2_device_config (device);
else if (!nm_device_uses_assumed_connection (device))
nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED);
return;
}
}
static void
teamd_dbus_vanished (GDBusConnection *connection,
teamd_dbus_vanished (GDBusConnection *dbus_connection,
const gchar *name,
gpointer user_data)
{
NMDeviceTeam *self = NM_DEVICE_TEAM (user_data);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
NMDevice *device = NM_DEVICE (self);
NMDeviceState state = nm_device_get_state (device);
g_return_if_fail (priv->teamd_dbus_watch);
@ -364,6 +387,15 @@ teamd_dbus_vanished (GDBusConnection *connection,
_LOGI (LOGD_TEAM, "teamd vanished from D-Bus");
teamd_cleanup (device, TRUE);
/* Attempt to respawn teamd */
if (state >= NM_DEVICE_STATE_PREPARE && state <= NM_DEVICE_STATE_ACTIVATED) {
NMConnection *connection = nm_device_get_connection (device);
g_assert (connection);
if (!teamd_start (device, nm_connection_get_setting_team (connection)))
nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED);
}
}
static void
@ -372,13 +404,24 @@ teamd_process_watch_cb (GPid pid, gint status, gpointer user_data)
NMDeviceTeam *self = NM_DEVICE_TEAM (user_data);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
NMDevice *device = NM_DEVICE (self);
NMDeviceState state = nm_device_get_state (device);
g_return_if_fail (priv->teamd_process_watch);
_LOGI (LOGD_TEAM, "teamd died with status %d", status);
priv->teamd_process_watch = 0;
_LOGD (LOGD_TEAM, "teamd died with status %d", status);
priv->teamd_pid = 0;
teamd_cleanup (device, TRUE);
priv->teamd_process_watch = 0;
/* If teamd quit within 5 seconds of starting, it's probably hosed
* and will just die again, so fail the activation.
*/
if (priv->teamd_timeout &&
(state >= NM_DEVICE_STATE_PREPARE) &&
(state <= NM_DEVICE_STATE_ACTIVATED)) {
_LOGW (LOGD_TEAM, "teamd process quit unexpectedly; failing activation");
teamd_cleanup (device, TRUE);
nm_device_state_changed (device, NM_DEVICE_STATE_FAILED, NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED);
}
}
static void
@ -398,23 +441,29 @@ teamd_child_setup (gpointer user_data G_GNUC_UNUSED)
nm_unblock_posix_signals (NULL);
}
static void
nm_device_team_watch_dbus (NMDeviceTeam *self)
static gboolean
teamd_kill (NMDeviceTeam *self, const char *teamd_binary, GError **error)
{
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
const char *iface = nm_device_get_ip_iface (NM_DEVICE (self));
char *tmp_str = NULL;
gs_unref_ptrarray GPtrArray *argv = NULL;
gs_free char *tmp_str = NULL;
/* Register D-Bus name watcher */
tmp_str = g_strdup_printf ("org.libteam.teamd.%s", iface);
priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM,
tmp_str,
G_BUS_NAME_WATCHER_FLAGS_NONE,
teamd_dbus_appeared,
teamd_dbus_vanished,
NM_DEVICE (self),
NULL);
g_free (tmp_str);
if (!teamd_binary) {
teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL);
if (!teamd_binary) {
_LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: teamd binary not found");
return FALSE;
}
}
argv = g_ptr_array_new ();
g_ptr_array_add (argv, (gpointer) teamd_binary);
g_ptr_array_add (argv, (gpointer) "-k");
g_ptr_array_add (argv, (gpointer) "-t");
g_ptr_array_add (argv, (gpointer) nm_device_get_iface (NM_DEVICE (self)));
g_ptr_array_add (argv, NULL);
_LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata)));
return g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, nm_unblock_posix_signals, NULL, NULL, NULL, NULL, error);
}
static gboolean
@ -423,23 +472,11 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team)
NMDeviceTeam *self = NM_DEVICE_TEAM (device);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
const char *iface = nm_device_get_ip_iface (device);
char *tmp_str = NULL;
const char *config;
gs_unref_ptrarray GPtrArray *argv = NULL;
gs_free_error GError *error = NULL;
gs_free char *tmp_str = NULL;
const char *teamd_binary;
GPtrArray *argv;
GError *error = NULL;
gboolean ret;
int status;
if (priv->teamd_process_watch ||
priv->teamd_pid > 0 ||
priv->tdc ||
priv->teamd_timeout)
{
/* FIXME g_assert that this never hits. For now, be more reluctant, and try to recover. */
g_warn_if_reached ();
teamd_cleanup (device, FALSE);
}
const char *config;
teamd_binary = nm_utils_find_helper ("teamd", NULL, NULL);
if (!teamd_binary) {
@ -447,20 +484,12 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team)
return FALSE;
}
/* Kill teamd for same named device first if it is there */
argv = g_ptr_array_new ();
g_ptr_array_add (argv, (gpointer) teamd_binary);
g_ptr_array_add (argv, (gpointer) "-k");
g_ptr_array_add (argv, (gpointer) "-t");
g_ptr_array_add (argv, (gpointer) iface);
g_ptr_array_add (argv, NULL);
_LOGD (LOGD_TEAM, "running: %s",
(tmp_str = g_strjoinv (" ", (gchar **) argv->pdata)));
g_clear_pointer (&tmp_str, g_free);
ret = g_spawn_sync ("/", (char **) argv->pdata, NULL, 0, nm_unblock_posix_signals, NULL, NULL, NULL, &status, &error);
g_ptr_array_free (argv, TRUE);
if (priv->teamd_process_watch || priv->teamd_pid > 0 || priv->tdc) {
g_warn_if_reached ();
if (!priv->teamd_pid)
teamd_kill (self, teamd_binary, NULL);
teamd_cleanup (device, TRUE);
}
/* Start teamd now */
argv = g_ptr_array_new ();
@ -483,72 +512,90 @@ teamd_start (NMDevice *device, NMSettingTeam *s_team)
g_ptr_array_add (argv, (gpointer) "-gg");
g_ptr_array_add (argv, NULL);
_LOGD (LOGD_TEAM, "running: %s",
(tmp_str = g_strjoinv (" ", (gchar **) argv->pdata)));
g_clear_pointer (&tmp_str, g_free);
/* Start a timeout for teamd to appear at D-Bus */
priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device);
ret = g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD,
&teamd_child_setup, NULL, &priv->teamd_pid, &error);
g_ptr_array_free (argv, TRUE);
if (!ret) {
_LOGD (LOGD_TEAM, "running: %s", (tmp_str = g_strjoinv (" ", (gchar **) argv->pdata)));
if (!g_spawn_async ("/", (char **) argv->pdata, NULL, G_SPAWN_DO_NOT_REAP_CHILD,
teamd_child_setup, NULL, &priv->teamd_pid, &error)) {
_LOGW (LOGD_TEAM, "Activation: (team) failed to start teamd: %s", error->message);
g_clear_error (&error);
teamd_cleanup (device, FALSE);
teamd_cleanup (device, TRUE);
return FALSE;
}
/* Start a timeout for teamd to appear at D-Bus */
if (!priv->teamd_timeout)
priv->teamd_timeout = g_timeout_add_seconds (5, teamd_timeout_cb, device);
/* Monitor the child process so we know when it dies */
priv->teamd_process_watch = g_child_watch_add (priv->teamd_pid,
teamd_process_watch_cb,
device);
_LOGI (LOGD_TEAM, "Activation: (team) started teamd...");
_LOGI (LOGD_TEAM, "Activation: (team) started teamd [pid %u]...", (guint) priv->teamd_pid);
return TRUE;
}
static void
teamd_stop (NMDevice *device)
{
NMDeviceTeam *self = NM_DEVICE_TEAM (device);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
if (priv->teamd_pid > 0)
_LOGI (LOGD_TEAM, "Deactivation: stopping teamd...");
else
_LOGD (LOGD_TEAM, "Deactivation: stopping teamd (not started)...");
teamd_cleanup (device, FALSE);
}
static NMActStageReturn
act_stage1_prepare (NMDevice *device, NMDeviceStateReason *reason)
{
NMDeviceTeam *self = NM_DEVICE_TEAM (device);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
NMActStageReturn ret = NM_ACT_STAGE_RETURN_SUCCESS;
gs_free_error GError *error = NULL;
NMConnection *connection;
NMSettingTeam *s_team;
const char *cfg;
g_return_val_if_fail (reason != NULL, NM_ACT_STAGE_RETURN_FAILURE);
ret = NM_DEVICE_CLASS (nm_device_team_parent_class)->act_stage1_prepare (device, reason);
if (ret == NM_ACT_STAGE_RETURN_SUCCESS) {
connection = nm_device_get_connection (device);
g_assert (connection);
s_team = nm_connection_get_setting_team (connection);
g_assert (s_team);
if (teamd_start (device, s_team))
ret = NM_ACT_STAGE_RETURN_POSTPONE;
else
ret = NM_ACT_STAGE_RETURN_FAILURE;
if (ret != NM_ACT_STAGE_RETURN_SUCCESS)
return ret;
connection = nm_device_get_connection (device);
g_assert (connection);
s_team = nm_connection_get_setting_team (connection);
g_assert (s_team);
if (priv->tdc) {
/* If the existing teamd config is the same as we're about to use,
* then we can proceed. If it's not the same, and we have a PID,
* kill it so we can respawn it with the right config. If we don't
* have a PID, then we must fail.
*/
cfg = teamdctl_config_get_raw (priv->tdc);
if (cfg && strcmp (cfg, nm_setting_team_get_config (s_team)) == 0) {
_LOGD (LOGD_TEAM, "using existing matching teamd config");
return NM_ACT_STAGE_RETURN_SUCCESS;
}
if (!priv->teamd_pid) {
_LOGD (LOGD_TEAM, "existing teamd config mismatch; killing existing via teamdctl");
if (!teamd_kill (self, NULL, &error)) {
_LOGW (LOGD_TEAM, "existing teamd config mismatch; failed to kill existing teamd: %s", error->message);
*reason = NM_DEVICE_STATE_REASON_TEAMD_CONTROL_FAILED;
return NM_ACT_STAGE_RETURN_FAILURE;
}
}
_LOGD (LOGD_TEAM, "existing teamd config mismatch; respawning...");
teamd_cleanup (device, TRUE);
}
return ret;
return teamd_start (device, s_team) ?
NM_ACT_STAGE_RETURN_POSTPONE : NM_ACT_STAGE_RETURN_FAILURE;
}
static void
deactivate (NMDevice *device)
{
teamd_stop (device);
NMDeviceTeam *self = NM_DEVICE_TEAM (device);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
if (priv->teamd_pid || priv->tdc)
_LOGI (LOGD_TEAM, "deactivation: stopping teamd...");
if (!priv->teamd_pid)
teamd_kill (self, NULL, NULL);
teamd_cleanup (device, TRUE);
}
static gboolean
@ -696,11 +743,22 @@ nm_device_team_init (NMDeviceTeam * self)
static void
constructed (GObject *object)
{
NMDeviceTeam *self = NM_DEVICE_TEAM (object);
NMDevice *device = NM_DEVICE (object);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object);
char *tmp_str = NULL;
G_OBJECT_CLASS (nm_device_team_parent_class)->constructed (object);
nm_device_team_watch_dbus (self);
/* Register D-Bus name watcher */
tmp_str = g_strdup_printf ("org.libteam.teamd.%s", nm_device_get_ip_iface (device));
priv->teamd_dbus_watch = g_bus_watch_name (G_BUS_TYPE_SYSTEM,
tmp_str,
G_BUS_NAME_WATCHER_FLAGS_NONE,
teamd_dbus_appeared,
teamd_dbus_vanished,
NM_DEVICE (device),
NULL);
g_free (tmp_str);
}
static void
@ -740,15 +798,15 @@ set_property (GObject *object, guint prop_id,
static void
dispose (GObject *object)
{
NMDeviceTeam *self = NM_DEVICE_TEAM (object);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (self);
NMDevice *device = NM_DEVICE (object);
NMDeviceTeamPrivate *priv = NM_DEVICE_TEAM_GET_PRIVATE (object);
if (priv->teamd_dbus_watch) {
g_bus_unwatch_name (priv->teamd_dbus_watch);
priv->teamd_dbus_watch = 0;
}
teamd_cleanup (NM_DEVICE (object), FALSE);
teamd_cleanup (device, TRUE);
G_OBJECT_CLASS (nm_device_team_parent_class)->dispose (object);
}