core: merge branch 'th/shutdown-timeout-increase'

https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/1113
This commit is contained in:
Thomas Haller 2022-02-24 09:39:03 +01:00
commit d85b934370
No known key found for this signature in database
GPG key ID: 29C2366E4DFC5728
10 changed files with 47 additions and 28 deletions

16
TODO
View file

@ -72,25 +72,25 @@ shutdown starts. And no singleton getters work reliably after the main() functio
because singletons unref themselves. In general, avoid singleton getters and see
that somebody hands you a reference.
After NM_SHUTDOWN_TIMEOUT_MS we loose patience that it's taking too long.
After NM_SHUTDOWN_TIMEOUT_MAX_MSEC we loose patience that it's taking too long.
We now log a debug message about who is still blocking shutdown.
We also cancel the cancellables from nm_shutdown_wait_obj_register_cancellable()
and give NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG more time. If we then are still
and give NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC more time. If we then are still
not complete, we log an error message about who is still blocking shutdown,
and just exit with an assertion failure. We encountered a bug.
This means, *all* async operations in NetworkManager must either be cancellable (and
afterwards complete fast) or they must not take long to begin with. In particular,
every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MS,
and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MS too.
every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MAX_MSEC,
and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MAX_MSEC too.
So if you make an async operation not cancellable, but guarantee that you don't take
longer than NM_SHUTDOWN_TIMEOUT_MS you are mostly fine (better would be to actually
complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MS timeout is
still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MS+NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG
longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC you are mostly fine (better would be to actually
complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout is
still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC+NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC
is a bug.
As NM_SHUTDOWN_TIMEOUT_MS and nm_shutdown_wait_obj_register_object() API already exists,
As NM_SHUTDOWN_TIMEOUT_MAX_MSEC and nm_shutdown_wait_obj_register_object() API already exists,
the first step is to ensure that all parts of NetworkManager can be shutdown and be terminated
in a timely manner.

View file

@ -34,6 +34,11 @@
/*****************************************************************************/
G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_1500_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);
G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_5000_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);
/*****************************************************************************/
/**
* nm_utils_get_shared_wifi_permission:
* @connection: the NMConnection to lookup the permission.
@ -1037,7 +1042,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was)
* is still used.
*
* If @wait_type is %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE, then during shutdown
* (after %NM_SHUTDOWN_TIMEOUT_MS), the cancellable will be cancelled to notify
* (after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC), the cancellable will be cancelled to notify
* the source of the shutdown. Note that otherwise, in this mode also @watched_obj
* is only tracked with a weak-pointer. Especially, it does not register to the
* "cancelled" signal to automatically unregister (otherwise, you would never
@ -1046,7 +1051,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was)
* FIXME(shutdown): proper shutdown is not yet implemented, and registering
* an object (currently) has no effect.
*
* FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MS timeout, cancel
* FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout, cancel
* all remaining %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE instances. Also, when somebody
* enqueues a cancellable after that point, cancel it right away on an idle handler.
*

View file

@ -103,23 +103,37 @@ NMPlatformRoutingRule *nm_ip_routing_rule_to_platform(const NMIPRoutingRule *rul
/*****************************************************************************/
/* during shutdown, there are two relevant timeouts. One is
* NM_SHUTDOWN_TIMEOUT_MS which is plenty of time, that we give for all
* NM_SHUTDOWN_TIMEOUT_MAX_MSEC which is plenty of time, that we give for all
* actions to complete. Of course, during shutdown components should hurry
* to cleanup.
*
* When we initiate shutdown, we should start killing child processes
* with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MS, we send
* with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MAX_MSEC, we send
* SIGKILL.
*
* After NM_SHUTDOWN_TIMEOUT_MS, NetworkManager will however not yet terminate right
* away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG. This
* After NM_SHUTDOWN_TIMEOUT_MAX_MSEC, NetworkManager will however not yet terminate right
* away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC. This
* should give time to reap the child process (after SIGKILL).
*
* So, the maximum time we should wait before sending SIGKILL should be at most
* NM_SHUTDOWN_TIMEOUT_MS.
* NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
*/
#define NM_SHUTDOWN_TIMEOUT_MS 1500
#define NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG 500
#define NM_SHUTDOWN_TIMEOUT_MAX_MSEC 5000
#define NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC 500
/**
* NM_SHUTDOWN_TIMEOUT_1500_MSEC: this is just 1500 msec. The special
* thing about the define is that you are guaranteed that this is not
* longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
* When you perform an async operation, it must either be cancellable
* (and complete fast) or never take longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
* The usage of this macro makes that relation to NM_SHUTDOWN_TIMEOUT_MAX_MSEC
* explicit.
*/
#define NM_SHUTDOWN_TIMEOUT_1500_MSEC 1500
/* See NM_SHUTDOWN_TIMEOUT_1500_MSEC. */
#define NM_SHUTDOWN_TIMEOUT_5000_MSEC 5000
typedef enum {
/* There is no watched_obj argument, and the shutdown is delayed until the user
@ -131,7 +145,7 @@ typedef enum {
NM_SHUTDOWN_WAIT_TYPE_OBJECT,
/* The watched_obj argument is a GCancellable, and shutdown is delayed until the object
* gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MS, the
* gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MAX_MSEC, the
* cancellable will be cancelled to notify listeners about the shutdown. */
NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE,
} NMShutdownWaitType;

View file

@ -1014,7 +1014,7 @@ act_stage1_prepare(NMDevice *device, NMDeviceStateReason *out_failure_reason)
* get confused and fail to negotiate the new connection. (rh #1023503)
*
* FIXME(shutdown): when exiting, we also need to wait before quitting,
* at least for additional NM_SHUTDOWN_TIMEOUT_MS seconds because
* at least for additional NM_SHUTDOWN_TIMEOUT_MAX_MSEC seconds because
* otherwise after restart the device won't work for the first seconds.
*/
if (priv->ppp_data.last_pppoe_time_msec != 0) {

View file

@ -7003,7 +7003,7 @@ sriov_op_queue(NMDevice *self,
*
* FIXME(shutdown): However, during shutdown we don't have a follow-up write request to cancel
* this operation and we have to give it at least some time to complete. The solution is that
* we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MS
* we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MAX_MSEC
* grace period we pull the plug and cancel it. */
op = g_slice_new(SriovOp);

View file

@ -39,10 +39,10 @@
#define _NMLOG(level, ...) __NMLOG_DEFAULT(level, _NMLOG_DOMAIN, "dnsmasq", __VA_ARGS__)
#define WAIT_MSEC_AFTER_SIGTERM 1000
G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MS);
G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);
#define WAIT_MSEC_AFTER_SIGKILL 400
G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG);
G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC);
typedef void (*GlPidSpawnAsyncNotify)(GCancellable *cancellable,
GPid pid,

View file

@ -422,7 +422,7 @@ _fw_nft_call_communicate_cb(GObject *source, GAsyncResult *result, gpointer user
nm_g_main_context_push_thread_default_if_necessary(NULL);
nm_shutdown_wait_obj_register_object(call_data->subprocess, "nft-terminate");
G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG * 2 / 3);
G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC * 2 / 3);
nm_g_subprocess_terminate_in_background(call_data->subprocess, 200);
}
} else if (g_subprocess_get_successful(call_data->subprocess)) {
@ -546,7 +546,7 @@ _fw_nft_call(GBytes *stdin_buf,
call_data);
call_data->timeout_source =
nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_MS * 2) / 3,
nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_1500_MSEC * 2) / 3,
G_PRIORITY_DEFAULT,
_fw_nft_call_timeout_cb,
call_data,

View file

@ -291,7 +291,7 @@ _call_destroy_proxy_configuration(NMPacrunnerManager *self,
g_variant_new("(o)", path),
G_VARIANT_TYPE("()"),
G_DBUS_CALL_FLAGS_NO_AUTO_START,
NM_SHUTDOWN_TIMEOUT_MS,
NM_SHUTDOWN_TIMEOUT_1500_MSEC,
priv->cancellable,
_call_destroy_proxy_configuration_cb,
conf_id_ref(conf_id));
@ -315,7 +315,7 @@ _call_create_proxy_configuration(NMPacrunnerManager *self,
conf_id->parameters,
G_VARIANT_TYPE("(o)"),
G_DBUS_CALL_FLAGS_NO_AUTO_START,
NM_SHUTDOWN_TIMEOUT_MS,
NM_SHUTDOWN_TIMEOUT_1500_MSEC,
priv->cancellable,
_call_create_proxy_configuration_cb,
conf_id_ref(conf_id));

View file

@ -1246,7 +1246,7 @@ _ppp_manager_stop(NMPPPManager *self,
SIGTERM,
LOGD_PPP,
"pppd",
5000,
NM_SHUTDOWN_TIMEOUT_5000_MSEC,
_stop_child_cb,
handle);

View file

@ -511,7 +511,7 @@ nm_secret_agent_cancel_call(NMSecretAgent *self, NMSecretAgentCallId *call_id)
g_variant_new("(os)", call_id->path, call_id->setting_name),
G_VARIANT_TYPE("()"),
G_DBUS_CALL_FLAGS_NO_AUTO_START,
NM_SHUTDOWN_TIMEOUT_MS,
NM_SHUTDOWN_TIMEOUT_1500_MSEC,
NULL, /* this operation is not cancellable. We rely on the timeout. */
_call_cancel_cb,
call_id);