core: merge branch 'th/shutdown-timeout-increase'

https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/1113
2024-07-21 10:14:41 +00:00 · 2022-02-24 09:39:03 +01:00 · 2022-02-24 09:39:03 +01:00 · d85b934370
parent 7a1734926a 7c874ed456
commit d85b934370
10 changed files with 47 additions and 28 deletions
--- a/16
+++ b/16
@ -72,25 +72,25 @@ shutdown starts. And no singleton getters work reliably after the main() functio
 because singletons unref themselves. In general, avoid singleton getters and see
 that somebody hands you a reference.

-After NM_SHUTDOWN_TIMEOUT_MS we loose patience that it's taking too long.
+After NM_SHUTDOWN_TIMEOUT_MAX_MSEC we loose patience that it's taking too long.
 We now log a debug message about who is still blocking shutdown.
 We also cancel the cancellables from nm_shutdown_wait_obj_register_cancellable()
-and give NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG more time. If we then are still
+and give NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC more time. If we then are still
 not complete, we log an error message about who is still blocking shutdown,
 and just exit with an assertion failure. We encountered a bug.

 This means, *all* async operations in NetworkManager must either be cancellable (and
 afterwards complete fast) or they must not take long to begin with. In particular,
-every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MS,
-and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MS too.
+every individual async operation must be completed in at most NM_SHUTDOWN_TIMEOUT_MAX_MSEC,
+and all async cleanup operations must complete in NM_SHUTDOWN_TIMEOUT_MAX_MSEC too.

 So if you make an async operation not cancellable, but guarantee that you don't take
-longer than NM_SHUTDOWN_TIMEOUT_MS you are mostly fine (better would be to actually
-complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MS timeout is
-still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MS+NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG
+longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC you are mostly fine (better would be to actually
+complete fast, if you can). That's why reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout is
+still not a bug scenario. But reaching NM_SHUTDOWN_TIMEOUT_MAX_MSEC+NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC
 is a bug.

-As NM_SHUTDOWN_TIMEOUT_MS and nm_shutdown_wait_obj_register_object() API already exists,
+As NM_SHUTDOWN_TIMEOUT_MAX_MSEC and nm_shutdown_wait_obj_register_object() API already exists,
 the first step is to ensure that all parts of NetworkManager can be shutdown and be terminated
 in a timely manner.

--- a/src/core/NetworkManagerUtils.c
+++ b/src/core/NetworkManagerUtils.c
@ -34,6 +34,11 @@

 /*****************************************************************************/

+G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_1500_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);
+G_STATIC_ASSERT(NM_SHUTDOWN_TIMEOUT_5000_MSEC <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);
+
+/*****************************************************************************/
+
 /**
 * nm_utils_get_shared_wifi_permission:
 * @connection: the NMConnection to lookup the permission.
@ -1037,7 +1042,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was)
 * is still used.
 *
 * If @wait_type is %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE, then during shutdown
- * (after %NM_SHUTDOWN_TIMEOUT_MS), the cancellable will be cancelled to notify
+ * (after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC), the cancellable will be cancelled to notify
 * the source of the shutdown. Note that otherwise, in this mode also @watched_obj
 * is only tracked with a weak-pointer. Especially, it does not register to the
 * "cancelled" signal to automatically unregister (otherwise, you would never
@ -1046,7 +1051,7 @@ _shutdown_waitobj_cb(gpointer user_data, GObject *where_the_object_was)
 * FIXME(shutdown): proper shutdown is not yet implemented, and registering
 *   an object (currently) has no effect.
 *
- * FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MS timeout, cancel
+ * FIXME(shutdown): during shutdown, after %NM_SHUTDOWN_TIMEOUT_MAX_MSEC timeout, cancel
 *   all remaining %NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE instances. Also, when somebody
 *   enqueues a cancellable after that point, cancel it right away on an idle handler.
 *
--- a/src/core/NetworkManagerUtils.h
+++ b/src/core/NetworkManagerUtils.h
@ -103,23 +103,37 @@ NMPlatformRoutingRule *nm_ip_routing_rule_to_platform(const NMIPRoutingRule *rul
 /*****************************************************************************/

 /* during shutdown, there are two relevant timeouts. One is
- * NM_SHUTDOWN_TIMEOUT_MS which is plenty of time, that we give for all
+ * NM_SHUTDOWN_TIMEOUT_MAX_MSEC which is plenty of time, that we give for all
 * actions to complete. Of course, during shutdown components should hurry
 * to cleanup.
 *
 * When we initiate shutdown, we should start killing child processes
- * with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MS, we send
+ * with SIGTERM. If they don't complete within NM_SHUTDOWN_TIMEOUT_MAX_MSEC, we send
 * SIGKILL.
 *
- * After NM_SHUTDOWN_TIMEOUT_MS, NetworkManager will however not yet terminate right
- * away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG. This
+ * After NM_SHUTDOWN_TIMEOUT_MAX_MSEC, NetworkManager will however not yet terminate right
+ * away. It iterates the mainloop for another NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC. This
 * should give time to reap the child process (after SIGKILL).
 *
 * So, the maximum time we should wait before sending SIGKILL should be at most
- * NM_SHUTDOWN_TIMEOUT_MS.
+ * NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
 */
-#define NM_SHUTDOWN_TIMEOUT_MS          1500
-#define NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG 500
+#define NM_SHUTDOWN_TIMEOUT_MAX_MSEC        5000
+#define NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC 500
+
+/**
+ * NM_SHUTDOWN_TIMEOUT_1500_MSEC: this is just 1500 msec. The special
+ *   thing about the define is that you are guaranteed that this is not
+ *   longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
+ *   When you perform an async operation, it must either be cancellable
+ *   (and complete fast) or never take longer than NM_SHUTDOWN_TIMEOUT_MAX_MSEC.
+ *   The usage of this macro makes that relation to NM_SHUTDOWN_TIMEOUT_MAX_MSEC
+ *   explicit.
+ */
+#define NM_SHUTDOWN_TIMEOUT_1500_MSEC 1500
+
+/* See NM_SHUTDOWN_TIMEOUT_1500_MSEC. */
+#define NM_SHUTDOWN_TIMEOUT_5000_MSEC 5000

 typedef enum {
    /* There is no watched_obj argument, and the shutdown is delayed until the user
@ -131,7 +145,7 @@ typedef enum {
    NM_SHUTDOWN_WAIT_TYPE_OBJECT,

    /* The watched_obj argument is a GCancellable, and shutdown is delayed until the object
-     * gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MS, the
+     * gets destroyed (or unregistered). Note that after NM_SHUTDOWN_TIMEOUT_MAX_MSEC, the
     * cancellable will be cancelled to notify listeners about the shutdown. */
    NM_SHUTDOWN_WAIT_TYPE_CANCELLABLE,
 } NMShutdownWaitType;
--- a/src/core/devices/nm-device-ethernet.c
+++ b/src/core/devices/nm-device-ethernet.c
@ -1014,7 +1014,7 @@ act_stage1_prepare(NMDevice *device, NMDeviceStateReason *out_failure_reason)
     * get confused and fail to negotiate the new connection. (rh #1023503)
     *
     * FIXME(shutdown): when exiting, we also need to wait before quitting,
-     * at least for additional NM_SHUTDOWN_TIMEOUT_MS seconds because
+     * at least for additional NM_SHUTDOWN_TIMEOUT_MAX_MSEC seconds because
     * otherwise after restart the device won't work for the first seconds.
     */
    if (priv->ppp_data.last_pppoe_time_msec != 0) {
--- a/src/core/devices/nm-device.c
+++ b/src/core/devices/nm-device.c
@ -7003,7 +7003,7 @@ sriov_op_queue(NMDevice               *self,
     *
     * FIXME(shutdown): However, during shutdown we don't have a follow-up write request to cancel
     * this operation and we have to give it at least some time to complete. The solution is that
-     * we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MS
+     * we register a way to abort the last call during shutdown, and after NM_SHUTDOWN_TIMEOUT_MAX_MSEC
     * grace period we pull the plug and cancel it. */

    op  = g_slice_new(SriovOp);
--- a/src/core/dns/nm-dns-dnsmasq.c
+++ b/src/core/dns/nm-dns-dnsmasq.c
@ -39,10 +39,10 @@
 #define _NMLOG(level, ...) __NMLOG_DEFAULT(level, _NMLOG_DOMAIN, "dnsmasq", __VA_ARGS__)

 #define WAIT_MSEC_AFTER_SIGTERM 1000
-G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MS);
+G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGTERM <= NM_SHUTDOWN_TIMEOUT_MAX_MSEC);

 #define WAIT_MSEC_AFTER_SIGKILL 400
-G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG);
+G_STATIC_ASSERT(WAIT_MSEC_AFTER_SIGKILL + 100 <= NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC);

 typedef void (*GlPidSpawnAsyncNotify)(GCancellable *cancellable,
                                      GPid          pid,
--- a/src/core/nm-firewall-utils.c
+++ b/src/core/nm-firewall-utils.c
@ -422,7 +422,7 @@ _fw_nft_call_communicate_cb(GObject *source, GAsyncResult *result, gpointer user
                nm_g_main_context_push_thread_default_if_necessary(NULL);

            nm_shutdown_wait_obj_register_object(call_data->subprocess, "nft-terminate");
-            G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_MS_WATCHDOG * 2 / 3);
+            G_STATIC_ASSERT_EXPR(200 < NM_SHUTDOWN_TIMEOUT_ADDITIONAL_MSEC * 2 / 3);
            nm_g_subprocess_terminate_in_background(call_data->subprocess, 200);
        }
    } else if (g_subprocess_get_successful(call_data->subprocess)) {
@ -546,7 +546,7 @@ _fw_nft_call(GBytes             *stdin_buf,
                                   call_data);

    call_data->timeout_source =
-        nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_MS * 2) / 3,
+        nm_g_source_attach(nm_g_timeout_source_new((NM_SHUTDOWN_TIMEOUT_1500_MSEC * 2) / 3,
                                                   G_PRIORITY_DEFAULT,
                                                   _fw_nft_call_timeout_cb,
                                                   call_data,
--- a/src/core/nm-pacrunner-manager.c
+++ b/src/core/nm-pacrunner-manager.c
@ -291,7 +291,7 @@ _call_destroy_proxy_configuration(NMPacrunnerManager *self,
                           g_variant_new("(o)", path),
                           G_VARIANT_TYPE("()"),
                           G_DBUS_CALL_FLAGS_NO_AUTO_START,
-                           NM_SHUTDOWN_TIMEOUT_MS,
+                           NM_SHUTDOWN_TIMEOUT_1500_MSEC,
                           priv->cancellable,
                           _call_destroy_proxy_configuration_cb,
                           conf_id_ref(conf_id));
@ -315,7 +315,7 @@ _call_create_proxy_configuration(NMPacrunnerManager *self,
                           conf_id->parameters,
                           G_VARIANT_TYPE("(o)"),
                           G_DBUS_CALL_FLAGS_NO_AUTO_START,
-                           NM_SHUTDOWN_TIMEOUT_MS,
+                           NM_SHUTDOWN_TIMEOUT_1500_MSEC,
                           priv->cancellable,
                           _call_create_proxy_configuration_cb,
                           conf_id_ref(conf_id));
--- a/src/core/ppp/nm-ppp-manager.c
+++ b/src/core/ppp/nm-ppp-manager.c
@ -1246,7 +1246,7 @@ _ppp_manager_stop(NMPPPManager            *self,
                              SIGTERM,
                              LOGD_PPP,
                              "pppd",
-                              5000,
+                              NM_SHUTDOWN_TIMEOUT_5000_MSEC,
                              _stop_child_cb,
                              handle);

--- a/src/core/settings/nm-secret-agent.c
+++ b/src/core/settings/nm-secret-agent.c
@ -511,7 +511,7 @@ nm_secret_agent_cancel_call(NMSecretAgent *self, NMSecretAgentCallId *call_id)
            g_variant_new("(os)", call_id->path, call_id->setting_name),
            G_VARIANT_TYPE("()"),
            G_DBUS_CALL_FLAGS_NO_AUTO_START,
-            NM_SHUTDOWN_TIMEOUT_MS,
+            NM_SHUTDOWN_TIMEOUT_1500_MSEC,
            NULL, /* this operation is not cancellable. We rely on the timeout. */
            _call_cancel_cb,
            call_id);