Merge pull request #20650 from fbuihuu/watchdog-rework

Watchdog rework
This commit is contained in:
Luca Boccassi 2021-09-15 14:44:49 +01:00 committed by GitHub
commit 8f8e9ad7cb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 79 additions and 101 deletions

View file

@ -1534,14 +1534,12 @@ static int become_shutdown(
else if (streq(shutdown_verb, "kexec"))
watchdog_timer = arg_kexec_watchdog;
if (watchdog_timer > 0 && watchdog_timer != USEC_INFINITY) {
if (timestamp_is_set(watchdog_timer)) {
char *e;
/* If we reboot or kexec let's set the shutdown
* watchdog and tell the shutdown binary to
* repeatedly ping it */
r = watchdog_set_timeout(&watchdog_timer);
/* If we reboot or kexec let's set the shutdown watchdog and
* tell the shutdown binary to repeatedly ping it */
r = watchdog_setup(watchdog_timer);
watchdog_close(r < 0);
/* Tell the binary how often to ping, ignore failure */
@ -1554,9 +1552,8 @@ static int become_shutdown(
} else
watchdog_close(true);
/* Avoid the creation of new processes forked by the
* kernel; at this point, we will not listen to the
* signals anyway */
/* Avoid the creation of new processes forked by the kernel; at this
* point, we will not listen to the signals anyway */
if (detect_container() <= 0)
(void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);

View file

@ -2978,13 +2978,8 @@ int manager_loop(Manager *m) {
return log_error_errno(r, "Failed to enable SIGCHLD event source: %m");
while (m->objective == MANAGER_OK) {
usec_t wait_usec, watchdog_usec;
watchdog_usec = manager_get_watchdog(m, WATCHDOG_RUNTIME);
if (m->runtime_watchdog_running)
(void) watchdog_ping();
else if (timestamp_is_set(watchdog_usec))
manager_retry_runtime_watchdog(m);
(void) watchdog_ping();
if (!ratelimit_below(&rl)) {
/* Yay, something is going seriously wrong, pause a little */
@ -3020,12 +3015,7 @@ int manager_loop(Manager *m) {
continue;
/* Sleep for watchdog runtime wait time */
if (timestamp_is_set(watchdog_usec))
wait_usec = watchdog_runtime_wait();
else
wait_usec = USEC_INFINITY;
r = sd_event_run(m->event, wait_usec);
r = sd_event_run(m->event, watchdog_runtime_wait());
if (r < 0)
return log_error_errno(r, "Failed to run event loop: %m");
}
@ -3203,7 +3193,6 @@ usec_t manager_get_watchdog(Manager *m, WatchdogType t) {
}
void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
int r = 0;
assert(m);
@ -3215,22 +3204,16 @@ void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
if (t == WATCHDOG_RUNTIME)
if (!timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])) {
if (timestamp_is_set(timeout)) {
r = watchdog_set_timeout(&timeout);
if (r >= 0)
m->runtime_watchdog_running = true;
} else {
if (timestamp_is_set(timeout))
(void) watchdog_setup(timeout);
else
watchdog_close(true);
m->runtime_watchdog_running = false;
}
}
m->watchdog[t] = timeout;
}
int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
int r = 0;
assert(m);
@ -3241,39 +3224,18 @@ int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout) {
return 0;
if (t == WATCHDOG_RUNTIME) {
usec_t *p;
usec_t usec = timestamp_is_set(timeout) ? timeout : m->watchdog[t];
p = timestamp_is_set(timeout) ? &timeout : &m->watchdog[t];
if (timestamp_is_set(*p)) {
r = watchdog_set_timeout(p);
if (r >= 0)
m->runtime_watchdog_running = true;
} else {
if (timestamp_is_set(usec))
(void) watchdog_setup(usec);
else
watchdog_close(true);
m->runtime_watchdog_running = false;
}
}
m->watchdog_overridden[t] = timeout;
return 0;
}
/* Tries once more to program the runtime watchdog timeout. The overridden
 * runtime timeout is used when one is set, otherwise the regular one. The
 * manager is flagged as having a running runtime watchdog only when
 * watchdog_set_timeout() did not report an error. */
void manager_retry_runtime_watchdog(Manager *m) {
        int r;

        assert(m);

        /* Pass the slot by address: watchdog_set_timeout() takes a pointer to the timeout. */
        r = timestamp_is_set(m->watchdog_overridden[WATCHDOG_RUNTIME])
                ? watchdog_set_timeout(&m->watchdog_overridden[WATCHDOG_RUNTIME])
                : watchdog_set_timeout(&m->watchdog[WATCHDOG_RUNTIME]);
        if (r < 0)
                return;

        m->runtime_watchdog_running = true;
}
int manager_reload(Manager *m) {
_cleanup_(manager_reloading_stopp) Manager *reloading = NULL;
_cleanup_fdset_free_ FDSet *fds = NULL;

View file

@ -247,8 +247,6 @@ struct Manager {
usec_t watchdog[_WATCHDOG_TYPE_MAX];
usec_t watchdog_overridden[_WATCHDOG_TYPE_MAX];
bool runtime_watchdog_running; /* Whether the runtime HW watchdog was started, so we know if we still need to get the real timeout from the hardware */
dual_timestamp timestamps[_MANAGER_TIMESTAMP_MAX];
/* Data specific to the device subsystem */
@ -565,7 +563,6 @@ ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
usec_t manager_get_watchdog(Manager *m, WatchdogType t);
void manager_set_watchdog(Manager *m, WatchdogType t, usec_t timeout);
int manager_override_watchdog(Manager *m, WatchdogType t, usec_t timeout);
void manager_retry_runtime_watchdog(Manager *m);
const char* oom_policy_to_string(OOMPolicy i) _const_;
OOMPolicy oom_policy_from_string(const char *s) _pure_;

View file

@ -36,11 +36,16 @@ static int update_timeout(void) {
usec_t t;
t = DIV_ROUND_UP(watchdog_timeout, USEC_PER_SEC);
sec = (int) t >= INT_MAX ? INT_MAX : t; /* Saturate */
sec = MIN(t, (usec_t) INT_MAX); /* Saturate */
if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &sec) < 0)
return log_warning_errno(errno, "Failed to set timeout to %is: %m", sec);
log_info("Set hardware watchdog to %s.", FORMAT_TIMESPAN(sec * USEC_PER_SEC, 0));
/* Just in case the driver is buggy */
assert(sec > 0);
/* watchdog_timeout stores the actual timeout used by the HW */
watchdog_timeout = sec * USEC_PER_SEC;
log_info("Set hardware watchdog to %s.", FORMAT_TIMESPAN(watchdog_timeout, 0));
flags = WDIOS_ENABLECARD;
if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &flags) < 0) {
@ -83,7 +88,7 @@ static int open_watchdog(void) {
return update_timeout();
}
int watchdog_set_device(char *path) {
int watchdog_set_device(const char *path) {
int r;
r = free_and_strdup(&watchdog_device, path);
@ -96,45 +101,49 @@ int watchdog_set_device(char *path) {
return r;
}
int watchdog_set_timeout(usec_t *usec) {
int r;
int watchdog_setup(usec_t timeout) {
watchdog_timeout = *usec;
/* Initialize the watchdog timeout with the caller value. This value is
* going to be updated by update_timeout() with the closest value
* supported by the driver */
watchdog_timeout = timeout;
/* If we didn't open the watchdog yet and didn't get any explicit timeout value set, don't do
* anything */
/* If we didn't open the watchdog yet and didn't get any explicit
* timeout value set, don't do anything */
if (watchdog_fd < 0 && watchdog_timeout == USEC_INFINITY)
return 0;
if (watchdog_fd < 0)
r = open_watchdog();
else
r = update_timeout();
return open_watchdog();
*usec = watchdog_timeout;
return r;
return update_timeout();
}
usec_t watchdog_runtime_wait(void) {
usec_t rtwait, ntime;
if (!timestamp_is_set(watchdog_timeout))
return USEC_INFINITY;
/* Sleep half the watchdog timeout since the last successful ping at most */
if (timestamp_is_set(watchdog_last_ping)) {
ntime = now(clock_boottime_or_monotonic());
assert(ntime >= watchdog_last_ping);
rtwait = usec_sub_unsigned(watchdog_last_ping + (watchdog_timeout / 2), ntime);
} else
rtwait = watchdog_timeout / 2;
usec_t ntime = now(clock_boottime_or_monotonic());
return rtwait;
assert(ntime >= watchdog_last_ping);
return usec_sub_unsigned(watchdog_last_ping + (watchdog_timeout / 2), ntime);
}
return watchdog_timeout / 2;
}
int watchdog_ping(void) {
usec_t ntime;
int r;
if (!timestamp_is_set(watchdog_timeout))
return 0;
if (watchdog_fd < 0)
/* open_watchdog() will automatically ping the device for us if necessary */
return open_watchdog();
ntime = now(clock_boottime_or_monotonic());
@ -146,12 +155,6 @@ int watchdog_ping(void) {
return 0;
}
if (watchdog_fd < 0) {
r = open_watchdog();
if (r < 0)
return r;
}
if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) < 0)
return log_warning_errno(errno, "Failed to ping hardware watchdog: %m");
@ -186,4 +189,8 @@ void watchdog_close(bool disarm) {
}
watchdog_fd = safe_close(watchdog_fd);
/* Once closed, pinging the device becomes a NOP and we request a new
* call to watchdog_setup() to open the device again. */
watchdog_timeout = USEC_INFINITY;
}

View file

@ -6,8 +6,8 @@
#include "time-util.h"
#include "util.h"
int watchdog_set_device(char *path);
int watchdog_set_timeout(usec_t *usec);
int watchdog_set_device(const char *path);
int watchdog_setup(usec_t timeout);
int watchdog_ping(void);
void watchdog_close(bool disarm);
usec_t watchdog_runtime_wait(void);

View file

@ -307,10 +307,33 @@ static void bump_sysctl_printk_log_level(int min_level) {
log_debug_errno(r, "Failed to bump kernel.printk to %i: %m", min_level + 1);
}
/* Sets up the watchdog from the environment of the shutdown binary:
 *
 *   WATCHDOG_DEVICE  if set, is passed to watchdog_set_device()
 *   WATCHDOG_USEC    if set, is parsed as an unsigned 64-bit integer and
 *                    passed to watchdog_setup()
 *
 * Nothing here is fatal: a device or parse failure is logged as a warning,
 * and the result of watchdog_setup() is deliberately ignored. */
static void init_watchdog(void) {
        const char *device, *timeout;
        usec_t usec;
        int r;

        device = getenv("WATCHDOG_DEVICE");
        if (device) {
                r = watchdog_set_device(device);
                if (r < 0)
                        log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", device);
        }

        timeout = getenv("WATCHDOG_USEC");
        if (!timeout)
                return;

        r = safe_atou64(timeout, &usec);
        if (r < 0) {
                log_warning_errno(r, "Failed to parse watchdog timeout '%s', ignoring: %m", timeout);
                return;
        }

        /* Best effort: a failure to arm the watchdog must not block shutdown. */
        (void) watchdog_setup(usec);
}
int main(int argc, char *argv[]) {
bool need_umount, need_swapoff, need_loop_detach, need_dm_detach, need_md_detach, in_container, use_watchdog = false, can_initrd;
bool need_umount, need_swapoff, need_loop_detach, need_dm_detach, need_md_detach, in_container, can_initrd;
_cleanup_free_ char *cgroup = NULL;
char *arguments[3], *watchdog_device;
char *arguments[3];
int cmd, r, umount_log_level = LOG_INFO;
static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};
@ -370,14 +393,7 @@ int main(int argc, char *argv[]) {
LOG_TARGET_KMSG))
bump_sysctl_printk_log_level(LOG_WARNING);
use_watchdog = getenv("WATCHDOG_USEC");
watchdog_device = getenv("WATCHDOG_DEVICE");
if (watchdog_device) {
r = watchdog_set_device(watchdog_device);
if (r < 0)
log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
watchdog_device);
}
init_watchdog();
/* Lock us into memory */
(void) mlockall(MCL_CURRENT|MCL_FUTURE);
@ -409,8 +425,7 @@ int main(int argc, char *argv[]) {
for (;;) {
bool changed = false;
if (use_watchdog)
(void) watchdog_ping();
(void) watchdog_ping();
/* Let's trim the cgroup tree on each iteration so
that we leave an empty cgroup tree around, so that

View file

@ -20,7 +20,7 @@ int main(int argc, char *argv[]) {
t = slow ? 10 * USEC_PER_SEC : 1 * USEC_PER_SEC;
count = slow ? 5 : 3;
r = watchdog_set_timeout(&t);
r = watchdog_setup(t);
if (r < 0)
log_warning_errno(r, "Failed to open watchdog: %m");
if (r == -EPERM)