coredump: check cgroups memory limit if storing on tmpfs

When /var/lib/systemd/coredump/ is backed by a tmpfs, all disk usage
will be accounted under the systemd-coredump process cgroup memory
limit.
If MemoryMax is set, this might cause systemd-coredump to be terminated
by the kernel OOM handler when writing large uncompressed core files,
even if the compressed core would fit within the limits.

Detect if a tmpfs is used, and if so check MemoryMax from the process
and slice cgroups, and do not write uncompressed core files that are
greater than half the available memory. If the limit is breached,
stop writing and compress the written chunk immediately, then delete
the uncompressed chunk to free more memory, and resume compressing
directly from STDIN.

Example debug log when this situation happens:

systemd-coredump[737455]: Setting max_size to limit writes to 51344896 bytes.
systemd-coredump[737455]: ZSTD compression finished (51344896 -> 3260 bytes, 0.0%)
systemd-coredump[737455]: ZSTD compression finished (1022786048 -> 47245 bytes, 0.0%)
systemd-coredump[737455]: Process 737445 (a.out) of user 1000 dumped core.
This commit is contained in:
Luca Boccassi 2021-03-12 20:17:09 +00:00
parent 5b6f8e13ad
commit 587f2a5e56
2 changed files with 147 additions and 81 deletions

View file

@ -99,7 +99,7 @@
<term><varname>ExternalSizeMax=</varname></term>
<term><varname>JournalSizeMax=</varname></term>
<listitem><para>The maximum (uncompressed) size in bytes of a
<listitem><para>The maximum (compressed or uncompressed) size in bytes of a
core to be saved. Unit suffixes are allowed just as in
<option>ProcessSizeMax=</option>.</para></listitem>
</varlistentry>

View file

@ -3,6 +3,7 @@
#include <errno.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/statvfs.h>
#include <sys/xattr.h>
#include <unistd.h>
@ -18,6 +19,7 @@
#include "acl-util.h"
#include "alloc-util.h"
#include "bus-error.h"
#include "capability-util.h"
#include "cgroup-util.h"
#include "compress.h"
@ -42,6 +44,7 @@
#include "socket-util.h"
#include "special.h"
#include "stacktrace.h"
#include "stat-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strv.h"
@ -63,6 +66,10 @@
#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
#endif
/* When checking for available memory and setting lower limits, don't
* go below 4MB for writing core files to storage. */
#define PROCESS_SIZE_MIN (4U*1024U*1024U)
/* Make sure to not make this larger than the maximum journal entry
* size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
@ -329,11 +336,14 @@ static int save_external_coredump(
int *ret_node_fd,
int *ret_data_fd,
uint64_t *ret_size,
uint64_t *ret_compressed_size,
bool *ret_truncated) {
_cleanup_free_ char *fn = NULL, *tmp = NULL;
_cleanup_(unlink_and_freep) char *tmp = NULL;
_cleanup_free_ char *fn = NULL;
_cleanup_close_ int fd = -1;
uint64_t rlimit, process_limit, max_size;
bool truncated, storage_on_tmpfs;
struct stat st;
uid_t uid;
int r;
@ -343,6 +353,8 @@ static int save_external_coredump(
assert(ret_node_fd);
assert(ret_data_fd);
assert(ret_size);
assert(ret_compressed_size);
assert(ret_truncated);
r = parse_uid(context->meta[META_ARGV_UID], &uid);
if (r < 0)
@ -379,92 +391,145 @@ static int save_external_coredump(
if (fd < 0)
return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
r = copy_bytes(input_fd, fd, max_size, 0);
if (r < 0) {
log_error_errno(r, "Cannot store coredump of %s (%s): %m",
context->meta[META_ARGV_PID], context->meta[META_COMM]);
goto fail;
/* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
* the service or the slice it belongs to. This is common on low-resources systems,
* to avoid crashing processes to take away too many system resources.
* Check the cgroup settings, and set max_size to a bit less than half of the
* available memory left to the process.
* Then, attempt to write the core file uncompressed first - if the write gets
* interrupted, we know we won't be able to write it all, so instead compress what
* was written so far, delete the uncompressed truncated core, and then continue
* compressing from STDIN. Given the compressed core cannot be larger than the
* uncompressed one, and 1KB for metadata is accounted for in the calculation, we
* should be able to at least store the full compressed core file. */
storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
if (storage_on_tmpfs && arg_compress) {
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
uint64_t cgroup_limit = UINT64_MAX;
struct statvfs sv;
/* If we can't get the cgroup limit, just ignore it, but don't fail,
* try anyway with the config settings. */
r = sd_bus_default_system(&bus);
if (r < 0)
log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
else {
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
r = sd_bus_get_property_trivial(
bus,
"org.freedesktop.systemd1",
"/org/freedesktop/systemd1/unit/self",
"org.freedesktop.systemd1.Service",
"MemoryAvailable",
&error,
't', &cgroup_limit);
if (r < 0)
log_warning_errno(r,
"Failed to query MemoryAvailable for current unit, "
"falling back to static config settings: %s",
bus_error_message(&error, r));
}
max_size = MIN(cgroup_limit, max_size);
max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
/* tmpfs might get full quickly, so check the available space too.
* But don't worry about errors here, failing to access the storage
* location will be better logged when writing to it. */
if (statvfs("/var/lib/systemd/coredump/", &sv) >= 0)
max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
}
*ret_truncated = r == 1;
if (*ret_truncated)
r = copy_bytes(input_fd, fd, max_size, 0);
if (r < 0)
return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
context->meta[META_ARGV_PID], context->meta[META_COMM]);
truncated = r == 1;
#if HAVE_COMPRESSION
if (arg_compress) {
_cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
_cleanup_free_ char *fn_compressed = NULL;
_cleanup_close_ int fd_compressed = -1;
uint64_t uncompressed_size = 0;
if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
fn_compressed = strjoin(fn, COMPRESSED_EXT);
if (!fn_compressed)
return log_oom();
fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
if (fd_compressed < 0)
return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
if (r < 0)
return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
if (truncated && storage_on_tmpfs) {
uint64_t partial_uncompressed_size = 0;
/* Uncompressed write was truncated and we are writing to tmpfs: delete
* the uncompressed core, and compress the remaining part from STDIN. */
tmp = unlink_and_free(tmp);
fd = safe_close(fd);
r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
if (r < 0)
return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
uncompressed_size += partial_uncompressed_size;
}
r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid);
if (r < 0)
return r;
if (fstat(fd_compressed, &st) < 0)
return log_error_errno(errno,
"Failed to fstat core file %s: %m",
coredump_tmpfile_name(tmp_compressed));
*ret_filename = TAKE_PTR(fn_compressed); /* compressed */
*ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
*ret_compressed_size = (uint64_t) st.st_size; /* compressed */
*ret_data_fd = TAKE_FD(fd);
*ret_size = uncompressed_size;
*ret_truncated = truncated;
tmp_compressed = mfree(tmp_compressed);
return 0;
}
#endif
if (truncated)
log_struct(LOG_INFO,
LOG_MESSAGE("Core file was truncated to %zu bytes.", max_size),
"SIZE_LIMIT=%zu", max_size,
"MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
if (fstat(fd, &st) < 0) {
log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
goto fail;
}
if (lseek(fd, 0, SEEK_SET) == (off_t) -1) {
log_error_errno(errno, "Failed to seek on %s: %m", coredump_tmpfile_name(tmp));
goto fail;
}
#if HAVE_COMPRESSION
/* If we will remove the coredump anyway, do not compress. */
if (arg_compress && !maybe_remove_external_coredump(NULL, st.st_size)) {
_cleanup_free_ char *fn_compressed = NULL, *tmp_compressed = NULL;
_cleanup_close_ int fd_compressed = -1;
fn_compressed = strjoin(fn, COMPRESSED_EXT);
if (!fn_compressed) {
log_oom();
goto uncompressed;
}
fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
if (fd_compressed < 0) {
log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
goto uncompressed;
}
r = compress_stream(fd, fd_compressed, -1);
if (r < 0) {
log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
goto fail_compressed;
}
r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid);
if (r < 0)
goto fail_compressed;
/* OK, this worked, we can get rid of the uncompressed version now */
if (tmp)
unlink_noerrno(tmp);
*ret_filename = TAKE_PTR(fn_compressed); /* compressed */
*ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
*ret_data_fd = TAKE_FD(fd); /* uncompressed */
*ret_size = (uint64_t) st.st_size; /* uncompressed */
return 0;
fail_compressed:
if (tmp_compressed)
(void) unlink(tmp_compressed);
}
uncompressed:
#endif
r = fix_permissions(fd, tmp, fn, context, uid);
if (r < 0)
goto fail;
return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
if (fstat(fd, &st) < 0)
return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
*ret_filename = TAKE_PTR(fn);
*ret_data_fd = TAKE_FD(fd);
*ret_node_fd = -1;
*ret_size = (uint64_t) st.st_size;
*ret_truncated = truncated;
return 0;
fail:
if (tmp)
(void) unlink(tmp);
return r;
}
static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
@ -709,7 +774,7 @@ static int submit_coredump(
_cleanup_free_ char *stacktrace = NULL;
char *core_message;
const char *module_name;
uint64_t coredump_size = UINT64_MAX;
uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
bool truncated = false;
JsonVariant *module_json;
int r;
@ -722,7 +787,8 @@ static int submit_coredump(
/* Always stream the coredump to disk, if that's possible */
r = save_external_coredump(context, input_fd,
&filename, &coredump_node_fd, &coredump_fd, &coredump_size, &truncated);
&filename, &coredump_node_fd, &coredump_fd,
&coredump_size, &coredump_compressed_size, &truncated);
if (r < 0)
/* Skip whole core dumping part */
goto log;
@ -730,7 +796,7 @@ static int submit_coredump(
/* If we don't want to keep the coredump on disk, remove it now, as later on we
* will lack the privileges for it. However, we keep the fd to it, so that we can
* still process it and log it. */
r = maybe_remove_external_coredump(filename, coredump_size);
r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
if (r < 0)
return r;
if (r == 0) {
@ -738,7 +804,7 @@ static int submit_coredump(
} else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
coredump_size, arg_external_size_max);
coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
/* Vacuum again, but exclude the coredump we just created */
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
@ -758,7 +824,7 @@ static int submit_coredump(
log_debug("Not generating stack trace: core size %"PRIu64" is greater "
"than %"PRIu64" (the configured maximum)",
coredump_size, arg_process_size_max);
} else
} else if (coredump_fd >= 0)
coredump_parse_core(coredump_fd, context->meta[META_EXE], &stacktrace, &json_metadata);
#endif
@ -812,7 +878,7 @@ log:
}
/* Optionally store the entire coredump in the journal */
if (arg_storage == COREDUMP_STORAGE_JOURNAL) {
if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
if (coredump_size <= arg_journal_size_max) {
size_t sz = 0;