pid1: add ProtectSystem= as system-wide configuration, and default it to true in the initrd

This adds a new ProtectSystem= setting that mirrors the option of the
same name for services, but in a more restrictive way. If enabled, it will
remount /usr/ read-only, very early at boot. It also takes the special
value "auto" (which is the default), which is equivalent to true in the
initrd, and false otherwise.

Unlike the per-service option we don't support full/strict modes, but
the door is open to eventually support that too if it makes sense. It's
not entirely trivial though as we have very little mounted this early,
and hence the mechanism might not apply 1:1. Hence this PR is a
conservative first step.

My primary goal with this is to lock down initrds a bit, since they
conceptually are mostly immutable, but they are unpacked into a mutable
tmpfs. Let's tighten the screws a bit on that, and at least make /usr/
immutable.

This is particularly nice on USIs (i.e. Unified System Images, that pack
a whole OS into a UKI without transitioning out of it), such as
diskomator.
This commit is contained in:
Lennart Poettering 2023-11-29 18:52:28 +01:00
parent 8e3dc737b2
commit ffc1ec73b3
3 changed files with 89 additions and 1 deletions

View file

@ -289,6 +289,20 @@
<xi:include href="version-info.xml" xpointer="v239"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>ProtectSystem=</varname></term>
<listitem><para>Takes a boolean argument or the string <literal>auto</literal>. If set to true this
will remount <filename>/usr/</filename> read-only. If set to <literal>auto</literal> (the default),
this is equivalent to true when running in an initrd, and to false otherwise. This implements a
restricted subset of the per-unit setting of the same name, see
<citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
details: currently, the <literal>full</literal> and <literal>strict</literal> values are not
supported.</para>
<xi:include href="version-info.xml" xpointer="v256"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>SystemCallArchitectures=</varname></term>

View file

@ -68,6 +68,7 @@
#include "manager-serialize.h"
#include "mkdir-label.h"
#include "mount-setup.h"
#include "mount-util.h"
#include "os-util.h"
#include "pager.h"
#include "parse-argument.h"
@ -140,6 +141,7 @@ static char **arg_default_environment;
static char **arg_manager_environment;
static uint64_t arg_capability_bounding_set;
static bool arg_no_new_privs;
static int arg_protect_system;
static nsec_t arg_timer_slack_nsec;
static Set* arg_syscall_archs;
static FILE* arg_serialization;
@ -610,6 +612,43 @@ static int config_parse_oom_score_adjust(
return 0;
}
/* Config parser callback for the manager-level ProtectSystem= setting.
 *
 * 'data' must point to an int, which receives a tristate:
 *     -1  → "auto" (also selected by an empty value)
 *      0  → disabled
 *      1  → enabled
 *
 * 'unit', 'filename' and 'line' are only used to give the warning some context; 'section',
 * 'section_line', 'lvalue', 'ltype' and 'userdata' are not looked at.
 *
 * Always returns 0: a value that cannot be parsed is logged at warning level and otherwise ignored,
 * leaving the previously stored value untouched. */
static int config_parse_protect_system_pid1(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        int *setting = ASSERT_PTR(data);
        int parsed;

        /* The per-service ProtectSystem= setting served as the model here, but this variant is narrower
         * in one respect and more automatic in another: only yes/no are understood ("strict" and "full"
         * are not), and unless configured otherwise the protection is turned on automatically in the
         * initrd.
         *
         * Getting closer to what the per-service setting offers might happen later, but is not trivial
         * because of ordering constraints: at the moment this logic is applied there is not much
         * mounted besides /usr/. */

        if (isempty(rvalue) || streq(rvalue, "auto"))
                parsed = -1;
        else {
                parsed = parse_boolean(rvalue);
                if (parsed < 0) {
                        log_syntax(unit, LOG_WARNING, filename, line, parsed, "Failed to parse ProtectSystem= argument '%s', ignoring: %m", rvalue);
                        return 0;
                }
        }

        *setting = parsed;
        return 0;
}
static int parse_config_file(void) {
const ConfigTableItem items[] = {
{ "Manager", "LogLevel", config_parse_level2, 0, NULL },
@ -637,6 +676,7 @@ static int parse_config_file(void) {
{ "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
{ "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
{ "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
{ "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system },
#if HAVE_SECCOMP
{ "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
#else
@ -1684,6 +1724,35 @@ static void initialize_core_pattern(bool skip_setup) {
arg_early_core_pattern);
}
/* Applies the manager-level ProtectSystem= setting by remounting /usr/ read-only.
 *
 * Nothing is done if 'skip_setup' is set, if we are not PID 1, if the setting is off (0), or if it is
 * "auto" (negative) and we are not running in an initrd. Every failure is logged and otherwise ignored,
 * i.e. this is best-effort. */
static void apply_protect_system(bool skip_setup) {
        int r;

        if (skip_setup)
                return;

        if (getpid_cached() != 1)
                return;

        if (arg_protect_system == 0)
                return;

        /* A negative value means "auto", which only enables the protection inside an initrd. */
        if (arg_protect_system < 0 && !in_initrd()) {
                log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
                return;
        }

        /* The read-only remount below is applied to /usr as a mount, so make sure it is one first. */
        r = make_mount_point("/usr");
        if (r < 0) {
                log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
                return;
        }

        /* Bind-remount with MS_RDONLY; on failure the helper is asked to log at warning level itself. */
        r = mount_nofollow_verbose(
                        LOG_WARNING,
                        /* what= */ NULL,
                        "/usr",
                        /* fstype= */ NULL,
                        MS_BIND|MS_REMOUNT|MS_RDONLY,
                        /* options= */ NULL);
        if (r < 0)
                return;

        log_info("Successfully made /usr/ read-only.");
}
static void update_cpu_affinity(bool skip_setup) {
_cleanup_free_ char *mask = NULL;
@ -2531,6 +2600,7 @@ static void reset_arguments(void) {
arg_capability_bounding_set = CAP_MASK_UNSET;
arg_no_new_privs = false;
arg_protect_system = -1;
arg_timer_slack_nsec = NSEC_INFINITY;
arg_syscall_archs = set_free(arg_syscall_archs);
@ -3040,9 +3110,12 @@ int main(int argc, char *argv[]) {
cmdline_take_random_seed();
}
/* A core pattern might have been specified via the cmdline. */
/* A core pattern might have been specified via the cmdline. */
initialize_core_pattern(skip_setup);
/* Make /usr/ read-only */
apply_protect_system(skip_setup);
/* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
log_close();

View file

@ -39,6 +39,7 @@
#WatchdogDevice=
#CapabilityBoundingSet=
#NoNewPrivileges=no
#ProtectSystem=auto
#SystemCallArchitectures=
#TimerSlackNSec=
#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}