From 1761066b135f1a322c446f102343ea4aa61fe3ee Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 27 Oct 2023 14:25:49 +0200 Subject: [PATCH] storagetm: add new systemd-storagetm component This implements a "storage target mode", similar to what MacOS provides since a long time as "Target Disk Mode": https://en.wikipedia.org/wiki/Target_Disk_Mode This implementation is relatively simple: 1. a new generic target "storage-target-mode.target" is added, which when booted into defines the target mode. 2. a small tool and service "systemd-storagetm.service" is added which exposes a specific device or all devices as NVMe-TCP devices over the network. NVMe-TCP appears to be hot shit right now how to expose block devices over the network. And it's really simple to set up via configs, hence our code is relatively short and neat. The idea is that systemd-storagetm.target can be extended sooner or later, for example to expose block devices also as USB mass storage devices and similar, in case the system has "dual mode" USB controller that can also work as device, not just as host. (And people could also plug in sharing as NBD, iSCSI, whatever they want.) How to use this? Boot into your system with a kernel cmdline of "rd.systemd.unit=storage-target-mode.target ip=link-local", and you'll see on screen the precise "nvme connect" command line to make the relevant block devices available locally on some other machine. This all requires that the target mode stuff is included in the initrd of course. And the system will the stay in the initrd forever. Why bother? Primarily three use-cases: 1. Debug a broken system: with very few dependencies during boot get access to the raw block device of a broken machine. 2. Migrate from system to another system, by dd'ing the old to the new directly. 3. Installing an OS remotely on some device (for example via Thunderbolt networking) (And there might be more, for example the ability to boot from a laptop's disk on another system) Limitations: 1. There's no authentication/encryption. Hence: use this on local links only. 2. NVMe target mode on Linux supports r/w operation only. Ideally, we'd have a read-only mode, for security reasons, and default to it. Future love: 1. We should have another mode, where we simply expose the homed LUKS home dirs like that. 2. Some lightweight hookup with plymouth, to display a (shortened) version of the info we write to the console. To test all this, just run: mkosi --kernel-command-line-extra="rd.systemd.unit=storage-target-mode.target" qemu --- meson.build | 6 +- meson_options.txt | 2 + src/storagetm/meson.build | 11 + src/storagetm/storagetm.c | 1047 ++++++++++++++++++++++++++++ units/meson.build | 8 + units/storage-target-mode.target | 16 + units/systemd-storagetm.service.in | 27 + 7 files changed, 1116 insertions(+), 1 deletion(-) create mode 100644 src/storagetm/meson.build create mode 100644 src/storagetm/storagetm.c create mode 100644 units/storage-target-mode.target create mode 100644 units/systemd-storagetm.service.in diff --git a/meson.build b/meson.build index 5f1331e6f33..5ac15d83562 100644 --- a/meson.build +++ b/meson.build @@ -1533,6 +1533,8 @@ have = get_option('sysupdate').require( error_message : 'fdisk and openssl required').allowed() conf.set10('ENABLE_SYSUPDATE', have) +conf.set10('ENABLE_STORAGETM', get_option('storagetm')) + have = get_option('importd').require( conf.get('HAVE_LIBCURL') == 1 and conf.get('HAVE_OPENSSL_OR_GCRYPT') == 1 and @@ -2196,10 +2198,11 @@ subdir('src/systemctl') subdir('src/sysupdate') subdir('src/sysusers') subdir('src/sysv-generator') +subdir('src/storagetm') subdir('src/timedate') subdir('src/timesync') -subdir('src/tpm2-setup') subdir('src/tmpfiles') +subdir('src/tpm2-setup') subdir('src/tty-ask-password-agent') subdir('src/update-done') subdir('src/update-utmp') @@ -2793,6 +2796,7 @@ foreach tuple : [ ['systemd-analyze', conf.get('ENABLE_ANALYZE') == 1], ['sysupdate'], ['sysusers'], + ['storagetm'], ['timedated'], ['timesyncd'], ['tmpfiles'], diff --git a/meson_options.txt b/meson_options.txt index 831d23e299a..c31e3cd4d71 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -158,6 +158,8 @@ option('quotacheck', type : 'boolean', description : 'support for the quotacheck tools') option('sysusers', type : 'boolean', description : 'support for the sysusers configuration') +option('storagetm', type : 'boolean', + description : 'support for Storage Target Mode') option('tmpfiles', type : 'boolean', description : 'support for tmpfiles.d') option('importd', type : 'feature', deprecated : { 'true' : 'enabled', 'false' : 'disabled' }, diff --git a/src/storagetm/meson.build b/src/storagetm/meson.build new file mode 100644 index 00000000000..f95210af4cd --- /dev/null +++ b/src/storagetm/meson.build @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +executables += [ + libexec_template + { + 'name' : 'systemd-storagetm', + 'conditions' : [ + 'ENABLE_STORAGETM', + ], + 'sources' : files('storagetm.c'), + }, +] diff --git a/src/storagetm/storagetm.c b/src/storagetm/storagetm.c new file mode 100644 index 00000000000..dfde8484343 --- /dev/null +++ b/src/storagetm/storagetm.c @@ -0,0 +1,1047 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include + +#include "af-list.h" +#include "alloc-util.h" +#include "blockdev-util.h" +#include "build.h" +#include "daemon-util.h" +#include "device-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "local-addresses.h" +#include "loop-util.h" +#include "main-func.h" +#include "parse-argument.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "random-util.h" +#include "recurse-dir.h" +#include "socket-util.h" +#include "terminal-util.h" +#include "udev-util.h" + +static char **arg_devices = NULL; +static char *arg_nqn = NULL; +static int arg_all = 0; + +STATIC_DESTRUCTOR_REGISTER(arg_devices, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_nqn, freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("systemd-storagetm", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] [DEVICE...]\n" + "\n%sExpose a block device or regular file as NVMe-TCP volume.%s\n\n" + " -h --help Show this help\n" + " --version Show package version\n" + " --nqn=STRING Select NQN (NVMe Qualified Name)\n" + " -a --all Expose all devices\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + enum { + ARG_NQN = 0x100, + ARG_VERSION, + }; + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "nqn", required_argument, NULL, ARG_NQN }, + { "all", no_argument, NULL, 'a' }, + {} + }; + + int r, c; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, "ha", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case ARG_VERSION: + return version(); + + case ARG_NQN: + if (!filename_is_valid(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "NQN invalid: %s", optarg); + + if (free_and_strdup(&arg_nqn, optarg) < 0) + return log_oom(); + + break; + + case 'a': + arg_all++; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_all > 0) { + if (argc > optind) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expects no further arguments if --all/-a is specified."); + } else { + if (optind >= argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Expecting device name or --all/-a."); + + for (int i = optind; i < argc; i++) + if (!path_is_valid(argv[i])) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid path: %s", argv[i]); + + arg_devices = strv_copy(argv + optind); + } + + if (!arg_nqn) { + sd_id128_t id; + + r = sd_id128_get_machine_app_specific(SD_ID128_MAKE(b4,f9,4e,52,b8,e2,45,db,88,84,6e,2e,c3,f4,ef,18), &id); + if (r < 0) + return log_error_errno(r, "Failed to get machine ID: %m"); + + /* See NVM Express Base Specification 2.0c, 4.5 "NVMe Qualified Names" */ + if (asprintf(&arg_nqn, "nqn.2023-10.io.systemd:storagetm." SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(id)) < 0) + return log_oom(); + } + + return 1; +} + +typedef struct NvmeSubsystem { + char *name; + struct stat device_stat; + int device_fd; + int nvme_all_subsystems_fd; /* The /sys/kernel/config/nvmet/subsystems/ dir, that contains all subsystems */ + int nvme_our_subsystem_fd; /* Our private subsystem dir below it. */ + char *device; +} NvmeSubsystem; + +static NvmeSubsystem* nvme_subsystem_free(NvmeSubsystem *s) { + if (!s) + return NULL; + + free(s->name); + safe_close(s->nvme_all_subsystems_fd); + safe_close(s->nvme_our_subsystem_fd); + safe_close(s->device_fd); + free(s->device); + + return mfree(s); +} + +static int nvme_subsystem_unlink(NvmeSubsystem *s) { + int r; + + assert(s); + + if (s->nvme_our_subsystem_fd >= 0) { + _cleanup_close_ int namespaces_fd = -EBADF; + + namespaces_fd = openat(s->nvme_our_subsystem_fd, "namespaces", O_CLOEXEC|O_DIRECTORY|O_RDONLY); + if (namespaces_fd < 0) + log_warning_errno(errno, "Failed to open 'namespaces' directory of subsystem '%s': %m", s->name); + else { + _cleanup_free_ DirectoryEntries *de = NULL; + + r = readdir_all(namespaces_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + log_warning_errno(r, "Failed to read 'namespaces' dir of subsystem '%s', ignoring: %m", s->name); + else { + FOREACH_ARRAY(ee, de->entries, de->n_entries) { + _cleanup_free_ char *enable_fn = NULL; + const struct dirent *e = *ee; + + enable_fn = path_join(e->d_name, "enable"); + if (!enable_fn) + return log_oom(); + + r = write_string_file_at(namespaces_fd, enable_fn, "0", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + log_warning_errno(r, "Failed to disable namespace '%s' of NVME subsystem '%s', ignoring: %m", e->d_name, s->name); + + if (unlinkat(namespaces_fd, e->d_name, AT_REMOVEDIR) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to remove namespace '%s' of NVME subsystem '%s', ignoring: %m", e->d_name, s->name); + } + } + } + + s->nvme_our_subsystem_fd = safe_close(s->nvme_our_subsystem_fd); + } + + if (s->nvme_all_subsystems_fd >= 0 && s->name) { + if (unlinkat(s->nvme_all_subsystems_fd, s->name, AT_REMOVEDIR) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to remove NVME subsystem '%s', ignoring: %m", s->name); + + s->nvme_all_subsystems_fd = safe_close(s->nvme_all_subsystems_fd); /* Invalidate the subsystems/ dir fd, to remember we unlinked the thing already */ + + log_info("NVME subsystem '%s' removed.", s->name); + } + + return 0; +} + +static NvmeSubsystem *nvme_subsystem_destroy(NvmeSubsystem *s) { + if (!s) + return NULL; + + (void) nvme_subsystem_unlink(s); + + return nvme_subsystem_free(s); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NvmeSubsystem*, nvme_subsystem_destroy); + +static int nvme_subsystem_add(const char *node, int consumed_fd, NvmeSubsystem **ret) { + _cleanup_close_ int fd = consumed_fd; /* always take possession of the fd */ + int r; + + assert(node); + assert(ret); + + _cleanup_free_ char *fname = NULL; + r = path_extract_filename(node, &fname); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from path: %s", node); + + _cleanup_free_ char *j = NULL; + j = strjoin(arg_nqn, ".", fname); + if (!j) + return log_oom(); + + if (fd < 0) { + fd = RET_NERRNO(open(node, O_RDONLY|O_CLOEXEC|O_NONBLOCK)); + if (fd < 0) + return log_error_errno(fd, "Failed to open '%s': %m", node); + } + + struct stat st; + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to fstat '%s': %m", node); + if (!S_ISBLK(st.st_mode)) { + r = stat_verify_regular(&st); + if (r < 0) + return log_error_errno(r, "Not a block device or regular file, refusing: %s", node); + } + + /* Let's lock this device continuously while we are operating on it */ + r = lock_generic_with_timeout(fd, LOCK_BSD, LOCK_EX, 10 * USEC_PER_SEC); + if (r < 0) + return log_error_errno(r, "Failed to lock block device: %m"); + + _cleanup_close_ int subsystems_fd = -EBADF; + subsystems_fd = RET_NERRNO(open("/sys/kernel/config/nvmet/subsystems", O_DIRECTORY|O_CLOEXEC|O_RDONLY)); + if (subsystems_fd < 0) + return log_error_errno(subsystems_fd, "Failed to open /sys/kernel/config/nvmet/subsystems: %m"); + + _cleanup_close_ int subsystem_fd = -EBADF; + subsystem_fd = open_mkdir_at(subsystems_fd, j, O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (subsystem_fd < 0) + return log_error_errno(subsystem_fd, "Failed to create NVME subsystem '%s': %m", j); + + r = write_string_file_at(subsystem_fd, "attr_allow_any_host", "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set 'attr_allow_any_host' flag: %m"); + + _cleanup_close_ int namespace_fd = -EBADF; + namespace_fd = open_mkdir_at(subsystem_fd, "namespaces/1", O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (namespace_fd < 0) + return log_error_errno(namespace_fd, "Failed to create NVME namespace '1': %m"); + + /* We use /proc/$PID/fd/$FD rather than /proc/self/fd/$FD, because this string is visible to others + * via configfs, and by including the PID it's clear to who the stuff belongs. */ + r = write_string_file_at(namespace_fd, "device_path", FORMAT_PROC_PID_FD_PATH(0, fd), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write 'device_path' attribute: %m"); + + r = write_string_file_at(namespace_fd, "enable", "1", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to write 'enable' attribute: %m"); + + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *subsys = NULL; + + subsys = new(NvmeSubsystem, 1); + if (!subsys) + return log_oom(); + + *subsys = (NvmeSubsystem) { + .name = TAKE_PTR(j), + .device_fd = TAKE_FD(fd), + .nvme_all_subsystems_fd = TAKE_FD(subsystems_fd), + .nvme_our_subsystem_fd = TAKE_FD(subsystem_fd), + .device_stat = st, + }; + + subsys->device = strdup(node); + if (!subsys->device) + return log_oom(); + + *ret = TAKE_PTR(subsys); + return 0; +} + +typedef struct NvmePort { + uint16_t portnr; /* used for both the IP and the NVME port numer */ + + int nvme_port_fd; + int nvme_ports_fd; + + int ip_family; +} NvmePort; + +static NvmePort *nvme_port_free(NvmePort *p) { + if (!p) + return NULL; + + safe_close(p->nvme_port_fd); + safe_close(p->nvme_ports_fd); + + return mfree(p); +} + +static int nvme_port_unlink(NvmePort *p) { + int r, ret = 0; + + assert(p); + + if (p->nvme_port_fd >= 0) { + _cleanup_close_ int subsystems_dir_fd = -EBADF; + + subsystems_dir_fd = openat(p->nvme_port_fd, "subsystems", O_DIRECTORY|O_RDONLY|O_CLOEXEC); + if (subsystems_dir_fd < 0) + log_warning_errno(errno, "Failed to open 'subsystems' dir of port %" PRIu16 ", ignoring: %m", p->portnr); + else { + _cleanup_free_ DirectoryEntries *de = NULL; + + r = readdir_all(subsystems_dir_fd, RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT, &de); + if (r < 0) + log_warning_errno(r, "Failed to read 'subsystems' dir of port %" PRIu16 ", ignoring: %m", p->portnr); + else + FOREACH_ARRAY(ee, de->entries, de->n_entries) { + const struct dirent *e = *ee; + + if (unlinkat(subsystems_dir_fd, e->d_name, 0) < 0 && errno != ENOENT) + log_warning_errno(errno, "Failed to remove 'subsystems' symlink '%s' of port %" PRIu16 ", ignoring: %m", e->d_name, p->portnr); + } + } + + p->nvme_port_fd = safe_close(p->nvme_port_fd); + } + + if (p->nvme_ports_fd >= 0) { + _cleanup_free_ char *fn = NULL; + if (asprintf(&fn, "%" PRIu16, p->portnr) < 0) + return log_oom(); + + if (unlinkat(p->nvme_ports_fd, fn, AT_REMOVEDIR) < 0) { + if (errno == ENOENT) + ret = 0; + else + ret = log_warning_errno(errno, "Failed to remove port '%" PRIu16 ", ignoring: %m", p->portnr); + } else + ret = 1; + + p->nvme_ports_fd = safe_close(p->nvme_ports_fd); + } + + return ret; +} + +static NvmePort *nvme_port_destroy(NvmePort *p) { + if (!p) + return NULL; + + (void) nvme_port_unlink(p); + + return nvme_port_free(p); +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(NvmePort*, nvme_port_destroy); + +static int nvme_port_add_portnr( + int ports_fd, + uint16_t portnr, + int ip_family, + int *ret_fd) { + + int r; + + assert(ports_fd >= 0); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + assert(ret_fd); + + _cleanup_free_ char *fname = NULL; + if (asprintf(&fname, "%" PRIu16, portnr) < 0) + return log_oom(); + + _cleanup_close_ int port_fd = -EBADF; + port_fd = open_mkdir_at(ports_fd, fname, O_EXCL|O_RDONLY|O_CLOEXEC, 0777); + if (port_fd < 0) { + if (port_fd != -EEXIST) + return log_error_errno(port_fd, "Failed to create port %" PRIu16 ": %m", portnr); + + *ret_fd = -EBADF; + return 0; + } + + r = write_string_file_at(port_fd, "addr_adrfam", af_to_ipv4_ipv6(ip_family), WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set address family on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_trtype", "tcp", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set transport type on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_trsvcid", fname, WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set IP port on NVME port %" PRIu16 ": %m", portnr); + + r = write_string_file_at(port_fd, "addr_traddr", ip_family == AF_INET6 ? "::" : "0.0.0.0", WRITE_STRING_FILE_DISABLE_BUFFER); + if (r < 0) + return log_error_errno(r, "Failed to set IP address on NVME port %" PRIu16 ": %m", portnr); + + *ret_fd = TAKE_FD(port_fd); + return 1; +} + +static uint16_t calculate_start_port(const char *name, int ip_family) { + struct siphash state; + uint16_t nr; + + assert(name); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + + /* Use some fixed key Lennart pulled from /dev/urandom, so that we are deterministic */ + siphash24_init(&state, SD_ID128_MAKE(d1,0b,67,b5,e2,b7,4a,91,8d,6b,27,b6,35,c1,9f,d9).bytes); + siphash24_compress_string(name, &state); + siphash24_compress(&ip_family, sizeof(ip_family), &state); + + nr = 1024U + siphash24_finalize(&state) % (0xFFFFU - 1024U); + SET_FLAG(nr, 1, ip_family == AF_INET6); /* Lowest bit reflects family */ + + return nr; +} + +static uint16_t calculate_next_port(int ip_family) { + uint16_t nr; + + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + + nr = 1024U + random_u64_range(0xFFFFU - 1024U); + SET_FLAG(nr, 1, ip_family == AF_INET6); /* Lowest bit reflects family */ + + return nr; +} + +static int nvme_port_add(const char *name, int ip_family, NvmePort **ret) { + int r; + + assert(name); + assert(IN_SET(ip_family, AF_INET, AF_INET6)); + assert(ret); + + _cleanup_close_ int ports_fd = -EBADF; + ports_fd = RET_NERRNO(open("/sys/kernel/config/nvmet/ports", O_DIRECTORY|O_RDONLY|O_CLOEXEC)); + if (ports_fd < 0) + return log_error_errno(ports_fd, "Failed to open /sys/kernel/config/nvmet/ports: %m"); + + _cleanup_close_ int port_fd = -EBADF; + uint16_t portnr = calculate_start_port(name, ip_family); + for (unsigned attempt = 0;; attempt++) { + r = nvme_port_add_portnr(ports_fd, portnr, ip_family, &port_fd); + if (r < 0) + return r; + if (r > 0) + break; + + if (attempt > 16) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Can't find free NVME port after %u attempts.", attempt); + + log_debug_errno(port_fd, "NVME port %" PRIu16 " exists already, randomizing port.", portnr); + + portnr = calculate_next_port(ip_family); + } + + _cleanup_(nvme_port_destroyp) NvmePort *p = new(NvmePort, 1); + if (!p) + return log_oom(); + + *p = (NvmePort) { + .portnr = portnr, + .nvme_ports_fd = TAKE_FD(ports_fd), + .nvme_port_fd = TAKE_FD(port_fd), + .ip_family = ip_family, + }; + + *ret = TAKE_PTR(p); + return 0; +} + +static int nvme_port_link_subsystem(NvmePort *port, NvmeSubsystem *subsys) { + assert(port); + assert(subsys); + + _cleanup_free_ char *target = NULL, *linkname = NULL; + target = path_join("/sys/kernel/config/nvmet/subsystems", subsys->name); + if (!target) + return log_oom(); + + linkname = path_join("subsystems", subsys->name); + if (!linkname) + return log_oom(); + + if (symlinkat(target, port->nvme_port_fd, linkname) < 0) + return log_error_errno(errno, "Failed to link subsystem '%s' to port %" PRIu16 ": %m", subsys->name, port->portnr); + + return 0; +} + +static int nvme_port_unlink_subsystem(NvmePort *port, NvmeSubsystem *subsys) { + assert(port); + assert(subsys); + + _cleanup_free_ char *linkname = NULL; + linkname = path_join("subsystems", subsys->name); + if (!linkname) + return log_oom(); + + if (unlinkat(port->nvme_port_fd, linkname, 0) < 0 && errno != ENOENT) + return log_error_errno(errno, "Failed to unlink subsystem '%s' to port %" PRIu16 ": %m", subsys->name, port->portnr); + + return 0; +} + +static int nvme_subsystem_report(NvmeSubsystem *subsystem, NvmePort *ipv4, NvmePort *ipv6) { + assert(subsystem); + + _cleanup_free_ struct local_address *addresses = NULL; + int n_addresses; + n_addresses = local_addresses(NULL, 0, AF_UNSPEC, &addresses); + if (n_addresses < 0) + return log_error_errno(n_addresses, "Failed to determine local IP addresses: %m"); + + log_notice("NVMe-TCP: %s %s%s%s (%s)", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_COMPUTER_DISK) : "", emoji_enabled() ? " " : "", + subsystem->name, subsystem->device); + + FOREACH_ARRAY(a, addresses, n_addresses) { + NvmePort *port = a->family == AF_INET ? ipv4 : ipv6; + + if (!port) + continue; + + log_info(" %s Try for specific device: nvme connect -t tcp -n '%s' -a %s -s %" PRIu16, + special_glyph(a >= addresses + (n_addresses - 1) ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH), + subsystem->name, + IN_ADDR_TO_STRING(a->family, &a->address), + port->portnr); + } + + return 0; +} + +static int nvme_port_report(NvmePort *port) { + if (!port) + return 0; + + _cleanup_free_ struct local_address *addresses = NULL; + int n_addresses; + n_addresses = local_addresses(NULL, 0, port->ip_family, &addresses); + if (n_addresses < 0) + return log_error_errno(n_addresses, "Failed to determine local IP addresses: %m"); + + log_notice("NVMe-TCP: %s %s%sListening on %s (port %" PRIu16 ")", + special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), + emoji_enabled() ? special_glyph(SPECIAL_GLYPH_WORLD) : "", emoji_enabled() ? " " : "", + af_to_ipv4_ipv6(port->ip_family), + port->portnr); + + FOREACH_ARRAY(a, addresses, n_addresses) + log_info(" %s Try for all devices: nvme connect-all -t tcp -a %s -s %" PRIu16, + special_glyph(a >= addresses + (n_addresses - 1) ? SPECIAL_GLYPH_TREE_RIGHT : SPECIAL_GLYPH_TREE_BRANCH), + IN_ADDR_TO_STRING(a->family, &a->address), + port->portnr); + + return 0; +} + +typedef struct Context { + Hashmap *subsystems; + NvmePort *ipv4_port, *ipv6_port; + + bool display_refresh_scheduled; +} Context; + +static void device_hash_func(const struct stat *q, struct siphash *state) { + assert(q); + + if (S_ISBLK(q->st_mode) || S_ISCHR(q->st_mode)) { + mode_t m = q->st_mode & S_IFMT; + siphash24_compress(&m, sizeof(m), state); + siphash24_compress(&q->st_rdev, sizeof(q->st_rdev), state); + return; + } + + return inode_hash_func(q, state); +} + +static int device_compare_func(const struct stat *a, const struct stat *b) { + int r; + + assert(a); + assert(b); + + r = CMP(a->st_mode & S_IFMT, b->st_mode & S_IFMT); + if (r != 0) + return r; + + if (S_ISBLK(a->st_mode) || S_ISCHR(a->st_mode)) { + r = CMP(major(a->st_rdev), major(b->st_rdev)); + if (r != 0) + return r; + + r = CMP(minor(a->st_rdev), minor(b->st_rdev)); + if (r != 0) + return r; + + return 0; + } + + return inode_compare_func(a, b); +} + +DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR( + nvme_subsystem_hash_ops, + struct stat, + device_hash_func, + device_compare_func, + NvmeSubsystem, + nvme_subsystem_destroy); + +static void context_done(Context *c) { + assert(c); + + c->ipv4_port = nvme_port_destroy(c->ipv4_port); + c->ipv6_port = nvme_port_destroy(c->ipv6_port); + + c->subsystems = hashmap_free(c->subsystems); +} + +static void device_track_back(sd_device *d, sd_device **ret) { + int r; + + assert(d); + assert(ret); + + const char *devname = NULL; + (void) sd_device_get_devname(d, &devname); + + _cleanup_(sd_device_unrefp) sd_device *d_originating = NULL; + r = block_device_get_originating(d, &d_originating); + if (r < 0) + log_device_debug_errno(d, r, "Failed to get originating device for '%s', ignoring: %m", strna(devname)); + + sd_device *d_whole = NULL; + r = block_device_get_whole_disk(d_originating ?: d, &d_whole); /* does not ref returned device */ + if (r < 0) + log_device_debug_errno(d, r, "Failed to get whole device for '%s', ignoring: %m", strna(devname)); + + *ret = d_whole ? sd_device_ref(d_whole) : d_originating ? TAKE_PTR(d_originating) : sd_device_ref(d); +} + +static int device_is_same(sd_device *a, sd_device *b) { + dev_t devnum_a, devnum_b; + int r; + + assert(a); + assert(b); + + r = sd_device_get_devnum(a, &devnum_a); + if (r < 0) + return r; + + r = sd_device_get_devnum(b, &devnum_b); + if (r < 0) + return r; + + return devnum_a == devnum_b; +} + +static bool device_is_allowed(sd_device *d) { + int r; + + assert(d); + + if (arg_all >= 2) /* If --all is specified twice we allow even the root fs to shared */ + return true; + + const char *devname; + r = sd_device_get_devname(d, &devname); + if (r < 0) + return log_device_error_errno(d, r, "Failed to get device name: %m"); + + dev_t root_devnum; + r = get_block_device("/", &root_devnum); + if (r < 0) { + log_warning_errno(r, "Failed to get backing device of the root file system: %m"); + return false; /* Better safe */ + } + if (root_devnum == 0) /* Not backed by a block device? */ + return true; + + _cleanup_(sd_device_unrefp) sd_device *root_device = NULL; + r = sd_device_new_from_devnum(&root_device, 'b', root_devnum); + if (r < 0) { + log_warning_errno(r, "Failed to get root block device, assuming device '%s' is same as root device: %m", devname); + return false; + } + + _cleanup_(sd_device_unrefp) sd_device *whole_root_device = NULL; + device_track_back(root_device, &whole_root_device); + + _cleanup_(sd_device_unrefp) sd_device *whole_d = NULL; + device_track_back(d, &whole_d); + + r = device_is_same(whole_root_device, whole_d); + if (r < 0) { + log_warning_errno(r, "Failed to determine if root device and device '%s' are the same, assuming they are: %m", devname); + return false; /* Better safe */ + } + + return !r; +} + +static int device_added(Context *c, sd_device *device) { + _cleanup_close_ int fd = -EBADF; + int r; + + assert(c); + assert(device); + + const char *sysname; + r = sd_device_get_sysname(device, &sysname); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get device name: %m"); + + log_device_debug(device, "new block device '%s'", sysname); + + if (STARTSWITH_SET(sysname, "loop", "zram")) /* Ignore some devices */ + return 0; + + const char *devname; + r = sd_device_get_devname(device, &devname); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get device node path: %m"); + + struct stat lookup_key = { + .st_mode = S_IFBLK, + }; + + r = sd_device_get_devnum(device, &lookup_key.st_rdev); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get major/minor from device: %m"); + + if (hashmap_contains(c->subsystems, &lookup_key)) { + log_debug("Device '%s' already seen.", devname); + return 0; + } + + if (!device_is_allowed(device)) { + log_device_debug(device, "Not exposing device '%s', as it is backed by root disk.", devname); + return 0; + } + + fd = sd_device_open(device, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) { + log_device_warning_errno(device, fd, "Failed to open newly acquired device '%s', ignoring device: %m", devname); + return 0; + } + + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *s = NULL; + r = nvme_subsystem_add(devname, TAKE_FD(fd), &s); + if (r < 0) + return r; + + if (c->ipv4_port) { + r = nvme_port_link_subsystem(c->ipv4_port, s); + if (r < 0) + return r; + } + + if (c->ipv6_port) { + r = nvme_port_link_subsystem(c->ipv6_port, s); + if (r < 0) + return r; + } + + r = hashmap_ensure_put(&c->subsystems, &nvme_subsystem_hash_ops, &s->device_stat, s); + if (r < 0) + return log_error_errno(r, "Failed to add subsystem to hash table: %m"); + + (void) nvme_subsystem_report(s, c->ipv4_port, c->ipv6_port); + + TAKE_PTR(s); + return 1; +} + +static int device_removed(Context *c, sd_device *device) { + int r; + + assert(device); + + struct stat lookup_key = { + .st_mode = S_IFBLK, + }; + + r = sd_device_get_devnum(device, &lookup_key.st_rdev); + if (r < 0) + return log_device_error_errno(device, r, "Failed to get major/minor from device: %m"); + + NvmeSubsystem *s = hashmap_remove(c->subsystems, &lookup_key); + if (!s) + return 0; + + log_device_debug(device, "removed block device '%s'", s->name); + + if (c->ipv4_port) + (void) nvme_port_unlink_subsystem(c->ipv4_port, s); + if (c->ipv6_port) + (void) nvme_port_unlink_subsystem(c->ipv6_port, s); + + s = nvme_subsystem_destroy(s); + return 1; +} + +static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) { + Context *c = ASSERT_PTR(userdata); + + if (device_for_action(device, SD_DEVICE_REMOVE)) + device_removed(c, device); + else + device_added(c, device); + + return 0; +} + +static int on_display_refresh(sd_event_source *s, uint64_t usec, void *userdata) { + Context *c = ASSERT_PTR(userdata); + + assert(s); + + c->display_refresh_scheduled = false; + + if (isatty(STDERR_FILENO) > 0) + fputs(ANSI_HOME_CLEAR, stderr); + + (void) nvme_port_report(c->ipv4_port); + (void) nvme_port_report(c->ipv6_port); + + NvmeSubsystem *i; + HASHMAP_FOREACH(i, c->subsystems) + (void) nvme_subsystem_report(i, c->ipv4_port, c->ipv6_port); + + return 0; +} + +static int on_address_change(sd_netlink *rtnl, sd_netlink_message *mm, void *userdata) { + Context *c = ASSERT_PTR(userdata); + int r, family; + + assert(rtnl); + assert(mm); + + r = sd_rtnl_message_addr_get_family(mm, &family); + if (r < 0) { + log_warning_errno(r, "Failed to get address family from netlink address message, ignoring: %m"); + return 0; + } + + if (!c->display_refresh_scheduled) { + r = sd_event_add_time_relative( + sd_netlink_get_event(rtnl), + /* ret_slot= */ NULL, + CLOCK_MONOTONIC, + 750 * USEC_PER_MSEC, + 0, + on_display_refresh, + c); + if (r < 0) + log_warning_errno(r, "Failed to schedule display refresh, ignoring: %m"); + else + c->display_refresh_scheduled = true; + } + + return 0; +} + +static int run(int argc, char* argv[]) { + _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL; + _cleanup_(sd_event_unrefp) sd_event *event = NULL; + _cleanup_(context_done) Context context = {}; + int r; + + log_show_color(true); + log_parse_environment(); + log_open(); + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + r = sd_event_new(&event); + if (r < 0) + return log_error_errno(r, "Failed to allocate event loop: %m"); + + r = sd_event_set_signal_exit(event, true); + if (r < 0) + return log_error_errno(r, "Failed to install exit signal handlers: %m"); + + STRV_FOREACH(i, arg_devices) { + _cleanup_(nvme_subsystem_destroyp) NvmeSubsystem *subsys = NULL; + + r = nvme_subsystem_add(*i, -EBADF, &subsys); + if (r < 0) + return r; + + r = hashmap_ensure_put(&context.subsystems, &nvme_subsystem_hash_ops, &subsys->device_stat, subsys); + if (r == -EEXIST) { + log_warning_errno(r, "Duplicate device '%s' specified, skipping: %m", *i); + continue; + } + if (r < 0) + return log_error_errno(r, "Failed to add subsystem to hash table: %m"); + + TAKE_PTR(subsys); + } + + r = nvme_port_add(arg_nqn, AF_INET, &context.ipv4_port); + if (r < 0) + return r; + + nvme_port_report(context.ipv4_port); + + if (socket_ipv6_is_enabled()) { + r = nvme_port_add(arg_nqn, AF_INET6, &context.ipv6_port); + if (r < 0) + return r; + + nvme_port_report(context.ipv6_port); + } + + NvmeSubsystem *i; + HASHMAP_FOREACH(i, context.subsystems) { + if (context.ipv4_port) { + r = nvme_port_link_subsystem(context.ipv4_port, i); + if (r < 0) + return r; + } + + if (context.ipv6_port) { + r = nvme_port_link_subsystem(context.ipv6_port, i); + if (r < 0) + return r; + } + + (void) nvme_subsystem_report(i, context.ipv4_port, context.ipv6_port); + } + + if (arg_all > 0) { + r = sd_device_monitor_new(&monitor); + if (r < 0) + return log_error_errno(r, "Failed to allocate device monitor: %m"); + + r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "disk"); + if (r < 0) + return log_error_errno(r, "Failed to configure device monitor match: %m"); + + r = sd_device_monitor_attach_event(monitor, event); + if (r < 0) + return log_error_errno(r, "Failed to attach device monitor to event loop: %m"); + + r = sd_device_monitor_start(monitor, device_monitor_handler, &context); + if (r < 0) + return log_error_errno(r, "Failed to start device monitor: %m"); + + _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerator = NULL; + r = sd_device_enumerator_new(&enumerator); + if (r < 0) + return log_error_errno(r, "Failed to allocate enumerator: %m"); + + r = sd_device_enumerator_add_match_subsystem(enumerator, "block", /* match= */ true); + if (r < 0) + return log_error_errno(r, "Failed to match block devices: %m"); + + r = sd_device_enumerator_add_match_property(enumerator, "DEVTYPE", "disk"); + if (r < 0) + return log_error_errno(r, "Failed to match whole block devices: %m"); + + r = sd_device_enumerator_add_nomatch_sysname(enumerator, "loop*"); + if (r < 0) + return log_error_errno(r, "Failed to exclude loop devices: %m"); + + FOREACH_DEVICE(enumerator, device) + device_added(&context, device); + } + + _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + r = sd_netlink_open(&rtnl); + if (r < 0) + return log_error_errno(r, "Failed to connect to netlink: %m"); + + r = sd_netlink_attach_event(rtnl, event, SD_EVENT_PRIORITY_NORMAL); + if (r < 0) + return log_error_errno(r, "Failed to attach netlink socket to event loop: %m"); + + r = sd_netlink_add_match(rtnl, /* ret_slot= */ NULL, RTM_NEWADDR, on_address_change, /* destroy_callback= */ NULL, &context, "storagetm-newaddr"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR events: %m"); + + r = sd_netlink_add_match(rtnl, /* ret_slot= */ NULL, RTM_DELADDR, on_address_change, /* destroy_callback= */ NULL, &context, "storagetm-deladdr"); + if (r < 0) + return log_error_errno(r, "Failed to subscribe to RTM_DELADDR events: %m"); + + if (isatty(0) > 0) + log_info("Hit Ctrl-C to exit target mode."); + + _unused_ _cleanup_(notify_on_cleanup) const char *notify_message = + notify_start("READY=1\n" + "STATUS=Exposing disks in target mode...", + NOTIFY_STOPPING); + + r = sd_event_loop(event); + if (r < 0) + return log_error_errno(r, "Failed to run event loop: %m"); + + log_info("Exiting target mode."); + return r; +} + +DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); diff --git a/units/meson.build b/units/meson.build index 493e294002a..51ae9cee5dc 100644 --- a/units/meson.build +++ b/units/meson.build @@ -559,6 +559,14 @@ units = [ 'conditions' : ['ENABLE_SYSUSERS'], 'symlinks' : ['sysinit.target.wants/'], }, + { + 'file' : 'systemd-storagetm.service.in', + 'conditions' : ['ENABLE_STORAGETM'], + }, + { + 'file' : 'storage-target-mode.target', + 'conditions' : ['ENABLE_STORAGETM'], + }, { 'file' : 'systemd-time-wait-sync.service.in', 'conditions' : ['ENABLE_TIMESYNCD'], diff --git a/units/storage-target-mode.target b/units/storage-target-mode.target new file mode 100644 index 00000000000..38f78a286e9 --- /dev/null +++ b/units/storage-target-mode.target @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Storage Target Mode +Documentation=man:systemd.special(7) +Wants=systemd-storagetm.service systemd-udevd.service systemd-udev-trigger.service systemd-networkd.service systemd-network-generator.service systemd-journald.socket systemd-journald-dev-log.socket +Conflicts=rescue.service rescue.target +After=systemd-storagetm.service systemd-udevd.service systemd-udev-trigger.service systemd-networkd.service systemd-network-generator.service systemd-journald.socket systemd-journald-dev-log.socket rescue.service rescue.target +AllowIsolate=yes diff --git a/units/systemd-storagetm.service.in b/units/systemd-storagetm.service.in new file mode 100644 index 00000000000..0fe91eff270 --- /dev/null +++ b/units/systemd-storagetm.service.in @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later +# +# This file is part of systemd. +# +# systemd is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. + +[Unit] +Description=Storage Target Mode (NVMe-TCP) +Documentation=man:systemd-storagetm.service(8) +ConditionVirtualization=!container +DefaultDependencies=no +Wants=modprobe@nvmet_tcp.service modprobe@thunderbolt_net.service sys-kernel-config.mount +After=modprobe@nvmet_tcp.service modprobe@thunderbolt_net.service sys-kernel-config.mount +Conflicts=shutdown.target +Before=shutdown.target +FailureAction=reboot +SuccessAction=reboot + +[Service] +Type=notify +RemainAfterExit=yes +StandardInput=tty +StandardOutput=tty +ExecStart={{LIBEXECDIR}}/systemd-storagetm --all