"Host Memory Backends" and "Memory devices" queue ("mem"):
 - Reintroduce memory region size checks for memory devices; the removal
   lead to some undesired side effects
 - Preallocate memory of memory backends in selected configurations
   asynchronously (so we preallocate concurrently), to speed up QEMU
   startup time.
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmXB3LcRHGRhdmlkQHJl
 ZGhhdC5jb20ACgkQTd4Q9wD/g1plRA/+N8y4aJB+qEwacl5scIpiWShqeBA0aybS
 Rp3796djgjkqozkv7AFGHrOIGiLDtCh4W1JYuML7kLN7IvuJoHSY+AHzfhDiae1l
 eluX/Cs/5rgEninwT9M0yEkgvUybA8+kx+z96hBJgkfJOrdbETc7YVbU5iP/sOOF
 UtfEVWGwT1RJOun0qrgEhHiJCTMcHyJjSEy8D867ymC+knu3OZIz22+axcmpHz6i
 QJFgY40OCP1yxBvPVLR3K/Z0se/FkxG55LwM58j7N/m+VDv4IqZCTbkZb5BTJVla
 5vKgIrZfZ+XFqrenyMsBnBLgQuyCmDJIDFfxM0A9gOvJbwtf8T4DhL9FoRvVZMDD
 SHBl/EZcViXFDDKVHjotBSA5JoNbjHac5J5jCFu7pRq+2DbzxWHmW6xV7sY9gkSO
 +SdW9hcmF/vF5MKHfoQR2kVLLJ2/EKHiN/xVVsha0+RQDctucrhg1Y9MS2obJV3u
 u2udaVk5UNcfNPuVPwkG8YQ0sIyuDYXOTThwNtsj0tyZ+tGVQmMIlou/GAsrc9PF
 xmqzkCXXyrILrPMQJrYBcdwasBLuEcJMW59BqgxHCVP9NiAQgsNVzXFg4mr3+mVF
 xTrt8wioTvAPoDvXe+BPoaH6AsIY2TqE8j7IqA1Q/IFNf+KLYkPcHknZfzfxSkdW
 woRHVtjrkMo=
 =lW5h
 -----END PGP SIGNATURE-----

Merge tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu into staging

Hi,

"Host Memory Backends" and "Memory devices" queue ("mem"):
- Reintroduce memory region size checks for memory devices; the removal
  led to some undesired side effects
- Preallocate memory of memory backends in selected configurations
  asynchronously (so we preallocate concurrently), to speed up QEMU
  startup time.

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmXB3LcRHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1plRA/+N8y4aJB+qEwacl5scIpiWShqeBA0aybS
# Rp3796djgjkqozkv7AFGHrOIGiLDtCh4W1JYuML7kLN7IvuJoHSY+AHzfhDiae1l
# eluX/Cs/5rgEninwT9M0yEkgvUybA8+kx+z96hBJgkfJOrdbETc7YVbU5iP/sOOF
# UtfEVWGwT1RJOun0qrgEhHiJCTMcHyJjSEy8D867ymC+knu3OZIz22+axcmpHz6i
# QJFgY40OCP1yxBvPVLR3K/Z0se/FkxG55LwM58j7N/m+VDv4IqZCTbkZb5BTJVla
# 5vKgIrZfZ+XFqrenyMsBnBLgQuyCmDJIDFfxM0A9gOvJbwtf8T4DhL9FoRvVZMDD
# SHBl/EZcViXFDDKVHjotBSA5JoNbjHac5J5jCFu7pRq+2DbzxWHmW6xV7sY9gkSO
# +SdW9hcmF/vF5MKHfoQR2kVLLJ2/EKHiN/xVVsha0+RQDctucrhg1Y9MS2obJV3u
# u2udaVk5UNcfNPuVPwkG8YQ0sIyuDYXOTThwNtsj0tyZ+tGVQmMIlou/GAsrc9PF
# xmqzkCXXyrILrPMQJrYBcdwasBLuEcJMW59BqgxHCVP9NiAQgsNVzXFg4mr3+mVF
# xTrt8wioTvAPoDvXe+BPoaH6AsIY2TqE8j7IqA1Q/IFNf+KLYkPcHknZfzfxSkdW
# woRHVtjrkMo=
# =lW5h
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 06 Feb 2024 07:16:07 GMT
# gpg:                using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg:                issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [marginal]
# gpg:                 aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg:                 aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D  FCCA 4DDE 10F7 00FF 835A

* tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu:
  oslib-posix: initialize backend memory objects in parallel
  memory-device: reintroduce memory region size check
  hv-balloon: use get_min_alignment() to express 32 GiB alignment

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 8ab67b6ebc
Peter Maydell, 2024-02-08 11:59:13 +00:00
9 changed files with 180 additions and 53 deletions
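Background for the series: "preallocation" here means write-faulting every
page of a backend's mapping before the guest starts, so the guest never
stalls on first touch. A minimal standalone sketch of that primitive (not
QEMU code; assumes Linux 5.14+ for MADV_POPULATE_WRITE, with a portable
touch-every-page fallback similar in spirit to QEMU's do_touch_pages()):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_POPULATE_WRITE
    #define MADV_POPULATE_WRITE 23      /* Linux 5.14+ */
    #endif

    int main(void)
    {
        size_t sz = 64 * 1024 * 1024;
        char *area = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        if (madvise(area, sz, MADV_POPULATE_WRITE) != 0) {
            /* Older kernel: fault in each page with a read-modify-write. */
            size_t pagesize = (size_t)sysconf(_SC_PAGESIZE);
            for (size_t off = 0; off < sz; off += pagesize) {
                volatile char *p = area + off;
                *p = *p;
            }
        }
        printf("preallocated %zu bytes\n", sz);
        munmap(area, sz);
        return 0;
    }

The patches below do not change this primitive; they change when it runs
(concurrently across backends during startup) and add a size sanity check.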

backends/hostmem.c

@@ -20,6 +20,7 @@
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"
#include "hw/qdev-core.h"
#ifdef CONFIG_NUMA
#include <numaif.h>
@@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
uint64_t sz = memory_region_size(&backend->mr);
if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context, false, errp)) {
return;
}
backend->prealloc = true;
@@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
void *ptr;
uint64_t sz;
bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
if (!bc->alloc) {
return;
@@ -402,7 +404,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
ptr, sz,
backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context,
async, errp)) {
return;
}
}
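The hunks above key asynchrony off the machine init phase: backends created
during startup (before PHASE_LATE_BACKENDS_CREATED) may preallocate
asynchronously, while toggling the prealloc property at runtime stays
synchronous. A simplified standalone model of that decision (QEMU's real
phase_check()/phase_advance() live in the qdev core and enforce strict
phase ordering):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum {
        PHASE_NO_MACHINE,
        PHASE_ACCEL_CREATED,
        PHASE_LATE_BACKENDS_CREATED,
        PHASE_MACHINE_READY,
    } Phase;

    static Phase current_phase = PHASE_ACCEL_CREATED;

    static bool phase_check(Phase p) { return current_phase >= p; }
    static void phase_advance(Phase p) { current_phase = p; }

    int main(void)
    {
        /* Startup: late backends not yet created, async is allowed. */
        bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
        printf("startup backend: async=%d\n", async);   /* async=1 */

        phase_advance(PHASE_LATE_BACKENDS_CREATED);

        /* Runtime (e.g., object-add from the monitor): synchronous. */
        async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
        printf("runtime backend: async=%d\n", async);   /* async=0 */
        return 0;
    }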

hw/hyperv/hv-balloon.c

@@ -1477,22 +1477,7 @@ static void hv_balloon_ensure_mr(HvBalloon *balloon)
balloon->mr = g_new0(MemoryRegion, 1);
memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
memory_region_size(hostmem_mr));
/*
* The VM can indicate an alignment up to 32 GiB. Memory device core can
* usually only handle/guarantee 1 GiB alignment. The user will have to
* specify a larger maxmem eventually.
*
* The memory device core will warn the user in case maxmem might have to be
* increased and will fail plugging the device if there is not sufficient
* space after alignment.
*
* TODO: we could do the alignment ourselves in a slightly bigger region.
* But this feels better, although the warning might be annoying. Maybe
* we can optimize that in the future (e.g., with such a device on the
* cmdline place/size the device memory region differently.
*/
balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
balloon->mr->align = memory_region_get_alignment(hostmem_mr);
}
static void hv_balloon_free_mr(HvBalloon *balloon)
@@ -1654,6 +1639,25 @@ static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
return balloon->mr;
}
static uint64_t hv_balloon_md_get_min_alignment(const MemoryDeviceState *md)
{
/*
* The VM can indicate an alignment up to 32 GiB. Memory device core can
* usually only handle/guarantee 1 GiB alignment. The user will have to
* specify a larger maxmem eventually.
*
* The memory device core will warn the user in case maxmem might have to be
* increased and will fail plugging the device if there is not sufficient
* space after alignment.
*
* TODO: we could do the alignment ourselves in a slightly bigger region.
* But this feels better, although the warning might be annoying. Maybe
* we can optimize that in the future (e.g., with such a device on the
* cmdline place/size the device memory region differently.
*/
return 32 * GiB;
}
static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
MemoryDeviceInfo *info)
{
@@ -1766,5 +1770,6 @@ static void hv_balloon_class_init(ObjectClass *klass, void *data)
mdc->get_memory_region = hv_balloon_md_get_memory_region;
mdc->decide_memslots = hv_balloon_decide_memslots;
mdc->get_memslots = hv_balloon_get_memslots;
mdc->get_min_alignment = hv_balloon_md_get_min_alignment;
mdc->fill_device_info = hv_balloon_md_fill_device_info;
}
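The move from mr->align to get_min_alignment() lets the memory device core
reason about the 32 GiB requirement itself: it can warn that maxmem may need
to grow and fail the plug cleanly, instead of the region silently
over-aligning. A standalone sketch of why a large minimum alignment can eat
into the device memory area (illustrative numbers, not QEMU code):

    #include <inttypes.h>
    #include <stdio.h>

    #define GiB (1024ULL * 1024 * 1024)
    /* Round x up to the next multiple of a. */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    int main(void)
    {
        uint64_t base = 6 * GiB;        /* hypothetical start of the area */
        uint64_t min_align = 32 * GiB;  /* hv-balloon's minimum alignment */
        uint64_t aligned = ALIGN_UP(base, min_align);

        printf("base %" PRIu64 " GiB -> aligned %" PRIu64 " GiB, "
               "%" PRIu64 " GiB of the area lost to alignment\n",
               base / GiB, aligned / GiB, (aligned - base) / GiB);
        return 0;
    }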

hw/mem/memory-device.c

@@ -374,6 +374,20 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
goto out;
}
/*
* We always want the memory region size to be multiples of the memory
* region alignment: for example, DIMMs with 1G+1byte size don't make
* any sense. Note that we don't check that the size is multiples
* of any additional alignment requirements the memory device might
* have when it comes to the address in physical address space.
*/
if (!QEMU_IS_ALIGNED(memory_region_size(mr),
memory_region_get_alignment(mr))) {
error_setg(errp, "backend memory size must be multiple of 0x%"
PRIx64, memory_region_get_alignment(mr));
return;
}
if (legacy_align) {
align = *legacy_align;
} else {
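The reintroduced check rejects backends whose size is not a multiple of the
region's own alignment (the "1G+1byte" DIMM case from the comment above). A
standalone illustration, using a bitmask variant of the test that is valid
for power-of-two alignments (not QEMU's macro):

    #include <stdint.h>
    #include <stdio.h>

    /* Matches (v % a) == 0 when a is a power of two. */
    #define IS_ALIGNED(v, a) (((v) & ((a) - 1)) == 0)

    int main(void)
    {
        uint64_t align = 1ULL << 30;        /* 1 GiB region alignment */
        uint64_t ok    = 2ULL << 30;        /* 2 GiB: accepted */
        uint64_t bad   = (1ULL << 30) + 1;  /* 1 GiB + 1 byte: rejected */

        printf("2 GiB:       aligned=%d\n", IS_ALIGNED(ok, align));
        printf("1 GiB + 1B:  aligned=%d\n", IS_ALIGNED(bad, align));
        return 0;
    }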

hw/virtio/virtio-mem.c

@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
static bool warned;
/*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
error_report_err(local_err);
return -ENOMEM;
}

include/hw/qdev-core.h

@@ -1083,6 +1083,11 @@ typedef enum MachineInitPhase {
*/
PHASE_ACCEL_CREATED,
/*
* Late backend objects have been created and initialized.
*/
PHASE_LATE_BACKENDS_CREATED,
/*
* machine_class->init has been called, thus creating any embedded
* devices and validating machine properties. Devices created at

include/qemu/osdep.h

@@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
* @area: start address of the are to preallocate
* @sz: the size of the area to preallocate
* @max_threads: maximum number of threads to use
* @tc: prealloc context threads pointer, NULL if not in use
* @async: request asynchronous preallocation, requires @tc
* @errp: returns an error if this function fails
*
* Preallocate memory (populate/prefault page tables writable) for the virtual
@@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
* each page in the area was faulted in writable at least once, for example,
* after allocating file blocks for mapped files.
*
* When setting @async, allocation might be performed asynchronously.
* qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
* preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp);
ThreadContext *tc, bool async, Error **errp);
/**
* qemu_finish_async_prealloc_mem:
* @errp: returns an error if this function fails
*
* Finish all outstanding asynchronous memory preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_finish_async_prealloc_mem(Error **errp);
/**
* qemu_get_pid_name:
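In short, the documented contract is: any number of async requests may be
queued while the BQL is held during startup, and a single call to
qemu_finish_async_prealloc_mem() is the completion barrier for all of them.
A toy runnable model of that pairing (hypothetical stub names, not the real
implementation):

    #include <stdbool.h>
    #include <stdio.h>

    static int outstanding;   /* models the global list of queued contexts */

    static bool prealloc_mem_model(bool async)
    {
        if (async) {
            outstanding++;    /* queued; completed at the barrier below */
        }
        /* synchronous requests finish before returning */
        return true;
    }

    static bool finish_async_prealloc_mem_model(void)
    {
        printf("finishing %d outstanding request(s)\n", outstanding);
        outstanding = 0;      /* join all queued work */
        return true;
    }

    int main(void)
    {
        prealloc_mem_model(true);    /* memory backend 1 */
        prealloc_mem_model(true);    /* memory backend 2 */
        prealloc_mem_model(false);   /* e.g., virtio-mem plug: synchronous */
        return finish_async_prealloc_mem_model() ? 0 : 1;
    }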

system/vl.c

@@ -2013,6 +2013,14 @@ static void qemu_create_late_backends(void)
object_option_foreach_add(object_create_late);
/*
* Wait for any outstanding memory prealloc from created memory
* backends to complete.
*/
if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
exit(1);
}
if (tpm_init() < 0) {
exit(1);
}
@@ -3699,6 +3707,7 @@ void qemu_init(int argc, char **argv)
* over memory-backend-file objects).
*/
qemu_create_late_backends();
phase_advance(PHASE_LATE_BACKENDS_CREATED);
/*
* Note: creates a QOM object, must run only after global and

util/oslib-posix.c

@@ -42,6 +42,7 @@
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#include "qemu/main-loop.h"
#ifdef CONFIG_LINUX
#include <sys/syscall.h>
@@ -63,11 +64,15 @@
struct MemsetThread;
static QLIST_HEAD(, MemsetContext) memset_contexts =
QLIST_HEAD_INITIALIZER(memset_contexts);
typedef struct MemsetContext {
bool all_threads_created;
bool any_thread_failed;
struct MemsetThread *threads;
int num_threads;
QLIST_ENTRY(MemsetContext) next;
} MemsetContext;
struct MemsetThread {
@@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
return ret;
}
static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
int i, ret = 0, tmp;
for (i = 0; i < context->num_threads; i++) {
tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
if (tmp) {
ret = tmp;
}
}
g_free(context->threads);
g_free(context);
return ret;
}
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int max_threads, ThreadContext *tc,
int max_threads, ThreadContext *tc, bool async,
bool use_madv_populate_write)
{
static gsize initialized = 0;
MemsetContext context = {
.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
};
MemsetContext *context = g_malloc0(sizeof(MemsetContext));
size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
int ret = 0, i = 0;
int ret, i = 0;
char *addr = area;
/*
* Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
* and prealloc context for thread placement.
*/
if (!use_madv_populate_write || !tc) {
async = false;
}
context->num_threads =
get_memset_num_threads(hpagesize, numpages, max_threads);
if (g_once_init_enter(&initialized)) {
qemu_mutex_init(&page_mutex);
qemu_cond_init(&page_cond);
@@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
}
if (use_madv_populate_write) {
/* Avoid creating a single thread for MADV_POPULATE_WRITE */
if (context.num_threads == 1) {
/*
* Avoid creating a single thread for MADV_POPULATE_WRITE when
* preallocating synchronously.
*/
if (context->num_threads == 1 && !async) {
if (qemu_madvise(area, hpagesize * numpages,
QEMU_MADV_POPULATE_WRITE)) {
return -errno;
@@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
touch_fn = do_touch_pages;
}
context.threads = g_new0(MemsetThread, context.num_threads);
numpages_per_thread = numpages / context.num_threads;
leftover = numpages % context.num_threads;
for (i = 0; i < context.num_threads; i++) {
context.threads[i].addr = addr;
context.threads[i].numpages = numpages_per_thread + (i < leftover);
context.threads[i].hpagesize = hpagesize;
context.threads[i].context = &context;
context->threads = g_new0(MemsetThread, context->num_threads);
numpages_per_thread = numpages / context->num_threads;
leftover = numpages % context->num_threads;
for (i = 0; i < context->num_threads; i++) {
context->threads[i].addr = addr;
context->threads[i].numpages = numpages_per_thread + (i < leftover);
context->threads[i].hpagesize = hpagesize;
context->threads[i].context = context;
if (tc) {
thread_context_create_thread(tc, &context.threads[i].pgthread,
thread_context_create_thread(tc, &context->threads[i].pgthread,
"touch_pages",
touch_fn, &context.threads[i],
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
} else {
qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
touch_fn, &context.threads[i],
qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
}
addr += context.threads[i].numpages * hpagesize;
addr += context->threads[i].numpages * hpagesize;
}
if (async) {
/*
* async requests currently require the BQL. Add it to the list and kick
* preallocation off during qemu_finish_async_prealloc_mem().
*/
assert(bql_locked());
QLIST_INSERT_HEAD(&memset_contexts, context, next);
return 0;
}
if (!use_madv_populate_write) {
sigbus_memset_context = &context;
sigbus_memset_context = context;
}
qemu_mutex_lock(&page_mutex);
context.all_threads_created = true;
context->all_threads_created = true;
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);
for (i = 0; i < context.num_threads; i++) {
int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
ret = wait_and_free_mem_prealloc_context(context);
if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
}
return ret;
}
bool qemu_finish_async_prealloc_mem(Error **errp)
{
int ret = 0, tmp;
MemsetContext *context, *next_context;
/* Waiting for preallocation requires the BQL. */
assert(bql_locked());
if (QLIST_EMPTY(&memset_contexts)) {
return true;
}
qemu_mutex_lock(&page_mutex);
QLIST_FOREACH(context, &memset_contexts, next) {
context->all_threads_created = true;
}
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);
QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
QLIST_REMOVE(context, next);
tmp = wait_and_free_mem_prealloc_context(context);
if (tmp) {
ret = tmp;
}
}
if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
if (ret) {
error_setg_errno(errp, -ret,
"qemu_prealloc_mem: preallocating memory failed");
return false;
}
g_free(context.threads);
return ret;
return true;
}
static bool madv_populate_write_possible(char *area, size_t pagesize)
@@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
}
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
static gsize initialized;
int ret;
@@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
}
/* touch pages simultaneously */
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
use_madv_populate_write);
if (ret) {
error_setg_errno(errp, -ret,
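The control flow is easier to see stripped of QEMU's thread-context and
SIGBUS machinery. A standalone pthread model (illustrative only: one thread
per request rather than per page range, and no error propagation): queued
requests park their threads on a condition variable, and the finish call
releases and joins them all, so all backends fault their memory in
concurrently.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* One queued request, kept on a list like memset_contexts. */
    typedef struct Ctx {
        pthread_t thread;
        char *area;
        size_t size;
        struct Ctx *next;
    } Ctx;

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t start_cond = PTHREAD_COND_INITIALIZER;
    static bool start_all;
    static Ctx *pending;

    static void *touch_pages(void *arg)
    {
        Ctx *c = arg;
        size_t pg = (size_t)sysconf(_SC_PAGESIZE);

        /* Park until released, like waiting for all_threads_created. */
        pthread_mutex_lock(&lock);
        while (!start_all) {
            pthread_cond_wait(&start_cond, &lock);
        }
        pthread_mutex_unlock(&lock);

        for (size_t off = 0; off < c->size; off += pg) {
            c->area[off] = 0;            /* write-fault each page */
        }
        return NULL;
    }

    static void prealloc_async(char *area, size_t size)
    {
        Ctx *c = calloc(1, sizeof(*c));
        c->area = area;
        c->size = size;
        c->next = pending;               /* QLIST_INSERT_HEAD equivalent */
        pending = c;
        pthread_create(&c->thread, NULL, touch_pages, c);
    }

    static void finish_async_prealloc(void)
    {
        pthread_mutex_lock(&lock);
        start_all = true;                /* release every queued thread */
        pthread_cond_broadcast(&start_cond);
        pthread_mutex_unlock(&lock);

        while (pending) {                /* join and free each context */
            Ctx *c = pending;
            pending = c->next;
            pthread_join(c->thread, NULL);
            free(c);
        }
    }

    int main(void)
    {
        size_t sz = 16 * 1024 * 1024;
        char *a = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *b = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        prealloc_async(a, sz);           /* backend 1 queues its work */
        prealloc_async(b, sz);           /* backend 2 queues its work */
        finish_async_prealloc();         /* barrier: both run concurrently */
        puts("all preallocation finished");
        return 0;
    }

(Build with "cc -pthread"; in QEMU the corresponding barrier is the
qemu_finish_async_prealloc_mem() call in qemu_create_late_backends().)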

util/oslib-win32.c

@@ -265,7 +265,7 @@ int getpagesize(void)
}
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size();
@@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
return true;
}
bool qemu_finish_async_prealloc_mem(Error **errp)
{
/* async prealloc not supported, there is nothing to finish */
return true;
}
char *qemu_get_pid_name(pid_t pid)
{
/* XXX Implement me */