"Host Memory Backends" and "Memory devices" queue ("mem"):
 - Reintroduce memory region size checks for memory devices; the removal
   lead to some undesired side effects
 - Preallocate memory of memory backends in selected configurations
   asynchronously (so we preallocate concurrently), to speed up QEMU
   startup time.
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmXB3LcRHGRhdmlkQHJl
 ZGhhdC5jb20ACgkQTd4Q9wD/g1plRA/+N8y4aJB+qEwacl5scIpiWShqeBA0aybS
 Rp3796djgjkqozkv7AFGHrOIGiLDtCh4W1JYuML7kLN7IvuJoHSY+AHzfhDiae1l
 eluX/Cs/5rgEninwT9M0yEkgvUybA8+kx+z96hBJgkfJOrdbETc7YVbU5iP/sOOF
 UtfEVWGwT1RJOun0qrgEhHiJCTMcHyJjSEy8D867ymC+knu3OZIz22+axcmpHz6i
 QJFgY40OCP1yxBvPVLR3K/Z0se/FkxG55LwM58j7N/m+VDv4IqZCTbkZb5BTJVla
 5vKgIrZfZ+XFqrenyMsBnBLgQuyCmDJIDFfxM0A9gOvJbwtf8T4DhL9FoRvVZMDD
 SHBl/EZcViXFDDKVHjotBSA5JoNbjHac5J5jCFu7pRq+2DbzxWHmW6xV7sY9gkSO
 +SdW9hcmF/vF5MKHfoQR2kVLLJ2/EKHiN/xVVsha0+RQDctucrhg1Y9MS2obJV3u
 u2udaVk5UNcfNPuVPwkG8YQ0sIyuDYXOTThwNtsj0tyZ+tGVQmMIlou/GAsrc9PF
 xmqzkCXXyrILrPMQJrYBcdwasBLuEcJMW59BqgxHCVP9NiAQgsNVzXFg4mr3+mVF
 xTrt8wioTvAPoDvXe+BPoaH6AsIY2TqE8j7IqA1Q/IFNf+KLYkPcHknZfzfxSkdW
 woRHVtjrkMo=
 =lW5h
 -----END PGP SIGNATURE-----

Merge tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu into staging

Hi,

"Host Memory Backends" and "Memory devices" queue ("mem"):
- Reintroduce memory region size checks for memory devices; the removal
  led to some undesired side effects
- Preallocate memory of memory backends in selected configurations
  asynchronously (so we preallocate concurrently), to speed up QEMU
  startup time.

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmXB3LcRHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1plRA/+N8y4aJB+qEwacl5scIpiWShqeBA0aybS
# Rp3796djgjkqozkv7AFGHrOIGiLDtCh4W1JYuML7kLN7IvuJoHSY+AHzfhDiae1l
# eluX/Cs/5rgEninwT9M0yEkgvUybA8+kx+z96hBJgkfJOrdbETc7YVbU5iP/sOOF
# UtfEVWGwT1RJOun0qrgEhHiJCTMcHyJjSEy8D867ymC+knu3OZIz22+axcmpHz6i
# QJFgY40OCP1yxBvPVLR3K/Z0se/FkxG55LwM58j7N/m+VDv4IqZCTbkZb5BTJVla
# 5vKgIrZfZ+XFqrenyMsBnBLgQuyCmDJIDFfxM0A9gOvJbwtf8T4DhL9FoRvVZMDD
# SHBl/EZcViXFDDKVHjotBSA5JoNbjHac5J5jCFu7pRq+2DbzxWHmW6xV7sY9gkSO
# +SdW9hcmF/vF5MKHfoQR2kVLLJ2/EKHiN/xVVsha0+RQDctucrhg1Y9MS2obJV3u
# u2udaVk5UNcfNPuVPwkG8YQ0sIyuDYXOTThwNtsj0tyZ+tGVQmMIlou/GAsrc9PF
# xmqzkCXXyrILrPMQJrYBcdwasBLuEcJMW59BqgxHCVP9NiAQgsNVzXFg4mr3+mVF
# xTrt8wioTvAPoDvXe+BPoaH6AsIY2TqE8j7IqA1Q/IFNf+KLYkPcHknZfzfxSkdW
# woRHVtjrkMo=
# =lW5h
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 06 Feb 2024 07:16:07 GMT
# gpg:                using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg:                issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [marginal]
# gpg:                 aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg:                 aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D  FCCA 4DDE 10F7 00FF 835A

* tag 'mem-2024-02-06-v3' of https://github.com/davidhildenbrand/qemu:
  oslib-posix: initialize backend memory objects in parallel
  memory-device: reintroduce memory region size check
  hv-balloon: use get_min_alignment() to express 32 GiB alignment

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 8ab67b6ebc
Peter Maydell, 2024-02-08 11:59:13 +00:00
9 changed files with 180 additions and 53 deletions
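Background for the series: "preallocation" here means write-faulting every
page of a backend's mapping before the guest starts, so the guest never
stalls on first touch. A minimal standalone sketch of that primitive (not
QEMU code; assumes Linux 5.14+ for MADV_POPULATE_WRITE, with a portable
touch-every-page fallback similar in spirit to QEMU's do_touch_pages()):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef MADV_POPULATE_WRITE
    #define MADV_POPULATE_WRITE 23      /* Linux 5.14+ */
    #endif

    int main(void)
    {
        size_t sz = 64 * 1024 * 1024;
        char *area = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (area == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        if (madvise(area, sz, MADV_POPULATE_WRITE) != 0) {
            /* Older kernel: fault in each page with a read-modify-write. */
            size_t pagesize = (size_t)sysconf(_SC_PAGESIZE);
            for (size_t off = 0; off < sz; off += pagesize) {
                volatile char *p = area + off;
                *p = *p;
            }
        }
        printf("preallocated %zu bytes\n", sz);
        munmap(area, sz);
        return 0;
    }

The patches below do not change this primitive; they change when it runs
(concurrently across backends during startup) and add a size sanity check.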

backends/hostmem.c

@@ -20,6 +20,7 @@
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"
#include "hw/qdev-core.h"
#ifdef CONFIG_NUMA
#include <numaif.h>
@@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
uint64_t sz = memory_region_size(&backend->mr);
if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context, false, errp)) {
return;
}
backend->prealloc = true;
@@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
void *ptr;
uint64_t sz;
bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
if (!bc->alloc) {
return;
@@ -402,7 +404,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
ptr, sz,
backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context,
async, errp)) {
return;
}
}
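The hunks above key asynchrony off the machine init phase: backends created
during startup (before PHASE_LATE_BACKENDS_CREATED) may preallocate
asynchronously, while toggling the prealloc property at runtime stays
synchronous. A simplified standalone model of that decision (QEMU's real
phase_check()/phase_advance() live in the qdev core and enforce strict
phase ordering):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum {
        PHASE_NO_MACHINE,
        PHASE_ACCEL_CREATED,
        PHASE_LATE_BACKENDS_CREATED,
        PHASE_MACHINE_READY,
    } Phase;

    static Phase current_phase = PHASE_ACCEL_CREATED;

    static bool phase_check(Phase p) { return current_phase >= p; }
    static void phase_advance(Phase p) { current_phase = p; }

    int main(void)
    {
        /* Startup: late backends not yet created, async is allowed. */
        bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
        printf("startup backend: async=%d\n", async);   /* async=1 */

        phase_advance(PHASE_LATE_BACKENDS_CREATED);

        /* Runtime (e.g., object-add from the monitor): synchronous. */
        async = !phase_check(PHASE_LATE_BACKENDS_CREATED);
        printf("runtime backend: async=%d\n", async);   /* async=0 */
        return 0;
    }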

hw/hyperv/hv-balloon.c

@@ -1477,22 +1477,7 @@ static void hv_balloon_ensure_mr(HvBalloon *balloon)
balloon->mr = g_new0(MemoryRegion, 1);
memory_region_init(balloon->mr, OBJECT(balloon), TYPE_HV_BALLOON,
memory_region_size(hostmem_mr));
/*
* The VM can indicate an alignment up to 32 GiB. Memory device core can
* usually only handle/guarantee 1 GiB alignment. The user will have to
* specify a larger maxmem eventually.
*
* The memory device core will warn the user in case maxmem might have to be
* increased and will fail plugging the device if there is not sufficient
* space after alignment.
*
* TODO: we could do the alignment ourselves in a slightly bigger region.
* But this feels better, although the warning might be annoying. Maybe
* we can optimize that in the future (e.g., with such a device on the
* cmdline place/size the device memory region differently.
*/
balloon->mr->align = MAX(32 * GiB, memory_region_get_alignment(hostmem_mr));
balloon->mr->align = memory_region_get_alignment(hostmem_mr);
}
static void hv_balloon_free_mr(HvBalloon *balloon)
@@ -1654,6 +1639,25 @@ static MemoryRegion *hv_balloon_md_get_memory_region(MemoryDeviceState *md,
return balloon->mr;
}
static uint64_t hv_balloon_md_get_min_alignment(const MemoryDeviceState *md)
{
/*
* The VM can indicate an alignment up to 32 GiB. Memory device core can
* usually only handle/guarantee 1 GiB alignment. The user will have to
* specify a larger maxmem eventually.
*
* The memory device core will warn the user in case maxmem might have to be
* increased and will fail plugging the device if there is not sufficient
* space after alignment.
*
* TODO: we could do the alignment ourselves in a slightly bigger region.
* But this feels better, although the warning might be annoying. Maybe
* we can optimize that in the future (e.g., with such a device on the
* cmdline place/size the device memory region differently.
*/
return 32 * GiB;
}
static void hv_balloon_md_fill_device_info(const MemoryDeviceState *md,
MemoryDeviceInfo *info)
{
@@ -1766,5 +1770,6 @@ static void hv_balloon_class_init(ObjectClass *klass, void *data)
mdc->get_memory_region = hv_balloon_md_get_memory_region;
mdc->decide_memslots = hv_balloon_decide_memslots;
mdc->get_memslots = hv_balloon_get_memslots;
mdc->get_min_alignment = hv_balloon_md_get_min_alignment;
mdc->fill_device_info = hv_balloon_md_fill_device_info;
}
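The move from mr->align to get_min_alignment() lets the memory device core
reason about the 32 GiB requirement itself: it can warn that maxmem may need
to grow and fail the plug cleanly, instead of the region silently
over-aligning. A standalone sketch of why a large minimum alignment can eat
into the device memory area (illustrative numbers, not QEMU code):

    #include <inttypes.h>
    #include <stdio.h>

    #define GiB (1024ULL * 1024 * 1024)
    /* Round x up to the next multiple of a. */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    int main(void)
    {
        uint64_t base = 6 * GiB;        /* hypothetical start of the area */
        uint64_t min_align = 32 * GiB;  /* hv-balloon's minimum alignment */
        uint64_t aligned = ALIGN_UP(base, min_align);

        printf("base %" PRIu64 " GiB -> aligned %" PRIu64 " GiB, "
               "%" PRIu64 " GiB of the area lost to alignment\n",
               base / GiB, aligned / GiB, (aligned - base) / GiB);
        return 0;
    }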

hw/mem/memory-device.c

@@ -374,6 +374,20 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
goto out;
}
/*
* We always want the memory region size to be multiples of the memory
* region alignment: for example, DIMMs with 1G+1byte size don't make
* any sense. Note that we don't check that the size is multiples
* of any additional alignment requirements the memory device might
* have when it comes to the address in physical address space.
*/
if (!QEMU_IS_ALIGNED(memory_region_size(mr),
memory_region_get_alignment(mr))) {
error_setg(errp, "backend memory size must be multiple of 0x%"
PRIx64, memory_region_get_alignment(mr));
return;
}
if (legacy_align) {
align = *legacy_align;
} else {
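The reintroduced check rejects backends whose size is not a multiple of the
region's own alignment (the "1G+1byte" DIMM case from the comment above). A
standalone illustration, using a bitmask variant of the test that is valid
for power-of-two alignments (not QEMU's macro):

    #include <stdint.h>
    #include <stdio.h>

    /* Matches (v % a) == 0 when a is a power of two. */
    #define IS_ALIGNED(v, a) (((v) & ((a) - 1)) == 0)

    int main(void)
    {
        uint64_t align = 1ULL << 30;        /* 1 GiB region alignment */
        uint64_t ok    = 2ULL << 30;        /* 2 GiB: accepted */
        uint64_t bad   = (1ULL << 30) + 1;  /* 1 GiB + 1 byte: rejected */

        printf("2 GiB:       aligned=%d\n", IS_ALIGNED(ok, align));
        printf("1 GiB + 1B:  aligned=%d\n", IS_ALIGNED(bad, align));
        return 0;
    }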

hw/virtio/virtio-mem.c

@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
static bool warned;
/*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
error_report_err(local_err);
return -ENOMEM;
}

include/hw/qdev-core.h

@@ -1083,6 +1083,11 @@ typedef enum MachineInitPhase {
*/
PHASE_ACCEL_CREATED,
/*
* Late backend objects have been created and initialized.
*/
PHASE_LATE_BACKENDS_CREATED,
/*
* machine_class->init has been called, thus creating any embedded
* devices and validating machine properties. Devices created at

include/qemu/osdep.h

@@ -680,6 +680,8 @@ typedef struct ThreadContext ThreadContext;
* @area: start address of the are to preallocate
* @sz: the size of the area to preallocate
* @max_threads: maximum number of threads to use
* @tc: prealloc context threads pointer, NULL if not in use
* @async: request asynchronous preallocation, requires @tc
* @errp: returns an error if this function fails
*
* Preallocate memory (populate/prefault page tables writable) for the virtual
@@ -687,10 +689,24 @@ typedef struct ThreadContext ThreadContext;
* each page in the area was faulted in writable at least once, for example,
* after allocating file blocks for mapped files.
*
* When setting @async, allocation might be performed asynchronously.
* qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
* preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp);
ThreadContext *tc, bool async, Error **errp);
/**
* qemu_finish_async_prealloc_mem:
* @errp: returns an error if this function fails
*
* Finish all outstanding asynchronous memory preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_finish_async_prealloc_mem(Error **errp);
/**
* qemu_get_pid_name:
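In short, the documented contract is: any number of async requests may be
queued while the BQL is held during startup, and a single call to
qemu_finish_async_prealloc_mem() is the completion barrier for all of them.
A toy runnable model of that pairing (hypothetical stub names, not the real
implementation):

    #include <stdbool.h>
    #include <stdio.h>

    static int outstanding;   /* models the global list of queued contexts */

    static bool prealloc_mem_model(bool async)
    {
        if (async) {
            outstanding++;    /* queued; completed at the barrier below */
        }
        /* synchronous requests finish before returning */
        return true;
    }

    static bool finish_async_prealloc_mem_model(void)
    {
        printf("finishing %d outstanding request(s)\n", outstanding);
        outstanding = 0;      /* join all queued work */
        return true;
    }

    int main(void)
    {
        prealloc_mem_model(true);    /* memory backend 1 */
        prealloc_mem_model(true);    /* memory backend 2 */
        prealloc_mem_model(false);   /* e.g., virtio-mem plug: synchronous */
        return finish_async_prealloc_mem_model() ? 0 : 1;
    }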

system/vl.c

@@ -2013,6 +2013,14 @@ static void qemu_create_late_backends(void)
object_option_foreach_add(object_create_late);
/*
* Wait for any outstanding memory prealloc from created memory
* backends to complete.
*/
if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
exit(1);
}
if (tpm_init() < 0) {
exit(1);
}
@@ -3699,6 +3707,7 @@ void qemu_init(int argc, char **argv)
* over memory-backend-file objects).
*/
qemu_create_late_backends();
phase_advance(PHASE_LATE_BACKENDS_CREATED);
/*
* Note: creates a QOM object, must run only after global and

util/oslib-posix.c

@@ -42,6 +42,7 @@
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#include "qemu/main-loop.h"
#ifdef CONFIG_LINUX
#include <sys/syscall.h>
@@ -63,11 +64,15 @@
struct MemsetThread;
static QLIST_HEAD(, MemsetContext) memset_contexts =
QLIST_HEAD_INITIALIZER(memset_contexts);
typedef struct MemsetContext {
bool all_threads_created;
bool any_thread_failed;
struct MemsetThread *threads;
int num_threads;
QLIST_ENTRY(MemsetContext) next;
} MemsetContext;
struct MemsetThread {
@@ -412,19 +417,44 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
return ret;
}
static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
int i, ret = 0, tmp;
for (i = 0; i < context->num_threads; i++) {
tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
if (tmp) {
ret = tmp;
}
}
g_free(context->threads);
g_free(context);
return ret;
}
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int max_threads, ThreadContext *tc,
int max_threads, ThreadContext *tc, bool async,
bool use_madv_populate_write)
{
static gsize initialized = 0;
MemsetContext context = {
.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
};
MemsetContext *context = g_malloc0(sizeof(MemsetContext));
size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
int ret = 0, i = 0;
int ret, i = 0;
char *addr = area;
/*
* Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
* and prealloc context for thread placement.
*/
if (!use_madv_populate_write || !tc) {
async = false;
}
context->num_threads =
get_memset_num_threads(hpagesize, numpages, max_threads);
if (g_once_init_enter(&initialized)) {
qemu_mutex_init(&page_mutex);
qemu_cond_init(&page_cond);
@@ -432,8 +462,11 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
}
if (use_madv_populate_write) {
/* Avoid creating a single thread for MADV_POPULATE_WRITE */
if (context.num_threads == 1) {
/*
* Avoid creating a single thread for MADV_POPULATE_WRITE when
* preallocating synchronously.
*/
if (context->num_threads == 1 && !async) {
if (qemu_madvise(area, hpagesize * numpages,
QEMU_MADV_POPULATE_WRITE)) {
return -errno;
@@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
touch_fn = do_touch_pages;
}
context.threads = g_new0(MemsetThread, context.num_threads);
numpages_per_thread = numpages / context.num_threads;
leftover = numpages % context.num_threads;
for (i = 0; i < context.num_threads; i++) {
context.threads[i].addr = addr;
context.threads[i].numpages = numpages_per_thread + (i < leftover);
context.threads[i].hpagesize = hpagesize;
context.threads[i].context = &context;
context->threads = g_new0(MemsetThread, context->num_threads);
numpages_per_thread = numpages / context->num_threads;
leftover = numpages % context->num_threads;
for (i = 0; i < context->num_threads; i++) {
context->threads[i].addr = addr;
context->threads[i].numpages = numpages_per_thread + (i < leftover);
context->threads[i].hpagesize = hpagesize;
context->threads[i].context = context;
if (tc) {
thread_context_create_thread(tc, &context.threads[i].pgthread,
thread_context_create_thread(tc, &context->threads[i].pgthread,
"touch_pages",
touch_fn, &context.threads[i],
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
} else {
qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
touch_fn, &context.threads[i],
qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
}
addr += context.threads[i].numpages * hpagesize;
addr += context->threads[i].numpages * hpagesize;
}
if (async) {
/*
* async requests currently require the BQL. Add it to the list and kick
* preallocation off during qemu_finish_async_prealloc_mem().
*/
assert(bql_locked());
QLIST_INSERT_HEAD(&memset_contexts, context, next);
return 0;
}
if (!use_madv_populate_write) {
sigbus_memset_context = &context;
sigbus_memset_context = context;
}
qemu_mutex_lock(&page_mutex);
context.all_threads_created = true;
context->all_threads_created = true;
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);
for (i = 0; i < context.num_threads; i++) {
int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
ret = wait_and_free_mem_prealloc_context(context);
if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
}
return ret;
}
bool qemu_finish_async_prealloc_mem(Error **errp)
{
int ret = 0, tmp;
MemsetContext *context, *next_context;
/* Waiting for preallocation requires the BQL. */
assert(bql_locked());
if (QLIST_EMPTY(&memset_contexts)) {
return true;
}
qemu_mutex_lock(&page_mutex);
QLIST_FOREACH(context, &memset_contexts, next) {
context->all_threads_created = true;
}
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);
QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
QLIST_REMOVE(context, next);
tmp = wait_and_free_mem_prealloc_context(context);
if (tmp) {
ret = tmp;
}
}
if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
if (ret) {
error_setg_errno(errp, -ret,
"qemu_prealloc_mem: preallocating memory failed");
return false;
}
g_free(context.threads);
return ret;
return true;
}
static bool madv_populate_write_possible(char *area, size_t pagesize)
@@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
}
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
static gsize initialized;
int ret;
@@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
}
/* touch pages simultaneously */
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
use_madv_populate_write);
if (ret) {
error_setg_errno(errp, -ret,
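The control flow is easier to see stripped of QEMU's thread-context and
SIGBUS machinery. A standalone pthread model (illustrative only: one thread
per request rather than per page range, and no error propagation): queued
requests park their threads on a condition variable, and the finish call
releases and joins them all, so all backends fault their memory in
concurrently.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* One queued request, kept on a list like memset_contexts. */
    typedef struct Ctx {
        pthread_t thread;
        char *area;
        size_t size;
        struct Ctx *next;
    } Ctx;

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t start_cond = PTHREAD_COND_INITIALIZER;
    static bool start_all;
    static Ctx *pending;

    static void *touch_pages(void *arg)
    {
        Ctx *c = arg;
        size_t pg = (size_t)sysconf(_SC_PAGESIZE);

        /* Park until released, like waiting for all_threads_created. */
        pthread_mutex_lock(&lock);
        while (!start_all) {
            pthread_cond_wait(&start_cond, &lock);
        }
        pthread_mutex_unlock(&lock);

        for (size_t off = 0; off < c->size; off += pg) {
            c->area[off] = 0;            /* write-fault each page */
        }
        return NULL;
    }

    static void prealloc_async(char *area, size_t size)
    {
        Ctx *c = calloc(1, sizeof(*c));
        c->area = area;
        c->size = size;
        c->next = pending;               /* QLIST_INSERT_HEAD equivalent */
        pending = c;
        pthread_create(&c->thread, NULL, touch_pages, c);
    }

    static void finish_async_prealloc(void)
    {
        pthread_mutex_lock(&lock);
        start_all = true;                /* release every queued thread */
        pthread_cond_broadcast(&start_cond);
        pthread_mutex_unlock(&lock);

        while (pending) {                /* join and free each context */
            Ctx *c = pending;
            pending = c->next;
            pthread_join(c->thread, NULL);
            free(c);
        }
    }

    int main(void)
    {
        size_t sz = 16 * 1024 * 1024;
        char *a = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *b = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        prealloc_async(a, sz);           /* backend 1 queues its work */
        prealloc_async(b, sz);           /* backend 2 queues its work */
        finish_async_prealloc();         /* barrier: both run concurrently */
        puts("all preallocation finished");
        return 0;
    }

(Build with "cc -pthread"; in QEMU the corresponding barrier is the
qemu_finish_async_prealloc_mem() call in qemu_create_late_backends().)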

util/oslib-win32.c

@@ -265,7 +265,7 @@ int getpagesize(void)
}
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size();
@@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
return true;
}
bool qemu_finish_async_prealloc_mem(Error **errp)
{
/* async prealloc not supported, there is nothing to finish */
return true;
}
char *qemu_get_pid_name(pid_t pid)
{
/* XXX Implement me */