diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst
index a9acaf618e..9d1abd2587 100644
--- a/docs/devel/migration/features.rst
+++ b/docs/devel/migration/features.rst
@@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases.
    dirty-limit
    vfio
    virtio
+   mapped-ram
diff --git a/docs/devel/migration/mapped-ram.rst b/docs/devel/migration/mapped-ram.rst
new file mode 100644
index 0000000000..fa4cefd9fc
--- /dev/null
+++ b/docs/devel/migration/mapped-ram.rst
@@ -0,0 +1,138 @@
+Mapped-ram
+==========
+
+Mapped-ram is a new stream format for the RAM section designed to
+supplement the existing ``file:`` migration and make it compatible
+with ``multifd``. This enables parallel migration of a guest's RAM to
+a file.
+
+The core of the feature is to ensure that RAM pages are mapped
+directly to offsets in the resulting migration file. This enables the
+``multifd`` threads to write exclusively to those offsets even if the
+guest is constantly dirtying pages (i.e. live migration). Another
+benefit is that the resulting file will have a bounded size, since
+pages which are dirtied multiple times will always go to a fixed
+location in the file, rather than constantly being added to a
+sequential stream. Having the pages at fixed offsets also allows the
+use of O_DIRECT for save/restore of the migration stream, since the
+pages are guaranteed to be written at offsets that respect the
+O_DIRECT alignment restrictions (direct-io support not yet
+implemented).
+
+Usage
+-----
+
+On both source and destination, enable the ``multifd`` and
+``mapped-ram`` capabilities:
+
+    ``migrate_set_capability multifd on``
+
+    ``migrate_set_capability mapped-ram on``
+
+Use a ``file:`` URL for migration:
+
+    ``migrate file:/path/to/migration/file``
+
+Mapped-ram migration is best done non-live, i.e. by stopping the VM on
+the source side before migrating.
+
+Use-cases
+---------
+
+The mapped-ram feature was designed for use cases where the migration
+stream will be directed to a file in the filesystem and not
+immediately restored on the destination VM [#]_. These could be
+thought of as snapshots. We can further categorize them into live and
+non-live.
+
+- Non-live snapshot
+
+If the use case requires a VM to be stopped before taking a snapshot,
+that's the ideal scenario for mapped-ram migration. Since it does not
+need to track dirty pages, the migration writes the RAM pages to disk
+as fast as it can.
+
+Note: if a snapshot is taken of a running VM, but the VM will be
+stopped after the snapshot by the admin, then consider stopping it
+right before the snapshot to benefit from the performance gains
+mentioned above.
+
+- Live snapshot
+
+If the use case requires that the VM keeps running during and after
+the snapshot operation, then mapped-ram migration can still be used,
+but will be less performant. Other strategies such as
+background-snapshot should be evaluated as well. One benefit of
+mapped-ram in this scenario is portability, since background-snapshot
+depends on async dirty tracking (KVM_GET_DIRTY_LOG), which is not
+supported outside of Linux.
+
+.. [#] While the same effect could be obtained with the use of
+       snapshots or ``file:`` migration alone, mapped-ram provides
+       a performance increase for VMs with larger RAM sizes (tens to
+       hundreds of GiBs), especially if the VM has been stopped
+       beforehand.
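To make the bounded-size point concrete, here is a minimal POSIX-only sketch
(illustrative, not QEMU code; the file name, offset, and page size are made
up) of a page being dirtied three times yet landing at the same file offset
each time, so the file never grows past the last page slot::

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        char page[4096];
        int fd = open("snapshot.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
        off_t page_off = 0x100000; /* fixed slot for guest page 0 */

        if (fd < 0) {
            return 1;
        }

        for (int iter = 0; iter < 3; iter++) {
            memset(page, iter, sizeof(page));
            /* same offset every iteration: the write lands in place */
            pwrite(fd, page, sizeof(page), page_off);
        }

        close(fd);
        return 0;
    }

A sequential stream would instead append three copies of the page, one per
iteration, making the file size proportional to the amount of dirtying.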
+
+RAM section format
+------------------
+
+Instead of having a sequential stream of pages that follow the
+RAMBlock headers, the dirty pages for a RAMBlock follow its header.
+This ensures that each RAM page has a fixed offset in the resulting
+migration file.
+
+A bitmap is introduced to track which pages have been written to the
+migration file. Pages are written at a fixed location for every
+ramblock. Zero pages are ignored, as they would be zero on the
+destination anyway.
+
+::
+
+ Without mapped-ram:            With mapped-ram:
+
+ ---------------------          --------------------------------
+ | ramblock 1 header |          | ramblock 1 header            |
+ ---------------------          --------------------------------
+ | ramblock 2 header |          | ramblock 1 mapped-ram header |
+ ---------------------          --------------------------------
+ | ...               |          | padding to next 1MB boundary |
+ ---------------------          | ...                          |
+ | ramblock n header |          --------------------------------
+ ---------------------          | ramblock 1 pages             |
+ | RAM_SAVE_FLAG_EOS |          | ...                          |
+ ---------------------          --------------------------------
+ | stream of pages   |          | ramblock 2 header            |
+ | (iter 1)          |          --------------------------------
+ | ...               |          | ramblock 2 mapped-ram header |
+ ---------------------          --------------------------------
+ | RAM_SAVE_FLAG_EOS |          | padding to next 1MB boundary |
+ ---------------------          | ...                          |
+ | stream of pages   |          --------------------------------
+ | (iter 2)          |          | ramblock 2 pages             |
+ | ...               |          | ...                          |
+ ---------------------          --------------------------------
+ | ...               |          | ...                          |
+ ---------------------          --------------------------------
+                                | RAM_SAVE_FLAG_EOS            |
+                                --------------------------------
+                                | ...                          |
+                                --------------------------------
+
+where:
+ - ramblock header: the generic information for a ramblock, such as
+   idstr, used_len, etc.
+
+ - ramblock mapped-ram header: the information added by this feature:
+   bitmap of pages written, bitmap size and offset of pages in the
+   migration file.
+
+Restrictions
+------------
+
+Since pages are written to their relative offsets and out of order
+(due to the memory dirtying patterns), streaming channels such as
+sockets are not supported. A seekable channel such as a file is
+required. This can be verified in the QIOChannel by the presence of
+the QIO_CHANNEL_FEATURE_SEEKABLE feature.
+
+The improvements brought by this feature apply only to guest physical
+RAM. Other types of memory such as VRAM are migrated as part of device
+states.
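The layout above implies a simple lookup rule for readers of the format. The
following standalone C sketch (illustrative only; ``BlockInfo``,
``page_written`` and ``page_file_offset`` are hypothetical names, not QEMU
code) shows how a page's presence and position follow directly from the
mapped-ram header fields::

    #include <stdbool.h>
    #include <stdint.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    /* Per-ramblock metadata, as described by the mapped-ram header */
    typedef struct {
        uint64_t pages_offset;        /* where this block's pages start */
        const unsigned long *bitmap;  /* one bit per page: written or not */
    } BlockInfo;

    static bool page_written(const BlockInfo *b, uint64_t page_idx)
    {
        return b->bitmap[page_idx / BITS_PER_LONG] &
               (1UL << (page_idx % BITS_PER_LONG));
    }

    /* A page whose bit is clear (e.g. a zero page) has no data on file. */
    static int64_t page_file_offset(const BlockInfo *b, uint64_t page_idx,
                                    uint64_t page_size)
    {
        if (!page_written(b, page_idx)) {
            return -1; /* hole in the file; treat the page as zero */
        }
        return b->pages_offset + page_idx * page_size;
    }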
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 3eb79723c6..848915ea5b 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -44,6 +44,19 @@ struct RAMBlock {
     size_t page_size;
     /* dirty bitmap used during migration */
     unsigned long *bmap;
+
+    /*
+     * Below fields are only used by mapped-ram migration
+     */
+    /* bitmap of pages present in the migration file */
+    unsigned long *file_bmap;
+    /*
+     * offset in the file where pages belonging to this ramblock are
+     * saved, used only during migration to a file.
+     */
+    off_t bitmap_offset;
+    uint64_t pages_offset;
+
     /* bitmap of already received pages in postcopy */
     unsigned long *receivedmap;
diff --git a/include/io/channel.h b/include/io/channel.h
index 5f9dbaab65..7986c49c71 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -44,6 +44,7 @@ enum QIOChannelFeature {
     QIO_CHANNEL_FEATURE_LISTEN,
     QIO_CHANNEL_FEATURE_WRITE_ZERO_COPY,
     QIO_CHANNEL_FEATURE_READ_MSG_PEEK,
+    QIO_CHANNEL_FEATURE_SEEKABLE,
 };
 
@@ -130,6 +131,16 @@ struct QIOChannelClass {
                            Error **errp);
 
     /* Optional callbacks */
+    ssize_t (*io_pwritev)(QIOChannel *ioc,
+                          const struct iovec *iov,
+                          size_t niov,
+                          off_t offset,
+                          Error **errp);
+    ssize_t (*io_preadv)(QIOChannel *ioc,
+                         const struct iovec *iov,
+                         size_t niov,
+                         off_t offset,
+                         Error **errp);
     int (*io_shutdown)(QIOChannel *ioc,
                        QIOChannelShutdown how,
                        Error **errp);
@@ -528,6 +539,78 @@ void qio_channel_set_follow_coroutine_ctx(QIOChannel *ioc, bool enabled);
 int qio_channel_close(QIOChannel *ioc,
                       Error **errp);
 
+/**
+ * qio_channel_pwritev
+ * @ioc: the channel object
+ * @iov: the array of memory regions to write data from
+ * @niov: the length of the @iov array
+ * @offset: offset in the channel where writes should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ * Behaves as qio_channel_writev_full, except that it does not support
+ * sending of file handles and the write begins at the passed @offset.
+ *
+ */
+ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
+                            size_t niov, off_t offset, Error **errp);
+
+/**
+ * qio_channel_pwrite
+ * @ioc: the channel object
+ * @buf: the memory region to write data from
+ * @buflen: the number of bytes in @buf
+ * @offset: offset in the channel where writes should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ */
+ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
+                           off_t offset, Error **errp);
+
+/**
+ * qio_channel_preadv
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data into
+ * @niov: the length of the @iov array
+ * @offset: offset in the channel where reads should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ * Behaves as qio_channel_readv_full, except that it does not support
+ * receiving of file handles and the read begins at the passed @offset.
+ *
+ */
+ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
+                           size_t niov, off_t offset, Error **errp);
+
+/**
+ * qio_channel_pread
+ * @ioc: the channel object
+ * @buf: the memory region to read data into
+ * @buflen: the number of bytes in @buf
+ * @offset: offset in the channel where reads should begin
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Not all implementations will support this facility, so may report
+ * an error. To avoid errors, the caller may check for the feature
+ * flag QIO_CHANNEL_FEATURE_SEEKABLE prior to calling this method.
+ *
+ */
+ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
+                          off_t offset, Error **errp);
+
 /**
  * qio_channel_shutdown:
  * @ioc: the channel object
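For illustration, a hypothetical caller of the API declared above could look
like the following (``write_at_fixed_offset`` is an invented name; the
feature probe and the short-write handling are the points of interest; a
real caller might need to loop on short writes)::

    #include "qemu/osdep.h"
    #include "io/channel.h"
    #include "qapi/error.h"

    static int write_at_fixed_offset(QIOChannel *ioc, char *buf, size_t len,
                                     off_t offset, Error **errp)
    {
        ssize_t ret;

        /* positioned I/O only makes sense on a seekable channel */
        if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) {
            error_setg(errp, "channel is not seekable");
            return -1;
        }

        ret = qio_channel_pwrite(ioc, buf, len, offset, errp);
        if (ret < 0) {
            return -1;                 /* errp already set */
        }
        if ((size_t)ret != len) {
            error_setg(errp, "short write: %zd of %zu bytes", ret, len);
            return -1;
        }
        return 0;
    }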
diff --git a/include/migration/qemu-file-types.h b/include/migration/qemu-file-types.h
index 9ba163f333..adec5abc07 100644
--- a/include/migration/qemu-file-types.h
+++ b/include/migration/qemu-file-types.h
@@ -50,6 +50,8 @@ unsigned int qemu_get_be16(QEMUFile *f);
 unsigned int qemu_get_be32(QEMUFile *f);
 uint64_t qemu_get_be64(QEMUFile *f);
 
+bool qemu_file_is_seekable(QEMUFile *f);
+
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
     qemu_put_be64(f, *pv);
diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
index cb3526d1f4..2c0a2fe751 100644
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -67,6 +67,19 @@ static inline void clear_bit(long nr, unsigned long *addr)
     *p &= ~mask;
 }
 
+/**
+ * clear_bit_atomic - Clears a bit in memory atomically
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ */
+static inline void clear_bit_atomic(long nr, unsigned long *addr)
+{
+    unsigned long mask = BIT_MASK(nr);
+    unsigned long *p = addr + BIT_WORD(nr);
+
+    qatomic_and(p, ~mask);
+}
+
 /**
  * change_bit - Toggle a bit in memory
  * @nr: Bit to change
diff --git a/io/channel-file.c b/io/channel-file.c
index 4a12c61886..d4706fa592 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -36,6 +36,10 @@ qio_channel_file_new_fd(int fd)
 
     ioc->fd = fd;
 
+    if (lseek(fd, 0, SEEK_CUR) != (off_t)-1) {
+        qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
+    }
+
     trace_qio_channel_file_new_fd(ioc, fd);
 
     return ioc;
@@ -60,6 +64,10 @@ qio_channel_file_new_path(const char *path,
         return NULL;
     }
 
+    if (lseek(ioc->fd, 0, SEEK_CUR) != (off_t)-1) {
+        qio_channel_set_feature(QIO_CHANNEL(ioc), QIO_CHANNEL_FEATURE_SEEKABLE);
+    }
+
     trace_qio_channel_file_new_path(ioc, path, flags, mode, ioc->fd);
 
     return ioc;
@@ -138,6 +146,58 @@ static ssize_t qio_channel_file_writev(QIOChannel *ioc,
     return ret;
 }
 
+#ifdef CONFIG_PREADV
+static ssize_t qio_channel_file_preadv(QIOChannel *ioc,
+                                       const struct iovec *iov,
+                                       size_t niov,
+                                       off_t offset,
+                                       Error **errp)
+{
+    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+    ssize_t ret;
+
+ retry:
+    ret = preadv(fioc->fd, iov, niov, offset);
+    if (ret < 0) {
+        if (errno == EAGAIN) {
+            return QIO_CHANNEL_ERR_BLOCK;
+        }
+        if (errno == EINTR) {
+            goto retry;
+        }
+
+        error_setg_errno(errp, errno, "Unable to read from file");
+        return -1;
+    }
+
+    return ret;
+}
+
+static ssize_t qio_channel_file_pwritev(QIOChannel *ioc,
+                                        const struct iovec *iov,
+                                        size_t niov,
+                                        off_t offset,
+                                        Error **errp)
+{
+    QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
+    ssize_t ret;
+
+ retry:
+    ret = pwritev(fioc->fd, iov, niov, offset);
+    if (ret < 0) {
+        if (errno == EAGAIN) {
+            return QIO_CHANNEL_ERR_BLOCK;
+        }
+        if (errno == EINTR) {
+            goto retry;
+        }
+        error_setg_errno(errp, errno, "Unable to write to file");
+        return -1;
+    }
+    return ret;
+}
+#endif /* CONFIG_PREADV */
+
 static int qio_channel_file_set_blocking(QIOChannel *ioc,
                                          bool enabled,
                                          Error **errp)
@@ -182,6 +242,11 @@ static int qio_channel_file_close(QIOChannel *ioc,
 {
     QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
 
+    if (qemu_fdatasync(fioc->fd) < 0) {
+        error_setg_errno(errp, errno,
+                         "Unable to synchronize file data with storage device");
+        return -1;
+    }
     if (qemu_close(fioc->fd) < 0) {
         error_setg_errno(errp,
errno, "Unable to close file"); @@ -223,6 +288,10 @@ static void qio_channel_file_class_init(ObjectClass *klass, ioc_klass->io_writev = qio_channel_file_writev; ioc_klass->io_readv = qio_channel_file_readv; ioc_klass->io_set_blocking = qio_channel_file_set_blocking; +#ifdef CONFIG_PREADV + ioc_klass->io_pwritev = qio_channel_file_pwritev; + ioc_klass->io_preadv = qio_channel_file_preadv; +#endif ioc_klass->io_seek = qio_channel_file_seek; ioc_klass->io_close = qio_channel_file_close; ioc_klass->io_create_watch = qio_channel_file_create_watch; diff --git a/io/channel.c b/io/channel.c index 86c5834510..a1f12f8e90 100644 --- a/io/channel.c +++ b/io/channel.c @@ -454,6 +454,64 @@ GSource *qio_channel_add_watch_source(QIOChannel *ioc, } +ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_pwritev) { + error_setg(errp, "Channel does not support pwritev"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_pwritev(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_pwritev(ioc, &iov, 1, offset, errp); +} + +ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov, + size_t niov, off_t offset, Error **errp) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + if (!klass->io_preadv) { + error_setg(errp, "Channel does not support preadv"); + return -1; + } + + if (!qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_SEEKABLE)) { + error_setg_errno(errp, EINVAL, "Requested channel is not seekable"); + return -1; + } + + return klass->io_preadv(ioc, iov, niov, offset, errp); +} + +ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen, + off_t offset, Error **errp) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = buflen + }; + + return qio_channel_preadv(ioc, &iov, 1, offset, errp); +} + int qio_channel_shutdown(QIOChannel *ioc, QIOChannelShutdown how, Error **errp) diff --git a/migration/fd.c b/migration/fd.c index 0eb677dcae..d4ae72d132 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -15,18 +15,41 @@ */ #include "qemu/osdep.h" +#include "qapi/error.h" #include "channel.h" #include "fd.h" #include "migration.h" #include "monitor/monitor.h" +#include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" +static struct FdOutgoingArgs { + int fd; +} outgoing_args; + +int fd_args_get_fd(void) +{ + return outgoing_args.fd; +} + +void fd_cleanup_outgoing_migration(void) +{ + if (outgoing_args.fd > 0) { + close(outgoing_args.fd); + outgoing_args.fd = -1; + } +} + void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); + + outgoing_args.fd = -1; + if (fd == -1) { return; } @@ -38,6 +61,8 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } + outgoing_args.fd = fd; + qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); object_unref(OBJECT(ioc)); @@ -73,4 +98,23 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) fd_accept_incoming_migration, NULL, NULL, 
g_main_context_get_thread_default()); + + if (migrate_multifd()) { + int channels = migrate_multifd_channels(); + + while (channels--) { + ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd))); + + if (QIO_CHANNEL_FILE(ioc)->fd == -1) { + error_setg(errp, "Failed to duplicate fd %d", fd); + return; + } + + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + } + } } diff --git a/migration/fd.h b/migration/fd.h index b901bc014e..0c0a18d9e7 100644 --- a/migration/fd.h +++ b/migration/fd.h @@ -20,4 +20,6 @@ void fd_start_incoming_migration(const char *fdname, Error **errp); void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp); +void fd_cleanup_outgoing_migration(void); +int fd_args_get_fd(void); #endif diff --git a/migration/file.c b/migration/file.c index 5d4975f43e..164b079966 100644 --- a/migration/file.c +++ b/migration/file.c @@ -6,17 +6,25 @@ */ #include "qemu/osdep.h" +#include "exec/ramblock.h" #include "qemu/cutils.h" +#include "qemu/error-report.h" #include "qapi/error.h" #include "channel.h" +#include "fd.h" #include "file.h" #include "migration.h" #include "io/channel-file.h" #include "io/channel-util.h" +#include "options.h" #include "trace.h" #define OFFSET_OPTION ",offset=" +static struct FileOutgoingArgs { + char *fname; +} outgoing_args; + /* Remove the offset option from @filespec and return it in @offsetp. */ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) @@ -36,6 +44,41 @@ int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp) return 0; } +void file_cleanup_outgoing_migration(void) +{ + g_free(outgoing_args.fname); + outgoing_args.fname = NULL; +} + +bool file_send_channel_create(gpointer opaque, Error **errp) +{ + QIOChannelFile *ioc; + int flags = O_WRONLY; + bool ret = false; + int fd = fd_args_get_fd(); + + if (fd && fd != -1) { + ioc = qio_channel_file_new_fd(dup(fd)); + } else { + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); + if (!ioc) { + goto out; + } + } + + multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); + ret = true; + +out: + /* + * File channel creation is synchronous. However posting this + * semaphore here is simpler than adding a special case. 
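+     * multifd_send_setup() waits once per channel on the
+     * channels_created semaphore regardless of the transport, so the
+     * file path has to post it as well.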
+ */ + multifd_send_channel_created(); + + return ret; +} + void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp) { @@ -52,6 +95,8 @@ void file_start_outgoing_migration(MigrationState *s, return; } + outgoing_args.fname = g_strdup(filename); + ioc = QIO_CHANNEL(fioc); if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { return; @@ -74,7 +119,8 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) g_autofree char *filename = g_strdup(file_args->filename); QIOChannelFile *fioc = NULL; uint64_t offset = file_args->offset; - QIOChannel *ioc; + int channels = 1; + int i = 0; trace_migration_file_incoming(filename); @@ -83,13 +129,100 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) return; } - ioc = QIO_CHANNEL(fioc); - if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) { + if (offset && + qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) { return; } - qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - file_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); + + if (migrate_multifd()) { + channels += migrate_multifd_channels(); + } + + do { + QIOChannel *ioc = QIO_CHANNEL(fioc); + + qio_channel_set_name(ioc, "migration-file-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + + fioc = qio_channel_file_new_fd(dup(fioc->fd)); + + if (!fioc || fioc->fd == -1) { + error_setg(errp, "Error creating migration incoming channel"); + break; + } + } while (++i < channels); +} + +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp) +{ + ssize_t ret = -1; + int i, slice_idx, slice_num; + uintptr_t base, next, offset; + size_t len; + + slice_idx = 0; + slice_num = 1; + + /* + * If the iov array doesn't have contiguous elements, we need to + * split it in slices because we only have one file offset for the + * whole iov. Do this here so callers don't need to break the iov + * array themselves. + */ + for (i = 0; i < niov; i++, slice_num++) { + base = (uintptr_t) iov[i].iov_base; + + if (i != niov - 1) { + len = iov[i].iov_len; + next = (uintptr_t) iov[i + 1].iov_base; + + if (base + len == next) { + continue; + } + } + + /* + * Use the offset of the first element of the segment that + * we're sending. + */ + offset = (uintptr_t) iov[slice_idx].iov_base - (uintptr_t) block->host; + if (offset >= block->used_length) { + error_setg(errp, "offset " RAM_ADDR_FMT + "outside of ramblock %s range", offset, block->idstr); + ret = -1; + break; + } + + ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num, + block->pages_offset + offset, errp); + if (ret < 0) { + break; + } + + slice_idx += slice_num; + slice_num = 0; + } + + return (ret < 0) ? 
ret : 0; +} + +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp) +{ + MultiFDRecvData *data = p->data; + size_t ret; + + ret = qio_channel_pread(p->c, (char *) data->opaque, + data->size, data->file_offset, errp); + if (ret != data->size) { + error_prepend(errp, + "multifd recv (%u): read 0x%zx, expected 0x%zx", + p->id, ret, data->size); + return -1; + } + + return 0; } diff --git a/migration/file.h b/migration/file.h index 37d6a08bfc..9f71e87f74 100644 --- a/migration/file.h +++ b/migration/file.h @@ -9,10 +9,18 @@ #define QEMU_MIGRATION_FILE_H #include "qapi/qapi-types-migration.h" +#include "io/task.h" +#include "channel.h" +#include "multifd.h" void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp); void file_start_outgoing_migration(MigrationState *s, FileMigrationArgs *file_args, Error **errp); int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); +void file_cleanup_outgoing_migration(void); +bool file_send_channel_create(gpointer opaque, Error **errp); +int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, + int niov, RAMBlock *block, Error **errp); +int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); #endif diff --git a/migration/migration.c b/migration/migration.c index bab68bcbef..a49fcd53ee 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -140,9 +140,38 @@ static bool transport_supports_multi_channels(MigrationAddress *addr) if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { SocketAddress *saddr = &addr->u.socket; - return saddr->type == SOCKET_ADDRESS_TYPE_INET || - saddr->type == SOCKET_ADDRESS_TYPE_UNIX || - saddr->type == SOCKET_ADDRESS_TYPE_VSOCK; + if (saddr->type == SOCKET_ADDRESS_TYPE_FD) { + return migrate_mapped_ram(); + } + + return (saddr->type == SOCKET_ADDRESS_TYPE_INET || + saddr->type == SOCKET_ADDRESS_TYPE_UNIX || + saddr->type == SOCKET_ADDRESS_TYPE_VSOCK); + } else if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return migrate_mapped_ram(); + } else { + return false; + } +} + +static bool migration_needs_seekable_channel(void) +{ + return migrate_mapped_ram(); +} + +static bool transport_supports_seeking(MigrationAddress *addr) +{ + if (addr->transport == MIGRATION_ADDRESS_TYPE_FILE) { + return true; + } + + /* + * At this point, the user might not yet have passed the file + * descriptor to QEMU, so we cannot know for sure whether it + * refers to a plain file or a socket. Let it through anyway. + */ + if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { + return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; } return false; @@ -152,6 +181,12 @@ static bool migration_channels_and_transport_compatible(MigrationAddress *addr, Error **errp) { + if (migration_needs_seekable_channel() && + !transport_supports_seeking(addr)) { + error_setg(errp, "Migration requires seekable transport (e.g. file)"); + return false; + } + if (migration_needs_multiple_sockets() && !transport_supports_multi_channels(addr)) { error_setg(errp, "Migration requires multi-channel URIs (e.g. 
tcp)"); @@ -881,7 +916,8 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) uint32_t channel_magic = 0; int ret = 0; - if (migrate_multifd() && !migrate_postcopy_ram() && + if (migrate_multifd() && !migrate_mapped_ram() && + !migrate_postcopy_ram() && qio_channel_has_feature(ioc, QIO_CHANNEL_FEATURE_READ_MSG_PEEK)) { /* * With multiple channels, it is possible that we receive channels @@ -1950,6 +1986,18 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc, return false; } + if (migrate_mapped_ram()) { + if (migrate_tls()) { + error_setg(errp, "Cannot use TLS with mapped-ram"); + return false; + } + + if (migrate_multifd_compression()) { + error_setg(errp, "Cannot use compression with mapped-ram"); + return false; + } + } + if (migrate_mode_is_cpr(s)) { const char *conflict = NULL; diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 012e3bdea1..6120faad65 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; return 0; err_free_zbuff: @@ -92,15 +92,15 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -224,17 +224,17 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -244,9 +244,9 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ @@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index dc8fe43e94..cac236833d 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; + p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -183,7 +183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -221,18 +221,18 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. 
@@ -242,13 +242,13 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; int i; @@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { .send_prepare = zstd_send_prepare, .recv_setup = zstd_recv_setup, .recv_cleanup = zstd_recv_cleanup, - .recv_pages = zstd_recv_pages + .recv = zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index 6c07f19af1..d4a44da559 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -17,7 +17,8 @@ #include "exec/ramblock.h" #include "qemu/error-report.h" #include "qapi/error.h" -#include "ram.h" +#include "fd.h" +#include "file.h" #include "migration.h" #include "migration-stats.h" #include "socket.h" @@ -28,6 +29,7 @@ #include "threadinfo.h" #include "options.h" #include "qemu/yank.h" +#include "io/channel-file.h" #include "io/channel-socket.h" #include "yank_functions.h" @@ -81,9 +83,13 @@ struct { struct { MultiFDRecvParams *params; + MultiFDRecvData *data; /* number of created threads */ int count; - /* syncs main thread and channels */ + /* + * This is always posted by the recv threads, the migration thread + * uses it to wait for recv threads to finish assigned tasks. + */ QemuSemaphore sem_sync; /* global number of generated multifd packets */ uint64_t packet_num; @@ -92,6 +98,27 @@ struct { MultiFDMethods *ops; } *multifd_recv_state; +static bool multifd_use_packets(void) +{ + return !migrate_mapped_ram(); +} + +void multifd_send_channel_created(void) +{ + qemu_sem_post(&multifd_send_state->channels_created); +} + +static void multifd_set_file_bitmap(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + assert(pages->block); + + for (int i = 0; i < p->pages->num; i++) { + ramblock_set_file_bmap_atomic(pages->block, pages->offset[i]); + } +} + /* Multifd without compression */ /** @@ -122,6 +149,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -136,9 +176,15 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { bool use_zero_copy_send = migrate_zero_copy_send(); - MultiFDPages_t *pages = p->pages; int ret; + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + multifd_set_file_bitmap(p); + + return 0; + } + if (!use_zero_copy_send) { /* * Only !zerocopy needs the header in IOV; zerocopy will @@ -147,13 +193,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) multifd_send_prepare_header(p); } - for (int i = 0; i < pages->num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - 
p->next_packet_size = pages->num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; multifd_send_fill_packet(p); @@ -197,7 +237,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. * @@ -206,9 +246,15 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return multifd_file_recv_data(p, errp); + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -228,7 +274,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -663,6 +709,19 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); + /* + * An explicit close() on the channel here is normally not + * required, but can be helpful for "file:" iochannels, where it + * will include fdatasync() to make sure the data is flushed to the + * disk backend. + * + * The object_unref() cannot guarantee that because: (1) finalize() + * of the iochannel is only triggered on the last reference, and + * it's not guaranteed that we always hold the last refcount when + * reaching here, and, (2) even if finalize() is invoked, it only + * does a close(fd) without data flush. + */ + qio_channel_close(p->c, &error_abort); object_unref(OBJECT(p->c)); p->c = NULL; } @@ -684,6 +743,8 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + file_cleanup_outgoing_migration(); + fd_cleanup_outgoing_migration(); socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); @@ -795,15 +856,18 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } while (true) { @@ -829,8 +893,15 @@ static void *multifd_send_thread(void *opaque) break; } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, - 0, p->write_flags, &local_err); + if (migrate_mapped_ram()) { + ret = file_write_ramblock_iov(p->c, p->iov, p->iovs_num, + p->pages->block, &local_err); + } else { + ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, + NULL, 0, p->write_flags, + &local_err); + } + if (ret != 0) { break; } @@ -854,16 +925,20 @@ static void *multifd_send_thread(void *opaque) * it doesn't require explicit memory barriers. 
*/ assert(qatomic_read(&p->pending_sync)); - p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; } - /* p->next_packet_size will always be zero for a SYNC packet */ - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; + qatomic_set(&p->pending_sync, false); qemu_sem_post(&p->sem_sync); } @@ -939,7 +1014,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -990,7 +1065,7 @@ out: * Here we're not interested whether creation succeeded, only that * it happened at all. */ - qemu_sem_post(&multifd_send_state->channels_created); + multifd_send_channel_created(); if (ret) { return; @@ -1007,9 +1082,14 @@ out: error_free(local_err); } -static void multifd_new_send_channel_create(gpointer opaque) +static bool multifd_new_send_channel_create(gpointer opaque, Error **errp) { + if (!multifd_use_packets()) { + return file_send_channel_create(opaque, errp); + } + socket_send_channel_create(multifd_new_send_channel_async, opaque); + return true; } bool multifd_send_setup(void) @@ -1018,6 +1098,7 @@ bool multifd_send_setup(void) Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; if (!migrate_multifd()) { @@ -1040,18 +1121,27 @@ bool multifd_send_setup(void) qemu_sem_init(&p->sem_sync, 0); p->id = i; p->pages = multifd_pages_init(page_count); - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } p->name = g_strdup_printf("multifdsend_%d", i); - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; - multifd_new_send_channel_create(p); + + if (!multifd_new_send_channel_create(p, &local_err)) { + return false; + } } /* @@ -1083,6 +1173,57 @@ bool multifd_send_setup(void) return true; } +bool multifd_recv(void) +{ + int i; + static int next_recv_channel; + MultiFDRecvParams *p = NULL; + MultiFDRecvData *data = multifd_recv_state->data; + + /* + * next_channel can remain from a previous migration that was + * using more channels, so ensure it doesn't overflow if the + * limit is lower now. 
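+     * (e.g. the previous migration used 8 channels while this one
+     * only uses 4)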
+ */ + next_recv_channel %= migrate_multifd_channels(); + for (i = next_recv_channel;; i = (i + 1) % migrate_multifd_channels()) { + if (multifd_recv_should_exit()) { + return false; + } + + p = &multifd_recv_state->params[i]; + + if (qatomic_read(&p->pending_job) == false) { + next_recv_channel = (i + 1) % migrate_multifd_channels(); + break; + } + } + + /* + * Order pending_job read before manipulating p->data below. Pairs + * with qatomic_store_release() at multifd_recv_thread(). + */ + smp_mb_acquire(); + + assert(!p->data->size); + multifd_recv_state->data = p->data; + p->data = data; + + /* + * Order p->data update before setting pending_job. Pairs with + * qatomic_load_acquire() at multifd_recv_thread(). + */ + qatomic_store_release(&p->pending_job, true); + qemu_sem_post(&p->sem); + + return true; +} + +MultiFDRecvData *multifd_get_recv_data(void) +{ + return multifd_recv_state->data; +} + static void multifd_recv_terminate_threads(Error *err) { int i; @@ -1107,10 +1248,27 @@ static void multifd_recv_terminate_threads(Error *err) MultiFDRecvParams *p = &multifd_recv_state->params[i]; /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. + * The migration thread and channels interact differently + * depending on the presence of packets. */ - qemu_sem_post(&p->sem_sync); + if (multifd_use_packets()) { + /* + * The channel receives as long as there are packets. When + * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the + * channel waits for the migration thread to sync. If the + * sync never happens, do it here. + */ + qemu_sem_post(&p->sem_sync); + } else { + /* + * The channel waits for the migration thread to give it + * work. When the migration thread runs out of work, it + * releases the channel and waits for any pending work to + * finish. If we reach here (e.g. due to error) before the + * work runs out, release the channel. + */ + qemu_sem_post(&p->sem); + } /* * We could arrive here for two reasons: @@ -1138,6 +1296,7 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->c = NULL; qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem_sync); + qemu_sem_destroy(&p->sem); g_free(p->name); p->name = NULL; p->packet_len = 0; @@ -1155,6 +1314,8 @@ static void multifd_recv_cleanup_state(void) qemu_sem_destroy(&multifd_recv_state->sem_sync); g_free(multifd_recv_state->params); multifd_recv_state->params = NULL; + g_free(multifd_recv_state->data); + multifd_recv_state->data = NULL; g_free(multifd_recv_state); multifd_recv_state = NULL; } @@ -1182,18 +1343,53 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); + bool file_based = !multifd_use_packets(); int i; if (!migrate_multifd()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * File-based channels don't use packets and therefore need to + * wait for more work. Release them to start the sync. + */ + if (file_based) { + for (i = 0; i < thread_count; i++) { + MultiFDRecvParams *p = &multifd_recv_state->params[i]; + + trace_multifd_recv_sync_main_signal(p->id); + qemu_sem_post(&p->sem); + } + } + + /* + * Initiate the synchronization by waiting for all channels. + * + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. 
+ * + * For file-based migration this means each channel is done with + * the work (pending_job=false). + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + if (file_based) { + /* + * For file-based loading is done in one iteration. We're + * done. + */ + return; + } + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { @@ -1211,46 +1407,87 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; + bool has_data = false; + p->normal_num = 0; - if (multifd_recv_should_exit()) { - break; - } + if (use_packets) { + if (multifd_recv_should_exit()) { + break; + } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - break; + } else { + /* + * No packets, so we need to wait for the vmstate code to + * give us work. + */ + qemu_sem_wait(&p->sem); + + if (multifd_recv_should_exit()) { + break; + } + + /* pairs with qatomic_store_release() at multifd_recv() */ + if (!qatomic_load_acquire(&p->pending_job)) { + /* + * Migration thread did not send work, this is + * equivalent to pending_sync on the sending + * side. Post sem_sync to notify we reached this + * point. + */ + qemu_sem_post(&multifd_recv_state->sem_sync); + continue; + } + + has_data = !!p->data->size; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - qemu_mutex_unlock(&p->mutex); - - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } + } else { + p->total_normal_pages += p->data->size / qemu_target_page_size(); + p->data->size = 0; + /* + * Order data->size update before clearing + * pending_job. Pairs with smp_mb_acquire() at + * multifd_recv(). 
+ */ + qatomic_store_release(&p->pending_job, false); } } @@ -1269,6 +1506,7 @@ int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1282,6 +1520,10 @@ int multifd_recv_setup(Error **errp) thread_count = migrate_multifd_channels(); multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); + + multifd_recv_state->data = g_new0(MultiFDRecvData, 1); + multifd_recv_state->data->size = 0; + qatomic_set(&multifd_recv_state->count, 0); qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); @@ -1292,10 +1534,18 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); + qemu_sem_init(&p->sem, 0); + p->pending_job = false; p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + p->data = g_new0(MultiFDRecvData, 1); + p->data->size = 0; + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); @@ -1339,18 +1589,23 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + id = qatomic_read(&multifd_recv_state->count); } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { diff --git a/migration/multifd.h b/migration/multifd.h index b3fe27ae93..7447c2bea3 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,8 +13,13 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "ram.h" + +typedef struct MultiFDRecvData MultiFDRecvData; + bool multifd_send_setup(void); void multifd_send_shutdown(void); +void multifd_send_channel_created(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); void multifd_recv_shutdown(void); @@ -23,6 +28,8 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_recv(void); +MultiFDRecvData *multifd_get_recv_data(void); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) @@ -63,6 +70,13 @@ typedef struct { RAMBlock *block; } MultiFDPages_t; +struct MultiFDRecvData { + void *opaque; + size_t size; + /* for preadv */ + off_t file_offset; +}; + typedef struct { /* Fields are only written at creating/deletion time */ /* No lock required for them, they are read only */ @@ -127,7 +141,7 @@ typedef struct { /* number of iovs used */ 
uint32_t iovs_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -152,6 +166,8 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; + /* sem where to wait for more work */ + QemuSemaphore sem; /* this mutex protects the following parameters */ QemuMutex mutex; @@ -161,6 +177,8 @@ typedef struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; + int pending_job; + MultiFDRecvData *data; /* thread local variables. No locking required */ @@ -183,7 +201,7 @@ typedef struct { /* num of non zero pages */ uint32_t normal_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { @@ -197,8 +215,8 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); @@ -211,5 +229,6 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) p->iovs_num++; } +void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc); #endif diff --git a/migration/options.c b/migration/options.c index 3e3e0b93b4..40eb930940 100644 --- a/migration/options.c +++ b/migration/options.c @@ -204,6 +204,7 @@ Property migration_properties[] = { DEFINE_PROP_MIG_CAP("x-switchover-ack", MIGRATION_CAPABILITY_SWITCHOVER_ACK), DEFINE_PROP_MIG_CAP("x-dirty-limit", MIGRATION_CAPABILITY_DIRTY_LIMIT), + DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM), DEFINE_PROP_END_OF_LIST(), }; @@ -263,6 +264,13 @@ bool migrate_events(void) return s->capabilities[MIGRATION_CAPABILITY_EVENTS]; } +bool migrate_mapped_ram(void) +{ + MigrationState *s = migrate_get_current(); + + return s->capabilities[MIGRATION_CAPABILITY_MAPPED_RAM]; +} + bool migrate_ignore_shared(void) { MigrationState *s = migrate_get_current(); @@ -645,6 +653,26 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp) } } + if (new_caps[MIGRATION_CAPABILITY_MAPPED_RAM]) { + if (new_caps[MIGRATION_CAPABILITY_XBZRLE]) { + error_setg(errp, + "Mapped-ram migration is incompatible with xbzrle"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_COMPRESS]) { + error_setg(errp, + "Mapped-ram migration is incompatible with compression"); + return false; + } + + if (new_caps[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { + error_setg(errp, + "Mapped-ram migration is incompatible with postcopy"); + return false; + } + } + return true; } @@ -1218,6 +1246,13 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) } #endif + if (migrate_mapped_ram() && + (migrate_multifd_compression() || migrate_tls())) { + error_setg(errp, + "Mapped-ram only available for non-compressed non-TLS multifd migration"); + return false; + } + if (params->has_x_vcpu_dirty_limit_period && (params->x_vcpu_dirty_limit_period < 1 || params->x_vcpu_dirty_limit_period > 1000)) { @@ -1312,6 +1347,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = 
params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1447,6 +1488,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); diff --git a/migration/options.h b/migration/options.h index 246c160aee..6ddd8dad9b 100644 --- a/migration/options.h +++ b/migration/options.h @@ -31,6 +31,7 @@ bool migrate_compress(void); bool migrate_dirty_bitmaps(void); bool migrate_dirty_limit(void); bool migrate_events(void); +bool migrate_mapped_ram(void); bool migrate_ignore_shared(void); bool migrate_late_block_activate(void); bool migrate_multifd(void); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 94231ff295..b10c882629 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -33,6 +33,7 @@ #include "options.h" #include "qapi/error.h" #include "rdma.h" +#include "io/channel-file.h" #define IO_BUF_SIZE 32768 #define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64) @@ -255,6 +256,10 @@ static void qemu_iovec_release_ram(QEMUFile *f) memset(f->may_free, 0, sizeof(f->may_free)); } +bool qemu_file_is_seekable(QEMUFile *f) +{ + return qio_channel_has_feature(f->ioc, QIO_CHANNEL_FEATURE_SEEKABLE); +} /** * Flushes QEMUFile buffer @@ -447,6 +452,107 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) } } +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return; + } + + qemu_fflush(f); + ret = qio_channel_pwrite(f->ioc, (char *)buf, buflen, pos, &err); + + if (err) { + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return; + } + + if (ret != buflen) { + error_setg(&err, "Partial write of size %zu, expected %zu", ret, + buflen); + qemu_file_set_error_obj(f, -EIO, err); + return; + } + + stat64_add(&mig_stats.qemu_file_transferred, buflen); + + return; +} + + +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos) +{ + Error *err = NULL; + size_t ret; + + if (f->last_error) { + return 0; + } + + ret = qio_channel_pread(f->ioc, (char *)buf, buflen, pos, &err); + + if ((ssize_t)ret == -1 || err) { + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + if ((ssize_t)ret == QIO_CHANNEL_ERR_BLOCK) { + qemu_file_set_error_obj(f, -EAGAIN, NULL); + return 0; + } + + if (ret != buflen) { + error_setg(&err, "Partial read of size %zu, expected %zu", ret, buflen); + qemu_file_set_error_obj(f, -EIO, err); + return 0; + } + + return ret; +} + +void qemu_set_offset(QEMUFile *f, off_t off, int whence) +{ + Error *err = NULL; + off_t ret; + + if (qemu_file_is_writable(f)) { + qemu_fflush(f); + } else { + /* Drop all cached buffers if existed; will trigger a re-fill later */ + f->buf_index = 0; + f->buf_size = 0; + } + + ret = qio_channel_io_seek(f->ioc, off, whence, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } +} + +off_t qemu_get_offset(QEMUFile *f) +{ + Error *err = 
NULL; + off_t ret; + + qemu_fflush(f); + + ret = qio_channel_io_seek(f->ioc, 0, SEEK_CUR, &err); + if (ret == (off_t)-1) { + qemu_file_set_error_obj(f, -EIO, err); + } + return ret; +} + + void qemu_put_byte(QEMUFile *f, int v) { if (f->last_error) { diff --git a/migration/qemu-file.h b/migration/qemu-file.h index 8aec9fabf7..32fd4a34fd 100644 --- a/migration/qemu-file.h +++ b/migration/qemu-file.h @@ -75,6 +75,12 @@ QEMUFile *qemu_file_get_return_path(QEMUFile *f); int qemu_fflush(QEMUFile *f); void qemu_file_set_blocking(QEMUFile *f, bool block); int qemu_file_get_to_fd(QEMUFile *f, int fd, size_t size); +void qemu_set_offset(QEMUFile *f, off_t off, int whence); +off_t qemu_get_offset(QEMUFile *f); +void qemu_put_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); +size_t qemu_get_buffer_at(QEMUFile *f, const uint8_t *buf, size_t buflen, + off_t pos); QIOChannel *qemu_file_get_ioc(QEMUFile *file); diff --git a/migration/ram.c b/migration/ram.c index 83fd780fc2..003c28e133 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -94,6 +94,24 @@ #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200 /* We can't use any flag that is bigger than 0x200 */ +/* + * mapped-ram migration supports O_DIRECT, so we need to make sure the + * userspace buffer, the IO operation size and the file offset are + * aligned according to the underlying device's block size. The first + * two are already aligned to page size, but we need to add padding to + * the file to align the offset. We cannot read the block size + * dynamically because the migration file can be moved between + * different systems, so use 1M to cover most block sizes and to keep + * the file offset aligned at page size as well. + */ +#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 + +/* + * When doing mapped-ram migration, this is the amount we read from + * the pages region in the migration file at a time. 
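
The positional I/O helpers introduced above map directly onto pwrite()/pread() of the underlying QIOChannel, so callers can address absolute file offsets without disturbing the buffered stream position. A minimal sketch of the intended calling pattern, assuming ``f`` is a QEMUFile backed by a seekable channel and ``page`` is a page-sized buffer (both placeholders)::

    if (qemu_file_is_seekable(f)) {
        off_t slot = 4 * TARGET_PAGE_SIZE;   /* illustrative fixed offset */

        /* the internal write buffer is flushed before the pwrite */
        qemu_put_buffer_at(f, page, TARGET_PAGE_SIZE, slot);

        /* read back from the same slot; returns 0 on error */
        qemu_get_buffer_at(f, page, TARGET_PAGE_SIZE, slot);

        /* errors accumulate on the file and are checked once */
        if (qemu_file_get_error(f)) {
            /* bail out */
        }
    }

Because pwrite()/pread() do not move the file cursor, concurrent multifd channels can target disjoint slots of the same file without any ordering between them.
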
+ */ +#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000 + XBZRLECacheStats xbzrle_counters; /* used by the search for pages to send */ @@ -1126,12 +1144,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, return 0; } + stat64_add(&mig_stats.zero_pages, 1); + + if (migrate_mapped_ram()) { + /* zero pages are not transferred with mapped-ram */ + clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); + return 1; + } + len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(pss->block->idstr, offset); - - stat64_add(&mig_stats.zero_pages, 1); ram_transferred_add(len); /* @@ -1189,14 +1213,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, { QEMUFile *file = pss->pss_channel; - ram_transferred_add(save_page_header(pss, pss->pss_channel, block, - offset | RAM_SAVE_FLAG_PAGE)); - if (async) { - qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, - migrate_release_ram() && - migration_in_postcopy()); + if (migrate_mapped_ram()) { + qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE, + block->pages_offset + offset); + set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap); } else { - qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + ram_transferred_add(save_page_header(pss, pss->pss_channel, block, + offset | RAM_SAVE_FLAG_PAGE)); + if (async) { + qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, + migrate_release_ram() && + migration_in_postcopy()); + } else { + qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); + } } ram_transferred_add(TARGET_PAGE_SIZE); stat64_add(&mig_stats.normal_pages, 1); @@ -1332,14 +1362,18 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) pss->block = QLIST_NEXT_RCU(pss->block, next); if (!pss->block) { if (migrate_multifd() && - !migrate_multifd_flush_after_each_section()) { + (!migrate_multifd_flush_after_each_section() || + migrate_mapped_ram())) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } - qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); - qemu_fflush(f); + + if (!migrate_mapped_ram()) { + qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); + qemu_fflush(f); + } } /* * If memory migration starts over, we will meet a dirtied page @@ -2778,6 +2812,9 @@ static void ram_list_init_bitmaps(void) */ block->bmap = bitmap_new(pages); bitmap_set(block->bmap, 0, pages); + if (migrate_mapped_ram()) { + block->file_bmap = bitmap_new(pages); + } block->clear_bmap_shift = shift; block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); } @@ -2915,6 +2952,89 @@ void qemu_guest_free_page_hint(void *addr, size_t len) } } +#define MAPPED_RAM_HDR_VERSION 1 +struct MappedRamHeader { + uint32_t version; + /* + * The target's page size, so we know how many pages are in the + * bitmap. + */ + uint64_t page_size; + /* + * The offset in the migration file where the pages bitmap is + * stored. + */ + uint64_t bitmap_offset; + /* + * The offset in the migration file where the actual pages (data) + * are stored. 
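
Taken together, the save_zero_page() and save_normal_page() hunks reduce each scanned page to one decision: write it at its fixed slot and set its bit, or write nothing and clear the bit. A condensed sketch, with ``block``, ``offset`` and ``p`` as placeholders and error handling omitted::

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        /* nothing written; the page reads back as zero on the target */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        qemu_put_buffer_at(file, p, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    }

The explicit clear matters for live snapshots: a page written in an earlier iteration may have become zero since, and its stale bit must not survive into the bitmap written at completion.
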
+ */ + uint64_t pages_offset; +} QEMU_PACKED; +typedef struct MappedRamHeader MappedRamHeader; + +static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) +{ + g_autofree MappedRamHeader *header = NULL; + size_t header_size, bitmap_size; + long num_pages; + + header = g_new0(MappedRamHeader, 1); + header_size = sizeof(MappedRamHeader); + + num_pages = block->used_length >> TARGET_PAGE_BITS; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + /* + * Save the file offsets of where the bitmap and the pages should + * go as they are written at the end of migration and during the + * iterative phase, respectively. + */ + block->bitmap_offset = qemu_get_offset(file) + header_size; + block->pages_offset = ROUND_UP(block->bitmap_offset + + bitmap_size, + MAPPED_RAM_FILE_OFFSET_ALIGNMENT); + + header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); + header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); + header->bitmap_offset = cpu_to_be64(block->bitmap_offset); + header->pages_offset = cpu_to_be64(block->pages_offset); + + qemu_put_buffer(file, (uint8_t *) header, header_size); + + /* prepare offset for next ramblock */ + qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); +} + +static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, + Error **errp) +{ + size_t ret, header_size = sizeof(MappedRamHeader); + + ret = qemu_get_buffer(file, (uint8_t *)header, header_size); + if (ret != header_size) { + error_setg(errp, "Could not read whole mapped-ram migration header " + "(expected %zd, got %zd bytes)", header_size, ret); + return false; + } + + /* migration stream is big-endian */ + header->version = be32_to_cpu(header->version); + + if (header->version > MAPPED_RAM_HDR_VERSION) { + error_setg(errp, "Migration mapped-ram capability version not " + "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, + header->version); + return false; + } + + header->page_size = be64_to_cpu(header->page_size); + header->bitmap_offset = be64_to_cpu(header->bitmap_offset); + header->pages_offset = be64_to_cpu(header->pages_offset); + + return true; +} + /* * Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -2970,6 +3090,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque) if (migrate_ignore_shared()) { qemu_put_be64(f, block->mr->addr); } + + if (migrate_mapped_ram()) { + mapped_ram_setup_ramblock(f, block); + } } } @@ -2995,7 +3119,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() + && !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } @@ -3003,6 +3128,33 @@ static int ram_save_setup(QEMUFile *f, void *opaque) return qemu_fflush(f); } +static void ram_save_file_bmap(QEMUFile *f) +{ + RAMBlock *block; + + RAMBLOCK_FOREACH_MIGRATABLE(block) { + long num_pages = block->used_length >> TARGET_PAGE_BITS; + long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, + block->bitmap_offset); + ram_transferred_add(bitmap_size); + + /* + * Free the bitmap here to catch any synchronization issues + * with multifd channels. No channels should be sending pages + * after we've written the bitmap to file. 
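
A worked example makes the offset arithmetic in mapped_ram_setup_ramblock() concrete. With 4 KiB target pages, a 4 GiB ramblock, 64-bit longs, and a ramblock header that happens to start at file offset ``hdr`` (an illustrative placeholder)::

    num_pages     = (4ULL << 30) >> 12;          /* 1048576 pages   */
    bitmap_size   = BITS_TO_LONGS(num_pages)
                    * sizeof(unsigned long);     /* 131072 bytes    */

    bitmap_offset = hdr + sizeof(MappedRamHeader);
    pages_offset  = ROUND_UP(bitmap_offset + bitmap_size,
                             MAPPED_RAM_FILE_OFFSET_ALIGNMENT);
                                                 /* next 1 MiB mark */

    /* the cursor then jumps to pages_offset + used_length, where
     * the next ramblock's header will be written */

The bitmap is only filled in at completion time, which is why its offset must be recorded here: the save path comes back to it long after the cursor has moved on.
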
+ */ + g_free(block->file_bmap); + block->file_bmap = NULL; + } +} + +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset) +{ + set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); +} + /** * ram_save_iterate: iterative stage for migration * @@ -3112,7 +3264,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { - if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { + if (migrate_multifd() && migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { ret = multifd_send_sync_main(); if (ret < 0) { return ret; @@ -3192,7 +3345,20 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return ret; } - if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { + if (migrate_mapped_ram()) { + ram_save_file_bmap(f); + + if (qemu_file_get_error(f)) { + Error *local_err = NULL; + int err = qemu_file_get_error_obj(f, &local_err); + + error_reportf_err(local_err, "Failed to write bitmap to file: "); + return -err; + } + } + + if (migrate_multifd() && !migrate_multifd_flush_after_each_section() && + !migrate_mapped_ram()) { qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH); } qemu_put_be64(f, RAM_SAVE_FLAG_EOS); @@ -3792,23 +3958,149 @@ void colo_flush_ram_cache(void) trace_colo_flush_ram_cache_end(); } +static size_t ram_load_multifd_pages(void *host_addr, size_t size, + uint64_t offset) +{ + MultiFDRecvData *data = multifd_get_recv_data(); + + data->opaque = host_addr; + data->file_offset = offset; + data->size = size; + + if (!multifd_recv()) { + return 0; + } + + return size; +} + +static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + long num_pages, unsigned long *bitmap, + Error **errp) +{ + ERRP_GUARD(); + unsigned long set_bit_idx, clear_bit_idx; + ram_addr_t offset; + void *host; + size_t read, unread, size; + + for (set_bit_idx = find_first_bit(bitmap, num_pages); + set_bit_idx < num_pages; + set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { + + clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); + + unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); + offset = set_bit_idx << TARGET_PAGE_BITS; + + while (unread > 0) { + host = host_from_ram_block_offset(block, offset); + if (!host) { + error_setg(errp, "page outside of ramblock %s range", + block->idstr); + return false; + } + + size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); + + if (migrate_multifd()) { + read = ram_load_multifd_pages(host, size, + block->pages_offset + offset); + } else { + read = qemu_get_buffer_at(f, host, size, + block->pages_offset + offset); + } + + if (!read) { + goto err; + } + offset += read; + unread -= read; + } + } + + return true; + +err: + qemu_file_get_error_obj(f, errp); + error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT + "from file offset %" PRIx64 ": ", block->idstr, offset, + block->pages_offset + offset); + return false; +} + +static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, + ram_addr_t length, Error **errp) +{ + g_autofree unsigned long *bitmap = NULL; + MappedRamHeader header; + size_t bitmap_size; + long num_pages; + + if (!mapped_ram_read_header(f, &header, errp)) { + return; + } + + block->pages_offset = header.pages_offset; + + /* + * Check the alignment of the file region that contains pages. We + * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that + * value to change in the future. Do only a sanity check with page + * size alignment. 
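
On load, read_ramblock_mapped_ram() only touches file regions whose bits are set: it walks the bitmap as maximal runs of set bits and fetches each run in chunks of at most MAPPED_RAM_LOAD_BUF_SIZE (1 MiB), which bounds the buffer handed to each read or multifd task. The shape of the walk, stripped down::

    for (start = find_first_bit(bmap, n);
         start < n;
         start = find_next_bit(bmap, n, end + 1)) {
        end = find_next_zero_bit(bmap, n, start + 1);
        /* read (end - start) pages, in chunks of <= 1 MiB, from
         * block->pages_offset + (start << TARGET_PAGE_BITS) */
    }

    /* example: bitmap 0b00111010 over 8 pages => two runs:
     *   [1, 2): one  4 KiB read at pages_offset + 1 * TARGET_PAGE_SIZE
     *   [3, 6): one 12 KiB read at pages_offset + 3 * TARGET_PAGE_SIZE */

Pages whose bits are clear are never read; their destination memory simply stays zero.
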
+ */ + if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { + error_setg(errp, + "Error reading ramblock %s pages, region has bad alignment", + block->idstr); + return; + } + + num_pages = length / header.page_size; + bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); + + bitmap = g_malloc0(bitmap_size); + if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, + header.bitmap_offset) != bitmap_size) { + error_setg(errp, "Error reading dirty bitmap"); + return; + } + + if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { + return; + } + + /* Skip pages array */ + qemu_set_offset(f, block->pages_offset + length, SEEK_SET); + + return; +} + static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) { int ret = 0; /* ADVISE is earlier, it shows the source has the postcopy capability on */ bool postcopy_advised = migration_incoming_postcopy_advised(); int max_hg_page_size; + Error *local_err = NULL; assert(block); + if (migrate_mapped_ram()) { + parse_ramblock_mapped_ram(f, block, length, &local_err); + if (local_err) { + error_report_err(local_err); + return -EINVAL; + } + return 0; + } + if (!qemu_ram_is_migratable(block)) { error_report("block %s should not be migrated !", block->idstr); return -EINVAL; } if (length != block->used_length) { - Error *local_err = NULL; - ret = qemu_ram_resize(block, length, &local_err); if (local_err) { error_report_err(local_err); @@ -3899,6 +4191,12 @@ static int ram_load_precopy(QEMUFile *f) invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; } + if (migrate_mapped_ram()) { + invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | + RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | + RAM_SAVE_FLAG_ZERO); + } + while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr; void *host = NULL, *host_bak = NULL; @@ -3920,6 +4218,8 @@ static int ram_load_precopy(QEMUFile *f) addr &= TARGET_PAGE_MASK; if (flags & invalid_flags) { + error_report("Unexpected RAM flags: %d", flags & invalid_flags); + if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) { error_report("Received an unexpected compressed page"); } @@ -3972,6 +4272,16 @@ static int ram_load_precopy(QEMUFile *f) switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: ret = parse_ramblocks(f, addr); + /* + * For mapped-ram migration (to a file) using multifd, we sync + * once and for all here to make sure all tasks we queued to + * multifd threads are completed, so that all the ramblocks + * (including all the guest memory pages within) are fully + * loaded after this sync returns. + */ + if (migrate_mapped_ram()) { + multifd_recv_sync_main(); + } break; case RAM_SAVE_FLAG_ZERO: @@ -4012,7 +4322,12 @@ static int ram_load_precopy(QEMUFile *f) case RAM_SAVE_FLAG_EOS: /* normal exit */ if (migrate_multifd() && - migrate_multifd_flush_after_each_section()) { + migrate_multifd_flush_after_each_section() && + /* + * Mapped-ram migration flushes once and for all after + * parsing ramblocks. Always ignore EOS for it. 
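
With the page-bearing flags filtered out above, the incoming side of a mapped-ram stream degenerates to a fixed shape: one RAM_SAVE_FLAG_MEM_SIZE record that pulls in every ramblock (bitmaps and pages included, via their recorded offsets), a single multifd sync, then EOS. Schematically, as a condensed view of ram_load_precopy() rather than its verbatim control flow::

    switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
    case RAM_SAVE_FLAG_MEM_SIZE:
        ret = parse_ramblocks(f, addr);  /* headers, bitmaps, pages */
        multifd_recv_sync_main();        /* all queued reads done   */
        break;
    case RAM_SAVE_FLAG_EOS:
        break;                           /* nothing left to flush   */
    default:
        ret = -EINVAL;                   /* PAGE/ZERO/XBZRLE/... were
                                            rejected as invalid     */
    }
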
+ */ + !migrate_mapped_ram()) { multifd_recv_sync_main(); } break; diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b..b9ac0da587 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -75,6 +75,7 @@ bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start); void postcopy_preempt_shutdown_file(MigrationState *s); void *postcopy_preempt_thread(void *opaque); +void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset); /* ram cache */ int colo_init_ram_cache(void); diff --git a/migration/savevm.c b/migration/savevm.c index d612c8a902..dc1fb9c0d3 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -245,6 +245,7 @@ static bool should_validate_capability(int capability) /* Validate only new capabilities to keep compatibility. */ switch (capability) { case MIGRATION_CAPABILITY_X_IGNORE_SHARED: + case MIGRATION_CAPABILITY_MAPPED_RAM: return true; default: return false; diff --git a/migration/trace-events b/migration/trace-events index 298ad2b0dd..bf1a069632 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" diff --git a/qapi/migration.json b/qapi/migration.json index 0b33a71ab4..62acc834af 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -531,6 +531,10 @@ # and can result in more stable read performance. Requires KVM # with accelerator property "dirty-ring-size" set. (Since 8.1) # +# @mapped-ram: Migrate using fixed offsets in the migration file for +# each RAM page. Requires a migration URI that supports seeking, +# such as a file. (since 9.0) +# # Features: # # @deprecated: Member @block is deprecated. Use blockdev-mirror with @@ -555,7 +559,7 @@ { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] }, 'validate-uuid', 'background-snapshot', 'zero-copy-send', 'postcopy-preempt', 'switchover-ack', - 'dirty-limit'] } + 'dirty-limit', 'mapped-ram'] } ## # @MigrationCapabilityStatus: @@ -636,28 +640,30 @@ # # @normal: the original form of migration. (since 8.2) # -# @cpr-reboot: The migrate command stops the VM and saves state to the URI. -# After quitting qemu, the user resumes by running qemu -incoming. +# @cpr-reboot: The migrate command stops the VM and saves state to +# the URI. After quitting QEMU, the user resumes by running +# QEMU -incoming. # -# This mode allows the user to quit qemu, and restart an updated version -# of qemu. The user may even update and reboot the OS before restarting, -# as long as the URI persists across a reboot. +# This mode allows the user to quit QEMU, optionally update and +# reboot the OS, and restart QEMU. If the user reboots, the URI +# must persist across the reboot, such as by using a file. # -# Unlike normal mode, the use of certain local storage options does not -# block the migration, but the user must not modify guest block devices -# between the quit and restart. 
+# Unlike normal mode, the use of certain local storage options +# does not block the migration, but the user must not modify the +# contents of guest block devices between the quit and restart. # -# This mode supports vfio devices provided the user first puts the guest -# in the suspended runstate, such as by issuing guest-suspend-ram to the -# qemu guest agent. +# This mode supports VFIO devices provided the user first puts +# the guest in the suspended runstate, such as by issuing +# guest-suspend-ram to the QEMU guest agent. # -# Best performance is achieved when the memory backend is shared and the -# @x-ignore-shared migration capability is set, but this is not required. -# Further, if the user reboots before restarting such a configuration, the -# shared backend must be be non-volatile across reboot, such as by backing -# it with a dax device. +# Best performance is achieved when the memory backend is shared +# and the @x-ignore-shared migration capability is set, but this +# is not required. Further, if the user reboots before restarting +# such a configuration, the shared memory must persist across the +# reboot, such as by backing it with a dax device. # -# cpr-reboot may not be used with postcopy, colo, or background-snapshot. +# @cpr-reboot may not be used with postcopy, background-snapshot, +# or COLO. # # (since 8.2) ## diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 83512bce85..4023d808f9 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2200,6 +2200,14 @@ static void *test_mode_reboot_start(QTestState *from, QTestState *to) return NULL; } +static void *migrate_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_set_capability(from, "mapped-ram", true); + migrate_set_capability(to, "mapped-ram", true); + + return NULL; +} + static void test_mode_reboot(void) { g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, @@ -2214,6 +2222,72 @@ static void test_mode_reboot(void) test_file_common(&args, true); } +static void test_precopy_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_precopy_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_mapped_ram_start, + }; + + test_file_common(&args, true); +} + +static void *migrate_multifd_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + migrate_set_parameter_int(from, "multifd-channels", 4); + migrate_set_parameter_int(to, "multifd-channels", 4); + + migrate_set_capability(from, "multifd", true); + migrate_set_capability(to, "multifd", true); + + return NULL; +} + +static void test_multifd_file_mapped_ram_live(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_multifd_mapped_ram_start, + }; + + test_file_common(&args, false); +} + +static void test_multifd_file_mapped_ram(void) +{ + g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs, + FILE_TEST_FILENAME); + MigrateCommon args = { + .connect_uri = uri, + .listen_uri = "defer", + .start_hook = migrate_multifd_mapped_ram_start, + 
}; + + test_file_common(&args, true); +} + + static void test_precopy_tcp_plain(void) { MigrateCommon args = { @@ -2462,6 +2536,13 @@ static void *migrate_precopy_fd_file_start(QTestState *from, QTestState *to) return NULL; } +static void *migrate_fd_file_mapped_ram_start(QTestState *from, QTestState *to) +{ + migrate_mapped_ram_start(from, to); + + return migrate_precopy_fd_file_start(from, to); +} + static void test_migrate_precopy_fd_file(void) { MigrateCommon args = { @@ -2472,6 +2553,36 @@ static void test_migrate_precopy_fd_file(void) }; test_file_common(&args, true); } + +static void test_migrate_precopy_fd_file_mapped_ram(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .connect_uri = "fd:fd-mig", + .start_hook = migrate_fd_file_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + test_file_common(&args, true); +} + +static void *migrate_multifd_fd_mapped_ram_start(QTestState *from, + QTestState *to) +{ + migrate_multifd_mapped_ram_start(from, to); + return migrate_precopy_fd_file_start(from, to); +} + +static void test_multifd_fd_mapped_ram(void) +{ + MigrateCommon args = { + .connect_uri = "fd:fd-mig", + .listen_uri = "defer", + .start_hook = migrate_multifd_fd_mapped_ram_start, + .finish_hook = test_migrate_fd_finish_hook + }; + + test_file_common(&args, true); +} #endif /* _WIN32 */ static void do_test_validate_uuid(MigrateStart *args, bool should_fail) @@ -2664,6 +2775,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. + */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2672,6 +2790,9 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ @@ -3509,6 +3630,20 @@ int main(int argc, char **argv) migration_test_add("/migration/mode/reboot", test_mode_reboot); } + migration_test_add("/migration/precopy/file/mapped-ram", + test_precopy_file_mapped_ram); + migration_test_add("/migration/precopy/file/mapped-ram/live", + test_precopy_file_mapped_ram_live); + + migration_test_add("/migration/multifd/file/mapped-ram", + test_multifd_file_mapped_ram); + migration_test_add("/migration/multifd/file/mapped-ram/live", + test_multifd_file_mapped_ram_live); +#ifndef _WIN32 + migration_test_add("/migration/multifd/fd/mapped-ram", + test_multifd_fd_mapped_ram); +#endif + #ifdef CONFIG_GNUTLS migration_test_add("/migration/precopy/unix/tls/psk", test_precopy_unix_tls_psk); @@ -3570,6 +3705,8 @@ int main(int argc, char **argv) test_migrate_precopy_fd_socket); migration_test_add("/migration/precopy/fd/file", test_migrate_precopy_fd_file); + migration_test_add("/migration/precopy/fd/file/mapped-ram", + test_migrate_precopy_fd_file_mapped_ram); #endif migration_test_add("/migration/validate_uuid", test_validate_uuid); migration_test_add("/migration/validate_uuid_error",