diff --git a/docs/devel/migration/CPR.rst b/docs/devel/migration/CPR.rst new file mode 100644 index 0000000000..63c36470cf --- /dev/null +++ b/docs/devel/migration/CPR.rst @@ -0,0 +1,147 @@ +CheckPoint and Restart (CPR) +============================ + +CPR is the umbrella name for a set of migration modes in which the +VM is migrated to a new QEMU instance on the same host. It is +intended for use when the goal is to update host software components +that run the VM, such as QEMU or even the host kernel. At this time, +cpr-reboot is the only available mode. + +Because QEMU is restarted on the same host, with access to the same +local devices, CPR is allowed in certain cases where normal migration +would be blocked. However, the user must not modify the contents of +guest block devices between quitting old QEMU and starting new QEMU. + +CPR unconditionally stops VM execution before memory is saved, and +thus does not depend on any form of dirty page tracking. + +cpr-reboot mode +--------------- + +In this mode, QEMU stops the VM, and writes VM state to the migration +URI, which will typically be a file. After quitting QEMU, the user +resumes by running QEMU with the ``-incoming`` option. Because the +old and new QEMU instances are not active concurrently, the URI cannot +be a type that streams data from one instance to the other. + +Guest RAM can be saved in place if backed by shared memory, or can be +copied to a file. The former is more efficient and is therefore +preferred. + +After state and memory are saved, the user may update userland host +software before restarting QEMU and resuming the VM. Further, if +the RAM is backed by persistent shared memory, such as a DAX device, +then the user may reboot to a new host kernel before restarting QEMU. + +This mode supports VFIO devices provided the user first puts the +guest in the suspended runstate, such as by issuing the +``guest-suspend-ram`` command to the QEMU guest agent. The agent +must be pre-installed in the guest, and the guest must support +suspend to RAM. Beware that suspension can take a few seconds, so +the user should poll to see the suspended state before proceeding +with the CPR operation. + +Usage +^^^^^ + +It is recommended that guest RAM be backed with some type of shared +memory, such as ``memory-backend-file,share=on``, and that the +``x-ignore-shared`` capability be set. This combination allows memory +to be saved in place. Otherwise, after QEMU stops the VM, all guest +RAM is copied to the migration URI. + +Outgoing: + * Set the migration mode parameter to ``cpr-reboot``. + * Set the ``x-ignore-shared`` capability if desired. + * Issue the ``migrate`` command. It is recommended the the URI be a + ``file`` type, but one can use other types such as ``exec``, + provided the command captures all the data from the outgoing side, + and provides all the data to the incoming side. + * Quit when QEMU reaches the postmigrate state. + +Incoming: + * Start QEMU with the ``-incoming defer`` option. + * Set the migration mode parameter to ``cpr-reboot``. + * Set the ``x-ignore-shared`` capability if desired. + * Issue the ``migrate-incoming`` command. + * If the VM was running when the outgoing ``migrate`` command was + issued, then QEMU automatically resumes VM execution. + +Example 1 +^^^^^^^^^ +:: + + # qemu-kvm -monitor stdio + -object memory-backend-file,id=ram0,size=4G,mem-path=/dev/dax0.0,align=2M,share=on -m 4G + ... + + (qemu) info status + VM status: running + (qemu) migrate_set_parameter mode cpr-reboot + (qemu) migrate_set_capability x-ignore-shared on + (qemu) migrate -d file:vm.state + (qemu) info status + VM status: paused (postmigrate) + (qemu) quit + + ### optionally update kernel and reboot + # systemctl kexec + kexec_core: Starting new kernel + ... + + # qemu-kvm ... -incoming defer + (qemu) info status + VM status: paused (inmigrate) + (qemu) migrate_set_parameter mode cpr-reboot + (qemu) migrate_set_capability x-ignore-shared on + (qemu) migrate_incoming file:vm.state + (qemu) info status + VM status: running + +Example 2: VFIO +^^^^^^^^^^^^^^^ +:: + + # qemu-kvm -monitor stdio + -object memory-backend-file,id=ram0,size=4G,mem-path=/dev/dax0.0,align=2M,share=on -m 4G + -device vfio-pci, ... + -chardev socket,id=qga0,path=qga.sock,server=on,wait=off + -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 + ... + + (qemu) info status + VM status: running + + # echo '{"execute":"guest-suspend-ram"}' | ncat --send-only -U qga.sock + + (qemu) info status + VM status: paused (suspended) + (qemu) migrate_set_parameter mode cpr-reboot + (qemu) migrate_set_capability x-ignore-shared on + (qemu) migrate -d file:vm.state + (qemu) info status + VM status: paused (postmigrate) + (qemu) quit + + ### optionally update kernel and reboot + # systemctl kexec + kexec_core: Starting new kernel + ... + + # qemu-kvm ... -incoming defer + (qemu) info status + VM status: paused (inmigrate) + (qemu) migrate_set_parameter mode cpr-reboot + (qemu) migrate_set_capability x-ignore-shared on + (qemu) migrate_incoming file:vm.state + (qemu) info status + VM status: paused (suspended) + (qemu) system_wakeup + (qemu) info status + VM status: running + +Caveats +^^^^^^^ + +cpr-reboot mode may not be used with postcopy, background-snapshot, +or COLO. diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 9d1abd2587..d5ca7b86d5 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -11,3 +11,4 @@ Migration has plenty of features to support different use cases. vfio virtio mapped-ram + CPR diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index ce36bb10d4..3e53501691 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -655,6 +655,7 @@ static inline void mmap_unlock(void) {} void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length); void tlb_set_dirty(CPUState *cpu, vaddr addr); +void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length); MemoryRegionSection * address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 90676093f5..de45ba7bc9 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -25,6 +25,7 @@ #include "sysemu/tcg.h" #include "exec/ramlist.h" #include "exec/ramblock.h" +#include "exec/exec-all.h" extern uint64_t total_dirty_pages; @@ -443,6 +444,14 @@ uint64_t cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap, } #endif /* not _WIN32 */ +static inline void cpu_physical_memory_dirty_bits_cleared(ram_addr_t start, + ram_addr_t length) +{ + if (tcg_enabled()) { + tlb_reset_dirty_range_all(start, length); + } + +} bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, ram_addr_t length, unsigned client); @@ -504,6 +513,9 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb, idx++; } } + if (num_dirty) { + cpu_physical_memory_dirty_bits_cleared(start, length); + } if (rb->clear_bmap) { /* diff --git a/include/io/channel-file.h b/include/io/channel-file.h index 50e8eb1138..d373a4e44d 100644 --- a/include/io/channel-file.h +++ b/include/io/channel-file.h @@ -68,6 +68,24 @@ struct QIOChannelFile { QIOChannelFile * qio_channel_file_new_fd(int fd); +/** + * qio_channel_file_new_dupfd: + * @fd: the file descriptor + * @errp: pointer to initialized error object + * + * Create a new IO channel object for a file represented by the @fd + * parameter. Like qio_channel_file_new_fd(), but the @fd is first + * duplicated with dup(). + * + * The channel will own the duplicated file descriptor and will take + * responsibility for closing it, the original FD is owned by the + * caller. + * + * Returns: the new channel object + */ +QIOChannelFile * +qio_channel_file_new_dupfd(int fd, Error **errp); + /** * qio_channel_file_new_path: * @path: the file path diff --git a/io/channel-file.c b/io/channel-file.c index a6ad7770c6..6436cfb6ae 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -45,6 +45,18 @@ qio_channel_file_new_fd(int fd) return ioc; } +QIOChannelFile * +qio_channel_file_new_dupfd(int fd, Error **errp) +{ + int newfd = dup(fd); + + if (newfd < 0) { + error_setg_errno(errp, errno, "Could not dup FD %d", fd); + return NULL; + } + + return qio_channel_file_new_fd(newfd); +} QIOChannelFile * qio_channel_file_new_path(const char *path, diff --git a/migration/block.c b/migration/block.c index 8c6ebafacc..2b9054889a 100644 --- a/migration/block.c +++ b/migration/block.c @@ -402,7 +402,10 @@ static int init_blk_migration(QEMUFile *f) } sectors = bdrv_nb_sectors(bs); - if (sectors <= 0) { + if (sectors == 0) { + continue; + } + if (sectors < 0) { ret = sectors; bdrv_next_cleanup(&it); goto out; diff --git a/migration/fd.c b/migration/fd.c index d4ae72d132..fe0d096abd 100644 --- a/migration/fd.c +++ b/migration/fd.c @@ -18,9 +18,11 @@ #include "qapi/error.h" #include "channel.h" #include "fd.h" +#include "file.h" #include "migration.h" #include "monitor/monitor.h" #include "io/channel-file.h" +#include "io/channel-socket.h" #include "io/channel-util.h" #include "options.h" #include "trace.h" @@ -47,8 +49,7 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** { QIOChannel *ioc; int fd = monitor_get_fd(monitor_cur(), fdname, errp); - - outgoing_args.fd = -1; + int newfd; if (fd == -1) { return; @@ -61,7 +62,17 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error ** return; } - outgoing_args.fd = fd; + /* + * This is dup()ed just to avoid referencing an fd that might + * be already closed by the iochannel. + */ + newfd = dup(fd); + if (newfd == -1) { + error_setg_errno(errp, errno, "Could not dup FD %d", fd); + object_unref(ioc); + return; + } + outgoing_args.fd = newfd; qio_channel_set_name(ioc, "migration-fd-outgoing"); migration_channel_connect(s, ioc, NULL, NULL); @@ -93,28 +104,20 @@ void fd_start_incoming_migration(const char *fdname, Error **errp) return; } - qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - fd_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - if (migrate_multifd()) { - int channels = migrate_multifd_channels(); - - while (channels--) { - ioc = QIO_CHANNEL(qio_channel_file_new_fd(dup(fd))); - - if (QIO_CHANNEL_FILE(ioc)->fd == -1) { - error_setg(errp, "Failed to duplicate fd %d", fd); - return; - } - - qio_channel_set_name(ioc, "migration-fd-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - fd_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); + if (fd_is_socket(fd)) { + error_setg(errp, + "Multifd migration to a socket FD is not supported"); + object_unref(ioc); + return; } + + file_create_incoming_channels(ioc, errp); + } else { + qio_channel_set_name(ioc, "migration-fd-incoming"); + qio_channel_add_watch_full(ioc, G_IO_IN, + fd_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); } } diff --git a/migration/file.c b/migration/file.c index b0b963e0ce..b6e8ba13f2 100644 --- a/migration/file.c +++ b/migration/file.c @@ -15,6 +15,7 @@ #include "file.h" #include "migration.h" #include "io/channel-file.h" +#include "io/channel-socket.h" #include "io/channel-util.h" #include "options.h" #include "trace.h" @@ -58,12 +59,19 @@ bool file_send_channel_create(gpointer opaque, Error **errp) int fd = fd_args_get_fd(); if (fd && fd != -1) { - ioc = qio_channel_file_new_fd(dup(fd)); - } else { - ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); - if (!ioc) { + if (fd_is_socket(fd)) { + error_setg(errp, + "Multifd migration to a socket FD is not supported"); goto out; } + + ioc = qio_channel_file_new_dupfd(fd, errp); + } else { + ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp); + } + + if (!ioc) { + goto out; } multifd_channel_connect(opaque, QIO_CHANNEL(ioc)); @@ -114,13 +122,46 @@ static gboolean file_accept_incoming_migration(QIOChannel *ioc, return G_SOURCE_REMOVE; } +void file_create_incoming_channels(QIOChannel *ioc, Error **errp) +{ + int i, fd, channels = 1; + g_autofree QIOChannel **iocs = NULL; + + if (migrate_multifd()) { + channels += migrate_multifd_channels(); + } + + iocs = g_new0(QIOChannel *, channels); + fd = QIO_CHANNEL_FILE(ioc)->fd; + iocs[0] = ioc; + + for (i = 1; i < channels; i++) { + QIOChannelFile *fioc = qio_channel_file_new_dupfd(fd, errp); + + if (!fioc) { + while (i) { + object_unref(iocs[--i]); + } + return; + } + + iocs[i] = QIO_CHANNEL(fioc); + } + + for (i = 0; i < channels; i++) { + qio_channel_set_name(iocs[i], "migration-file-incoming"); + qio_channel_add_watch_full(iocs[i], G_IO_IN, + file_accept_incoming_migration, + NULL, NULL, + g_main_context_get_thread_default()); + } +} + void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) { g_autofree char *filename = g_strdup(file_args->filename); QIOChannelFile *fioc = NULL; uint64_t offset = file_args->offset; - int channels = 1; - int i = 0; trace_migration_file_incoming(filename); @@ -131,29 +172,11 @@ void file_start_incoming_migration(FileMigrationArgs *file_args, Error **errp) if (offset && qio_channel_io_seek(QIO_CHANNEL(fioc), offset, SEEK_SET, errp) < 0) { + object_unref(OBJECT(fioc)); return; } - if (migrate_multifd()) { - channels += migrate_multifd_channels(); - } - - do { - QIOChannel *ioc = QIO_CHANNEL(fioc); - - qio_channel_set_name(ioc, "migration-file-incoming"); - qio_channel_add_watch_full(ioc, G_IO_IN, - file_accept_incoming_migration, - NULL, NULL, - g_main_context_get_thread_default()); - - fioc = qio_channel_file_new_fd(dup(fioc->fd)); - - if (!fioc || fioc->fd == -1) { - error_setg(errp, "Error creating migration incoming channel"); - break; - } - } while (++i < channels); + file_create_incoming_channels(QIO_CHANNEL(fioc), errp); } int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, diff --git a/migration/file.h b/migration/file.h index 9f71e87f74..7699c04677 100644 --- a/migration/file.h +++ b/migration/file.h @@ -20,6 +20,7 @@ void file_start_outgoing_migration(MigrationState *s, int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp); void file_cleanup_outgoing_migration(void); bool file_send_channel_create(gpointer opaque, Error **errp); +void file_create_incoming_channels(QIOChannel *ioc, Error **errp); int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov, int niov, RAMBlock *block, Error **errp); int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp); diff --git a/migration/migration.c b/migration/migration.c index 644e073b7d..f60bd371e3 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -166,9 +166,9 @@ static bool transport_supports_seeking(MigrationAddress *addr) } /* - * At this point, the user might not yet have passed the file - * descriptor to QEMU, so we cannot know for sure whether it - * refers to a plain file or a socket. Let it through anyway. + * At this point QEMU has not yet fetched the fd passed in by the + * user, so we cannot know for sure whether it refers to a plain + * file or a socket. Let it through anyway and check at fd.c. */ if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) { return addr->u.socket.type == SOCKET_ADDRESS_TYPE_FD; diff --git a/system/physmem.c b/system/physmem.c index 6cfb7a80ab..a4fe3d2bf8 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -819,7 +819,7 @@ found: return block; } -static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length) +void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length) { CPUState *cpu; ram_addr_t start1; @@ -881,8 +881,8 @@ bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size); } - if (dirty && tcg_enabled()) { - tlb_reset_dirty_range_all(start, length); + if (dirty) { + cpu_physical_memory_dirty_bits_cleared(start, length); } return dirty; @@ -929,9 +929,7 @@ DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty } } - if (tcg_enabled()) { - tlb_reset_dirty_range_all(start, length); - } + cpu_physical_memory_dirty_bits_cleared(start, length); memory_region_clear_dirty_bitmap(mr, offset, length);