diff --git a/migration/migration.c b/migration/migration.c index 455ddc896a..e875ea0d6b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1393,6 +1393,17 @@ bool migration_in_postcopy(void) } } +bool migration_postcopy_is_alive(int state) +{ + switch (state) { + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_RECOVER: + return true; + default: + return false; + } +} + bool migration_in_postcopy_after_devices(MigrationState *s) { return migration_in_postcopy() && s->postcopy_after_devices; @@ -1673,8 +1684,15 @@ void qmp_migrate_pause(Error **errp) MigrationIncomingState *mis = migration_incoming_get_current(); int ret = 0; - if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + if (migration_postcopy_is_alive(ms->state)) { /* Source side, during postcopy */ + Error *error = NULL; + + /* Tell the core migration that we're pausing */ + error_setg(&error, "Postcopy migration is paused by the user"); + migrate_set_error(ms, error); + error_free(error); + qemu_mutex_lock(&ms->qemu_file_lock); if (ms->to_dst_file) { ret = qemu_file_shutdown(ms->to_dst_file); @@ -1683,10 +1701,17 @@ void qmp_migrate_pause(Error **errp) if (ret) { error_setg(errp, "Failed to pause source migration"); } + + /* + * Kick the migration thread out of any waiting windows (on behalf + * of the rp thread). + */ + migration_rp_kick(ms); + return; } - if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + if (migration_postcopy_is_alive(mis->state)) { ret = qemu_file_shutdown(mis->from_src_file); if (ret) { error_setg(errp, "Failed to pause destination migration"); @@ -1695,7 +1720,7 @@ void qmp_migrate_pause(Error **errp) } error_setg(errp, "migrate-pause is currently only supported " - "during postcopy-active state"); + "during postcopy-active or postcopy-recover state"); } bool migration_is_blocked(Error **errp) @@ -1882,9 +1907,21 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp) qemu_sem_post(&s->pause_sem); } -void migration_rp_wait(MigrationState *s) +int migration_rp_wait(MigrationState *s) { + /* If migration has failure already, ignore the wait */ + if (migrate_has_error(s)) { + return -1; + } + qemu_sem_wait(&s->rp_state.rp_sem); + + /* After wait, double check that there's no failure */ + if (migrate_has_error(s)) { + return -1; + } + + return 0; } void migration_rp_kick(MigrationState *s) @@ -2146,6 +2183,20 @@ out: trace_source_return_path_thread_bad_end(); } + if (ms->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { + /* + * this will be extremely unlikely: that we got yet another network + * issue during recovering of the 1st network failure.. during this + * period the main migration thread can be waiting on rp_sem for + * this thread to sync with the other side. + * + * When this happens, explicitly kick the migration thread out of + * RECOVER stage and back to PAUSED, so the admin can try + * everything again. + */ + migration_rp_kick(ms); + } + trace_source_return_path_thread_end(); rcu_unregister_thread(); @@ -2611,7 +2662,9 @@ static int postcopy_resume_handshake(MigrationState *s) qemu_savevm_send_postcopy_resume(s->to_dst_file); while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) { - migration_rp_wait(s); + if (migration_rp_wait(s)) { + return -1; + } } if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { diff --git a/migration/migration.h b/migration/migration.h index 615b517594..af8c965b7f 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -494,6 +494,7 @@ int migrate_init(MigrationState *s, Error **errp); bool migration_is_blocked(Error **errp); /* True if outgoing migration has entered postcopy phase */ bool migration_in_postcopy(void); +bool migration_postcopy_is_alive(int state); MigrationState *migrate_get_current(void); uint64_t ram_get_total_transferred_pages(void); @@ -534,8 +535,11 @@ void migration_populate_vfio_info(MigrationInfo *info); void migration_reset_vfio_bytes_transferred(void); void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page); -/* Migration thread waiting for return path thread. */ -void migration_rp_wait(MigrationState *s); +/* + * Migration thread waiting for return path thread. Return non-zero if an + * error is detected. + */ +int migration_rp_wait(MigrationState *s); /* * Kick the migration thread waiting for return path messages. NOTE: the * name can be slightly confusing (when read as "kick the rp thread"), just diff --git a/migration/ram.c b/migration/ram.c index d05ffddbc8..929cba08f4 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -4099,7 +4099,9 @@ static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) /* Wait until all the ramblocks' dirty bitmap synced */ while (qatomic_read(&rs->postcopy_bmap_sync_requested)) { - migration_rp_wait(s); + if (migration_rp_wait(s)) { + return -1; + } } trace_ram_dirty_bitmap_sync_complete();