From 940f70cc1c1afe20538ad49f809f7664bfc1f32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:30 +0000 Subject: [PATCH 01/13] tests/avocado: add a simple i386 replay kernel test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a number of bugs against 32 bit x86 on the tracker. Lets at least establish a baseline pure kernel boot can do record/replay before we start looking at the devices. Acked-by: Pavel Dovgalyuk Acked-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-2-alex.bennee@linaro.org> --- tests/avocado/replay_kernel.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/avocado/replay_kernel.py b/tests/avocado/replay_kernel.py index c37afa662c..1eaa36444c 100644 --- a/tests/avocado/replay_kernel.py +++ b/tests/avocado/replay_kernel.py @@ -82,6 +82,22 @@ def run_rr(self, kernel_path, kernel_command_line, console_pattern, class ReplayKernelNormal(ReplayKernelBase): + def test_i386_pc(self): + """ + :avocado: tags=arch:i386 + :avocado: tags=machine:pc + """ + kernel_url = ('https://storage.tuxboot.com/20230331/i386/bzImage') + kernel_hash = 'a3e5b32a354729e65910f5a1ffcda7c14a6c12a55e8213fb86e277f1b76ed956' + kernel_path = self.fetch_asset(kernel_url, + asset_hash=kernel_hash, + algorithm = "sha256") + + kernel_command_line = self.KERNEL_COMMON_COMMAND_LINE + 'console=ttyS0' + console_pattern = 'VFS: Cannot open root device' + + self.run_rr(kernel_path, kernel_command_line, console_pattern, shift=5) + # See https://gitlab.com/qemu-project/qemu/-/issues/2010 @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test sometimes gets stuck') def test_x86_64_pc(self): From 2d8508bbab39bf342fe80e73c0b528eb3960fa37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:31 +0000 Subject: [PATCH 02/13] tests/avocado: fix typo in replay_linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-3-alex.bennee@linaro.org> --- tests/avocado/replay_linux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py index 270ccc1eae..e95bff3299 100644 --- a/tests/avocado/replay_linux.py +++ b/tests/avocado/replay_linux.py @@ -94,7 +94,7 @@ def launch_and_wait(self, record, args, shift): else: vm.event_wait('SHUTDOWN', self.timeout) vm.wait() - logger.info('successfully fihished the replay') + logger.info('successfully finished the replay') elapsed = time.time() - start_time logger.info('elapsed time %.2f sec' % elapsed) return elapsed From 6f6b71536cc0e8b0fa3547deb3a0ac6be3398941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:32 +0000 Subject: [PATCH 03/13] tests/avocado: modernise the drive args for replay_linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QEMU complains about us not being explicit with setting snapshot so lets do that. Also as cdroms are RO media we don't need to jump the hoops of setting up snapshots and replay disks - just declare the drive is a cdrom and nothing should change. Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-4-alex.bennee@linaro.org> --- tests/avocado/replay_linux.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py index e95bff3299..f3a43dc98c 100644 --- a/tests/avocado/replay_linux.py +++ b/tests/avocado/replay_linux.py @@ -48,12 +48,15 @@ def vm_add_disk(self, vm, path, id, device): bus_string = '' if self.bus: bus_string = ',bus=%s.%d' % (self.bus, id,) - vm.add_args('-drive', 'file=%s,snapshot,id=disk%s,if=none' % (path, id)) + vm.add_args('-drive', 'file=%s,snapshot=on,id=disk%s,if=none' % (path, id)) vm.add_args('-drive', 'driver=blkreplay,id=disk%s-rr,if=none,image=disk%s' % (id, id)) vm.add_args('-device', '%s,drive=disk%s-rr%s' % (device, id, bus_string)) + def vm_add_cdrom(self, vm, path, id, device): + vm.add_args('-drive', 'file=%s,id=disk%s,if=none,media=cdrom' % (path, id)) + def launch_and_wait(self, record, args, shift): self.require_netdev('user') vm = self.get_vm() @@ -65,7 +68,7 @@ def launch_and_wait(self, record, args, shift): if args: vm.add_args(*args) self.vm_add_disk(vm, self.boot_path, 0, self.hdd) - self.vm_add_disk(vm, self.cloudinit_path, 1, self.cd) + self.vm_add_cdrom(vm, self.cloudinit_path, 1, self.cd) logger = logging.getLogger('replay') if record: logger.info('recording the execution...') From fcc8c529fe45981496f1ee1fb0f59a1f9f36ba8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:33 +0000 Subject: [PATCH 04/13] scripts/replay-dump: update to latest format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To help debugging replay logs I've implemented decode_plain and decode_char_write as well as put in a new table for the current format of log. Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-5-alex.bennee@linaro.org> --- scripts/replay-dump.py | 90 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 8 deletions(-) diff --git a/scripts/replay-dump.py b/scripts/replay-dump.py index b89dc29555..6f300e4c54 100755 --- a/scripts/replay-dump.py +++ b/scripts/replay-dump.py @@ -21,6 +21,7 @@ import argparse import struct from collections import namedtuple +from os import path # This mirrors some of the global replay state which some of the # stream loading refers to. Some decoders may read the next event so @@ -82,6 +83,12 @@ def read_qword(fin): "Read a 64 bit word" return struct.unpack('>Q', fin.read(8))[0] +def read_array(fin): + "Read a sized array" + size = read_dword(fin) + data = fin.read(size) + return data + # Generic decoder structure Decoder = namedtuple("Decoder", "eid name fn") @@ -115,6 +122,11 @@ def decode_unimp(eid, name, _unused_dumpfile): print("%s not handled - will now stop" % (name)) return False +def decode_plain(eid, name, _unused_dumpfile): + "Plain events without additional data" + print_event(eid, name, "no data") + return True + # Checkpoint decoder def swallow_async_qword(eid, name, dumpfile): "Swallow a qword of data without looking at it" @@ -151,6 +163,12 @@ def decode_instruction(eid, name, dumpfile): print_event(eid, name, "0x%x" % (ins_diff)) return True +def decode_char_write(eid, name, dumpfile): + res = read_dword(dumpfile) + offset = read_dword(dumpfile) + print_event(eid, name, "%d -> %d" % (offset, res)) + return True + def decode_audio_out(eid, name, dumpfile): audio_data = read_dword(dumpfile) print_event(eid, name, "%d" % (audio_data)) @@ -189,14 +207,19 @@ def decode_clock(eid, name, dumpfile): print_event(eid, name, "0x%x" % (clock_data)) return True +def decode_random(eid, name, dumpfile): + ret = read_dword(dumpfile) + data = read_array(dumpfile) + print_event(eid, "%d bytes of random data" % len(data)) + return True # pre-MTTCG merge v5_event_table = [Decoder(0, "EVENT_INSTRUCTION", decode_instruction), Decoder(1, "EVENT_INTERRUPT", decode_interrupt), - Decoder(2, "EVENT_EXCEPTION", decode_unimp), + Decoder(2, "EVENT_EXCEPTION", decode_plain), Decoder(3, "EVENT_ASYNC", decode_async), Decoder(4, "EVENT_SHUTDOWN", decode_unimp), - Decoder(5, "EVENT_CHAR_WRITE", decode_unimp), + Decoder(5, "EVENT_CHAR_WRITE", decode_char_write), Decoder(6, "EVENT_CHAR_READ_ALL", decode_unimp), Decoder(7, "EVENT_CHAR_READ_ALL_ERROR", decode_unimp), Decoder(8, "EVENT_CLOCK_HOST", decode_clock), @@ -215,10 +238,10 @@ def decode_clock(eid, name, dumpfile): # post-MTTCG merge, AUDIO support added v6_event_table = [Decoder(0, "EVENT_INSTRUCTION", decode_instruction), Decoder(1, "EVENT_INTERRUPT", decode_interrupt), - Decoder(2, "EVENT_EXCEPTION", decode_unimp), + Decoder(2, "EVENT_EXCEPTION", decode_plain), Decoder(3, "EVENT_ASYNC", decode_async), Decoder(4, "EVENT_SHUTDOWN", decode_unimp), - Decoder(5, "EVENT_CHAR_WRITE", decode_unimp), + Decoder(5, "EVENT_CHAR_WRITE", decode_char_write), Decoder(6, "EVENT_CHAR_READ_ALL", decode_unimp), Decoder(7, "EVENT_CHAR_READ_ALL_ERROR", decode_unimp), Decoder(8, "EVENT_AUDIO_OUT", decode_audio_out), @@ -250,7 +273,7 @@ def decode_clock(eid, name, dumpfile): Decoder(10, "EVENT_SHUTDOWN_GUEST_RESET", decode_unimp), Decoder(11, "EVENT_SHUTDOWN_GUEST_PANIC", decode_unimp), Decoder(12, "EVENT_SHUTDOWN___MAX", decode_unimp), - Decoder(13, "EVENT_CHAR_WRITE", decode_unimp), + Decoder(13, "EVENT_CHAR_WRITE", decode_char_write), Decoder(14, "EVENT_CHAR_READ_ALL", decode_unimp), Decoder(15, "EVENT_CHAR_READ_ALL_ERROR", decode_unimp), Decoder(16, "EVENT_AUDIO_OUT", decode_audio_out), @@ -268,6 +291,48 @@ def decode_clock(eid, name, dumpfile): Decoder(28, "EVENT_CP_RESET", decode_checkpoint), ] +v12_event_table = [Decoder(0, "EVENT_INSTRUCTION", decode_instruction), + Decoder(1, "EVENT_INTERRUPT", decode_interrupt), + Decoder(2, "EVENT_EXCEPTION", decode_plain), + Decoder(3, "EVENT_ASYNC", decode_async), + Decoder(4, "EVENT_ASYNC", decode_async), + Decoder(5, "EVENT_ASYNC", decode_async), + Decoder(6, "EVENT_ASYNC", decode_async), + Decoder(6, "EVENT_ASYNC", decode_async), + Decoder(8, "EVENT_ASYNC", decode_async), + Decoder(9, "EVENT_ASYNC", decode_async), + Decoder(10, "EVENT_ASYNC", decode_async), + Decoder(11, "EVENT_SHUTDOWN", decode_unimp), + Decoder(12, "EVENT_SHUTDOWN_HOST_ERR", decode_unimp), + Decoder(13, "EVENT_SHUTDOWN_HOST_QMP_QUIT", decode_unimp), + Decoder(14, "EVENT_SHUTDOWN_HOST_QMP_RESET", decode_unimp), + Decoder(14, "EVENT_SHUTDOWN_HOST_SIGNAL", decode_unimp), + Decoder(15, "EVENT_SHUTDOWN_HOST_UI", decode_unimp), + Decoder(16, "EVENT_SHUTDOWN_GUEST_SHUTDOWN", decode_unimp), + Decoder(17, "EVENT_SHUTDOWN_GUEST_RESET", decode_unimp), + Decoder(18, "EVENT_SHUTDOWN_GUEST_PANIC", decode_unimp), + Decoder(19, "EVENT_SHUTDOWN_GUEST_SUBSYSTEM_RESET", decode_unimp), + Decoder(20, "EVENT_SHUTDOWN_GUEST_SNAPSHOT_LOAD", decode_unimp), + Decoder(21, "EVENT_SHUTDOWN___MAX", decode_unimp), + Decoder(22, "EVENT_CHAR_WRITE", decode_char_write), + Decoder(23, "EVENT_CHAR_READ_ALL", decode_unimp), + Decoder(24, "EVENT_CHAR_READ_ALL_ERROR", decode_unimp), + Decoder(25, "EVENT_AUDIO_IN", decode_unimp), + Decoder(26, "EVENT_AUDIO_OUT", decode_audio_out), + Decoder(27, "EVENT_RANDOM", decode_random), + Decoder(28, "EVENT_CLOCK_HOST", decode_clock), + Decoder(29, "EVENT_CLOCK_VIRTUAL_RT", decode_clock), + Decoder(30, "EVENT_CP_CLOCK_WARP_START", decode_checkpoint), + Decoder(31, "EVENT_CP_CLOCK_WARP_ACCOUNT", decode_checkpoint), + Decoder(32, "EVENT_CP_RESET_REQUESTED", decode_checkpoint), + Decoder(33, "EVENT_CP_SUSPEND_REQUESTED", decode_checkpoint), + Decoder(34, "EVENT_CP_CLOCK_VIRTUAL", decode_checkpoint), + Decoder(35, "EVENT_CP_CLOCK_HOST", decode_checkpoint), + Decoder(36, "EVENT_CP_CLOCK_VIRTUAL_RT", decode_checkpoint), + Decoder(37, "EVENT_CP_INIT", decode_checkpoint_init), + Decoder(38, "EVENT_CP_RESET", decode_checkpoint), +] + def parse_arguments(): "Grab arguments for script" parser = argparse.ArgumentParser() @@ -278,14 +343,18 @@ def parse_arguments(): def decode_file(filename): "Decode a record/replay dump" dumpfile = open(filename, "rb") - + dumpsize = path.getsize(filename) # read and throwaway the header version = read_dword(dumpfile) junk = read_qword(dumpfile) + # see REPLAY_VERSION print("HEADER: version 0x%x" % (version)) - if version == 0xe02007: + if version == 0xe0200c: + event_decode_table = v12_event_table + replay_state.checkpoint_start = 30 + elif version == 0xe02007: event_decode_table = v7_event_table replay_state.checkpoint_start = 12 elif version == 0xe02006: @@ -299,8 +368,13 @@ def decode_file(filename): decode_ok = True while decode_ok: event = read_event(dumpfile) - decode_ok = call_decode(event_decode_table, event, dumpfile) + decode_ok = call_decode(event_decode_table, event, + dumpfile) + except Exception as inst: + print(f"error {inst}") + finally: + print(f"Reached {dumpfile.tell()} of {dumpsize} bytes") dumpfile.close() if __name__ == "__main__": From 41e17cc8aa75a8e8e604e17fdf8adcccdca60903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:34 +0000 Subject: [PATCH 05/13] scripts/replay_dump: track total number of instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help in tracking where we are in the stream when debugging. Reviewed-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-6-alex.bennee@linaro.org> --- scripts/replay-dump.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/replay-dump.py b/scripts/replay-dump.py index 6f300e4c54..d668193e79 100755 --- a/scripts/replay-dump.py +++ b/scripts/replay-dump.py @@ -157,10 +157,13 @@ def decode_async(eid, name, dumpfile): return call_decode(async_decode_table, async_event_kind, dumpfile) +total_insns = 0 def decode_instruction(eid, name, dumpfile): + global total_insns ins_diff = read_dword(dumpfile) - print_event(eid, name, "0x%x" % (ins_diff)) + total_insns += ins_diff + print_event(eid, name, "+ %d -> %d" % (ins_diff, total_insns)) return True def decode_char_write(eid, name, dumpfile): From 8835ed3293e5be5bb266eeefdcd5077a0e6a332d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:35 +0000 Subject: [PATCH 06/13] replay: remove host_clock_last MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: a02fe2ca70 (replay: Remove host_clock_last) Reviewed-by: Richard Henderson Reviewed-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-7-alex.bennee@linaro.org> --- replay/replay-internal.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/replay/replay-internal.h b/replay/replay-internal.h index b6836354ac..516147ddbc 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -80,8 +80,6 @@ typedef struct ReplayState { This counter is global, because requests from different block devices should not get overlapping ids. */ uint64_t block_request_id; - /*! Prior value of the host clock */ - uint64_t host_clock_last; /*! Asynchronous event id read from the log */ uint64_t read_event_id; } ReplayState; From 808eab62412b0bcf89e2cff204ae53b310f8aeca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:36 +0000 Subject: [PATCH 07/13] replay: add proper kdoc for ReplayState MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the non-standard comment formatting and move the descriptions into a proper kdoc comment. Reviewed-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-8-alex.bennee@linaro.org> --- replay/replay-internal.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/replay/replay-internal.h b/replay/replay-internal.h index 516147ddbc..98ca3748ed 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -63,24 +63,31 @@ enum ReplayEvents { EVENT_COUNT }; +/** + * typedef ReplayState - global tracking Replay state + * + * This structure tracks where we are in the current ReplayState + * including the logged events from the recorded replay stream. Some + * of the data is also stored/restored from VMStateDescription when VM + * save/restore events take place. + * + * @cached_clock: Cached clocks values + * @current_icount: number of processed instructions + * @instruction_count: number of instructions until next event + * @data_kind: current event + * @has_unread_data: 1 if event not yet processed + * @file_offset: offset into replay log at replay snapshot + * @block_request_id: current serialised block request id + * @read_event_id: current async read event id + */ typedef struct ReplayState { - /*! Cached clock values. */ int64_t cached_clock[REPLAY_CLOCK_COUNT]; - /*! Current icount - number of processed instructions. */ uint64_t current_icount; - /*! Number of instructions to be executed before other events happen. */ int instruction_count; - /*! Type of the currently executed event. */ unsigned int data_kind; - /*! Flag which indicates that event is not processed yet. */ unsigned int has_unread_data; - /*! Temporary variable for saving current log offset. */ uint64_t file_offset; - /*! Next block operation id. - This counter is global, because requests from different - block devices should not get overlapping ids. */ uint64_t block_request_id; - /*! Asynchronous event id read from the log */ uint64_t read_event_id; } ReplayState; extern ReplayState replay_state; From 2b7a58b60ad89a41df187fc9b48c0d7d0dab488b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:37 +0000 Subject: [PATCH 08/13] replay: make has_unread_data a bool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For clarity given it only has two states. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Pavel Dovgalyuk Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-9-alex.bennee@linaro.org> --- replay/replay-internal.c | 4 ++-- replay/replay-internal.h | 4 ++-- replay/replay-snapshot.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/replay/replay-internal.c b/replay/replay-internal.c index 77d0c82327..634025096e 100644 --- a/replay/replay-internal.c +++ b/replay/replay-internal.c @@ -179,7 +179,7 @@ void replay_fetch_data_kind(void) replay_state.instruction_count = replay_get_dword(); } replay_check_error(); - replay_state.has_unread_data = 1; + replay_state.has_unread_data = true; if (replay_state.data_kind >= EVENT_COUNT) { error_report("Replay: unknown event kind %d", replay_state.data_kind); @@ -191,7 +191,7 @@ void replay_fetch_data_kind(void) void replay_finish_event(void) { - replay_state.has_unread_data = 0; + replay_state.has_unread_data = false; replay_fetch_data_kind(); } diff --git a/replay/replay-internal.h b/replay/replay-internal.h index 98ca3748ed..1bc8fd5086 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -75,7 +75,7 @@ enum ReplayEvents { * @current_icount: number of processed instructions * @instruction_count: number of instructions until next event * @data_kind: current event - * @has_unread_data: 1 if event not yet processed + * @has_unread_data: true if event not yet processed * @file_offset: offset into replay log at replay snapshot * @block_request_id: current serialised block request id * @read_event_id: current async read event id @@ -85,7 +85,7 @@ typedef struct ReplayState { uint64_t current_icount; int instruction_count; unsigned int data_kind; - unsigned int has_unread_data; + bool has_unread_data; uint64_t file_offset; uint64_t block_request_id; uint64_t read_event_id; diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index e5e39161e3..d4f6cb7cda 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -47,8 +47,8 @@ static int replay_post_load(void *opaque, int version_id) static const VMStateDescription vmstate_replay = { .name = "replay", - .version_id = 2, - .minimum_version_id = 2, + .version_id = 3, + .minimum_version_id = 3, .pre_save = replay_pre_save, .post_load = replay_post_load, .fields = (const VMStateField[]) { @@ -56,7 +56,7 @@ static const VMStateDescription vmstate_replay = { VMSTATE_UINT64(current_icount, ReplayState), VMSTATE_INT32(instruction_count, ReplayState), VMSTATE_UINT32(data_kind, ReplayState), - VMSTATE_UINT32(has_unread_data, ReplayState), + VMSTATE_BOOL(has_unread_data, ReplayState), VMSTATE_UINT64(file_offset, ReplayState), VMSTATE_UINT64(block_request_id, ReplayState), VMSTATE_UINT64(read_event_id, ReplayState), From dcda73211cd102dbfb8b8465c49dac66246a511e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:38 +0000 Subject: [PATCH 09/13] replay: introduce a central report point for sync errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Figuring out why replay has failed is tricky at the best of times. Lets centralise the reporting of a replay sync error and add a little bit of extra information to help with debugging. Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-10-alex.bennee@linaro.org> --- replay/replay-internal.c | 1 + replay/replay-internal.h | 19 ++++++- replay/replay-snapshot.c | 1 + replay/replay.c | 111 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 1 deletion(-) diff --git a/replay/replay-internal.c b/replay/replay-internal.c index 634025096e..654b99cfb5 100644 --- a/replay/replay-internal.c +++ b/replay/replay-internal.c @@ -175,6 +175,7 @@ void replay_fetch_data_kind(void) if (replay_file) { if (!replay_state.has_unread_data) { replay_state.data_kind = replay_get_byte(); + replay_state.current_event++; if (replay_state.data_kind == EVENT_INSTRUCTION) { replay_state.instruction_count = replay_get_dword(); } diff --git a/replay/replay-internal.h b/replay/replay-internal.h index 1bc8fd5086..75249b7693 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -25,7 +25,12 @@ typedef enum ReplayAsyncEventKind { REPLAY_ASYNC_COUNT } ReplayAsyncEventKind; -/* Any changes to order/number of events will need to bump REPLAY_VERSION */ +/* + * Any changes to order/number of events will need to bump + * REPLAY_VERSION to prevent confusion with old logs. Also don't + * forget to update replay_event_name() to make your debugging life + * easier. + */ enum ReplayEvents { /* for instruction event */ EVENT_INSTRUCTION, @@ -74,6 +79,7 @@ enum ReplayEvents { * @cached_clock: Cached clocks values * @current_icount: number of processed instructions * @instruction_count: number of instructions until next event + * @current_event: current event index * @data_kind: current event * @has_unread_data: true if event not yet processed * @file_offset: offset into replay log at replay snapshot @@ -84,6 +90,7 @@ typedef struct ReplayState { int64_t cached_clock[REPLAY_CLOCK_COUNT]; uint64_t current_icount; int instruction_count; + unsigned int current_event; unsigned int data_kind; bool has_unread_data; uint64_t file_offset; @@ -188,6 +195,16 @@ void replay_event_net_save(void *opaque); /*! Reads network from the file. */ void *replay_event_net_load(void); +/* Diagnostics */ + +/** + * replay_sync_error(): report sync error and exit + * + * When we reach an error condition we want to report it centrally so + * we can also dump some useful information into the logs. + */ +G_NORETURN void replay_sync_error(const char *error); + /* VMState-related functions */ /* Registers replay VMState. diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index d4f6cb7cda..ccb4d89dda 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -55,6 +55,7 @@ static const VMStateDescription vmstate_replay = { VMSTATE_INT64_ARRAY(cached_clock, ReplayState, REPLAY_CLOCK_COUNT), VMSTATE_UINT64(current_icount, ReplayState), VMSTATE_INT32(instruction_count, ReplayState), + VMSTATE_UINT32(current_event, ReplayState), VMSTATE_UINT32(data_kind, ReplayState), VMSTATE_BOOL(has_unread_data, ReplayState), VMSTATE_UINT64(file_offset, ReplayState), diff --git a/replay/replay.c b/replay/replay.c index 0f7d766efe..ff197f436b 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -38,6 +38,107 @@ static GSList *replay_blockers; uint64_t replay_break_icount = -1ULL; QEMUTimer *replay_break_timer; +/* Pretty print event names */ + +static const char *replay_async_event_name(ReplayAsyncEventKind event) +{ + switch (event) { +#define ASYNC_EVENT(_x) case REPLAY_ASYNC_EVENT_ ## _x: return "ASYNC_EVENT_"#_x + ASYNC_EVENT(BH); + ASYNC_EVENT(BH_ONESHOT); + ASYNC_EVENT(INPUT); + ASYNC_EVENT(INPUT_SYNC); + ASYNC_EVENT(CHAR_READ); + ASYNC_EVENT(BLOCK); + ASYNC_EVENT(NET); +#undef ASYNC_EVENT + default: + g_assert_not_reached(); + } +} + +static const char *replay_clock_event_name(ReplayClockKind clock) +{ + switch (clock) { +#define CLOCK_EVENT(_x) case REPLAY_CLOCK_ ## _x: return "CLOCK_" #_x + CLOCK_EVENT(HOST); + CLOCK_EVENT(VIRTUAL_RT); +#undef CLOCK_EVENT + default: + g_assert_not_reached(); + } +} + +/* Pretty print shutdown event names */ +static const char *replay_shutdown_event_name(ShutdownCause cause) +{ + switch (cause) { +#define SHUTDOWN_EVENT(_x) case SHUTDOWN_CAUSE_ ## _x: return "SHUTDOWN_CAUSE_" #_x + SHUTDOWN_EVENT(NONE); + SHUTDOWN_EVENT(HOST_ERROR); + SHUTDOWN_EVENT(HOST_QMP_QUIT); + SHUTDOWN_EVENT(HOST_QMP_SYSTEM_RESET); + SHUTDOWN_EVENT(HOST_SIGNAL); + SHUTDOWN_EVENT(HOST_UI); + SHUTDOWN_EVENT(GUEST_SHUTDOWN); + SHUTDOWN_EVENT(GUEST_RESET); + SHUTDOWN_EVENT(GUEST_PANIC); + SHUTDOWN_EVENT(SUBSYSTEM_RESET); + SHUTDOWN_EVENT(SNAPSHOT_LOAD); +#undef SHUTDOWN_EVENT + default: + g_assert_not_reached(); + } +} + +static const char *replay_checkpoint_event_name(enum ReplayCheckpoint checkpoint) +{ + switch (checkpoint) { +#define CHECKPOINT_EVENT(_x) case CHECKPOINT_ ## _x: return "CHECKPOINT_" #_x + CHECKPOINT_EVENT(CLOCK_WARP_START); + CHECKPOINT_EVENT(CLOCK_WARP_ACCOUNT); + CHECKPOINT_EVENT(RESET_REQUESTED); + CHECKPOINT_EVENT(SUSPEND_REQUESTED); + CHECKPOINT_EVENT(CLOCK_VIRTUAL); + CHECKPOINT_EVENT(CLOCK_HOST); + CHECKPOINT_EVENT(CLOCK_VIRTUAL_RT); + CHECKPOINT_EVENT(INIT); + CHECKPOINT_EVENT(RESET); +#undef CHECKPOINT_EVENT + default: + g_assert_not_reached(); + } +} + +static const char *replay_event_name(enum ReplayEvents event) +{ + /* First deal with the simple ones */ + switch (event) { +#define EVENT(_x) case EVENT_ ## _x: return "EVENT_"#_x + EVENT(INSTRUCTION); + EVENT(INTERRUPT); + EVENT(EXCEPTION); + EVENT(CHAR_WRITE); + EVENT(CHAR_READ_ALL); + EVENT(AUDIO_OUT); + EVENT(AUDIO_IN); + EVENT(RANDOM); +#undef EVENT + default: + if (event >= EVENT_ASYNC && event <= EVENT_ASYNC_LAST) { + return replay_async_event_name(event - EVENT_ASYNC); + } else if (event >= EVENT_SHUTDOWN && event <= EVENT_SHUTDOWN_LAST) { + return replay_shutdown_event_name(event - EVENT_SHUTDOWN); + } else if (event >= EVENT_CLOCK && event <= EVENT_CLOCK_LAST) { + return replay_clock_event_name(event - EVENT_CLOCK); + } else if (event >= EVENT_CHECKPOINT && event <= EVENT_CHECKPOINT_LAST) { + return replay_checkpoint_event_name(event - EVENT_CHECKPOINT); + } + } + + g_assert_not_reached(); +} + bool replay_next_event_is(int event) { bool res = false; @@ -226,6 +327,15 @@ bool replay_has_event(void) return res; } +G_NORETURN void replay_sync_error(const char *error) +{ + error_report("%s (insn total %"PRId64"/%d left, event %d is %s)", error, + replay_state.current_icount, replay_state.instruction_count, + replay_state.current_event, + replay_event_name(replay_state.data_kind)); + abort(); +} + static void replay_enable(const char *fname, int mode) { const char *fmode = NULL; @@ -258,6 +368,7 @@ static void replay_enable(const char *fname, int mode) replay_state.data_kind = -1; replay_state.instruction_count = 0; replay_state.current_icount = 0; + replay_state.current_event = 0; replay_state.has_unread_data = 0; /* skip file header for RECORD and check it for PLAY */ From fd84325f71254c741464de231c491d21ecf63b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:39 +0000 Subject: [PATCH 10/13] replay/replay-char: use report_sync_error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have a centralised report function use it for missing character events. Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-11-alex.bennee@linaro.org> --- replay/replay-char.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/replay/replay-char.c b/replay/replay-char.c index a31aded032..72b1f832dd 100644 --- a/replay/replay-char.c +++ b/replay/replay-char.c @@ -113,8 +113,7 @@ void replay_char_write_event_load(int *res, int *offset) *offset = replay_get_dword(); replay_finish_event(); } else { - error_report("Missing character write event in the replay log"); - exit(1); + replay_sync_error("Missing character write event in the replay log"); } } @@ -135,8 +134,7 @@ int replay_char_read_all_load(uint8_t *buf) replay_finish_event(); return res; } else { - error_report("Missing character read all event in the replay log"); - exit(1); + replay_sync_error("Missing character read all event in the replay log"); } } From 1f881ea4a444ef36a8b6907b0b82be4b3af253a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:40 +0000 Subject: [PATCH 11/13] replay: stop us hanging in rr_wait_io_event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A lot of the hang I see are when we end up spinning in rr_wait_io_event for an event that will never come in playback. As a new check functions which can see if we are in PLAY mode and kick us us the wait function so the event can be processed. This fixes most of the failures in replay_kernel.py Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2013 Cc: Pavel Dovgalyuk Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-12-alex.bennee@linaro.org> --- accel/tcg/tcg-accel-ops-rr.c | 2 +- include/sysemu/replay.h | 5 +++++ replay/replay.c | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c index 611932f3c3..825e35b3dc 100644 --- a/accel/tcg/tcg-accel-ops-rr.c +++ b/accel/tcg/tcg-accel-ops-rr.c @@ -109,7 +109,7 @@ static void rr_wait_io_event(void) { CPUState *cpu; - while (all_cpu_threads_idle()) { + while (all_cpu_threads_idle() && replay_can_wait()) { rr_stop_kick_timer(); qemu_cond_wait_iothread(first_cpu->halt_cond); } diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h index 08aae5869f..83995ae4bd 100644 --- a/include/sysemu/replay.h +++ b/include/sysemu/replay.h @@ -70,6 +70,11 @@ int replay_get_instructions(void); /*! Updates instructions counter in replay mode. */ void replay_account_executed_instructions(void); +/** + * replay_can_wait: check if we should pause for wait-io + */ +bool replay_can_wait(void); + /* Processing clocks and other time sources */ /*! Save the specified clock */ diff --git a/replay/replay.c b/replay/replay.c index ff197f436b..3fd241a4fc 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -449,6 +449,27 @@ void replay_start(void) replay_enable_events(); } +/* + * For none/record the answer is yes. + */ +bool replay_can_wait(void) +{ + if (replay_mode == REPLAY_MODE_PLAY) { + /* + * For playback we shouldn't ever be at a point we wait. If + * the instruction count has reached zero and we have an + * unconsumed event we should go around again and consume it. + */ + if (replay_state.instruction_count == 0 && replay_state.has_unread_data) { + return false; + } else { + replay_sync_error("Playback shouldn't have to iowait"); + } + } + return true; +} + + void replay_finish(void) { if (replay_mode == REPLAY_MODE_NONE) { From 7b571ef3c75ac35522c817219cbfa765753a36c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:41 +0000 Subject: [PATCH 12/13] chardev: force write all when recording replay logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is mostly a problem within avocado as serial generally isn't busy enough to overfill pipes. However the consequences of recording a failed write will haunt us on replay when the log will be out of sync to the playback. Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2010 Acked-by: Pavel Dovgalyuk Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-13-alex.bennee@linaro.org> --- chardev/char.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/chardev/char.c b/chardev/char.c index 996a024c7a..48f28881c2 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -171,6 +171,18 @@ int qemu_chr_write(Chardev *s, const uint8_t *buf, int len, bool write_all) return res; } + if (replay_mode == REPLAY_MODE_RECORD) { + /* + * When recording we don't want temporary conditions to + * perturb the result. By ensuring we write everything we can + * while recording we avoid playback being out of sync if it + * doesn't encounter the same temporary conditions (usually + * triggered by external programs not reading the chardev fast + * enough and pipes filling up). + */ + write_all = true; + } + res = qemu_chr_write_buffer(s, buf, len, &offset, write_all); if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_RECORD) { From c2ef5ee89d76f0ab77c4dd6a1c9eeed4d35d20ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Mon, 11 Dec 2023 09:13:42 +0000 Subject: [PATCH 13/13] tests/avocado: remove skips from replay_kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the latest fixes for #2010 and #2013 these tests look pretty stable now. Of course the only way to be really sure is to run it in the CI infrastructure and see what breaks. Acked-by: Pavel Dovgalyuk Signed-off-by: Alex Bennée Message-Id: <20231211091346.14616-14-alex.bennee@linaro.org> --- tests/avocado/replay_kernel.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/avocado/replay_kernel.py b/tests/avocado/replay_kernel.py index 1eaa36444c..6fdcbd6ac3 100644 --- a/tests/avocado/replay_kernel.py +++ b/tests/avocado/replay_kernel.py @@ -98,13 +98,10 @@ def test_i386_pc(self): self.run_rr(kernel_path, kernel_command_line, console_pattern, shift=5) - # See https://gitlab.com/qemu-project/qemu/-/issues/2010 - @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test sometimes gets stuck') def test_x86_64_pc(self): """ :avocado: tags=arch:x86_64 :avocado: tags=machine:pc - :avocado: tags=flaky """ kernel_url = ('https://archives.fedoraproject.org/pub/archive/fedora' '/linux/releases/29/Everything/x86_64/os/images/pxeboot' @@ -135,8 +132,6 @@ def test_mips_malta(self): self.run_rr(kernel_path, kernel_command_line, console_pattern, shift=5) - # See https://gitlab.com/qemu-project/qemu/-/issues/2013 - @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test is unstable on GitLab') def test_mips64el_malta(self): """ This test requires the ar tool to extract "data.tar.gz" from @@ -152,7 +147,6 @@ def test_mips64el_malta(self): :avocado: tags=arch:mips64el :avocado: tags=machine:malta - :avocado: tags=flaky """ deb_url = ('http://snapshot.debian.org/archive/debian/' '20130217T032700Z/pool/main/l/linux-2.6/' @@ -200,13 +194,10 @@ def test_arm_virt(self): self.run_rr(kernel_path, kernel_command_line, console_pattern, shift=1) - @skipUnless(os.getenv('QEMU_TEST_FLAKY_TESTS'), 'Test is unstable on GitLab') - def test_arm_cubieboard_initrd(self): """ :avocado: tags=arch:arm :avocado: tags=machine:cubieboard - :avocado: tags=flaky """ deb_url = ('https://apt.armbian.com/pool/main/l/' 'linux-5.10.16-sunxi/linux-image-current-sunxi_21.02.2_armhf.deb') @@ -354,7 +345,6 @@ def test_m68k_mcf5208evb(self): file_path = self.fetch_asset(tar_url, asset_hash=tar_hash) self.do_test_advcal_2018(file_path, 'sanity-clause.elf') - @skip("Test currently broken") # Console stuck as of 5.2-rc1 def test_microblaze_s3adsp1800(self): """ :avocado: tags=arch:microblaze @@ -389,7 +379,6 @@ def test_or1k_sim(self): file_path = self.fetch_asset(tar_url, asset_hash=tar_hash) self.do_test_advcal_2018(file_path, 'vmlinux') - @skip("nios2 emulation is buggy under record/replay") def test_nios2_10m50(self): """ :avocado: tags=arch:nios2