diff --git a/Makefile b/Makefile
index 7bb2bf2045..de6393e2f9 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ BLOCK_OBJS=cutils.o qemu-malloc.o
 BLOCK_OBJS+=block-cow.o block-qcow.o aes.o block-vmdk.o block-cloop.o
 BLOCK_OBJS+=block-dmg.o block-bochs.o block-vpc.o block-vvfat.o
 BLOCK_OBJS+=block-qcow2.o block-parallels.o block-nbd.o
-BLOCK_OBJS+=nbd.o block.o
+BLOCK_OBJS+=nbd.o block.o aio.o
 
 ifdef CONFIG_WIN32
 BLOCK_OBJS += block-raw-win32.o
diff --git a/Makefile.target b/Makefile.target
index 88e877ff4b..4a490f4f85 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -474,7 +474,7 @@ endif #CONFIG_DARWIN_USER
 ifndef CONFIG_USER_ONLY
 
 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
-OBJS+=fw_cfg.o
+OBJS+=fw_cfg.o aio.o
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/aio.c b/aio.c
new file mode 100644
index 0000000000..687e4bef08
--- /dev/null
+++ b/aio.c
@@ -0,0 +1,192 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block.h"
+#include "sys-queue.h"
+#include "qemu_socket.h"
+
+typedef struct AioHandler AioHandler;
+
+/* The list of registered AIO handlers */
+static LIST_HEAD(, AioHandler) aio_handlers;
+
+/* This is a simple lock used to protect the aio_handlers list.  Specifically,
+ * it's used to ensure that no callbacks are removed while we're walking and
+ * dispatching callbacks.
+ */
+static int walking_handlers;
+
+struct AioHandler
+{
+    int fd;
+    IOHandler *io_read;
+    IOHandler *io_write;
+    AioFlushHandler *io_flush;
+    int deleted;
+    void *opaque;
+    LIST_ENTRY(AioHandler) node;
+};
+
+static AioHandler *find_aio_handler(int fd)
+{
+    AioHandler *node;
+
+    LIST_FOREACH(node, &aio_handlers, node) {
+        if (node->fd == fd)
+            return node;
+    }
+
+    return NULL;
+}
+
+int qemu_aio_set_fd_handler(int fd,
+                            IOHandler *io_read,
+                            IOHandler *io_write,
+                            AioFlushHandler *io_flush,
+                            void *opaque)
+{
+    AioHandler *node;
+
+    node = find_aio_handler(fd);
+
+    /* Are we deleting the fd handler? */
+    if (!io_read && !io_write) {
+        if (node) {
+            /* If the lock is held, just mark the node as deleted */
+            if (walking_handlers)
+                node->deleted = 1;
+            else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock.
+                 */
+                LIST_REMOVE(node, node);
+                qemu_free(node);
+            }
+        }
+    } else {
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = qemu_mallocz(sizeof(AioHandler));
+            if (node == NULL)
+                return -ENOMEM;
+            node->fd = fd;
+            LIST_INSERT_HEAD(&aio_handlers, node, node);
+        }
+        /* Update handler with latest information */
+        node->io_read = io_read;
+        node->io_write = io_write;
+        node->io_flush = io_flush;
+        node->opaque = opaque;
+    }
+
+    qemu_set_fd_handler2(fd, NULL, io_read, io_write, opaque);
+
+    return 0;
+}
+
+void qemu_aio_flush(void)
+{
+    AioHandler *node;
+    int ret;
+
+    do {
+        ret = 0;
+
+        LIST_FOREACH(node, &aio_handlers, node) {
+            ret |= node->io_flush(node->opaque);
+        }
+
+        qemu_aio_wait();
+    } while (ret > 0);
+}
+
+void qemu_aio_wait(void)
+{
+    int ret;
+
+    if (qemu_bh_poll())
+        return;
+
+    do {
+        AioHandler *node;
+        fd_set rdfds, wrfds;
+        int max_fd = -1;
+
+        walking_handlers = 1;
+
+        FD_ZERO(&rdfds);
+        FD_ZERO(&wrfds);
+
+        /* fill fd sets */
+        LIST_FOREACH(node, &aio_handlers, node) {
+            /* If there aren't pending AIO operations, don't invoke callbacks.
+             * Otherwise, if there are no AIO requests, qemu_aio_wait() would
+             * wait indefinitely.
+             */
+            if (node->io_flush && node->io_flush(node->opaque) == 0)
+                continue;
+
+            if (!node->deleted && node->io_read) {
+                FD_SET(node->fd, &rdfds);
+                max_fd = MAX(max_fd, node->fd + 1);
+            }
+            if (!node->deleted && node->io_write) {
+                FD_SET(node->fd, &wrfds);
+                max_fd = MAX(max_fd, node->fd + 1);
+            }
+        }
+
+        walking_handlers = 0;
+
+        /* No AIO operations?  Get us out of here */
+        if (max_fd == -1)
+            break;
+
+        /* wait until next event */
+        ret = select(max_fd, &rdfds, &wrfds, NULL, NULL);
+        if (ret == -1 && errno == EINTR)
+            continue;
+
+        /* if we have any readable fds, dispatch event */
+        if (ret > 0) {
+            walking_handlers = 1;
+
+            /* we have to walk very carefully in case
+             * qemu_aio_set_fd_handler is called while we're walking */
+            node = LIST_FIRST(&aio_handlers);
+            while (node) {
+                AioHandler *tmp;
+
+                if (!node->deleted &&
+                    FD_ISSET(node->fd, &rdfds) &&
+                    node->io_read) {
+                    node->io_read(node->opaque);
+                }
+                if (!node->deleted &&
+                    FD_ISSET(node->fd, &wrfds) &&
+                    node->io_write) {
+                    node->io_write(node->opaque);
+                }
+
+                tmp = node;
+                node = LIST_NEXT(node, node);
+
+                if (tmp->deleted) {
+                    LIST_REMOVE(tmp, node);
+                    qemu_free(tmp);
+                }
+            }
+
+            walking_handlers = 0;
+        }
+    } while (ret == 0);
+}
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 26819a4d8c..41f997686f 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -101,6 +101,8 @@ typedef struct BDRVRawState {
 #endif
 } BDRVRawState;
 
+static int posix_aio_init(void);
+
 static int fd_open(BlockDriverState *bs);
 
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
@@ -108,6 +110,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
 
+    posix_aio_init();
+
     s->lseek_err_cnt = 0;
 
     open_flags = O_BINARY;
@@ -437,13 +441,15 @@ typedef struct RawAIOCB {
     int ret;
 } RawAIOCB;
 
-static int aio_sig_fd = -1;
-static int aio_sig_num = SIGUSR2;
-static RawAIOCB *first_aio; /* AIO issued */
-static int aio_initialized = 0;
-
-static void qemu_aio_poll(void *opaque)
+typedef struct PosixAioState
 {
+    int fd;
+    RawAIOCB *first_aio;
+} PosixAioState;
+
+static void posix_aio_read(void *opaque)
+{
+    PosixAioState *s = opaque;
     RawAIOCB *acb, **pacb;
     int ret;
     size_t offset;
@@ -457,7 +463,7 @@ static void qemu_aio_poll(void *opaque)
     while (offset < 128) {
         ssize_t len;
 
-        len = read(aio_sig_fd, sig.buf + offset, 128 - offset);
+        len = read(s->fd, sig.buf + offset, 128 - offset);
         if (len == -1 && errno == EINTR)
             continue;
         if (len == -1 && errno == EAGAIN) {
@@ -472,7 +478,7 @@ static void qemu_aio_poll(void *opaque)
     }
 
     for(;;) {
-        pacb = &first_aio;
+        pacb = &s->first_aio;
         for(;;) {
             acb = *pacb;
             if (!acb)
@@ -507,25 +513,37 @@ static void qemu_aio_poll(void *opaque)
  the_end: ;
 }
 
-void qemu_aio_init(void)
+static int posix_aio_flush(void *opaque)
+{
+    PosixAioState *s = opaque;
+    return !!s->first_aio;
+}
+
+static PosixAioState *posix_aio_state;
+
+static int posix_aio_init(void)
 {
     sigset_t mask;
+    PosixAioState *s;
+
+    if (posix_aio_state)
+        return 0;
 
-    if (aio_initialized)
-        return;
-
-    aio_initialized = 1;
+    s = qemu_malloc(sizeof(PosixAioState));
+    if (s == NULL)
+        return -ENOMEM;
 
     /* Make sure to block AIO signal */
     sigemptyset(&mask);
-    sigaddset(&mask, aio_sig_num);
+    sigaddset(&mask, SIGUSR2);
     sigprocmask(SIG_BLOCK, &mask, NULL);
 
-    aio_sig_fd = qemu_signalfd(&mask);
+    s->first_aio = NULL;
+    s->fd = qemu_signalfd(&mask);
 
-    fcntl(aio_sig_fd, F_SETFL, O_NONBLOCK);
+    fcntl(s->fd, F_SETFL, O_NONBLOCK);
 
-    qemu_set_fd_handler2(aio_sig_fd, NULL, qemu_aio_poll, NULL, NULL);
+    qemu_aio_set_fd_handler(s->fd, posix_aio_read, NULL, posix_aio_flush, s);
 
 #if defined(__GLIBC__) && defined(__linux__)
     {
@@ -539,39 +557,9 @@ void qemu_aio_init(void)
         aio_init(&ai);
     }
 #endif
-}
+    posix_aio_state = s;
 
-/* Wait for all IO requests to complete.  */
-void qemu_aio_flush(void)
-{
-    qemu_aio_poll(NULL);
-    while (first_aio) {
-        qemu_aio_wait();
-    }
-}
-
-void qemu_aio_wait(void)
-{
-    int ret;
-
-    if (qemu_bh_poll())
-        return;
-
-    if (!first_aio)
-        return;
-
-    do {
-        fd_set rdfds;
-
-        FD_ZERO(&rdfds);
-        FD_SET(aio_sig_fd, &rdfds);
-
-        ret = select(aio_sig_fd + 1, &rdfds, NULL, NULL, NULL);
-        if (ret == -1 && errno == EINTR)
-            continue;
-    } while (ret == 0);
-
-    qemu_aio_poll(NULL);
+    return 0;
 }
 
 static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
@@ -588,7 +576,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
     if (!acb)
         return NULL;
     acb->aiocb.aio_fildes = s->fd;
-    acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num;
+    acb->aiocb.aio_sigevent.sigev_signo = SIGUSR2;
     acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
     acb->aiocb.aio_buf = buf;
     if (nb_sectors < 0)
@@ -596,8 +584,8 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
     else
         acb->aiocb.aio_nbytes = nb_sectors * 512;
     acb->aiocb.aio_offset = sector_num * 512;
-    acb->next = first_aio;
-    first_aio = acb;
+    acb->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = acb;
     return acb;
 }
 
@@ -688,7 +676,7 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
     }
 
     /* remove the callback from the queue */
-    pacb = &first_aio;
+    pacb = &posix_aio_state->first_aio;
     for(;;) {
         if (*pacb == NULL) {
             break;
@@ -701,21 +689,10 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
     }
 }
 
-# else /* CONFIG_AIO */
-
-void qemu_aio_init(void)
+#else /* CONFIG_AIO */
+static int posix_aio_init(void)
 {
+    return 0;
 }
-
-void qemu_aio_flush(void)
-{
-}
-
-void qemu_aio_wait(void)
-{
-    qemu_bh_poll();
-}
-
 #endif /* CONFIG_AIO */
 
 static void raw_close(BlockDriverState *bs)
@@ -921,6 +898,8 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
 
+    posix_aio_init();
+
 #ifdef CONFIG_COCOA
     if (strstart(filename, "/dev/cdrom", NULL)) {
         kern_return_t kernResult;
diff --git a/block-raw-win32.c b/block-raw-win32.c
index 71404acb58..fd4a9e3a4c 100644
--- a/block-raw-win32.c
+++ b/block-raw-win32.c
@@ -339,19 +339,6 @@ static int raw_create(const char *filename, int64_t total_size,
     return 0;
 }
 
-void qemu_aio_init(void)
-{
-}
-
-void qemu_aio_flush(void)
-{
-}
-
-void qemu_aio_wait(void)
-{
-    qemu_bh_poll();
-}
-
 BlockDriver bdrv_raw = {
     "raw",
     sizeof(BDRVRawState),
diff --git a/block.c b/block.c
index 15f807a0ca..27b39d65d8 100644
--- a/block.c
+++ b/block.c
@@ -1310,8 +1310,6 @@ void bdrv_init(void)
     bdrv_register(&bdrv_qcow2);
     bdrv_register(&bdrv_parallels);
    bdrv_register(&bdrv_nbd);
-
-    qemu_aio_init();
 }
 
 void *qemu_aio_get(BlockDriverState *bs, BlockDriverCompletionFunc *cb,
diff --git a/block.h b/block.h
index d774a2e1f6..d0e9fd49e4 100644
--- a/block.h
+++ b/block.h
@@ -1,6 +1,8 @@
 #ifndef BLOCK_H
 #define BLOCK_H
 
+#include "qemu-aio.h"
+
 /* block.c */
 
 typedef struct BlockDriver BlockDriver;
@@ -87,10 +89,6 @@ BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
-void qemu_aio_init(void);
-void qemu_aio_flush(void);
-void qemu_aio_wait(void);
-
 int qemu_key_check(BlockDriverState *bs, const char *name);
 
 /* Ensure contents are flushed to disk.  */
diff --git a/qemu-aio.h b/qemu-aio.h
new file mode 100644
index 0000000000..79678293ef
--- /dev/null
+++ b/qemu-aio.h
@@ -0,0 +1,45 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#include "qemu-common.h"
+#include "qemu-char.h"
+
+/* Returns 1 if there are still outstanding AIO requests; 0 otherwise */
+typedef int (AioFlushHandler)(void *opaque);
+
+/* Flush any pending AIO operations.  This function will block until all
+ * outstanding AIO operations have been completed or cancelled. */
+void qemu_aio_flush(void);
+
+/* Wait for a single AIO completion to occur.  This function will block until
+ * a single AIO operation has completed.  It is intended to be used as a
+ * looping primitive when simulating synchronous IO based on asynchronous IO. */
+void qemu_aio_wait(void);
+
+/* Register a file descriptor and associated callbacks.  Behaves very similarly
+ * to qemu_set_fd_handler2.  Unlike qemu_set_fd_handler2, these callbacks will
+ * be invoked when using either qemu_aio_wait() or qemu_aio_flush().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+int qemu_aio_set_fd_handler(int fd,
+                            IOHandler *io_read,
+                            IOHandler *io_write,
+                            AioFlushHandler *io_flush,
+                            void *opaque);
+
+#endif
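
For reference, the registration pattern introduced above (posix_aio_read as the
io_read callback, posix_aio_flush as the io_flush callback) is the model any
other AIO backend would follow. Below is a minimal sketch of that pattern
against the new qemu-aio.h interface; it is illustrative only, not part of the
patch, and the MyAioState type and my_* names are hypothetical:

    #include "qemu-aio.h"

    typedef struct MyAioState {
        int fd;           /* becomes readable when a request completes */
        int outstanding;  /* number of requests in flight */
    } MyAioState;

    /* Invoked from the qemu_aio_wait() select() loop when 'fd' is readable. */
    static void my_completion_cb(void *opaque)
    {
        MyAioState *s = opaque;

        /* ... reap completed requests and run their
         * BlockDriverCompletionFunc callbacks ... */
        s->outstanding--;
    }

    /* io_flush contract: nonzero while requests are in flight, 0 when idle,
     * so an idle fd can never make qemu_aio_wait() block forever. */
    static int my_flush_cb(void *opaque)
    {
        MyAioState *s = opaque;

        return s->outstanding > 0;
    }

    static int my_aio_init(MyAioState *s)
    {
        /* Register once; passing NULL for both io_read and io_write later
         * would unregister the handler. */
        return qemu_aio_set_fd_handler(s->fd, my_completion_cb, NULL,
                                       my_flush_cb, s);
    }

qemu_aio_flush() then loops, calling qemu_aio_wait() until every registered
io_flush callback reports that no requests remain outstanding.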