Pull request for 5.2

NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
 improvements. Elena Afanasova's ioeventfd fixes are also included.
 
 Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAl+ixjgACgkQnKSrs4Gr
 c8iZYgf+OB2eAGsdZO97fKh6VUUoRKa+BgWKuh37Cfpp3q+dLuIFMSKfU/UgprLc
 aowt6uTFfwudDV9KltUB2EiXIzpuf7JhMNOiDRkyEvYSj4KHRPsQmFCd35Nrjezy
 VvxSGafe2Z60Qnvcx+iGeMATSFX9YTcTZeHttC07v7dWn/yEK3b1hobcmjCcwWeR
 Ud8pjMyh5E2z/NpW8E669/byJf9iahx3LSQxSWt+9PVTPuftAB0Suu+m6svz1wvk
 sjVfIbtVWCp2BdGf5U6a2rEqF3+kIcFkfHp+MwgE0EdMz1wfjudaPl13a0C4DSun
 PSt9E+Ct5BTrDUvqCHvQDOaFiMZTPg==
 =Poyb
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/stefanha-gitlab/tags/block-pull-request' into staging

Pull request for 5.2

NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
improvements. Elena Afanasova's ioeventfd fixes are also included.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

# gpg: Signature made Wed 04 Nov 2020 15:18:16 GMT
# gpg:                using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full]
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>" [full]
# Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35  775A 9CA4 ABB3 81AB 73C8

* remotes/stefanha-gitlab/tags/block-pull-request: (33 commits)
  util/vfio-helpers: Assert offset is aligned to page size
  util/vfio-helpers: Convert vfio_dump_mapping to trace events
  util/vfio-helpers: Improve DMA trace events
  util/vfio-helpers: Trace where BARs are mapped
  util/vfio-helpers: Trace PCI BAR region info
  util/vfio-helpers: Trace PCI I/O config accesses
  util/vfio-helpers: Improve reporting unsupported IOMMU type
  block/nvme: Fix nvme_submit_command() on big-endian host
  block/nvme: Fix use of write-only doorbells page on Aarch64 arch
  block/nvme: Align iov's va and size on host page size
  block/nvme: Change size and alignment of prp_list_pages
  block/nvme: Change size and alignment of queue
  block/nvme: Change size and alignment of IDENTIFY response buffer
  block/nvme: Correct minimum device page size
  block/nvme: Set request_alignment at initialization
  block/nvme: Simplify nvme_cmd_sync()
  block/nvme: Simplify ADMIN queue access
  block/nvme: Correctly initialize Admin Queue Attributes
  block/nvme: Use definitions instead of magic values in add_io_queue()
  block/nvme: Introduce Completion Queue definitions
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2020-11-23 13:03:13 +00:00
commit 683685e72d
8 changed files with 195 additions and 134 deletions

View file

@ -1882,6 +1882,7 @@ M: Klaus Jensen <its@irrelevant.dk>
L: qemu-block@nongnu.org
S: Supported
F: hw/block/nvme*
F: include/block/nvme.h
F: tests/qtest/nvme-test.c
F: docs/specs/nvme.txt
T: git git://git.infradead.org/qemu-nvme.git nvme-next
@ -2972,6 +2973,7 @@ R: Fam Zheng <fam@euphon.net>
L: qemu-block@nongnu.org
S: Supported
F: block/nvme*
F: include/block/nvme.h
T: git https://github.com/stefanha/qemu.git block
Bootdevice

View file

@ -2239,8 +2239,10 @@ static int kvm_init(MachineState *ms)
kvm_memory_listener_register(s, &s->memory_listener,
&address_space_memory, 0);
memory_listener_register(&kvm_io_listener,
&address_space_io);
if (kvm_eventfds_allowed) {
memory_listener_register(&kvm_io_listener,
&address_space_io);
}
memory_listener_register(&kvm_coalesced_pio_listener,
&address_space_io);

View file

@ -41,6 +41,16 @@
typedef struct BDRVNVMeState BDRVNVMeState;
/* Same index is used for queues and IRQs */
#define INDEX_ADMIN 0
#define INDEX_IO(n) (1 + n)
/* This driver shares a single MSIX IRQ for the admin and I/O queues */
enum {
MSIX_SHARED_IRQ_IDX = 0,
MSIX_IRQ_COUNT = 1
};
typedef struct {
int32_t head, tail;
uint8_t *queue;
@ -81,18 +91,10 @@ typedef struct {
QEMUBH *completion_bh;
} NVMeQueuePair;
#define INDEX_ADMIN 0
#define INDEX_IO(n) (1 + n)
/* This driver shares a single MSIX IRQ for the admin and I/O queues */
enum {
MSIX_SHARED_IRQ_IDX = 0,
MSIX_IRQ_COUNT = 1
};
struct BDRVNVMeState {
AioContext *aio_context;
QEMUVFIOState *vfio;
void *bar0_wo_map;
/* Memory mapped registers */
volatile struct {
uint32_t sq_tail;
@ -103,7 +105,7 @@ struct BDRVNVMeState {
* [1..]: io queues.
*/
NVMeQueuePair **queues;
int nr_queues;
unsigned queue_count;
size_t page_size;
/* How many uint32_t elements does each doorbell entry take. */
size_t doorbell_scale;
@ -159,28 +161,32 @@ static QemuOptsList runtime_opts = {
},
};
static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
int nentries, int entry_bytes, Error **errp)
/* Returns true on success, false on failure. */
static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
unsigned nentries, size_t entry_bytes, Error **errp)
{
size_t bytes;
int r;
bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
q->head = q->tail = 0;
q->queue = qemu_try_memalign(s->page_size, bytes);
q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
if (!q->queue) {
error_setg(errp, "Cannot allocate queue");
return;
return false;
}
memset(q->queue, 0, bytes);
r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
if (r) {
error_setg(errp, "Cannot map queue");
return false;
}
return true;
}
static void nvme_free_queue_pair(NVMeQueuePair *q)
{
trace_nvme_free_queue_pair(q->index, q);
if (q->completion_bh) {
qemu_bh_delete(q->completion_bh);
}
@ -204,31 +210,33 @@ static void nvme_free_req_queue_cb(void *opaque)
static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
AioContext *aio_context,
int idx, int size,
unsigned idx, size_t size,
Error **errp)
{
int i, r;
Error *local_err = NULL;
NVMeQueuePair *q;
uint64_t prp_list_iova;
size_t bytes;
q = g_try_new0(NVMeQueuePair, 1);
if (!q) {
return NULL;
}
q->prp_list_pages = qemu_try_memalign(s->page_size,
s->page_size * NVME_NUM_REQS);
trace_nvme_create_queue_pair(idx, q, size, aio_context,
event_notifier_get_fd(s->irq_notifier));
bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
qemu_real_host_page_size);
q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
if (!q->prp_list_pages) {
goto fail;
}
memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
memset(q->prp_list_pages, 0, bytes);
qemu_mutex_init(&q->lock);
q->s = s;
q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
s->page_size * NVME_NUM_REQS,
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
false, &prp_list_iova);
if (r) {
goto fail;
@ -243,16 +251,12 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
req->prp_list_iova = prp_list_iova + i * s->page_size;
}
nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
goto fail;
}
q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
if (local_err) {
error_propagate(errp, local_err);
if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
goto fail;
}
q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
@ -292,7 +296,7 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
while (q->free_req_head == -1) {
if (qemu_in_coroutine()) {
trace_nvme_free_req_queue_wait(q);
trace_nvme_free_req_queue_wait(q->s, q->index);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
} else {
qemu_mutex_unlock(&q->lock);
@ -399,8 +403,8 @@ static bool nvme_process_completion(NVMeQueuePair *q)
}
cid = le16_to_cpu(c->cid);
if (cid == 0 || cid > NVME_QUEUE_SIZE) {
fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
cid);
warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
"queue size: %u", cid, NVME_QUEUE_SIZE);
continue;
}
trace_nvme_complete_command(s, q->index, cid);
@ -465,7 +469,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
assert(!req->cb);
req->cb = cb;
req->opaque = opaque;
cmd->cid = cpu_to_le32(req->cid);
cmd->cid = cpu_to_le16(req->cid);
trace_nvme_submit_command(q->s, q->index, req->cid);
nvme_trace_command(cmd);
@ -479,16 +483,17 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
qemu_mutex_unlock(&q->lock);
}
static void nvme_cmd_sync_cb(void *opaque, int ret)
static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
{
int *pret = opaque;
*pret = ret;
aio_wait_kick();
}
static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
NvmeCmd *cmd)
static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
{
BDRVNVMeState *s = bs->opaque;
NVMeQueuePair *q = s->queues[INDEX_ADMIN];
AioContext *aio_context = bdrv_get_aio_context(bs);
NVMeRequest *req;
int ret = -EINPROGRESS;
@ -496,15 +501,17 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
if (!req) {
return -EBUSY;
}
nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
return ret;
}
static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
/* Returns true on success, false on failure. */
static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
bool ret = false;
union {
NvmeIdCtrl ctrl;
NvmeIdNs ns;
@ -517,21 +524,22 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
.opcode = NVME_ADM_CMD_IDENTIFY,
.cdw10 = cpu_to_le32(0x1),
};
size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
id = qemu_try_memalign(s->page_size, sizeof(*id));
id = qemu_try_memalign(qemu_real_host_page_size, id_size);
if (!id) {
error_setg(errp, "Cannot allocate buffer for identify response");
goto out;
}
r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
if (r) {
error_setg(errp, "Cannot map buffer for DMA");
goto out;
}
memset(id, 0, sizeof(*id));
memset(id, 0, id_size);
cmd.dptr.prp1 = cpu_to_le64(iova);
if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to identify controller");
goto out;
}
@ -551,10 +559,10 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
s->supports_discard = !!(oncs & NVME_ONCS_DSM);
memset(id, 0, sizeof(*id));
memset(id, 0, id_size);
cmd.cdw10 = 0;
cmd.nsid = cpu_to_le32(namespace);
if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to identify namespace");
goto out;
}
@ -581,10 +589,13 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
goto out;
}
ret = true;
s->blkshift = lbaf->ds;
out:
qemu_vfio_dma_unmap(s->vfio, id);
qemu_vfree(id);
return ret;
}
static bool nvme_poll_queue(NVMeQueuePair *q)
@ -594,6 +605,7 @@ static bool nvme_poll_queue(NVMeQueuePair *q)
const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
trace_nvme_poll_queue(q->s, q->index);
/*
* Do an early check for completions. q->lock isn't needed because
* nvme_process_completion() only runs in the event loop thread and
@ -618,7 +630,7 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
bool progress = false;
int i;
for (i = 0; i < s->nr_queues; i++) {
for (i = 0; i < s->queue_count; i++) {
if (nvme_poll_queue(s->queues[i])) {
progress = true;
}
@ -639,11 +651,12 @@ static void nvme_handle_event(EventNotifier *n)
static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
int n = s->nr_queues;
unsigned n = s->queue_count;
NVMeQueuePair *q;
NvmeCmd cmd;
int queue_size = NVME_QUEUE_SIZE;
unsigned queue_size = NVME_QUEUE_SIZE;
assert(n <= UINT16_MAX);
q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
n, queue_size, errp);
if (!q) {
@ -652,26 +665,26 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
cmd = (NvmeCmd) {
.opcode = NVME_ADM_CMD_CREATE_CQ,
.dptr.prp1 = cpu_to_le64(q->cq.iova),
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
.cdw11 = cpu_to_le32(0x3),
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
.cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
};
if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
error_setg(errp, "Failed to create CQ io queue [%d]", n);
if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to create CQ io queue [%u]", n);
goto out_error;
}
cmd = (NvmeCmd) {
.opcode = NVME_ADM_CMD_CREATE_SQ,
.dptr.prp1 = cpu_to_le64(q->sq.iova),
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
.cdw11 = cpu_to_le32(0x1 | (n << 16)),
.cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
.cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
};
if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
error_setg(errp, "Failed to create SQ io queue [%d]", n);
if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to create SQ io queue [%u]", n);
goto out_error;
}
s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
s->queues[n] = q;
s->nr_queues++;
s->queue_count++;
return true;
out_error:
nvme_free_queue_pair(q);
@ -684,7 +697,6 @@ static bool nvme_poll_cb(void *opaque)
BDRVNVMeState *s = container_of(e, BDRVNVMeState,
irq_notifier[MSIX_SHARED_IRQ_IDX]);
trace_nvme_poll_cb(s);
return nvme_poll_queues(s);
}
@ -692,12 +704,12 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
Error **errp)
{
BDRVNVMeState *s = bs->opaque;
NVMeQueuePair *q;
AioContext *aio_context = bdrv_get_aio_context(bs);
int ret;
uint64_t cap;
uint64_t timeout_ms;
uint64_t deadline, now;
Error *local_err = NULL;
volatile NvmeBar *regs = NULL;
qemu_co_mutex_init(&s->dma_map_lock);
@ -727,15 +739,29 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
* Initialization". */
cap = le64_to_cpu(regs->cap);
trace_nvme_controller_capability_raw(cap);
trace_nvme_controller_capability("Maximum Queue Entries Supported",
1 + NVME_CAP_MQES(cap));
trace_nvme_controller_capability("Contiguous Queues Required",
NVME_CAP_CQR(cap));
trace_nvme_controller_capability("Doorbell Stride",
2 << (2 + NVME_CAP_DSTRD(cap)));
trace_nvme_controller_capability("Subsystem Reset Supported",
NVME_CAP_NSSRS(cap));
trace_nvme_controller_capability("Memory Page Size Minimum",
1 << (12 + NVME_CAP_MPSMIN(cap)));
trace_nvme_controller_capability("Memory Page Size Maximum",
1 << (12 + NVME_CAP_MPSMAX(cap)));
if (!NVME_CAP_CSS(cap)) {
error_setg(errp, "Device doesn't support NVMe command set");
ret = -EINVAL;
goto out;
}
s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
bs->bl.opt_mem_alignment = s->page_size;
bs->bl.request_alignment = s->page_size;
timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
/* Reset device to get a clean state. */
@ -752,8 +778,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
}
s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
NVME_DOORBELL_SIZE, PROT_WRITE, errp);
s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
PROT_WRITE, errp);
s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
if (!s->doorbells) {
ret = -EINVAL;
goto out;
@ -761,19 +789,18 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
/* Set up admin queue. */
s->queues = g_new(NVMeQueuePair *, 1);
s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
NVME_QUEUE_SIZE,
errp);
if (!s->queues[INDEX_ADMIN]) {
q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
if (!q) {
ret = -EINVAL;
goto out;
}
s->nr_queues = 1;
QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
(NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
s->queues[INDEX_ADMIN] = q;
s->queue_count = 1;
QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
regs->asq = cpu_to_le64(q->sq.iova);
regs->acq = cpu_to_le64(q->cq.iova);
/* After setting up all control registers we can enable device now. */
regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
@ -801,9 +828,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
nvme_identify(bs, namespace, &local_err);
if (local_err) {
error_propagate(errp, local_err);
if (!nvme_identify(bs, namespace, errp)) {
ret = -EIO;
goto out;
}
@ -869,7 +894,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
.cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
};
ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
ret = nvme_admin_cmd_sync(bs, &cmd);
if (ret) {
error_setg(errp, "Failed to configure NVMe write cache");
}
@ -878,10 +903,9 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
static void nvme_close(BlockDriverState *bs)
{
int i;
BDRVNVMeState *s = bs->opaque;
for (i = 0; i < s->nr_queues; ++i) {
for (unsigned i = 0; i < s->queue_count; ++i) {
nvme_free_queue_pair(s->queues[i]);
}
g_free(s->queues);
@ -889,8 +913,8 @@ static void nvme_close(BlockDriverState *bs)
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
sizeof(NvmeBar), NVME_DOORBELL_SIZE);
qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
qemu_vfio_close(s->vfio);
g_free(s->device);
@ -994,11 +1018,12 @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
for (i = 0; i < qiov->niov; ++i) {
bool retry = true;
uint64_t iova;
size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
qemu_real_host_page_size);
try_map:
r = qemu_vfio_dma_map(s->vfio,
qiov->iov[i].iov_base,
qiov->iov[i].iov_len,
true, &iova);
len, true, &iova);
if (r == -ENOMEM && retry) {
retry = false;
trace_nvme_dma_flush_queue_wait(s);
@ -1106,7 +1131,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
};
trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
assert(s->nr_queues > 1);
assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
@ -1142,8 +1167,9 @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
BDRVNVMeState *s = bs->opaque;
for (i = 0; i < qiov->niov; ++i) {
if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
!QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
qemu_real_host_page_size) ||
!QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
qiov->iov[i].iov_len, s->page_size);
return false;
@ -1159,7 +1185,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
int r;
uint8_t *buf = NULL;
QEMUIOVector local_qiov;
size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
assert(QEMU_IS_ALIGNED(offset, s->page_size));
assert(QEMU_IS_ALIGNED(bytes, s->page_size));
assert(bytes <= s->max_transfer);
@ -1169,7 +1195,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
}
s->stats.unaligned_accesses++;
trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
buf = qemu_try_memalign(s->page_size, bytes);
buf = qemu_try_memalign(qemu_real_host_page_size, len);
if (!buf) {
return -ENOMEM;
@ -1216,7 +1242,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
.ret = -EINPROGRESS,
};
assert(s->nr_queues > 1);
assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
@ -1268,7 +1294,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
cmd.cdw12 = cpu_to_le32(cdw12);
trace_nvme_write_zeroes(s, offset, bytes, flags);
assert(s->nr_queues > 1);
assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
@ -1311,7 +1337,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
return -ENOTSUP;
}
assert(s->nr_queues > 1);
assert(s->queue_count > 1);
buf = qemu_try_memalign(s->page_size, s->page_size);
if (!buf) {
@ -1391,7 +1417,7 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
{
BDRVNVMeState *s = bs->opaque;
for (int i = 0; i < s->nr_queues; i++) {
for (unsigned i = 0; i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_bh_delete(q->completion_bh);
@ -1412,7 +1438,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
for (int i = 0; i < s->nr_queues; i++) {
for (unsigned i = 0; i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
q->completion_bh =
@ -1429,11 +1455,10 @@ static void nvme_aio_plug(BlockDriverState *bs)
static void nvme_aio_unplug(BlockDriverState *bs)
{
int i;
BDRVNVMeState *s = bs->opaque;
assert(s->plugged);
s->plugged = false;
for (i = INDEX_IO(0); i < s->nr_queues; i++) {
for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_mutex_lock(&q->lock);
nvme_kick(q);

View file

@ -134,25 +134,29 @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
# nvme.c
nvme_kick(void *s, int queue) "s %p queue %d"
nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
nvme_kick(void *s, unsigned q_index) "s %p q #%u"
nvme_dma_flush_queue_wait(void *s) "s %p"
nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
nvme_handle_event(void *s) "s %p"
nvme_poll_cb(void *s) "s %p"
nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
nvme_dma_map_flush(void *s) "s %p"
nvme_free_req_queue_wait(void *q) "q %p"
nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"

View file

@ -501,6 +501,11 @@ typedef struct QEMU_PACKED NvmeCreateCq {
#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
enum NvmeFlagsCq {
NVME_CQ_PC = 1,
NVME_CQ_IEN = 2,
};
typedef struct QEMU_PACKED NvmeCreateSq {
uint8_t opcode;
uint8_t flags;
@ -518,12 +523,13 @@ typedef struct QEMU_PACKED NvmeCreateSq {
#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
enum NvmeQueueFlags {
NVME_Q_PC = 1,
NVME_Q_PRIO_URGENT = 0,
NVME_Q_PRIO_HIGH = 1,
NVME_Q_PRIO_NORMAL = 2,
NVME_Q_PRIO_LOW = 3,
enum NvmeFlagsSq {
NVME_SQ_PC = 1,
NVME_SQ_PRIO_URGENT = 0,
NVME_SQ_PRIO_HIGH = 1,
NVME_SQ_PRIO_NORMAL = 2,
NVME_SQ_PRIO_LOW = 3,
};
typedef struct QEMU_PACKED NvmeIdentify {

View file

@ -205,8 +205,15 @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
MemoryRegionIoeventfd *b)
{
return !memory_region_ioeventfd_before(a, b)
&& !memory_region_ioeventfd_before(b, a);
if (int128_eq(a->addr.start, b->addr.start) &&
(!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
(int128_eq(a->addr.size, b->addr.size) &&
(a->match_data == b->match_data) &&
((a->match_data && (a->data == b->data)) || !a->match_data) &&
(a->e == b->e))))
return true;
return false;
}
/* Range of memory in the global map. Addresses are absolute. */

View file

@ -80,8 +80,14 @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
qemu_vfio_dma_reset_temporary(void *s) "s %p"
qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"

View file

@ -137,6 +137,7 @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
g_autofree char *barname = NULL;
assert_bar_index_valid(s, index);
s->bar_region_info[index] = (struct vfio_region_info) {
.index = VFIO_PCI_BAR0_REGION_INDEX + index,
@ -146,6 +147,10 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
error_setg_errno(errp, errno, "Failed to get BAR region info");
return -errno;
}
barname = g_strdup_printf("bar[%d]", index);
trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
s->bar_region_info[index].size,
s->bar_region_info[index].cap_offset);
return 0;
}
@ -158,10 +163,13 @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
Error **errp)
{
void *p;
assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
assert_bar_index_valid(s, index);
p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
prot, MAP_SHARED,
s->device, s->bar_region_info[index].offset + offset);
trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
size, offset, p);
if (p == MAP_FAILED) {
error_setg_errno(errp, errno, "Failed to map BAR region");
p = NULL;
@ -228,6 +236,10 @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
{
int ret;
trace_qemu_vfio_pci_read_config(buf, ofs, size,
s->config_region_info.offset,
s->config_region_info.size);
assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
do {
ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
@ -238,6 +250,10 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
{
int ret;
trace_qemu_vfio_pci_write_config(buf, ofs, size,
s->config_region_info.offset,
s->config_region_info.size);
assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
do {
ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
@ -301,7 +317,7 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
}
if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
error_setg_errno(errp, errno, "VFIO IOMMU check failed");
error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
ret = -EINVAL;
goto fail_container;
}
@ -409,6 +425,9 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
ret = -errno;
goto fail;
}
trace_qemu_vfio_region_info("config", s->config_region_info.offset,
s->config_region_info.size,
s->config_region_info.cap_offset);
for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
ret = qemu_vfio_pci_init_bar(s, i, errp);
@ -516,23 +535,12 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
return s;
}
static void qemu_vfio_dump_mapping(IOVAMapping *m)
{
if (QEMU_VFIO_DEBUG) {
printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
(uint64_t)m->size, (uint64_t)m->iova);
}
}
static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
int i;
if (QEMU_VFIO_DEBUG) {
printf("vfio mappings\n");
for (i = 0; i < s->nr_mappings; ++i) {
qemu_vfio_dump_mapping(&s->mappings[i]);
}
for (int i = 0; i < s->nr_mappings; ++i) {
trace_qemu_vfio_dump_mapping(s->mappings[i].host,
s->mappings[i].iova,
s->mappings[i].size);
}
}
@ -622,7 +630,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
.vaddr = (uintptr_t)host,
.size = size,
};
trace_qemu_vfio_do_mapping(s, host, size, iova);
trace_qemu_vfio_do_mapping(s, host, iova, size);
if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
@ -778,6 +786,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
}
}
}
trace_qemu_vfio_dma_mapped(s, host, iova0, size);
if (iova) {
*iova = iova0;
}