qemu/hw/rdma/rdma_rm.c
Yuval Shaia 7d2ce4b016 hw/rdma: Bugfix - Support non-aligned buffers
An RDMA application can provide non-aligned buffers to be registered. In
such a case the DMA address passed by the driver points to the beginning
of the physical address of the mapped page, so we can't distinguish
between two addresses from the same page.

Fix it by keeping the offset of the virtual address in mr->virt.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Reviewed-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
Message-Id: <20180805153518.2983-13-yuval.shaia@oracle.com>
Signed-off-by: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
2018-08-18 18:00:55 +03:00

554 lines
14 KiB
C

/*
* QEMU paravirtual RDMA - Resource Manager Implementation
*
* Copyright (C) 2018 Oracle
* Copyright (C) 2018 Red Hat Inc
*
* Authors:
* Yuval Shaia <yuval.shaia@oracle.com>
* Marcel Apfelbaum <marcel@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "cpu.h"
#include "rdma_utils.h"
#include "rdma_backend.h"
#include "rdma_rm.h"
/*
 * Page directory and page tables: number of 64-bit entries that fit in one
 * target page.  The expansions are parenthesized so the macros can be used
 * as ordinary integer expressions (array sizes, comparisons, arithmetic).
 */
#define PG_DIR_SZ (TARGET_PAGE_SIZE / sizeof(__u64))
#define PG_TBL_SZ (TARGET_PAGE_SIZE / sizeof(__u64))
/*
 * Initialize a resource table.
 *
 * @name:   table name; copied into tbl->name and truncated so that it fits in
 *          MAX_RM_TBL_NAME bytes including the terminating NUL
 * @tbl:    table to initialize
 * @tbl_sz: number of entries; also the number of bits in the allocation bitmap
 * @res_sz: size in bytes of a single entry
 */
static inline void res_tbl_init(const char *name, RdmaRmResTbl *tbl,
                                uint32_t tbl_sz, uint32_t res_sz)
{
    /*
     * Both factors are uint32_t, so the plain product would be computed in
     * 32 bits and could wrap, under-allocating the backing store.  Do the
     * multiplication in size_t instead.
     */
    tbl->tbl = g_malloc((size_t)tbl_sz * res_sz);

    strncpy(tbl->name, name, MAX_RM_TBL_NAME);
    /* strncpy() does not terminate the destination when @name is too long */
    tbl->name[MAX_RM_TBL_NAME - 1] = 0;

    tbl->bitmap = bitmap_new(tbl_sz);
    tbl->tbl_sz = tbl_sz;
    tbl->res_sz = res_sz;
    qemu_mutex_init(&tbl->lock);
}
/*
 * Release everything res_tbl_init() set up for @tbl: the lock, the entry
 * storage and the allocation bitmap.
 */
static inline void res_tbl_free(RdmaRmResTbl *tbl)
{
    qemu_mutex_destroy(&tbl->lock);
    g_free(tbl->tbl);
    /*
     * Free the bitmap directly.  Shrinking it to zero bits through
     * bitmap_zero_extend() would ask that helper to clear a negative number
     * of bits in the reallocated buffer, which is not a valid way to release
     * it.
     */
    g_free(tbl->bitmap);
}
/*
 * Look up the entry stored under @handle.
 *
 * Returns a pointer to the entry when @handle is below the table size and its
 * bit is set in the allocation bitmap; returns NULL otherwise.  The table
 * lock is not taken by this function.
 */
static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle)
{
    pr_dbg("%s, handle=%d\n", tbl->name, handle);

    if ((handle < tbl->tbl_sz) && (test_bit(handle, tbl->bitmap))) {
        /*
         * Compute the byte offset in size_t so that handle * res_sz cannot
         * wrap at 32 bits and point at the wrong entry.
         */
        return tbl->tbl + (size_t)handle * tbl->res_sz;
    }

    pr_dbg("Invalid handle %d\n", handle);
    return NULL;
}
/*
 * Reserve the first free slot of @tbl.
 *
 * On success the slot index is stored in *@handle, the slot's bit is set in
 * the bitmap, the entry is zero-filled and a pointer to it is returned.
 * When no slot is free, NULL is returned and *@handle holds the table size.
 *
 * The bitmap search and update run under tbl->lock; the entry is zeroed
 * after the lock has been dropped.
 */
static inline void *res_tbl_alloc(RdmaRmResTbl *tbl, uint32_t *handle)
{
    void *entry;

    qemu_mutex_lock(&tbl->lock);

    *handle = find_first_zero_bit(tbl->bitmap, tbl->tbl_sz);
    /*
     * find_first_zero_bit() returns the bitmap size when every bit is set,
     * so "full" is handle == tbl_sz.  Valid indices are 0 .. tbl_sz - 1;
     * accepting tbl_sz would set a bit and hand out memory past the end of
     * the table.
     */
    if (*handle >= tbl->tbl_sz) {
        pr_dbg("Failed to alloc, bitmap is full\n");
        qemu_mutex_unlock(&tbl->lock);
        return NULL;
    }

    set_bit(*handle, tbl->bitmap);

    qemu_mutex_unlock(&tbl->lock);

    /* Offset computed in size_t so it cannot wrap at 32 bits */
    entry = tbl->tbl + (size_t)*handle * tbl->res_sz;
    memset(entry, 0, tbl->res_sz);

    pr_dbg("%s, handle=%d\n", tbl->name, *handle);

    return entry;
}
/*
 * Mark slot @handle of @tbl as free again.
 *
 * Only the bitmap bit is cleared, under tbl->lock; the entry contents are
 * left untouched.  A handle outside the table is silently ignored.
 */
static inline void res_tbl_dealloc(RdmaRmResTbl *tbl, uint32_t handle)
{
    pr_dbg("%s, handle=%d\n", tbl->name, handle);

    qemu_mutex_lock(&tbl->lock);

    if (tbl->tbl_sz > handle) {
        clear_bit(handle, tbl->bitmap);
    }

    qemu_mutex_unlock(&tbl->lock);
}
/*
 * Allocate a protection domain and create its backend counterpart.
 *
 * On success *@pd_handle holds the new PD's table index and @ctx_handle is
 * recorded in the PD.
 *
 * Returns 0 on success, -ENOMEM when the PD table has no free slot, or -EIO
 * when the backend PD cannot be created (the table slot is released again).
 */
int rdma_rm_alloc_pd(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                     uint32_t *pd_handle, uint32_t ctx_handle)
{
    RdmaRmPD *new_pd = res_tbl_alloc(&dev_res->pd_tbl, pd_handle);

    if (new_pd == NULL) {
        return -ENOMEM;
    }

    if (rdma_backend_create_pd(backend_dev, &new_pd->backend_pd) != 0) {
        /* Backend refused: give the slot back before reporting the error */
        res_tbl_dealloc(&dev_res->pd_tbl, *pd_handle);
        return -EIO;
    }

    new_pd->ctx_handle = ctx_handle;

    return 0;
}
/*
 * Return the protection domain registered under @pd_handle, or NULL when the
 * handle is out of range or not currently allocated.
 */
RdmaRmPD *rdma_rm_get_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle)
{
    RdmaRmPD *found = res_tbl_get(&dev_res->pd_tbl, pd_handle);

    return found;
}
/*
 * Destroy the backend PD behind @pd_handle and release its table slot.
 * Does nothing when the handle does not refer to an allocated PD.
 */
void rdma_rm_dealloc_pd(RdmaDeviceResources *dev_res, uint32_t pd_handle)
{
    RdmaRmPD *victim = rdma_rm_get_pd(dev_res, pd_handle);

    if (victim == NULL) {
        return;
    }

    rdma_backend_destroy_pd(&victim->backend_pd);
    res_tbl_dealloc(&dev_res->pd_tbl, pd_handle);
}
/*
 * Allocate a memory region in the PD identified by @pd_handle.
 *
 * When @host_virt is non-NULL the region is also registered with the backend.
 * Before registration mr->virt is advanced by the offset of @guest_start
 * within its target page, so buffers that do not start on a page boundary
 * keep their in-page offset.  When @host_virt is NULL no backend MR is
 * created and start/length/virt keep the zero values from table allocation.
 *
 * Outputs: *@mr_handle is the table index, *@lkey is set to the same value
 * and *@rkey is set to -1.
 *
 * Returns 0 on success, -EINVAL for an invalid PD, -ENOMEM when the MR table
 * is full, or -EIO when backend registration fails (the slot is released).
 */
int rdma_rm_alloc_mr(RdmaDeviceResources *dev_res, uint32_t pd_handle,
                     uint64_t guest_start, size_t guest_length, void *host_virt,
                     int access_flags, uint32_t *mr_handle, uint32_t *lkey,
                     uint32_t *rkey)
{
    RdmaRmPD *owner_pd;
    RdmaRmMR *mr;

    owner_pd = rdma_rm_get_pd(dev_res, pd_handle);
    if (owner_pd == NULL) {
        pr_dbg("Invalid PD\n");
        return -EINVAL;
    }

    mr = res_tbl_alloc(&dev_res->mr_tbl, mr_handle);
    if (mr == NULL) {
        pr_dbg("Failed to allocate obj in table\n");
        return -ENOMEM;
    }

    pr_dbg("mr_handle=%d\n", *mr_handle);
    pr_dbg("host_virt=0x%p\n", host_virt);
    pr_dbg("guest_start=0x%" PRIx64 "\n", guest_start);
    pr_dbg("length=%zu\n", guest_length);

    if (host_virt != NULL) {
        int rc;

        mr->start = guest_start;
        mr->length = guest_length;
        mr->virt = host_virt;
        /* Re-apply the in-page offset of the guest address */
        mr->virt += (mr->start & (TARGET_PAGE_SIZE - 1));

        rc = rdma_backend_create_mr(&mr->backend_mr, &owner_pd->backend_pd,
                                    mr->virt, mr->length, access_flags);
        if (rc != 0) {
            pr_dbg("Fail in rdma_backend_create_mr, err=%d\n", rc);
            res_tbl_dealloc(&dev_res->mr_tbl, *mr_handle);
            return -EIO;
        }
    }

    /* We keep mr_handle in lkey so send and recv can get the mr ptr */
    *lkey = *mr_handle;
    *rkey = -1;

    mr->pd_handle = pd_handle;

    return 0;
}
/*
 * Return the memory region registered under @mr_handle, or NULL when the
 * handle is out of range or not currently allocated.
 */
RdmaRmMR *rdma_rm_get_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle)
{
    RdmaRmMR *found = res_tbl_get(&dev_res->mr_tbl, mr_handle);

    return found;
}
/*
 * Tear down the memory region behind @mr_handle.
 *
 * The backend MR is destroyed first.  If the region has a non-zero guest
 * start address, the in-page offset applied in rdma_rm_alloc_mr() is removed
 * from mr->virt and the range is unmapped.  Finally the table slot is
 * released.  Does nothing for a handle that is not allocated.
 *
 * NOTE(review): the mapping being unmapped is presumably created by the
 * caller of rdma_rm_alloc_mr() - confirm against the caller.
 */
void rdma_rm_dealloc_mr(RdmaDeviceResources *dev_res, uint32_t mr_handle)
{
    RdmaRmMR *victim = rdma_rm_get_mr(dev_res, mr_handle);

    if (victim == NULL) {
        return;
    }

    rdma_backend_destroy_mr(&victim->backend_mr);

    pr_dbg("start=0x%" PRIx64 "\n", victim->start);
    if (victim->start != 0) {
        /* Step back to the address that was originally handed in */
        victim->virt -= (victim->start & (TARGET_PAGE_SIZE - 1));
        munmap(victim->virt, victim->length);
    }

    res_tbl_dealloc(&dev_res->mr_tbl, mr_handle);
}
/*
 * Allocate a user context and store its table index in *@uc_handle.
 *
 * TODO: validate @pfn.  It should lie between the BAR start address and
 * BAR start + RDMA_BAR2_UAR_SIZE; an out-of-range value was meant to be
 * reported with pr_err() and rejected.  The check is not implemented, so
 * @pfn is currently unused.
 *
 * Returns 0 on success or -ENOMEM when the UC table has no free slot.
 */
int rdma_rm_alloc_uc(RdmaDeviceResources *dev_res, uint32_t pfn,
                     uint32_t *uc_handle)
{
    RdmaRmUC *new_uc = res_tbl_alloc(&dev_res->uc_tbl, uc_handle);

    return new_uc ? 0 : -ENOMEM;
}
/*
 * Return the user context registered under @uc_handle, or NULL when the
 * handle is out of range or not currently allocated.
 */
RdmaRmUC *rdma_rm_get_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle)
{
    RdmaRmUC *found = res_tbl_get(&dev_res->uc_tbl, uc_handle);

    return found;
}
/*
 * Release the table slot of the user context behind @uc_handle.
 * Does nothing when the handle does not refer to an allocated context.
 */
void rdma_rm_dealloc_uc(RdmaDeviceResources *dev_res, uint32_t uc_handle)
{
    if (rdma_rm_get_uc(dev_res, uc_handle) == NULL) {
        return;
    }

    res_tbl_dealloc(&dev_res->uc_tbl, uc_handle);
}
/*
 * Return the completion queue registered under @cq_handle, or NULL when the
 * handle is out of range or not currently allocated.
 */
RdmaRmCQ *rdma_rm_get_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle)
{
    RdmaRmCQ *found = res_tbl_get(&dev_res->cq_tbl, cq_handle);

    return found;
}
/*
 * Allocate a completion queue with room for @cqe entries and create its
 * backend counterpart.
 *
 * The new CQ stores @opaque and starts with notification disabled.  Its
 * table index is returned through *@cq_handle.
 *
 * Returns 0 on success, -ENOMEM when the CQ table is full, or -EIO when the
 * backend CQ cannot be created.
 */
int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                     uint32_t cqe, uint32_t *cq_handle, void *opaque)
{
    RdmaRmCQ *new_cq = res_tbl_alloc(&dev_res->cq_tbl, cq_handle);

    if (new_cq == NULL) {
        return -ENOMEM;
    }

    new_cq->opaque = opaque;
    new_cq->notify = false;

    if (rdma_backend_create_cq(backend_dev, &new_cq->backend_cq, cqe) != 0) {
        /*
         * NOTE(review): unlike the PD/MR/QP error paths, this one goes
         * through rdma_rm_dealloc_cq(), which also calls
         * rdma_backend_destroy_cq() on the CQ whose creation just failed -
         * confirm the backend tolerates that.
         */
        rdma_rm_dealloc_cq(dev_res, *cq_handle);
        return -EIO;
    }

    return 0;
}
/*
 * Record the requested notification state @notify on the completion queue
 * behind @cq_handle.  An invalid handle is ignored.
 */
void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle,
                           bool notify)
{
    RdmaRmCQ *target;

    pr_dbg("cq_handle=%d, notify=0x%x\n", cq_handle, notify);

    target = rdma_rm_get_cq(dev_res, cq_handle);
    if (target != NULL) {
        target->notify = notify;
        pr_dbg("notify=%d\n", target->notify);
    }
}
/*
 * Destroy the backend CQ behind @cq_handle and release its table slot.
 * Does nothing when the handle does not refer to an allocated CQ.
 */
void rdma_rm_dealloc_cq(RdmaDeviceResources *dev_res, uint32_t cq_handle)
{
    RdmaRmCQ *victim = rdma_rm_get_cq(dev_res, cq_handle);

    if (victim != NULL) {
        rdma_backend_destroy_cq(&victim->backend_cq);
        res_tbl_dealloc(&dev_res->cq_tbl, cq_handle);
    }
}
/*
 * Find a queue pair by @qpn in dev_res->qp_hash.
 *
 * The hash is keyed by a GBytes holding the raw bytes of the number under
 * which rdma_rm_alloc_qp() inserted the QP (the backend QP number), so a
 * temporary key is built for the lookup and dropped afterwards.
 *
 * Returns the QP, or NULL when no entry matches.
 */
RdmaRmQP *rdma_rm_get_qp(RdmaDeviceResources *dev_res, uint32_t qpn)
{
    RdmaRmQP *found;
    GBytes *lookup_key;

    lookup_key = g_bytes_new(&qpn, sizeof(qpn));
    found = g_hash_table_lookup(dev_res->qp_hash, lookup_key);
    g_bytes_unref(lookup_key);

    return found;
}
/*
 * Allocate a queue pair in PD @pd_handle, attached to the send and receive
 * completion queues @send_cq_handle and @recv_cq_handle.
 *
 * The QP gets a slot in the QP table (its index is kept in qp->qpn), starts
 * in state IBV_QPS_RESET and is then created in the backend.  The number
 * returned through *@qpn is the backend QP number; the QP is inserted into
 * dev_res->qp_hash under that number.
 *
 * Returns 0 on success, -EINVAL for an invalid PD or CQ handle, -ENOMEM when
 * the QP table is full, or -EIO when the backend QP cannot be created (the
 * table slot is released).
 */
int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, uint32_t pd_handle,
                     uint8_t qp_type, uint32_t max_send_wr,
                     uint32_t max_send_sge, uint32_t send_cq_handle,
                     uint32_t max_recv_wr, uint32_t max_recv_sge,
                     uint32_t recv_cq_handle, void *opaque, uint32_t *qpn)
{
    RdmaRmPD *owner_pd;
    RdmaRmCQ *send_cq, *recv_cq;
    RdmaRmQP *new_qp;
    uint32_t tbl_idx;

    pr_dbg("qp_type=%d\n", qp_type);

    owner_pd = rdma_rm_get_pd(dev_res, pd_handle);
    if (owner_pd == NULL) {
        pr_err("Invalid pd handle (%d)\n", pd_handle);
        return -EINVAL;
    }

    /* Both CQs are looked up before either result is checked */
    send_cq = rdma_rm_get_cq(dev_res, send_cq_handle);
    recv_cq = rdma_rm_get_cq(dev_res, recv_cq_handle);
    if (send_cq == NULL || recv_cq == NULL) {
        pr_err("Invalid send_cqn or recv_cqn (%d, %d)\n",
               send_cq_handle, recv_cq_handle);
        return -EINVAL;
    }

    new_qp = res_tbl_alloc(&dev_res->qp_tbl, &tbl_idx);
    if (new_qp == NULL) {
        return -ENOMEM;
    }
    pr_dbg("rm_qpn=%d\n", tbl_idx);

    new_qp->qpn = tbl_idx;
    new_qp->qp_state = IBV_QPS_RESET;
    new_qp->qp_type = qp_type;
    new_qp->send_cq_handle = send_cq_handle;
    new_qp->recv_cq_handle = recv_cq_handle;
    new_qp->opaque = opaque;

    if (rdma_backend_create_qp(&new_qp->backend_qp, qp_type,
                               &owner_pd->backend_pd, &send_cq->backend_cq,
                               &recv_cq->backend_cq, max_send_wr, max_recv_wr,
                               max_send_sge, max_recv_sge) != 0) {
        res_tbl_dealloc(&dev_res->qp_tbl, new_qp->qpn);
        return -EIO;
    }

    /* Callers address the QP by its backend number, not by the table index */
    *qpn = rdma_backend_qpn(&new_qp->backend_qp);
    pr_dbg("rm_qpn=%d, backend_qpn=0x%x\n", tbl_idx, *qpn);
    g_hash_table_insert(dev_res->qp_hash, g_bytes_new(qpn, sizeof(*qpn)),
                        new_qp);

    return 0;
}
/*
 * Apply a modify-QP request to the queue pair looked up by @qp_handle.
 *
 * SMI queue pairs are rejected with -EPERM and GSI queue pairs are accepted
 * without any change.  For other types, when IBV_QP_STATE is set in
 * @attr_mask the new state is stored in the QP and, for the INIT, RTR and RTS
 * states, forwarded to the matching backend transition.  Other bits of
 * @attr_mask are only used to tell the backend whether @qkey is valid.
 *
 * Returns 0 on success, -EINVAL when the QP is not found, -EPERM for an SMI
 * QP, or -EIO when a backend transition fails.
 */
int rdma_rm_modify_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                      uint32_t qp_handle, uint32_t attr_mask,
                      union ibv_gid *dgid, uint32_t dqpn,
                      enum ibv_qp_state qp_state, uint32_t qkey,
                      uint32_t rq_psn, uint32_t sq_psn)
{
    RdmaRmQP *qp;

    pr_dbg("qpn=0x%x\n", qp_handle);

    qp = rdma_rm_get_qp(dev_res, qp_handle);
    if (qp == NULL) {
        return -EINVAL;
    }

    pr_dbg("qp_type=%d\n", qp->qp_type);
    pr_dbg("attr_mask=0x%x\n", attr_mask);

    if (qp->qp_type == IBV_QPT_SMI) {
        pr_dbg("QP0 unsupported\n");
        return -EPERM;
    }
    if (qp->qp_type == IBV_QPT_GSI) {
        pr_dbg("QP1\n");
        return 0;
    }

    if (!(attr_mask & IBV_QP_STATE)) {
        /* No state change requested: nothing else is handled here */
        return 0;
    }

    /* The state is recorded before the backend is asked to follow */
    qp->qp_state = qp_state;
    pr_dbg("qp_state=%d\n", qp->qp_state);

    if (qp->qp_state == IBV_QPS_INIT) {
        if (rdma_backend_qp_state_init(backend_dev, &qp->backend_qp,
                                       qp->qp_type, qkey) != 0) {
            return -EIO;
        }
    }

    if (qp->qp_state == IBV_QPS_RTR) {
        if (rdma_backend_qp_state_rtr(backend_dev, &qp->backend_qp,
                                      qp->qp_type, dgid, dqpn, rq_psn,
                                      qkey, attr_mask & IBV_QP_QKEY) != 0) {
            return -EIO;
        }
    }

    if (qp->qp_state == IBV_QPS_RTS) {
        if (rdma_backend_qp_state_rts(&qp->backend_qp, qp->qp_type,
                                      sq_psn, qkey,
                                      attr_mask & IBV_QP_QKEY) != 0) {
            return -EIO;
        }
    }

    return 0;
}
/*
 * Query the backend for the attributes of the queue pair looked up by
 * @qp_handle; @attr, @attr_mask and @init_attr are passed through to
 * rdma_backend_query_qp().
 *
 * Returns -EINVAL when the QP is not found, otherwise the backend's result.
 */
int rdma_rm_query_qp(RdmaDeviceResources *dev_res, RdmaBackendDev *backend_dev,
                     uint32_t qp_handle, struct ibv_qp_attr *attr,
                     int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    RdmaRmQP *target;

    pr_dbg("qpn=0x%x\n", qp_handle);

    target = rdma_rm_get_qp(dev_res, qp_handle);
    if (target == NULL) {
        return -EINVAL;
    }

    pr_dbg("qp_type=%d\n", target->qp_type);

    return rdma_backend_query_qp(&target->backend_qp, attr, attr_mask,
                                 init_attr);
}
/*
 * Remove the queue pair keyed by @qp_handle from dev_res->qp_hash, destroy
 * its backend QP and release its slot in the QP table (indexed by qp->qpn).
 *
 * The hash entry is removed whether or not the lookup found a QP; the rest
 * is skipped when nothing was found.
 */
void rdma_rm_dealloc_qp(RdmaDeviceResources *dev_res, uint32_t qp_handle)
{
    GBytes *lookup_key = g_bytes_new(&qp_handle, sizeof(qp_handle));
    RdmaRmQP *victim = g_hash_table_lookup(dev_res->qp_hash, lookup_key);

    g_hash_table_remove(dev_res->qp_hash, lookup_key);
    g_bytes_unref(lookup_key);

    if (victim != NULL) {
        rdma_backend_destroy_qp(&victim->backend_qp);
        res_tbl_dealloc(&dev_res->qp_tbl, victim->qpn);
    }
}
/*
 * Return the context pointer stored under @cqe_ctx_id, or NULL when the id
 * does not refer to an allocated slot of the CQE context table.
 */
void *rdma_rm_get_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id)
{
    void **slot = res_tbl_get(&dev_res->cqe_ctx_tbl, cqe_ctx_id);

    if (slot == NULL) {
        return NULL;
    }

    pr_dbg("ctx=%p\n", *slot);

    return *slot;
}
/*
 * Store @ctx in a free slot of the CQE context table and return the slot
 * index through *@cqe_ctx_id.
 *
 * Returns 0 on success or -ENOMEM when the table has no free slot.
 */
int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t *cqe_ctx_id,
                          void *ctx)
{
    void **slot = res_tbl_alloc(&dev_res->cqe_ctx_tbl, cqe_ctx_id);

    if (slot == NULL) {
        return -ENOMEM;
    }

    pr_dbg("ctx=%p\n", ctx);
    *slot = ctx;

    return 0;
}
/*
 * Release slot @cqe_ctx_id of the CQE context table.  The stored context
 * pointer itself is not touched.
 */
void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources *dev_res, uint32_t cqe_ctx_id)
{
    RdmaRmResTbl *ctx_tbl = &dev_res->cqe_ctx_tbl;

    res_tbl_dealloc(ctx_tbl, cqe_ctx_id);
}
/*
 * Key destructor passed to g_hash_table_new_full() for qp_hash: drops the
 * reference on the GBytes key.
 */
static void destroy_qp_hash_key(gpointer data)
{
    GBytes *key = data;

    g_bytes_unref(key);
}
/*
 * Set up all resource-manager state in @dev_res.
 *
 * Creates the QP hash (GBytes keys released through destroy_qp_hash_key, no
 * value destructor) and the PD, CQ, MR, QP, CQE-context and UC tables.  The
 * first four are sized from the matching limits in @dev_attr, the
 * CQE-context table holds max_qp * max_qp_wr pointers and the UC table holds
 * MAX_UCS entries.
 *
 * @errp is not used by this function.
 *
 * Returns 0 on success or -ENOMEM when the hash table cannot be created.
 */
int rdma_rm_init(RdmaDeviceResources *dev_res, struct ibv_device_attr *dev_attr,
                 Error **errp)
{
    GHashTable *qp_map;

    qp_map = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                   destroy_qp_hash_key, NULL);
    dev_res->qp_hash = qp_map;
    if (qp_map == NULL) {
        return -ENOMEM;
    }

    res_tbl_init("PD", &dev_res->pd_tbl, dev_attr->max_pd, sizeof(RdmaRmPD));
    res_tbl_init("CQ", &dev_res->cq_tbl, dev_attr->max_cq, sizeof(RdmaRmCQ));
    res_tbl_init("MR", &dev_res->mr_tbl, dev_attr->max_mr, sizeof(RdmaRmMR));
    res_tbl_init("QP", &dev_res->qp_tbl, dev_attr->max_qp, sizeof(RdmaRmQP));
    /* One context slot per work request of every possible QP */
    res_tbl_init("CQE_CTX", &dev_res->cqe_ctx_tbl,
                 dev_attr->max_qp * dev_attr->max_qp_wr, sizeof(void *));
    res_tbl_init("UC", &dev_res->uc_tbl, MAX_UCS, sizeof(RdmaRmUC));

    return 0;
}
/*
 * Release everything rdma_rm_init() created: the resource tables, in the
 * reverse of their initialization order, and then the QP hash table.
 */
void rdma_rm_fini(RdmaDeviceResources *dev_res)
{
    RdmaRmResTbl *const teardown_order[] = {
        &dev_res->uc_tbl,
        &dev_res->cqe_ctx_tbl,
        &dev_res->qp_tbl,
        &dev_res->mr_tbl,
        &dev_res->cq_tbl,
        &dev_res->pd_tbl,
    };
    size_t i;

    for (i = 0; i < sizeof(teardown_order) / sizeof(teardown_order[0]); i++) {
        res_tbl_free(teardown_order[i]);
    }

    g_hash_table_destroy(dev_res->qp_hash);
}