freebsd-src/contrib/ofed/libmlx4/verbs.c
Sean Lim a687910fc4 Cleanup pthread locks in ofed RDMA verbs
On FreeBSD, initializing a pthread mutex, condition variable, or spinlock
allocates memory; on Linux-based systems these calls do not.  It was
therefore safe on Linux for the ofed RDMA verbs code to never explicitly
destroy its pthread locks, but that assumption is false on FreeBSD, where
it leaks memory.  Rearrange the code so that every pthread lock is
destroyed on both the error and teardown paths.

Reviewed by:	delphij
MFC after:	2 weeks
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D41105
2023-09-19 09:10:42 -05:00
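
The fix follows one pattern throughout the file: every pthread_spin_init()
or pthread_mutex_init() gains a matching destroy call on each error and
teardown path.  A minimal sketch of that pattern (the struct obj and its
functions here are illustrative only, not part of this file):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_spinlock_t lock;
};

struct obj *obj_create(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;
	if (pthread_spin_init(&o->lock, PTHREAD_PROCESS_PRIVATE))
		goto err;	/* init failed: nothing to destroy yet */
	/* later acquisition steps would jump to a label that runs
	 * pthread_spin_destroy() before freeing the object */
	return o;

err:
	free(o);
	return NULL;
}

void obj_destroy(struct obj *o)
{
	/* on FreeBSD this releases memory allocated by pthread_spin_init() */
	pthread_spin_destroy(&o->lock);
	free(o);
}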

/*
 * Copyright (c) 2007 Cisco, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials
 *   provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <config.h>

#include <infiniband/endian.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <errno.h>

#include "mlx4.h"
#include "mlx4-abi.h"
#include "wqe.h"

int mlx4_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
{
	struct ibv_query_device cmd;
	uint64_t raw_fw_ver;
	unsigned major, minor, sub_minor;
	int ret;

	ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
	if (ret)
		return ret;

	major = (raw_fw_ver >> 32) & 0xffff;
	minor = (raw_fw_ver >> 16) & 0xffff;
	sub_minor = raw_fw_ver & 0xffff;

	snprintf(attr->fw_ver, sizeof attr->fw_ver,
		 "%d.%d.%03d", major, minor, sub_minor);

	return 0;
}

int mlx4_query_device_ex(struct ibv_context *context,
			 const struct ibv_query_device_ex_input *input,
			 struct ibv_device_attr_ex *attr,
			 size_t attr_size)
{
	struct mlx4_context *mctx = to_mctx(context);
	struct mlx4_query_device_ex_resp resp = {};
	struct mlx4_query_device_ex cmd = {};
	uint64_t raw_fw_ver;
	unsigned sub_minor;
	unsigned major;
	unsigned minor;
	int err;

	err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
				      &raw_fw_ver,
				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
				      &resp.ibv_resp, sizeof(resp.ibv_resp),
				      sizeof(resp));
	if (err)
		return err;

	if (resp.comp_mask & MLX4_QUERY_DEV_RESP_MASK_CORE_CLOCK_OFFSET) {
		mctx->core_clock.offset = resp.hca_core_clock_offset;
		mctx->core_clock.offset_valid = 1;
	}

	major = (raw_fw_ver >> 32) & 0xffff;
	minor = (raw_fw_ver >> 16) & 0xffff;
	sub_minor = raw_fw_ver & 0xffff;

	snprintf(attr->orig_attr.fw_ver, sizeof attr->orig_attr.fw_ver,
		 "%d.%d.%03d", major, minor, sub_minor);

	return 0;
}

#define READL(ptr) (*((uint32_t *)(ptr)))

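/*
 * Read the 64-bit free-running HCA core clock from the mapped clock
 * page.  The high word is read before and after the low word; if the
 * two reads differ, the low word rolled over in between and the read
 * is retried.
 */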
static int mlx4_read_clock(struct ibv_context *context, uint64_t *cycles)
{
	unsigned int clockhi, clocklo, clockhi1;
	int i;
	struct mlx4_context *ctx = to_mctx(context);

	if (!ctx->hca_core_clock)
		return -EOPNOTSUPP;

	/* Handle wraparound */
	for (i = 0; i < 2; i++) {
		clockhi = be32toh(READL(ctx->hca_core_clock));
		clocklo = be32toh(READL(ctx->hca_core_clock + 4));
		clockhi1 = be32toh(READL(ctx->hca_core_clock));
		if (clockhi == clockhi1)
			break;
	}

	*cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;

	return 0;
}

int mlx4_query_rt_values(struct ibv_context *context,
			 struct ibv_values_ex *values)
{
	uint32_t comp_mask = 0;
	int err = 0;

	if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
		uint64_t cycles;

		err = mlx4_read_clock(context, &cycles);
		if (!err) {
			values->raw_clock.tv_sec = 0;
			values->raw_clock.tv_nsec = cycles;
			comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
		}
	}

	values->comp_mask = comp_mask;

	return err;
}

int mlx4_query_port(struct ibv_context *context, uint8_t port,
		    struct ibv_port_attr *attr)
{
	struct ibv_query_port cmd;
	int err;

	err = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
	if (!err && port <= MLX4_PORTS_NUM && port > 0) {
		struct mlx4_context *mctx = to_mctx(context);

		if (!mctx->port_query_cache[port - 1].valid) {
			mctx->port_query_cache[port - 1].link_layer =
				attr->link_layer;
			mctx->port_query_cache[port - 1].caps =
				attr->port_cap_flags;
			mctx->port_query_cache[port - 1].valid = 1;
		}
	}

	return err;
}

/* Only the fields in the port cache will be valid */
static int query_port_cache(struct ibv_context *context, uint8_t port_num,
			    struct ibv_port_attr *port_attr)
{
	struct mlx4_context *mctx = to_mctx(context);

	if (port_num <= 0 || port_num > MLX4_PORTS_NUM)
		return -EINVAL;

	if (mctx->port_query_cache[port_num - 1].valid) {
		port_attr->link_layer =
			mctx->port_query_cache[port_num - 1].link_layer;
		port_attr->port_cap_flags =
			mctx->port_query_cache[port_num - 1].caps;
		return 0;
	}

	return mlx4_query_port(context, port_num,
			       (struct ibv_port_attr *)port_attr);
}

struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context)
{
	struct ibv_alloc_pd cmd;
	struct mlx4_alloc_pd_resp resp;
	struct mlx4_pd *pd;

	pd = malloc(sizeof *pd);
	if (!pd)
		return NULL;

	if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
			     &resp.ibv_resp, sizeof resp)) {
		free(pd);
		return NULL;
	}

	pd->pdn = resp.pdn;

	return &pd->ibv_pd;
}

int mlx4_free_pd(struct ibv_pd *pd)
{
	int ret;

	ret = ibv_cmd_dealloc_pd(pd);
	if (ret)
		return ret;

	free(to_mpd(pd));
	return 0;
}

struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
				struct ibv_xrcd_init_attr *attr)
{
	struct ibv_open_xrcd cmd;
	struct ibv_open_xrcd_resp resp;
	struct verbs_xrcd *xrcd;
	int ret;

	xrcd = calloc(1, sizeof *xrcd);
	if (!xrcd)
		return NULL;

	ret = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), attr,
				&cmd, sizeof cmd, &resp, sizeof resp);
	if (ret)
		goto err;

	return &xrcd->xrcd;

err:
	free(xrcd);
	return NULL;
}

int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
{
	struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
	int ret;

	ret = ibv_cmd_close_xrcd(xrcd);
	if (!ret)
		free(xrcd);

	return ret;
}

struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
			   int access)
{
	struct ibv_mr *mr;
	struct ibv_reg_mr cmd;
	struct ibv_reg_mr_resp resp;
	int ret;

	mr = malloc(sizeof *mr);
	if (!mr)
		return NULL;

	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t) addr,
			     access, mr, &cmd, sizeof cmd,
			     &resp, sizeof resp);
	if (ret) {
		free(mr);
		return NULL;
	}

	return mr;
}

int mlx4_rereg_mr(struct ibv_mr *mr,
		  int flags,
		  struct ibv_pd *pd, void *addr,
		  size_t length, int access)
{
	struct ibv_rereg_mr cmd;
	struct ibv_rereg_mr_resp resp;

	if (flags & IBV_REREG_MR_KEEP_VALID)
		return ENOTSUP;

	return ibv_cmd_rereg_mr(mr, flags, addr, length,
				(uintptr_t)addr,
				access, pd,
				&cmd, sizeof(cmd),
				&resp, sizeof(resp));
}

int mlx4_dereg_mr(struct ibv_mr *mr)
{
	int ret;

	ret = ibv_cmd_dereg_mr(mr);
	if (ret)
		return ret;

	free(mr);
	return 0;
}

struct ibv_mw *mlx4_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
{
	struct ibv_mw *mw;
	struct ibv_alloc_mw cmd;
	struct ibv_alloc_mw_resp resp;
	int ret;

	mw = calloc(1, sizeof(*mw));
	if (!mw)
		return NULL;

	ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd),
			       &resp, sizeof(resp));
	if (ret) {
		free(mw);
		return NULL;
	}

	return mw;
}

int mlx4_dealloc_mw(struct ibv_mw *mw)
{
	int ret;
	struct ibv_dealloc_mw cmd;

	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
	if (ret)
		return ret;

	free(mw);
	return 0;
}

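/*
 * The MW bind is implemented by posting an IBV_WR_BIND_MW work request
 * on the QP's send queue; on success the MW's rkey is replaced with the
 * newly generated value.
 */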
int mlx4_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
		 struct ibv_mw_bind *mw_bind)
{
	struct ibv_send_wr *bad_wr = NULL;
	struct ibv_send_wr wr = { };
	int ret;

	wr.opcode = IBV_WR_BIND_MW;
	wr.next = NULL;

	wr.wr_id = mw_bind->wr_id;
	wr.send_flags = mw_bind->send_flags;

	wr.bind_mw.mw = mw;
	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
	wr.bind_mw.bind_info = mw_bind->bind_info;

	ret = mlx4_post_send(qp, &wr, &bad_wr);
	if (ret)
		return ret;

	/* updating the mw with the latest rkey. */
	mw->rkey = wr.bind_mw.rkey;

	return 0;
}

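/* Round a requested queue size up to the next power of two (minimum 1). */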
int align_queue_size(int req)
{
	int nent;

	for (nent = 1; nent < req; nent <<= 1)
		; /* nothing */

	return nent;
}

enum {
	CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS |
				       IBV_WC_EX_WITH_COMPLETION_TIMESTAMP
};

enum {
	CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
};

enum {
	CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
};

static int mlx4_cmd_create_cq(struct ibv_context *context,
			      struct ibv_cq_init_attr_ex *cq_attr,
			      struct mlx4_cq *cq)
{
	struct mlx4_create_cq cmd = {};
	struct mlx4_create_cq_resp resp = {};
	int ret;

	cmd.buf_addr = (uintptr_t) cq->buf.buf;
	cmd.db_addr = (uintptr_t) cq->set_ci_db;

	ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel,
				cq_attr->comp_vector,
				ibv_cq_ex_to_cq(&cq->ibv_cq),
				&cmd.ibv_cmd, sizeof(cmd),
				&resp.ibv_resp, sizeof(resp));
	if (!ret)
		cq->cqn = resp.cqn;

	return ret;
}

static int mlx4_cmd_create_cq_ex(struct ibv_context *context,
				 struct ibv_cq_init_attr_ex *cq_attr,
				 struct mlx4_cq *cq)
{
	struct mlx4_create_cq_ex cmd = {};
	struct mlx4_create_cq_resp_ex resp = {};
	int ret;

	cmd.buf_addr = (uintptr_t) cq->buf.buf;
	cmd.db_addr = (uintptr_t) cq->set_ci_db;

	ret = ibv_cmd_create_cq_ex(context, cq_attr,
				   &cq->ibv_cq, &cmd.ibv_cmd,
				   sizeof(cmd.ibv_cmd),
				   sizeof(cmd),
				   &resp.ibv_resp,
				   sizeof(resp.ibv_resp),
				   sizeof(resp));
	if (!ret)
		cq->cqn = resp.cqn;

	return ret;
}

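/*
 * Common CQ creation path.  Resources are acquired in order (spinlock,
 * CQE buffer, doorbell record, kernel CQ) and the error labels unwind
 * them in reverse; the pthread spinlock is destroyed explicitly, since
 * on FreeBSD pthread_spin_init() allocates memory.
 */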
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
				   struct ibv_cq_init_attr_ex *cq_attr,
				   int cq_alloc_flags)
{
	struct mlx4_cq *cq;
	int ret;
	struct mlx4_context *mctx = to_mctx(context);

	/* Sanity check CQ size before proceeding */
	if (cq_attr->cqe > 0x3fffff) {
		errno = EINVAL;
		return NULL;
	}

	if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
		errno = ENOTSUP;
		return NULL;
	}

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
		errno = ENOTSUP;
		return NULL;
	}

	if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS)
		return NULL;

	/* mlx4 devices don't support slid and sl in cqe when completion
	 * timestamp is enabled in the CQ
	 */
	if ((cq_attr->wc_flags & (IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL)) &&
	    (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)) {
		errno = ENOTSUP;
		return NULL;
	}

	cq = malloc(sizeof *cq);
	if (!cq)
		return NULL;

	cq->cons_index = 0;

	if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
		goto err;

	cq_attr->cqe = align_queue_size(cq_attr->cqe + 1);

	if (mlx4_alloc_cq_buf(to_mdev(context->device), &cq->buf, cq_attr->cqe, mctx->cqe_size))
		goto err_spl;

	cq->cqe_size = mctx->cqe_size;
	cq->set_ci_db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_CQ);
	if (!cq->set_ci_db)
		goto err_buf;

	cq->arm_db = cq->set_ci_db + 1;
	*cq->arm_db = 0;
	cq->arm_sn = 1;
	*cq->set_ci_db = 0;
	cq->flags = cq_alloc_flags;

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
		cq->flags |= MLX4_CQ_FLAGS_SINGLE_THREADED;

	--cq_attr->cqe;
	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
		ret = mlx4_cmd_create_cq_ex(context, cq_attr, cq);
	else
		ret = mlx4_cmd_create_cq(context, cq_attr, cq);

	if (ret)
		goto err_db;

	if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED)
		mlx4_cq_fill_pfns(cq, cq_attr);

	return &cq->ibv_cq;

err_db:
	mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db);

err_buf:
	mlx4_free_buf(&cq->buf);

err_spl:
	pthread_spin_destroy(&cq->lock);

err:
	free(cq);

	return NULL;
}

struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
			      struct ibv_comp_channel *channel,
			      int comp_vector)
{
	struct ibv_cq_ex *cq;
	struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
					      .comp_vector = comp_vector,
					      .wc_flags = IBV_WC_STANDARD_FLAGS};

	cq = create_cq(context, &cq_attr, 0);
	return cq ? ibv_cq_ex_to_cq(cq) : NULL;
}

struct ibv_cq_ex *mlx4_create_cq_ex(struct ibv_context *context,
				    struct ibv_cq_init_attr_ex *cq_attr)
{
	/*
	 * Make local copy since some attributes might be adjusted
	 * for internal use.
	 */
	struct ibv_cq_init_attr_ex cq_attr_c = {.cqe = cq_attr->cqe,
						.channel = cq_attr->channel,
						.comp_vector = cq_attr->comp_vector,
						.wc_flags = cq_attr->wc_flags,
						.comp_mask = cq_attr->comp_mask,
						.flags = cq_attr->flags};

	return create_cq(context, &cq_attr_c, MLX4_CQ_FLAGS_EXTENDED);
}

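/*
 * Resizing allocates a new CQE buffer, asks the kernel to switch the CQ
 * over to it, then copies any still-outstanding completions into the new
 * buffer.  The CQ spinlock is held across the whole operation.
 */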
int mlx4_resize_cq(struct ibv_cq *ibcq, int cqe)
{
	struct mlx4_cq *cq = to_mcq(ibcq);
	struct mlx4_resize_cq cmd;
	struct ibv_resize_cq_resp resp;
	struct mlx4_buf buf;
	int old_cqe, outst_cqe, ret;

	/* Sanity check CQ size before proceeding */
	if (cqe > 0x3fffff)
		return EINVAL;

	pthread_spin_lock(&cq->lock);

	cqe = align_queue_size(cqe + 1);
	if (cqe == ibcq->cqe + 1) {
		ret = 0;
		goto out;
	}

	/* Can't be smaller than the number of outstanding CQEs */
	outst_cqe = mlx4_get_outstanding_cqes(cq);
	if (cqe < outst_cqe + 1) {
		ret = EINVAL;
		goto out;
	}

	ret = mlx4_alloc_cq_buf(to_mdev(ibcq->context->device), &buf, cqe, cq->cqe_size);
	if (ret)
		goto out;

	old_cqe = ibcq->cqe;
	cmd.buf_addr = (uintptr_t) buf.buf;

	ret = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof cmd,
				&resp, sizeof resp);
	if (ret) {
		mlx4_free_buf(&buf);
		goto out;
	}

	mlx4_cq_resize_copy_cqes(cq, buf.buf, old_cqe);

	mlx4_free_buf(&cq->buf);
	cq->buf = buf;
	mlx4_update_cons_index(cq);

out:
	pthread_spin_unlock(&cq->lock);
	return ret;
}

int mlx4_destroy_cq(struct ibv_cq *cq)
{
	int ret;

	ret = ibv_cmd_destroy_cq(cq);
	if (ret)
		return ret;

	verbs_cleanup_cq(cq);
	pthread_spin_destroy(&to_mcq(cq)->lock);
	mlx4_free_db(to_mctx(cq->context), MLX4_DB_TYPE_CQ, to_mcq(cq)->set_ci_db);
	mlx4_free_buf(&to_mcq(cq)->buf);
	free(to_mcq(cq));

	return 0;
}

struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
				struct ibv_srq_init_attr *attr)
{
	struct mlx4_create_srq cmd;
	struct mlx4_create_srq_resp resp;
	struct mlx4_srq *srq;
	int ret;

	/* Sanity check SRQ size before proceeding */
	if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
		return NULL;

	srq = malloc(sizeof *srq);
	if (!srq)
		return NULL;

	if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
		goto err;

	srq->max = align_queue_size(attr->attr.max_wr + 1);
	srq->max_gs = attr->attr.max_sge;
	srq->counter = 0;
	srq->ext_srq = 0;

	if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
		goto err_spl;

	srq->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
	if (!srq->db)
		goto err_free;

	*srq->db = 0;

	cmd.buf_addr = (uintptr_t) srq->buf.buf;
	cmd.db_addr = (uintptr_t) srq->db;

	ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
				 &cmd.ibv_cmd, sizeof cmd,
				 &resp.ibv_resp, sizeof resp);
	if (ret)
		goto err_db;

	return &srq->verbs_srq.srq;

err_db:
	mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);

err_free:
	free(srq->wrid);
	mlx4_free_buf(&srq->buf);

err_spl:
	pthread_spin_destroy(&srq->lock);

err:
	free(srq);

	return NULL;
}

struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
				   struct ibv_srq_init_attr_ex *attr_ex)
{
	if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
	    (attr_ex->srq_type == IBV_SRQT_BASIC))
		return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
	else if (attr_ex->srq_type == IBV_SRQT_XRC)
		return mlx4_create_xrc_srq(context, attr_ex);

	return NULL;
}

int mlx4_modify_srq(struct ibv_srq *srq,
		    struct ibv_srq_attr *attr,
		    int attr_mask)
{
	struct ibv_modify_srq cmd;

	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
}

int mlx4_query_srq(struct ibv_srq *srq,
		   struct ibv_srq_attr *attr)
{
	struct ibv_query_srq cmd;

	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
}

int mlx4_destroy_srq(struct ibv_srq *srq)
{
	int ret;

	if (to_msrq(srq)->ext_srq)
		return mlx4_destroy_xrc_srq(srq);

	ret = ibv_cmd_destroy_srq(srq);
	if (ret)
		return ret;

	mlx4_free_db(to_mctx(srq->context), MLX4_DB_TYPE_RQ, to_msrq(srq)->db);
	mlx4_free_buf(&to_msrq(srq)->buf);
	free(to_msrq(srq)->wrid);
	pthread_spin_destroy(&to_msrq(srq)->lock);
	free(to_msrq(srq));

	return 0;
}

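/*
 * The extended create-QP command reuses the legacy mlx4_create_qp
 * layout: the two memcpy() calls below lift the generic ibv fields
 * (user_handle through is_srq) and the driver-specific fields
 * (buf_addr through sq_no_prefetch) out of the legacy command and
 * repack them into the _ex command before issuing create_qp_ex2.
 */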
static int mlx4_cmd_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr,
				 struct mlx4_create_qp *cmd,
				 struct mlx4_qp *qp)
{
	struct mlx4_create_qp_ex cmd_ex;
	struct mlx4_create_qp_resp_ex resp;
	int ret;

	memset(&cmd_ex, 0, sizeof(cmd_ex));
	memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
	       offsetof(typeof(cmd->ibv_cmd), is_srq) +
	       sizeof(cmd->ibv_cmd.is_srq) -
	       offsetof(typeof(cmd->ibv_cmd), user_handle));

	memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
	       offsetof(typeof(*cmd), sq_no_prefetch) +
	       sizeof(cmd->sq_no_prefetch) - sizeof(cmd->ibv_cmd));

	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
				    sizeof(qp->verbs_qp), attr,
				    &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
				    sizeof(cmd_ex), &resp.ibv_resp,
				    sizeof(resp.ibv_resp), sizeof(resp));
	return ret;
}

enum {
	MLX4_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
					IBV_QP_INIT_ATTR_XRCD |
					IBV_QP_INIT_ATTR_CREATE_FLAGS),
};

enum {
	MLX4_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS),
};

struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr)
{
	struct mlx4_context *ctx = to_mctx(context);
	struct mlx4_create_qp cmd;
	struct ibv_create_qp_resp resp;
	struct mlx4_qp *qp;
	int ret;

	/* Sanity check QP size before proceeding */
	if (ctx->max_qp_wr) { /* mlx4_query_device succeeded */
		if (attr->cap.max_send_wr > ctx->max_qp_wr ||
		    attr->cap.max_recv_wr > ctx->max_qp_wr ||
		    attr->cap.max_send_sge > ctx->max_sge ||
		    attr->cap.max_recv_sge > ctx->max_sge)
			return NULL;
	} else {
		if (attr->cap.max_send_wr > 65536 ||
		    attr->cap.max_recv_wr > 65536 ||
		    attr->cap.max_send_sge > 64 ||
		    attr->cap.max_recv_sge > 64)
			return NULL;
	}
	if (attr->cap.max_inline_data > 1024)
		return NULL;

	if (attr->comp_mask & ~MLX4_CREATE_QP_SUP_COMP_MASK)
		return NULL;

	qp = calloc(1, sizeof *qp);
	if (!qp)
		return NULL;

	if (attr->qp_type == IBV_QPT_XRC_RECV) {
		attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
	} else {
		mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
		/*
		 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
		 * allow HW to prefetch.
		 */
		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
		qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
	}

	if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
	    attr->qp_type == IBV_QPT_XRC_RECV) {
		attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
	} else {
		qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
		if (attr->cap.max_recv_sge < 1)
			attr->cap.max_recv_sge = 1;
		if (attr->cap.max_recv_wr < 1)
			attr->cap.max_recv_wr = 1;
	}

	if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp))
		goto err;

	mlx4_init_qp_indices(qp);

	if (pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE))
		goto err_free;
	if (pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
		goto err_sq_spl;

	if (attr->cap.max_recv_sge) {
		qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
		if (!qp->db)
			goto err_rq_spl;

		*qp->db = 0;
		cmd.db_addr = (uintptr_t) qp->db;
	} else {
		cmd.db_addr = 0;
	}

	cmd.buf_addr = (uintptr_t) qp->buf.buf;
	cmd.log_sq_stride = qp->sq.wqe_shift;
	for (cmd.log_sq_bb_count = 0;
	     qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
	     ++cmd.log_sq_bb_count)
		; /* nothing */
	cmd.sq_no_prefetch = 0;	/* OK for ABI 2: just a reserved field */
	memset(cmd.reserved, 0, sizeof cmd.reserved);

	pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);

	if (attr->comp_mask & MLX4_CREATE_QP_EX2_COMP_MASK)
		ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp);
	else
		ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp,
					   sizeof(qp->verbs_qp), attr,
					   &cmd.ibv_cmd, sizeof(cmd), &resp,
					   sizeof(resp));
	if (ret)
		goto err_rq_db;

	if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
		ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
		if (ret)
			goto err_destroy;
	}
	pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);

	qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
	qp->rq.max_gs = attr->cap.max_recv_sge;

	if (attr->qp_type != IBV_QPT_XRC_RECV)
		mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);

	qp->doorbell_qpn = htobe32(qp->verbs_qp.qp.qp_num << 8);
	if (attr->sq_sig_all)
		qp->sq_signal_bits = htobe32(MLX4_WQE_CTRL_CQ_UPDATE);
	else
		qp->sq_signal_bits = 0;

	return &qp->verbs_qp.qp;

err_destroy:
	ibv_cmd_destroy_qp(&qp->verbs_qp.qp);

err_rq_db:
	pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
	if (attr->cap.max_recv_sge)
		mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);

err_rq_spl:
	pthread_spin_destroy(&qp->rq.lock);

err_sq_spl:
	pthread_spin_destroy(&qp->sq.lock);

err_free:
	free(qp->sq.wrid);
	if (qp->rq.wqe_cnt)
		free(qp->rq.wrid);
	mlx4_free_buf(&qp->buf);

err:
	free(qp);

	return NULL;
}

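/*
 * Legacy create path: wrap the caller's ibv_qp_init_attr in an extended
 * attribute structure.  Only sizeof(*attr) bytes are copied in and back
 * out, i.e. exactly the legacy prefix of struct ibv_qp_init_attr_ex.
 */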
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
{
	struct ibv_qp_init_attr_ex attr_ex;
	struct ibv_qp *qp;

	memcpy(&attr_ex, attr, sizeof *attr);
	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
	attr_ex.pd = pd;

	qp = mlx4_create_qp_ex(pd->context, &attr_ex);
	if (qp)
		memcpy(attr, &attr_ex, sizeof *attr);

	return qp;
}

struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
{
	struct ibv_open_qp cmd;
	struct ibv_create_qp_resp resp;
	struct mlx4_qp *qp;
	int ret;

	qp = calloc(1, sizeof *qp);
	if (!qp)
		return NULL;

	ret = ibv_cmd_open_qp(context, &qp->verbs_qp, sizeof(qp->verbs_qp), attr,
			      &cmd, sizeof cmd, &resp, sizeof resp);
	if (ret)
		goto err;

	return &qp->verbs_qp.qp;

err:
	free(qp);
	return NULL;
}

int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
		  int attr_mask,
		  struct ibv_qp_init_attr *init_attr)
{
	struct ibv_query_qp cmd;
	struct mlx4_qp *qp = to_mqp(ibqp);
	int ret;

	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof cmd);
	if (ret)
		return ret;

	init_attr->cap.max_send_wr = qp->sq.max_post;
	init_attr->cap.max_send_sge = qp->sq.max_gs;
	init_attr->cap.max_inline_data = qp->max_inline_data;

	attr->cap = init_attr->cap;

	return 0;
}

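/*
 * Besides forwarding the modify to the kernel, this caches the port's
 * link layer and checksum offload capability on IBV_QP_PORT changes,
 * and on a successful transition to RESET it cleans both CQs of the
 * QP's completions and resets the queue indices and doorbell record.
 */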
int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		   int attr_mask)
{
	struct ibv_modify_qp cmd = {};
	struct ibv_port_attr port_attr;
	struct mlx4_qp *mqp = to_mqp(qp);
	struct ibv_device_attr device_attr;
	int ret;

	memset(&device_attr, 0, sizeof(device_attr));
	if (attr_mask & IBV_QP_PORT) {
		ret = ibv_query_port(qp->context, attr->port_num,
				     &port_attr);
		if (ret)
			return ret;
		mqp->link_layer = port_attr.link_layer;

		ret = ibv_query_device(qp->context, &device_attr);
		if (ret)
			return ret;

		switch (qp->qp_type) {
		case IBV_QPT_UD:
			if ((mqp->link_layer == IBV_LINK_LAYER_INFINIBAND) &&
			    (device_attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM))
				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_UD_OVER_IB |
						     MLX4_RX_CSUM_VALID;
			break;
		case IBV_QPT_RAW_PACKET:
			if ((mqp->link_layer == IBV_LINK_LAYER_ETHERNET) &&
			    (device_attr.device_cap_flags & IBV_DEVICE_RAW_IP_CSUM))
				mqp->qp_cap_cache |= MLX4_CSUM_SUPPORT_RAW_OVER_ETH |
						     MLX4_RX_CSUM_VALID;
			break;
		default:
			break;
		}
	}

	if (qp->state == IBV_QPS_RESET &&
	    attr_mask & IBV_QP_STATE &&
	    attr->qp_state == IBV_QPS_INIT) {
		mlx4_qp_init_sq_ownership(to_mqp(qp));
	}

	ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof cmd);

	if (!ret &&
	    (attr_mask & IBV_QP_STATE) &&
	    attr->qp_state == IBV_QPS_RESET) {
		if (qp->recv_cq)
			mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
				      qp->srq ? to_msrq(qp->srq) : NULL);
		if (qp->send_cq && qp->send_cq != qp->recv_cq)
			mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);

		mlx4_init_qp_indices(to_mqp(qp));
		if (to_mqp(qp)->rq.wqe_cnt)
			*to_mqp(qp)->db = 0;
	}

	return ret;
}

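/*
 * Destroying a QP must hold both of its CQ locks.  To avoid deadlock
 * when two destroys race on the same pair of CQs, the CQ with the
 * lower cqn is always locked first and unlocked last.
 */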
static void mlx4_lock_cqs(struct ibv_qp *qp)
{
	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);

	if (!qp->send_cq || !qp->recv_cq) {
		if (qp->send_cq)
			pthread_spin_lock(&send_cq->lock);
		else if (qp->recv_cq)
			pthread_spin_lock(&recv_cq->lock);
	} else if (send_cq == recv_cq) {
		pthread_spin_lock(&send_cq->lock);
	} else if (send_cq->cqn < recv_cq->cqn) {
		pthread_spin_lock(&send_cq->lock);
		pthread_spin_lock(&recv_cq->lock);
	} else {
		pthread_spin_lock(&recv_cq->lock);
		pthread_spin_lock(&send_cq->lock);
	}
}

static void mlx4_unlock_cqs(struct ibv_qp *qp)
{
	struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
	struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);

	if (!qp->send_cq || !qp->recv_cq) {
		if (qp->send_cq)
			pthread_spin_unlock(&send_cq->lock);
		else if (qp->recv_cq)
			pthread_spin_unlock(&recv_cq->lock);
	} else if (send_cq == recv_cq) {
		pthread_spin_unlock(&send_cq->lock);
	} else if (send_cq->cqn < recv_cq->cqn) {
		pthread_spin_unlock(&recv_cq->lock);
		pthread_spin_unlock(&send_cq->lock);
	} else {
		pthread_spin_unlock(&send_cq->lock);
		pthread_spin_unlock(&recv_cq->lock);
	}
}

int mlx4_destroy_qp(struct ibv_qp *ibqp)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	int ret;

	pthread_mutex_lock(&to_mctx(ibqp->context)->qp_table_mutex);
	ret = ibv_cmd_destroy_qp(ibqp);
	if (ret) {
		pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
		return ret;
	}

	mlx4_lock_cqs(ibqp);

	if (ibqp->recv_cq)
		__mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
				ibqp->srq ? to_msrq(ibqp->srq) : NULL);
	if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
		__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);

	if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
		mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);

	mlx4_unlock_cqs(ibqp);
	pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);

	pthread_spin_destroy(&qp->rq.lock);
	pthread_spin_destroy(&qp->sq.lock);

	if (qp->rq.wqe_cnt) {
		mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
		free(qp->rq.wrid);
	}
	if (qp->sq.wqe_cnt)
		free(qp->sq.wrid);
	mlx4_free_buf(&qp->buf);
	free(qp);

	return 0;
}

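/* A link-local GID carries the fe80::/64 prefix in its top eight bytes. */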
static int link_local_gid(const union ibv_gid *gid)
{
	uint32_t *tmp = (uint32_t *)gid->raw;
	uint32_t hi = tmp[0];
	uint32_t lo = tmp[1];

	if (hi == htobe32(0xfe800000) && lo == 0)
		return 1;

	return 0;
}

static int is_multicast_gid(const union ibv_gid *gid)
{
	return gid->raw[0] == 0xff;
}

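/*
 * A VLAN id, when present, sits in bytes 11-12 of the GID; values of
 * 0x1000 and above are out of range and map to the "no VLAN" sentinel
 * 0xffff.
 */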
static uint16_t get_vlan_id(union ibv_gid *gid)
{
	uint16_t vid;

	vid = gid->raw[11] << 8 | gid->raw[12];
	return vid < 0x1000 ? vid : 0xffff;
}

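/*
 * Derive the destination MAC from the GRH for ports without IP-based
 * GIDs: a link-local unicast GID embeds an EUI-64, so the MAC is
 * recovered from GID bytes 8-10 and 13-15 (the "^= 2" flips the
 * universal/local bit back); a multicast GID maps onto the
 * 33:33:xx:xx:xx:xx Ethernet multicast range.
 */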
static int mlx4_resolve_grh_to_l2(struct ibv_pd *pd, struct mlx4_ah *ah,
				  struct ibv_ah_attr *attr)
{
	int err, i;
	uint16_t vid;
	union ibv_gid sgid;

	if (link_local_gid(&attr->grh.dgid)) {
		memcpy(ah->mac, &attr->grh.dgid.raw[8], 3);
		memcpy(ah->mac + 3, &attr->grh.dgid.raw[13], 3);
		ah->mac[0] ^= 2;

		vid = get_vlan_id(&attr->grh.dgid);
	} else if (is_multicast_gid(&attr->grh.dgid)) {
		ah->mac[0] = 0x33;
		ah->mac[1] = 0x33;
		for (i = 2; i < 6; ++i)
			ah->mac[i] = attr->grh.dgid.raw[i + 10];

		err = ibv_query_gid(pd->context, attr->port_num,
				    attr->grh.sgid_index, &sgid);
		if (err)
			return err;

		ah->av.dlid = htobe16(0xc000);
		ah->av.port_pd |= htobe32(1 << 31);

		vid = get_vlan_id(&sgid);
	} else
		return 1;

	if (vid != 0xffff) {
		ah->av.port_pd |= htobe32(1 << 29);
		ah->vlan = vid | ((attr->sl & 7) << 13);
	}

	return 0;
}

struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
	struct mlx4_ah *ah;
	struct ibv_port_attr port_attr;

	if (query_port_cache(pd->context, attr->port_num, &port_attr))
		return NULL;

	ah = malloc(sizeof *ah);
	if (!ah)
		return NULL;

	memset(&ah->av, 0, sizeof ah->av);

	ah->av.port_pd = htobe32(to_mpd(pd)->pdn | (attr->port_num << 24));

	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
		ah->av.g_slid = attr->src_path_bits;
		ah->av.dlid = htobe16(attr->dlid);
		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 28);
	} else
		ah->av.sl_tclass_flowlabel = htobe32(attr->sl << 29);

	if (attr->static_rate) {
		ah->av.stat_rate = attr->static_rate + MLX4_STAT_RATE_OFFSET;
		/* XXX check rate cap? */
	}
	if (attr->is_global) {
		ah->av.g_slid |= 0x80;
		ah->av.gid_index = attr->grh.sgid_index;
		ah->av.hop_limit = attr->grh.hop_limit;
		ah->av.sl_tclass_flowlabel |=
			htobe32((attr->grh.traffic_class << 20) |
				attr->grh.flow_label);
		memcpy(ah->av.dgid, attr->grh.dgid.raw, 16);
	}

	if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
		if (port_attr.port_cap_flags & IBV_PORT_IP_BASED_GIDS) {
			uint16_t vid;

			if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
							ah->mac, &vid)) {
				free(ah);
				return NULL;
			}

			if (vid <= 0xfff) {
				ah->av.port_pd |= htobe32(1 << 29);
				ah->vlan = vid |
					((attr->sl & 7) << 13);
			}
		} else {
			if (mlx4_resolve_grh_to_l2(pd, ah, attr)) {
				free(ah);
				return NULL;
			}
		}
	}

	return &ah->ibv_ah;
}

int mlx4_destroy_ah(struct ibv_ah *ah)
{
	free(to_mah(ah));

	return 0;
}