linux/drivers/infiniband/hw/ipath/ipath_uc.c
Steve Wise 00f7ec36c9 RDMA/core: Add memory management extensions support
This patch adds support for the IB "base memory management extension"
(BMME) and the equivalent iWARP operations (which the iWARP verbs
mandates all devices must implement).  The new operations are:

 - Allocate an ib_mr for use in fast register work requests.

 - Allocate/free a physical buffer lists for use in fast register work
   requests.  This allows device drivers to allocate this memory as
   needed for use in posting send requests (eg via dma_alloc_coherent).

 - New send queue work requests:
   * send with remote invalidate
   * fast register memory region
   * local invalidate memory region
   * RDMA read with invalidate local memory region (iWARP only)

Consumer interface details:

 - A new device capability flag IB_DEVICE_MEM_MGT_EXTENSIONS is added
   to indicate device support for these features.

 - New send work request opcodes IB_WR_FAST_REG_MR, IB_WR_LOCAL_INV,
   IB_WR_RDMA_READ_WITH_INV are added.

 - A new consumer API function, ib_alloc_mr() is added to allocate
   fast register memory regions.

 - New consumer API functions, ib_alloc_fast_reg_page_list() and
   ib_free_fast_reg_page_list() are added to allocate and free
   device-specific memory for fast registration page lists.

 - A new consumer API function, ib_update_fast_reg_key(), is added to
   allow the key portion of the R_Key and L_Key of a fast registration
   MR to be updated.  Consumers call this if desired before posting
   a IB_WR_FAST_REG_MR work request.

Consumers can use this as follows:

 - MR is allocated with ib_alloc_mr().

 - Page list memory is allocated with ib_alloc_fast_reg_page_list().

 - MR R_Key/L_Key "key" field is updated with ib_update_fast_reg_key().

 - MR made VALID and bound to a specific page list via
   ib_post_send(IB_WR_FAST_REG_MR)

 - MR made INVALID via ib_post_send(IB_WR_LOCAL_INV),
   ib_post_send(IB_WR_RDMA_READ_WITH_INV) or an incoming send with
   invalidate operation.

 - MR is deallocated with ib_dereg_mr()

 - page lists dealloced via ib_free_fast_reg_page_list().

Applications can allocate a fast register MR once, and then can
repeatedly bind the MR to different physical block lists (PBLs) via
posting work requests to a send queue (SQ).  For each outstanding
MR-to-PBL binding in the SQ pipe, a fast_reg_page_list needs to be
allocated (the fast_reg_page_list is owned by the low-level driver
from the consumer posting a work request until the request completes).
Thus pipelining can be achieved while still allowing device-specific
page_list processing.

The 32-bit fast register memory key/STag is composed of a 24-bit index
and an 8-bit key.  The application can change the key each time it
fast registers thus allowing more control over the peer's use of the
key/STag (ie it can effectively be changed each time the rkey is
rebound to a page list).

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
2008-07-14 23:48:45 -07:00

548 lines
14 KiB
C

/*
* Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ipath_verbs.h"
#include "ipath_kernel.h"
/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_UC_##x
/**
* ipath_make_uc_req - construct a request packet (SEND, RDMA write)
* @qp: a pointer to the QP
*
* Return 1 if constructed; otherwise, return 0.
*/
int ipath_make_uc_req(struct ipath_qp *qp)
{
struct ipath_other_headers *ohdr;
struct ipath_swqe *wqe;
unsigned long flags;
u32 hwords;
u32 bth0;
u32 len;
u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
int ret = 0;
spin_lock_irqsave(&qp->s_lock, flags);
if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
goto bail;
/* We are in the error state, flush the work request. */
if (qp->s_last == qp->s_head)
goto bail;
/* If DMAs are in progress, we can't flush immediately. */
if (atomic_read(&qp->s_dma_busy)) {
qp->s_flags |= IPATH_S_WAIT_DMA;
goto bail;
}
wqe = get_swqe_ptr(qp, qp->s_last);
ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
goto done;
}
ohdr = &qp->s_hdr.u.oth;
if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
ohdr = &qp->s_hdr.u.l.oth;
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
bth0 = 1 << 22; /* Set M bit */
/* Get the next send request. */
wqe = get_swqe_ptr(qp, qp->s_cur);
qp->s_wqe = NULL;
switch (qp->s_state) {
default:
if (!(ib_ipath_state_ops[qp->state] &
IPATH_PROCESS_NEXT_SEND_OK))
goto bail;
/* Check if send work queue is empty. */
if (qp->s_cur == qp->s_head)
goto bail;
/*
* Start a new request.
*/
qp->s_psn = wqe->psn = qp->s_next_psn;
qp->s_sge.sge = wqe->sg_list[0];
qp->s_sge.sg_list = wqe->sg_list + 1;
qp->s_sge.num_sge = wqe->wr.num_sge;
qp->s_len = len = wqe->length;
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
if (len > pmtu) {
qp->s_state = OP(SEND_FIRST);
len = pmtu;
break;
}
if (wqe->wr.opcode == IB_WR_SEND)
qp->s_state = OP(SEND_ONLY);
else {
qp->s_state =
OP(SEND_ONLY_WITH_IMMEDIATE);
/* Immediate data comes after the BTH */
ohdr->u.imm_data = wqe->wr.ex.imm_data;
hwords += 1;
}
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
bth0 |= 1 << 23;
qp->s_wqe = wqe;
if (++qp->s_cur >= qp->s_size)
qp->s_cur = 0;
break;
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_WRITE_WITH_IMM:
ohdr->u.rc.reth.vaddr =
cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
ohdr->u.rc.reth.rkey =
cpu_to_be32(wqe->wr.wr.rdma.rkey);
ohdr->u.rc.reth.length = cpu_to_be32(len);
hwords += sizeof(struct ib_reth) / 4;
if (len > pmtu) {
qp->s_state = OP(RDMA_WRITE_FIRST);
len = pmtu;
break;
}
if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
qp->s_state = OP(RDMA_WRITE_ONLY);
else {
qp->s_state =
OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
/* Immediate data comes after the RETH */
ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
hwords += 1;
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
bth0 |= 1 << 23;
}
qp->s_wqe = wqe;
if (++qp->s_cur >= qp->s_size)
qp->s_cur = 0;
break;
default:
goto bail;
}
break;
case OP(SEND_FIRST):
qp->s_state = OP(SEND_MIDDLE);
/* FALLTHROUGH */
case OP(SEND_MIDDLE):
len = qp->s_len;
if (len > pmtu) {
len = pmtu;
break;
}
if (wqe->wr.opcode == IB_WR_SEND)
qp->s_state = OP(SEND_LAST);
else {
qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
/* Immediate data comes after the BTH */
ohdr->u.imm_data = wqe->wr.ex.imm_data;
hwords += 1;
}
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
bth0 |= 1 << 23;
qp->s_wqe = wqe;
if (++qp->s_cur >= qp->s_size)
qp->s_cur = 0;
break;
case OP(RDMA_WRITE_FIRST):
qp->s_state = OP(RDMA_WRITE_MIDDLE);
/* FALLTHROUGH */
case OP(RDMA_WRITE_MIDDLE):
len = qp->s_len;
if (len > pmtu) {
len = pmtu;
break;
}
if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
qp->s_state = OP(RDMA_WRITE_LAST);
else {
qp->s_state =
OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
/* Immediate data comes after the BTH */
ohdr->u.imm_data = wqe->wr.ex.imm_data;
hwords += 1;
if (wqe->wr.send_flags & IB_SEND_SOLICITED)
bth0 |= 1 << 23;
}
qp->s_wqe = wqe;
if (++qp->s_cur >= qp->s_size)
qp->s_cur = 0;
break;
}
qp->s_len -= len;
qp->s_hdrwords = hwords;
qp->s_cur_sge = &qp->s_sge;
qp->s_cur_size = len;
ipath_make_ruc_header(to_idev(qp->ibqp.device),
qp, ohdr, bth0 | (qp->s_state << 24),
qp->s_next_psn++ & IPATH_PSN_MASK);
done:
ret = 1;
goto unlock;
bail:
qp->s_flags &= ~IPATH_S_BUSY;
unlock:
spin_unlock_irqrestore(&qp->s_lock, flags);
return ret;
}
/**
* ipath_uc_rcv - handle an incoming UC packet
* @dev: the device the packet came in on
* @hdr: the header of the packet
* @has_grh: true if the packet has a GRH
* @data: the packet data
* @tlen: the length of the packet
* @qp: the QP for this packet.
*
* This is called from ipath_qp_rcv() to process an incoming UC packet
* for the given QP.
* Called at interrupt level.
*/
void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
struct ipath_other_headers *ohdr;
int opcode;
u32 hdrsize;
u32 psn;
u32 pad;
struct ib_wc wc;
u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
struct ib_reth *reth;
int header_in_data;
/* Validate the SLID. See Ch. 9.6.1.5 */
if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
goto done;
/* Check for GRH */
if (!has_grh) {
ohdr = &hdr->u.oth;
hdrsize = 8 + 12; /* LRH + BTH */
psn = be32_to_cpu(ohdr->bth[2]);
header_in_data = 0;
} else {
ohdr = &hdr->u.l.oth;
hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
/*
* The header with GRH is 60 bytes and the
* core driver sets the eager header buffer
* size to 56 bytes so the last 4 bytes of
* the BTH header (PSN) is in the data buffer.
*/
header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
if (header_in_data) {
psn = be32_to_cpu(((__be32 *) data)[0]);
data += sizeof(__be32);
} else
psn = be32_to_cpu(ohdr->bth[2]);
}
/*
* The opcode is in the low byte when its in network order
* (top byte when in host order).
*/
opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
memset(&wc, 0, sizeof wc);
/* Compare the PSN verses the expected PSN. */
if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
/*
* Handle a sequence error.
* Silently drop any current message.
*/
qp->r_psn = psn;
inv:
qp->r_state = OP(SEND_LAST);
switch (opcode) {
case OP(SEND_FIRST):
case OP(SEND_ONLY):
case OP(SEND_ONLY_WITH_IMMEDIATE):
goto send_first;
case OP(RDMA_WRITE_FIRST):
case OP(RDMA_WRITE_ONLY):
case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
goto rdma_first;
default:
dev->n_pkt_drops++;
goto done;
}
}
/* Check for opcode sequence errors. */
switch (qp->r_state) {
case OP(SEND_FIRST):
case OP(SEND_MIDDLE):
if (opcode == OP(SEND_MIDDLE) ||
opcode == OP(SEND_LAST) ||
opcode == OP(SEND_LAST_WITH_IMMEDIATE))
break;
goto inv;
case OP(RDMA_WRITE_FIRST):
case OP(RDMA_WRITE_MIDDLE):
if (opcode == OP(RDMA_WRITE_MIDDLE) ||
opcode == OP(RDMA_WRITE_LAST) ||
opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
break;
goto inv;
default:
if (opcode == OP(SEND_FIRST) ||
opcode == OP(SEND_ONLY) ||
opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
opcode == OP(RDMA_WRITE_FIRST) ||
opcode == OP(RDMA_WRITE_ONLY) ||
opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
break;
goto inv;
}
/* OK, process the packet. */
switch (opcode) {
case OP(SEND_FIRST):
case OP(SEND_ONLY):
case OP(SEND_ONLY_WITH_IMMEDIATE):
send_first:
if (qp->r_flags & IPATH_R_REUSE_SGE) {
qp->r_flags &= ~IPATH_R_REUSE_SGE;
qp->r_sge = qp->s_rdma_read_sge;
} else if (!ipath_get_rwqe(qp, 0)) {
dev->n_pkt_drops++;
goto done;
}
/* Save the WQE so we can reuse it in case of an error. */
qp->s_rdma_read_sge = qp->r_sge;
qp->r_rcv_len = 0;
if (opcode == OP(SEND_ONLY))
goto send_last;
else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
goto send_last_imm;
/* FALLTHROUGH */
case OP(SEND_MIDDLE):
/* Check for invalid length PMTU or posted rwqe len. */
if (unlikely(tlen != (hdrsize + pmtu + 4))) {
qp->r_flags |= IPATH_R_REUSE_SGE;
dev->n_pkt_drops++;
goto done;
}
qp->r_rcv_len += pmtu;
if (unlikely(qp->r_rcv_len > qp->r_len)) {
qp->r_flags |= IPATH_R_REUSE_SGE;
dev->n_pkt_drops++;
goto done;
}
ipath_copy_sge(&qp->r_sge, data, pmtu);
break;
case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
if (header_in_data) {
wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else {
/* Immediate data comes after BTH */
wc.ex.imm_data = ohdr->u.imm_data;
}
hdrsize += 4;
wc.wc_flags = IB_WC_WITH_IMM;
/* FALLTHROUGH */
case OP(SEND_LAST):
send_last:
/* Get the number of bytes the message was padded by. */
pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
/* Check for invalid length. */
/* XXX LAST len should be >= 1 */
if (unlikely(tlen < (hdrsize + pad + 4))) {
qp->r_flags |= IPATH_R_REUSE_SGE;
dev->n_pkt_drops++;
goto done;
}
/* Don't count the CRC. */
tlen -= (hdrsize + pad + 4);
wc.byte_len = tlen + qp->r_rcv_len;
if (unlikely(wc.byte_len > qp->r_len)) {
qp->r_flags |= IPATH_R_REUSE_SGE;
dev->n_pkt_drops++;
goto done;
}
wc.opcode = IB_WC_RECV;
last_imm:
ipath_copy_sge(&qp->r_sge, data, tlen);
wc.wr_id = qp->r_wr_id;
wc.status = IB_WC_SUCCESS;
wc.qp = &qp->ibqp;
wc.src_qp = qp->remote_qpn;
wc.slid = qp->remote_ah_attr.dlid;
wc.sl = qp->remote_ah_attr.sl;
/* Signal completion event if the solicited bit is set. */
ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
(ohdr->bth[0] &
__constant_cpu_to_be32(1 << 23)) != 0);
break;
case OP(RDMA_WRITE_FIRST):
case OP(RDMA_WRITE_ONLY):
case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
rdma_first:
/* RETH comes after BTH */
if (!header_in_data)
reth = &ohdr->u.rc.reth;
else {
reth = (struct ib_reth *)data;
data += sizeof(*reth);
}
hdrsize += sizeof(*reth);
qp->r_len = be32_to_cpu(reth->length);
qp->r_rcv_len = 0;
if (qp->r_len != 0) {
u32 rkey = be32_to_cpu(reth->rkey);
u64 vaddr = be64_to_cpu(reth->vaddr);
int ok;
/* Check rkey */
ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
vaddr, rkey,
IB_ACCESS_REMOTE_WRITE);
if (unlikely(!ok)) {
dev->n_pkt_drops++;
goto done;
}
} else {
qp->r_sge.sg_list = NULL;
qp->r_sge.sge.mr = NULL;
qp->r_sge.sge.vaddr = NULL;
qp->r_sge.sge.length = 0;
qp->r_sge.sge.sge_length = 0;
}
if (unlikely(!(qp->qp_access_flags &
IB_ACCESS_REMOTE_WRITE))) {
dev->n_pkt_drops++;
goto done;
}
if (opcode == OP(RDMA_WRITE_ONLY))
goto rdma_last;
else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
goto rdma_last_imm;
/* FALLTHROUGH */
case OP(RDMA_WRITE_MIDDLE):
/* Check for invalid length PMTU or posted rwqe len. */
if (unlikely(tlen != (hdrsize + pmtu + 4))) {
dev->n_pkt_drops++;
goto done;
}
qp->r_rcv_len += pmtu;
if (unlikely(qp->r_rcv_len > qp->r_len)) {
dev->n_pkt_drops++;
goto done;
}
ipath_copy_sge(&qp->r_sge, data, pmtu);
break;
case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
rdma_last_imm:
if (header_in_data) {
wc.ex.imm_data = *(__be32 *) data;
data += sizeof(__be32);
} else {
/* Immediate data comes after BTH */
wc.ex.imm_data = ohdr->u.imm_data;
}
hdrsize += 4;
wc.wc_flags = IB_WC_WITH_IMM;
/* Get the number of bytes the message was padded by. */
pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
/* Check for invalid length. */
/* XXX LAST len should be >= 1 */
if (unlikely(tlen < (hdrsize + pad + 4))) {
dev->n_pkt_drops++;
goto done;
}
/* Don't count the CRC. */
tlen -= (hdrsize + pad + 4);
if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
dev->n_pkt_drops++;
goto done;
}
if (qp->r_flags & IPATH_R_REUSE_SGE)
qp->r_flags &= ~IPATH_R_REUSE_SGE;
else if (!ipath_get_rwqe(qp, 1)) {
dev->n_pkt_drops++;
goto done;
}
wc.byte_len = qp->r_len;
wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
goto last_imm;
case OP(RDMA_WRITE_LAST):
rdma_last:
/* Get the number of bytes the message was padded by. */
pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
/* Check for invalid length. */
/* XXX LAST len should be >= 1 */
if (unlikely(tlen < (hdrsize + pad + 4))) {
dev->n_pkt_drops++;
goto done;
}
/* Don't count the CRC. */
tlen -= (hdrsize + pad + 4);
if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
dev->n_pkt_drops++;
goto done;
}
ipath_copy_sge(&qp->r_sge, data, tlen);
break;
default:
/* Drop packet for unknown opcodes. */
dev->n_pkt_drops++;
goto done;
}
qp->r_psn++;
qp->r_state = opcode;
done:
return;
}