aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSean Hefty <sean.hefty@intel.com>2013-08-01 18:05:55 +0300
committerRoland Dreier <roland@purestorage.com>2014-01-29 13:46:12 -0800
commitdc6ef6957ed2a50b603ffe9354d030e5a871946f (patch)
tree3067e2d7239f1e6c680787d1f33924268bdffd1e
parentd00cfeb15ef4516539725b2f6c7d992e7201e13d (diff)
downloadlibmlx4-dc6ef6957ed2a50b603ffe9354d030e5a871946f.tar.gz
Add support for XRC QPs
Implement the XRC APIs for libmlx4. Hook the XRC functions in as part of mlx4_init_context(), then make the relevant changes on both the control path and the data path so that they work properly with XRC. The main changes are: using verbs_qp and verbs_srq, which are extensible and are used when calling the libibverbs command API (i.e. cmd.c); managing an XRC SRQ table that maps between srqn and mlx4_srq; and differentiating between IBV_QPT_XRC_SEND and IBV_QPT_XRC_RECV. Signed-off-by: Sean Hefty <sean.hefty@intel.com> Signed-off-by: Yishai Hadas <yishaih@mellanox.com> Signed-off-by: Roland Dreier <roland@purestorage.com>
-rw-r--r--src/buf.c6
-rw-r--r--src/cq.c42
-rw-r--r--src/mlx4-abi.h6
-rw-r--r--src/mlx4.c16
-rw-r--r--src/mlx4.h62
-rw-r--r--src/qp.c39
-rw-r--r--src/srq.c153
-rw-r--r--src/verbs.c224
8 files changed, 453 insertions, 95 deletions
diff --git a/src/buf.c b/src/buf.c
index be4df3b..c06b3fd 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -59,6 +59,8 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
void mlx4_free_buf(struct mlx4_buf *buf)
{
- ibv_dofork_range(buf->buf, buf->length);
- munmap(buf->buf, buf->length);
+ if (buf->length) {
+ ibv_dofork_range(buf->buf, buf->length);
+ munmap(buf->buf, buf->length);
+ }
}
diff --git a/src/cq.c b/src/cq.c
index bd9bf0f..8b27795 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -210,33 +210,43 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
rmb();
qpn = ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK;
+ wc->qp_num = qpn;
is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR;
- if (!*cur_qp ||
- (qpn != (*cur_qp)->ibv_qp.qp_num)) {
+ if ((qpn & MLX4_XRC_QPN_BIT) && !is_send) {
/*
- * We do not have to take the QP table lock here,
- * because CQs will be locked while QPs are removed
+ * We do not have to take the XSRQ table lock here,
+ * because CQs will be locked while SRQs are removed
* from the table.
*/
- *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
- if (!*cur_qp)
+ srq = mlx4_find_xsrq(&to_mctx(cq->ibv_cq.context)->xsrq_table,
+ ntohl(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK);
+ if (!srq)
return CQ_POLL_ERR;
+ } else {
+ if (!*cur_qp || (qpn != (*cur_qp)->verbs_qp.qp.qp_num)) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context), qpn);
+ if (!*cur_qp)
+ return CQ_POLL_ERR;
+ }
+ srq = ((*cur_qp)->verbs_qp.qp.srq) ? to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL;
}
- wc->qp_num = (*cur_qp)->ibv_qp.qp_num;
-
if (is_send) {
wq = &(*cur_qp)->sq;
wqe_index = ntohs(cqe->wqe_index);
wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
- } else if ((*cur_qp)->ibv_qp.srq) {
- srq = to_msrq((*cur_qp)->ibv_qp.srq);
+ } else if (srq) {
wqe_index = htons(cqe->wqe_index);
wc->wr_id = srq->wrid[wqe_index];
mlx4_free_srq_wqe(srq, wqe_index);
@@ -312,7 +322,10 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
wc->pkey_index = ntohl(cqe->immed_rss_invalid) & 0x7f;
- if ((*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
+ /* When working with xrc srqs, don't have qp to check link layer.
+ * Using IB SL, should consider Roce. (TBD)
+ */
+ if ((*cur_qp) && (*cur_qp)->link_layer == IBV_LINK_LAYER_ETHERNET)
wc->sl = ntohs(cqe->sl_vid) >> 13;
else
wc->sl = ntohs(cqe->sl_vid) >> 12;
@@ -403,7 +416,12 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
while ((int) --prod_index - (int) cq->cons_index >= 0) {
cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
cqe += cqe_inc;
- if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+ if (srq && srq->ext_srq &&
+ ntohl(cqe->g_mlpath_rqpn & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num &&
+ !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
+ mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
+ ++nfreed;
+ } else if ((ntohl(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
++nfreed;
diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h
index a1328af..b48f6fc 100644
--- a/src/mlx4-abi.h
+++ b/src/mlx4-abi.h
@@ -89,6 +89,12 @@ struct mlx4_create_srq {
__u64 db_addr;
};
+/*
+ * Command buffer passed to ibv_cmd_create_srq_ex() when creating an XRC SRQ:
+ * the generic libibverbs command followed by the mlx4-specific fields.
+ */
+struct mlx4_create_xsrq {
+ struct ibv_create_xsrq ibv_cmd;
+ /* address of the SRQ buffer (srq->buf.buf in mlx4_create_xrc_srq()) */
+ __u64 buf_addr;
+ /* address of the SRQ doorbell record (srq->db in mlx4_create_xrc_srq()) */
+ __u64 db_addr;
+};
+
struct mlx4_create_srq_resp {
struct ibv_create_srq_resp ibv_resp;
__u32 srqn;
diff --git a/src/mlx4.c b/src/mlx4.c
index da9f611..c6e4c5c 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -126,10 +126,7 @@ static int mlx4_init_context(struct verbs_device *v_device,
struct mlx4_alloc_ucontext_resp_v3 resp_v3;
__u16 bf_reg_size;
struct mlx4_device *dev = to_mdev(&v_device->device);
- /* verbs_context should be used for new verbs
- * struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
- */
-
+ struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
/* memory footprint of mlx4_context and verbs_context share
* struct ibv_context.
@@ -168,6 +165,7 @@ static int mlx4_init_context(struct verbs_device *v_device,
for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
context->db_list[i] = NULL;
+ mlx4_init_xsrq_table(&context->xsrq_table, context->num_qps);
pthread_mutex_init(&context->db_list_mutex, NULL);
context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
@@ -196,7 +194,15 @@ static int mlx4_init_context(struct verbs_device *v_device,
pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
ibv_ctx->ops = mlx4_ctx_ops;
- /* New verbs should be added by using verbs_set_ctx_op */
+
+ verbs_ctx->has_comp_mask = VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
+ VERBS_CONTEXT_QP;
+ verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
+ verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
+ verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
+ verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
+ verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
+ verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
return 0;
diff --git a/src/mlx4.h b/src/mlx4.h
index cfb0b50..d71450f 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -38,6 +38,7 @@
#include <infiniband/driver.h>
#include <infiniband/arch.h>
+#include <infiniband/verbs.h>
#ifdef HAVE_VALGRIND_MEMCHECK_H
@@ -93,6 +94,36 @@ enum {
MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
};
+/*
+ * Build the remote-SRQ-number flags for an XRC send work request: the
+ * request's qp_type.xrc.remote_srqn shifted left by 8 and passed through
+ * htonl().  mlx4_post_send() ORs the result into ctrl->srcrb_flags for
+ * IBV_QPT_XRC_SEND QPs.  The argument is parenthesized so that any pointer
+ * expression, not just a plain identifier, expands correctly.
+ */
+#define MLX4_REMOTE_SRQN_FLAGS(wr) htonl((wr)->qp_type.xrc.remote_srqn << 8)
+
+/*
+ * Geometry of the first level of the XRC SRQ lookup table: the xsrq_table[]
+ * array in struct mlx4_xsrq_table has MLX4_XSRQ_TABLE_SIZE
+ * (1 << MLX4_XSRQ_TABLE_BITS) slots.
+ */
+enum {
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+/*
+ * Two-level table mapping an SRQ number to its struct mlx4_srq.
+ * The first level is the fixed xsrq_table[] array; each slot points to a
+ * second-level array that mlx4_store_xsrq() allocates on demand and
+ * mlx4_clear_xsrq() frees when the slot's refcnt drops to zero.
+ */
+struct mlx4_xsrq_table {
+ struct {
+ /* second-level array of (mask + 1) SRQ pointers */
+ struct mlx4_srq **table;
+ /* incremented by each mlx4_store_xsrq(), decremented by mlx4_clear_xsrq() */
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+
+ /* taken by mlx4_store_xsrq() and mlx4_clear_xsrq(); mlx4_find_xsrq() does not take it */
+ pthread_mutex_t mutex;
+ /* the size passed to mlx4_init_xsrq_table(); (num_xsrq - 1) masks the SRQ number */
+ int num_xsrq;
+ /* right shift turning a masked SRQ number into a first-level index */
+ int shift;
+ /* (1 << shift) - 1: selects the index within a second-level array */
+ int mask;
+};
+
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+
+/*
+ * QPN bit tested by mlx4_poll_one(): a non-send completion whose QPN has
+ * this bit set is resolved through the XSRQ table rather than the QP table.
+ */
+enum {
+ MLX4_XRC_QPN_BIT = (1 << 23)
+};
+
enum mlx4_db_type {
MLX4_DB_TYPE_CQ,
MLX4_DB_TYPE_RQ,
@@ -157,6 +188,7 @@ struct mlx4_context {
struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
pthread_mutex_t db_list_mutex;
int cqe_size;
+ struct mlx4_xsrq_table xsrq_table;
};
struct mlx4_buf {
@@ -183,7 +215,7 @@ struct mlx4_cq {
};
struct mlx4_srq {
- struct ibv_srq ibv_srq;
+ struct verbs_srq verbs_srq;
struct mlx4_buf buf;
pthread_spinlock_t lock;
uint64_t *wrid;
@@ -195,6 +227,7 @@ struct mlx4_srq {
int tail;
uint32_t *db;
uint16_t counter;
+ uint8_t ext_srq;
};
struct mlx4_wq {
@@ -210,7 +243,7 @@ struct mlx4_wq {
};
struct mlx4_qp {
- struct ibv_qp ibv_qp;
+ struct verbs_qp verbs_qp;
struct mlx4_buf buf;
int max_inline_data;
int buf_size;
@@ -265,6 +298,7 @@ static inline unsigned long align(unsigned long val, unsigned long align)
{
return (val + align - 1) & ~(align - 1);
}
+int align_queue_size(int req);
#define to_mxxx(xxx, type) \
((struct mlx4_##type *) \
@@ -295,12 +329,14 @@ static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq)
{
- return to_mxxx(srq, srq);
+ return container_of(container_of(ibsrq, struct verbs_srq, srq),
+ struct mlx4_srq, verbs_srq);
}
static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
{
- return to_mxxx(qp, qp);
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
}
static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
@@ -321,6 +357,9 @@ int mlx4_query_port(struct ibv_context *context, uint8_t port,
struct ibv_pd *mlx4_alloc_pd(struct ibv_context *context);
int mlx4_free_pd(struct ibv_pd *pd);
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *attr);
+int mlx4_close_xrcd(struct ibv_xrcd *xrcd);
struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr,
size_t length, int access);
@@ -343,20 +382,33 @@ void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int new_cqe);
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
struct ibv_srq_init_attr *attr);
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *attr_ex);
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
int mask);
int mlx4_query_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr);
int mlx4_destroy_srq(struct ibv_srq *srq);
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq);
int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
struct mlx4_srq *srq);
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size);
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+ struct mlx4_srq *srq);
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn);
void mlx4_free_srq_wqe(struct mlx4_srq *srq, int ind);
int mlx4_post_srq_recv(struct ibv_srq *ibsrq,
struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr);
struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr);
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr);
int mlx4_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
int attr_mask,
struct ibv_qp_init_attr *init_attr);
@@ -371,7 +423,7 @@ int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr);
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
struct mlx4_qp *qp);
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
enum ibv_qp_type type, struct mlx4_qp *qp);
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
enum ibv_qp_type type);
diff --git a/src/qp.c b/src/qp.c
index 11c750b..721bed4 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -208,7 +208,7 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
ind = qp->sq.head;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
+ if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
ret = ENOMEM;
*bad_wr = wr;
goto out;
@@ -246,6 +246,9 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
size = sizeof *ctrl / 16;
switch (ibqp->qp_type) {
+ case IBV_QPT_XRC_SEND:
+ ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
+ /* fall through */
case IBV_QPT_RC:
case IBV_QPT_UC:
switch (wr->opcode) {
@@ -460,7 +463,7 @@ int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
+ if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
ret = ENOMEM;
*bad_wr = wr;
goto out;
@@ -554,6 +557,7 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
size += sizeof (struct mlx4_wqe_raddr_seg);
break;
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_RC:
size += sizeof (struct mlx4_wqe_raddr_seg);
/*
@@ -583,14 +587,16 @@ void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
; /* nothing */
}
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
+int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
enum ibv_qp_type type, struct mlx4_qp *qp)
{
qp->rq.max_gs = cap->max_recv_sge;
- qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
- if (!qp->sq.wrid)
- return -1;
+ if (qp->sq.wqe_cnt) {
+ qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
+ if (!qp->sq.wrid)
+ return -1;
+ }
if (qp->rq.wqe_cnt) {
qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
@@ -615,15 +621,19 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
qp->sq.offset = 0;
}
- if (mlx4_alloc_buf(&qp->buf,
- align(qp->buf_size, to_mdev(pd->context->device)->page_size),
- to_mdev(pd->context->device)->page_size)) {
- free(qp->sq.wrid);
- free(qp->rq.wrid);
- return -1;
- }
+ if (qp->buf_size) {
+ if (mlx4_alloc_buf(&qp->buf,
+ align(qp->buf_size, to_mdev(context->device)->page_size),
+ to_mdev(context->device)->page_size)) {
+ free(qp->sq.wrid);
+ free(qp->rq.wrid);
+ return -1;
+ }
- memset(qp->buf.buf, 0, qp->buf_size);
+ memset(qp->buf.buf, 0, qp->buf_size);
+ } else {
+ qp->buf.buf = NULL;
+ }
return 0;
}
@@ -639,6 +649,7 @@ void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
break;
+ case IBV_QPT_XRC_SEND:
case IBV_QPT_UC:
case IBV_QPT_RC:
wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
diff --git a/src/srq.c b/src/srq.c
index f1d1240..28bc2d4 100644
--- a/src/srq.c
+++ b/src/srq.c
@@ -42,6 +42,7 @@
#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
+#include "mlx4-abi.h"
static void *get_wqe(struct mlx4_srq *srq, int n)
{
@@ -173,3 +174,155 @@ int mlx4_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr,
return 0;
}
+
+/*
+ * Initialize an XRC SRQ lookup table.
+ *
+ * @xsrq_table: table to initialize; it is zeroed first, so every slot starts
+ *              with a NULL second-level array and a refcnt of 0.
+ * @size:       number of SRQ numbers to cover; (size - 1) is used as a mask
+ *              by the lookup functions, so it is expected to be a power of
+ *              two (the position of its lowest set bit determines the shift).
+ *
+ * The shift splits an SRQ number into a first-level index (upper bits) and a
+ * second-level index (lower bits).  When size is smaller than
+ * MLX4_XSRQ_TABLE_SIZE the computed shift would be negative, and shifting by
+ * a negative count is undefined behaviour, so it is clamped to 0: every
+ * second-level array then holds a single entry.
+ */
+void mlx4_init_xsrq_table(struct mlx4_xsrq_table *xsrq_table, int size)
+{
+	int shift = ffs(size) - 1 - MLX4_XSRQ_TABLE_BITS;
+
+	memset(xsrq_table, 0, sizeof *xsrq_table);
+	xsrq_table->num_xsrq = size;
+	xsrq_table->shift = shift > 0 ? shift : 0;
+	xsrq_table->mask = (1 << xsrq_table->shift) - 1;
+
+	pthread_mutex_init(&xsrq_table->mutex, NULL);
+}
+
+/*
+ * Look up the SRQ registered under srqn.
+ *
+ * Returns the stored struct mlx4_srq pointer, or NULL when the first-level
+ * slot covering srqn currently holds no entries (refcnt == 0).  The table
+ * mutex is not taken here.
+ */
+struct mlx4_srq *mlx4_find_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+	int slot = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+
+	if (!xsrq_table->xsrq_table[slot].refcnt)
+		return NULL;
+
+	return xsrq_table->xsrq_table[slot].table[srqn & xsrq_table->mask];
+}
+
+/*
+ * Register srq under srqn in the lookup table.
+ *
+ * The second-level array for the slot is allocated (zero-filled) the first
+ * time an entry is stored in it.  Runs under the table mutex.
+ *
+ * Returns 0 on success, or -1 if that allocation fails, in which case
+ * nothing is stored and the refcnt is left untouched.
+ */
+int mlx4_store_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn,
+		    struct mlx4_srq *srq)
+{
+	int slot = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+	int rc = 0;
+
+	pthread_mutex_lock(&xsrq_table->mutex);
+
+	if (!xsrq_table->xsrq_table[slot].refcnt) {
+		xsrq_table->xsrq_table[slot].table =
+			calloc(xsrq_table->mask + 1, sizeof(struct mlx4_srq *));
+		if (!xsrq_table->xsrq_table[slot].table)
+			rc = -1;
+	}
+
+	if (!rc) {
+		xsrq_table->xsrq_table[slot].refcnt++;
+		xsrq_table->xsrq_table[slot].table[srqn & xsrq_table->mask] = srq;
+	}
+
+	pthread_mutex_unlock(&xsrq_table->mutex);
+	return rc;
+}
+
+/*
+ * Remove the entry stored under srqn.
+ *
+ * Drops the slot's refcnt under the table mutex.  While other entries remain
+ * in the slot only this entry is reset to NULL; when the last entry goes
+ * away the whole second-level array is freed.  The slot's pointer is then
+ * reset to NULL as well, so no dangling pointer to freed memory is left in
+ * the table.
+ */
+void mlx4_clear_xsrq(struct mlx4_xsrq_table *xsrq_table, uint32_t srqn)
+{
+	int index;
+
+	index = (srqn & (xsrq_table->num_xsrq - 1)) >> xsrq_table->shift;
+	pthread_mutex_lock(&xsrq_table->mutex);
+
+	if (--xsrq_table->xsrq_table[index].refcnt) {
+		xsrq_table->xsrq_table[index].table[srqn & xsrq_table->mask] = NULL;
+	} else {
+		free(xsrq_table->xsrq_table[index].table);
+		xsrq_table->xsrq_table[index].table = NULL;
+	}
+
+	pthread_mutex_unlock(&xsrq_table->mutex);
+}
+
+/*
+ * Create an XRC shared receive queue.
+ *
+ * Allocates the mlx4_srq together with its buffer and doorbell record,
+ * issues the create command through ibv_cmd_create_srq_ex(), and registers
+ * the new SRQ in the context's XSRQ table under its srq_num.
+ *
+ * Returns the embedded ibv_srq on success.  Returns NULL if the requested
+ * size is over the limits checked below or if any step fails; everything
+ * acquired up to the failing step is released again.
+ */
+struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
+				    struct ibv_srq_init_attr_ex *attr_ex)
+{
+	struct mlx4_context *mctx = to_mctx(context);
+	struct mlx4_create_srq_resp reply;
+	struct mlx4_create_xsrq req;
+	struct mlx4_srq *xsrq;
+
+	/* Reject over-sized requests before allocating anything */
+	if (attr_ex->attr.max_sge > 64 || attr_ex->attr.max_wr > 1 << 16)
+		return NULL;
+
+	xsrq = calloc(1, sizeof *xsrq);
+	if (!xsrq)
+		return NULL;
+
+	if (pthread_spin_init(&xsrq->lock, PTHREAD_PROCESS_PRIVATE))
+		goto err;
+
+	/* ext_srq makes mlx4_destroy_srq() hand this SRQ to mlx4_destroy_xrc_srq() */
+	xsrq->ext_srq = 1;
+	xsrq->counter = 0;
+	xsrq->max_gs = attr_ex->attr.max_sge;
+	xsrq->max = align_queue_size(attr_ex->attr.max_wr + 1);
+
+	if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, xsrq))
+		goto err;
+
+	xsrq->db = mlx4_alloc_db(mctx, MLX4_DB_TYPE_RQ);
+	if (!xsrq->db)
+		goto err_free;
+	*xsrq->db = 0;
+
+	req.db_addr = (uintptr_t) xsrq->db;
+	req.buf_addr = (uintptr_t) xsrq->buf.buf;
+
+	if (ibv_cmd_create_srq_ex(context, &xsrq->verbs_srq,
+				  sizeof(xsrq->verbs_srq), attr_ex,
+				  &req.ibv_cmd, sizeof req,
+				  &reply.ibv_resp, sizeof reply))
+		goto err_db;
+
+	if (mlx4_store_xsrq(&mctx->xsrq_table, xsrq->verbs_srq.srq_num, xsrq))
+		goto err_destroy;
+
+	return &xsrq->verbs_srq.srq;
+
+err_destroy:
+	ibv_cmd_destroy_srq(&xsrq->verbs_srq.srq);
+err_db:
+	mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, xsrq->db);
+err_free:
+	free(xsrq->wrid);
+	mlx4_free_buf(&xsrq->buf);
+err:
+	free(xsrq);
+	return NULL;
+}
+
+/*
+ * Destroy an XRC SRQ.
+ *
+ * Cleans the SRQ's completions out of its CQ, removes the SRQ from the XSRQ
+ * table while holding the CQ lock, and then issues the destroy command.
+ * If the command fails the SRQ is put back into the table and the error is
+ * returned with the SRQ left intact; on success the doorbell record, buffer,
+ * wrid array and the SRQ itself are freed and 0 is returned.
+ */
+int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
+{
+	struct mlx4_srq *msrq = to_msrq(srq);
+	struct mlx4_context *mctx = to_mctx(srq->context);
+	struct mlx4_cq *cq = to_mcq(msrq->verbs_srq.cq);
+	int rc;
+
+	mlx4_cq_clean(cq, 0, msrq);
+
+	pthread_spin_lock(&cq->lock);
+	mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
+	pthread_spin_unlock(&cq->lock);
+
+	rc = ibv_cmd_destroy_srq(srq);
+	if (!rc) {
+		mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
+		mlx4_free_buf(&msrq->buf);
+		free(msrq->wrid);
+		free(msrq);
+		return 0;
+	}
+
+	/* The destroy command failed: make the SRQ findable again */
+	pthread_spin_lock(&cq->lock);
+	mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
+	pthread_spin_unlock(&cq->lock);
+
+	return rc;
+}
diff --git a/src/verbs.c b/src/verbs.c
index 0f52c61..623d576 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -107,6 +107,42 @@ int mlx4_free_pd(struct ibv_pd *pd)
return 0;
}
+/*
+ * Open an XRC domain.
+ *
+ * Allocates a zeroed verbs_xrcd and hands it to ibv_cmd_open_xrcd().
+ * Returns the embedded ibv_xrcd on success; returns NULL if the allocation
+ * or the command fails (the allocation is freed in the latter case).
+ */
+struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context,
+				struct ibv_xrcd_init_attr *attr)
+{
+	struct ibv_open_xrcd_resp reply;
+	struct ibv_open_xrcd req;
+	struct verbs_xrcd *vxrcd;
+
+	vxrcd = calloc(1, sizeof *vxrcd);
+	if (!vxrcd)
+		return NULL;
+
+	if (ibv_cmd_open_xrcd(context, vxrcd, sizeof(*vxrcd), attr,
+			      &req, sizeof req, &reply, sizeof reply)) {
+		free(vxrcd);
+		return NULL;
+	}
+
+	return &vxrcd->xrcd;
+}
+
+/*
+ * Close an XRC domain opened by mlx4_open_xrcd().
+ *
+ * Returns the result of ibv_cmd_close_xrcd(); the verbs_xrcd is freed only
+ * when that command succeeds (returns 0).
+ */
+int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd)
+{
+	struct verbs_xrcd *vxrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
+	int rc = ibv_cmd_close_xrcd(vxrcd);
+
+	if (rc)
+		return rc;
+
+	free(vxrcd);
+	return 0;
+}
+
struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
int access)
{
@@ -142,7 +178,7 @@ int mlx4_dereg_mr(struct ibv_mr *mr)
return 0;
}
-static int align_queue_size(int req)
+int align_queue_size(int req)
{
int nent;
@@ -282,7 +318,7 @@ int mlx4_destroy_cq(struct ibv_cq *cq)
}
struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
- struct ibv_srq_init_attr *attr)
+ struct ibv_srq_init_attr *attr)
{
struct mlx4_create_srq cmd;
struct mlx4_create_srq_resp resp;
@@ -303,6 +339,7 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
srq->max = align_queue_size(attr->attr.max_wr + 1);
srq->max_gs = attr->attr.max_sge;
srq->counter = 0;
+ srq->ext_srq = 0;
if (mlx4_alloc_srq_buf(pd, &attr->attr, srq))
goto err;
@@ -316,15 +353,13 @@ struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
cmd.buf_addr = (uintptr_t) srq->buf.buf;
cmd.db_addr = (uintptr_t) srq->db;
- ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr,
+ ret = ibv_cmd_create_srq(pd, &srq->verbs_srq.srq, attr,
&cmd.ibv_cmd, sizeof cmd,
&resp.ibv_resp, sizeof resp);
if (ret)
goto err_db;
- srq->srqn = resp.srqn;
-
- return &srq->ibv_srq;
+ return &srq->verbs_srq.srq;
err_db:
mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, srq->db);
@@ -339,6 +374,18 @@ err:
return NULL;
}
+/*
+ * Extended SRQ creation entry point: dispatch on the requested SRQ type.
+ *
+ * When the type is not specified in comp_mask, or is IBV_SRQT_BASIC, the
+ * request goes to mlx4_create_srq() with attr_ex reinterpreted as a plain
+ * ibv_srq_init_attr.  IBV_SRQT_XRC goes to mlx4_create_xrc_srq().
+ * Any other type yields NULL.
+ */
+struct ibv_srq *mlx4_create_srq_ex(struct ibv_context *context,
+				   struct ibv_srq_init_attr_ex *attr_ex)
+{
+	if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE))
+		return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+
+	switch (attr_ex->srq_type) {
+	case IBV_SRQT_BASIC:
+		return mlx4_create_srq(attr_ex->pd, (struct ibv_srq_init_attr *) attr_ex);
+	case IBV_SRQT_XRC:
+		return mlx4_create_xrc_srq(context, attr_ex);
+	default:
+		return NULL;
+	}
+}
+
int mlx4_modify_srq(struct ibv_srq *srq,
struct ibv_srq_attr *attr,
int attr_mask)
@@ -360,6 +407,9 @@ int mlx4_destroy_srq(struct ibv_srq *srq)
{
int ret;
+ if (to_msrq(srq)->ext_srq)
+ return mlx4_destroy_xrc_srq(srq);
+
ret = ibv_cmd_destroy_srq(srq);
if (ret)
return ret;
@@ -372,7 +422,8 @@ int mlx4_destroy_srq(struct ibv_srq *srq)
return 0;
}
-struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
+struct ibv_qp *mlx4_create_qp_ex(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *attr)
{
struct mlx4_create_qp cmd;
struct ibv_create_qp_resp resp;
@@ -387,30 +438,34 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
attr->cap.max_inline_data > 1024)
return NULL;
- qp = malloc(sizeof *qp);
+ qp = calloc(1, sizeof *qp);
if (!qp)
return NULL;
- mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
-
- /*
- * We need to leave 2 KB + 1 WQE of headroom in the SQ to
- * allow HW to prefetch.
- */
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
- qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
- qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
+ if (attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_send_wr = qp->sq.wqe_cnt = 0;
+ } else {
+ mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+ /*
+ * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+ qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+ }
- if (attr->srq)
- attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
- else {
+ if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
+ attr->qp_type == IBV_QPT_XRC_RECV) {
+ attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
+ } else {
+ qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
if (attr->cap.max_recv_sge < 1)
attr->cap.max_recv_sge = 1;
if (attr->cap.max_recv_wr < 1)
attr->cap.max_recv_wr = 1;
}
- if (mlx4_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp))
+ if (mlx4_alloc_qp_buf(context, &attr->cap, attr->qp_type, qp))
goto err;
mlx4_init_qp_indices(qp);
@@ -419,19 +474,18 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE))
goto err_free;
- if (!attr->srq) {
- qp->db = mlx4_alloc_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ);
+ if (attr->cap.max_recv_sge) {
+ qp->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
if (!qp->db)
goto err_free;
*qp->db = 0;
+ cmd.db_addr = (uintptr_t) qp->db;
+ } else {
+ cmd.db_addr = 0;
}
cmd.buf_addr = (uintptr_t) qp->buf.buf;
- if (attr->srq)
- cmd.db_addr = 0;
- else
- cmd.db_addr = (uintptr_t) qp->db;
cmd.log_sq_stride = qp->sq.wqe_shift;
for (cmd.log_sq_bb_count = 0;
qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
@@ -440,37 +494,41 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
cmd.sq_no_prefetch = 0; /* OK for ABI 2: just a reserved field */
memset(cmd.reserved, 0, sizeof cmd.reserved);
- pthread_mutex_lock(&to_mctx(pd->context)->qp_table_mutex);
+ pthread_mutex_lock(&to_mctx(context)->qp_table_mutex);
- ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd,
- &resp, sizeof resp);
+ ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp,
+ sizeof(qp->verbs_qp), attr,
+ &cmd.ibv_cmd, sizeof cmd, &resp, sizeof resp);
if (ret)
goto err_rq_db;
- ret = mlx4_store_qp(to_mctx(pd->context), qp->ibv_qp.qp_num, qp);
- if (ret)
- goto err_destroy;
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
+ ret = mlx4_store_qp(to_mctx(context), qp->verbs_qp.qp.qp_num, qp);
+ if (ret)
+ goto err_destroy;
+ }
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
qp->rq.max_gs = attr->cap.max_recv_sge;
- mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
+ if (attr->qp_type != IBV_QPT_XRC_RECV)
+ mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
- qp->doorbell_qpn = htonl(qp->ibv_qp.qp_num << 8);
+ qp->doorbell_qpn = htonl(qp->verbs_qp.qp.qp_num << 8);
if (attr->sq_sig_all)
qp->sq_signal_bits = htonl(MLX4_WQE_CTRL_CQ_UPDATE);
else
qp->sq_signal_bits = 0;
- return &qp->ibv_qp;
+ return &qp->verbs_qp.qp;
err_destroy:
- ibv_cmd_destroy_qp(&qp->ibv_qp);
+ ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
err_rq_db:
- pthread_mutex_unlock(&to_mctx(pd->context)->qp_table_mutex);
- if (!attr->srq)
- mlx4_free_db(to_mctx(pd->context), MLX4_DB_TYPE_RQ, qp->db);
+ pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
+ if (attr->cap.max_recv_sge)
+ mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, qp->db);
err_free:
free(qp->sq.wrid);
@@ -484,6 +542,43 @@ err:
return NULL;
}
+/*
+ * Legacy QP creation entry point, implemented on top of mlx4_create_qp_ex().
+ *
+ * The caller's attributes are copied into the leading part of an extended
+ * attribute structure, with the PD supplied through comp_mask/pd.  On
+ * success the (possibly adjusted) attributes are copied back to *attr;
+ * on failure *attr is left untouched and NULL is returned.
+ */
+struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
+{
+	struct ibv_qp_init_attr_ex ex;
+	struct ibv_qp *ibqp;
+
+	memcpy(&ex, attr, sizeof *attr);
+	ex.pd = pd;
+	ex.comp_mask = IBV_QP_INIT_ATTR_PD;
+
+	ibqp = mlx4_create_qp_ex(pd->context, &ex);
+	if (!ibqp)
+		return NULL;
+
+	memcpy(attr, &ex, sizeof *attr);
+	return ibqp;
+}
+
+/*
+ * Open an existing QP through ibv_cmd_open_qp().
+ *
+ * Allocates a zeroed mlx4_qp to hold the verbs_qp.  Returns the embedded
+ * ibv_qp on success; returns NULL if the allocation or the command fails
+ * (the allocation is freed in the latter case).
+ */
+struct ibv_qp *mlx4_open_qp(struct ibv_context *context, struct ibv_qp_open_attr *attr)
+{
+	struct ibv_create_qp_resp reply;
+	struct ibv_open_qp req;
+	struct mlx4_qp *mqp;
+
+	mqp = calloc(1, sizeof *mqp);
+	if (!mqp)
+		return NULL;
+
+	if (ibv_cmd_open_qp(context, &mqp->verbs_qp, sizeof(mqp->verbs_qp), attr,
+			    &req, sizeof req, &reply, sizeof reply)) {
+		free(mqp);
+		return NULL;
+	}
+
+	return &mqp->verbs_qp.qp;
+}
+
int mlx4_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
int attr_mask,
struct ibv_qp_init_attr *init_attr)
@@ -514,7 +609,7 @@ int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
int ret;
if (attr_mask & IBV_QP_PORT) {
- ret = ibv_query_port(qp->pd->context, attr->port_num,
+ ret = ibv_query_port(qp->context, attr->port_num,
&port_attr);
if (ret)
return ret;
@@ -532,13 +627,14 @@ int mlx4_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
if (!ret &&
(attr_mask & IBV_QP_STATE) &&
attr->qp_state == IBV_QPS_RESET) {
- mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
- qp->srq ? to_msrq(qp->srq) : NULL);
- if (qp->send_cq != qp->recv_cq)
+ if (qp->recv_cq)
+ mlx4_cq_clean(to_mcq(qp->recv_cq), qp->qp_num,
+ qp->srq ? to_msrq(qp->srq) : NULL);
+ if (qp->send_cq && qp->send_cq != qp->recv_cq)
mlx4_cq_clean(to_mcq(qp->send_cq), qp->qp_num, NULL);
mlx4_init_qp_indices(to_mqp(qp));
- if (!qp->srq)
+ if (to_mqp(qp)->rq.wqe_cnt)
*to_mqp(qp)->db = 0;
}
@@ -550,9 +646,14 @@ static void mlx4_lock_cqs(struct ibv_qp *qp)
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_lock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_lock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
pthread_spin_lock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
+ } else if (send_cq->cqn < recv_cq->cqn) {
pthread_spin_lock(&send_cq->lock);
pthread_spin_lock(&recv_cq->lock);
} else {
@@ -566,9 +667,15 @@ static void mlx4_unlock_cqs(struct ibv_qp *qp)
struct mlx4_cq *send_cq = to_mcq(qp->send_cq);
struct mlx4_cq *recv_cq = to_mcq(qp->recv_cq);
- if (send_cq == recv_cq)
+
+ if (!qp->send_cq || !qp->recv_cq) {
+ if (qp->send_cq)
+ pthread_spin_unlock(&send_cq->lock);
+ else if (qp->recv_cq)
+ pthread_spin_unlock(&recv_cq->lock);
+ } else if (send_cq == recv_cq) {
pthread_spin_unlock(&send_cq->lock);
- else if (send_cq->cqn < recv_cq->cqn) {
+ } else if (send_cq->cqn < recv_cq->cqn) {
pthread_spin_unlock(&recv_cq->lock);
pthread_spin_unlock(&send_cq->lock);
} else {
@@ -591,21 +698,24 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp)
mlx4_lock_cqs(ibqp);
- __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
- ibqp->srq ? to_msrq(ibqp->srq) : NULL);
- if (ibqp->send_cq != ibqp->recv_cq)
+ if (ibqp->recv_cq)
+ __mlx4_cq_clean(to_mcq(ibqp->recv_cq), ibqp->qp_num,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (ibqp->send_cq && ibqp->send_cq != ibqp->recv_cq)
__mlx4_cq_clean(to_mcq(ibqp->send_cq), ibqp->qp_num, NULL);
- mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
+ if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
+ mlx4_clear_qp(to_mctx(ibqp->context), ibqp->qp_num);
mlx4_unlock_cqs(ibqp);
pthread_mutex_unlock(&to_mctx(ibqp->context)->qp_table_mutex);
- if (!ibqp->srq)
+ if (qp->rq.wqe_cnt) {
mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
- free(qp->sq.wrid);
- if (qp->rq.wqe_cnt)
free(qp->rq.wrid);
+ }
+ if (qp->sq.wqe_cnt)
+ free(qp->sq.wrid);
mlx4_free_buf(&qp->buf);
free(qp);