| /* |
| * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. |
| * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved. |
| * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved. |
| * Copyright (c) 2005. PathScale, Inc. All rights reserved. |
| * |
| * This software is available to you under a choice of one of two |
| * licenses. You may choose to be licensed under the terms of the GNU |
| * General Public License (GPL) Version 2, available from the file |
| * COPYING in the main directory of this source tree, or the |
| * OpenIB.org BSD license below: |
| * |
| * Redistribution and use in source and binary forms, with or |
| * without modification, are permitted provided that the following |
| * conditions are met: |
| * |
| * - Redistributions of source code must retain the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer. |
| * |
| * - Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following |
| * disclaimer in the documentation and/or other materials |
| * provided with the distribution. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
| * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| #include <config.h> |
| |
| #include <endian.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <pthread.h> |
| #include <netinet/in.h> |
| #include <sys/mman.h> |
| #include <errno.h> |
| |
| #include <stddef.h> |
| |
| #include <infiniband/driver.h> |
| #include <infiniband/verbs.h> |
| |
| #include "rxe_queue.h" |
| #include "rxe-abi.h" |
| #include "rxe.h" |
| |
| static void rxe_free_context(struct ibv_context *ibctx); |
| |
| static const struct verbs_match_ent hca_table[] = { |
| VERBS_DRIVER_ID(RDMA_DRIVER_RXE), |
| VERBS_NAME_MATCH("rxe", NULL), |
| {}, |
| }; |
| |
| static int rxe_query_device(struct ibv_context *context, |
| const struct ibv_query_device_ex_input *input, |
| struct ibv_device_attr_ex *attr, size_t attr_size) |
| { |
| struct ib_uverbs_ex_query_device_resp resp; |
| size_t resp_size = sizeof(resp); |
| uint64_t raw_fw_ver; |
| unsigned int major, minor, sub_minor; |
| int ret; |
| |
| ret = ibv_cmd_query_device_any(context, input, attr, attr_size, &resp, |
| &resp_size); |
| if (ret) |
| return ret; |
| |
| raw_fw_ver = resp.base.fw_ver; |
| major = (raw_fw_ver >> 32) & 0xffff; |
| minor = (raw_fw_ver >> 16) & 0xffff; |
| sub_minor = raw_fw_ver & 0xffff; |
| |
| snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver), |
| "%u.%u.%u", major, minor, sub_minor); |
| |
| return 0; |
| } |
| |
| static int rxe_query_port(struct ibv_context *context, uint8_t port, |
| struct ibv_port_attr *attr) |
| { |
| struct ibv_query_port cmd; |
| |
| return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); |
| } |
| |
| static struct ibv_pd *rxe_alloc_pd(struct ibv_context *context) |
| { |
| struct ibv_alloc_pd cmd; |
| struct ib_uverbs_alloc_pd_resp resp; |
| struct ibv_pd *pd; |
| |
| pd = calloc(1, sizeof(*pd)); |
| if (!pd) |
| return NULL; |
| |
| if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof(cmd), |
| &resp, sizeof(resp))) { |
| free(pd); |
| return NULL; |
| } |
| |
| return pd; |
| } |
| |
| static int rxe_dealloc_pd(struct ibv_pd *pd) |
| { |
| int ret; |
| |
| ret = ibv_cmd_dealloc_pd(pd); |
| if (!ret) |
| free(pd); |
| |
| return ret; |
| } |
| |
| static struct ibv_mw *rxe_alloc_mw(struct ibv_pd *ibpd, enum ibv_mw_type type) |
| { |
| int ret; |
| struct ibv_mw *ibmw; |
| struct ibv_alloc_mw cmd = {}; |
| struct ib_uverbs_alloc_mw_resp resp = {}; |
| |
| ibmw = calloc(1, sizeof(*ibmw)); |
| if (!ibmw) |
| return NULL; |
| |
| ret = ibv_cmd_alloc_mw(ibpd, type, ibmw, &cmd, sizeof(cmd), &resp, |
| sizeof(resp)); |
| if (ret) { |
| free(ibmw); |
| return NULL; |
| } |
| |
| return ibmw; |
| } |
| |
| static int rxe_dealloc_mw(struct ibv_mw *ibmw) |
| { |
| int ret; |
| |
| ret = ibv_cmd_dealloc_mw(ibmw); |
| if (ret) |
| return ret; |
| |
| free(ibmw); |
| return 0; |
| } |
| |
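| /* |
| * Advance the 8-bit variable portion of an rkey, leaving the upper |
| * 24 index bits intact, so each bind hands out a fresh key. |
| */ |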
| static uint32_t next_rkey(uint32_t rkey) |
| { |
| return (rkey & 0xffffff00) | ((rkey + 1) & 0x000000ff); |
| } |
| |
| static int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list, |
| struct ibv_send_wr **bad_wr); |
| |
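| /* |
| * Implement ibv_bind_mw() by posting an IBV_WR_BIND_MW work request |
| * carrying the next rkey; the new rkey only becomes valid once the |
| * bind completes successfully on the send queue. |
| */ |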
| static int rxe_bind_mw(struct ibv_qp *ibqp, struct ibv_mw *ibmw, |
| struct ibv_mw_bind *mw_bind) |
| { |
| int ret; |
| struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; |
| struct ibv_send_wr ibwr; |
| struct ibv_send_wr *bad_wr; |
| |
| if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) { |
| ret = EINVAL; |
| goto err; |
| } |
| |
| memset(&ibwr, 0, sizeof(ibwr)); |
| |
| ibwr.opcode = IBV_WR_BIND_MW; |
| ibwr.next = NULL; |
| ibwr.wr_id = mw_bind->wr_id; |
| ibwr.send_flags = mw_bind->send_flags; |
| ibwr.bind_mw.bind_info = mw_bind->bind_info; |
| ibwr.bind_mw.mw = ibmw; |
| ibwr.bind_mw.rkey = next_rkey(ibmw->rkey); |
| |
| ret = rxe_post_send(ibqp, &ibwr, &bad_wr); |
| if (ret) |
| goto err; |
| |
| /* the caller must undo this if the bind fails with an error wc */ |
| ibmw->rkey = ibwr.bind_mw.rkey; |
| |
| return 0; |
| err: |
| errno = ret; |
| return errno; |
| } |
| |
| static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length, |
| uint64_t hca_va, int access) |
| { |
| struct verbs_mr *vmr; |
| struct ibv_reg_mr cmd; |
| struct ib_uverbs_reg_mr_resp resp; |
| int ret; |
| |
| vmr = calloc(1, sizeof(*vmr)); |
| if (!vmr) |
| return NULL; |
| |
| ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, |
| sizeof(cmd), &resp, sizeof(resp)); |
| if (ret) { |
| free(vmr); |
| return NULL; |
| } |
| |
| return &vmr->ibv_mr; |
| } |
| |
| static int rxe_dereg_mr(struct verbs_mr *vmr) |
| { |
| int ret; |
| |
| ret = ibv_cmd_dereg_mr(vmr); |
| if (ret) |
| return ret; |
| |
| free(vmr); |
| return 0; |
| } |
| |
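| /* |
| * Extended CQ polling: start_poll pins the consumer index and exposes |
| * the first completion, next_poll steps to the following one, and |
| * end_poll publishes the consumer index and drops the lock. A caller |
| * drives these through the standard verbs wrappers; an illustrative |
| * sketch (cq_ex comes from rxe_create_cq_ex below, process() is |
| * supplied by the caller): |
| * |
| * struct ibv_poll_cq_attr attr = {}; |
| * |
| * if (ibv_start_poll(cq_ex, &attr) == 0) { |
| * do { |
| * process(cq_ex->wr_id, cq_ex->status, |
| * ibv_wc_read_opcode(cq_ex)); |
| * } while (ibv_next_poll(cq_ex) == 0); |
| * ibv_end_poll(cq_ex); |
| * } |
| */ |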
| static int cq_start_poll(struct ibv_cq_ex *current, |
| struct ibv_poll_cq_attr *attr) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| pthread_spin_lock(&cq->lock); |
| |
| cq->cur_index = load_consumer_index(cq->queue); |
| |
| if (check_cq_queue_empty(cq)) { |
| pthread_spin_unlock(&cq->lock); |
| errno = ENOENT; |
| return errno; |
| } |
| |
| cq->wc = addr_from_index(cq->queue, cq->cur_index); |
| cq->vcq.cq_ex.status = cq->wc->status; |
| cq->vcq.cq_ex.wr_id = cq->wc->wr_id; |
| |
| return 0; |
| } |
| |
| static int cq_next_poll(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| advance_cq_cur_index(cq); |
| |
| if (check_cq_queue_empty(cq)) { |
| store_consumer_index(cq->queue, cq->cur_index); |
| errno = ENOENT; |
| return errno; |
| } |
| |
| cq->wc = addr_from_index(cq->queue, cq->cur_index); |
| cq->vcq.cq_ex.status = cq->wc->status; |
| cq->vcq.cq_ex.wr_id = cq->wc->wr_id; |
| |
| return 0; |
| } |
| |
| static void cq_end_poll(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| advance_cq_cur_index(cq); |
| store_consumer_index(cq->queue, cq->cur_index); |
| pthread_spin_unlock(&cq->lock); |
| } |
| |
| static enum ibv_wc_opcode cq_read_opcode(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->opcode; |
| } |
| |
| static uint32_t cq_read_vendor_err(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->vendor_err; |
| } |
| |
| static uint32_t cq_read_byte_len(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->byte_len; |
| } |
| |
| static __be32 cq_read_imm_data(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->ex.imm_data; |
| } |
| |
| static uint32_t cq_read_qp_num(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->qp_num; |
| } |
| |
| static uint32_t cq_read_src_qp(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->src_qp; |
| } |
| |
| static unsigned int cq_read_wc_flags(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->wc_flags; |
| } |
| |
| static uint32_t cq_read_slid(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->slid; |
| } |
| |
| static uint8_t cq_read_sl(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->sl; |
| } |
| |
| static uint8_t cq_read_dlid_path_bits(struct ibv_cq_ex *current) |
| { |
| struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); |
| |
| return cq->wc->dlid_path_bits; |
| } |
| |
| static int rxe_destroy_cq(struct ibv_cq *ibcq); |
| |
| static struct ibv_cq *rxe_create_cq(struct ibv_context *context, int cqe, |
| struct ibv_comp_channel *channel, |
| int comp_vector) |
| { |
| struct rxe_cq *cq; |
| struct urxe_create_cq_resp resp = {}; |
| int ret; |
| |
| cq = calloc(1, sizeof(*cq)); |
| if (!cq) |
| return NULL; |
| |
| ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, |
| &cq->vcq.cq, NULL, 0, |
| &resp.ibv_resp, sizeof(resp)); |
| if (ret) { |
| free(cq); |
| return NULL; |
| } |
| |
| cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, |
| context->cmd_fd, resp.mi.offset); |
| if ((void *)cq->queue == MAP_FAILED) { |
| ibv_cmd_destroy_cq(&cq->vcq.cq); |
| free(cq); |
| return NULL; |
| } |
| |
| cq->wc_size = 1ULL << cq->queue->log2_elem_size; |
| /* record mmap_info before any failure so destroy unmaps the queue */ |
| cq->mmap_info = resp.mi; |
| |
| if (cq->wc_size < sizeof(struct ib_uverbs_wc)) { |
| rxe_destroy_cq(&cq->vcq.cq); |
| return NULL; |
| } |
| |
| pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); |
| |
| return &cq->vcq.cq; |
| } |
| |
| enum rxe_sup_wc_flags { |
| RXE_SUP_WC_FLAGS = IBV_WC_EX_WITH_BYTE_LEN |
| | IBV_WC_EX_WITH_IMM |
| | IBV_WC_EX_WITH_QP_NUM |
| | IBV_WC_EX_WITH_SRC_QP |
| | IBV_WC_EX_WITH_SLID |
| | IBV_WC_EX_WITH_SL |
| | IBV_WC_EX_WITH_DLID_PATH_BITS, |
| RXE_SUP_WC_EX_FLAGS = RXE_SUP_WC_FLAGS, |
| /* add extended flags here */ |
| }; |
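| /* |
| * Callers select these per-completion fields at CQ creation time; an |
| * illustrative sketch (error handling omitted, ctx supplied by the |
| * caller): |
| * |
| * struct ibv_cq_init_attr_ex attr = { |
| * .cqe = 256, |
| * .wc_flags = IBV_WC_EX_WITH_BYTE_LEN | IBV_WC_EX_WITH_SRC_QP, |
| * }; |
| * struct ibv_cq_ex *cq_ex = ibv_create_cq_ex(ctx, &attr); |
| */ |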
| |
| static struct ibv_cq_ex *rxe_create_cq_ex(struct ibv_context *context, |
| struct ibv_cq_init_attr_ex *attr) |
| { |
| int ret; |
| struct rxe_cq *cq; |
| struct urxe_create_cq_ex_resp resp = {}; |
| |
| /* user is asking for flags we don't support */ |
| if (attr->wc_flags & ~RXE_SUP_WC_EX_FLAGS) { |
| errno = EOPNOTSUPP; |
| goto err; |
| } |
| |
| cq = calloc(1, sizeof(*cq)); |
| if (!cq) |
| goto err; |
| |
| ret = ibv_cmd_create_cq_ex(context, attr, &cq->vcq, |
| NULL, 0, |
| &resp.ibv_resp, sizeof(resp), 0); |
| if (ret) |
| goto err_free; |
| |
| cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, |
| context->cmd_fd, resp.mi.offset); |
| if ((void *)cq->queue == MAP_FAILED) |
| goto err_destroy; |
| |
| cq->wc_size = 1ULL << cq->queue->log2_elem_size; |
| /* set mmap_info before the size check so err_unmap sees it */ |
| cq->mmap_info = resp.mi; |
| |
| if (cq->wc_size < sizeof(struct ib_uverbs_wc)) |
| goto err_unmap; |
| |
| pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); |
| |
| cq->vcq.cq_ex.start_poll = cq_start_poll; |
| cq->vcq.cq_ex.next_poll = cq_next_poll; |
| cq->vcq.cq_ex.end_poll = cq_end_poll; |
| cq->vcq.cq_ex.read_opcode = cq_read_opcode; |
| cq->vcq.cq_ex.read_vendor_err = cq_read_vendor_err; |
| cq->vcq.cq_ex.read_wc_flags = cq_read_wc_flags; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) |
| cq->vcq.cq_ex.read_byte_len |
| = cq_read_byte_len; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_IMM) |
| cq->vcq.cq_ex.read_imm_data |
| = cq_read_imm_data; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) |
| cq->vcq.cq_ex.read_qp_num |
| = cq_read_qp_num; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) |
| cq->vcq.cq_ex.read_src_qp |
| = cq_read_src_qp; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_SLID) |
| cq->vcq.cq_ex.read_slid |
| = cq_read_slid; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_SL) |
| cq->vcq.cq_ex.read_sl |
| = cq_read_sl; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) |
| cq->vcq.cq_ex.read_dlid_path_bits |
| = cq_read_dlid_path_bits; |
| |
| return &cq->vcq.cq_ex; |
| |
| err_unmap: |
| if (cq->mmap_info.size) |
| munmap(cq->queue, cq->mmap_info.size); |
| err_destroy: |
| ibv_cmd_destroy_cq(&cq->vcq.cq); |
| err_free: |
| free(cq); |
| err: |
| return NULL; |
| } |
| |
| static int rxe_resize_cq(struct ibv_cq *ibcq, int cqe) |
| { |
| struct rxe_cq *cq = to_rcq(ibcq); |
| struct ibv_resize_cq cmd; |
| struct urxe_resize_cq_resp resp; |
| int ret; |
| |
| pthread_spin_lock(&cq->lock); |
| |
| ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof(cmd), |
| &resp.ibv_resp, sizeof(resp)); |
| if (ret) { |
| pthread_spin_unlock(&cq->lock); |
| return ret; |
| } |
| |
| munmap(cq->queue, cq->mmap_info.size); |
| |
| cq->queue = mmap(NULL, resp.mi.size, |
| PROT_READ | PROT_WRITE, MAP_SHARED, |
| ibcq->context->cmd_fd, resp.mi.offset); |
| |
| ret = errno; |
| pthread_spin_unlock(&cq->lock); |
| |
| if ((void *)cq->queue == MAP_FAILED) { |
| cq->queue = NULL; |
| cq->mmap_info.size = 0; |
| return ret; |
| } |
| |
| cq->mmap_info = resp.mi; |
| |
| return 0; |
| } |
| |
| static int rxe_destroy_cq(struct ibv_cq *ibcq) |
| { |
| struct rxe_cq *cq = to_rcq(ibcq); |
| int ret; |
| |
| ret = ibv_cmd_destroy_cq(ibcq); |
| if (ret) |
| return ret; |
| |
| if (cq->mmap_info.size) |
| munmap(cq->queue, cq->mmap_info.size); |
| free(cq); |
| |
| return 0; |
| } |
| |
| static int rxe_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) |
| { |
| struct rxe_cq *cq = to_rcq(ibcq); |
| struct rxe_queue_buf *q; |
| int npolled; |
| uint8_t *src; |
| |
| pthread_spin_lock(&cq->lock); |
| q = cq->queue; |
| |
| for (npolled = 0; npolled < ne; ++npolled, ++wc) { |
| if (queue_empty(q)) |
| break; |
| |
| src = consumer_addr(q); |
| memcpy(wc, src, sizeof(*wc)); |
| advance_consumer(q); |
| } |
| |
| pthread_spin_unlock(&cq->lock); |
| return npolled; |
| } |
| |
| static struct ibv_srq *rxe_create_srq(struct ibv_pd *pd, |
| struct ibv_srq_init_attr *attr) |
| { |
| struct rxe_srq *srq; |
| struct ibv_create_srq cmd; |
| struct urxe_create_srq_resp resp; |
| int ret; |
| |
| srq = calloc(1, sizeof(*srq)); |
| if (srq == NULL) |
| return NULL; |
| |
| ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, attr, &cmd, sizeof(cmd), |
| &resp.ibv_resp, sizeof(resp)); |
| if (ret) { |
| free(srq); |
| return NULL; |
| } |
| |
| srq->rq.queue = mmap(NULL, resp.mi.size, |
| PROT_READ | PROT_WRITE, MAP_SHARED, |
| pd->context->cmd_fd, resp.mi.offset); |
| if ((void *)srq->rq.queue == MAP_FAILED) { |
| ibv_cmd_destroy_srq(&srq->ibv_srq); |
| free(srq); |
| return NULL; |
| } |
| |
| srq->mmap_info = resp.mi; |
| srq->rq.max_sge = attr->attr.max_sge; |
| pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); |
| |
| return &srq->ibv_srq; |
| } |
| |
| static int rxe_modify_srq(struct ibv_srq *ibsrq, |
| struct ibv_srq_attr *attr, int attr_mask) |
| { |
| struct rxe_srq *srq = to_rsrq(ibsrq); |
| struct urxe_modify_srq cmd; |
| int rc = 0; |
| struct mminfo mi; |
| |
| mi.offset = 0; |
| mi.size = 0; |
| |
| if (attr_mask & IBV_SRQ_MAX_WR) |
| pthread_spin_lock(&srq->rq.lock); |
| |
| cmd.mmap_info_addr = (__u64)(uintptr_t) &mi; |
| rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, |
| &cmd.ibv_cmd, sizeof(cmd)); |
| if (rc) |
| goto out; |
| |
| if (attr_mask & IBV_SRQ_MAX_WR) { |
| munmap(srq->rq.queue, srq->mmap_info.size); |
| srq->rq.queue = mmap(NULL, mi.size, |
| PROT_READ | PROT_WRITE, MAP_SHARED, |
| ibsrq->context->cmd_fd, mi.offset); |
| |
| if ((void *)srq->rq.queue == MAP_FAILED) { |
| rc = errno; |
| srq->rq.queue = NULL; |
| srq->mmap_info.size = 0; |
| goto out; |
| } |
| |
| srq->mmap_info = mi; |
| } |
| |
| out: |
| if (attr_mask & IBV_SRQ_MAX_WR) |
| pthread_spin_unlock(&srq->rq.lock); |
| return rc; |
| } |
| |
| static int rxe_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) |
| { |
| struct ibv_query_srq cmd; |
| |
| return ibv_cmd_query_srq(srq, attr, &cmd, sizeof(cmd)); |
| } |
| |
| static int rxe_destroy_srq(struct ibv_srq *ibvsrq) |
| { |
| int ret; |
| struct rxe_srq *srq = to_rsrq(ibvsrq); |
| struct rxe_queue_buf *q = srq->rq.queue; |
| |
| ret = ibv_cmd_destroy_srq(ibvsrq); |
| if (!ret) { |
| if (srq->mmap_info.size) |
| munmap(q, srq->mmap_info.size); |
| free(srq); |
| } |
| |
| return ret; |
| } |
| |
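| /* |
| * Copy one receive WR into the shared receive ring; the caller must |
| * hold the queue lock. |
| */ |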
| static int rxe_post_one_recv(struct rxe_wq *rq, struct ibv_recv_wr *recv_wr) |
| { |
| int i; |
| struct rxe_recv_wqe *wqe; |
| struct rxe_queue_buf *q = rq->queue; |
| int length = 0; |
| int rc = 0; |
| |
| if (queue_full(q)) { |
| rc = ENOMEM; |
| goto out; |
| } |
| |
| if (recv_wr->num_sge > rq->max_sge) { |
| rc = EINVAL; |
| goto out; |
| } |
| |
| wqe = (struct rxe_recv_wqe *)producer_addr(q); |
| |
| wqe->wr_id = recv_wr->wr_id; |
| wqe->num_sge = recv_wr->num_sge; |
| |
| memcpy(wqe->dma.sge, recv_wr->sg_list, |
| wqe->num_sge*sizeof(*wqe->dma.sge)); |
| |
| for (i = 0; i < wqe->num_sge; i++) |
| length += wqe->dma.sge[i].length; |
| |
| wqe->dma.length = length; |
| wqe->dma.resid = length; |
| wqe->dma.cur_sge = 0; |
| wqe->dma.num_sge = wqe->num_sge; |
| wqe->dma.sge_offset = 0; |
| |
| advance_producer(q); |
| |
| out: |
| return rc; |
| } |
| |
| static int rxe_post_srq_recv(struct ibv_srq *ibvsrq, |
| struct ibv_recv_wr *recv_wr, |
| struct ibv_recv_wr **bad_recv_wr) |
| { |
| struct rxe_srq *srq = to_rsrq(ibvsrq); |
| int rc = 0; |
| |
| pthread_spin_lock(&srq->rq.lock); |
| |
| while (recv_wr) { |
| rc = rxe_post_one_recv(&srq->rq, recv_wr); |
| if (rc) { |
| *bad_recv_wr = recv_wr; |
| break; |
| } |
| |
| recv_wr = recv_wr->next; |
| } |
| |
| pthread_spin_unlock(&srq->rq.lock); |
| |
| return rc; |
| } |
| |
| /* |
| * builders always consume one send queue slot; |
| * setters (below) reach back and adjust the previous build |
| */ |
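| /* |
| * An illustrative sketch using the standard verbs wrappers (qpx, |
| * rkey, remote_addr, lkey, buf and len are supplied by the caller): |
| * |
| * ibv_wr_start(qpx); |
| * qpx->wr_id = 1; |
| * qpx->wr_flags = IBV_SEND_SIGNALED; |
| * ibv_wr_rdma_write(qpx, rkey, remote_addr); (builder) |
| * ibv_wr_set_sge(qpx, lkey, (uintptr_t)buf, len); (setter) |
| * ret = ibv_wr_complete(qpx); |
| */ |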
| static void wr_atomic_cmp_swp(struct ibv_qp_ex *ibqp, uint32_t rkey, |
| uint64_t remote_addr, uint64_t compare, |
| uint64_t swap) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = ibqp->wr_id; |
| wqe->wr.send_flags = ibqp->wr_flags; |
| wqe->wr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; |
| |
| wqe->wr.wr.atomic.remote_addr = remote_addr; |
| wqe->wr.wr.atomic.compare_add = compare; |
| wqe->wr.wr.atomic.swap = swap; |
| wqe->wr.wr.atomic.rkey = rkey; |
| wqe->iova = remote_addr; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_atomic_fetch_add(struct ibv_qp_ex *ibqp, uint32_t rkey, |
| uint64_t remote_addr, uint64_t add) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.wr.atomic.remote_addr = remote_addr; |
| wqe->wr.wr.atomic.compare_add = add; |
| wqe->wr.wr.atomic.rkey = rkey; |
| wqe->iova = remote_addr; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_bind_mw(struct ibv_qp_ex *ibqp, struct ibv_mw *ibmw, |
| uint32_t rkey, const struct ibv_mw_bind_info *info) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = ibqp->wr_id; |
| wqe->wr.opcode = IBV_WR_BIND_MW; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.wr.mw.addr = info->addr; |
| wqe->wr.wr.mw.length = info->length; |
| wqe->wr.wr.mw.mr_lkey = info->mr->lkey; |
| wqe->wr.wr.mw.mw_rkey = ibmw->rkey; |
| wqe->wr.wr.mw.rkey = rkey; |
| wqe->wr.wr.mw.access = info->mw_access_flags; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_local_inv(struct ibv_qp_ex *ibqp, uint32_t invalidate_rkey) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_LOCAL_INV; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.ex.invalidate_rkey = invalidate_rkey; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_rdma_read(struct ibv_qp_ex *ibqp, uint32_t rkey, |
| uint64_t remote_addr) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_RDMA_READ; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.wr.rdma.remote_addr = remote_addr; |
| wqe->wr.wr.rdma.rkey = rkey; |
| wqe->iova = remote_addr; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_rdma_write(struct ibv_qp_ex *ibqp, uint32_t rkey, |
| uint64_t remote_addr) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_RDMA_WRITE; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.wr.rdma.remote_addr = remote_addr; |
| wqe->wr.wr.rdma.rkey = rkey; |
| wqe->iova = remote_addr; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_rdma_write_imm(struct ibv_qp_ex *ibqp, uint32_t rkey, |
| uint64_t remote_addr, __be32 imm_data) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.wr.rdma.remote_addr = remote_addr; |
| wqe->wr.wr.rdma.rkey = rkey; |
| wqe->wr.ex.imm_data = imm_data; |
| wqe->iova = remote_addr; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_send(struct ibv_qp_ex *ibqp) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_SEND; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_send_imm(struct ibv_qp_ex *ibqp, __be32 imm_data) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_SEND_WITH_IMM; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.ex.imm_data = imm_data; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_send_inv(struct ibv_qp_ex *ibqp, uint32_t invalidate_rkey) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); |
| |
| if (check_qp_queue_full(qp)) |
| return; |
| |
| memset(wqe, 0, sizeof(*wqe)); |
| |
| wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; |
| wqe->wr.opcode = IBV_WR_SEND_WITH_INV; |
| wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; |
| wqe->wr.ex.invalidate_rkey = invalidate_rkey; |
| wqe->ssn = qp->ssn++; |
| |
| advance_qp_cur_index(qp); |
| } |
| |
| static void wr_set_ud_addr(struct ibv_qp_ex *ibqp, struct ibv_ah *ibah, |
| uint32_t remote_qpn, uint32_t remote_qkey) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_ah *ah = to_rah(ibah); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, |
| qp->cur_index - 1); |
| |
| if (qp->err) |
| return; |
| |
| wqe->wr.wr.ud.remote_qpn = remote_qpn; |
| wqe->wr.wr.ud.remote_qkey = remote_qkey; |
| wqe->wr.wr.ud.ah_num = ah->ah_num; |
| |
| if (!ah->ah_num) |
| /* old kernels only */ |
| memcpy(&wqe->wr.wr.ud.av, &ah->av, sizeof(ah->av)); |
| } |
| |
| static void wr_set_inline_data(struct ibv_qp_ex *ibqp, void *addr, |
| size_t length) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, |
| qp->cur_index - 1); |
| |
| if (qp->err) |
| return; |
| |
| if (length > qp->sq.max_inline) { |
| qp->err = ENOSPC; |
| return; |
| } |
| |
| memcpy(wqe->dma.inline_data, addr, length); |
| wqe->dma.length = length; |
| wqe->dma.resid = length; |
| } |
| |
| static void wr_set_inline_data_list(struct ibv_qp_ex *ibqp, size_t num_buf, |
| const struct ibv_data_buf *buf_list) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, |
| qp->cur_index - 1); |
| uint8_t *data = wqe->dma.inline_data; |
| size_t length; |
| size_t tot_length = 0; |
| |
| if (qp->err) |
| return; |
| |
| while (num_buf--) { |
| length = buf_list->length; |
| |
| if (tot_length + length > qp->sq.max_inline) { |
| qp->err = ENOSPC; |
| return; |
| } |
| |
| memcpy(data, buf_list->addr, length); |
| |
| buf_list++; |
| data += length; |
| tot_length += length; |
| } |
| |
| wqe->dma.length = tot_length; |
| wqe->dma.resid = tot_length; |
| } |
| |
| static void wr_set_sge(struct ibv_qp_ex *ibqp, uint32_t lkey, uint64_t addr, |
| uint32_t length) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, |
| qp->cur_index - 1); |
| |
| if (qp->err) |
| return; |
| |
| if (length) { |
| wqe->dma.length = length; |
| wqe->dma.resid = length; |
| wqe->dma.num_sge = 1; |
| |
| wqe->dma.sge[0].addr = addr; |
| wqe->dma.sge[0].length = length; |
| wqe->dma.sge[0].lkey = lkey; |
| } |
| } |
| |
| static void wr_set_sge_list(struct ibv_qp_ex *ibqp, size_t num_sge, |
| const struct ibv_sge *sg_list) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, |
| qp->cur_index - 1); |
| size_t tot_length = 0; |
| |
| if (qp->err) |
| return; |
| |
| if (num_sge > qp->sq.max_sge) { |
| qp->err = ENOSPC; |
| return; |
| } |
| |
| wqe->dma.num_sge = num_sge; |
| memcpy(wqe->dma.sge, sg_list, num_sge*sizeof(*sg_list)); |
| |
| while (num_sge--) |
| tot_length += (sg_list++)->length; |
| |
| wqe->dma.length = tot_length; |
| wqe->dma.resid = tot_length; |
| } |
| |
| static void wr_start(struct ibv_qp_ex *ibqp) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| |
| pthread_spin_lock(&qp->sq.lock); |
| |
| qp->err = 0; |
| qp->cur_index = load_producer_index(qp->sq.queue); |
| } |
| |
| static int post_send_db(struct ibv_qp *ibqp); |
| |
| static int wr_complete(struct ibv_qp_ex *ibqp) |
| { |
| int ret; |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| |
| if (qp->err) { |
| pthread_spin_unlock(&qp->sq.lock); |
| return qp->err; |
| } |
| |
| store_producer_index(qp->sq.queue, qp->cur_index); |
| ret = post_send_db(&qp->vqp.qp); |
| |
| pthread_spin_unlock(&qp->sq.lock); |
| return ret; |
| } |
| |
| static void wr_abort(struct ibv_qp_ex *ibqp) |
| { |
| struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); |
| |
| pthread_spin_unlock(&qp->sq.lock); |
| } |
| |
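| /* |
| * Map the kernel-allocated send ring, and the receive ring unless an |
| * SRQ is attached, into user space at the offsets returned in the |
| * create response. |
| */ |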
| static int map_queue_pair(int cmd_fd, struct rxe_qp *qp, |
| struct ibv_qp_init_attr *attr, |
| struct rxe_create_qp_resp *resp) |
| { |
| if (attr->srq) { |
| qp->rq.max_sge = 0; |
| qp->rq.queue = NULL; |
| qp->rq_mmap_info.size = 0; |
| } else { |
| qp->rq.max_sge = attr->cap.max_recv_sge; |
| qp->rq.queue = mmap(NULL, resp->rq_mi.size, PROT_READ | PROT_WRITE, |
| MAP_SHARED, |
| cmd_fd, resp->rq_mi.offset); |
| if ((void *)qp->rq.queue == MAP_FAILED) |
| return errno; |
| |
| qp->rq_mmap_info = resp->rq_mi; |
| pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE); |
| } |
| |
| qp->sq.max_sge = attr->cap.max_send_sge; |
| qp->sq.max_inline = attr->cap.max_inline_data; |
| qp->sq.queue = mmap(NULL, resp->sq_mi.size, PROT_READ | PROT_WRITE, |
| MAP_SHARED, |
| cmd_fd, resp->sq_mi.offset); |
| if ((void *)qp->sq.queue == MAP_FAILED) { |
| if (qp->rq_mmap_info.size) |
| munmap(qp->rq.queue, qp->rq_mmap_info.size); |
| return errno; |
| } |
| |
| qp->sq_mmap_info = resp->sq_mi; |
| pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE); |
| |
| return 0; |
| } |
| |
| static struct ibv_qp *rxe_create_qp(struct ibv_pd *ibpd, |
| struct ibv_qp_init_attr *attr) |
| { |
| struct ibv_create_qp cmd = {}; |
| struct urxe_create_qp_resp resp = {}; |
| struct rxe_qp *qp; |
| int ret; |
| |
| qp = calloc(1, sizeof(*qp)); |
| if (!qp) |
| goto err; |
| |
| ret = ibv_cmd_create_qp(ibpd, &qp->vqp.qp, attr, &cmd, sizeof(cmd), |
| &resp.ibv_resp, sizeof(resp)); |
| if (ret) |
| goto err_free; |
| |
| ret = map_queue_pair(ibpd->context->cmd_fd, qp, attr, |
| &resp.drv_payload); |
| if (ret) |
| goto err_destroy; |
| |
| return &qp->vqp.qp; |
| |
| err_destroy: |
| ibv_cmd_destroy_qp(&qp->vqp.qp); |
| err_free: |
| free(qp); |
| err: |
| return NULL; |
| } |
| |
| enum { |
| RXE_QP_CREATE_FLAGS_SUP = 0, |
| |
| RXE_QP_COMP_MASK_SUP = IBV_QP_INIT_ATTR_PD | |
| IBV_QP_INIT_ATTR_CREATE_FLAGS | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS, |
| |
| RXE_SUP_RC_QP_SEND_OPS_FLAGS = |
| IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | |
| IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM | |
| IBV_QP_EX_WITH_RDMA_READ | IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP | |
| IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD | IBV_QP_EX_WITH_LOCAL_INV | |
| IBV_QP_EX_WITH_BIND_MW | IBV_QP_EX_WITH_SEND_WITH_INV, |
| |
| RXE_SUP_UC_QP_SEND_OPS_FLAGS = |
| IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | |
| IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM | |
| IBV_QP_EX_WITH_BIND_MW | IBV_QP_EX_WITH_SEND_WITH_INV, |
| |
| RXE_SUP_UD_QP_SEND_OPS_FLAGS = |
| IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM, |
| }; |
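| /* |
| * An RC QP using the extended send interface would be created with, |
| * e.g. (an illustrative sketch; pd and ctx supplied by the caller): |
| * |
| * struct ibv_qp_init_attr_ex attr = { |
| * .qp_type = IBV_QPT_RC, |
| * .comp_mask = IBV_QP_INIT_ATTR_PD | |
| * IBV_QP_INIT_ATTR_SEND_OPS_FLAGS, |
| * .pd = pd, |
| * .send_ops_flags = IBV_QP_EX_WITH_RDMA_WRITE | |
| * IBV_QP_EX_WITH_SEND, |
| * .cap = { .max_send_wr = 64, .max_send_sge = 1, |
| * .max_recv_wr = 64, .max_recv_sge = 1 }, |
| * }; |
| * struct ibv_qp *qp = ibv_create_qp_ex(ctx, &attr); |
| */ |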
| |
| static int check_qp_init_attr(struct ibv_qp_init_attr_ex *attr) |
| { |
| if (attr->comp_mask & ~RXE_QP_COMP_MASK_SUP) |
| goto err; |
| |
| if ((attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) && |
| (attr->create_flags & ~RXE_QP_CREATE_FLAGS_SUP)) |
| goto err; |
| |
| if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { |
| switch (attr->qp_type) { |
| case IBV_QPT_RC: |
| if (attr->send_ops_flags & ~RXE_SUP_RC_QP_SEND_OPS_FLAGS) |
| goto err; |
| break; |
| case IBV_QPT_UC: |
| if (attr->send_ops_flags & ~RXE_SUP_UC_QP_SEND_OPS_FLAGS) |
| goto err; |
| break; |
| case IBV_QPT_UD: |
| if (attr->send_ops_flags & ~RXE_SUP_UD_QP_SEND_OPS_FLAGS) |
| goto err; |
| break; |
| default: |
| goto err; |
| } |
| } |
| |
| return 0; |
| err: |
| errno = EOPNOTSUPP; |
| return errno; |
| } |
| |
| static void set_qp_send_ops(struct rxe_qp *qp, uint64_t flags) |
| { |
| if (flags & IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) |
| qp->vqp.qp_ex.wr_atomic_cmp_swp = wr_atomic_cmp_swp; |
| |
| if (flags & IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) |
| qp->vqp.qp_ex.wr_atomic_fetch_add = wr_atomic_fetch_add; |
| |
| if (flags & IBV_QP_EX_WITH_BIND_MW) |
| qp->vqp.qp_ex.wr_bind_mw = wr_bind_mw; |
| |
| if (flags & IBV_QP_EX_WITH_LOCAL_INV) |
| qp->vqp.qp_ex.wr_local_inv = wr_local_inv; |
| |
| if (flags & IBV_QP_EX_WITH_RDMA_READ) |
| qp->vqp.qp_ex.wr_rdma_read = wr_rdma_read; |
| |
| if (flags & IBV_QP_EX_WITH_RDMA_WRITE) |
| qp->vqp.qp_ex.wr_rdma_write = wr_rdma_write; |
| |
| if (flags & IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) |
| qp->vqp.qp_ex.wr_rdma_write_imm = wr_rdma_write_imm; |
| |
| if (flags & IBV_QP_EX_WITH_SEND) |
| qp->vqp.qp_ex.wr_send = wr_send; |
| |
| if (flags & IBV_QP_EX_WITH_SEND_WITH_IMM) |
| qp->vqp.qp_ex.wr_send_imm = wr_send_imm; |
| |
| if (flags & IBV_QP_EX_WITH_SEND_WITH_INV) |
| qp->vqp.qp_ex.wr_send_inv = wr_send_inv; |
| |
| qp->vqp.qp_ex.wr_set_ud_addr = wr_set_ud_addr; |
| qp->vqp.qp_ex.wr_set_inline_data = wr_set_inline_data; |
| qp->vqp.qp_ex.wr_set_inline_data_list = wr_set_inline_data_list; |
| qp->vqp.qp_ex.wr_set_sge = wr_set_sge; |
| qp->vqp.qp_ex.wr_set_sge_list = wr_set_sge_list; |
| |
| qp->vqp.qp_ex.wr_start = wr_start; |
| qp->vqp.qp_ex.wr_complete = wr_complete; |
| qp->vqp.qp_ex.wr_abort = wr_abort; |
| } |
| |
| static struct ibv_qp *rxe_create_qp_ex(struct ibv_context *context, |
| struct ibv_qp_init_attr_ex *attr) |
| { |
| int ret; |
| struct rxe_qp *qp; |
| struct ibv_create_qp_ex cmd = {}; |
| struct urxe_create_qp_ex_resp resp = {}; |
| size_t cmd_size = sizeof(cmd); |
| size_t resp_size = sizeof(resp); |
| |
| ret = check_qp_init_attr(attr); |
| if (ret) |
| goto err; |
| |
| qp = calloc(1, sizeof(*qp)); |
| if (!qp) |
| goto err; |
| |
| if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) |
| set_qp_send_ops(qp, attr->send_ops_flags); |
| |
| ret = ibv_cmd_create_qp_ex2(context, &qp->vqp, attr, |
| &cmd, cmd_size, |
| &resp.ibv_resp, resp_size); |
| if (ret) |
| goto err_free; |
| |
| qp->vqp.comp_mask |= VERBS_QP_EX; |
| |
| ret = map_queue_pair(context->cmd_fd, qp, |
| (struct ibv_qp_init_attr *)attr, |
| &resp.drv_payload); |
| if (ret) |
| goto err_destroy; |
| |
| return &qp->vqp.qp; |
| |
| err_destroy: |
| ibv_cmd_destroy_qp(&qp->vqp.qp); |
| err_free: |
| free(qp); |
| err: |
| return NULL; |
| } |
| |
| static int rxe_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, |
| int attr_mask, struct ibv_qp_init_attr *init_attr) |
| { |
| struct ibv_query_qp cmd = {}; |
| |
| return ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, |
| &cmd, sizeof(cmd)); |
| } |
| |
| static int rxe_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, |
| int attr_mask) |
| { |
| struct ibv_modify_qp cmd = {}; |
| |
| return ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); |
| } |
| |
| static int rxe_destroy_qp(struct ibv_qp *ibqp) |
| { |
| int ret; |
| struct rxe_qp *qp = to_rqp(ibqp); |
| |
| ret = ibv_cmd_destroy_qp(ibqp); |
| if (!ret) { |
| if (qp->rq_mmap_info.size) |
| munmap(qp->rq.queue, qp->rq_mmap_info.size); |
| if (qp->sq_mmap_info.size) |
| munmap(qp->sq.queue, qp->sq_mmap_info.size); |
| |
| free(qp); |
| } |
| |
| return ret; |
| } |
| |
| /* basic sanity checks for send work request */ |
| static int validate_send_wr(struct rxe_qp *qp, struct ibv_send_wr *ibwr, |
| unsigned int length) |
| { |
| struct rxe_wq *sq = &qp->sq; |
| enum ibv_wr_opcode opcode = ibwr->opcode; |
| |
| if (ibwr->num_sge > sq->max_sge) |
| return -EINVAL; |
| |
| if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) |
| || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) |
| if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7) |
| return -EINVAL; |
| |
| if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline)) |
| return -EINVAL; |
| |
| if (ibwr->opcode == IBV_WR_BIND_MW) { |
| if (length) |
| return -EINVAL; |
| if (ibwr->num_sge) |
| return -EINVAL; |
| if (ibwr->imm_data) |
| return -EINVAL; |
| if ((qp_type(qp) != IBV_QPT_RC) && (qp_type(qp) != IBV_QPT_UC)) |
| return -EINVAL; |
| } |
| |
| return 0; |
| } |
| |
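| /* translate a user-space ibv_send_wr into the kernel ABI rxe_send_wr */ |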
| static void convert_send_wr(struct rxe_qp *qp, struct rxe_send_wr *kwr, |
| struct ibv_send_wr *uwr) |
| { |
| struct ibv_mw *ibmw; |
| struct ibv_mr *ibmr; |
| |
| memset(kwr, 0, sizeof(*kwr)); |
| |
| kwr->wr_id = uwr->wr_id; |
| kwr->num_sge = uwr->num_sge; |
| kwr->opcode = uwr->opcode; |
| kwr->send_flags = uwr->send_flags; |
| kwr->ex.imm_data = uwr->imm_data; |
| |
| switch (uwr->opcode) { |
| case IBV_WR_RDMA_WRITE: |
| case IBV_WR_RDMA_WRITE_WITH_IMM: |
| case IBV_WR_RDMA_READ: |
| kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr; |
| kwr->wr.rdma.rkey = uwr->wr.rdma.rkey; |
| break; |
| |
| case IBV_WR_SEND: |
| case IBV_WR_SEND_WITH_IMM: |
| if (qp_type(qp) == IBV_QPT_UD) { |
| struct rxe_ah *ah = to_rah(uwr->wr.ud.ah); |
| |
| kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn; |
| kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey; |
| kwr->wr.ud.ah_num = ah->ah_num; |
| } |
| break; |
| |
| case IBV_WR_ATOMIC_CMP_AND_SWP: |
| case IBV_WR_ATOMIC_FETCH_AND_ADD: |
| kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr; |
| kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add; |
| kwr->wr.atomic.swap = uwr->wr.atomic.swap; |
| kwr->wr.atomic.rkey = uwr->wr.atomic.rkey; |
| break; |
| |
| case IBV_WR_BIND_MW: |
| ibmr = uwr->bind_mw.bind_info.mr; |
| ibmw = uwr->bind_mw.mw; |
| |
| kwr->wr.mw.addr = uwr->bind_mw.bind_info.addr; |
| kwr->wr.mw.length = uwr->bind_mw.bind_info.length; |
| kwr->wr.mw.mr_lkey = ibmr->lkey; |
| kwr->wr.mw.mw_rkey = ibmw->rkey; |
| kwr->wr.mw.rkey = uwr->bind_mw.rkey; |
| kwr->wr.mw.access = uwr->bind_mw.bind_info.mw_access_flags; |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq, |
| struct ibv_send_wr *ibwr, unsigned int length, |
| struct rxe_send_wqe *wqe) |
| { |
| int num_sge = ibwr->num_sge; |
| int i; |
| unsigned int opcode = ibwr->opcode; |
| |
| convert_send_wr(qp, &wqe->wr, ibwr); |
| |
| if (qp_type(qp) == IBV_QPT_UD) { |
| struct rxe_ah *ah = to_rah(ibwr->wr.ud.ah); |
| |
| if (!ah->ah_num) |
| /* old kernels only */ |
| memcpy(&wqe->wr.wr.ud.av, &ah->av, sizeof(struct rxe_av)); |
| } |
| |
| if (ibwr->send_flags & IBV_SEND_INLINE) { |
| uint8_t *inline_data = wqe->dma.inline_data; |
| |
| for (i = 0; i < num_sge; i++) { |
| memcpy(inline_data, |
| (uint8_t *)(uintptr_t)ibwr->sg_list[i].addr, |
| ibwr->sg_list[i].length); |
| inline_data += ibwr->sg_list[i].length; |
| } |
| } else |
| memcpy(wqe->dma.sge, ibwr->sg_list, |
| num_sge*sizeof(struct ibv_sge)); |
| |
| if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) |
| || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) |
| wqe->iova = ibwr->wr.atomic.remote_addr; |
| else |
| wqe->iova = ibwr->wr.rdma.remote_addr; |
| |
| wqe->dma.length = length; |
| wqe->dma.resid = length; |
| wqe->dma.num_sge = num_sge; |
| wqe->dma.cur_sge = 0; |
| wqe->dma.sge_offset = 0; |
| wqe->state = 0; |
| wqe->ssn = qp->ssn++; |
| |
| return 0; |
| } |
| |
| static int post_one_send(struct rxe_qp *qp, struct rxe_wq *sq, |
| struct ibv_send_wr *ibwr) |
| { |
| int err; |
| struct rxe_send_wqe *wqe; |
| unsigned int length = 0; |
| int i; |
| |
| for (i = 0; i < ibwr->num_sge; i++) |
| length += ibwr->sg_list[i].length; |
| |
| err = validate_send_wr(qp, ibwr, length); |
| if (err) { |
| verbs_err(verbs_get_ctx(qp->vqp.qp.context), |
| "validate send failed\n"); |
| return err; |
| } |
| |
| /* check for space before writing into the producer slot */ |
| if (queue_full(sq->queue)) |
| return -ENOMEM; |
| |
| wqe = (struct rxe_send_wqe *)producer_addr(sq->queue); |
| |
| err = init_send_wqe(qp, sq, ibwr, length, wqe); |
| if (err) |
| return err; |
| |
| advance_producer(sq->queue); |
| |
| return 0; |
| } |
| |
| /* post an empty post_send command as a doorbell to kick the kernel */ |
| static int post_send_db(struct ibv_qp *ibqp) |
| { |
| struct ibv_post_send cmd; |
| struct ib_uverbs_post_send_resp resp; |
| |
| cmd.hdr.command = IB_USER_VERBS_CMD_POST_SEND; |
| cmd.hdr.in_words = sizeof(cmd) / 4; |
| cmd.hdr.out_words = sizeof(resp) / 4; |
| cmd.response = (uintptr_t)&resp; |
| cmd.qp_handle = ibqp->handle; |
| cmd.wr_count = 0; |
| cmd.sge_count = 0; |
| cmd.wqe_size = sizeof(struct ibv_send_wr); |
| |
| if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) |
| return errno; |
| |
| return 0; |
| } |
| |
| /* this API does not make a distinction between |
| * restartable and non-restartable errors |
| */ |
| static int rxe_post_send(struct ibv_qp *ibqp, |
| struct ibv_send_wr *wr_list, |
| struct ibv_send_wr **bad_wr) |
| { |
| int rc = 0; |
| int err; |
| struct rxe_qp *qp = to_rqp(ibqp); |
| struct rxe_wq *sq = &qp->sq; |
| |
| if (!bad_wr) |
| return EINVAL; |
| |
| *bad_wr = NULL; |
| |
| if (!sq || !wr_list || !sq->queue) |
| return EINVAL; |
| |
| pthread_spin_lock(&sq->lock); |
| |
| while (wr_list) { |
| rc = post_one_send(qp, sq, wr_list); |
| if (rc) { |
| *bad_wr = wr_list; |
| break; |
| } |
| |
| wr_list = wr_list->next; |
| } |
| |
| pthread_spin_unlock(&sq->lock); |
| |
| err = post_send_db(ibqp); |
| return err ? err : rc; |
| } |
| |
| static int rxe_post_recv(struct ibv_qp *ibqp, |
| struct ibv_recv_wr *recv_wr, |
| struct ibv_recv_wr **bad_wr) |
| { |
| int rc = 0; |
| struct rxe_qp *qp = to_rqp(ibqp); |
| struct rxe_wq *rq = &qp->rq; |
| |
| if (!bad_wr) |
| return EINVAL; |
| |
| *bad_wr = NULL; |
| |
| if (!rq || !recv_wr || !rq->queue) |
| return EINVAL; |
| |
| /* see C10-97.2.1 */ |
| if (ibqp->state == IBV_QPS_RESET) |
| return EINVAL; |
| |
| pthread_spin_lock(&rq->lock); |
| |
| while (recv_wr) { |
| rc = rxe_post_one_recv(rq, recv_wr); |
| if (rc) { |
| *bad_wr = recv_wr; |
| break; |
| } |
| |
| recv_wr = recv_wr->next; |
| } |
| |
| pthread_spin_unlock(&rq->lock); |
| |
| return rc; |
| } |
| |
| static inline int ipv6_addr_v4mapped(const struct in6_addr *a) |
| { |
| return IN6_IS_ADDR_V4MAPPED(a); |
| } |
| |
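| /* the sockaddr union embedded in struct rxe_av's address fields */ |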
| typedef typeof(((struct rxe_av *)0)->sgid_addr) sockaddr_union_t; |
| |
| static inline int rdma_gid2ip(sockaddr_union_t *out, union ibv_gid *gid) |
| { |
| if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { |
| memset(&out->_sockaddr_in, 0, sizeof(out->_sockaddr_in)); |
| out->_sockaddr_in.sin_family = AF_INET; |
| memcpy(&out->_sockaddr_in.sin_addr.s_addr, gid->raw + 12, 4); |
| } else { |
| memset(&out->_sockaddr_in6, 0, sizeof(out->_sockaddr_in6)); |
| out->_sockaddr_in6.sin6_family = AF_INET6; |
| memcpy(&out->_sockaddr_in6.sin6_addr.s6_addr, gid->raw, 16); |
| } |
| return 0; |
| } |
| |
| static int rxe_create_av(struct rxe_ah *ah, struct ibv_pd *pd, |
| struct ibv_ah_attr *attr) |
| { |
| struct rxe_av *av = &ah->av; |
| union ibv_gid sgid; |
| int ret; |
| |
| ret = ibv_query_gid(pd->context, attr->port_num, |
| attr->grh.sgid_index, &sgid); |
| if (ret) |
| return ret; |
| |
| av->port_num = attr->port_num; |
| memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); |
| |
| ret = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw); |
| av->network_type = ret ? RXE_NETWORK_TYPE_IPV4 : |
| RXE_NETWORK_TYPE_IPV6; |
| |
| rdma_gid2ip(&av->sgid_addr, &sgid); |
| rdma_gid2ip(&av->dgid_addr, &attr->grh.dgid); |
| |
| ret = ibv_resolve_eth_l2_from_gid(pd->context, attr, |
| av->dmac, NULL); |
| |
| return ret; |
| } |
| |
| /* |
| * Newer kernels return a non-zero AH index in resp.ah_num, which can |
| * be passed in UD send WQEs. |
| * Older kernels leave ah_num == 0; for these, create an AV and embed |
| * it in the UD send WQE instead. |
| */ |
| static struct ibv_ah *rxe_create_ah(struct ibv_pd *pd, |
| struct ibv_ah_attr *attr) |
| { |
| struct rxe_ah *ah; |
| struct urxe_create_ah_resp resp = {}; |
| int ret; |
| |
| ah = calloc(1, sizeof(*ah)); |
| if (!ah) |
| return NULL; |
| |
| ret = ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, |
| &resp.ibv_resp, sizeof(resp)); |
| if (ret) |
| goto err_free; |
| |
| ah->ah_num = resp.ah_num; |
| |
| if (!ah->ah_num) { |
| /* old kernels only */ |
| ret = rxe_create_av(ah, pd, attr); |
| if (ret) |
| goto err_free; |
| } |
| |
| return &ah->ibv_ah; |
| |
| err_free: |
| free(ah); |
| return NULL; |
| } |
| |
| static int rxe_destroy_ah(struct ibv_ah *ibah) |
| { |
| struct rxe_ah *ah = to_rah(ibah); |
| int ret; |
| |
| ret = ibv_cmd_destroy_ah(&ah->ibv_ah); |
| if (!ret) |
| free(ah); |
| |
| return ret; |
| } |
| |
| static const struct verbs_context_ops rxe_ctx_ops = { |
| .query_device_ex = rxe_query_device, |
| .query_port = rxe_query_port, |
| .alloc_pd = rxe_alloc_pd, |
| .dealloc_pd = rxe_dealloc_pd, |
| .reg_mr = rxe_reg_mr, |
| .dereg_mr = rxe_dereg_mr, |
| .alloc_mw = rxe_alloc_mw, |
| .dealloc_mw = rxe_dealloc_mw, |
| .bind_mw = rxe_bind_mw, |
| .create_cq = rxe_create_cq, |
| .create_cq_ex = rxe_create_cq_ex, |
| .poll_cq = rxe_poll_cq, |
| .req_notify_cq = ibv_cmd_req_notify_cq, |
| .resize_cq = rxe_resize_cq, |
| .destroy_cq = rxe_destroy_cq, |
| .create_srq = rxe_create_srq, |
| .modify_srq = rxe_modify_srq, |
| .query_srq = rxe_query_srq, |
| .destroy_srq = rxe_destroy_srq, |
| .post_srq_recv = rxe_post_srq_recv, |
| .create_qp = rxe_create_qp, |
| .create_qp_ex = rxe_create_qp_ex, |
| .query_qp = rxe_query_qp, |
| .modify_qp = rxe_modify_qp, |
| .destroy_qp = rxe_destroy_qp, |
| .post_send = rxe_post_send, |
| .post_recv = rxe_post_recv, |
| .create_ah = rxe_create_ah, |
| .destroy_ah = rxe_destroy_ah, |
| .attach_mcast = ibv_cmd_attach_mcast, |
| .detach_mcast = ibv_cmd_detach_mcast, |
| .free_context = rxe_free_context, |
| }; |
| |
| static struct verbs_context *rxe_alloc_context(struct ibv_device *ibdev, |
| int cmd_fd, |
| void *private_data) |
| { |
| struct rxe_context *context; |
| struct ibv_get_context cmd; |
| struct ib_uverbs_get_context_resp resp; |
| |
| context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, |
| RDMA_DRIVER_RXE); |
| if (!context) |
| return NULL; |
| |
| if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof(cmd), |
| &resp, sizeof(resp))) |
| goto out; |
| |
| verbs_set_ops(&context->ibv_ctx, &rxe_ctx_ops); |
| |
| return &context->ibv_ctx; |
| |
| out: |
| verbs_uninit_context(&context->ibv_ctx); |
| free(context); |
| return NULL; |
| } |
| |
| static void rxe_free_context(struct ibv_context *ibctx) |
| { |
| struct rxe_context *context = to_rctx(ibctx); |
| |
| verbs_uninit_context(&context->ibv_ctx); |
| free(context); |
| } |
| |
| static void rxe_uninit_device(struct verbs_device *verbs_device) |
| { |
| struct rxe_device *dev = to_rdev(&verbs_device->device); |
| |
| free(dev); |
| } |
| |
| static struct verbs_device *rxe_device_alloc(struct verbs_sysfs_dev *sysfs_dev) |
| { |
| struct rxe_device *dev; |
| |
| dev = calloc(1, sizeof(*dev)); |
| if (!dev) |
| return NULL; |
| |
| dev->abi_version = sysfs_dev->abi_ver; |
| |
| return &dev->ibv_dev; |
| } |
| |
| static const struct verbs_device_ops rxe_dev_ops = { |
| .name = "rxe", |
| /* |
| * For 64 bit machines ABI version 1 and 2 are the same. Otherwise 32 |
| * bit machines require ABI version 2, which guarantees that the user |
| * and the kernel use the same ABI. |
| */ |
| .match_min_abi_version = sizeof(void *) == 8 ? 1 : 2, |
| .match_max_abi_version = 2, |
| .match_table = hca_table, |
| .alloc_device = rxe_device_alloc, |
| .uninit_device = rxe_uninit_device, |
| .alloc_context = rxe_alloc_context, |
| }; |
| PROVIDER_DRIVER(rxe, rxe_dev_ops); |