| // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause |
| /* |
| * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. |
| */ |
| |
| #include <assert.h> |
| #include <errno.h> |
| #include <inttypes.h> |
| #include <pthread.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/mman.h> |
| #include <unistd.h> |
| |
| #include <ccan/minmax.h> |
| |
| #include <util/compiler.h> |
| #include <util/mmio.h> |
| #include <util/util.h> |
| |
| #include "efa.h" |
| #include "efa_io_regs_defs.h" |
| #include "efadv.h" |
| #include "verbs.h" |
| |
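| /* True if the device reported the given EFA_QUERY_DEVICE_CAPS_* capability bit */ |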
| #define EFA_DEV_CAP(ctx, cap) \ |
| ((ctx)->device_caps & EFA_QUERY_DEVICE_CAPS_##cap) |
| |
| static bool is_buf_cleared(void *buf, size_t len) |
| { |
| size_t i; |
| |
| for (i = 0; i < len; i++) { |
| if (((uint8_t *)buf)[i]) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| #define min3(a, b, c) \ |
| ({ \ |
| typeof(a) _tmpmin = min(a, b); \ |
| min(_tmpmin, c); \ |
| }) |
| |
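| /* |
| * is_ext_cleared: verify that any bytes a newer caller appended past the |
| * fields this library knows about (sizeof(*ptr)..inlen) are zero, so that |
| * unknown extensions are not silently ignored. |
| */ |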
| #define is_ext_cleared(ptr, inlen) \ |
| is_buf_cleared((uint8_t *)ptr + sizeof(*ptr), inlen - sizeof(*ptr)) |
| |
| #define is_reserved_cleared(reserved) is_buf_cleared(reserved, sizeof(reserved)) |
| |
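| /* Parameters needed to map a work queue's doorbell and tie it to its sub-CQ */ |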
| struct efa_wq_init_attr { |
| uint64_t db_mmap_key; |
| uint32_t db_off; |
| int cmd_fd; |
| int pgsz; |
| uint16_t sub_cq_idx; |
| }; |
| |
| int efa_query_port(struct ibv_context *ibvctx, uint8_t port, |
| struct ibv_port_attr *port_attr) |
| { |
| struct ibv_query_port cmd; |
| |
| return ibv_cmd_query_port(ibvctx, port, port_attr, &cmd, sizeof(cmd)); |
| } |
| |
| int efa_query_device_ex(struct ibv_context *context, |
| const struct ibv_query_device_ex_input *input, |
| struct ibv_device_attr_ex *attr, |
| size_t attr_size) |
| { |
| struct efa_context *ctx = to_efa_context(context); |
| struct ibv_device_attr *a = &attr->orig_attr; |
| struct efa_query_device_ex_resp resp = {}; |
| size_t resp_size = (ctx->cmds_supp_udata_mask & |
| EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE) ? |
| sizeof(resp) : |
| sizeof(resp.ibv_resp); |
| uint8_t fw_ver[8]; |
| int err; |
| |
| err = ibv_cmd_query_device_any(context, input, attr, attr_size, |
| &resp.ibv_resp, &resp_size); |
| if (err) { |
| verbs_err(verbs_get_ctx(context), "ibv_cmd_query_device_any failed\n"); |
| return err; |
| } |
| |
| a->max_qp_wr = min_t(int, a->max_qp_wr, |
| ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); |
| memcpy(fw_ver, &resp.ibv_resp.base.fw_ver, |
| sizeof(resp.ibv_resp.base.fw_ver)); |
| snprintf(a->fw_ver, sizeof(a->fw_ver), "%u.%u.%u.%u", |
| fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]); |
| |
| return 0; |
| } |
| |
| int efa_query_device_ctx(struct efa_context *ctx) |
| { |
| struct efa_query_device_ex_resp resp = {}; |
| struct ibv_device_attr_ex attr; |
| size_t resp_size = sizeof(resp); |
| unsigned int qp_table_sz; |
| int err; |
| |
| if (ctx->cmds_supp_udata_mask & EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE) { |
| err = ibv_cmd_query_device_any(&ctx->ibvctx.context, NULL, |
| &attr, sizeof(attr), |
| &resp.ibv_resp, &resp_size); |
| if (err) { |
| verbs_err(&ctx->ibvctx, |
| "ibv_cmd_query_device_any failed\n"); |
| return err; |
| } |
| |
| ctx->device_caps = resp.device_caps; |
| ctx->max_sq_wr = resp.max_sq_wr; |
| ctx->max_rq_wr = resp.max_rq_wr; |
| ctx->max_sq_sge = resp.max_sq_sge; |
| ctx->max_rq_sge = resp.max_rq_sge; |
| ctx->max_rdma_size = resp.max_rdma_size; |
| } else { |
| err = ibv_cmd_query_device_any(&ctx->ibvctx.context, NULL, |
| &attr, sizeof(attr.orig_attr), |
| NULL, NULL); |
| if (err) { |
| verbs_err(&ctx->ibvctx, |
| "ibv_cmd_query_device_any failed\n"); |
| return err; |
| } |
| } |
| |
| ctx->max_wr_rdma_sge = attr.orig_attr.max_sge_rd; |
| qp_table_sz = roundup_pow_of_two(attr.orig_attr.max_qp); |
| ctx->qp_table_sz_m1 = qp_table_sz - 1; |
| ctx->qp_table = calloc(qp_table_sz, sizeof(*ctx->qp_table)); |
| if (!ctx->qp_table) |
| return ENOMEM; |
| return 0; |
| } |
| |
| int efadv_query_device(struct ibv_context *ibvctx, |
| struct efadv_device_attr *attr, |
| uint32_t inlen) |
| { |
| struct efa_context *ctx = to_efa_context(ibvctx); |
| uint64_t comp_mask_out = 0; |
| |
| if (!is_efa_dev(ibvctx->device)) { |
| verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n"); |
| return EOPNOTSUPP; |
| } |
| |
| if (!vext_field_avail(typeof(*attr), inline_buf_size, inlen)) { |
| verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); |
| return EINVAL; |
| } |
| |
| memset(attr, 0, inlen); |
| attr->max_sq_wr = ctx->max_sq_wr; |
| attr->max_rq_wr = ctx->max_rq_wr; |
| attr->max_sq_sge = ctx->max_sq_sge; |
| attr->max_rq_sge = ctx->max_rq_sge; |
| attr->inline_buf_size = ctx->inline_buf_size; |
| |
| if (vext_field_avail(typeof(*attr), max_rdma_size, inlen)) { |
| attr->max_rdma_size = ctx->max_rdma_size; |
| |
| if (EFA_DEV_CAP(ctx, RDMA_READ)) |
| attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_RDMA_READ; |
| |
| if (EFA_DEV_CAP(ctx, RNR_RETRY)) |
| attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_RNR_RETRY; |
| } |
| |
| attr->comp_mask = comp_mask_out; |
| |
| return 0; |
| } |
| |
| struct ibv_pd *efa_alloc_pd(struct ibv_context *ibvctx) |
| { |
| struct efa_alloc_pd_resp resp = {}; |
| struct ibv_alloc_pd cmd; |
| struct efa_pd *pd; |
| int err; |
| |
| pd = calloc(1, sizeof(*pd)); |
| if (!pd) |
| return NULL; |
| |
| err = ibv_cmd_alloc_pd(ibvctx, &pd->ibvpd, &cmd, sizeof(cmd), |
| &resp.ibv_resp, sizeof(resp)); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvctx), "Failed to allocate PD\n"); |
| goto out; |
| } |
| |
| pd->pdn = resp.pdn; |
| |
| return &pd->ibvpd; |
| |
| out: |
| free(pd); |
| errno = err; |
| return NULL; |
| } |
| |
| int efa_dealloc_pd(struct ibv_pd *ibvpd) |
| { |
| struct efa_pd *pd = to_efa_pd(ibvpd); |
| int err; |
| |
| err = ibv_cmd_dealloc_pd(ibvpd); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvpd->context), |
| "Failed to deallocate PD\n"); |
| return err; |
| } |
| free(pd); |
| |
| return 0; |
| } |
| |
| struct ibv_mr *efa_reg_dmabuf_mr(struct ibv_pd *ibvpd, uint64_t offset, |
| size_t length, uint64_t iova, int fd, int acc) |
| { |
| struct efa_mr *mr; |
| int err; |
| |
| mr = calloc(1, sizeof(*mr)); |
| if (!mr) |
| return NULL; |
| |
| err = ibv_cmd_reg_dmabuf_mr(ibvpd, offset, length, iova, fd, acc, |
| &mr->vmr); |
| if (err) { |
| free(mr); |
| errno = err; |
| return NULL; |
| } |
| |
| return &mr->vmr.ibv_mr; |
| } |
| |
| struct ibv_mr *efa_reg_mr(struct ibv_pd *ibvpd, void *sva, size_t len, |
| uint64_t hca_va, int access) |
| { |
| struct ib_uverbs_reg_mr_resp resp; |
| struct ibv_reg_mr cmd; |
| struct efa_mr *mr; |
| int err; |
| |
| mr = calloc(1, sizeof(*mr)); |
| if (!mr) |
| return NULL; |
| |
| err = ibv_cmd_reg_mr(ibvpd, sva, len, hca_va, access, &mr->vmr, |
| &cmd, sizeof(cmd), &resp, sizeof(resp)); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvpd->context), |
| "Failed to register MR\n"); |
| free(mr); |
| errno = err; |
| return NULL; |
| } |
| |
| return &mr->vmr.ibv_mr; |
| } |
| |
| int efa_dereg_mr(struct verbs_mr *vmr) |
| { |
| struct efa_mr *mr = container_of(vmr, struct efa_mr, vmr); |
| int err; |
| |
| err = ibv_cmd_dereg_mr(vmr); |
| if (err) { |
| verbs_err(verbs_get_ctx(vmr->ibv_mr.context), |
| "Failed to deregister MR\n"); |
| return err; |
| } |
| free(mr); |
| |
| return 0; |
| } |
| |
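| /* |
| * wrid bookkeeping: wrid_idx_pool is a stack of free slots in the wrid |
| * array. Posting a WR pops a slot (under the WQ lock) and stores the |
| * caller's wr_id in it; completion pushes the slot back in |
| * efa_wq_put_wrid_idx_unlocked(). |
| */ |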
| static uint32_t efa_wq_get_next_wrid_idx_locked(struct efa_wq *wq, |
| uint64_t wr_id) |
| { |
| uint32_t wrid_idx; |
| |
| /* Get the next wrid to be used from the index pool */ |
| wrid_idx = wq->wrid_idx_pool[wq->wrid_idx_pool_next]; |
| wq->wrid[wrid_idx] = wr_id; |
| |
| /* Will never overflow, as the validate function already ensured a free slot */ |
| wq->wrid_idx_pool_next++; |
| assert(wq->wrid_idx_pool_next <= wq->wqe_cnt); |
| |
| return wrid_idx; |
| } |
| |
| static void efa_wq_put_wrid_idx_unlocked(struct efa_wq *wq, uint32_t wrid_idx) |
| { |
| pthread_spin_lock(&wq->wqlock); |
| wq->wrid_idx_pool_next--; |
| wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx; |
| wq->wqe_completed++; |
| pthread_spin_unlock(&wq->wqlock); |
| } |
| |
| static uint32_t efa_sub_cq_get_current_index(struct efa_sub_cq *sub_cq) |
| { |
| return sub_cq->consumed_cnt & sub_cq->qmask; |
| } |
| |
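| /* |
| * A CQE is owned by software when its phase bit matches the sub-CQ's |
| * expected phase; the expected phase flips each time the ring wraps |
| * (see cq_next_sub_cqe_get()). |
| */ |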
| static int efa_cqe_is_pending(struct efa_io_cdesc_common *cqe_common, |
| int phase) |
| { |
| return EFA_GET(&cqe_common->flags, EFA_IO_CDESC_COMMON_PHASE) == phase; |
| } |
| |
| static struct efa_io_cdesc_common * |
| efa_sub_cq_get_cqe(struct efa_sub_cq *sub_cq, int entry) |
| { |
| return (struct efa_io_cdesc_common *)(sub_cq->buf + |
| (entry * sub_cq->cqe_size)); |
| } |
| |
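| /* Pack the consumer index, the 2-bit command sequence number and the |
| * optional arm request into the 32-bit CQ doorbell register. |
| */ |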
| static void efa_update_cq_doorbell(struct efa_cq *cq, bool arm) |
| { |
| uint32_t db = 0; |
| |
| EFA_SET(&db, EFA_IO_REGS_CQ_DB_CONSUMER_INDEX, cq->cc); |
| EFA_SET(&db, EFA_IO_REGS_CQ_DB_CMD_SN, cq->cmd_sn & 0x3); |
| EFA_SET(&db, EFA_IO_REGS_CQ_DB_ARM, arm); |
| |
| mmio_write32(cq->db, db); |
| } |
| |
| void efa_cq_event(struct ibv_cq *ibvcq) |
| { |
| to_efa_cq(ibvcq)->cmd_sn++; |
| } |
| |
| int efa_arm_cq(struct ibv_cq *ibvcq, int solicited_only) |
| { |
| if (unlikely(solicited_only)) |
| return EOPNOTSUPP; |
| |
| efa_update_cq_doorbell(to_efa_cq(ibvcq), true); |
| return 0; |
| } |
| |
| static struct efa_io_cdesc_common * |
| cq_next_sub_cqe_get(struct efa_sub_cq *sub_cq) |
| { |
| struct efa_io_cdesc_common *cqe; |
| uint32_t current_index; |
| |
| current_index = efa_sub_cq_get_current_index(sub_cq); |
| cqe = efa_sub_cq_get_cqe(sub_cq, current_index); |
| if (efa_cqe_is_pending(cqe, sub_cq->phase)) { |
| /* Do not read the rest of the completion entry before the |
| * phase bit has been validated. |
| */ |
| udma_from_device_barrier(); |
| sub_cq->consumed_cnt++; |
| if (!efa_sub_cq_get_current_index(sub_cq)) |
| sub_cq->phase = 1 - sub_cq->phase; |
| return cqe; |
| } |
| |
| return NULL; |
| } |
| |
| static enum ibv_wc_status to_ibv_status(enum efa_io_comp_status status) |
| { |
| switch (status) { |
| case EFA_IO_COMP_STATUS_OK: |
| return IBV_WC_SUCCESS; |
| case EFA_IO_COMP_STATUS_FLUSHED: |
| return IBV_WC_WR_FLUSH_ERR; |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR: |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE: |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH: |
| return IBV_WC_LOC_QP_OP_ERR; |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY: |
| return IBV_WC_LOC_PROT_ERR; |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH: |
| return IBV_WC_LOC_LEN_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT: |
| return IBV_WC_REM_ABORT_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR: |
| return IBV_WC_RNR_RETRY_EXC_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN: |
| return IBV_WC_REM_INV_RD_REQ_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS: |
| return IBV_WC_BAD_RESP_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH: |
| return IBV_WC_REM_INV_REQ_ERR; |
| case EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE: |
| return IBV_WC_RESP_TIMEOUT_ERR; |
| case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS: |
| default: |
| return IBV_WC_GENERAL_ERR; |
| } |
| } |
| |
| static void efa_process_cqe(struct efa_cq *cq, struct ibv_wc *wc, |
| struct efa_qp *qp) |
| { |
| struct efa_io_cdesc_common *cqe = cq->cur_cqe; |
| uint32_t wrid_idx; |
| |
| wc->status = to_ibv_status(cqe->status); |
| wc->vendor_err = cqe->status; |
| wc->wc_flags = 0; |
| wc->qp_num = cqe->qp_num; |
| |
| if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) == |
| EFA_IO_SEND_QUEUE) { |
| cq->cur_wq = &qp->sq.wq; |
| wc->opcode = IBV_WC_SEND; |
| } else { |
| struct efa_io_rx_cdesc *rcqe = |
| container_of(cqe, struct efa_io_rx_cdesc, common); |
| |
| cq->cur_wq = &qp->rq.wq; |
| |
| wc->byte_len = cqe->length; |
| wc->opcode = IBV_WC_RECV; |
| wc->src_qp = rcqe->src_qp_num; |
| wc->sl = 0; |
| wc->slid = rcqe->ah; |
| |
| if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_HAS_IMM)) { |
| wc->imm_data = htobe32(rcqe->imm); |
| wc->wc_flags |= IBV_WC_WITH_IMM; |
| } |
| } |
| |
| wrid_idx = cqe->req_id; |
| /* We do not have to take the WQ lock here, |
| * because this wrid index has not been freed yet, |
| * so there is no contention on this index. |
| */ |
| wc->wr_id = cq->cur_wq->wrid[wrid_idx]; |
| } |
| |
| static void efa_process_ex_cqe(struct efa_cq *cq, struct efa_qp *qp) |
| { |
| struct ibv_cq_ex *ibvcqx = &cq->verbs_cq.cq_ex; |
| struct efa_io_cdesc_common *cqe = cq->cur_cqe; |
| uint32_t wrid_idx; |
| |
| wrid_idx = cqe->req_id; |
| |
| if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) == |
| EFA_IO_SEND_QUEUE) { |
| cq->cur_wq = &qp->sq.wq; |
| } else { |
| cq->cur_wq = &qp->rq.wq; |
| } |
| |
| ibvcqx->wr_id = cq->cur_wq->wrid[wrid_idx]; |
| ibvcqx->status = to_ibv_status(cqe->status); |
| } |
| |
| static inline int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq, |
| struct efa_qp **cur_qp, struct ibv_wc *wc, |
| bool extended) ALWAYS_INLINE; |
| static inline int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq, |
| struct efa_qp **cur_qp, struct ibv_wc *wc, |
| bool extended) |
| { |
| struct efa_context *ctx = to_efa_context(cq->verbs_cq.cq.context); |
| uint32_t qpn; |
| |
| cq->cur_cqe = cq_next_sub_cqe_get(sub_cq); |
| if (!cq->cur_cqe) |
| return ENOENT; |
| |
| qpn = cq->cur_cqe->qp_num; |
| if (!*cur_qp || qpn != (*cur_qp)->verbs_qp.qp.qp_num) { |
| /* We do not have to take the QP table lock here, |
| * because CQs will be locked while QPs are removed |
| * from the table. |
| */ |
| *cur_qp = ctx->qp_table[qpn & ctx->qp_table_sz_m1]; |
| if (!*cur_qp) { |
| verbs_err(&ctx->ibvctx, |
| "QP[%u] does not exist in QP table\n", |
| qpn); |
| return EINVAL; |
| } |
| } |
| |
| if (extended) { |
| efa_process_ex_cqe(cq, *cur_qp); |
| } else { |
| efa_process_cqe(cq, wc, *cur_qp); |
| efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); |
| } |
| |
| return 0; |
| } |
| |
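| /* |
| * Poll the sub-CQs round-robin, starting after the last one that produced |
| * a completion and skipping sub-CQs no QP currently references. |
| */ |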
| static inline int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc, |
| bool extended) ALWAYS_INLINE; |
| static inline int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc, |
| bool extended) |
| { |
| uint16_t num_sub_cqs = cq->num_sub_cqs; |
| struct efa_sub_cq *sub_cq; |
| struct efa_qp *qp = NULL; |
| uint16_t sub_cq_idx; |
| int err = ENOENT; |
| |
| for (sub_cq_idx = 0; sub_cq_idx < num_sub_cqs; sub_cq_idx++) { |
| sub_cq = &cq->sub_cq_arr[cq->next_poll_idx++]; |
| cq->next_poll_idx %= num_sub_cqs; |
| |
| if (!sub_cq->ref_cnt) |
| continue; |
| |
| err = efa_poll_sub_cq(cq, sub_cq, &qp, wc, extended); |
| if (err != ENOENT) { |
| cq->cc++; |
| break; |
| } |
| } |
| |
| return err; |
| } |
| |
| int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc) |
| { |
| struct efa_cq *cq = to_efa_cq(ibvcq); |
| int ret = 0; |
| int i; |
| |
| pthread_spin_lock(&cq->lock); |
| for (i = 0; i < nwc; i++) { |
| ret = efa_poll_sub_cqs(cq, &wc[i], false); |
| if (ret) { |
| if (ret == ENOENT) |
| ret = 0; |
| break; |
| } |
| } |
| |
| if (i && cq->db) |
| efa_update_cq_doorbell(cq, false); |
| pthread_spin_unlock(&cq->lock); |
| |
| return i ?: -ret; |
| } |
| |
| static int efa_start_poll(struct ibv_cq_ex *ibvcqx, |
| struct ibv_poll_cq_attr *attr) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| int ret; |
| |
| if (unlikely(attr->comp_mask)) { |
| verbs_err(verbs_get_ctx(ibvcqx->context), |
| "Invalid comp_mask %u\n", |
| attr->comp_mask); |
| return EINVAL; |
| } |
| |
| pthread_spin_lock(&cq->lock); |
| |
| ret = efa_poll_sub_cqs(cq, NULL, true); |
| if (ret) |
| pthread_spin_unlock(&cq->lock); |
| |
| return ret; |
| } |
| |
| static int efa_next_poll(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| int ret; |
| |
| efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); |
| ret = efa_poll_sub_cqs(cq, NULL, true); |
| |
| return ret; |
| } |
| |
| static void efa_end_poll(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| |
| if (cq->cur_cqe) { |
| efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); |
| if (cq->db) |
| efa_update_cq_doorbell(cq, false); |
| } |
| |
| pthread_spin_unlock(&cq->lock); |
| } |
| |
| static enum ibv_wc_opcode efa_wc_read_opcode(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| struct efa_io_cdesc_common *cqe = cq->cur_cqe; |
| |
| if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) == |
| EFA_IO_SEND_QUEUE) |
| return IBV_WC_SEND; |
| |
| return IBV_WC_RECV; |
| } |
| |
| static uint32_t efa_wc_read_vendor_err(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| |
| return cq->cur_cqe->status; |
| } |
| |
| static unsigned int efa_wc_read_wc_flags(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| unsigned int wc_flags = 0; |
| |
| if (EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_HAS_IMM)) |
| wc_flags |= IBV_WC_WITH_IMM; |
| |
| return wc_flags; |
| } |
| |
| static uint32_t efa_wc_read_byte_len(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| |
| return cq->cur_cqe->length; |
| } |
| |
| static __be32 efa_wc_read_imm_data(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| struct efa_io_rx_cdesc *rcqe; |
| |
| rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common); |
| |
| return htobe32(rcqe->imm); |
| } |
| |
| static uint32_t efa_wc_read_qp_num(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| |
| return cq->cur_cqe->qp_num; |
| } |
| |
| static uint32_t efa_wc_read_src_qp(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| struct efa_io_rx_cdesc *rcqe; |
| |
| rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common); |
| |
| return rcqe->src_qp_num; |
| } |
| |
| static uint32_t efa_wc_read_slid(struct ibv_cq_ex *ibvcqx) |
| { |
| struct efa_cq *cq = to_efa_cq_ex(ibvcqx); |
| struct efa_io_rx_cdesc *rcqe; |
| |
| rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common); |
| |
| return rcqe->ah; |
| } |
| |
| static uint8_t efa_wc_read_sl(struct ibv_cq_ex *ibvcqx) |
| { |
| return 0; |
| } |
| |
| static uint8_t efa_wc_read_dlid_path_bits(struct ibv_cq_ex *ibvcqx) |
| { |
| return 0; |
| } |
| |
| static void efa_cq_fill_pfns(struct ibv_cq_ex *ibvcqx, |
| struct ibv_cq_init_attr_ex *attr) |
| { |
| ibvcqx->start_poll = efa_start_poll; |
| ibvcqx->end_poll = efa_end_poll; |
| ibvcqx->next_poll = efa_next_poll; |
| |
| ibvcqx->read_opcode = efa_wc_read_opcode; |
| ibvcqx->read_vendor_err = efa_wc_read_vendor_err; |
| ibvcqx->read_wc_flags = efa_wc_read_wc_flags; |
| |
| if (attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) |
| ibvcqx->read_byte_len = efa_wc_read_byte_len; |
| if (attr->wc_flags & IBV_WC_EX_WITH_IMM) |
| ibvcqx->read_imm_data = efa_wc_read_imm_data; |
| if (attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) |
| ibvcqx->read_qp_num = efa_wc_read_qp_num; |
| if (attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) |
| ibvcqx->read_src_qp = efa_wc_read_src_qp; |
| if (attr->wc_flags & IBV_WC_EX_WITH_SLID) |
| ibvcqx->read_slid = efa_wc_read_slid; |
| if (attr->wc_flags & IBV_WC_EX_WITH_SL) |
| ibvcqx->read_sl = efa_wc_read_sl; |
| if (attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) |
| ibvcqx->read_dlid_path_bits = efa_wc_read_dlid_path_bits; |
| } |
| |
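| /* The phase starts at 1 so that entries whose phase bit is still 0 (a |
| * freshly cleared ring) are not mistaken for valid completions. |
| */ |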
| static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf, |
| int sub_cq_size, int cqe_size) |
| { |
| sub_cq->consumed_cnt = 0; |
| sub_cq->phase = 1; |
| sub_cq->buf = buf; |
| sub_cq->qmask = sub_cq_size - 1; |
| sub_cq->cqe_size = cqe_size; |
| sub_cq->ref_cnt = 0; |
| } |
| |
| static struct ibv_cq_ex *create_cq(struct ibv_context *ibvctx, |
| struct ibv_cq_init_attr_ex *attr) |
| { |
| struct efa_context *ctx = to_efa_context(ibvctx); |
| struct efa_create_cq_resp resp = {}; |
| struct efa_create_cq cmd = {}; |
| uint16_t num_sub_cqs; |
| struct efa_cq *cq; |
| int sub_buf_size; |
| int sub_cq_size; |
| uint8_t *buf; |
| int err; |
| int i; |
| |
| if (attr->channel && |
| !EFA_DEV_CAP(ctx, CQ_NOTIFICATIONS)) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| cq = calloc(1, sizeof(*cq) + |
| sizeof(*cq->sub_cq_arr) * ctx->sub_cqs_per_cq); |
| if (!cq) |
| return NULL; |
| |
| num_sub_cqs = ctx->sub_cqs_per_cq; |
| cmd.num_sub_cqs = num_sub_cqs; |
| cmd.cq_entry_size = ctx->cqe_size; |
| if (attr->channel) |
| cmd.flags |= EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL; |
| |
| attr->cqe = roundup_pow_of_two(attr->cqe); |
| err = ibv_cmd_create_cq_ex(ibvctx, attr, &cq->verbs_cq, |
| &cmd.ibv_cmd, sizeof(cmd), |
| &resp.ibv_resp, sizeof(resp), 0); |
| if (err) { |
| errno = err; |
| goto err_free_cq; |
| } |
| |
| sub_cq_size = cq->verbs_cq.cq.cqe; |
| cq->cqn = resp.cq_idx; |
| cq->buf_size = resp.q_mmap_size; |
| cq->num_sub_cqs = num_sub_cqs; |
| cq->cqe_size = ctx->cqe_size; |
| |
| cq->buf = mmap(NULL, cq->buf_size, PROT_READ, MAP_SHARED, |
| ibvctx->cmd_fd, resp.q_mmap_key); |
| if (cq->buf == MAP_FAILED) |
| goto err_destroy_cq; |
| |
| buf = cq->buf; |
| sub_buf_size = cq->cqe_size * sub_cq_size; |
| for (i = 0; i < num_sub_cqs; i++) { |
| efa_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size, |
| cq->cqe_size); |
| buf += sub_buf_size; |
| } |
| |
| if (resp.comp_mask & EFA_CREATE_CQ_RESP_DB_OFF) { |
| cq->db_mmap_addr = mmap(NULL, |
| to_efa_dev(ibvctx->device)->pg_sz, PROT_WRITE, |
| MAP_SHARED, ibvctx->cmd_fd, resp.db_mmap_key); |
| if (cq->db_mmap_addr == MAP_FAILED) |
| goto err_unmap_cq; |
| |
| cq->db = (uint32_t *)(cq->db_mmap_addr + resp.db_off); |
| } |
| |
| efa_cq_fill_pfns(&cq->verbs_cq.cq_ex, attr); |
| pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); |
| |
| return &cq->verbs_cq.cq_ex; |
| |
| err_unmap_cq: |
| munmap(cq->buf, cq->buf_size); |
| err_destroy_cq: |
| ibv_cmd_destroy_cq(&cq->verbs_cq.cq); |
| err_free_cq: |
| free(cq); |
| verbs_err(verbs_get_ctx(ibvctx), "Failed to create CQ\n"); |
| return NULL; |
| } |
| |
| struct ibv_cq *efa_create_cq(struct ibv_context *ibvctx, int ncqe, |
| struct ibv_comp_channel *channel, int vec) |
| { |
| struct ibv_cq_init_attr_ex attr_ex = { |
| .cqe = ncqe, |
| .channel = channel, |
| .comp_vector = vec |
| }; |
| struct ibv_cq_ex *ibvcqx; |
| |
| ibvcqx = create_cq(ibvctx, &attr_ex); |
| |
| return ibvcqx ? ibv_cq_ex_to_cq(ibvcqx) : NULL; |
| } |
| |
| struct ibv_cq_ex *efa_create_cq_ex(struct ibv_context *ibvctx, |
| struct ibv_cq_init_attr_ex *attr_ex) |
| { |
| if (!check_comp_mask(attr_ex->comp_mask, 0) || |
| !check_comp_mask(attr_ex->wc_flags, IBV_WC_STANDARD_FLAGS)) { |
| verbs_err(verbs_get_ctx(ibvctx), |
| "Invalid comp_mask or wc_flags\n"); |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| return create_cq(ibvctx, attr_ex); |
| } |
| |
| int efa_destroy_cq(struct ibv_cq *ibvcq) |
| { |
| struct efa_cq *cq = to_efa_cq(ibvcq); |
| int err; |
| |
| err = ibv_cmd_destroy_cq(ibvcq); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvcq->context), |
| "Failed to destroy CQ[%u]\n", cq->cqn); |
| return err; |
| } |
| |
| if (cq->db_mmap_addr) |
| munmap(cq->db_mmap_addr, to_efa_dev(cq->verbs_cq.cq.context->device)->pg_sz); |
| munmap(cq->buf, cq->buf_size); |
| |
| pthread_spin_destroy(&cq->lock); |
| |
| free(cq); |
| |
| return 0; |
| } |
| |
| static void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) |
| { |
| cq->sub_cq_arr[sub_cq_idx].ref_cnt++; |
| } |
| |
| static void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) |
| { |
| cq->sub_cq_arr[sub_cq_idx].ref_cnt--; |
| } |
| |
| static void efa_wq_terminate(struct efa_wq *wq, int pgsz) |
| { |
| void *db_aligned; |
| |
| pthread_spin_destroy(&wq->wqlock); |
| |
| db_aligned = (void *)((uintptr_t)wq->db & ~(pgsz - 1)); |
| munmap(db_aligned, pgsz); |
| |
| free(wq->wrid_idx_pool); |
| free(wq->wrid); |
| } |
| |
| static int efa_wq_initialize(struct efa_wq *wq, struct efa_wq_init_attr *attr) |
| { |
| uint8_t *db_base; |
| int err; |
| int i; |
| |
| wq->wrid = malloc(wq->wqe_cnt * sizeof(*wq->wrid)); |
| if (!wq->wrid) |
| return ENOMEM; |
| |
| wq->wrid_idx_pool = malloc(wq->wqe_cnt * sizeof(uint32_t)); |
| if (!wq->wrid_idx_pool) { |
| err = ENOMEM; |
| goto err_free_wrid; |
| } |
| |
| db_base = mmap(NULL, attr->pgsz, PROT_WRITE, MAP_SHARED, attr->cmd_fd, |
| attr->db_mmap_key); |
| if (db_base == MAP_FAILED) { |
| err = errno; |
| goto err_free_wrid_idx_pool; |
| } |
| |
| wq->db = (uint32_t *)(db_base + attr->db_off); |
| |
| /* Initialize the wrid free indexes pool. */ |
| for (i = 0; i < wq->wqe_cnt; i++) |
| wq->wrid_idx_pool[i] = i; |
| |
| pthread_spin_init(&wq->wqlock, PTHREAD_PROCESS_PRIVATE); |
| |
| wq->sub_cq_idx = attr->sub_cq_idx; |
| |
| return 0; |
| |
| err_free_wrid_idx_pool: |
| free(wq->wrid_idx_pool); |
| err_free_wrid: |
| free(wq->wrid); |
| return err; |
| } |
| |
| static void efa_sq_terminate(struct efa_qp *qp) |
| { |
| struct efa_sq *sq = &qp->sq; |
| |
| if (!sq->wq.wqe_cnt) |
| return; |
| |
| munmap(sq->desc - sq->desc_offset, sq->desc_ring_mmap_size); |
| free(sq->local_queue); |
| |
| efa_wq_terminate(&sq->wq, qp->page_size); |
| } |
| |
| static int efa_sq_initialize(struct efa_qp *qp, |
| const struct ibv_qp_init_attr_ex *attr, |
| struct efa_create_qp_resp *resp) |
| { |
| struct efa_context *ctx = to_efa_context(qp->verbs_qp.qp.context); |
| struct efa_wq_init_attr wq_attr; |
| struct efa_sq *sq = &qp->sq; |
| size_t desc_ring_size; |
| int err; |
| |
| if (!sq->wq.wqe_cnt) |
| return 0; |
| |
| wq_attr = (struct efa_wq_init_attr) { |
| .db_mmap_key = resp->sq_db_mmap_key, |
| .db_off = resp->sq_db_offset, |
| .cmd_fd = qp->verbs_qp.qp.context->cmd_fd, |
| .pgsz = qp->page_size, |
| .sub_cq_idx = resp->send_sub_cq_idx, |
| }; |
| |
| err = efa_wq_initialize(&qp->sq.wq, &wq_attr); |
| if (err) { |
| verbs_err(&ctx->ibvctx, "SQ[%u] efa_wq_initialize failed\n", |
| qp->verbs_qp.qp.qp_num); |
| return err; |
| } |
| |
| sq->desc_offset = resp->llq_desc_offset; |
| desc_ring_size = sq->wq.wqe_cnt * sizeof(struct efa_io_tx_wqe); |
| sq->desc_ring_mmap_size = align(desc_ring_size + sq->desc_offset, |
| qp->page_size); |
| sq->max_inline_data = attr->cap.max_inline_data; |
| |
| sq->local_queue = malloc(desc_ring_size); |
| if (!sq->local_queue) { |
| err = ENOMEM; |
| goto err_terminate_wq; |
| } |
| |
| sq->desc = mmap(NULL, sq->desc_ring_mmap_size, PROT_WRITE, |
| MAP_SHARED, qp->verbs_qp.qp.context->cmd_fd, |
| resp->llq_desc_mmap_key); |
| if (sq->desc == MAP_FAILED) { |
| verbs_err(&ctx->ibvctx, "SQ buffer mmap failed\n"); |
| err = errno; |
| goto err_free_local_queue; |
| } |
| |
| sq->desc += sq->desc_offset; |
| sq->max_wr_rdma_sge = min_t(uint16_t, ctx->max_wr_rdma_sge, |
| EFA_IO_TX_DESC_NUM_RDMA_BUFS); |
| sq->max_batch_wr = ctx->max_tx_batch ? |
| (ctx->max_tx_batch * 64) / sizeof(struct efa_io_tx_wqe) : |
| UINT16_MAX; |
| if (ctx->min_sq_wr) { |
| /* The device can't accept a doorbell for the whole SQ at once, |
| * so cap the batch at no more than (SQ size - 1) WQEs. |
| */ |
| sq->max_batch_wr = min_t(uint32_t, sq->max_batch_wr, |
| sq->wq.wqe_cnt - 1); |
| } |
| |
| return 0; |
| |
| err_free_local_queue: |
| free(sq->local_queue); |
| err_terminate_wq: |
| efa_wq_terminate(&sq->wq, qp->page_size); |
| return err; |
| } |
| |
| static void efa_rq_terminate(struct efa_qp *qp) |
| { |
| struct efa_rq *rq = &qp->rq; |
| |
| if (!rq->wq.wqe_cnt) |
| return; |
| |
| munmap(rq->buf, rq->buf_size); |
| |
| efa_wq_terminate(&rq->wq, qp->page_size); |
| } |
| |
| static int efa_rq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp) |
| { |
| struct efa_wq_init_attr wq_attr; |
| struct efa_rq *rq = &qp->rq; |
| int err; |
| |
| if (!rq->wq.wqe_cnt) |
| return 0; |
| |
| wq_attr = (struct efa_wq_init_attr) { |
| .db_mmap_key = resp->rq_db_mmap_key, |
| .db_off = resp->rq_db_offset, |
| .cmd_fd = qp->verbs_qp.qp.context->cmd_fd, |
| .pgsz = qp->page_size, |
| .sub_cq_idx = resp->recv_sub_cq_idx, |
| }; |
| |
| err = efa_wq_initialize(&qp->rq.wq, &wq_attr); |
| if (err) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "RQ efa_wq_initialize failed\n"); |
| return err; |
| } |
| |
| rq->buf_size = resp->rq_mmap_size; |
| rq->buf = mmap(NULL, rq->buf_size, PROT_WRITE, MAP_SHARED, |
| qp->verbs_qp.qp.context->cmd_fd, resp->rq_mmap_key); |
| if (rq->buf == MAP_FAILED) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "RQ buffer mmap failed\n"); |
| err = errno; |
| goto err_terminate_wq; |
| } |
| |
| return 0; |
| |
| err_terminate_wq: |
| efa_wq_terminate(&rq->wq, qp->page_size); |
| return err; |
| } |
| |
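| /* Reset producer/consumer state; used at QP creation and on transition back to RESET */ |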
| static void efa_qp_init_indices(struct efa_qp *qp) |
| { |
| qp->sq.wq.wqe_posted = 0; |
| qp->sq.wq.wqe_completed = 0; |
| qp->sq.wq.pc = 0; |
| qp->sq.wq.wrid_idx_pool_next = 0; |
| |
| qp->rq.wq.wqe_posted = 0; |
| qp->rq.wq.wqe_completed = 0; |
| qp->rq.wq.pc = 0; |
| qp->rq.wq.wrid_idx_pool_next = 0; |
| } |
| |
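| /* |
| * Size the queues: the SQ depth is rounded up to a power of two (and up to |
| * the device's minimum, if any). The RQ ring holds one descriptor per SGE, |
| * so its descriptor count is max_recv_sge * max_recv_wr rounded up to a |
| * power of two. |
| */ |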
| static void efa_setup_qp(struct efa_context *ctx, |
| struct efa_qp *qp, |
| struct ibv_qp_cap *cap, |
| size_t page_size) |
| { |
| uint16_t rq_desc_cnt; |
| |
| efa_qp_init_indices(qp); |
| |
| qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, cap->max_send_wr, |
| ctx->min_sq_wr)); |
| qp->sq.wq.max_sge = cap->max_send_sge; |
| qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1; |
| |
| qp->rq.wq.max_sge = cap->max_recv_sge; |
| rq_desc_cnt = roundup_pow_of_two(cap->max_recv_sge * cap->max_recv_wr); |
| qp->rq.wq.desc_mask = rq_desc_cnt - 1; |
| qp->rq.wq.wqe_cnt = rq_desc_cnt / qp->rq.wq.max_sge; |
| |
| qp->page_size = page_size; |
| } |
| |
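| /* Lock the QP's CQs (once if send and receive share a CQ) so that pollers |
| * cannot observe a QP that is being removed from the QP table. |
| */ |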
| static void efa_lock_cqs(struct ibv_qp *ibvqp) |
| { |
| struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq); |
| struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq); |
| |
| if (recv_cq == send_cq) { |
| pthread_spin_lock(&recv_cq->lock); |
| } else { |
| pthread_spin_lock(&recv_cq->lock); |
| pthread_spin_lock(&send_cq->lock); |
| } |
| } |
| |
| static void efa_unlock_cqs(struct ibv_qp *ibvqp) |
| { |
| struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq); |
| struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq); |
| |
| if (recv_cq == send_cq) { |
| pthread_spin_unlock(&recv_cq->lock); |
| } else { |
| pthread_spin_unlock(&recv_cq->lock); |
| pthread_spin_unlock(&send_cq->lock); |
| } |
| } |
| |
| static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, |
| struct ibv_qp_init_attr_ex *attr_ex); |
| |
| static int efa_check_qp_attr(struct efa_context *ctx, |
| struct ibv_qp_init_attr_ex *attr, |
| struct efadv_qp_init_attr *efa_attr) |
| { |
| uint64_t supp_send_ops_mask; |
| uint64_t supp_ud_send_ops_mask = IBV_QP_EX_WITH_SEND | |
| IBV_QP_EX_WITH_SEND_WITH_IMM; |
| uint64_t supp_srd_send_ops_mask = |
| IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM | |
| (EFA_DEV_CAP(ctx, RDMA_READ) ? IBV_QP_EX_WITH_RDMA_READ : 0); |
| |
| #define EFA_CREATE_QP_SUPP_ATTR_MASK \ |
| (IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) |
| |
| if (attr->qp_type == IBV_QPT_DRIVER && |
| efa_attr->driver_qp_type != EFADV_QP_DRIVER_TYPE_SRD) { |
| verbs_err(&ctx->ibvctx, "Driver QP type must be SRD\n"); |
| return EOPNOTSUPP; |
| } |
| |
| if (!check_comp_mask(attr->comp_mask, EFA_CREATE_QP_SUPP_ATTR_MASK)) { |
| verbs_err(&ctx->ibvctx, |
| "Unsupported comp_mask[%#x] supported[%#x]\n", |
| attr->comp_mask, EFA_CREATE_QP_SUPP_ATTR_MASK); |
| return EOPNOTSUPP; |
| } |
| |
| if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD)) { |
| verbs_err(&ctx->ibvctx, "Does not support PD in init attr\n"); |
| return EINVAL; |
| } |
| |
| if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { |
| switch (attr->qp_type) { |
| case IBV_QPT_UD: |
| supp_send_ops_mask = supp_ud_send_ops_mask; |
| break; |
| case IBV_QPT_DRIVER: |
| supp_send_ops_mask = supp_srd_send_ops_mask; |
| break; |
| default: |
| verbs_err(&ctx->ibvctx, "Invalid QP type %u\n", |
| attr->qp_type); |
| return EOPNOTSUPP; |
| } |
| |
| if (!check_comp_mask(attr->send_ops_flags, |
| supp_send_ops_mask)) { |
| verbs_err(&ctx->ibvctx, |
| "Unsupported send_ops_flags[%" PRIx64 "] supported [%" PRIx64 "]\n", |
| attr->send_ops_flags, supp_send_ops_mask); |
| return EOPNOTSUPP; |
| } |
| } |
| |
| if (!attr->recv_cq || !attr->send_cq) { |
| verbs_err(&ctx->ibvctx, "Send/Receive CQ not provided\n"); |
| return EINVAL; |
| } |
| |
| if (attr->srq) { |
| verbs_err(&ctx->ibvctx, "SRQ is not supported\n"); |
| return EINVAL; |
| } |
| |
| return 0; |
| } |
| |
| static int efa_check_qp_limits(struct efa_context *ctx, |
| struct ibv_qp_init_attr_ex *attr) |
| { |
| if (attr->cap.max_send_sge > ctx->max_sq_sge) { |
| verbs_err(&ctx->ibvctx, |
| "Max send SGE %u > %u\n", attr->cap.max_send_sge, |
| ctx->max_sq_sge); |
| return EINVAL; |
| } |
| |
| if (attr->cap.max_recv_sge > ctx->max_rq_sge) { |
| verbs_err(&ctx->ibvctx, |
| "Max receive SGE %u > %u\n", attr->cap.max_recv_sge, |
| ctx->max_rq_sge); |
| return EINVAL; |
| } |
| |
| if (attr->cap.max_send_wr > ctx->max_sq_wr) { |
| verbs_err(&ctx->ibvctx, |
| "Max send WR %u > %u\n", attr->cap.max_send_wr, |
| ctx->max_sq_wr); |
| return EINVAL; |
| } |
| |
| if (attr->cap.max_recv_wr > ctx->max_rq_wr) { |
| verbs_err(&ctx->ibvctx, |
| "Max receive WR %u > %u\n", attr->cap.max_recv_wr, |
| ctx->max_rq_wr); |
| return EINVAL; |
| } |
| |
| return 0; |
| } |
| |
| static struct ibv_qp *create_qp(struct ibv_context *ibvctx, |
| struct ibv_qp_init_attr_ex *attr, |
| struct efadv_qp_init_attr *efa_attr) |
| { |
| struct efa_context *ctx = to_efa_context(ibvctx); |
| struct efa_dev *dev = to_efa_dev(ibvctx->device); |
| struct efa_create_qp_resp resp = {}; |
| struct efa_create_qp req = {}; |
| struct efa_cq *send_cq; |
| struct efa_cq *recv_cq; |
| struct ibv_qp *ibvqp; |
| struct efa_qp *qp; |
| int err; |
| |
| err = efa_check_qp_attr(ctx, attr, efa_attr); |
| if (err) |
| goto err_out; |
| |
| err = efa_check_qp_limits(ctx, attr); |
| if (err) |
| goto err_out; |
| |
| qp = calloc(1, sizeof(*qp)); |
| if (!qp) { |
| err = ENOMEM; |
| goto err_out; |
| } |
| |
| efa_setup_qp(ctx, qp, &attr->cap, dev->pg_sz); |
| |
| attr->cap.max_send_wr = qp->sq.wq.wqe_cnt; |
| attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt; |
| |
| req.rq_ring_size = (qp->rq.wq.desc_mask + 1) * |
| sizeof(struct efa_io_rx_desc); |
| req.sq_ring_size = (attr->cap.max_send_wr) * |
| sizeof(struct efa_io_tx_wqe); |
| if (attr->qp_type == IBV_QPT_DRIVER) |
| req.driver_qp_type = efa_attr->driver_qp_type; |
| |
| err = ibv_cmd_create_qp_ex(ibvctx, &qp->verbs_qp, |
| attr, &req.ibv_cmd, sizeof(req), |
| &resp.ibv_resp, sizeof(resp)); |
| if (err) |
| goto err_free_qp; |
| |
| ibvqp = &qp->verbs_qp.qp; |
| ibvqp->state = IBV_QPS_RESET; |
| qp->sq_sig_all = attr->sq_sig_all; |
| |
| err = efa_rq_initialize(qp, &resp); |
| if (err) |
| goto err_destroy_qp; |
| |
| err = efa_sq_initialize(qp, attr, &resp); |
| if (err) |
| goto err_terminate_rq; |
| |
| pthread_spin_lock(&ctx->qp_table_lock); |
| ctx->qp_table[ibvqp->qp_num & ctx->qp_table_sz_m1] = qp; |
| pthread_spin_unlock(&ctx->qp_table_lock); |
| |
| send_cq = to_efa_cq(attr->send_cq); |
| pthread_spin_lock(&send_cq->lock); |
| efa_cq_inc_ref_cnt(send_cq, resp.send_sub_cq_idx); |
| pthread_spin_unlock(&send_cq->lock); |
| |
| recv_cq = to_efa_cq(attr->recv_cq); |
| pthread_spin_lock(&recv_cq->lock); |
| efa_cq_inc_ref_cnt(recv_cq, resp.recv_sub_cq_idx); |
| pthread_spin_unlock(&recv_cq->lock); |
| |
| if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { |
| efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr); |
| qp->verbs_qp.comp_mask |= VERBS_QP_EX; |
| } |
| |
| return ibvqp; |
| |
| err_terminate_rq: |
| efa_rq_terminate(qp); |
| err_destroy_qp: |
| ibv_cmd_destroy_qp(ibvqp); |
| err_free_qp: |
| free(qp); |
| err_out: |
| errno = err; |
| verbs_err(verbs_get_ctx(ibvctx), "Failed to create QP\n"); |
| return NULL; |
| } |
| |
| struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd, |
| struct ibv_qp_init_attr *attr) |
| { |
| struct ibv_qp_init_attr_ex attr_ex = {}; |
| struct ibv_qp *ibvqp; |
| |
| if (attr->qp_type != IBV_QPT_UD) { |
| verbs_err(verbs_get_ctx(ibvpd->context), |
| "Unsupported QP type %d\n", attr->qp_type); |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| memcpy(&attr_ex, attr, sizeof(*attr)); |
| attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; |
| attr_ex.pd = ibvpd; |
| |
| ibvqp = create_qp(ibvpd->context, &attr_ex, NULL); |
| if (ibvqp) |
| memcpy(attr, &attr_ex, sizeof(*attr)); |
| |
| return ibvqp; |
| } |
| |
| struct ibv_qp *efa_create_qp_ex(struct ibv_context *ibvctx, |
| struct ibv_qp_init_attr_ex *attr_ex) |
| { |
| if (attr_ex->qp_type != IBV_QPT_UD) { |
| verbs_err(verbs_get_ctx(ibvctx), "Unsupported QP type\n"); |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| return create_qp(ibvctx, attr_ex, NULL); |
| } |
| |
| struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd, |
| struct ibv_qp_init_attr *attr, |
| uint32_t driver_qp_type) |
| { |
| struct ibv_qp_init_attr_ex attr_ex = {}; |
| struct efadv_qp_init_attr efa_attr = {}; |
| struct ibv_qp *ibvqp; |
| |
| if (!is_efa_dev(ibvpd->context->device)) { |
| verbs_err(verbs_get_ctx(ibvpd->context), "Not an EFA device\n"); |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| if (attr->qp_type != IBV_QPT_DRIVER) { |
| verbs_err(verbs_get_ctx(ibvpd->context), |
| "QP type not IBV_QPT_DRIVER\n"); |
| errno = EINVAL; |
| return NULL; |
| } |
| |
| memcpy(&attr_ex, attr, sizeof(*attr)); |
| attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; |
| attr_ex.pd = ibvpd; |
| efa_attr.driver_qp_type = driver_qp_type; |
| |
| ibvqp = create_qp(ibvpd->context, &attr_ex, &efa_attr); |
| if (ibvqp) |
| memcpy(attr, &attr_ex, sizeof(*attr)); |
| |
| return ibvqp; |
| } |
| |
| struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx, |
| struct ibv_qp_init_attr_ex *attr_ex, |
| struct efadv_qp_init_attr *efa_attr, |
| uint32_t inlen) |
| { |
| if (!is_efa_dev(ibvctx->device)) { |
| verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n"); |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| if (attr_ex->qp_type != IBV_QPT_DRIVER || |
| !vext_field_avail(struct efadv_qp_init_attr, |
| driver_qp_type, inlen) || |
| efa_attr->comp_mask || |
| !is_reserved_cleared(efa_attr->reserved) || |
| (inlen > sizeof(*efa_attr) && !is_ext_cleared(efa_attr, inlen))) { |
| verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); |
| errno = EINVAL; |
| return NULL; |
| } |
| |
| return create_qp(ibvctx, attr_ex, efa_attr); |
| } |
| |
| int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, |
| int attr_mask) |
| { |
| struct efa_qp *qp = to_efa_qp(ibvqp); |
| struct ibv_modify_qp cmd = {}; |
| int err; |
| |
| err = ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof(cmd)); |
| if (err) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "Failed to modify QP[%u]\n", qp->verbs_qp.qp.qp_num); |
| return err; |
| } |
| |
| if (attr_mask & IBV_QP_STATE) { |
| qp->verbs_qp.qp.state = attr->qp_state; |
| /* transition to reset */ |
| if (qp->verbs_qp.qp.state == IBV_QPS_RESET) |
| efa_qp_init_indices(qp); |
| } |
| |
| return 0; |
| } |
| |
| int efa_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, |
| int attr_mask, struct ibv_qp_init_attr *init_attr) |
| { |
| struct ibv_query_qp cmd; |
| |
| return ibv_cmd_query_qp(ibvqp, attr, attr_mask, init_attr, |
| &cmd, sizeof(cmd)); |
| } |
| |
| int efa_destroy_qp(struct ibv_qp *ibvqp) |
| { |
| struct efa_context *ctx = to_efa_context(ibvqp->context); |
| struct efa_qp *qp = to_efa_qp(ibvqp); |
| int err; |
| |
| err = ibv_cmd_destroy_qp(ibvqp); |
| if (err) { |
| verbs_err(&ctx->ibvctx, "Failed to destroy QP[%u]\n", |
| ibvqp->qp_num); |
| return err; |
| } |
| |
| pthread_spin_lock(&ctx->qp_table_lock); |
| efa_lock_cqs(ibvqp); |
| |
| efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->send_cq), qp->sq.wq.sub_cq_idx); |
| efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->recv_cq), qp->rq.wq.sub_cq_idx); |
| |
| ctx->qp_table[ibvqp->qp_num & ctx->qp_table_sz_m1] = NULL; |
| |
| efa_unlock_cqs(ibvqp); |
| pthread_spin_unlock(&ctx->qp_table_lock); |
| |
| efa_sq_terminate(qp); |
| efa_rq_terminate(qp); |
| |
| free(qp); |
| return 0; |
| } |
| |
| static void efa_set_tx_buf(struct efa_io_tx_buf_desc *tx_buf, |
| uint64_t addr, uint32_t lkey, |
| uint32_t length) |
| { |
| tx_buf->length = length; |
| EFA_SET(&tx_buf->lkey, EFA_IO_TX_BUF_DESC_LKEY, lkey); |
| tx_buf->buf_addr_lo = addr & 0xffffffff; |
| tx_buf->buf_addr_hi = addr >> 32; |
| } |
| |
| static void efa_post_send_sgl(struct efa_io_tx_buf_desc *tx_bufs, |
| const struct ibv_sge *sg_list, |
| int num_sge) |
| { |
| const struct ibv_sge *sge; |
| size_t i; |
| |
| for (i = 0; i < num_sge; i++) { |
| sge = &sg_list[i]; |
| efa_set_tx_buf(&tx_bufs[i], sge->addr, sge->lkey, sge->length); |
| } |
| } |
| |
| static void efa_post_send_inline_data(const struct ibv_send_wr *wr, |
| struct efa_io_tx_wqe *tx_wqe) |
| { |
| const struct ibv_sge *sgl = wr->sg_list; |
| uint32_t total_length = 0; |
| uint32_t length; |
| size_t i; |
| |
| for (i = 0; i < wr->num_sge; i++) { |
| length = sgl[i].length; |
| |
| memcpy(tx_wqe->data.inline_data + total_length, |
| (void *)(uintptr_t)sgl[i].addr, length); |
| total_length += length; |
| } |
| |
| EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); |
| tx_wqe->meta.length = total_length; |
| } |
| |
| static size_t efa_sge_total_bytes(const struct ibv_sge *sg_list, int num_sge) |
| { |
| size_t bytes = 0; |
| size_t i; |
| |
| for (i = 0; i < num_sge; i++) |
| bytes += sg_list[i].length; |
| |
| return bytes; |
| } |
| |
| static size_t efa_buf_list_total_bytes(const struct ibv_data_buf *buf_list, |
| size_t num_buf) |
| { |
| size_t bytes = 0; |
| size_t i; |
| |
| for (i = 0; i < num_buf; i++) |
| bytes += buf_list[i].length; |
| |
| return bytes; |
| } |
| |
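| /* Account for a newly built WQE: advance the producer counter and flip |
| * the SQ phase whenever the ring wraps around. |
| */ |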
| static void efa_sq_advance_post_idx(struct efa_sq *sq) |
| { |
| struct efa_wq *wq = &sq->wq; |
| |
| wq->wqe_posted++; |
| wq->pc++; |
| |
| if (!(wq->pc & wq->desc_mask)) |
| wq->phase++; |
| } |
| |
| static inline void efa_rq_ring_doorbell(struct efa_rq *rq, uint16_t pc) |
| { |
| udma_to_device_barrier(); |
| mmio_write32(rq->wq.db, pc); |
| } |
| |
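| /* |
| * Unlike efa_rq_ring_doorbell() above, no barrier is issued here: SQ |
| * callers flush the write-combined WQE writes with mmio_flush_writes() |
| * right before ringing the doorbell. |
| */ |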
| static inline void efa_sq_ring_doorbell(struct efa_sq *sq, uint16_t pc) |
| { |
| mmio_write32(sq->wq.db, pc); |
| } |
| |
| static void efa_set_common_ctrl_flags(struct efa_io_tx_meta_desc *desc, |
| struct efa_sq *sq, |
| enum efa_io_send_op_type op_type) |
| { |
| EFA_SET(&desc->ctrl1, EFA_IO_TX_META_DESC_META_DESC, 1); |
| EFA_SET(&desc->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE, op_type); |
| EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_PHASE, sq->wq.phase); |
| EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_FIRST, 1); |
| EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_LAST, 1); |
| EFA_SET(&desc->ctrl2, EFA_IO_TX_META_DESC_COMP_REQ, 1); |
| } |
| |
| static int efa_post_send_validate(struct efa_qp *qp, |
| unsigned int wr_flags) |
| { |
| if (unlikely(qp->verbs_qp.qp.state != IBV_QPS_RTS && |
| qp->verbs_qp.qp.state != IBV_QPS_SQD)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] is in invalid state\n", |
| qp->verbs_qp.qp.qp_num); |
| return EINVAL; |
| } |
| |
| if (unlikely(!(wr_flags & IBV_SEND_SIGNALED) && !qp->sq_sig_all)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] Non signaled WRs not supported\n", |
| qp->verbs_qp.qp.qp_num); |
| return EINVAL; |
| } |
| |
| if (unlikely(wr_flags & ~(IBV_SEND_SIGNALED | IBV_SEND_INLINE))) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] Unsupported wr_flags[%#x] supported[%#x]\n", |
| qp->verbs_qp.qp.qp_num, wr_flags, |
| IBV_SEND_SIGNALED | IBV_SEND_INLINE); |
| return EINVAL; |
| } |
| |
| if (unlikely(qp->sq.wq.wqe_posted - qp->sq.wq.wqe_completed == |
| qp->sq.wq.wqe_cnt)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] is full wqe_posted[%u] wqe_completed[%u] wqe_cnt[%u]\n", |
| qp->verbs_qp.qp.qp_num, qp->sq.wq.wqe_posted, |
| qp->sq.wq.wqe_completed, qp->sq.wq.wqe_cnt); |
| return ENOMEM; |
| } |
| |
| return 0; |
| } |
| |
| static int efa_post_send_validate_wr(struct efa_qp *qp, |
| const struct ibv_send_wr *wr) |
| { |
| int err; |
| |
| err = efa_post_send_validate(qp, wr->send_flags); |
| if (unlikely(err)) |
| return err; |
| |
| if (unlikely(wr->opcode != IBV_WR_SEND && |
| wr->opcode != IBV_WR_SEND_WITH_IMM)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] unsupported opcode %d\n", |
| qp->verbs_qp.qp.qp_num, wr->opcode); |
| return EINVAL; |
| } |
| |
| if (wr->send_flags & IBV_SEND_INLINE) { |
| if (unlikely(efa_sge_total_bytes(wr->sg_list, wr->num_sge) > |
| qp->sq.max_inline_data)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] WR total bytes %zu > %zu\n", |
| qp->verbs_qp.qp.qp_num, |
| efa_sge_total_bytes(wr->sg_list, |
| wr->num_sge), |
| qp->sq.max_inline_data); |
| return EINVAL; |
| } |
| } else { |
| if (unlikely(wr->num_sge > qp->sq.wq.max_sge)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] WR num_sge %d > %d\n", |
| qp->verbs_qp.qp.qp_num, wr->num_sge, |
| qp->sq.wq.max_sge); |
| return EINVAL; |
| } |
| } |
| |
| return 0; |
| } |
| |
| int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, |
| struct ibv_send_wr **bad) |
| { |
| struct efa_io_tx_meta_desc *meta_desc; |
| struct efa_qp *qp = to_efa_qp(ibvqp); |
| struct efa_io_tx_wqe tx_wqe; |
| struct efa_sq *sq = &qp->sq; |
| struct efa_wq *wq = &sq->wq; |
| uint32_t sq_desc_offset; |
| uint32_t curbatch = 0; |
| struct efa_ah *ah; |
| int err = 0; |
| |
| mmio_wc_spinlock(&wq->wqlock); |
| while (wr) { |
| err = efa_post_send_validate_wr(qp, wr); |
| if (err) { |
| *bad = wr; |
| goto ring_db; |
| } |
| |
| memset(&tx_wqe, 0, sizeof(tx_wqe)); |
| meta_desc = &tx_wqe.meta; |
| ah = to_efa_ah(wr->wr.ud.ah); |
| |
| if (wr->send_flags & IBV_SEND_INLINE) { |
| efa_post_send_inline_data(wr, &tx_wqe); |
| } else { |
| meta_desc->length = wr->num_sge; |
| efa_post_send_sgl(tx_wqe.data.sgl, wr->sg_list, |
| wr->num_sge); |
| } |
| |
| if (wr->opcode == IBV_WR_SEND_WITH_IMM) { |
| meta_desc->immediate_data = be32toh(wr->imm_data); |
| EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, |
| 1); |
| } |
| |
| /* Set rest of the descriptor fields */ |
| efa_set_common_ctrl_flags(meta_desc, sq, EFA_IO_SEND); |
| meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(wq, |
| wr->wr_id); |
| meta_desc->dest_qp_num = wr->wr.ud.remote_qpn; |
| meta_desc->ah = ah->efa_ah; |
| meta_desc->qkey = wr->wr.ud.remote_qkey; |
| |
| /* Copy descriptor */ |
| sq_desc_offset = (wq->pc & wq->desc_mask) * |
| sizeof(tx_wqe); |
| mmio_memcpy_x64(sq->desc + sq_desc_offset, &tx_wqe, |
| sizeof(tx_wqe)); |
| |
| /* advance index and change phase */ |
| efa_sq_advance_post_idx(sq); |
| curbatch++; |
| |
| if (curbatch == sq->max_batch_wr) { |
| curbatch = 0; |
| mmio_flush_writes(); |
| efa_sq_ring_doorbell(sq, wq->pc); |
| mmio_wc_start(); |
| } |
| |
| wr = wr->next; |
| } |
| |
| ring_db: |
| if (curbatch) { |
| mmio_flush_writes(); |
| efa_sq_ring_doorbell(sq, wq->pc); |
| } |
| |
| /* |
| * Not using mmio_wc_spinunlock as the doorbell write should be done |
| * inside the lock. |
| */ |
| pthread_spin_unlock(&wq->wqlock); |
| return err; |
| } |
| |
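| /* |
| * Common path for the extended post-send API: stage a new WQE in the |
| * host-memory local queue. efa_send_wr_complete() later copies the staged |
| * WQEs to the device ring and rings the doorbell. |
| */ |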
| static int efa_send_wr_common(struct ibv_qp_ex *ibvqpx, |
| enum efa_io_send_op_type op_type) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_sq *sq = &qp->sq; |
| struct efa_io_tx_meta_desc *meta_desc; |
| int err; |
| |
| if (unlikely(qp->wr_session_err)) |
| return qp->wr_session_err; |
| |
| err = efa_post_send_validate(qp, ibvqpx->wr_flags); |
| if (unlikely(err)) { |
| qp->wr_session_err = err; |
| return err; |
| } |
| |
| sq->curr_tx_wqe = (struct efa_io_tx_wqe *)sq->local_queue + |
| sq->num_wqe_pending; |
| memset(sq->curr_tx_wqe, 0, sizeof(*sq->curr_tx_wqe)); |
| |
| meta_desc = &sq->curr_tx_wqe->meta; |
| efa_set_common_ctrl_flags(meta_desc, sq, op_type); |
| meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(&sq->wq, |
| ibvqpx->wr_id); |
| |
| /* advance index and change phase */ |
| efa_sq_advance_post_idx(sq); |
| sq->num_wqe_pending++; |
| |
| return 0; |
| } |
| |
| static void efa_send_wr_send(struct ibv_qp_ex *ibvqpx) |
| { |
| efa_send_wr_common(ibvqpx, EFA_IO_SEND); |
| } |
| |
| static void efa_send_wr_send_imm(struct ibv_qp_ex *ibvqpx, __be32 imm_data) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_io_tx_meta_desc *meta_desc; |
| int err; |
| |
| err = efa_send_wr_common(ibvqpx, EFA_IO_SEND); |
| if (unlikely(err)) |
| return; |
| |
| meta_desc = &qp->sq.curr_tx_wqe->meta; |
| meta_desc->immediate_data = be32toh(imm_data); |
| EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); |
| } |
| |
| static void efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, |
| uint64_t remote_addr) |
| { |
| struct efa_io_remote_mem_addr *remote_mem; |
| struct efa_sq *sq = &to_efa_qp_ex(ibvqpx)->sq; |
| struct efa_io_tx_wqe *tx_wqe; |
| int err; |
| |
| err = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_READ); |
| if (unlikely(err)) |
| return; |
| |
| tx_wqe = sq->curr_tx_wqe; |
| remote_mem = &tx_wqe->data.rdma_req.remote_mem; |
| remote_mem->rkey = rkey; |
| remote_mem->buf_addr_lo = remote_addr & 0xFFFFFFFF; |
| remote_mem->buf_addr_hi = remote_addr >> 32; |
| } |
| |
| static void efa_send_wr_set_sge(struct ibv_qp_ex *ibvqpx, uint32_t lkey, |
| uint64_t addr, uint32_t length) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_io_tx_buf_desc *buf; |
| struct efa_io_tx_wqe *tx_wqe; |
| uint8_t op_type; |
| |
| if (unlikely(qp->wr_session_err)) |
| return; |
| |
| tx_wqe = qp->sq.curr_tx_wqe; |
| tx_wqe->meta.length = 1; |
| |
| op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); |
| switch (op_type) { |
| case EFA_IO_SEND: |
| buf = &tx_wqe->data.sgl[0]; |
| break; |
| case EFA_IO_RDMA_READ: |
| tx_wqe->data.rdma_req.remote_mem.length = length; |
| buf = &tx_wqe->data.rdma_req.local_mem[0]; |
| break; |
| default: |
| return; |
| } |
| |
| efa_set_tx_buf(buf, addr, lkey, length); |
| } |
| |
| static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, |
| const struct ibv_sge *sg_list) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_io_rdma_req *rdma_req; |
| struct efa_io_tx_wqe *tx_wqe; |
| struct efa_sq *sq = &qp->sq; |
| uint8_t op_type; |
| |
| if (unlikely(qp->wr_session_err)) |
| return; |
| |
| tx_wqe = sq->curr_tx_wqe; |
| op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); |
| switch (op_type) { |
| case EFA_IO_SEND: |
| if (unlikely(num_sge > sq->wq.max_sge)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] num_sge[%zu] > max_sge[%u]\n", |
| ibvqpx->qp_base.qp_num, num_sge, |
| sq->wq.max_sge); |
| qp->wr_session_err = EINVAL; |
| return; |
| } |
| efa_post_send_sgl(tx_wqe->data.sgl, sg_list, num_sge); |
| break; |
| case EFA_IO_RDMA_READ: |
| if (unlikely(num_sge > sq->max_wr_rdma_sge)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] num_sge[%zu] > max_rdma_sge[%zu]\n", |
| ibvqpx->qp_base.qp_num, num_sge, |
| sq->max_wr_rdma_sge); |
| qp->wr_session_err = EINVAL; |
| return; |
| } |
| rdma_req = &tx_wqe->data.rdma_req; |
| rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, |
| num_sge); |
| efa_post_send_sgl(rdma_req->local_mem, sg_list, num_sge); |
| break; |
| default: |
| return; |
| } |
| |
| tx_wqe->meta.length = num_sge; |
| } |
| |
| static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, |
| size_t length) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; |
| |
| if (unlikely(qp->wr_session_err)) |
| return; |
| |
| if (unlikely(length > qp->sq.max_inline_data)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] WR inline length %zu > %zu\n", |
| ibvqpx->qp_base.qp_num, length, |
| qp->sq.max_inline_data); |
| qp->wr_session_err = EINVAL; |
| return; |
| } |
| |
| EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); |
| memcpy(tx_wqe->data.inline_data, addr, length); |
| tx_wqe->meta.length = length; |
| } |
| |
| static void |
| efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, |
| size_t num_buf, |
| const struct ibv_data_buf *buf_list) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; |
| uint32_t total_length = 0; |
| uint32_t length; |
| size_t i; |
| |
| if (unlikely(qp->wr_session_err)) |
| return; |
| |
| if (unlikely(efa_buf_list_total_bytes(buf_list, num_buf) > |
| qp->sq.max_inline_data)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] WR inline length %zu > %zu\n", |
| ibvqpx->qp_base.qp_num, |
| efa_buf_list_total_bytes(buf_list, num_buf), |
| qp->sq.max_inline_data); |
| qp->wr_session_err = EINVAL; |
| return; |
| } |
| |
| for (i = 0; i < num_buf; i++) { |
| length = buf_list[i].length; |
| |
| memcpy(tx_wqe->data.inline_data + total_length, |
| buf_list[i].addr, length); |
| total_length += length; |
| } |
| |
| EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); |
| tx_wqe->meta.length = total_length; |
| } |
| |
| static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, |
| struct ibv_ah *ibvah, |
| uint32_t remote_qpn, uint32_t remote_qkey) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_ah *ah = to_efa_ah(ibvah); |
| struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; |
| |
| if (unlikely(qp->wr_session_err)) |
| return; |
| |
| tx_wqe->meta.dest_qp_num = remote_qpn; |
| tx_wqe->meta.ah = ah->efa_ah; |
| tx_wqe->meta.qkey = remote_qkey; |
| } |
| |
| static void efa_send_wr_start(struct ibv_qp_ex *ibvqpx) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_sq *sq = &qp->sq; |
| |
| mmio_wc_spinlock(&qp->sq.wq.wqlock); |
| qp->wr_session_err = 0; |
| sq->num_wqe_pending = 0; |
| sq->phase_rb = qp->sq.wq.phase; |
| } |
| |
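| /* Undo the producer-side state of WQEs that were staged in the local |
| * queue but never written to the device (failed or aborted WR session). |
| */ |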
| static inline void efa_sq_roll_back(struct efa_sq *sq) |
| { |
| struct efa_qp *qp = container_of(sq, struct efa_qp, sq); |
| struct efa_wq *wq = &sq->wq; |
| |
| verbs_debug(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "SQ[%u] Rollback num_wqe_pending = %u\n", |
| qp->verbs_qp.qp.qp_num, sq->num_wqe_pending); |
| wq->wqe_posted -= sq->num_wqe_pending; |
| wq->pc -= sq->num_wqe_pending; |
| wq->wrid_idx_pool_next -= sq->num_wqe_pending; |
| wq->phase = sq->phase_rb; |
| } |
| |
| static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx) |
| { |
| struct efa_qp *qp = to_efa_qp_ex(ibvqpx); |
| struct efa_sq *sq = &qp->sq; |
| uint32_t max_txbatch = sq->max_batch_wr; |
| uint32_t num_wqe_to_copy; |
| uint16_t local_idx = 0; |
| uint16_t curbatch = 0; |
| uint16_t sq_desc_idx; |
| uint16_t pc; |
| |
| if (unlikely(qp->wr_session_err)) { |
| efa_sq_roll_back(sq); |
| goto out; |
| } |
| |
| /* |
| * Copy local queue to device in chunks, handling wraparound and max |
| * doorbell batch. |
| */ |
| pc = sq->wq.pc - sq->num_wqe_pending; |
| sq_desc_idx = pc & sq->wq.desc_mask; |
| |
| /* mmio_wc_start() comes from efa_send_wr_start() */ |
| while (sq->num_wqe_pending) { |
| num_wqe_to_copy = min3(sq->num_wqe_pending, |
| sq->wq.wqe_cnt - sq_desc_idx, |
| max_txbatch - curbatch); |
| mmio_memcpy_x64((struct efa_io_tx_wqe *)sq->desc + |
| sq_desc_idx, |
| (struct efa_io_tx_wqe *)sq->local_queue + |
| local_idx, |
| num_wqe_to_copy * sizeof(struct efa_io_tx_wqe)); |
| |
| sq->num_wqe_pending -= num_wqe_to_copy; |
| local_idx += num_wqe_to_copy; |
| curbatch += num_wqe_to_copy; |
| pc += num_wqe_to_copy; |
| sq_desc_idx = (sq_desc_idx + num_wqe_to_copy) & |
| sq->wq.desc_mask; |
| |
| if (curbatch == max_txbatch) { |
| mmio_flush_writes(); |
| efa_sq_ring_doorbell(sq, pc); |
| curbatch = 0; |
| mmio_wc_start(); |
| } |
| } |
| |
| if (curbatch) { |
| mmio_flush_writes(); |
| efa_sq_ring_doorbell(sq, sq->wq.pc); |
| } |
| out: |
| /* |
| * Not using mmio_wc_spinunlock as the doorbell write should be done |
| * inside the lock. |
| */ |
| pthread_spin_unlock(&sq->wq.wqlock); |
| |
| return qp->wr_session_err; |
| } |
| |
| static void efa_send_wr_abort(struct ibv_qp_ex *ibvqpx) |
| { |
| struct efa_sq *sq = &to_efa_qp_ex(ibvqpx)->sq; |
| |
| efa_sq_roll_back(sq); |
| pthread_spin_unlock(&sq->wq.wqlock); |
| } |
| |
| static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, |
| struct ibv_qp_init_attr_ex *attr_ex) |
| { |
| ibvqpx->wr_start = efa_send_wr_start; |
| ibvqpx->wr_complete = efa_send_wr_complete; |
| ibvqpx->wr_abort = efa_send_wr_abort; |
| |
| if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND) |
| ibvqpx->wr_send = efa_send_wr_send; |
| |
| if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND_WITH_IMM) |
| ibvqpx->wr_send_imm = efa_send_wr_send_imm; |
| |
| if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_READ) |
| ibvqpx->wr_rdma_read = efa_send_wr_rdma_read; |
| |
| ibvqpx->wr_set_inline_data = efa_send_wr_set_inline_data; |
| ibvqpx->wr_set_inline_data_list = efa_send_wr_set_inline_data_list; |
| ibvqpx->wr_set_sge = efa_send_wr_set_sge; |
| ibvqpx->wr_set_sge_list = efa_send_wr_set_sge_list; |
| ibvqpx->wr_set_ud_addr = efa_send_wr_set_addr; |
| } |
| |
| static int efa_post_recv_validate(struct efa_qp *qp, struct ibv_recv_wr *wr) |
| { |
| if (unlikely(qp->verbs_qp.qp.state == IBV_QPS_RESET || |
| qp->verbs_qp.qp.state == IBV_QPS_ERR)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "RQ[%u] Invalid QP state\n", |
| qp->verbs_qp.qp.qp_num); |
| return EINVAL; |
| } |
| |
| if (unlikely(wr->num_sge > qp->rq.wq.max_sge)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "RQ[%u] WR num_sge %d > %d\n", |
| qp->verbs_qp.qp.qp_num, wr->num_sge, |
| qp->rq.wq.max_sge); |
| return EINVAL; |
| } |
| |
| if (unlikely(qp->rq.wq.wqe_posted - qp->rq.wq.wqe_completed == |
| qp->rq.wq.wqe_cnt)) { |
| verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), |
| "RQ[%u] is full wqe_posted[%u] wqe_completed[%u] wqe_cnt[%u]\n", |
| qp->verbs_qp.qp.qp_num, qp->rq.wq.wqe_posted, |
| qp->rq.wq.wqe_completed, qp->rq.wq.wqe_cnt); |
| return ENOMEM; |
| } |
| |
| return 0; |
| } |
| |
| int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr, |
| struct ibv_recv_wr **bad) |
| { |
| struct efa_qp *qp = to_efa_qp(ibvqp); |
| struct efa_wq *wq = &qp->rq.wq; |
| struct efa_io_rx_desc rx_buf; |
| uint32_t rq_desc_offset; |
| uintptr_t addr; |
| int err = 0; |
| size_t i; |
| |
| pthread_spin_lock(&wq->wqlock); |
| while (wr) { |
| err = efa_post_recv_validate(qp, wr); |
| if (err) { |
| *bad = wr; |
| goto ring_db; |
| } |
| |
| memset(&rx_buf, 0, sizeof(rx_buf)); |
| |
| rx_buf.req_id = efa_wq_get_next_wrid_idx_locked(wq, wr->wr_id); |
| wq->wqe_posted++; |
| |
| /* Default init of the rx buffer */ |
| EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_FIRST, 1); |
| EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LAST, 0); |
| |
| for (i = 0; i < wr->num_sge; i++) { |
| /* Set the last-descriptor indication if needed */ |
| if (i == wr->num_sge - 1) |
| EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LAST, |
| 1); |
| |
| addr = wr->sg_list[i].addr; |
| |
| /* Set RX buffer desc from SGE */ |
| rx_buf.length = min_t(uint32_t, wr->sg_list[i].length, UINT16_MAX); |
| EFA_SET(&rx_buf.lkey_ctrl, EFA_IO_RX_DESC_LKEY, |
| wr->sg_list[i].lkey); |
| rx_buf.buf_addr_lo = addr; |
| rx_buf.buf_addr_hi = (uint64_t)addr >> 32; |
| |
| /* Copy descriptor to RX ring */ |
| rq_desc_offset = (wq->pc & wq->desc_mask) * |
| sizeof(rx_buf); |
| memcpy(qp->rq.buf + rq_desc_offset, &rx_buf, sizeof(rx_buf)); |
| |
| /* Advance the producer counter, flipping the phase on wraparound */ |
| wq->pc++; |
| if (!(wq->pc & wq->desc_mask)) |
| wq->phase++; |
| |
| /* reset descriptor for next iov */ |
| memset(&rx_buf, 0, sizeof(rx_buf)); |
| } |
| wr = wr->next; |
| } |
| |
| ring_db: |
| efa_rq_ring_doorbell(&qp->rq, wq->pc); |
| |
| pthread_spin_unlock(&wq->wqlock); |
| return err; |
| } |
| |
| int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr, |
| uint32_t inlen) |
| { |
| uint64_t comp_mask_out = 0; |
| |
| if (!is_efa_dev(ibvah->context->device)) { |
| verbs_err(verbs_get_ctx(ibvah->context), "Not an EFA device\n"); |
| return EOPNOTSUPP; |
| } |
| |
| if (!vext_field_avail(typeof(*attr), ahn, inlen)) { |
| verbs_err(verbs_get_ctx(ibvah->context), |
| "Compatibility issues\n"); |
| return EINVAL; |
| } |
| |
| memset(attr, 0, inlen); |
| attr->ahn = to_efa_ah(ibvah)->efa_ah; |
| |
| attr->comp_mask = comp_mask_out; |
| |
| return 0; |
| } |
| |
| struct ibv_ah *efa_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr) |
| { |
| struct efa_create_ah_resp resp = {}; |
| struct efa_ah *ah; |
| int err; |
| |
| ah = calloc(1, sizeof(*ah)); |
| if (!ah) |
| return NULL; |
| |
| err = ibv_cmd_create_ah(ibvpd, &ah->ibvah, attr, |
| &resp.ibv_resp, sizeof(resp)); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvpd->context), |
| "Failed to create AH\n"); |
| free(ah); |
| errno = err; |
| return NULL; |
| } |
| |
| ah->efa_ah = resp.efa_address_handle; |
| |
| return &ah->ibvah; |
| } |
| |
| int efa_destroy_ah(struct ibv_ah *ibvah) |
| { |
| struct efa_ah *ah; |
| int err; |
| |
| ah = to_efa_ah(ibvah); |
| err = ibv_cmd_destroy_ah(ibvah); |
| if (err) { |
| verbs_err(verbs_get_ctx(ibvah->context), |
| "Failed to destroy AH\n"); |
| return err; |
| } |
| free(ah); |
| |
| return 0; |
| } |