| // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause |
| |
| // Authors: Cheng Xu <chengyou@linux.alibaba.com> |
| // Copyright (c) 2020-2021, Alibaba Group. |
| // Authors: Bernard Metzler <bmt@zurich.ibm.com> |
| // Copyright (c) 2008-2019, IBM Corporation |
| |
| #include <assert.h> |
| #include <ccan/minmax.h> |
| #include <endian.h> |
| #include <errno.h> |
| #include <pthread.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/mman.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| #include <util/mmio.h> |
| #include <util/udma_barrier.h> |
| #include <util/util.h> |
| |
| #include "erdma.h" |
| #include "erdma_abi.h" |
| #include "erdma_db.h" |
| #include "erdma_hw.h" |
| #include "erdma_verbs.h" |
| |
| int erdma_query_device(struct ibv_context *ctx, |
| const struct ibv_query_device_ex_input *input, |
| struct ibv_device_attr_ex *attr, size_t attr_size) |
| { |
| struct ib_uverbs_ex_query_device_resp resp; |
| unsigned int major, minor, sub_minor; |
| size_t resp_size = sizeof(resp); |
| uint64_t raw_fw_ver; |
| int rv; |
| |
| rv = ibv_cmd_query_device_any(ctx, input, attr, attr_size, &resp, |
| &resp_size); |
| if (rv) |
| return rv; |
| |
| raw_fw_ver = resp.base.fw_ver; |
| major = (raw_fw_ver >> 32) & 0xffff; |
| minor = (raw_fw_ver >> 16) & 0xffff; |
| sub_minor = raw_fw_ver & 0xffff; |
| |
| snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver), |
| "%u.%u.%u", major, minor, sub_minor); |
| |
| return 0; |
| } |
| |
| int erdma_query_port(struct ibv_context *ctx, uint8_t port, |
| struct ibv_port_attr *attr) |
| { |
| struct ibv_query_port cmd = {}; |
| |
| return ibv_cmd_query_port(ctx, port, attr, &cmd, sizeof(cmd)); |
| } |
| |
| int erdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, |
| struct ibv_qp_init_attr *init_attr) |
| { |
| struct ibv_query_qp cmd = {}; |
| |
| return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, |
| sizeof(cmd)); |
| } |
| |
| struct ibv_pd *erdma_alloc_pd(struct ibv_context *ctx) |
| { |
| struct ib_uverbs_alloc_pd_resp resp; |
| struct ibv_alloc_pd cmd = {}; |
| struct ibv_pd *pd; |
| |
| pd = calloc(1, sizeof(*pd)); |
| if (!pd) |
| return NULL; |
| |
| if (ibv_cmd_alloc_pd(ctx, pd, &cmd, sizeof(cmd), &resp, sizeof(resp))) { |
| free(pd); |
| return NULL; |
| } |
| |
| return pd; |
| } |
| |
| int erdma_free_pd(struct ibv_pd *pd) |
| { |
| int rv; |
| |
| rv = ibv_cmd_dealloc_pd(pd); |
| if (rv) |
| return rv; |
| |
| free(pd); |
| return 0; |
| } |
| |
| struct ibv_mr *erdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len, |
| uint64_t hca_va, int access) |
| { |
| struct ib_uverbs_reg_mr_resp resp; |
| struct ibv_reg_mr cmd; |
| struct verbs_mr *vmr; |
| int ret; |
| |
| vmr = calloc(1, sizeof(*vmr)); |
| if (!vmr) |
| return NULL; |
| |
| ret = ibv_cmd_reg_mr(pd, addr, len, hca_va, access, vmr, &cmd, |
| sizeof(cmd), &resp, sizeof(resp)); |
| if (ret) { |
| free(vmr); |
| return NULL; |
| } |
| |
| return &vmr->ibv_mr; |
| } |
| |
| int erdma_dereg_mr(struct verbs_mr *vmr) |
| { |
| int ret; |
| |
| ret = ibv_cmd_dereg_mr(vmr); |
| if (ret) |
| return ret; |
| |
| free(vmr); |
| return 0; |
| } |
| |
| int erdma_notify_cq(struct ibv_cq *ibcq, int solicited) |
| { |
| struct erdma_cq *cq = to_ecq(ibcq); |
| uint64_t db_data; |
| int ret; |
| |
| ret = pthread_spin_lock(&cq->lock); |
| if (ret) |
| return ret; |
| |
| db_data = FIELD_PREP(ERDMA_CQDB_IDX_MASK, cq->db_index) | |
| FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->id) | |
| FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | |
| FIELD_PREP(ERDMA_CQDB_SOL_MASK, solicited) | |
| FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->cmdsn) | |
| FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->ci); |
| |
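| /* |
|  * Publish the armed value to the CQ doorbell record first; the write |
|  * barrier below orders that update ahead of the MMIO doorbell write. |
|  */ |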
| *(__le64 *)cq->db_record = htole64(db_data); |
| cq->db_index++; |
| udma_to_device_barrier(); |
| mmio_write64_le(cq->db, htole64(db_data)); |
| |
| pthread_spin_unlock(&cq->lock); |
| |
| return ret; |
| } |
| |
| struct ibv_cq *erdma_create_cq(struct ibv_context *ctx, int num_cqe, |
| struct ibv_comp_channel *channel, |
| int comp_vector) |
| { |
| struct erdma_context *ectx = to_ectx(ctx); |
| struct erdma_cmd_create_cq_resp resp = {}; |
| struct erdma_cmd_create_cq cmd = {}; |
| uint64_t *db_records = NULL; |
| struct erdma_cq *cq; |
| size_t cq_size; |
| int rv; |
| |
| cq = calloc(1, sizeof(*cq)); |
| if (!cq) |
| return NULL; |
| |
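| /* |
|  * The CQE ring is clamped to at least 64 entries, rounded up to a |
|  * power of two, and page aligned/sized; the buffer address and length |
|  * are handed to the kernel below (qbuf_va/qbuf_len), presumably so it |
|  * can be pinned and mapped for device access. |
|  */ |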
| if (num_cqe < 64) |
| num_cqe = 64; |
| |
| num_cqe = roundup_pow_of_two(num_cqe); |
| cq_size = align(num_cqe * sizeof(struct erdma_cqe), ERDMA_PAGE_SIZE); |
| |
| rv = posix_memalign((void **)&cq->queue, ERDMA_PAGE_SIZE, cq_size); |
| if (rv) { |
| errno = rv; |
| free(cq); |
| return NULL; |
| } |
| |
| rv = ibv_dontfork_range(cq->queue, cq_size); |
| if (rv) { |
| errno = rv; |
| free(cq->queue); |
| cq->queue = NULL; |
| goto error_alloc; |
| } |
| |
| memset(cq->queue, 0, cq_size); |
| |
| db_records = erdma_alloc_dbrecords(ectx); |
| if (!db_records) { |
| errno = ENOMEM; |
| goto error_alloc; |
| } |
| |
| cmd.db_record_va = (uintptr_t)db_records; |
| cmd.qbuf_va = (uintptr_t)cq->queue; |
| cmd.qbuf_len = cq_size; |
| |
| rv = ibv_cmd_create_cq(ctx, num_cqe, channel, comp_vector, &cq->base_cq, |
| &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, |
| sizeof(resp)); |
| if (rv) { |
| errno = EIO; |
| goto error_alloc; |
| } |
| |
| pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); |
| |
| *db_records = 0; |
| cq->db_record = db_records; |
| |
| cq->id = resp.cq_id; |
| cq->depth = resp.num_cqe; |
| |
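| /* |
|  * The mapped CQ doorbell page holds ERDMA_PAGE_SIZE / ERDMA_CQDB_SIZE |
|  * doorbell slots; the low bits of the CQ id select this CQ's slot. |
|  */ |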
| cq->db = ectx->cdb; |
| cq->db_offset = (cq->id & (ERDMA_PAGE_SIZE / ERDMA_CQDB_SIZE - 1)) * |
| ERDMA_CQDB_SIZE; |
| cq->db += cq->db_offset; |
| |
| cq->comp_vector = comp_vector; |
| |
| return &cq->base_cq; |
| |
| error_alloc: |
| if (db_records) |
| erdma_dealloc_dbrecords(ectx, db_records); |
| |
| if (cq->queue) { |
| ibv_dofork_range(cq->queue, cq_size); |
| free(cq->queue); |
| } |
| |
| free(cq); |
| |
| return NULL; |
| } |
| |
| int erdma_destroy_cq(struct ibv_cq *base_cq) |
| { |
| struct erdma_context *ctx = to_ectx(base_cq->context); |
| struct erdma_cq *cq = to_ecq(base_cq); |
| int rv; |
| |
| pthread_spin_lock(&cq->lock); |
| rv = ibv_cmd_destroy_cq(base_cq); |
| if (rv) { |
| pthread_spin_unlock(&cq->lock); |
| errno = EIO; |
| return rv; |
| } |
| pthread_spin_destroy(&cq->lock); |
| |
| if (cq->db_record) |
| erdma_dealloc_dbrecords(ctx, cq->db_record); |
| |
| if (cq->queue) { |
| ibv_dofork_range(cq->queue, cq->depth << CQE_SHIFT); |
| free(cq->queue); |
| } |
| |
| free(cq); |
| |
| return 0; |
| } |
| |
| static void __erdma_alloc_dbs(struct erdma_qp *qp, struct erdma_context *ctx) |
| { |
| uint32_t qpn = qp->id; |
| uint32_t db_offset; |
| |
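| /* |
|  * Pick the SQ doorbell slot: an ERDMA_SDB_ENTRY-type context uses its |
|  * per-context sdb_offset, otherwise the slot is derived from the low |
|  * bits of the QPN. |
|  */ |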
| if (ctx->sdb_type == ERDMA_SDB_ENTRY) |
| db_offset = ctx->sdb_offset * ERDMA_NSDB_PER_ENTRY * |
| ERDMA_SQDB_SIZE; |
| else |
| db_offset = (qpn & ERDMA_SDB_ALLOC_QPN_MASK) * ERDMA_SQDB_SIZE; |
| |
| qp->sq.db = ctx->sdb + db_offset; |
| /* The low bits of the QPN (qpn[6:0]) index into this RQ doorbell page. */ |
| qp->rq.db = ctx->rdb + |
| (qpn & ERDMA_RDB_ALLOC_QPN_MASK) * ERDMA_RQDB_SPACE_SIZE; |
| } |
| |
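| /* |
|  * Track the QP in a two-level table keyed by QPN (upper bits select a |
|  * lazily allocated second-level array, lower bits the slot), so the CQ |
|  * poll path can map a CQE's QPN back to its erdma_qp. |
|  */ |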
| static int erdma_store_qp(struct erdma_context *ctx, struct erdma_qp *qp) |
| { |
| uint32_t tbl_idx, tbl_off; |
| int rv = 0; |
| |
| pthread_mutex_lock(&ctx->qp_table_mutex); |
| tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT; |
| tbl_off = qp->id & ERDMA_QP_TABLE_MASK; |
| |
| if (ctx->qp_table[tbl_idx].refcnt == 0) { |
| ctx->qp_table[tbl_idx].table = |
| calloc(ERDMA_QP_TABLE_SIZE, sizeof(struct erdma_qp *)); |
| if (!ctx->qp_table[tbl_idx].table) { |
| rv = -ENOMEM; |
| goto out; |
| } |
| } |
| |
| /* A QP with this id is already stored. */ |
| if (ctx->qp_table[tbl_idx].table[tbl_off]) { |
| rv = -EBUSY; |
| goto out; |
| } |
| |
| ctx->qp_table[tbl_idx].table[tbl_off] = qp; |
| ctx->qp_table[tbl_idx].refcnt++; |
| |
| out: |
| pthread_mutex_unlock(&ctx->qp_table_mutex); |
| |
| return rv; |
| } |
| |
| static void erdma_clear_qp(struct erdma_context *ctx, struct erdma_qp *qp) |
| { |
| uint32_t tbl_idx, tbl_off; |
| |
| pthread_mutex_lock(&ctx->qp_table_mutex); |
| tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT; |
| tbl_off = qp->id & ERDMA_QP_TABLE_MASK; |
| |
| ctx->qp_table[tbl_idx].table[tbl_off] = NULL; |
| ctx->qp_table[tbl_idx].refcnt--; |
| |
| if (ctx->qp_table[tbl_idx].refcnt == 0) { |
| free(ctx->qp_table[tbl_idx].table); |
| ctx->qp_table[tbl_idx].table = NULL; |
| } |
| |
| pthread_mutex_unlock(&ctx->qp_table_mutex); |
| } |
| |
| static int erdma_alloc_qp_buf_and_db(struct erdma_context *ctx, |
| struct erdma_qp *qp, |
| struct ibv_qp_init_attr *attr) |
| { |
| size_t queue_size; |
| uint32_t nwqebb; |
| int rv; |
| |
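| /* |
|  * The SQ and RQ share one contiguous, page-aligned buffer: the SQ part |
|  * is sized for the worst case of MAX_WQEBB_PER_SQE blocks per send WR, |
|  * and the RQ part follows at the offset the kernel later reports back |
|  * as rq_offset. |
|  */ |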
| nwqebb = roundup_pow_of_two(attr->cap.max_send_wr * MAX_WQEBB_PER_SQE); |
| queue_size = align(nwqebb << SQEBB_SHIFT, ctx->page_size); |
| nwqebb = roundup_pow_of_two(attr->cap.max_recv_wr); |
| queue_size += align(nwqebb << RQE_SHIFT, ctx->page_size); |
| |
| qp->qbuf_size = queue_size; |
| rv = posix_memalign(&qp->qbuf, ctx->page_size, queue_size); |
| if (rv) { |
| errno = ENOMEM; |
| return -1; |
| } |
| |
| rv = ibv_dontfork_range(qp->qbuf, queue_size); |
| if (rv) { |
| errno = rv; |
| goto err_dontfork; |
| } |
| |
| /* doorbell record allocation. */ |
| qp->db_records = erdma_alloc_dbrecords(ctx); |
| if (!qp->db_records) { |
| errno = ENOMEM; |
| goto err_dbrec; |
| } |
| |
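| /* One 8-byte doorbell record each for the SQ and the RQ. */ |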
| *qp->db_records = 0; |
| *(qp->db_records + 1) = 0; |
| qp->sq.db_record = qp->db_records; |
| qp->rq.db_record = qp->db_records + 1; |
| |
| pthread_spin_init(&qp->sq_lock, PTHREAD_PROCESS_PRIVATE); |
| pthread_spin_init(&qp->rq_lock, PTHREAD_PROCESS_PRIVATE); |
| |
| return 0; |
| |
| err_dbrec: |
| ibv_dofork_range(qp->qbuf, queue_size); |
| |
| err_dontfork: |
| free(qp->qbuf); |
| |
| return -1; |
| } |
| |
| static void erdma_free_qp_buf_and_db(struct erdma_context *ctx, |
| struct erdma_qp *qp) |
| { |
| pthread_spin_destroy(&qp->sq_lock); |
| pthread_spin_destroy(&qp->rq_lock); |
| |
| if (qp->db_records) |
| erdma_dealloc_dbrecords(ctx, qp->db_records); |
| |
| ibv_dofork_range(qp->qbuf, qp->qbuf_size); |
| free(qp->qbuf); |
| } |
| |
| static int erdma_alloc_wrid_tbl(struct erdma_qp *qp) |
| { |
| qp->rq.wr_tbl = calloc(qp->rq.depth, sizeof(uint64_t)); |
| if (!qp->rq.wr_tbl) |
| return -ENOMEM; |
| |
| qp->sq.wr_tbl = calloc(qp->sq.depth, sizeof(uint64_t)); |
| if (!qp->sq.wr_tbl) { |
| free(qp->rq.wr_tbl); |
| return -ENOMEM; |
| } |
| |
| return 0; |
| } |
| |
| static void erdma_free_wrid_tbl(struct erdma_qp *qp) |
| { |
| free(qp->sq.wr_tbl); |
| free(qp->rq.wr_tbl); |
| } |
| |
| struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) |
| { |
| struct erdma_context *ctx = to_ectx(pd->context); |
| struct erdma_cmd_create_qp_resp resp = {}; |
| struct erdma_cmd_create_qp cmd = {}; |
| struct erdma_qp *qp; |
| int rv; |
| |
| qp = calloc(1, sizeof(*qp)); |
| if (!qp) |
| return NULL; |
| |
| rv = erdma_alloc_qp_buf_and_db(ctx, qp, attr); |
| if (rv) |
| goto err; |
| |
| cmd.db_record_va = (uintptr_t)qp->db_records; |
| cmd.qbuf_va = (uintptr_t)qp->qbuf; |
| cmd.qbuf_len = (__u32)qp->qbuf_size; |
| |
| rv = ibv_cmd_create_qp(pd, &qp->base_qp, attr, &cmd.ibv_cmd, |
| sizeof(cmd), &resp.ibv_resp, sizeof(resp)); |
| if (rv) |
| goto err_cmd; |
| |
| qp->id = resp.qp_id; |
| qp->sq.qbuf = qp->qbuf; |
| qp->rq.qbuf = qp->qbuf + resp.rq_offset; |
| qp->sq.depth = resp.num_sqe; |
| qp->rq.depth = resp.num_rqe; |
| qp->sq_sig_all = attr->sq_sig_all; |
| qp->sq.size = resp.num_sqe * SQEBB_SIZE; |
| qp->rq.size = resp.num_rqe * sizeof(struct erdma_rqe); |
| |
| /* Assign SQ/RQ doorbell addresses. */ |
| __erdma_alloc_dbs(qp, ctx); |
| |
| rv = erdma_alloc_wrid_tbl(qp); |
| if (rv) |
| goto err_wrid_tbl; |
| |
| rv = erdma_store_qp(ctx, qp); |
| if (rv) { |
| errno = -rv; |
| goto err_store; |
| } |
| |
| return &qp->base_qp; |
| |
| err_store: |
| erdma_free_wrid_tbl(qp); |
| err_wrid_tbl: |
| ibv_cmd_destroy_qp(&qp->base_qp); |
| err_cmd: |
| erdma_free_qp_buf_and_db(ctx, qp); |
| err: |
| free(qp); |
| |
| return NULL; |
| } |
| |
| int erdma_modify_qp(struct ibv_qp *base_qp, struct ibv_qp_attr *attr, |
| int attr_mask) |
| { |
| struct erdma_qp *qp = to_eqp(base_qp); |
| struct ibv_modify_qp cmd = {}; |
| int rv; |
| |
| pthread_spin_lock(&qp->sq_lock); |
| pthread_spin_lock(&qp->rq_lock); |
| |
| rv = ibv_cmd_modify_qp(base_qp, attr, attr_mask, &cmd, sizeof(cmd)); |
| |
| pthread_spin_unlock(&qp->rq_lock); |
| pthread_spin_unlock(&qp->sq_lock); |
| |
| return rv; |
| } |
| |
| int erdma_destroy_qp(struct ibv_qp *base_qp) |
| { |
| struct ibv_context *base_ctx = base_qp->pd->context; |
| struct erdma_context *ctx = to_ectx(base_ctx); |
| struct erdma_qp *qp = to_eqp(base_qp); |
| int rv; |
| |
| erdma_clear_qp(ctx, qp); |
| |
| rv = ibv_cmd_destroy_qp(base_qp); |
| if (rv) |
| return rv; |
| |
| erdma_free_wrid_tbl(qp); |
| erdma_free_qp_buf_and_db(ctx, qp); |
| |
| free(qp); |
| |
| return 0; |
| } |
| |
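| /* |
|  * Build one send WQE at the current producer index. A WQE spans one or |
|  * more WQEBBs: a header block followed by either inline payload or an |
|  * SGL. On success, *sq_pi is advanced past every WQEBB consumed. |
|  */ |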
| static int erdma_push_one_sqe(struct erdma_qp *qp, struct ibv_send_wr *wr, |
| uint16_t *sq_pi) |
| { |
| uint32_t i, bytes, sgl_off, sgl_idx, wqebb_cnt, opcode, wqe_size = 0; |
| struct erdma_atomic_sqe *atomic_sqe; |
| struct erdma_readreq_sqe *read_sqe; |
| struct erdma_write_sqe *write_sqe; |
| struct erdma_send_sqe *send_sqe; |
| struct erdma_sge *sgl_base; |
| uint16_t tmp_pi = *sq_pi; |
| __le32 *length_field; |
| uint64_t sqe_hdr; |
| void *sqe; |
| |
| sqe = get_sq_wqebb(qp, tmp_pi); |
| /* Clear the first 8 bytes of the WQE header. */ |
| *(uint64_t *)sqe = 0; |
| |
| qp->sq.wr_tbl[tmp_pi & (qp->sq.depth - 1)] = wr->wr_id; |
| |
| sqe_hdr = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, qp->id) | |
| FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, |
| wr->send_flags & IBV_SEND_SIGNALED ? 1 : 0) | |
| FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, qp->sq_sig_all) | |
| FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, |
| wr->send_flags & IBV_SEND_SOLICITED ? 1 : 0) | |
| FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, |
| wr->send_flags & IBV_SEND_FENCE ? 1 : 0) | |
| FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, |
| wr->send_flags & IBV_SEND_INLINE ? 1 : 0); |
| |
| switch (wr->opcode) { |
| case IBV_WR_RDMA_WRITE: |
| case IBV_WR_RDMA_WRITE_WITH_IMM: |
| if (wr->opcode == IBV_WR_RDMA_WRITE) |
| opcode = ERDMA_OP_WRITE; |
| else |
| opcode = ERDMA_OP_WRITE_WITH_IMM; |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); |
| write_sqe = sqe; |
| write_sqe->imm_data = wr->imm_data; |
| write_sqe->sink_stag = htole32(wr->wr.rdma.rkey); |
| write_sqe->sink_to_low = |
| htole32(wr->wr.rdma.remote_addr & 0xFFFFFFFF); |
| write_sqe->sink_to_high = |
| htole32((wr->wr.rdma.remote_addr >> 32) & 0xFFFFFFFF); |
| |
| length_field = &write_sqe->length; |
| /* The SGL starts at the beginning of the next WQEBB. */ |
| sgl_base = get_sq_wqebb(qp, tmp_pi + 1); |
| sgl_off = 0; |
| sgl_idx = tmp_pi + 1; |
| wqe_size = sizeof(struct erdma_write_sqe); |
| |
| break; |
| case IBV_WR_SEND: |
| case IBV_WR_SEND_WITH_IMM: |
| if (wr->opcode == IBV_WR_SEND) |
| opcode = ERDMA_OP_SEND; |
| else |
| opcode = ERDMA_OP_SEND_WITH_IMM; |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); |
| send_sqe = sqe; |
| send_sqe->imm_data = wr->imm_data; |
| |
| length_field = &send_sqe->length; |
| /* The SGL starts in the second half of the current WQEBB (offset 16 bytes). */ |
| sgl_base = sqe; |
| sgl_off = 16; |
| sgl_idx = tmp_pi; |
| wqe_size = sizeof(struct erdma_send_sqe); |
| |
| break; |
| case IBV_WR_RDMA_READ: |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_READ); |
| read_sqe = sqe; |
| |
| read_sqe->sink_to_low = htole32(wr->sg_list->addr & 0xFFFFFFFF); |
| read_sqe->sink_to_high = |
| htole32((wr->sg_list->addr >> 32) & 0xFFFFFFFF); |
| read_sqe->sink_stag = htole32(wr->sg_list->lkey); |
| read_sqe->length = htole32(wr->sg_list->length); |
| |
| sgl_base = get_sq_wqebb(qp, tmp_pi + 1); |
| |
| sgl_base->addr = htole64(wr->wr.rdma.remote_addr); |
| sgl_base->length = htole32(wr->sg_list->length); |
| sgl_base->key = htole32(wr->wr.rdma.rkey); |
| |
| wqe_size = sizeof(struct erdma_readreq_sqe); |
| |
| goto out; |
| case IBV_WR_ATOMIC_CMP_AND_SWP: |
| case IBV_WR_ATOMIC_FETCH_AND_ADD: |
| atomic_sqe = (struct erdma_atomic_sqe *)sqe; |
| |
| if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, |
| ERDMA_OP_ATOMIC_CAS); |
| atomic_sqe->fetchadd_swap_data = |
| htole64(wr->wr.atomic.swap); |
| atomic_sqe->cmp_data = |
| htole64(wr->wr.atomic.compare_add); |
| } else { |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, |
| ERDMA_OP_ATOMIC_FAD); |
| atomic_sqe->fetchadd_swap_data = |
| htole64(wr->wr.atomic.compare_add); |
| } |
| |
| sgl_base = (struct erdma_sge *)get_sq_wqebb(qp, tmp_pi + 1); |
| /* remote SGL fields */ |
| sgl_base->addr = htole64(wr->wr.atomic.remote_addr); |
| sgl_base->key = htole32(wr->wr.atomic.rkey); |
| |
| /* local SGL fields */ |
| sgl_base++; |
| sgl_base->addr = htole64(wr->sg_list[0].addr); |
| sgl_base->length = htole32(wr->sg_list[0].length); |
| sgl_base->key = htole32(wr->sg_list[0].lkey); |
| wqe_size = sizeof(struct erdma_atomic_sqe); |
| goto out; |
| default: |
| return -EINVAL; |
| } |
| |
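| /* |
|  * Inline sends copy the payload bytes directly into the WQEBB stream |
|  * starting at sgl_base, wrapping block by block; otherwise the ibv_sge |
|  * entries themselves are packed into the WQEBBs as the hardware SGL. |
|  */ |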
| if (wr->send_flags & IBV_SEND_INLINE) { |
| char *data = (char *)sgl_base; |
| uint32_t remain_size; |
| uint32_t copy_size; |
| uint32_t data_off; |
| |
| i = 0; |
| bytes = 0; |
| |
| /* More SGEs than ERDMA_MAX_SEND_SGE are allowed here, since the payload is copied inline rather than referenced. */ |
| while (i < wr->num_sge) { |
| bytes += wr->sg_list[i].length; |
| if (bytes > (int)ERDMA_MAX_INLINE) |
| return -EINVAL; |
| |
| remain_size = wr->sg_list[i].length; |
| data_off = 0; |
| |
| while (1) { |
| copy_size = |
| min(remain_size, SQEBB_SIZE - sgl_off); |
| memcpy(data + sgl_off, |
| (void *)(uintptr_t)wr->sg_list[i].addr + |
| data_off, |
| copy_size); |
| remain_size -= copy_size; |
| |
| /* Advance to the WQEBB and offset where the next chunk is copied. */ |
| sgl_idx += |
| ((sgl_off + copy_size) >> SQEBB_SHIFT); |
| sgl_off = (sgl_off + copy_size) & |
| (SQEBB_SIZE - 1); |
| data_off += copy_size; |
| data = get_sq_wqebb(qp, sgl_idx); |
| |
| if (!remain_size) |
| break; |
| } |
| |
| i++; |
| } |
| |
| *length_field = htole32(bytes); |
| wqe_size += bytes; |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, bytes); |
| } else { |
| char *sgl = (char *)sgl_base; |
| |
| if (wr->num_sge > ERDMA_MAX_SEND_SGE) |
| return -EINVAL; |
| |
| i = 0; |
| bytes = 0; |
| |
| while (i < wr->num_sge) { |
| bytes += wr->sg_list[i].length; |
| memcpy(sgl + sgl_off, &wr->sg_list[i], |
| sizeof(struct ibv_sge)); |
| |
| if (sgl_off == 0) |
| *(uint32_t *)(sgl + 28) = qp->id; |
| |
| sgl_idx += (sgl_off == sizeof(struct ibv_sge) ? 1 : 0); |
| sgl = get_sq_wqebb(qp, sgl_idx); |
| sgl_off = sizeof(struct ibv_sge) - sgl_off; |
| |
| i++; |
| } |
| |
| *length_field = htole32(bytes); |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, wr->num_sge); |
| wqe_size += wr->num_sge * sizeof(struct ibv_sge); |
| } |
| |
| out: |
| wqebb_cnt = SQEBB_COUNT(wqe_size); |
| assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); |
| sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); |
| sqe_hdr |= |
| FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, tmp_pi + wqebb_cnt); |
| |
| *(__le64 *)sqe = htole64(sqe_hdr); |
| *sq_pi = tmp_pi + wqebb_cnt; |
| |
| return 0; |
| } |
| |
| int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr, |
| struct ibv_send_wr **bad_wr) |
| { |
| struct erdma_qp *qp = to_eqp(base_qp); |
| int new_sqe = 0, rv = 0; |
| uint16_t sq_pi; |
| |
| *bad_wr = NULL; |
| |
| if (base_qp->state == IBV_QPS_ERR) { |
| *bad_wr = wr; |
| return -EIO; |
| } |
| |
| pthread_spin_lock(&qp->sq_lock); |
| |
| sq_pi = qp->sq.pi; |
| |
| while (wr) { |
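| /* |
|  * pi and ci are free-running 16-bit counters; their difference is the |
|  * number of WQEBBs currently outstanding in the SQ ring. |
|  */ |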
| if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) { |
| rv = -ENOMEM; |
| *bad_wr = wr; |
| break; |
| } |
| |
| rv = erdma_push_one_sqe(qp, wr, &sq_pi); |
| if (rv) { |
| *bad_wr = wr; |
| break; |
| } |
| |
| new_sqe++; |
| wr = wr->next; |
| } |
| |
| if (new_sqe) { |
| qp->sq.pi = sq_pi; |
| __kick_sq_db(qp, sq_pi); /* normal doorbell. */ |
| } |
| |
| pthread_spin_unlock(&qp->sq_lock); |
| |
| return rv; |
| } |
| |
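| /* |
|  * Post one receive WQE: fill the RQE in the ring, record the wr_id for |
|  * completion lookup, mirror the RQE header into the doorbell record and |
|  * ring the RQ doorbell. At most one SGE per receive WR is supported. |
|  */ |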
| static int push_recv_wqe(struct erdma_qp *qp, struct ibv_recv_wr *wr) |
| { |
| uint16_t rq_pi = qp->rq.pi; |
| uint16_t idx = rq_pi & (qp->rq.depth - 1); |
| struct erdma_rqe *rqe = (struct erdma_rqe *)qp->rq.qbuf + idx; |
| |
| if ((uint16_t)(rq_pi - qp->rq.ci) == qp->rq.depth) |
| return -ENOMEM; |
| |
| rqe->qe_idx = htole16(rq_pi + 1); |
| rqe->qpn = htole32(qp->id); |
| qp->rq.wr_tbl[idx] = wr->wr_id; |
| |
| if (wr->num_sge == 0) { |
| rqe->length = 0; |
| } else if (wr->num_sge == 1) { |
| rqe->stag = htole32(wr->sg_list[0].lkey); |
| rqe->to = htole64(wr->sg_list[0].addr); |
| rqe->length = htole32(wr->sg_list[0].length); |
| } else { |
| return -EINVAL; |
| } |
| |
| *(__le64 *)qp->rq.db_record = *(__le64 *)rqe; |
| udma_to_device_barrier(); |
| mmio_write64_le(qp->rq.db, *(__le64 *)rqe); |
| |
| qp->rq.pi = rq_pi + 1; |
| |
| return 0; |
| } |
| |
| int erdma_post_recv(struct ibv_qp *base_qp, struct ibv_recv_wr *wr, |
| struct ibv_recv_wr **bad_wr) |
| { |
| struct erdma_qp *qp = to_eqp(base_qp); |
| int ret = 0; |
| |
| if (base_qp->state == IBV_QPS_ERR) { |
| *bad_wr = wr; |
| return -EIO; |
| } |
| |
| pthread_spin_lock(&qp->rq_lock); |
| |
| while (wr) { |
| ret = push_recv_wqe(qp, wr); |
| if (ret) { |
| *bad_wr = wr; |
| break; |
| } |
| |
| wr = wr->next; |
| } |
| |
| pthread_spin_unlock(&qp->rq_lock); |
| |
| return ret; |
| } |
| |
| void erdma_cq_event(struct ibv_cq *ibcq) |
| { |
| struct erdma_cq *cq = to_ecq(ibcq); |
| |
| cq->cmdsn++; |
| } |
| |
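| /* |
|  * A CQE is ready when its owner bit differs from the wrap bit of the |
|  * consumer index (ci & depth), which toggles on every full pass over |
|  * the ring. |
|  */ |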
| static void *get_next_valid_cqe(struct erdma_cq *cq) |
| { |
| struct erdma_cqe *cqe = cq->queue + (cq->ci & (cq->depth - 1)); |
| uint32_t owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, be32toh(cqe->hdr)); |
| |
| return owner ^ !!(cq->ci & cq->depth) ? cqe : NULL; |
| } |
| |
| static const enum ibv_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { |
| [ERDMA_OP_WRITE] = IBV_WC_RDMA_WRITE, |
| [ERDMA_OP_READ] = IBV_WC_RDMA_READ, |
| [ERDMA_OP_SEND] = IBV_WC_SEND, |
| [ERDMA_OP_SEND_WITH_IMM] = IBV_WC_SEND, |
| [ERDMA_OP_RECEIVE] = IBV_WC_RECV, |
| [ERDMA_OP_RECV_IMM] = IBV_WC_RECV_RDMA_WITH_IMM, |
| [ERDMA_OP_RECV_INV] = IBV_WC_RECV, |
| [ERDMA_OP_WRITE_WITH_IMM] = IBV_WC_RDMA_WRITE, |
| [ERDMA_OP_INVALIDATE] = IBV_WC_LOCAL_INV, |
| [ERDMA_OP_RSP_SEND_IMM] = IBV_WC_RECV, |
| [ERDMA_OP_SEND_WITH_INV] = IBV_WC_SEND, |
| [ERDMA_OP_READ_WITH_INV] = IBV_WC_RDMA_READ, |
| [ERDMA_OP_ATOMIC_CAS] = IBV_WC_COMP_SWAP, |
| [ERDMA_OP_ATOMIC_FAD] = IBV_WC_FETCH_ADD, |
| }; |
| |
| static const struct { |
| enum erdma_wc_status erdma; |
| enum ibv_wc_status base; |
| enum erdma_vendor_err vendor; |
| } map_cqe_status[ERDMA_NUM_WC_STATUS] = { |
| { ERDMA_WC_SUCCESS, IBV_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, |
| { ERDMA_WC_GENERAL_ERR, IBV_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, |
| { ERDMA_WC_RECV_WQE_FORMAT_ERR, IBV_WC_GENERAL_ERR, |
| ERDMA_WC_VENDOR_INVALID_RQE }, |
| { ERDMA_WC_RECV_STAG_INVALID_ERR, IBV_WC_REM_ACCESS_ERR, |
| ERDMA_WC_VENDOR_RQE_INVALID_STAG }, |
| { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IBV_WC_REM_ACCESS_ERR, |
| ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, |
| { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IBV_WC_REM_ACCESS_ERR, |
| ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, |
| { ERDMA_WC_RECV_PDID_ERR, IBV_WC_REM_ACCESS_ERR, |
| ERDMA_WC_VENDOR_RQE_INVALID_PD }, |
| { ERDMA_WC_RECV_WARRPING_ERR, IBV_WC_REM_ACCESS_ERR, |
| ERDMA_WC_VENDOR_RQE_WRAP_ERR }, |
| { ERDMA_WC_SEND_WQE_FORMAT_ERR, IBV_WC_LOC_QP_OP_ERR, |
| ERDMA_WC_VENDOR_INVALID_SQE }, |
| { ERDMA_WC_SEND_WQE_ORD_EXCEED, IBV_WC_GENERAL_ERR, |
| ERDMA_WC_VENDOR_ZERO_ORD }, |
| { ERDMA_WC_SEND_STAG_INVALID_ERR, IBV_WC_LOC_ACCESS_ERR, |
| ERDMA_WC_VENDOR_SQE_INVALID_STAG }, |
| { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IBV_WC_LOC_ACCESS_ERR, |
| ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, |
| { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IBV_WC_LOC_ACCESS_ERR, |
| ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, |
| { ERDMA_WC_SEND_PDID_ERR, IBV_WC_LOC_ACCESS_ERR, |
| ERDMA_WC_VENDOR_SQE_INVALID_PD }, |
| { ERDMA_WC_SEND_WARRPING_ERR, IBV_WC_LOC_ACCESS_ERR, |
| ERDMA_WC_VENDOR_SQE_WARP_ERR }, |
| { ERDMA_WC_FLUSH_ERR, IBV_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, |
| { ERDMA_WC_RETRY_EXC_ERR, IBV_WC_RETRY_EXC_ERR, |
| ERDMA_WC_VENDOR_NO_ERR }, |
| }; |
| |
| #define ERDMA_POLLCQ_NO_QP (-1) |
| #define ERDMA_POLLCQ_DUP_COMP (-2) |
| #define ERDMA_POLLCQ_WRONG_IDX (-3) |
| |
| static int __erdma_poll_one_cqe(struct erdma_context *ctx, struct erdma_cq *cq, |
| struct ibv_wc *wc) |
| { |
| uint32_t cqe_hdr, opcode, syndrome, qpn; |
| uint16_t depth, wqe_idx, old_ci, new_ci; |
| uint64_t *sqe_hdr, *qeidx2wrid; |
| uint32_t tbl_idx, tbl_off; |
| struct erdma_cqe *cqe; |
| struct erdma_qp *qp; |
| |
| cqe = get_next_valid_cqe(cq); |
| if (!cqe) |
| return -EAGAIN; |
| |
| cq->ci++; |
| udma_from_device_barrier(); |
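| /* The barrier above orders the owner-bit check before the CQE reads below. */ |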
| |
| cqe_hdr = be32toh(cqe->hdr); |
| syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); |
| opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); |
| qpn = be32toh(cqe->qpn); |
| wqe_idx = be32toh(cqe->qe_idx); |
| |
| tbl_idx = qpn >> ERDMA_QP_TABLE_SHIFT; |
| tbl_off = qpn & ERDMA_QP_TABLE_MASK; |
| |
| if (!ctx->qp_table[tbl_idx].table || |
| !ctx->qp_table[tbl_idx].table[tbl_off]) |
| return ERDMA_POLLCQ_NO_QP; |
| |
| qp = ctx->qp_table[tbl_idx].table[tbl_off]; |
| |
| if (FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr) == |
| ERDMA_CQE_QTYPE_SQ) { |
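| /* |
|  * SQ completion: the CQE reports the WQE's starting index; the new |
|  * consumer index is that index plus the WQE's WQEBB count (the header |
|  * stores count - 1, hence the + 1). Out-of-range or duplicate |
|  * completions are rejected. |
|  */ |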
| qeidx2wrid = qp->sq.wr_tbl; |
| depth = qp->sq.depth; |
| sqe_hdr = get_sq_wqebb(qp, wqe_idx); |
| old_ci = qp->sq.ci; |
| new_ci = wqe_idx + |
| FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *sqe_hdr) + 1; |
| |
| if ((uint16_t)(new_ci - old_ci) > depth) |
| return ERDMA_POLLCQ_WRONG_IDX; |
| else if (new_ci == old_ci) |
| return ERDMA_POLLCQ_DUP_COMP; |
| |
| qp->sq.ci = new_ci; |
| } else { |
| qeidx2wrid = qp->rq.wr_tbl; |
| depth = qp->rq.depth; |
| qp->rq.ci++; |
| } |
| |
| wc->wr_id = qeidx2wrid[wqe_idx & (depth - 1)]; |
| wc->byte_len = be32toh(cqe->size); |
| wc->wc_flags = 0; |
| |
| wc->opcode = wc_mapping_table[opcode]; |
| if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { |
| wc->imm_data = htobe32(le32toh(cqe->imm_data)); |
| wc->wc_flags |= IBV_WC_WITH_IMM; |
| } |
| |
| if (syndrome >= ERDMA_NUM_WC_STATUS) |
| syndrome = ERDMA_WC_GENERAL_ERR; |
| |
| wc->status = map_cqe_status[syndrome].base; |
| wc->vendor_err = map_cqe_status[syndrome].vendor; |
| wc->qp_num = qpn; |
| |
| return 0; |
| } |
| |
| int erdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) |
| { |
| struct erdma_context *ctx = to_ectx(ibcq->context); |
| struct erdma_cq *cq = to_ecq(ibcq); |
| int ret, npolled = 0; |
| |
| pthread_spin_lock(&cq->lock); |
| |
| while (npolled < num_entries) { |
| ret = __erdma_poll_one_cqe(ctx, cq, wc + npolled); |
| if (ret == -EAGAIN) /* CQ is empty, break the loop. */ |
| break; |
| else if (ret) /* Skip the bad CQE silently and keep polling. */ |
| continue; |
| npolled++; |
| } |
| |
| pthread_spin_unlock(&cq->lock); |
| |
| return npolled; |
| } |
| |
| void erdma_free_context(struct ibv_context *ibv_ctx) |
| { |
| struct erdma_context *ctx = to_ectx(ibv_ctx); |
| int i; |
| |
| munmap(ctx->sdb, ERDMA_PAGE_SIZE); |
| munmap(ctx->rdb, ERDMA_PAGE_SIZE); |
| munmap(ctx->cdb, ERDMA_PAGE_SIZE); |
| |
| pthread_mutex_lock(&ctx->qp_table_mutex); |
| for (i = 0; i < ERDMA_QP_TABLE_SIZE; ++i) { |
| if (ctx->qp_table[i].refcnt) |
| free(ctx->qp_table[i].table); |
| } |
| |
| pthread_mutex_unlock(&ctx->qp_table_mutex); |
| pthread_mutex_destroy(&ctx->qp_table_mutex); |
| |
| verbs_uninit_context(&ctx->ibv_ctx); |
| free(ctx); |
| } |