| // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB |
| /* |
| * Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved |
| */ |
| |
| #define _GNU_SOURCE |
| #include <config.h> |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <sys/time.h> |
| #include <errno.h> |
| #include <sys/stat.h> |
| #include <fcntl.h> |
| #include <sys/mman.h> |
| #include <string.h> |
| #include <sys/param.h> |
| #include <linux/vfio.h> |
| #include <sys/eventfd.h> |
| #include <sys/ioctl.h> |
| #include <poll.h> |
| #include <pthread.h> |
| #include <assert.h> |
| #include <util/mmio.h> |
| |
| #include <ccan/array_size.h> |
| |
| #include "mlx5dv.h" |
| #include "mlx5_vfio.h" |
| #include "mlx5.h" |
| #include "mlx5_ifc.h" |
| |
| enum { |
| MLX5_VFIO_CMD_VEC_IDX, |
| }; |
| |
| enum { |
| MLX5_VFIO_SUPP_MR_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE | |
| IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | |
| IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_RELAXED_ORDERING, |
| MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE | |
| IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ, |
| }; |
| |
| static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, uint16_t func_id, |
| int32_t npages, bool is_event); |
| static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id, |
| int npages); |
| |
| static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx, |
| struct mlx5_cmd_msg *msg); |
| |
| static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx, |
| uint32_t size, struct mlx5_cmd_msg *msg); |
| |
| static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in, |
| int ilen, void *out, int olen, |
| unsigned int slot, bool async); |
| |
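| /* DMA-map a host-virtual buffer at the given IOVA through the VFIO type1 |
| * container, with read/write access for the device. |
| */ |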
| static int mlx5_vfio_register_mem(struct mlx5_vfio_context *ctx, |
| void *vaddr, uint64_t iova, uint64_t size) |
| { |
| struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) }; |
| |
| dma_map.vaddr = (uintptr_t)vaddr; |
| dma_map.size = size; |
| dma_map.iova = iova; |
| dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; |
| |
| return ioctl(ctx->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); |
| } |
| |
| static void mlx5_vfio_unregister_mem(struct mlx5_vfio_context *ctx, |
| uint64_t iova, uint64_t size) |
| { |
| struct vfio_iommu_type1_dma_unmap dma_unmap = {}; |
| |
| dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); |
| dma_unmap.size = size; |
| dma_unmap.iova = iova; |
| |
| if (ioctl(ctx->container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap)) |
| assert(false); |
| } |
| |
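| /* Allocate one MLX5_VFIO_BLOCK_SIZE block of aligned memory, reserve an |
| * IOVA range for it, DMA-map it and add it to the block list. Device pages |
| * handed to firmware are carved out of such blocks. |
| */ |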
| static struct page_block *mlx5_vfio_new_block(struct mlx5_vfio_context *ctx) |
| { |
| struct page_block *page_block; |
| int err; |
| |
| page_block = calloc(1, sizeof(*page_block)); |
| if (!page_block) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| err = posix_memalign(&page_block->page_ptr, MLX5_VFIO_BLOCK_SIZE, |
| MLX5_VFIO_BLOCK_SIZE); |
| if (err) { |
| errno = err; |
| goto err; |
| } |
| |
| err = iset_alloc_range(ctx->iova_alloc, MLX5_VFIO_BLOCK_SIZE, |
| &page_block->iova, MLX5_VFIO_BLOCK_SIZE); |
| if (err) |
| goto err_range; |
| |
| bitmap_fill(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES); |
| err = mlx5_vfio_register_mem(ctx, page_block->page_ptr, page_block->iova, |
| MLX5_VFIO_BLOCK_SIZE); |
| if (err) |
| goto err_reg; |
| |
| list_add(&ctx->mem_alloc.block_list, &page_block->next_block); |
| return page_block; |
| |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, page_block->iova, |
| MLX5_VFIO_BLOCK_SIZE); |
| err_range: |
| free(page_block->page_ptr); |
| err: |
| free(page_block); |
| return NULL; |
| } |
| |
| static void mlx5_vfio_free_block(struct mlx5_vfio_context *ctx, |
| struct page_block *page_block) |
| { |
| mlx5_vfio_unregister_mem(ctx, page_block->iova, MLX5_VFIO_BLOCK_SIZE); |
| iset_insert_range(ctx->iova_alloc, page_block->iova, MLX5_VFIO_BLOCK_SIZE); |
| list_del(&page_block->next_block); |
| free(page_block->page_ptr); |
| free(page_block); |
| } |
| |
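| /* Find a free device page and return its IOVA through *iova, taking it |
| * from an existing block or allocating a new block when all are full. |
| */ |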
| static int mlx5_vfio_alloc_page(struct mlx5_vfio_context *ctx, uint64_t *iova) |
| { |
| struct page_block *page_block; |
| unsigned long pg; |
| int ret = 0; |
| |
| pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex); |
| while (true) { |
| list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) { |
| pg = bitmap_find_first_bit(page_block->free_pages, 0, |
| MLX5_VFIO_BLOCK_NUM_PAGES); |
| if (pg != MLX5_VFIO_BLOCK_NUM_PAGES) { |
| bitmap_clear_bit(page_block->free_pages, pg); |
| *iova = page_block->iova + pg * MLX5_ADAPTER_PAGE_SIZE; |
| goto end; |
| } |
| } |
| if (!mlx5_vfio_new_block(ctx)) { |
| ret = -1; |
| goto end; |
| } |
| } |
| end: |
| pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex); |
| return ret; |
| } |
| |
| static void mlx5_vfio_free_page(struct mlx5_vfio_context *ctx, uint64_t iova) |
| { |
| struct page_block *page_block; |
| unsigned long pg; |
| |
| pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex); |
| list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) { |
| if (page_block->iova > iova || |
| (page_block->iova + MLX5_VFIO_BLOCK_SIZE <= iova)) |
| continue; |
| |
| pg = (iova - page_block->iova) / MLX5_ADAPTER_PAGE_SIZE; |
| assert(!bitmap_test_bit(page_block->free_pages, pg)); |
| bitmap_set_bit(page_block->free_pages, pg); |
| if (bitmap_full(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES)) |
| mlx5_vfio_free_block(ctx, page_block); |
| goto end; |
| } |
| |
| assert(false); |
| end: |
| pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex); |
| } |
| |
| static const char *cmd_status_str(uint8_t status) |
| { |
| switch (status) { |
| case MLX5_CMD_STAT_OK: |
| return "OK"; |
| case MLX5_CMD_STAT_INT_ERR: |
| return "internal error"; |
| case MLX5_CMD_STAT_BAD_OP_ERR: |
| return "bad operation"; |
| case MLX5_CMD_STAT_BAD_PARAM_ERR: |
| return "bad parameter"; |
| case MLX5_CMD_STAT_BAD_SYS_STATE_ERR: |
| return "bad system state"; |
| case MLX5_CMD_STAT_BAD_RES_ERR: |
| return "bad resource"; |
| case MLX5_CMD_STAT_RES_BUSY: |
| return "resource busy"; |
| case MLX5_CMD_STAT_LIM_ERR: |
| return "limits exceeded"; |
| case MLX5_CMD_STAT_BAD_RES_STATE_ERR: |
| return "bad resource state"; |
| case MLX5_CMD_STAT_IX_ERR: |
| return "bad index"; |
| case MLX5_CMD_STAT_NO_RES_ERR: |
| return "no resources"; |
| case MLX5_CMD_STAT_BAD_INP_LEN_ERR: |
| return "bad input length"; |
| case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR: |
| return "bad output length"; |
| case MLX5_CMD_STAT_BAD_QP_STATE_ERR: |
| return "bad QP state"; |
| case MLX5_CMD_STAT_BAD_PKT_ERR: |
| return "bad packet (discarded)"; |
| case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR: |
| return "bad size too many outstanding CQEs"; |
| default: |
| return "unknown status"; |
| } |
| } |
| |
| static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, uint32_t entry) |
| { |
| return eq->vaddr + entry * MLX5_EQE_SIZE; |
| } |
| |
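| /* Return the EQE at cons_index + cc if its ownership bit shows the device |
| * has already written it for the current pass over the EQ, otherwise NULL. |
| */ |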
| static struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, uint32_t cc) |
| { |
| uint32_t ci = eq->cons_index + cc; |
| struct mlx5_eqe *eqe; |
| |
| eqe = get_eqe(eq, ci & (eq->nent - 1)); |
| eqe = ((eqe->owner & 1) ^ !!(ci & eq->nent)) ? NULL : eqe; |
| |
| if (eqe) |
| udma_from_device_barrier(); |
| |
| return eqe; |
| } |
| |
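| /* Publish the updated consumer index to the EQ doorbell; when 'arm' is set |
| * the write goes to the arm offset, re-arming the EQ to raise an interrupt |
| * for the next event. |
| */ |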
| static void eq_update_ci(struct mlx5_eq *eq, uint32_t cc, int arm) |
| { |
| __be32 *addr = eq->doorbell + (arm ? 0 : 2); |
| uint32_t val; |
| |
| eq->cons_index += cc; |
| val = (eq->cons_index & 0xffffff) | (eq->eqn << 24); |
| |
| mmio_write32_be(addr, htobe32(val)); |
| udma_to_device_barrier(); |
| } |
| |
| static int mlx5_vfio_handle_page_req_event(struct mlx5_vfio_context *ctx, |
| struct mlx5_eqe *eqe) |
| { |
| struct mlx5_eqe_page_req *req = &eqe->data.req_pages; |
| int32_t num_pages; |
| uint16_t func_id; |
| |
| func_id = be16toh(req->func_id); |
| num_pages = be32toh(req->num_pages); |
| |
| if (num_pages > 0) |
| return mlx5_vfio_give_pages(ctx, func_id, num_pages, true); |
| |
| return mlx5_vfio_reclaim_pages(ctx, func_id, -1 * num_pages); |
| } |
| |
| static void mlx5_cmd_mbox_status(void *out, uint8_t *status, uint32_t *syndrome) |
| { |
| *status = DEVX_GET(mbox_out, out, status); |
| *syndrome = DEVX_GET(mbox_out, out, syndrome); |
| } |
| |
| static int mlx5_vfio_cmd_check(struct mlx5_vfio_context *ctx, void *in, void *out) |
| { |
| uint32_t syndrome; |
| uint8_t status; |
| uint16_t opcode; |
| uint16_t op_mod; |
| |
| mlx5_cmd_mbox_status(out, &status, &syndrome); |
| if (!status) |
| return 0; |
| |
| opcode = DEVX_GET(mbox_in, in, opcode); |
| op_mod = DEVX_GET(mbox_in, in, op_mod); |
| |
| mlx5_err(ctx->dbg_fp, |
| "mlx5_vfio_op_code(0x%x), op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n", |
| opcode, op_mod, |
| cmd_status_str(status), status, syndrome); |
| |
| errno = mlx5_cmd_status_to_err(status); |
| return errno; |
| } |
| |
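| /* Copy a command response to the caller: the first bytes come from the |
| * inline output area of the command layout, the rest from the chained |
| * mailbox data blocks. |
| */ |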
| static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size, |
| struct mlx5_cmd_layout *cmd_lay) |
| { |
| struct mlx5_cmd_block *block; |
| struct mlx5_cmd_mailbox *next; |
| int copy; |
| |
| copy = min_t(int, size, sizeof(cmd_lay->out)); |
| memcpy(to, cmd_lay->out, copy); |
| size -= copy; |
| to += copy; |
| |
| next = from->next; |
| while (size) { |
| if (!next) { |
| assert(false); |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); |
| block = next->buf; |
| |
| memcpy(to, block->data, copy); |
| to += copy; |
| size -= copy; |
| next = next->next; |
| } |
| |
| return 0; |
| } |
| |
| static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size, |
| struct mlx5_cmd_layout *cmd_lay) |
| { |
| struct mlx5_cmd_block *block; |
| struct mlx5_cmd_mailbox *next; |
| int copy; |
| |
| copy = min_t(int, size, sizeof(cmd_lay->in)); |
| memcpy(cmd_lay->in, from, copy); |
| size -= copy; |
| from += copy; |
| |
| next = to->next; |
| while (size) { |
| if (!next) { |
| assert(false); |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE); |
| block = next->buf; |
| memcpy(block->data, from, copy); |
| from += copy; |
| size -= copy; |
| next = next->next; |
| } |
| |
| return 0; |
| } |
| |
| /* The HCA will think the queue has overflowed if we don't tell it we've been |
| * processing events. |
| * We create EQs with MLX5_NUM_SPARE_EQE extra entries, |
| * so we must update our consumer index at least that often. |
| */ |
| static inline uint32_t mlx5_eq_update_cc(struct mlx5_eq *eq, uint32_t cc) |
| { |
| if (unlikely(cc >= MLX5_NUM_SPARE_EQE)) { |
| eq_update_ci(eq, cc, 0); |
| cc = 0; |
| } |
| return cc; |
| } |
| |
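| /* Completion handler for the dedicated page-request slot: check the |
| * MANAGE_PAGES result, free any reclaimed pages and, if another request was |
| * queued while this one was in flight, post it now. |
| */ |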
| static int mlx5_vfio_process_page_request_comp(struct mlx5_vfio_context *ctx, |
| unsigned long slot) |
| { |
| struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot]; |
| struct cmd_async_data *cmd_data = &cmd_slot->curr; |
| int num_claimed; |
| int ret, i; |
| |
| ret = mlx5_copy_from_msg(cmd_data->buff_out, &cmd_slot->out, |
| cmd_data->olen, cmd_slot->lay); |
| if (ret) |
| goto end; |
| |
| ret = mlx5_vfio_cmd_check(ctx, cmd_data->buff_in, cmd_data->buff_out); |
| if (ret) |
| goto end; |
| |
| if (DEVX_GET(manage_pages_in, cmd_data->buff_in, op_mod) == MLX5_PAGES_GIVE) |
| goto end; |
| |
| num_claimed = DEVX_GET(manage_pages_out, cmd_data->buff_out, output_num_entries); |
| if (num_claimed > DEVX_GET(manage_pages_in, cmd_data->buff_in, input_num_entries)) { |
| ret = EINVAL; |
| errno = ret; |
| goto end; |
| } |
| |
| for (i = 0; i < num_claimed; i++) |
| mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_out, cmd_data->buff_out, pas[i])); |
| |
| end: |
| free(cmd_data->buff_in); |
| free(cmd_data->buff_out); |
| cmd_slot->in_use = false; |
| if (!ret && cmd_slot->is_pending) { |
| cmd_data = &cmd_slot->pending; |
| |
| pthread_mutex_lock(&cmd_slot->lock); |
| cmd_slot->is_pending = false; |
| ret = mlx5_vfio_post_cmd(ctx, cmd_data->buff_in, cmd_data->ilen, |
| cmd_data->buff_out, cmd_data->olen, slot, true); |
| pthread_mutex_unlock(&cmd_slot->lock); |
| } |
| return ret; |
| } |
| |
| static int mlx5_vfio_cmd_comp(struct mlx5_vfio_context *ctx, unsigned long slot) |
| { |
| uint64_t u = 1; |
| ssize_t s; |
| |
| s = write(ctx->cmd.cmds[slot].completion_event_fd, &u, |
| sizeof(uint64_t)); |
| if (s != sizeof(uint64_t)) |
| return -1; |
| |
| return 0; |
| } |
| |
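| /* A command-completion EQE carries a bit vector of completed slots; invoke |
| * the completion callback registered for every set bit. |
| */ |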
| static int mlx5_vfio_process_cmd_eqe(struct mlx5_vfio_context *ctx, |
| struct mlx5_eqe *eqe) |
| { |
| struct mlx5_eqe_cmd *cmd_eqe = &eqe->data.cmd; |
| unsigned long vector = be32toh(cmd_eqe->vector); |
| unsigned long slot; |
| int count = 0; |
| int ret; |
| |
| for (slot = 0; slot < MLX5_MAX_COMMANDS; slot++) { |
| if (vector & (1UL << slot)) { |
| assert(ctx->cmd.cmds[slot].comp_func); |
| ret = ctx->cmd.cmds[slot].comp_func(ctx, slot); |
| if (ret) |
| return ret; |
| |
| vector &= ~(1UL << slot); |
| count++; |
| } |
| } |
| |
| assert(!vector && count); |
| return 0; |
| } |
| |
| static int mlx5_vfio_process_async_events(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_eqe *eqe; |
| int ret = 0; |
| int cc = 0; |
| |
| pthread_mutex_lock(&ctx->eq_lock); |
| while ((eqe = mlx5_eq_get_eqe(&ctx->async_eq, cc))) { |
| switch (eqe->type) { |
| case MLX5_EVENT_TYPE_CMD: |
| ret = mlx5_vfio_process_cmd_eqe(ctx, eqe); |
| break; |
| case MLX5_EVENT_TYPE_PAGE_REQUEST: |
| ret = mlx5_vfio_handle_page_req_event(ctx, eqe); |
| break; |
| default: |
| break; |
| } |
| |
| cc = mlx5_eq_update_cc(&ctx->async_eq, ++cc); |
| if (ret) |
| goto out; |
| } |
| |
| out: |
| eq_update_ci(&ctx->async_eq, cc, 1); |
| pthread_mutex_unlock(&ctx->eq_lock); |
| return ret; |
| } |
| |
| static int mlx5_vfio_enlarge_cmd_msg(struct mlx5_vfio_context *ctx, struct mlx5_cmd_msg *cmd_msg, |
| struct mlx5_cmd_layout *cmd_lay, uint32_t len, bool is_in) |
| { |
| int err; |
| |
| mlx5_vfio_free_cmd_msg(ctx, cmd_msg); |
| err = mlx5_vfio_alloc_cmd_msg(ctx, len, cmd_msg); |
| if (err) |
| return err; |
| |
| if (is_in) |
| cmd_lay->iptr = htobe64(cmd_msg->next->iova); |
| else |
| cmd_lay->optr = htobe64(cmd_msg->next->iova); |
| |
| return 0; |
| } |
| |
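| /* Wait for a command to complete once the async EQ is up: poll both the EQ |
| * interrupt eventfd (processing async events as they arrive) and the slot's |
| * completion eventfd, returning when the ownership bit clears. |
| */ |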
| static int mlx5_vfio_wait_event(struct mlx5_vfio_context *ctx, |
| unsigned int slot) |
| { |
| struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay; |
| uint64_t u; |
| ssize_t s; |
| int err; |
| |
| struct pollfd fds[2] = { |
| { .fd = ctx->cmd_comp_fd, .events = POLLIN }, |
| { .fd = ctx->cmd.cmds[slot].completion_event_fd, .events = POLLIN } |
| }; |
| |
| while (true) { |
| err = poll(fds, 2, -1); |
| if (err < 0 && errno != EAGAIN) { |
| mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, poll failed, errno=%d\n", errno); |
| return errno; |
| } |
| if (fds[0].revents & POLLIN) { |
| s = read(fds[0].fd, &u, sizeof(uint64_t)); |
| if (s < 0 && errno != EAGAIN) { |
| mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, errno=%d\n", errno); |
| return errno; |
| } |
| |
| err = mlx5_vfio_process_async_events(ctx); |
| if (err) |
| return err; |
| } |
| if (fds[1].revents & POLLIN) { |
| s = read(fds[1].fd, &u, sizeof(uint64_t)); |
| if (s < 0 && errno != EAGAIN) { |
| mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, slot=%d, errno=%d\n", |
| slot, errno); |
| return errno; |
| } |
| if (!(mmio_read8(&cmd_lay->status_own) & 0x1)) |
| return 0; |
| } |
| } |
| } |
| |
| /* One minute for the sake of bringup */ |
| #define MLX5_CMD_TIMEOUT_MSEC (60 * 1000) |
| |
| static int mlx5_vfio_poll_timeout(struct mlx5_cmd_layout *cmd_lay) |
| { |
| struct timeval start, curr; |
| uint64_t ms_start, ms_curr; |
| |
| gettimeofday(&start, NULL); |
| ms_start = (uint64_t)start.tv_sec * 1000 + start.tv_usec / 1000; |
| do { |
| if (!(mmio_read8(&cmd_lay->status_own) & 0x1)) |
| return 0; |
| sched_yield(); |
| gettimeofday(&curr, NULL); |
| ms_curr = (uint64_t)curr.tv_sec * 1000 + curr.tv_usec / 1000; |
| } while (ms_curr - ms_start < MLX5_CMD_TIMEOUT_MSEC); |
| |
| errno = ETIMEDOUT; |
| return errno; |
| } |
| |
| static int mlx5_vfio_cmd_prep_in(struct mlx5_vfio_context *ctx, |
| struct mlx5_cmd_msg *cmd_in, |
| struct mlx5_cmd_layout *cmd_lay, |
| void *in, int ilen) |
| { |
| int err; |
| |
| if (ilen > cmd_in->len) { |
| err = mlx5_vfio_enlarge_cmd_msg(ctx, cmd_in, cmd_lay, ilen, true); |
| if (err) |
| return err; |
| } |
| |
| err = mlx5_copy_to_msg(cmd_in, in, ilen, cmd_lay); |
| if (err) |
| return err; |
| |
| cmd_lay->ilen = htobe32(ilen); |
| return 0; |
| } |
| |
| static int mlx5_vfio_cmd_prep_out(struct mlx5_vfio_context *ctx, |
| struct mlx5_cmd_msg *cmd_out, |
| struct mlx5_cmd_layout *cmd_lay, int olen) |
| { |
| struct mlx5_cmd_mailbox *tmp; |
| struct mlx5_cmd_block *block; |
| |
| cmd_lay->olen = htobe32(olen); |
| |
| /* zeroing output header */ |
| memset(cmd_lay->out, 0, sizeof(cmd_lay->out)); |
| |
| if (olen > cmd_out->len) |
| /* The output message is zeroed as part of being enlarged */ |
| return mlx5_vfio_enlarge_cmd_msg(ctx, cmd_out, cmd_lay, olen, false); |
| |
| /* zeroing output message */ |
| tmp = cmd_out->next; |
| olen -= min_t(int, olen, sizeof(cmd_lay->out)); |
| while (olen > 0) { |
| block = tmp->buf; |
| memset(block->data, 0, MLX5_CMD_DATA_BLOCK_SIZE); |
| olen -= MLX5_CMD_DATA_BLOCK_SIZE; |
| tmp = tmp->next; |
| assert(tmp || olen <= 0); |
| } |
| return 0; |
| } |
| |
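| /* Fill the slot's input/output mailboxes, pass ownership to the device and |
| * ring the command doorbell. For async slots, a request that arrives while |
| * the slot is busy is saved as pending rather than posted. |
| */ |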
| static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in, |
| int ilen, void *out, int olen, |
| unsigned int slot, bool async) |
| { |
| struct mlx5_init_seg *init_seg = ctx->bar_map; |
| struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay; |
| struct mlx5_cmd_msg *cmd_in = &ctx->cmd.cmds[slot].in; |
| struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out; |
| int err; |
| |
| /* Lock was taken by caller */ |
| if (async && ctx->cmd.cmds[slot].in_use) { |
| struct cmd_async_data *pending = &ctx->cmd.cmds[slot].pending; |
| |
| if (ctx->cmd.cmds[slot].is_pending) { |
| assert(false); |
| return EINVAL; |
| } |
| |
| /* Another page event may arrive before the previous command completes. |
| * Save the new work here; it is posted from the completion handler once |
| * the in-flight command finishes. |
| */ |
| pending->buff_in = in; |
| pending->buff_out = out; |
| pending->ilen = ilen; |
| pending->olen = olen; |
| |
| ctx->cmd.cmds[slot].is_pending = true; |
| return 0; |
| } |
| |
| err = mlx5_vfio_cmd_prep_in(ctx, cmd_in, cmd_lay, in, ilen); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_cmd_prep_out(ctx, cmd_out, cmd_lay, olen); |
| if (err) |
| return err; |
| |
| if (async) { |
| ctx->cmd.cmds[slot].in_use = true; |
| ctx->cmd.cmds[slot].curr.ilen = ilen; |
| ctx->cmd.cmds[slot].curr.olen = olen; |
| ctx->cmd.cmds[slot].curr.buff_in = in; |
| ctx->cmd.cmds[slot].curr.buff_out = out; |
| } |
| |
| cmd_lay->status_own = 0x1; |
| |
| udma_to_device_barrier(); |
| mmio_write32_be(&init_seg->cmd_dbell, htobe32(0x1U << slot)); |
| return 0; |
| } |
| |
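| /* Execute a command synchronously: wait for it via the async EQ when |
| * available, otherwise poll the ownership bit. Returns EREMOTEIO when the |
| * device reports a non-OK status in the output. |
| */ |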
| static int mlx5_vfio_cmd_do(struct mlx5_vfio_context *ctx, void *in, |
| int ilen, void *out, int olen, |
| unsigned int slot) |
| { |
| struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay; |
| struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out; |
| int err; |
| |
| pthread_mutex_lock(&ctx->cmd.cmds[slot].lock); |
| err = mlx5_vfio_post_cmd(ctx, in, ilen, out, olen, slot, false); |
| if (err) |
| goto end; |
| |
| if (ctx->have_eq) { |
| err = mlx5_vfio_wait_event(ctx, slot); |
| if (err) |
| goto end; |
| } else { |
| err = mlx5_vfio_poll_timeout(cmd_lay); |
| if (err) |
| goto end; |
| udma_from_device_barrier(); |
| } |
| |
| err = mlx5_copy_from_msg(out, cmd_out, olen, cmd_lay); |
| if (err) |
| goto end; |
| |
| if (DEVX_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) |
| err = EREMOTEIO; |
| |
| end: |
| pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock); |
| return err; |
| } |
| |
| static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in, |
| int ilen, void *out, int olen, |
| unsigned int slot) |
| { |
| int err; |
| |
| err = mlx5_vfio_cmd_do(ctx, in, ilen, out, olen, slot); |
| if (err != EREMOTEIO) |
| return err; |
| |
| return mlx5_vfio_cmd_check(ctx, in, out); |
| } |
| |
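| /* Enable PCI memory space and bus mastering for the device by writing the |
| * PCI COMMAND register through the VFIO config-space region. |
| */ |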
| static int mlx5_vfio_enable_pci_cmd(struct mlx5_vfio_context *ctx) |
| { |
| struct vfio_region_info pci_config_reg = {}; |
| uint16_t pci_com_buf = 0x6; |
| char buffer[4096]; |
| |
| pci_config_reg.argsz = sizeof(pci_config_reg); |
| pci_config_reg.index = VFIO_PCI_CONFIG_REGION_INDEX; |
| |
| if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, &pci_config_reg)) |
| return -1; |
| |
| if (pwrite(ctx->device_fd, &pci_com_buf, 2, pci_config_reg.offset + 0x4) != 2) |
| return -1; |
| |
| if (pread(ctx->device_fd, buffer, pci_config_reg.size, pci_config_reg.offset) |
| != pci_config_reg.size) |
| return -1; |
| |
| return 0; |
| } |
| |
| static void free_cmd_box(struct mlx5_vfio_context *ctx, |
| struct mlx5_cmd_mailbox *mailbox) |
| { |
| mlx5_vfio_unregister_mem(ctx, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE); |
| iset_insert_range(ctx->iova_alloc, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE); |
| free(mailbox->buf); |
| free(mailbox); |
| } |
| |
| static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_cmd_mailbox *mailbox; |
| int ret; |
| |
| mailbox = calloc(1, sizeof(*mailbox)); |
| if (!mailbox) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| ret = posix_memalign(&mailbox->buf, MLX5_ADAPTER_PAGE_SIZE, |
| MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) { |
| errno = ret; |
| goto err_free; |
| } |
| |
| memset(mailbox->buf, 0, MLX5_ADAPTER_PAGE_SIZE); |
| |
| ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE, |
| &mailbox->iova, MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) |
| goto err_tree; |
| |
| ret = mlx5_vfio_register_mem(ctx, mailbox->buf, mailbox->iova, |
| MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) |
| goto err_reg; |
| |
| return mailbox; |
| |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, mailbox->iova, |
| MLX5_ADAPTER_PAGE_SIZE); |
| err_tree: |
| free(mailbox->buf); |
| err_free: |
| free(mailbox); |
| return NULL; |
| } |
| |
| static int mlx5_calc_cmd_blocks(uint32_t msg_len) |
| { |
| int size = msg_len; |
| int blen = size - min_t(int, 16, size); |
| |
| return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE); |
| } |
| |
| static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx, |
| struct mlx5_cmd_msg *msg) |
| { |
| struct mlx5_cmd_mailbox *head = msg->next; |
| struct mlx5_cmd_mailbox *next; |
| |
| while (head) { |
| next = head->next; |
| free_cmd_box(ctx, head); |
| head = next; |
| } |
| msg->len = 0; |
| } |
| |
| static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx, |
| uint32_t size, struct mlx5_cmd_msg *msg) |
| { |
| struct mlx5_cmd_mailbox *tmp, *head = NULL; |
| struct mlx5_cmd_block *block; |
| int i, num_blocks; |
| |
| msg->len = size; |
| num_blocks = mlx5_calc_cmd_blocks(size); |
| |
| for (i = 0; i < num_blocks; i++) { |
| tmp = alloc_cmd_box(ctx); |
| if (!tmp) |
| goto err_alloc; |
| |
| block = tmp->buf; |
| tmp->next = head; |
| block->next = htobe64(tmp->next ? tmp->next->iova : 0); |
| block->block_num = htobe32(num_blocks - i - 1); |
| head = tmp; |
| } |
| msg->next = head; |
| return 0; |
| |
| err_alloc: |
| while (head) { |
| tmp = head->next; |
| free_cmd_box(ctx, head); |
| head = tmp; |
| } |
| msg->len = 0; |
| return -1; |
| } |
| |
| static void mlx5_vfio_free_cmd_slot(struct mlx5_vfio_context *ctx, int slot) |
| { |
| struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot]; |
| |
| mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in); |
| mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out); |
| close(cmd_slot->completion_event_fd); |
| } |
| |
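| /* Initialize one command slot: allocate its input/output mailbox chains, |
| * point the command layout entry at them and create a completion eventfd. |
| * The last slot is reserved for asynchronous page-request commands. |
| */ |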
| static int mlx5_vfio_setup_cmd_slot(struct mlx5_vfio_context *ctx, int slot) |
| { |
| struct mlx5_vfio_cmd *cmd = &ctx->cmd; |
| struct mlx5_vfio_cmd_slot *cmd_slot = &cmd->cmds[slot]; |
| struct mlx5_cmd_layout *cmd_lay; |
| int ret; |
| |
| ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->in); |
| if (ret) |
| return ret; |
| |
| ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->out); |
| if (ret) |
| goto err; |
| |
| cmd_lay = cmd->vaddr + (slot * (1 << cmd->log_stride)); |
| cmd_lay->type = MLX5_PCI_CMD_XPORT; |
| cmd_lay->iptr = htobe64(cmd_slot->in.next->iova); |
| cmd_lay->optr = htobe64(cmd_slot->out.next->iova); |
| |
| cmd_slot->lay = cmd_lay; |
| cmd_slot->completion_event_fd = eventfd(0, EFD_CLOEXEC); |
| if (cmd_slot->completion_event_fd < 0) { |
| ret = -1; |
| goto err_fd; |
| } |
| |
| if (slot != MLX5_MAX_COMMANDS - 1) |
| cmd_slot->comp_func = mlx5_vfio_cmd_comp; |
| else |
| cmd_slot->comp_func = mlx5_vfio_process_page_request_comp; |
| |
| pthread_mutex_init(&cmd_slot->lock, NULL); |
| |
| return 0; |
| |
| err_fd: |
| mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out); |
| err: |
| mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in); |
| return ret; |
| } |
| |
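| /* Bring up the command interface: validate the command interface revision, |
| * allocate and DMA-map the command queue page, publish its address to the |
| * device and set up the command slots used by the driver. |
| */ |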
| static int mlx5_vfio_init_cmd_interface(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_init_seg *init_seg = ctx->bar_map; |
| struct mlx5_vfio_cmd *cmd = &ctx->cmd; |
| uint16_t cmdif_rev; |
| uint32_t cmd_h, cmd_l; |
| int ret; |
| |
| cmdif_rev = be32toh(init_seg->cmdif_rev_fw_sub) >> 16; |
| |
| if (cmdif_rev != 5) { |
| errno = EINVAL; |
| return -1; |
| } |
| |
| cmd_l = be32toh(init_seg->cmdq_addr_l_sz) & 0xff; |
| ctx->cmd.log_sz = cmd_l >> 4 & 0xf; |
| ctx->cmd.log_stride = cmd_l & 0xf; |
| if (1 << ctx->cmd.log_sz > MLX5_MAX_COMMANDS) { |
| errno = EINVAL; |
| return -1; |
| } |
| |
| if (ctx->cmd.log_sz + ctx->cmd.log_stride > MLX5_ADAPTER_PAGE_SHIFT) { |
| errno = EINVAL; |
| return -1; |
| } |
| |
| /* The initial address must be 4K aligned */ |
| ret = posix_memalign(&cmd->vaddr, MLX5_ADAPTER_PAGE_SIZE, |
| MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) { |
| errno = ret; |
| return -1; |
| } |
| |
| memset(cmd->vaddr, 0, MLX5_ADAPTER_PAGE_SIZE); |
| |
| ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE, |
| &cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) |
| goto err_free; |
| |
| ret = mlx5_vfio_register_mem(ctx, cmd->vaddr, cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| if (ret) |
| goto err_reg; |
| |
| cmd_h = (uint32_t)((uint64_t)(cmd->iova) >> 32); |
| cmd_l = (uint32_t)(uint64_t)(cmd->iova); |
| |
| init_seg->cmdq_addr_h = htobe32(cmd_h); |
| init_seg->cmdq_addr_l_sz = htobe32(cmd_l); |
| |
| /* Make sure firmware sees the complete address before we proceed */ |
| udma_to_device_barrier(); |
| |
| ret = mlx5_vfio_setup_cmd_slot(ctx, 0); |
| if (ret) |
| goto err_slot_0; |
| |
| ret = mlx5_vfio_setup_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1); |
| if (ret) |
| goto err_slot_1; |
| |
| ret = mlx5_vfio_enable_pci_cmd(ctx); |
| if (!ret) |
| return 0; |
| |
| mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1); |
| err_slot_1: |
| mlx5_vfio_free_cmd_slot(ctx, 0); |
| err_slot_0: |
| mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| err_free: |
| free(cmd->vaddr); |
| return ret; |
| } |
| |
| static void mlx5_vfio_clean_cmd_interface(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_vfio_cmd *cmd = &ctx->cmd; |
| |
| mlx5_vfio_free_cmd_slot(ctx, 0); |
| mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1); |
| mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE); |
| free(cmd->vaddr); |
| } |
| |
| static void set_iova_min_page_size(struct mlx5_vfio_context *ctx, |
| uint64_t iova_pgsizes) |
| { |
| int i; |
| |
| for (i = MLX5_ADAPTER_PAGE_SHIFT; i < 64; i++) { |
| if (iova_pgsizes & (1ULL << i)) { |
| ctx->iova_min_page_size = 1ULL << i; |
| return; |
| } |
| } |
| |
| assert(false); |
| } |
| |
| /* If the kernel does not report usable IOVA regions, fall back to the legacy ranges below */ |
| #define MLX5_VFIO_IOVA_MIN1 0x10000ULL |
| #define MLX5_VFIO_IOVA_MAX1 0xFEDFFFFFULL |
| #define MLX5_VFIO_IOVA_MIN2 0xFEF00000ULL |
| #define MLX5_VFIO_IOVA_MAX2 ((1ULL << 39) - 1) |
| |
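| /* Query the type1 IOMMU for its supported page sizes and usable IOVA |
| * ranges, seeding the IOVA allocator; fall back to the legacy ranges when |
| * the kernel does not report any. |
| */ |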
| static int mlx5_vfio_get_iommu_info(struct mlx5_vfio_context *ctx) |
| { |
| struct vfio_iommu_type1_info *info; |
| int ret, i; |
| void *ptr; |
| uint32_t offset; |
| |
| info = calloc(1, sizeof(*info)); |
| if (!info) { |
| errno = ENOMEM; |
| return -1; |
| } |
| |
| info->argsz = sizeof(*info); |
| ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info); |
| if (ret) |
| goto end; |
| |
| if (info->argsz > sizeof(*info)) { |
| struct vfio_iommu_type1_info *tmp; |
| |
| tmp = realloc(info, info->argsz); |
| if (!tmp) { |
| errno = ENOMEM; |
| ret = -1; |
| goto end; |
| } |
| info = tmp; |
| |
| ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info); |
| if (ret) |
| goto end; |
| } |
| |
| set_iova_min_page_size(ctx, (info->flags & VFIO_IOMMU_INFO_PGSIZES) ? |
| info->iova_pgsizes : 4096); |
| |
| if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) |
| goto set_legacy; |
| |
| offset = info->cap_offset; |
| while (offset) { |
| struct vfio_iommu_type1_info_cap_iova_range *iova_range; |
| struct vfio_info_cap_header *header; |
| |
| ptr = (void *)info + offset; |
| header = ptr; |
| |
| if (header->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) { |
| offset = header->next; |
| continue; |
| } |
| |
| iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)header; |
| |
| for (i = 0; i < iova_range->nr_iovas; i++) { |
| ret = iset_insert_range(ctx->iova_alloc, iova_range->iova_ranges[i].start, |
| iova_range->iova_ranges[i].end - |
| iova_range->iova_ranges[i].start + 1); |
| if (ret) |
| goto end; |
| } |
| |
| goto end; |
| } |
| |
| set_legacy: |
| ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN1, |
| MLX5_VFIO_IOVA_MAX1 - MLX5_VFIO_IOVA_MIN1 + 1); |
| if (!ret) |
| ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN2, |
| MLX5_VFIO_IOVA_MAX2 - MLX5_VFIO_IOVA_MIN2 + 1); |
| |
| end: |
| free(info); |
| return ret; |
| } |
| |
| static void mlx5_vfio_clean_device_dma(struct mlx5_vfio_context *ctx) |
| { |
| struct page_block *page_block, *tmp; |
| |
| list_for_each_safe(&ctx->mem_alloc.block_list, page_block, |
| tmp, next_block) |
| mlx5_vfio_free_block(ctx, page_block); |
| |
| iset_destroy(ctx->iova_alloc); |
| } |
| |
| static int mlx5_vfio_init_device_dma(struct mlx5_vfio_context *ctx) |
| { |
| ctx->iova_alloc = iset_create(); |
| if (!ctx->iova_alloc) |
| return -1; |
| |
| list_head_init(&ctx->mem_alloc.block_list); |
| pthread_mutex_init(&ctx->mem_alloc.block_list_mutex, NULL); |
| |
| if (mlx5_vfio_get_iommu_info(ctx)) |
| goto err; |
| |
| /* create an initial block of DMA memory ready to be used */ |
| if (!mlx5_vfio_new_block(ctx)) |
| goto err; |
| |
| return 0; |
| err: |
| iset_destroy(ctx->iova_alloc); |
| return -1; |
| } |
| |
| static void mlx5_vfio_uninit_bar0(struct mlx5_vfio_context *ctx) |
| { |
| munmap(ctx->bar_map, ctx->bar_map_size); |
| } |
| |
| static int mlx5_vfio_init_bar0(struct mlx5_vfio_context *ctx) |
| { |
| struct vfio_region_info reg = { .argsz = sizeof(reg) }; |
| void *base; |
| int err; |
| |
| reg.index = 0; |
| err = ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, ®); |
| if (err) |
| return err; |
| |
| base = mmap(NULL, reg.size, PROT_READ | PROT_WRITE, MAP_SHARED, |
| ctx->device_fd, reg.offset); |
| if (base == MAP_FAILED) |
| return -1; |
| |
| ctx->bar_map = (struct mlx5_init_seg *)base; |
| ctx->bar_map_size = reg.size; |
| return 0; |
| } |
| |
| static int mlx5_vfio_msix_set_irqs(struct mlx5_vfio_context *ctx, |
| int start, int count, void *irq_set_buf) |
| { |
| struct vfio_irq_set *irq_set = (struct vfio_irq_set *)irq_set_buf; |
| |
| irq_set->argsz = sizeof(*irq_set) + sizeof(int) * count; |
| irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; |
| irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; |
| irq_set->start = start; |
| irq_set->count = count; |
| |
| return ioctl(ctx->device_fd, VFIO_DEVICE_SET_IRQS, irq_set); |
| } |
| |
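| /* Allocate the MSI-X eventfd table and enable MSI-X through VFIO. Only the |
| * command-completion vector gets an eventfd here; the remaining entries |
| * stay at -1 until they are needed. |
| */ |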
| static int mlx5_vfio_init_async_fd(struct mlx5_vfio_context *ctx) |
| { |
| struct vfio_irq_info irq = { .argsz = sizeof(irq) }; |
| struct vfio_irq_set *irq_set_buf; |
| int fdlen, i; |
| |
| irq.index = VFIO_PCI_MSIX_IRQ_INDEX; |
| if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq)) |
| return -1; |
| |
| /* fail if this vector cannot be used with eventfd */ |
| if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0) |
| return -1; |
| |
| fdlen = sizeof(int) * irq.count; |
| ctx->msix_fds = calloc(1, fdlen); |
| if (!ctx->msix_fds) { |
| errno = ENOMEM; |
| return -1; |
| } |
| |
| for (i = 0; i < irq.count; i++) |
| ctx->msix_fds[i] = -1; |
| |
| /* set up an eventfd for command completion interrupts */ |
| ctx->cmd_comp_fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); |
| if (ctx->cmd_comp_fd < 0) |
| goto err_eventfd; |
| |
| ctx->msix_fds[MLX5_VFIO_CMD_VEC_IDX] = ctx->cmd_comp_fd; |
| |
| irq_set_buf = calloc(1, sizeof(*irq_set_buf) + fdlen); |
| if (!irq_set_buf) { |
| errno = ENOMEM; |
| goto err_irq_set_buf; |
| } |
| |
| /* Enable MSI-X interrupts. The first VFIO_DEVICE_SET_IRQS call must pass |
| * the maximum number of vectors we will ever need. |
| */ |
| memcpy(irq_set_buf->data, ctx->msix_fds, fdlen); |
| if (mlx5_vfio_msix_set_irqs(ctx, 0, irq.count, irq_set_buf)) |
| goto err_msix; |
| |
| free(irq_set_buf); |
| pthread_mutex_init(&ctx->msix_fds_lock, NULL); |
| ctx->vctx.context.num_comp_vectors = irq.count; |
| return 0; |
| |
| err_msix: |
| free(irq_set_buf); |
| err_irq_set_buf: |
| close(ctx->cmd_comp_fd); |
| err_eventfd: |
| free(ctx->msix_fds); |
| return -1; |
| } |
| |
| static void mlx5_vfio_close_fds(struct mlx5_vfio_context *ctx) |
| { |
| int vec; |
| |
| close(ctx->device_fd); |
| close(ctx->container_fd); |
| close(ctx->group_fd); |
| |
| pthread_mutex_lock(&ctx->msix_fds_lock); |
| for (vec = 0; vec < ctx->vctx.context.num_comp_vectors; vec++) |
| if (ctx->msix_fds[vec] >= 0) |
| close(ctx->msix_fds[vec]); |
| |
| free(ctx->msix_fds); |
| pthread_mutex_unlock(&ctx->msix_fds_lock); |
| } |
| |
| static int mlx5_vfio_open_fds(struct mlx5_vfio_context *ctx, |
| struct mlx5_vfio_device *mdev) |
| { |
| struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; |
| |
| /* Create a new container */ |
| ctx->container_fd = open("/dev/vfio/vfio", O_RDWR); |
| |
| if (ctx->container_fd < 0) |
| return -1; |
| |
| if (ioctl(ctx->container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) |
| goto close_cont; |
| |
| if (!ioctl(ctx->container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) |
| /* Doesn't support the IOMMU driver we want. */ |
| goto close_cont; |
| |
| /* Open the group */ |
| ctx->group_fd = open(mdev->vfio_path, O_RDWR); |
| if (ctx->group_fd < 0) |
| goto close_cont; |
| |
| /* Test the group is viable and available */ |
| if (ioctl(ctx->group_fd, VFIO_GROUP_GET_STATUS, &group_status)) |
| goto close_group; |
| |
| if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { |
| /* Group is not viable (i.e., not all devices in the group are bound to vfio) */ |
| errno = EINVAL; |
| goto close_group; |
| } |
| |
| /* Add the group to the container */ |
| if (ioctl(ctx->group_fd, VFIO_GROUP_SET_CONTAINER, &ctx->container_fd)) |
| goto close_group; |
| |
| /* Enable the IOMMU model we want */ |
| if (ioctl(ctx->container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU)) |
| goto close_group; |
| |
| /* Get a file descriptor for the device */ |
| ctx->device_fd = ioctl(ctx->group_fd, VFIO_GROUP_GET_DEVICE_FD, |
| mdev->pci_name); |
| if (ctx->device_fd < 0) |
| goto close_group; |
| |
| if (mlx5_vfio_init_async_fd(ctx)) |
| goto close_group; |
| |
| return 0; |
| |
| close_group: |
| close(ctx->group_fd); |
| close_cont: |
| close(ctx->container_fd); |
| return -1; |
| } |
| |
| enum { |
| MLX5_EQE_OWNER_INIT_VAL = 0x1, |
| }; |
| |
| static void init_eq_buf(struct mlx5_eq *eq) |
| { |
| struct mlx5_eqe *eqe; |
| int i; |
| |
| for (i = 0; i < eq->nent; i++) { |
| eqe = get_eqe(eq, i); |
| eqe->owner = MLX5_EQE_OWNER_INIT_VAL; |
| } |
| } |
| |
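| /* The EQ doorbell is accessed through the mapped BAR, so the value stored |
| * as the EQs UAR "iova" is really the host-virtual address of that UAR |
| * page within bar_map. |
| */ |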
| static uint64_t uar2iova(struct mlx5_vfio_context *ctx, uint32_t index) |
| { |
| return (uint64_t)(uintptr_t)((void *)ctx->bar_map + (index * MLX5_ADAPTER_PAGE_SIZE)); |
| } |
| |
| static int mlx5_vfio_alloc_uar(struct mlx5_vfio_context *ctx, uint32_t *uarn) |
| { |
| uint32_t out[DEVX_ST_SZ_DW(alloc_uar_out)] = {}; |
| uint32_t in[DEVX_ST_SZ_DW(alloc_uar_in)] = {}; |
| int err; |
| |
| DEVX_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR); |
| err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (!err) |
| *uarn = DEVX_GET(alloc_uar_out, out, uar); |
| |
| return err; |
| } |
| |
| static void mlx5_vfio_dealloc_uar(struct mlx5_vfio_context *ctx, uint32_t uarn) |
| { |
| uint32_t out[DEVX_ST_SZ_DW(dealloc_uar_out)] = {}; |
| uint32_t in[DEVX_ST_SZ_DW(dealloc_uar_in)] = {}; |
| |
| DEVX_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR); |
| DEVX_SET(dealloc_uar_in, in, uar, uarn); |
| mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| } |
| |
| static void mlx5_vfio_destroy_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq) |
| { |
| uint32_t in[DEVX_ST_SZ_DW(destroy_eq_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(destroy_eq_out)] = {}; |
| |
| DEVX_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); |
| DEVX_SET(destroy_eq_in, in, eq_number, eq->eqn); |
| |
| mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size); |
| iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size); |
| free(eq->vaddr); |
| } |
| |
| static void destroy_async_eqs(struct mlx5_vfio_context *ctx) |
| { |
| ctx->have_eq = false; |
| mlx5_vfio_destroy_eq(ctx, &ctx->async_eq); |
| mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn); |
| } |
| |
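| /* Allocate and DMA-map an EQ buffer, then issue CREATE_EQ with the |
| * requested event bitmask, interrupt vector and UAR page; the doorbell |
| * address is derived from the EQs UAR on success. |
| */ |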
| static int |
| create_map_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq, |
| struct mlx5_eq_param *param) |
| { |
| uint32_t out[DEVX_ST_SZ_DW(create_eq_out)] = {}; |
| uint8_t vecidx = param->irq_index; |
| __be64 *pas; |
| void *eqc; |
| int inlen; |
| uint32_t *in; |
| int err; |
| int i; |
| int alloc_size; |
| |
| pthread_mutex_init(&ctx->eq_lock, NULL); |
| eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE); |
| eq->cons_index = 0; |
| alloc_size = eq->nent * MLX5_EQE_SIZE; |
| eq->iova_size = max(roundup_pow_of_two(alloc_size), ctx->iova_min_page_size); |
| |
| inlen = DEVX_ST_SZ_BYTES(create_eq_in) + |
| DEVX_FLD_SZ_BYTES(create_eq_in, pas[0]) * 1; |
| |
| in = calloc(1, inlen); |
| if (!in) |
| return ENOMEM; |
| |
| pas = (__be64 *)DEVX_ADDR_OF(create_eq_in, in, pas); |
| |
| err = posix_memalign(&eq->vaddr, eq->iova_size, alloc_size); |
| if (err) { |
| errno = err; |
| goto end; |
| } |
| |
| err = iset_alloc_range(ctx->iova_alloc, eq->iova_size, |
| &eq->iova, eq->iova_size); |
| if (err) |
| goto err_range; |
| |
| err = mlx5_vfio_register_mem(ctx, eq->vaddr, eq->iova, eq->iova_size); |
| if (err) |
| goto err_reg; |
| |
| pas[0] = htobe64(eq->iova); |
| init_eq_buf(eq); |
| DEVX_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); |
| |
| for (i = 0; i < 4; i++) |
| DEVX_ARRAY_SET64(create_eq_in, in, event_bitmask, i, |
| param->mask[i]); |
| |
| eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry); |
| DEVX_SET(eqc, eqc, log_eq_size, ilog32(eq->nent - 1)); |
| DEVX_SET(eqc, eqc, uar_page, ctx->eqs_uar.uarn); |
| DEVX_SET(eqc, eqc, intr, vecidx); |
| DEVX_SET(eqc, eqc, log_page_size, ilog32(eq->iova_size - 1) - MLX5_ADAPTER_PAGE_SHIFT); |
| |
| err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0); |
| if (err) |
| goto err_cmd; |
| |
| eq->vecidx = vecidx; |
| eq->eqn = DEVX_GET(create_eq_out, out, eq_number); |
| eq->doorbell = (void *)(uintptr_t)ctx->eqs_uar.iova + MLX5_EQ_DOORBEL_OFFSET; |
| |
| free(in); |
| return 0; |
| |
| err_cmd: |
| mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size); |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size); |
| err_range: |
| free(eq->vaddr); |
| end: |
| free(in); |
| return err; |
| } |
| |
| static int |
| setup_async_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq_param *param, |
| struct mlx5_eq *eq) |
| { |
| int err; |
| |
| err = create_map_eq(ctx, eq, param); |
| if (err) |
| return err; |
| |
| eq_update_ci(eq, 0, 1); |
| |
| return 0; |
| } |
| |
| static int create_async_eqs(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_eq_param param = {}; |
| int err; |
| |
| err = mlx5_vfio_alloc_uar(ctx, &ctx->eqs_uar.uarn); |
| if (err) |
| return err; |
| |
| ctx->eqs_uar.iova = uar2iova(ctx, ctx->eqs_uar.uarn); |
| |
| param = (struct mlx5_eq_param) { |
| .irq_index = MLX5_VFIO_CMD_VEC_IDX, |
| .nent = MLX5_NUM_CMD_EQE, |
| .mask[0] = 1ull << MLX5_EVENT_TYPE_CMD | |
| 1ull << MLX5_EVENT_TYPE_PAGE_REQUEST, |
| }; |
| |
| err = setup_async_eq(ctx, ¶m, &ctx->async_eq); |
| if (err) |
| goto err; |
| |
| ctx->have_eq = true; |
| return 0; |
| err: |
| mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn); |
| return err; |
| } |
| |
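| /* Ask firmware to return npages pages with MANAGE_PAGES(TAKE). The command |
| * is posted asynchronously on the page-request slot; the reclaimed IOVAs |
| * are released by the completion handler. |
| */ |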
| static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id, |
| int npages) |
| { |
| uint32_t inlen = DEVX_ST_SZ_BYTES(manage_pages_in); |
| int outlen; |
| uint32_t *out; |
| void *in; |
| int err; |
| int slot = MLX5_MAX_COMMANDS - 1; |
| |
| outlen = DEVX_ST_SZ_BYTES(manage_pages_out); |
| |
| outlen += npages * DEVX_FLD_SZ_BYTES(manage_pages_out, pas[0]); |
| out = calloc(1, outlen); |
| if (!out) { |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| in = calloc(1, inlen); |
| if (!in) { |
| err = ENOMEM; |
| errno = err; |
| goto out_free; |
| } |
| |
| DEVX_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); |
| DEVX_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE); |
| DEVX_SET(manage_pages_in, in, function_id, func_id); |
| DEVX_SET(manage_pages_in, in, input_num_entries, npages); |
| |
| pthread_mutex_lock(&ctx->cmd.cmds[slot].lock); |
| err = mlx5_vfio_post_cmd(ctx, in, inlen, out, outlen, slot, true); |
| pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock); |
| if (!err) |
| return 0; |
| |
| free(in); |
| out_free: |
| free(out); |
| return err; |
| } |
| |
| static int mlx5_vfio_enable_hca(struct mlx5_vfio_context *ctx) |
| { |
| uint32_t in[DEVX_ST_SZ_DW(enable_hca_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(enable_hca_out)] = {}; |
| |
| DEVX_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA); |
| return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| } |
| |
| static int mlx5_vfio_set_issi(struct mlx5_vfio_context *ctx) |
| { |
| uint32_t query_in[DEVX_ST_SZ_DW(query_issi_in)] = {}; |
| uint32_t query_out[DEVX_ST_SZ_DW(query_issi_out)] = {}; |
| uint32_t set_in[DEVX_ST_SZ_DW(set_issi_in)] = {}; |
| uint32_t set_out[DEVX_ST_SZ_DW(set_issi_out)] = {}; |
| uint32_t sup_issi; |
| int err; |
| |
| DEVX_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); |
| err = mlx5_vfio_cmd_exec(ctx, query_in, sizeof(query_in), query_out, |
| sizeof(query_out), 0); |
| if (err) |
| return err; |
| |
| sup_issi = DEVX_GET(query_issi_out, query_out, supported_issi_dw0); |
| |
| if (!(sup_issi & (1 << 1))) { |
| errno = EOPNOTSUPP; |
| return errno; |
| } |
| |
| DEVX_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI); |
| DEVX_SET(set_issi_in, set_in, current_issi, 1); |
| return mlx5_vfio_cmd_exec(ctx, set_in, sizeof(set_in), set_out, |
| sizeof(set_out), 0); |
| } |
| |
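| /* Hand npages device pages to firmware with MANAGE_PAGES(GIVE) on the |
| * page-request slot: posted asynchronously when called from the page-request |
| * event handler, executed synchronously otherwise. |
| */ |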
| static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, |
| uint16_t func_id, |
| int32_t npages, |
| bool is_event) |
| { |
| int32_t out[DEVX_ST_SZ_DW(manage_pages_out)] = {}; |
| int inlen = DEVX_ST_SZ_BYTES(manage_pages_in); |
| int slot = MLX5_MAX_COMMANDS - 1; |
| void *outp = out; |
| int i, err; |
| int32_t *in; |
| uint64_t iova; |
| |
| inlen += npages * DEVX_FLD_SZ_BYTES(manage_pages_in, pas[0]); |
| in = calloc(1, inlen); |
| if (!in) { |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| if (is_event) { |
| outp = calloc(1, sizeof(out)); |
| if (!outp) { |
| errno = ENOMEM; |
| err = errno; |
| goto end; |
| } |
| } |
| |
| for (i = 0; i < npages; i++) { |
| err = mlx5_vfio_alloc_page(ctx, &iova); |
| if (err) |
| goto err; |
| |
| DEVX_ARRAY_SET64(manage_pages_in, in, pas, i, iova); |
| } |
| |
| DEVX_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES); |
| DEVX_SET(manage_pages_in, in, op_mod, MLX5_PAGES_GIVE); |
| DEVX_SET(manage_pages_in, in, function_id, func_id); |
| DEVX_SET(manage_pages_in, in, input_num_entries, npages); |
| |
| if (is_event) { |
| pthread_mutex_lock(&ctx->cmd.cmds[slot].lock); |
| err = mlx5_vfio_post_cmd(ctx, in, inlen, outp, sizeof(out), slot, true); |
| pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock); |
| } else { |
| err = mlx5_vfio_cmd_exec(ctx, in, inlen, outp, sizeof(out), slot); |
| } |
| |
| if (!err) { |
| if (is_event) |
| return 0; |
| goto end; |
| } |
| err: |
| if (is_event) |
| free(outp); |
| for (i--; i >= 0; i--) |
| mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_in, in, pas[i])); |
| end: |
| free(in); |
| return err; |
| } |
| |
| static int mlx5_vfio_query_pages(struct mlx5_vfio_context *ctx, int boot, |
| uint16_t *func_id, int32_t *npages) |
| { |
| uint32_t query_pages_in[DEVX_ST_SZ_DW(query_pages_in)] = {}; |
| uint32_t query_pages_out[DEVX_ST_SZ_DW(query_pages_out)] = {}; |
| int ret; |
| |
| DEVX_SET(query_pages_in, query_pages_in, opcode, MLX5_CMD_OP_QUERY_PAGES); |
| DEVX_SET(query_pages_in, query_pages_in, op_mod, boot ? 0x01 : 0x02); |
| |
| ret = mlx5_vfio_cmd_exec(ctx, query_pages_in, sizeof(query_pages_in), |
| query_pages_out, sizeof(query_pages_out), 0); |
| if (ret) |
| return ret; |
| |
| *npages = DEVX_GET(query_pages_out, query_pages_out, num_pages); |
| *func_id = DEVX_GET(query_pages_out, query_pages_out, function_id); |
| |
| return 0; |
| } |
| |
| static int mlx5_vfio_satisfy_startup_pages(struct mlx5_vfio_context *ctx, |
| int boot) |
| { |
| uint16_t function_id; |
| int32_t npages = 0; |
| int ret; |
| |
| ret = mlx5_vfio_query_pages(ctx, boot, &function_id, &npages); |
| if (ret) |
| return ret; |
| |
| return mlx5_vfio_give_pages(ctx, function_id, npages, false); |
| } |
| |
| static int mlx5_vfio_access_reg(struct mlx5_vfio_context *ctx, void *data_in, |
| int size_in, void *data_out, int size_out, |
| uint16_t reg_id, int arg, int write) |
| { |
| int outlen = DEVX_ST_SZ_BYTES(access_register_out) + size_out; |
| int inlen = DEVX_ST_SZ_BYTES(access_register_in) + size_in; |
| int err = ENOMEM; |
| uint32_t *out = NULL; |
| uint32_t *in = NULL; |
| void *data; |
| |
| in = calloc(1, inlen); |
| out = calloc(1, outlen); |
| if (!in || !out) { |
| errno = ENOMEM; |
| goto out; |
| } |
| |
| data = DEVX_ADDR_OF(access_register_in, in, register_data); |
| memcpy(data, data_in, size_in); |
| |
| DEVX_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG); |
| DEVX_SET(access_register_in, in, op_mod, !write); |
| DEVX_SET(access_register_in, in, argument, arg); |
| DEVX_SET(access_register_in, in, register_id, reg_id); |
| |
| err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, outlen, 0); |
| if (err) |
| goto out; |
| |
| data = DEVX_ADDR_OF(access_register_out, out, register_data); |
| memcpy(data_out, data, size_out); |
| |
| out: |
| free(out); |
| free(in); |
| return err; |
| } |
| |
| static int mlx5_vfio_get_caps_mode(struct mlx5_vfio_context *ctx, |
| enum mlx5_cap_type cap_type, |
| enum mlx5_cap_mode cap_mode) |
| { |
| uint8_t in[DEVX_ST_SZ_BYTES(query_hca_cap_in)] = {}; |
| int out_sz = DEVX_ST_SZ_BYTES(query_hca_cap_out); |
| void *out, *hca_caps; |
| uint16_t opmod = (cap_type << 1) | (cap_mode & 0x01); |
| int err; |
| |
| out = calloc(1, out_sz); |
| if (!out) { |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); |
| DEVX_SET(query_hca_cap_in, in, op_mod, opmod); |
| err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, out_sz, 0); |
| if (err) |
| goto query_ex; |
| |
| hca_caps = DEVX_ADDR_OF(query_hca_cap_out, out, capability); |
| |
| switch (cap_mode) { |
| case HCA_CAP_OPMOD_GET_MAX: |
| memcpy(ctx->caps.hca_max[cap_type], hca_caps, |
| DEVX_UN_SZ_BYTES(hca_cap_union)); |
| break; |
| case HCA_CAP_OPMOD_GET_CUR: |
| memcpy(ctx->caps.hca_cur[cap_type], hca_caps, |
| DEVX_UN_SZ_BYTES(hca_cap_union)); |
| break; |
| default: |
| err = EINVAL; |
| assert(false); |
| break; |
| } |
| |
| query_ex: |
| free(out); |
| return err; |
| } |
| |
| enum mlx5_vport_roce_state { |
| MLX5_VPORT_ROCE_DISABLED = 0, |
| MLX5_VPORT_ROCE_ENABLED = 1, |
| }; |
| |
| static int mlx5_vfio_nic_vport_update_roce_state(struct mlx5_vfio_context *ctx, |
| enum mlx5_vport_roce_state state) |
| { |
| uint32_t out[DEVX_ST_SZ_DW(modify_nic_vport_context_out)] = {}; |
| int inlen = DEVX_ST_SZ_BYTES(modify_nic_vport_context_in); |
| void *in; |
| int err; |
| |
| in = calloc(1, inlen); |
| if (!in) { |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| DEVX_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); |
| DEVX_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, |
| state); |
| DEVX_SET(modify_nic_vport_context_in, in, opcode, |
| MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT); |
| |
| err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0); |
| |
| free(in); |
| |
| return err; |
| } |
| |
| static int mlx5_vfio_get_caps(struct mlx5_vfio_context *ctx, enum mlx5_cap_type cap_type) |
| { |
| int ret; |
| |
| ret = mlx5_vfio_get_caps_mode(ctx, cap_type, HCA_CAP_OPMOD_GET_CUR); |
| if (ret) |
| return ret; |
| |
| return mlx5_vfio_get_caps_mode(ctx, cap_type, HCA_CAP_OPMOD_GET_MAX); |
| } |
| |
| static int handle_hca_cap_roce(struct mlx5_vfio_context *ctx, void *set_ctx, |
| int ctx_size) |
| { |
| int err; |
| uint32_t out[DEVX_ST_SZ_DW(set_hca_cap_out)] = {}; |
| void *set_hca_cap; |
| |
| if (!MLX5_VFIO_CAP_GEN(ctx, roce)) |
| return 0; |
| |
| err = mlx5_vfio_get_caps(ctx, MLX5_CAP_ROCE); |
| if (err) |
| return err; |
| |
| if (MLX5_VFIO_CAP_ROCE(ctx, sw_r_roce_src_udp_port) || |
| !MLX5_VFIO_CAP_ROCE_MAX(ctx, sw_r_roce_src_udp_port)) |
| return 0; |
| |
| set_hca_cap = DEVX_ADDR_OF(set_hca_cap_in, set_ctx, capability); |
| memcpy(set_hca_cap, ctx->caps.hca_cur[MLX5_CAP_ROCE], |
| DEVX_ST_SZ_BYTES(roce_cap)); |
| DEVX_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1); |
| DEVX_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP); |
| DEVX_SET(set_hca_cap_in, set_ctx, op_mod, MLX5_SET_HCA_CAP_OP_MOD_ROCE); |
| return mlx5_vfio_cmd_exec(ctx, set_ctx, ctx_size, out, sizeof(out), 0); |
| } |
| |
| static int handle_hca_cap(struct mlx5_vfio_context *ctx, void *set_ctx, int set_sz) |
| { |
| struct mlx5_vfio_device *dev = to_mvfio_dev(ctx->vctx.context.device); |
| int sys_page_shift = ilog32(dev->page_size - 1); |
| uint32_t out[DEVX_ST_SZ_DW(set_hca_cap_out)] = {}; |
| void *set_hca_cap; |
| int err; |
| |
| err = mlx5_vfio_get_caps(ctx, MLX5_CAP_GENERAL); |
| if (err) |
| return err; |
| |
| set_hca_cap = DEVX_ADDR_OF(set_hca_cap_in, set_ctx, |
| capability); |
| memcpy(set_hca_cap, ctx->caps.hca_cur[MLX5_CAP_GENERAL], |
| DEVX_ST_SZ_BYTES(cmd_hca_cap)); |
| |
| /* disable cmdif checksum */ |
| DEVX_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0); |
| |
| if (dev->flags & MLX5DV_VFIO_CTX_FLAGS_INIT_LINK_DOWN) |
| DEVX_SET(cmd_hca_cap, set_hca_cap, disable_link_up_by_init_hca, 1); |
| |
| DEVX_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, sys_page_shift - 12); |
| |
| if (MLX5_VFIO_CAP_GEN_MAX(ctx, mkey_by_name)) |
| DEVX_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1); |
| |
| DEVX_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP); |
| DEVX_SET(set_hca_cap_in, set_ctx, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); |
| |
| return mlx5_vfio_cmd_exec(ctx, set_ctx, set_sz, out, sizeof(out), 0); |
| } |
| |
| static int set_hca_cap(struct mlx5_vfio_context *ctx) |
| { |
| int set_sz = DEVX_ST_SZ_BYTES(set_hca_cap_in); |
| void *set_ctx; |
| int err; |
| |
| set_ctx = calloc(1, set_sz); |
| if (!set_ctx) { |
| errno = ENOMEM; |
| return errno; |
| } |
| |
| err = handle_hca_cap(ctx, set_ctx, set_sz); |
| if (err) |
| goto out; |
| |
| memset(set_ctx, 0, set_sz); |
| err = handle_hca_cap_roce(ctx, set_ctx, set_sz); |
| out: |
| free(set_ctx); |
| return err; |
| } |
| |
| static int mlx5_vfio_set_hca_ctrl(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_reg_host_endianness he_in = {}; |
| struct mlx5_reg_host_endianness he_out = {}; |
| |
| he_in.he = MLX5_SET_HOST_ENDIANNESS; |
| return mlx5_vfio_access_reg(ctx, &he_in, sizeof(he_in), |
| &he_out, sizeof(he_out), |
| MLX5_REG_HOST_ENDIANNESS, 0, 1); |
| } |
| |
| static int mlx5_vfio_init_hca(struct mlx5_vfio_context *ctx) |
| { |
| uint32_t in[DEVX_ST_SZ_DW(init_hca_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(init_hca_out)] = {}; |
| |
| DEVX_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); |
| return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| } |
| |
| static int fw_initializing(struct mlx5_init_seg *init_seg) |
| { |
| return be32toh(init_seg->initializing) >> 31; |
| } |
| |
| static int wait_fw_init(struct mlx5_init_seg *init_seg, uint32_t max_wait_mili) |
| { |
| int num_loops = max_wait_mili / FW_INIT_WAIT_MS; |
| int loop = 0; |
| |
| while (fw_initializing(init_seg)) { |
| usleep(FW_INIT_WAIT_MS * 1000); |
| loop++; |
| if (loop == num_loops) { |
| errno = EBUSY; |
| return errno; |
| } |
| } |
| |
| return 0; |
| } |
| |
| static int mlx5_vfio_teardown_hca_regular(struct mlx5_vfio_context *ctx) |
| { |
| uint32_t in[DEVX_ST_SZ_DW(teardown_hca_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(teardown_hca_out)] = {}; |
| |
| DEVX_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); |
| DEVX_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE); |
| return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| } |
| |
| enum mlx5_cmd_addr_l_sz_offset { |
| MLX5_NIC_IFC_OFFSET = 8, |
| }; |
| |
| enum { |
| MLX5_NIC_IFC_DISABLED = 1, |
| MLX5_NIC_IFC_SW_RESET = 7, |
| }; |
| |
| static uint8_t mlx5_vfio_get_nic_state(struct mlx5_vfio_context *ctx) |
| { |
| return (be32toh(mmio_read32_be(&ctx->bar_map->cmdq_addr_l_sz)) >> MLX5_NIC_IFC_OFFSET) & 7; |
| } |
| |
| static void mlx5_vfio_set_nic_state(struct mlx5_vfio_context *ctx, uint8_t state) |
| { |
| uint32_t cur_cmdq_addr_l_sz; |
| |
| cur_cmdq_addr_l_sz = be32toh(mmio_read32_be(&ctx->bar_map->cmdq_addr_l_sz)); |
| mmio_write32_be(&ctx->bar_map->cmdq_addr_l_sz, |
| htobe32((cur_cmdq_addr_l_sz & 0xFFFFF000) | |
| state << MLX5_NIC_IFC_OFFSET)); |
| } |
| |
| #define MLX5_FAST_TEARDOWN_WAIT_MS 3000 |
| #define MLX5_FAST_TEARDOWN_WAIT_ONCE_MS 1 |
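| /* Fast teardown: ask firmware to prepare, force the NIC interface state to |
| * DISABLED and wait a bounded time for the device to reach that state. |
| */ |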
| static int mlx5_vfio_teardown_hca_fast(struct mlx5_vfio_context *ctx) |
| { |
| uint32_t out[DEVX_ST_SZ_DW(teardown_hca_out)] = {}; |
| uint32_t in[DEVX_ST_SZ_DW(teardown_hca_in)] = {}; |
| int waited = 0, state, ret; |
| |
| DEVX_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA); |
| DEVX_SET(teardown_hca_in, in, profile, |
| MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN); |
| ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (ret) |
| return ret; |
| |
| state = DEVX_GET(teardown_hca_out, out, state); |
| if (state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) { |
| mlx5_err(ctx->dbg_fp, "teardown with fast mode failed\n"); |
| return EIO; |
| } |
| |
| mlx5_vfio_set_nic_state(ctx, MLX5_NIC_IFC_DISABLED); |
| do { |
| if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_DISABLED) |
| break; |
| usleep(MLX5_FAST_TEARDOWN_WAIT_ONCE_MS * 1000); |
| waited += MLX5_FAST_TEARDOWN_WAIT_ONCE_MS; |
| } while (waited < MLX5_FAST_TEARDOWN_WAIT_MS); |
| |
| if (mlx5_vfio_get_nic_state(ctx) != MLX5_NIC_IFC_DISABLED) { |
| mlx5_err(ctx->dbg_fp, "NIC IFC still %d after %dms.\n", |
| mlx5_vfio_get_nic_state(ctx), waited); |
| return EIO; |
| } |
| |
| return 0; |
| } |
| |
| static int mlx5_vfio_teardown_hca(struct mlx5_vfio_context *ctx) |
| { |
| int err; |
| |
| if (MLX5_VFIO_CAP_GEN(ctx, fast_teardown)) { |
| err = mlx5_vfio_teardown_hca_fast(ctx); |
| if (!err) |
| return 0; |
| } |
| |
| return mlx5_vfio_teardown_hca_regular(ctx); |
| } |
| |
| static bool sensor_pci_not_working(struct mlx5_init_seg *init_seg) |
| { |
| /* Offline PCI reads return 0xffffffff */ |
| return (be32toh(mmio_read32_be(&init_seg->health.fw_ver)) == 0xffffffff); |
| } |
| |
| enum mlx5_fatal_assert_bit_offsets { |
| MLX5_RFR_OFFSET = 31, |
| }; |
| |
| static bool sensor_fw_synd_rfr(struct mlx5_init_seg *init_seg) |
| { |
| uint32_t rfr = be32toh(mmio_read32_be(&init_seg->health.rfr)) >> MLX5_RFR_OFFSET; |
| uint8_t synd = mmio_read8(&init_seg->health.synd); |
| |
| return (rfr && synd); |
| } |
| |
| enum { |
| MLX5_SENSOR_NO_ERR = 0, |
| MLX5_SENSOR_PCI_COMM_ERR = 1, |
| MLX5_SENSOR_NIC_DISABLED = 3, |
| MLX5_SENSOR_NIC_SW_RESET = 4, |
| MLX5_SENSOR_FW_SYND_RFR = 5, |
| }; |
| |
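| /* Check the fatal health sensors: PCI accessibility, NIC interface state |
| * and the firmware syndrome/recovery bits; return the first error found. |
| */ |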
| static uint32_t mlx5_health_check_fatal_sensors(struct mlx5_vfio_context *ctx) |
| { |
| if (sensor_pci_not_working(ctx->bar_map)) |
| return MLX5_SENSOR_PCI_COMM_ERR; |
| |
| if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_DISABLED) |
| return MLX5_SENSOR_NIC_DISABLED; |
| |
| if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_SW_RESET) |
| return MLX5_SENSOR_NIC_SW_RESET; |
| |
| if (sensor_fw_synd_rfr(ctx->bar_map)) |
| return MLX5_SENSOR_FW_SYND_RFR; |
| |
| return MLX5_SENSOR_NO_ERR; |
| } |
| |
| enum { |
| MLX5_HEALTH_SYNDR_FW_ERR = 0x1, |
| MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7, |
| MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8, |
| MLX5_HEALTH_SYNDR_CRC_ERR = 0x9, |
| MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa, |
| MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb, |
| MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc, |
| MLX5_HEALTH_SYNDR_EQ_ERR = 0xd, |
| MLX5_HEALTH_SYNDR_EQ_INV = 0xe, |
| MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf, |
| MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10, |
| }; |
| |
| static const char *hsynd_str(u8 synd) |
| { |
| switch (synd) { |
| case MLX5_HEALTH_SYNDR_FW_ERR: |
| return "firmware internal error"; |
| case MLX5_HEALTH_SYNDR_IRISC_ERR: |
| return "irisc not responding"; |
| case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: |
| return "unrecoverable hardware error"; |
| case MLX5_HEALTH_SYNDR_CRC_ERR: |
| return "firmware CRC error"; |
| case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: |
| return "ICM fetch PCI error"; |
| case MLX5_HEALTH_SYNDR_HW_FTL_ERR: |
| return "HW fatal error"; |
| case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: |
| return "async EQ buffer overrun"; |
| case MLX5_HEALTH_SYNDR_EQ_ERR: |
| return "EQ error"; |
| case MLX5_HEALTH_SYNDR_EQ_INV: |
| return "Invalid EQ referenced"; |
| case MLX5_HEALTH_SYNDR_FFSER_ERR: |
| return "FFSER error"; |
| case MLX5_HEALTH_SYNDR_HIGH_TEMP: |
| return "High temperature"; |
| default: |
| return "unrecognized error"; |
| } |
| } |
| |
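| /* Dump the health buffer (assert info, FW version, syndrome) to the debug file. */ |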
| static void print_health_info(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_init_seg *iseg = ctx->bar_map; |
| struct health_buffer *h = &iseg->health; |
| char fw_str[18] = {}; |
| int i; |
| |
| /* If the syndrome is 0, the device is OK and no need to print buffer */ |
| if (!mmio_read8(&h->synd)) |
| return; |
| |
| for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) |
| mlx5_err(ctx->dbg_fp, "assert_var[%d] 0x%08x\n", |
| i, be32toh(mmio_read32_be(h->assert_var + i))); |
| |
| mlx5_err(ctx->dbg_fp, "assert_exit_ptr 0x%08x\n", |
| be32toh(mmio_read32_be(&h->assert_exit_ptr))); |
| mlx5_err(ctx->dbg_fp, "assert_callra 0x%08x\n", |
| be32toh(mmio_read32_be(&h->assert_callra))); |
| sprintf(fw_str, "%d.%d.%d", |
| be32toh(mmio_read32_be(&iseg->fw_rev)) & 0xffff, |
| be32toh(mmio_read32_be(&iseg->fw_rev)) >> 16, |
| be32toh(mmio_read32_be(&iseg->cmdif_rev_fw_sub)) & 0xffff); |
| mlx5_err(ctx->dbg_fp, "fw_ver %s\n", fw_str); |
| mlx5_err(ctx->dbg_fp, "hw_id 0x%08x\n", be32toh(mmio_read32_be(&h->hw_id))); |
| mlx5_err(ctx->dbg_fp, "irisc_index %d\n", mmio_read8(&h->irisc_index)); |
| mlx5_err(ctx->dbg_fp, "synd 0x%x: %s\n", mmio_read8(&h->synd), |
| hsynd_str(mmio_read8(&h->synd))); |
| mlx5_err(ctx->dbg_fp, "ext_synd 0x%04x\n", |
| be16toh(mmio_read16_be(&h->ext_synd))); |
| mlx5_err(ctx->dbg_fp, "raw fw_ver 0x%08x\n", |
| be32toh(mmio_read32_be(&iseg->fw_rev))); |
| } |
| |
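| /* |
| * Rate-limited health poll: run the fatal sensors and watch the FW health |
| * counter; if a fatal sensor trips, or the counter stops advancing for |
| * MAX_MISSES polls, dump the health buffer and abort since no recovery is |
| * attempted here. |
| */ |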
| static void mlx5_vfio_poll_health(struct mlx5_vfio_context *ctx) |
| { |
| struct mlx5_vfio_health_state *hstate = &ctx->health_state; |
| uint32_t fatal_error, count; |
| struct timeval tv; |
| uint64_t time; |
| int ret; |
| |
| ret = gettimeofday(&tv, NULL); |
| if (ret) |
| return; |
| |
| time = (uint64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000; |
| if (time - hstate->prev_time < POLL_HEALTH_INTERVAL) |
| return; |
| |
| fatal_error = mlx5_health_check_fatal_sensors(ctx); |
| if (fatal_error) { |
| mlx5_err(ctx->dbg_fp, "%s: Fatal error %u detected\n", |
| __func__, fatal_error); |
| goto err; |
| } |
| count = be32toh(mmio_read32_be(&ctx->bar_map->health_counter)) & 0xffffff; |
| if (count == hstate->prev_count) |
| ++hstate->miss_counter; |
| else |
| hstate->miss_counter = 0; |
| |
| hstate->prev_time = time; |
| hstate->prev_count = count; |
| if (hstate->miss_counter == MAX_MISSES) { |
| mlx5_err(ctx->dbg_fp, |
| "device's health compromised - reached miss count\n"); |
| goto err; |
| } |
| |
| return; |
| err: |
| print_health_info(ctx); |
| abort(); |
| } |
| |
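| /* |
| * Bring the function up following the usual mlx5 init sequence: wait for FW |
| * readiness, ENABLE_HCA, negotiate ISSI, supply boot pages, set HCA control |
| * and capabilities, supply init pages, INIT_HCA and finally enable RoCE on |
| * Ethernet ports. |
| */ |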
| static int mlx5_vfio_setup_function(struct mlx5_vfio_context *ctx) |
| { |
| int err; |
| |
| err = wait_fw_init(ctx->bar_map, FW_PRE_INIT_TIMEOUT_MILI); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_enable_hca(ctx); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_set_issi(ctx); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_satisfy_startup_pages(ctx, 1); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_set_hca_ctrl(ctx); |
| if (err) |
| return err; |
| |
| err = set_hca_cap(ctx); |
| if (err) |
| return err; |
| |
| if (!MLX5_VFIO_CAP_GEN(ctx, umem_uid_0)) { |
| errno = EOPNOTSUPP; |
| return errno; |
| } |
| |
| err = mlx5_vfio_satisfy_startup_pages(ctx, 0); |
| if (err) |
| return err; |
| |
| err = mlx5_vfio_init_hca(ctx); |
| if (err) |
| return err; |
| |
| if (MLX5_VFIO_CAP_GEN(ctx, port_type) == MLX5_CAP_PORT_TYPE_ETH) |
| err = mlx5_vfio_nic_vport_update_roce_state(ctx, MLX5_VPORT_ROCE_ENABLED); |
| |
| return err; |
| } |
| |
| static struct ibv_pd *mlx5_vfio_alloc_pd(struct ibv_context *ibctx) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| uint32_t in[DEVX_ST_SZ_DW(alloc_pd_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(alloc_pd_out)] = {}; |
| int err; |
| struct mlx5_pd *pd; |
| |
| pd = calloc(1, sizeof(*pd)); |
| if (!pd) |
| return NULL; |
| |
| DEVX_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); |
| err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| |
| if (err) |
| goto err; |
| |
| pd->pdn = DEVX_GET(alloc_pd_out, out, pd); |
| |
| return &pd->ibv_pd; |
| err: |
| free(pd); |
| return NULL; |
| } |
| |
| static int mlx5_vfio_dealloc_pd(struct ibv_pd *pd) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(pd->context); |
| uint32_t in[DEVX_ST_SZ_DW(dealloc_pd_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(dealloc_pd_out)] = {}; |
| struct mlx5_pd *mpd = to_mpd(pd); |
| int ret; |
| |
| DEVX_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); |
| DEVX_SET(dealloc_pd_in, in, pd, mpd->pdn); |
| |
| ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (ret) |
| return ret; |
| |
| free(mpd); |
| return 0; |
| } |
| |
| static size_t calc_num_dma_blocks(uint64_t iova, size_t length, |
| unsigned long pgsz) |
| { |
| return (size_t)((align(iova + length, pgsz) - |
| align_down(iova, pgsz)) / pgsz); |
| } |
| |
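| /* Number of MTT octowords (two MTT entries each) needed to cover the range at the given page shift. */ |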
| static int get_octo_len(uint64_t addr, uint64_t len, int page_shift) |
| { |
| uint64_t page_size = 1ULL << page_shift; |
| uint64_t offset; |
| int npages; |
| |
| offset = addr & (page_size - 1); |
| npages = align(len + offset, page_size) >> page_shift; |
| return (npages + 1) / 2; |
| } |
| |
| static inline uint32_t mlx5_mkey_to_idx(uint32_t mkey) |
| { |
| return mkey >> 8; |
| } |
| |
| static inline uint32_t mlx5_idx_to_mkey(uint32_t mkey_idx) |
| { |
| return mkey_idx << 8; |
| } |
| |
| static void set_mkc_access_pd_addr_fields(void *mkc, int acc, uint64_t start_addr, |
| struct ibv_pd *pd) |
| { |
| struct mlx5_pd *mpd = to_mpd(pd); |
| |
| DEVX_SET(mkc, mkc, a, !!(acc & IBV_ACCESS_REMOTE_ATOMIC)); |
| DEVX_SET(mkc, mkc, rw, !!(acc & IBV_ACCESS_REMOTE_WRITE)); |
| DEVX_SET(mkc, mkc, rr, !!(acc & IBV_ACCESS_REMOTE_READ)); |
| DEVX_SET(mkc, mkc, lw, !!(acc & IBV_ACCESS_LOCAL_WRITE)); |
| DEVX_SET(mkc, mkc, lr, 1); |
| /* The application is responsible for setting relaxed ordering based on the device caps */ |
| DEVX_SET(mkc, mkc, relaxed_ordering_write, |
| !!(acc & IBV_ACCESS_RELAXED_ORDERING)); |
| DEVX_SET(mkc, mkc, relaxed_ordering_read, |
| !!(acc & IBV_ACCESS_RELAXED_ORDERING)); |
| DEVX_SET(mkc, mkc, pd, mpd->pdn); |
| DEVX_SET(mkc, mkc, qpn, 0xffffff); |
| DEVX_SET64(mkc, mkc, start_addr, start_addr); |
| } |
| |
| static int mlx5_vfio_dereg_mr(struct verbs_mr *vmr) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(vmr->ibv_mr.context); |
| struct mlx5_vfio_mr *mr = to_mvfio_mr(&vmr->ibv_mr); |
| uint32_t in[DEVX_ST_SZ_DW(destroy_mkey_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(destroy_mkey_out)] = {}; |
| int ret; |
| |
| DEVX_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); |
| DEVX_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(vmr->ibv_mr.lkey)); |
| ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (ret) |
| return ret; |
| |
| mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset, |
| mr->iova_reg_size); |
| iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size); |
| |
| free(vmr); |
| return 0; |
| } |
| |
| static void mlx5_vfio_populate_pas(uint64_t dma_addr, int num_dma, size_t page_size, |
| __be64 *pas, uint64_t access_flags) |
| { |
| int i; |
| |
| for (i = 0; i < num_dma; i++) { |
| *pas = htobe64(dma_addr | access_flags); |
| pas++; |
| dma_addr += page_size; |
| } |
| } |
| |
| static uint64_t calc_spanning_page_size(uint64_t start, uint64_t length) |
| { |
| /* Compute a page_size such that: |
| * start & (page_size-1) == (start + length) & (page_size - 1) |
| */ |
| uint64_t diffs = start ^ (start + length - 1); |
| uint64_t page_size = roundup_pow_of_two(diffs + 1); |
| |
| /* |
| * Don't waste more than 1G of IOVA address space trying to |
| * minimize MTTs |
| */ |
| while (page_size - length > 1024 * 1024 * 1024) { |
| if (page_size / 2 < length) |
| break; |
| page_size /= 2; |
| } |
| |
| return page_size; |
| } |
| |
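| /* |
| * Register a user MR without kernel involvement: pick a spanning page size, |
| * reserve a matching IOVA range, DMA-map the page-aligned user VA through |
| * VFIO and create an MKEY whose MTTs point at that IOVA range. The low bits |
| * of the IOVA are kept equal to the low bits of hca_va so that a single |
| * mkc start_addr can serve as both the wire VA and the DMA address. |
| */ |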
| static struct ibv_mr *mlx5_vfio_reg_mr(struct ibv_pd *pd, void *addr, size_t length, |
| uint64_t hca_va, int access) |
| { |
| struct mlx5_vfio_device *dev = to_mvfio_dev(pd->context->device); |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(pd->context); |
| uint32_t out[DEVX_ST_SZ_DW(create_mkey_out)] = {}; |
| uint32_t mkey_index; |
| uint32_t *in; |
| int inlen, num_pas, ret; |
| struct mlx5_vfio_mr *mr; |
| struct verbs_mr *vmr; |
| int page_shift, iova_min_page_shift; |
| __be64 *pas; |
| uint8_t key; |
| void *mkc; |
| void *aligned_va; |
| |
| if (!check_comp_mask(access, MLX5_VFIO_SUPP_MR_ACCESS_FLAGS)) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| if (((uint64_t)(uintptr_t)addr & (ctx->iova_min_page_size - 1)) != |
| (hca_va & (ctx->iova_min_page_size - 1))) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| mr = calloc(1, sizeof(*mr)); |
| if (!mr) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| aligned_va = (void *)(uintptr_t)((unsigned long)addr & |
| ~(ctx->iova_min_page_size - 1)); |
| iova_min_page_shift = ilog64(ctx->iova_min_page_size - 1); |
| |
| mr->iova_page_size = max(calc_spanning_page_size(hca_va, length), |
| ctx->iova_min_page_size); |
| page_shift = ilog64(mr->iova_page_size - 1); |
| |
| /* Ensure the low bits of the mkey VA match the low bits of the IOVA |
| * because the mkc start_addr specifies both the wire VA and the DMA VA. |
| */ |
| mr->iova_aligned_offset = |
| hca_va & GENMASK(page_shift - 1, iova_min_page_shift); |
| mr->iova_reg_size = align(length + hca_va, ctx->iova_min_page_size) - |
| align_down(hca_va, ctx->iova_min_page_size); |
| |
| if (page_shift > MLX5_MAX_PAGE_SHIFT) { |
| page_shift = MLX5_MAX_PAGE_SHIFT; |
| mr->iova_page_size = 1ULL << page_shift; |
| } |
| ret = iset_alloc_range(ctx->iova_alloc, |
| mr->iova_aligned_offset + mr->iova_reg_size, |
| &mr->iova, mr->iova_page_size); |
| if (ret) |
| goto end; |
| |
| /* IOVA must be aligned */ |
| assert(mr->iova % mr->iova_page_size == 0); |
| |
| ret = mlx5_vfio_register_mem(ctx, aligned_va, |
| mr->iova + mr->iova_aligned_offset, |
| mr->iova_reg_size); |
| if (ret) |
| goto err_reg; |
| |
| num_pas = calc_num_dma_blocks(hca_va, length, mr->iova_page_size); |
| |
| inlen = DEVX_ST_SZ_BYTES(create_mkey_in) + (sizeof(*pas) * align(num_pas, 2)); |
| |
| in = calloc(1, inlen); |
| if (!in) { |
| errno = ENOMEM; |
| goto err_in; |
| } |
| |
| pas = (__be64 *)DEVX_ADDR_OF(create_mkey_in, in, klm_pas_mtt); |
| /* If page_shift was clamped to MLX5_MAX_PAGE_SHIFT, the starting IOVA |
| * may no longer be aligned to the page size; align it down before |
| * building the PAS list. |
| */ |
| mlx5_vfio_populate_pas(align_down(mr->iova + mr->iova_aligned_offset, |
| mr->iova_page_size), |
| num_pas, mr->iova_page_size, |
| pas, MLX5_MTT_PRESENT); |
| |
| DEVX_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); |
| DEVX_SET(create_mkey_in, in, pg_access, 1); |
| mkc = DEVX_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); |
| set_mkc_access_pd_addr_fields(mkc, access, hca_va, pd); |
| DEVX_SET(mkc, mkc, free, 0); |
| DEVX_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); |
| DEVX_SET64(mkc, mkc, len, length); |
| DEVX_SET(mkc, mkc, bsf_octword_size, 0); |
| DEVX_SET(mkc, mkc, translations_octword_size, |
| get_octo_len(hca_va, length, page_shift)); |
| DEVX_SET(mkc, mkc, log_page_size, page_shift); |
| |
| DEVX_SET(create_mkey_in, in, translations_octword_actual_size, |
| get_octo_len(hca_va, length, page_shift)); |
| |
| key = atomic_fetch_add(&dev->mkey_var, 1); |
| DEVX_SET(mkc, mkc, mkey_7_0, key); |
| |
| ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0); |
| if (ret) |
| goto err_exec; |
| |
| free(in); |
| mkey_index = DEVX_GET(create_mkey_out, out, mkey_index); |
| vmr = &mr->vmr; |
| vmr->ibv_mr.lkey = key | mlx5_idx_to_mkey(mkey_index); |
| vmr->ibv_mr.rkey = vmr->ibv_mr.lkey; |
| vmr->ibv_mr.context = pd->context; |
| vmr->mr_type = IBV_MR_TYPE_MR; |
| vmr->access = access; |
| vmr->ibv_mr.handle = 0; |
| |
| return &mr->vmr.ibv_mr; |
| |
| err_exec: |
| free(in); |
| err_in: |
| mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset, |
| mr->iova_reg_size); |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size); |
| end: |
| free(mr); |
| return NULL; |
| } |
| |
| static int vfio_devx_query_eqn(struct ibv_context *ibctx, uint32_t vector, |
| uint32_t *eqn) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| |
| if (vector != MLX5_VFIO_CMD_VEC_IDX) |
| return EINVAL; |
| |
| /* For now use the singleton EQN created for async events */ |
| *eqn = ctx->async_eq.eqn; |
| return 0; |
| } |
| |
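| /* |
| * UAR allocation is not dynamic here: hand out the single UAR that was set |
| * up for the EQs. Only the non-cacheable allocation type is supported. |
| */ |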
| static struct mlx5dv_devx_uar * |
| vfio_devx_alloc_uar(struct ibv_context *ibctx, uint32_t flags) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| struct mlx5_devx_uar *uar; |
| |
| if (flags != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| uar = calloc(1, sizeof(*uar)); |
| if (!uar) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| uar->dv_devx_uar.page_id = ctx->eqs_uar.uarn; |
| uar->dv_devx_uar.base_addr = (void *)(uintptr_t)ctx->eqs_uar.iova; |
| uar->dv_devx_uar.reg_addr = uar->dv_devx_uar.base_addr + MLX5_BF_OFFSET; |
| uar->context = ibctx; |
| |
| return &uar->dv_devx_uar; |
| } |
| |
| static void vfio_devx_free_uar(struct mlx5dv_devx_uar *dv_devx_uar) |
| { |
| free(dv_devx_uar); |
| } |
| |
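| /* |
| * DevX umem registration: reserve an IOVA range spanning the user buffer, |
| * DMA-map the page-aligned VA through VFIO and issue CREATE_UMEM with MTTs |
| * pointing at the mapped IOVA range. |
| */ |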
| static struct mlx5dv_devx_umem * |
| _vfio_devx_umem_reg(struct ibv_context *context, |
| void *addr, size_t size, uint32_t access, |
| uint64_t pgsz_bitmap) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(context); |
| uint32_t out[DEVX_ST_SZ_DW(create_umem_out)] = {}; |
| struct mlx5_vfio_devx_umem *vfio_umem; |
| int iova_page_shift; |
| uint64_t iova_size; |
| int ret; |
| void *in; |
| uint32_t inlen; |
| __be64 *mtt; |
| void *umem; |
| bool writeable; |
| void *aligned_va; |
| int num_pas; |
| |
| if (!check_comp_mask(access, MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS)) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| if ((access & IBV_ACCESS_REMOTE_WRITE) && |
| !(access & IBV_ACCESS_LOCAL_WRITE)) { |
| errno = EINVAL; |
| return NULL; |
| } |
| |
| /* Page size that encloses the start and end of the umem range */ |
| iova_size = max(roundup_pow_of_two(size + ((uint64_t)(uintptr_t)addr & (ctx->iova_min_page_size - 1))), |
| ctx->iova_min_page_size); |
| |
| if (!(iova_size & pgsz_bitmap)) { |
| /* The caller's pgsz_bitmap must include the computed IOVA page size */ |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| writeable = access & |
| (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); |
| |
| vfio_umem = calloc(1, sizeof(*vfio_umem)); |
| if (!vfio_umem) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| vfio_umem->iova_size = iova_size; |
| if (ibv_dontfork_range(addr, size)) |
| goto err; |
| |
| ret = iset_alloc_range(ctx->iova_alloc, vfio_umem->iova_size, |
| &vfio_umem->iova, vfio_umem->iova_size); |
| if (ret) |
| goto err_alloc; |
| |
| /* The registration arguments must reflect the VA range actually mapped into the process */ |
| aligned_va = (void *)(uintptr_t)((unsigned long) addr & ~(ctx->iova_min_page_size - 1)); |
| vfio_umem->iova_reg_size = align((addr + size) - aligned_va, ctx->iova_min_page_size); |
| ret = mlx5_vfio_register_mem(ctx, aligned_va, vfio_umem->iova, vfio_umem->iova_reg_size); |
| if (ret) |
| goto err_reg; |
| |
| iova_page_shift = ilog32(vfio_umem->iova_size - 1); |
| num_pas = 1; |
| if (iova_page_shift > MLX5_MAX_PAGE_SHIFT) { |
| iova_page_shift = MLX5_MAX_PAGE_SHIFT; |
| num_pas = DIV_ROUND_UP(vfio_umem->iova_size, (1ULL << iova_page_shift)); |
| } |
| |
| inlen = DEVX_ST_SZ_BYTES(create_umem_in) + DEVX_ST_SZ_BYTES(mtt) * num_pas; |
| |
| in = calloc(1, inlen); |
| if (!in) { |
| errno = ENOMEM; |
| goto err_in; |
| } |
| |
| umem = DEVX_ADDR_OF(create_umem_in, in, umem); |
| mtt = (__be64 *)DEVX_ADDR_OF(umem, umem, mtt); |
| |
| DEVX_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM); |
| DEVX_SET64(umem, umem, num_of_mtt, num_pas); |
| DEVX_SET(umem, umem, log_page_size, iova_page_shift - MLX5_ADAPTER_PAGE_SHIFT); |
| DEVX_SET(umem, umem, page_offset, addr - aligned_va); |
| |
| mlx5_vfio_populate_pas(vfio_umem->iova, num_pas, (1ULL << iova_page_shift), mtt, |
| (writeable ? MLX5_MTT_WRITE : 0) | MLX5_MTT_READ); |
| |
| ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0); |
| if (ret) |
| goto err_exec; |
| |
| free(in); |
| |
| vfio_umem->dv_devx_umem.umem_id = DEVX_GET(create_umem_out, out, umem_id); |
| vfio_umem->context = context; |
| vfio_umem->addr = addr; |
| vfio_umem->size = size; |
| return &vfio_umem->dv_devx_umem; |
| |
| err_exec: |
| free(in); |
| err_in: |
| mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size); |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size); |
| err_alloc: |
| ibv_dofork_range(addr, size); |
| err: |
| free(vfio_umem); |
| return NULL; |
| } |
| |
| static struct mlx5dv_devx_umem * |
| vfio_devx_umem_reg(struct ibv_context *context, |
| void *addr, size_t size, uint32_t access) |
| { |
| return _vfio_devx_umem_reg(context, addr, size, access, UINT64_MAX); |
| } |
| |
| static struct mlx5dv_devx_umem * |
| vfio_devx_umem_reg_ex(struct ibv_context *ctx, struct mlx5dv_devx_umem_in *in) |
| { |
| if (!check_comp_mask(in->comp_mask, 0)) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| return _vfio_devx_umem_reg(ctx, in->addr, in->size, in->access, in->pgsz_bitmap); |
| } |
| |
| static int vfio_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem) |
| { |
| struct mlx5_vfio_devx_umem *vfio_umem = |
| container_of(dv_devx_umem, struct mlx5_vfio_devx_umem, |
| dv_devx_umem); |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(vfio_umem->context); |
| uint32_t in[DEVX_ST_SZ_DW(destroy_umem_in)] = {}; |
| uint32_t out[DEVX_ST_SZ_DW(destroy_umem_out)] = {}; |
| int ret; |
| |
| DEVX_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM); |
| DEVX_SET(destroy_umem_in, in, umem_id, dv_devx_umem->umem_id); |
| |
| ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (ret) |
| return ret; |
| |
| mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size); |
| iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size); |
| ibv_dofork_range(vfio_umem->addr, vfio_umem->size); |
| free(vfio_umem); |
| return 0; |
| } |
| |
| static int vfio_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type) |
| { |
| struct ibv_pd *pd_in = obj->pd.in; |
| struct mlx5dv_pd *pd_out = obj->pd.out; |
| struct mlx5_pd *mpd = to_mpd(pd_in); |
| |
| if (obj_type != MLX5DV_OBJ_PD) |
| return EOPNOTSUPP; |
| |
| pd_out->comp_mask = 0; |
| pd_out->pdn = mpd->pdn; |
| return 0; |
| } |
| |
| static int vfio_devx_general_cmd(struct ibv_context *context, const void *in, |
| size_t inlen, void *out, size_t outlen) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(context); |
| |
| return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0); |
| } |
| |
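| /* |
| * The DevX object helpers below recognize commands that create an object, |
| * extract the created object id from the command output and pre-build the |
| * matching destroy command, so destroying the object later needs no parsing. |
| */ |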
| static bool devx_is_obj_create_cmd(const void *in) |
| { |
| uint16_t opcode = DEVX_GET(general_obj_in_cmd_hdr, in, opcode); |
| |
| switch (opcode) { |
| case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: |
| case MLX5_CMD_OP_CREATE_MKEY: |
| case MLX5_CMD_OP_CREATE_CQ: |
| case MLX5_CMD_OP_ALLOC_PD: |
| case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: |
| case MLX5_CMD_OP_CREATE_RMP: |
| case MLX5_CMD_OP_CREATE_SQ: |
| case MLX5_CMD_OP_CREATE_RQ: |
| case MLX5_CMD_OP_CREATE_RQT: |
| case MLX5_CMD_OP_CREATE_TIR: |
| case MLX5_CMD_OP_CREATE_TIS: |
| case MLX5_CMD_OP_ALLOC_Q_COUNTER: |
| case MLX5_CMD_OP_CREATE_FLOW_TABLE: |
| case MLX5_CMD_OP_CREATE_FLOW_GROUP: |
| case MLX5_CMD_OP_CREATE_FLOW_COUNTER: |
| case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: |
| case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: |
| case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: |
| case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: |
| case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: |
| case MLX5_CMD_OP_CREATE_QP: |
| case MLX5_CMD_OP_CREATE_SRQ: |
| case MLX5_CMD_OP_CREATE_XRC_SRQ: |
| case MLX5_CMD_OP_CREATE_DCT: |
| case MLX5_CMD_OP_CREATE_XRQ: |
| case MLX5_CMD_OP_ATTACH_TO_MCG: |
| case MLX5_CMD_OP_ALLOC_XRCD: |
| return true; |
| case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: |
| { |
| uint8_t op_mod = DEVX_GET(set_fte_in, in, op_mod); |
| |
| if (op_mod == 0) |
| return true; |
| return false; |
| } |
| case MLX5_CMD_OP_CREATE_PSV: |
| { |
| uint8_t num_psv = DEVX_GET(create_psv_in, in, num_psv); |
| |
| if (num_psv == 1) |
| return true; |
| return false; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| static uint32_t devx_get_created_obj_id(const void *in, const void *out, |
| uint16_t opcode) |
| { |
| switch (opcode) { |
| case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: |
| return DEVX_GET(general_obj_out_cmd_hdr, out, obj_id); |
| case MLX5_CMD_OP_CREATE_UMEM: |
| return DEVX_GET(create_umem_out, out, umem_id); |
| case MLX5_CMD_OP_CREATE_MKEY: |
| return DEVX_GET(create_mkey_out, out, mkey_index); |
| case MLX5_CMD_OP_CREATE_CQ: |
| return DEVX_GET(create_cq_out, out, cqn); |
| case MLX5_CMD_OP_ALLOC_PD: |
| return DEVX_GET(alloc_pd_out, out, pd); |
| case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: |
| return DEVX_GET(alloc_transport_domain_out, out, |
| transport_domain); |
| case MLX5_CMD_OP_CREATE_RMP: |
| return DEVX_GET(create_rmp_out, out, rmpn); |
| case MLX5_CMD_OP_CREATE_SQ: |
| return DEVX_GET(create_sq_out, out, sqn); |
| case MLX5_CMD_OP_CREATE_RQ: |
| return DEVX_GET(create_rq_out, out, rqn); |
| case MLX5_CMD_OP_CREATE_RQT: |
| return DEVX_GET(create_rqt_out, out, rqtn); |
| case MLX5_CMD_OP_CREATE_TIR: |
| return DEVX_GET(create_tir_out, out, tirn); |
| case MLX5_CMD_OP_CREATE_TIS: |
| return DEVX_GET(create_tis_out, out, tisn); |
| case MLX5_CMD_OP_ALLOC_Q_COUNTER: |
| return DEVX_GET(alloc_q_counter_out, out, counter_set_id); |
| case MLX5_CMD_OP_CREATE_FLOW_TABLE: |
| return DEVX_GET(create_flow_table_out, out, table_id); |
| case MLX5_CMD_OP_CREATE_FLOW_GROUP: |
| return DEVX_GET(create_flow_group_out, out, group_id); |
| case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: |
| return DEVX_GET(set_fte_in, in, flow_index); |
| case MLX5_CMD_OP_CREATE_FLOW_COUNTER: |
| return DEVX_GET(alloc_flow_counter_out, out, flow_counter_id); |
| case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: |
| return DEVX_GET(alloc_packet_reformat_context_out, out, |
| packet_reformat_id); |
| case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: |
| return DEVX_GET(alloc_modify_header_context_out, out, |
| modify_header_id); |
| case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: |
| return DEVX_GET(create_scheduling_element_out, out, |
| scheduling_element_id); |
| case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: |
| return DEVX_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); |
| case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: |
| return DEVX_GET(set_l2_table_entry_in, in, table_index); |
| case MLX5_CMD_OP_CREATE_QP: |
| return DEVX_GET(create_qp_out, out, qpn); |
| case MLX5_CMD_OP_CREATE_SRQ: |
| return DEVX_GET(create_srq_out, out, srqn); |
| case MLX5_CMD_OP_CREATE_XRC_SRQ: |
| return DEVX_GET(create_xrc_srq_out, out, xrc_srqn); |
| case MLX5_CMD_OP_CREATE_DCT: |
| return DEVX_GET(create_dct_out, out, dctn); |
| case MLX5_CMD_OP_CREATE_XRQ: |
| return DEVX_GET(create_xrq_out, out, xrqn); |
| case MLX5_CMD_OP_ATTACH_TO_MCG: |
| return DEVX_GET(attach_to_mcg_in, in, qpn); |
| case MLX5_CMD_OP_ALLOC_XRCD: |
| return DEVX_GET(alloc_xrcd_out, out, xrcd); |
| case MLX5_CMD_OP_CREATE_PSV: |
| return DEVX_GET(create_psv_out, out, psv0_index); |
| default: |
| /* The opcode must match one of those accepted by devx_is_obj_create_cmd() */ |
| assert(false); |
| return 0; |
| } |
| } |
| |
| static void devx_obj_build_destroy_cmd(const void *in, void *out, |
| void *din, uint32_t *dinlen, |
| struct mlx5dv_devx_obj *obj) |
| { |
| uint16_t opcode = DEVX_GET(general_obj_in_cmd_hdr, in, opcode); |
| uint16_t uid = DEVX_GET(general_obj_in_cmd_hdr, in, uid); |
| uint32_t *obj_id = &obj->object_id; |
| |
| *obj_id = devx_get_created_obj_id(in, out, opcode); |
| *dinlen = DEVX_ST_SZ_BYTES(general_obj_in_cmd_hdr); |
| DEVX_SET(general_obj_in_cmd_hdr, din, uid, uid); |
| |
| switch (opcode) { |
| case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: |
| DEVX_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); |
| DEVX_SET(general_obj_in_cmd_hdr, din, obj_id, *obj_id); |
| DEVX_SET(general_obj_in_cmd_hdr, din, obj_type, |
| DEVX_GET(general_obj_in_cmd_hdr, in, obj_type)); |
| break; |
| |
| case MLX5_CMD_OP_CREATE_UMEM: |
| DEVX_SET(destroy_umem_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_UMEM); |
| DEVX_SET(destroy_umem_in, din, umem_id, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_MKEY: |
| DEVX_SET(destroy_mkey_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_MKEY); |
| DEVX_SET(destroy_mkey_in, din, mkey_index, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_CQ: |
| DEVX_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ); |
| DEVX_SET(destroy_cq_in, din, cqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_PD: |
| DEVX_SET(dealloc_pd_in, din, opcode, MLX5_CMD_OP_DEALLOC_PD); |
| DEVX_SET(dealloc_pd_in, din, pd, *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: |
| DEVX_SET(dealloc_transport_domain_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); |
| DEVX_SET(dealloc_transport_domain_in, din, transport_domain, |
| *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_RMP: |
| DEVX_SET(destroy_rmp_in, din, opcode, MLX5_CMD_OP_DESTROY_RMP); |
| DEVX_SET(destroy_rmp_in, din, rmpn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_SQ: |
| DEVX_SET(destroy_sq_in, din, opcode, MLX5_CMD_OP_DESTROY_SQ); |
| DEVX_SET(destroy_sq_in, din, sqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_RQ: |
| DEVX_SET(destroy_rq_in, din, opcode, MLX5_CMD_OP_DESTROY_RQ); |
| DEVX_SET(destroy_rq_in, din, rqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_RQT: |
| DEVX_SET(destroy_rqt_in, din, opcode, MLX5_CMD_OP_DESTROY_RQT); |
| DEVX_SET(destroy_rqt_in, din, rqtn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_TIR: |
| DEVX_SET(destroy_tir_in, din, opcode, MLX5_CMD_OP_DESTROY_TIR); |
| DEVX_SET(destroy_tir_in, din, tirn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_TIS: |
| DEVX_SET(destroy_tis_in, din, opcode, MLX5_CMD_OP_DESTROY_TIS); |
| DEVX_SET(destroy_tis_in, din, tisn, *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_Q_COUNTER: |
| DEVX_SET(dealloc_q_counter_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_Q_COUNTER); |
| DEVX_SET(dealloc_q_counter_in, din, counter_set_id, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_FLOW_TABLE: |
| *dinlen = DEVX_ST_SZ_BYTES(destroy_flow_table_in); |
| DEVX_SET(destroy_flow_table_in, din, other_vport, |
| DEVX_GET(create_flow_table_in, in, other_vport)); |
| DEVX_SET(destroy_flow_table_in, din, vport_number, |
| DEVX_GET(create_flow_table_in, in, vport_number)); |
| DEVX_SET(destroy_flow_table_in, din, table_type, |
| DEVX_GET(create_flow_table_in, in, table_type)); |
| DEVX_SET(destroy_flow_table_in, din, table_id, *obj_id); |
| DEVX_SET(destroy_flow_table_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_FLOW_TABLE); |
| break; |
| case MLX5_CMD_OP_CREATE_FLOW_GROUP: |
| *dinlen = DEVX_ST_SZ_BYTES(destroy_flow_group_in); |
| DEVX_SET(destroy_flow_group_in, din, other_vport, |
| DEVX_GET(create_flow_group_in, in, other_vport)); |
| DEVX_SET(destroy_flow_group_in, din, vport_number, |
| DEVX_GET(create_flow_group_in, in, vport_number)); |
| DEVX_SET(destroy_flow_group_in, din, table_type, |
| DEVX_GET(create_flow_group_in, in, table_type)); |
| DEVX_SET(destroy_flow_group_in, din, table_id, |
| DEVX_GET(create_flow_group_in, in, table_id)); |
| DEVX_SET(destroy_flow_group_in, din, group_id, *obj_id); |
| DEVX_SET(destroy_flow_group_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_FLOW_GROUP); |
| break; |
| case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: |
| *dinlen = DEVX_ST_SZ_BYTES(delete_fte_in); |
| DEVX_SET(delete_fte_in, din, other_vport, |
| DEVX_GET(set_fte_in, in, other_vport)); |
| DEVX_SET(delete_fte_in, din, vport_number, |
| DEVX_GET(set_fte_in, in, vport_number)); |
| DEVX_SET(delete_fte_in, din, table_type, |
| DEVX_GET(set_fte_in, in, table_type)); |
| DEVX_SET(delete_fte_in, din, table_id, |
| DEVX_GET(set_fte_in, in, table_id)); |
| DEVX_SET(delete_fte_in, din, flow_index, *obj_id); |
| DEVX_SET(delete_fte_in, din, opcode, |
| MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); |
| break; |
| case MLX5_CMD_OP_CREATE_FLOW_COUNTER: |
| DEVX_SET(dealloc_flow_counter_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); |
| DEVX_SET(dealloc_flow_counter_in, din, flow_counter_id, |
| *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: |
| DEVX_SET(dealloc_packet_reformat_context_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT); |
| DEVX_SET(dealloc_packet_reformat_context_in, din, |
| packet_reformat_id, *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: |
| DEVX_SET(dealloc_modify_header_context_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); |
| DEVX_SET(dealloc_modify_header_context_in, din, |
| modify_header_id, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: |
| *dinlen = DEVX_ST_SZ_BYTES(destroy_scheduling_element_in); |
| DEVX_SET(destroy_scheduling_element_in, din, |
| scheduling_hierarchy, |
| DEVX_GET(create_scheduling_element_in, in, |
| scheduling_hierarchy)); |
| DEVX_SET(destroy_scheduling_element_in, din, |
| scheduling_element_id, *obj_id); |
| DEVX_SET(destroy_scheduling_element_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); |
| break; |
| case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: |
| *dinlen = DEVX_ST_SZ_BYTES(delete_vxlan_udp_dport_in); |
| DEVX_SET(delete_vxlan_udp_dport_in, din, vxlan_udp_port, *obj_id); |
| DEVX_SET(delete_vxlan_udp_dport_in, din, opcode, |
| MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); |
| break; |
| case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: |
| *dinlen = DEVX_ST_SZ_BYTES(delete_l2_table_entry_in); |
| DEVX_SET(delete_l2_table_entry_in, din, table_index, *obj_id); |
| DEVX_SET(delete_l2_table_entry_in, din, opcode, |
| MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); |
| break; |
| case MLX5_CMD_OP_CREATE_QP: |
| DEVX_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP); |
| DEVX_SET(destroy_qp_in, din, qpn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_SRQ: |
| DEVX_SET(destroy_srq_in, din, opcode, MLX5_CMD_OP_DESTROY_SRQ); |
| DEVX_SET(destroy_srq_in, din, srqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_XRC_SRQ: |
| DEVX_SET(destroy_xrc_srq_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_XRC_SRQ); |
| DEVX_SET(destroy_xrc_srq_in, din, xrc_srqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_DCT: |
| DEVX_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); |
| DEVX_SET(destroy_dct_in, din, dctn, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_XRQ: |
| DEVX_SET(destroy_xrq_in, din, opcode, MLX5_CMD_OP_DESTROY_XRQ); |
| DEVX_SET(destroy_xrq_in, din, xrqn, *obj_id); |
| break; |
| case MLX5_CMD_OP_ATTACH_TO_MCG: |
| *dinlen = DEVX_ST_SZ_BYTES(detach_from_mcg_in); |
| DEVX_SET(detach_from_mcg_in, din, qpn, |
| DEVX_GET(attach_to_mcg_in, in, qpn)); |
| memcpy(DEVX_ADDR_OF(detach_from_mcg_in, din, multicast_gid), |
| DEVX_ADDR_OF(attach_to_mcg_in, in, multicast_gid), |
| DEVX_FLD_SZ_BYTES(attach_to_mcg_in, multicast_gid)); |
| DEVX_SET(detach_from_mcg_in, din, opcode, |
| MLX5_CMD_OP_DETACH_FROM_MCG); |
| DEVX_SET(detach_from_mcg_in, din, qpn, *obj_id); |
| break; |
| case MLX5_CMD_OP_ALLOC_XRCD: |
| DEVX_SET(dealloc_xrcd_in, din, opcode, |
| MLX5_CMD_OP_DEALLOC_XRCD); |
| DEVX_SET(dealloc_xrcd_in, din, xrcd, *obj_id); |
| break; |
| case MLX5_CMD_OP_CREATE_PSV: |
| DEVX_SET(destroy_psv_in, din, opcode, |
| MLX5_CMD_OP_DESTROY_PSV); |
| DEVX_SET(destroy_psv_in, din, psvn, *obj_id); |
| break; |
| default: |
| /* The opcode must match one of those accepted by devx_is_obj_create_cmd() */ |
| assert(false); |
| break; |
| } |
| } |
| |
| static struct mlx5dv_devx_obj * |
| vfio_devx_obj_create(struct ibv_context *context, const void *in, |
| size_t inlen, void *out, size_t outlen) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(context); |
| struct mlx5_devx_obj *obj; |
| int ret; |
| |
| if (!devx_is_obj_create_cmd(in)) { |
| errno = EINVAL; |
| return NULL; |
| } |
| |
| obj = calloc(1, sizeof(*obj)); |
| if (!obj) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| ret = mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0); |
| if (ret) { |
| errno = ret; |
| goto fail; |
| } |
| |
| devx_obj_build_destroy_cmd(in, out, obj->dinbox, |
| &obj->dinlen, &obj->dv_obj); |
| obj->dv_obj.context = context; |
| |
| return &obj->dv_obj; |
| fail: |
| free(obj); |
| return NULL; |
| } |
| |
| static int vfio_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in, |
| size_t inlen, void *out, size_t outlen) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context); |
| |
| return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0); |
| } |
| |
| static int vfio_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in, |
| size_t inlen, void *out, size_t outlen) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context); |
| |
| return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0); |
| } |
| |
| static int vfio_devx_obj_destroy(struct mlx5dv_devx_obj *obj) |
| { |
| struct mlx5_devx_obj *mobj = container_of(obj, |
| struct mlx5_devx_obj, dv_obj); |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context); |
| uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; |
| int ret; |
| |
| ret = mlx5_vfio_cmd_exec(ctx, mobj->dinbox, mobj->dinlen, |
| out, sizeof(out), 0); |
| if (ret) |
| return ret; |
| |
| free(mobj); |
| return 0; |
| } |
| |
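| /* |
| * Allocate an MSI-X vector for the caller: find a free vector (vector 0 is |
| * reserved for the command interface), back it with an eventfd and program |
| * it into the device via mlx5_vfio_msix_set_irqs(). |
| */ |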
| static struct mlx5dv_devx_msi_vector * |
| vfio_devx_alloc_msi_vector(struct ibv_context *ibctx) |
| { |
| uint8_t buf[sizeof(struct vfio_irq_set) + sizeof(int)] = {}; |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| struct mlx5_devx_msi_vector *msi; |
| int vector, *fd, err; |
| |
| msi = calloc(1, sizeof(*msi)); |
| if (!msi) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| pthread_mutex_lock(&ctx->msix_fds_lock); |
| for (vector = 0; vector < ibctx->num_comp_vectors; vector++) |
| if (ctx->msix_fds[vector] < 0) |
| break; |
| |
| if (vector == ibctx->num_comp_vectors) { |
| errno = ENOSPC; |
| goto fail; |
| } |
| |
| fd = (int *)(buf + sizeof(struct vfio_irq_set)); |
| *fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); |
| if (*fd < 0) |
| goto fail; |
| |
| err = mlx5_vfio_msix_set_irqs(ctx, vector, 1, buf); |
| if (err) |
| goto fail_set_irqs; |
| |
| ctx->msix_fds[vector] = *fd; |
| msi->dv_msi.vector = vector; |
| msi->dv_msi.fd = *fd; |
| msi->ibctx = ibctx; |
| |
| pthread_mutex_unlock(&ctx->msix_fds_lock); |
| return &msi->dv_msi; |
| |
| fail_set_irqs: |
| close(*fd); |
| fail: |
| pthread_mutex_unlock(&ctx->msix_fds_lock); |
| free(msi); |
| return NULL; |
| } |
| |
| static int vfio_devx_free_msi_vector(struct mlx5dv_devx_msi_vector *msi) |
| { |
| struct mlx5_devx_msi_vector *msiv = |
| container_of(msi, struct mlx5_devx_msi_vector, dv_msi); |
| uint8_t buf[sizeof(struct vfio_irq_set) + sizeof(int)] = {}; |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(msiv->ibctx); |
| int ret; |
| |
| pthread_mutex_lock(&ctx->msix_fds_lock); |
| if ((msi->vector >= msiv->ibctx->num_comp_vectors) || |
| (msi->vector == MLX5_VFIO_CMD_VEC_IDX) || |
| (msi->fd != ctx->msix_fds[msi->vector])) { |
| ret = EINVAL; |
| goto out; |
| } |
| |
| *(int *)(buf + sizeof(struct vfio_irq_set)) = -1; |
| ret = mlx5_vfio_msix_set_irqs(ctx, msi->vector, 1, buf); |
| if (ret) { |
| ret = errno; |
| goto out; |
| } |
| |
| close(msi->fd); |
| ctx->msix_fds[msi->vector] = -1; |
| free(msiv); |
| out: |
| pthread_mutex_unlock(&ctx->msix_fds_lock); |
| return ret; |
| } |
| |
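| /* |
| * Create a DevX EQ: validate the caller's CREATE_EQ command, allocate and |
| * DMA-map the EQ buffer, then re-issue the command with the page size and a |
| * single PAS entry filled in. The command-interface vector may not be used |
| * as the EQ interrupt. |
| */ |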
| static struct mlx5dv_devx_eq * |
| vfio_devx_create_eq(struct ibv_context *ibctx, const void *in, size_t inlen, |
| void *out, size_t outlen) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| struct mlx5_devx_eq *eq; |
| void *eqc, *in_pas; |
| size_t inlen_pas; |
| uint64_t size; |
| __be64 *pas; |
| int err; |
| |
| eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry); |
| if ((inlen < DEVX_ST_SZ_BYTES(create_eq_in)) || |
| (DEVX_GET(create_eq_in, in, opcode) != MLX5_CMD_OP_CREATE_EQ) || |
| (DEVX_GET(eqc, eqc, intr) == MLX5_VFIO_CMD_VEC_IDX)) { |
| errno = EINVAL; |
| return NULL; |
| } |
| |
| size = max(roundup_pow_of_two( |
| (1ULL << DEVX_GET(eqc, eqc, log_eq_size)) * MLX5_EQE_SIZE), |
| ctx->iova_min_page_size); |
| if (size > SIZE_MAX) { |
| errno = ERANGE; |
| return NULL; |
| } |
| |
| eq = calloc(1, sizeof(*eq)); |
| if (!eq) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| eq->size = size; |
| err = posix_memalign(&eq->dv_eq.vaddr, |
| MLX5_ADAPTER_PAGE_SIZE, eq->size); |
| if (err) { |
| errno = err; |
| goto err_va; |
| } |
| |
| err = iset_alloc_range(ctx->iova_alloc, eq->size, |
| &eq->iova, eq->size); |
| if (err) |
| goto err_range; |
| |
| err = mlx5_vfio_register_mem(ctx, eq->dv_eq.vaddr, eq->iova, eq->size); |
| if (err) |
| goto err_reg; |
| |
| inlen_pas = inlen + DEVX_FLD_SZ_BYTES(create_eq_in, pas[0]) * 1; |
| in_pas = calloc(1, inlen_pas); |
| if (!in_pas) { |
| errno = ENOMEM; |
| goto err_inpas; |
| } |
| |
| memcpy(in_pas, in, inlen); |
| eqc = DEVX_ADDR_OF(create_eq_in, in_pas, eq_context_entry); |
| DEVX_SET(eqc, eqc, log_page_size, |
| ilog32(eq->size - 1) - MLX5_ADAPTER_PAGE_SHIFT); |
| |
| pas = (__be64 *)DEVX_ADDR_OF(create_eq_in, in_pas, pas); |
| pas[0] = htobe64(eq->iova); |
| |
| err = mlx5_vfio_cmd_do(ctx, in_pas, inlen_pas, out, outlen, 0); |
| if (err) { |
| errno = err; |
| goto err_cmd; |
| } |
| |
| free(in_pas); |
| eq->ibctx = ibctx; |
| eq->eqn = DEVX_GET(create_eq_out, out, eq_number); |
| return &eq->dv_eq; |
| |
| err_cmd: |
| free(in_pas); |
| err_inpas: |
| mlx5_vfio_unregister_mem(ctx, eq->iova, eq->size); |
| err_reg: |
| iset_insert_range(ctx->iova_alloc, eq->iova, eq->size); |
| err_range: |
| free(eq->dv_eq.vaddr); |
| err_va: |
| free(eq); |
| return NULL; |
| } |
| |
| static int vfio_devx_destroy_eq(struct mlx5dv_devx_eq *dveq) |
| { |
| struct mlx5_devx_eq *eq = |
| container_of(dveq, struct mlx5_devx_eq, dv_eq); |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(eq->ibctx); |
| uint32_t out[DEVX_ST_SZ_DW(destroy_eq_out)] = {}; |
| uint32_t in[DEVX_ST_SZ_DW(destroy_eq_in)] = {}; |
| int err; |
| |
| DEVX_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ); |
| DEVX_SET(destroy_eq_in, in, eq_number, eq->eqn); |
| |
| err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0); |
| if (err) |
| return err; |
| |
| mlx5_vfio_unregister_mem(ctx, eq->iova, eq->size); |
| iset_insert_range(ctx->iova_alloc, eq->iova, eq->size); |
| free(eq); |
| |
| return 0; |
| } |
| |
| static struct mlx5_dv_context_ops mlx5_vfio_dv_ctx_ops = { |
| .devx_general_cmd = vfio_devx_general_cmd, |
| .devx_obj_create = vfio_devx_obj_create, |
| .devx_obj_query = vfio_devx_obj_query, |
| .devx_obj_modify = vfio_devx_obj_modify, |
| .devx_obj_destroy = vfio_devx_obj_destroy, |
| .devx_query_eqn = vfio_devx_query_eqn, |
| .devx_alloc_uar = vfio_devx_alloc_uar, |
| .devx_free_uar = vfio_devx_free_uar, |
| .devx_umem_reg = vfio_devx_umem_reg, |
| .devx_umem_reg_ex = vfio_devx_umem_reg_ex, |
| .devx_umem_dereg = vfio_devx_umem_dereg, |
| .init_obj = vfio_init_obj, |
| .devx_alloc_msi_vector = vfio_devx_alloc_msi_vector, |
| .devx_free_msi_vector = vfio_devx_free_msi_vector, |
| .devx_create_eq = vfio_devx_create_eq, |
| .devx_destroy_eq = vfio_devx_destroy_eq, |
| }; |
| |
| static void mlx5_vfio_uninit_context(struct mlx5_vfio_context *ctx) |
| { |
| mlx5_close_debug_file(ctx->dbg_fp); |
| |
| verbs_uninit_context(&ctx->vctx); |
| free(ctx); |
| } |
| |
| static void mlx5_vfio_free_context(struct ibv_context *ibctx) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| |
| destroy_async_eqs(ctx); |
| mlx5_vfio_teardown_hca(ctx); |
| mlx5_vfio_clean_cmd_interface(ctx); |
| mlx5_vfio_clean_device_dma(ctx); |
| mlx5_vfio_uninit_bar0(ctx); |
| mlx5_vfio_close_fds(ctx); |
| mlx5_vfio_uninit_context(ctx); |
| } |
| |
| static const struct verbs_context_ops mlx5_vfio_common_ops = { |
| .alloc_pd = mlx5_vfio_alloc_pd, |
| .dealloc_pd = mlx5_vfio_dealloc_pd, |
| .reg_mr = mlx5_vfio_reg_mr, |
| .dereg_mr = mlx5_vfio_dereg_mr, |
| .free_context = mlx5_vfio_free_context, |
| }; |
| |
| static struct verbs_context * |
| mlx5_vfio_alloc_context(struct ibv_device *ibdev, |
| int cmd_fd, void *private_data) |
| { |
| struct mlx5_vfio_device *mdev = to_mvfio_dev(ibdev); |
| struct mlx5_vfio_context *mctx; |
| |
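| /* There is no kernel uverbs FD in VFIO mode; pass -1 to set up a context without one. */ |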
| cmd_fd = -1; |
| |
| mctx = verbs_init_and_alloc_context(ibdev, cmd_fd, mctx, vctx, |
| RDMA_DRIVER_UNKNOWN); |
| if (!mctx) |
| return NULL; |
| |
| mlx5_open_debug_file(&mctx->dbg_fp); |
| mlx5_set_debug_mask(); |
| |
| if (mlx5_vfio_open_fds(mctx, mdev)) |
| goto err; |
| |
| if (mlx5_vfio_init_bar0(mctx)) |
| goto close_fds; |
| |
| if (mlx5_vfio_init_device_dma(mctx)) |
| goto err_bar; |
| |
| if (mlx5_vfio_init_cmd_interface(mctx)) |
| goto err_dma; |
| |
| if (mlx5_vfio_setup_function(mctx)) |
| goto clean_cmd; |
| |
| if (create_async_eqs(mctx)) |
| goto func_teardown; |
| |
| verbs_set_ops(&mctx->vctx, &mlx5_vfio_common_ops); |
| mctx->dv_ctx_ops = &mlx5_vfio_dv_ctx_ops; |
| |
| return &mctx->vctx; |
| |
| func_teardown: |
| mlx5_vfio_teardown_hca(mctx); |
| clean_cmd: |
| mlx5_vfio_clean_cmd_interface(mctx); |
| err_dma: |
| mlx5_vfio_clean_device_dma(mctx); |
| err_bar: |
| mlx5_vfio_uninit_bar0(mctx); |
| close_fds: |
| mlx5_vfio_close_fds(mctx); |
| err: |
| mlx5_vfio_uninit_context(mctx); |
| return NULL; |
| } |
| |
| static void mlx5_vfio_uninit_device(struct verbs_device *verbs_device) |
| { |
| struct mlx5_vfio_device *dev = to_mvfio_dev(&verbs_device->device); |
| |
| free(dev->pci_name); |
| free(dev); |
| } |
| |
| static const struct verbs_device_ops mlx5_vfio_dev_ops = { |
| .name = "mlx5_vfio", |
| .alloc_context = mlx5_vfio_alloc_context, |
| .uninit_device = mlx5_vfio_uninit_device, |
| }; |
| |
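| /* Return true when the sysfs PCI device at pci_path matches an entry in mlx5_hca_table. */ |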
| static bool is_mlx5_pci(const char *pci_path) |
| { |
| const struct verbs_match_ent *ent; |
| uint16_t vendor_id, device_id; |
| char pci_info_path[256]; |
| char buff[128] = {}; /* keep NUL terminated for strtoul() below */ |
| int fd; |
| |
| snprintf(pci_info_path, sizeof(pci_info_path), "%s/vendor", pci_path); |
| fd = open(pci_info_path, O_RDONLY); |
| if (fd < 0) |
| return false; |
| |
| if (read(fd, buff, sizeof(buff)) <= 0) |
| goto err; |
| |
| vendor_id = strtoul(buff, NULL, 0); |
| close(fd); |
| |
| snprintf(pci_info_path, sizeof(pci_info_path), "%s/device", pci_path); |
| fd = open(pci_info_path, O_RDONLY); |
| if (fd < 0) |
| return false; |
| |
| if (read(fd, buff, sizeof(buff)) <= 0) |
| goto err; |
| |
| device_id = strtoul(buff, NULL, 0); |
| close(fd); |
| |
| for (ent = mlx5_hca_table; ent->kind != VERBS_MATCH_SENTINEL; ent++) { |
| if (ent->kind != VERBS_MATCH_PCI) |
| continue; |
| if (ent->device == device_id && ent->vendor == vendor_id) |
| return true; |
| } |
| |
| return false; |
| |
| err: |
| close(fd); |
| return false; |
| } |
| |
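| /* |
| * Resolve the VFIO group id for a PCI device: verify it is an mlx5 device, |
| * follow the iommu_group sysfs link and check that /dev/vfio/<group> exists. |
| */ |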
| static int mlx5_vfio_get_iommu_group_id(const char *pci_name) |
| { |
| int seg, bus, slot, func; |
| int ret, groupid; |
| char path[128], iommu_group_path[128], *group_name; |
| struct stat st; |
| ssize_t len; |
| |
| ret = sscanf(pci_name, "%04x:%02x:%02x.%d", &seg, &bus, &slot, &func); |
| if (ret != 4) |
| return -1; |
| |
| snprintf(path, sizeof(path), |
| "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/", |
| seg, bus, slot, func); |
| |
| ret = stat(path, &st); |
| if (ret < 0) |
| return -1; |
| |
| if (!is_mlx5_pci(path)) |
| return -1; |
| |
| strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1); |
| |
| len = readlink(path, iommu_group_path, sizeof(iommu_group_path) - 1); |
| if (len <= 0) |
| return -1; |
| |
| iommu_group_path[len] = 0; |
| group_name = basename(iommu_group_path); |
| |
| if (sscanf(group_name, "%d", &groupid) != 1) |
| return -1; |
| |
| snprintf(path, sizeof(path), "/dev/vfio/%d", groupid); |
| ret = stat(path, &st); |
| if (ret < 0) |
| return -1; |
| |
| return groupid; |
| } |
| |
| static int mlx5_vfio_get_handle(struct mlx5_vfio_device *vfio_dev, |
| struct mlx5dv_vfio_context_attr *attr) |
| { |
| int iommu_group; |
| |
| iommu_group = mlx5_vfio_get_iommu_group_id(attr->pci_name); |
| if (iommu_group < 0) |
| return -1; |
| |
| sprintf(vfio_dev->vfio_path, "/dev/vfio/%d", iommu_group); |
| vfio_dev->pci_name = strdup(attr->pci_name); |
| if (!vfio_dev->pci_name) |
| return -1; |
| |
| return 0; |
| } |
| |
| int mlx5dv_vfio_get_events_fd(struct ibv_context *ibctx) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| |
| return ctx->cmd_comp_fd; |
| } |
| |
| int mlx5dv_vfio_process_events(struct ibv_context *ibctx) |
| { |
| struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx); |
| uint64_t u; |
| ssize_t s; |
| |
| mlx5_vfio_poll_health(ctx); |
| |
| /* read to re-arm the FD and process all existing events */ |
| s = read(ctx->cmd_comp_fd, &u, sizeof(uint64_t)); |
| if (s < 0 && errno != EAGAIN) { |
| mlx5_err(ctx->dbg_fp, "%s, read failed, errno=%d\n", |
| __func__, errno); |
| return errno; |
| } |
| |
| return mlx5_vfio_process_async_events(ctx); |
| } |
| |
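| /* |
| * Entry point for VFIO mode. A minimal usage sketch (the PCI name is only an |
| * example and error handling is omitted): |
| * |
| *   struct mlx5dv_vfio_context_attr attr = { |
| *           .pci_name = "0000:3b:00.0", |
| *           .flags = 0, |
| *           .comp_mask = 0, |
| *   }; |
| *   struct ibv_device **list = mlx5dv_get_vfio_device_list(&attr); |
| *   struct ibv_context *ctx = ibv_open_device(list[0]); |
| * |
| * Async/command completions are then driven by polling the FD returned by |
| * mlx5dv_vfio_get_events_fd() and calling mlx5dv_vfio_process_events(). |
| */ |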
| struct ibv_device ** |
| mlx5dv_get_vfio_device_list(struct mlx5dv_vfio_context_attr *attr) |
| { |
| struct mlx5_vfio_device *vfio_dev; |
| struct ibv_device **list = NULL; |
| int err; |
| |
| if (!check_comp_mask(attr->comp_mask, 0) || |
| !check_comp_mask(attr->flags, MLX5DV_VFIO_CTX_FLAGS_INIT_LINK_DOWN)) { |
| errno = EOPNOTSUPP; |
| return NULL; |
| } |
| |
| list = calloc(2, sizeof(struct ibv_device *)); |
| if (!list) { |
| errno = ENOMEM; |
| return NULL; |
| } |
| |
| vfio_dev = calloc(1, sizeof(*vfio_dev)); |
| if (!vfio_dev) { |
| errno = ENOMEM; |
| goto end; |
| } |
| |
| vfio_dev->vdev.ops = &mlx5_vfio_dev_ops; |
| atomic_init(&vfio_dev->vdev.refcount, 1); |
| |
| /* Find the vfio handle for attrs, store in mlx5_vfio_device */ |
| err = mlx5_vfio_get_handle(vfio_dev, attr); |
| if (err) |
| goto err_get; |
| |
| vfio_dev->flags = attr->flags; |
| vfio_dev->page_size = sysconf(_SC_PAGESIZE); |
| atomic_init(&vfio_dev->mkey_var, 0); |
| |
| list[0] = &vfio_dev->vdev.device; |
| return list; |
| |
| err_get: |
| free(vfio_dev); |
| end: |
| free(list); |
| return NULL; |
| } |
| |
| bool is_mlx5_vfio_dev(struct ibv_device *device) |
| { |
| struct verbs_device *verbs_device = verbs_get_device(device); |
| |
| return verbs_device->ops == &mlx5_vfio_dev_ops; |
| } |