// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
/*
* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved
*/
#define _GNU_SOURCE
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <errno.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <sys/param.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <poll.h>
#include <util/mmio.h>
#include <ccan/array_size.h>
#include "mlx5dv.h"
#include "mlx5_vfio.h"
#include "mlx5.h"
#include "mlx5_ifc.h"
enum {
MLX5_VFIO_CMD_VEC_IDX,
};
enum {
MLX5_VFIO_SUPP_MR_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_RELAXED_ORDERING,
MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ,
};
static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, uint16_t func_id,
int32_t npages, bool is_event);
static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id,
int npages);
static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
struct mlx5_cmd_msg *msg);
static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx,
uint32_t size, struct mlx5_cmd_msg *msg);
static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in,
int ilen, void *out, int olen,
unsigned int slot, bool async);
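/* Pin and map a host virtual range into the VFIO type1 container's IOMMU
 * domain at the given IOVA, so the device can DMA to and from it.
 */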
static int mlx5_vfio_register_mem(struct mlx5_vfio_context *ctx,
void *vaddr, uint64_t iova, uint64_t size)
{
struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
dma_map.vaddr = (uintptr_t)vaddr;
dma_map.size = size;
dma_map.iova = iova;
dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
return ioctl(ctx->container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
}
static void mlx5_vfio_unregister_mem(struct mlx5_vfio_context *ctx,
uint64_t iova, uint64_t size)
{
struct vfio_iommu_type1_dma_unmap dma_unmap = {};
dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
dma_unmap.size = size;
dma_unmap.iova = iova;
if (ioctl(ctx->container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap))
assert(false);
}
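/* DMA page allocator: host memory is carved into aligned blocks of
 * MLX5_VFIO_BLOCK_SIZE, each DMA-mapped once and then split into
 * MLX5_ADAPTER_PAGE_SIZE pages tracked by a per-block free bitmap.
 */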
static struct page_block *mlx5_vfio_new_block(struct mlx5_vfio_context *ctx)
{
struct page_block *page_block;
int err;
page_block = calloc(1, sizeof(*page_block));
if (!page_block) {
errno = ENOMEM;
return NULL;
}
err = posix_memalign(&page_block->page_ptr, MLX5_VFIO_BLOCK_SIZE,
MLX5_VFIO_BLOCK_SIZE);
if (err) {
errno = err;
goto err;
}
err = iset_alloc_range(ctx->iova_alloc, MLX5_VFIO_BLOCK_SIZE,
&page_block->iova, MLX5_VFIO_BLOCK_SIZE);
if (err)
goto err_range;
bitmap_fill(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES);
err = mlx5_vfio_register_mem(ctx, page_block->page_ptr, page_block->iova,
MLX5_VFIO_BLOCK_SIZE);
if (err)
goto err_reg;
list_add(&ctx->mem_alloc.block_list, &page_block->next_block);
return page_block;
err_reg:
iset_insert_range(ctx->iova_alloc, page_block->iova,
MLX5_VFIO_BLOCK_SIZE);
err_range:
free(page_block->page_ptr);
err:
free(page_block);
return NULL;
}
static void mlx5_vfio_free_block(struct mlx5_vfio_context *ctx,
struct page_block *page_block)
{
mlx5_vfio_unregister_mem(ctx, page_block->iova, MLX5_VFIO_BLOCK_SIZE);
iset_insert_range(ctx->iova_alloc, page_block->iova, MLX5_VFIO_BLOCK_SIZE);
list_del(&page_block->next_block);
free(page_block->page_ptr);
free(page_block);
}
static int mlx5_vfio_alloc_page(struct mlx5_vfio_context *ctx, uint64_t *iova)
{
struct page_block *page_block;
unsigned long pg;
int ret = 0;
pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex);
while (true) {
list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) {
pg = bitmap_find_first_bit(page_block->free_pages, 0,
MLX5_VFIO_BLOCK_NUM_PAGES);
if (pg != MLX5_VFIO_BLOCK_NUM_PAGES) {
bitmap_clear_bit(page_block->free_pages, pg);
*iova = page_block->iova + pg * MLX5_ADAPTER_PAGE_SIZE;
goto end;
}
}
if (!mlx5_vfio_new_block(ctx)) {
ret = -1;
goto end;
}
}
end:
pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex);
return ret;
}
static void mlx5_vfio_free_page(struct mlx5_vfio_context *ctx, uint64_t iova)
{
struct page_block *page_block;
unsigned long pg;
pthread_mutex_lock(&ctx->mem_alloc.block_list_mutex);
list_for_each(&ctx->mem_alloc.block_list, page_block, next_block) {
if (page_block->iova > iova ||
(page_block->iova + MLX5_VFIO_BLOCK_SIZE <= iova))
continue;
pg = (iova - page_block->iova) / MLX5_ADAPTER_PAGE_SIZE;
assert(!bitmap_test_bit(page_block->free_pages, pg));
bitmap_set_bit(page_block->free_pages, pg);
if (bitmap_full(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES))
mlx5_vfio_free_block(ctx, page_block);
goto end;
}
assert(false);
end:
pthread_mutex_unlock(&ctx->mem_alloc.block_list_mutex);
}
static const char *cmd_status_str(uint8_t status)
{
switch (status) {
case MLX5_CMD_STAT_OK:
return "OK";
case MLX5_CMD_STAT_INT_ERR:
return "internal error";
case MLX5_CMD_STAT_BAD_OP_ERR:
return "bad operation";
case MLX5_CMD_STAT_BAD_PARAM_ERR:
return "bad parameter";
case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:
return "bad system state";
case MLX5_CMD_STAT_BAD_RES_ERR:
return "bad resource";
case MLX5_CMD_STAT_RES_BUSY:
return "resource busy";
case MLX5_CMD_STAT_LIM_ERR:
return "limits exceeded";
case MLX5_CMD_STAT_BAD_RES_STATE_ERR:
return "bad resource state";
case MLX5_CMD_STAT_IX_ERR:
return "bad index";
case MLX5_CMD_STAT_NO_RES_ERR:
return "no resources";
case MLX5_CMD_STAT_BAD_INP_LEN_ERR:
return "bad input length";
case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:
return "bad output length";
case MLX5_CMD_STAT_BAD_QP_STATE_ERR:
return "bad QP state";
case MLX5_CMD_STAT_BAD_PKT_ERR:
return "bad packet (discarded)";
case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:
return "bad size too many outstanding CQEs";
default:
return "unknown status";
}
}
static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, uint32_t entry)
{
return eq->vaddr + entry * MLX5_EQE_SIZE;
}
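/* Return the next EQE owned by software, or NULL if the EQ is empty.
 * Ownership is decided by comparing the EQE owner bit against the parity
 * of the consumer index wrap count.
 */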
static struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, uint32_t cc)
{
uint32_t ci = eq->cons_index + cc;
struct mlx5_eqe *eqe;
eqe = get_eqe(eq, ci & (eq->nent - 1));
eqe = ((eqe->owner & 1) ^ !!(ci & eq->nent)) ? NULL : eqe;
if (eqe)
udma_from_device_barrier();
return eqe;
}
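/* Publish the EQ consumer index to the device. With arm set, the write also
 * re-arms the EQ so the next event raises an interrupt; without arm it only
 * advances the consumer index.
 */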
static void eq_update_ci(struct mlx5_eq *eq, uint32_t cc, int arm)
{
__be32 *addr = eq->doorbell + (arm ? 0 : 2);
uint32_t val;
eq->cons_index += cc;
val = (eq->cons_index & 0xffffff) | (eq->eqn << 24);
mmio_write32_be(addr, htobe32(val));
udma_to_device_barrier();
}
static int mlx5_vfio_handle_page_req_event(struct mlx5_vfio_context *ctx,
struct mlx5_eqe *eqe)
{
struct mlx5_eqe_page_req *req = &eqe->data.req_pages;
int32_t num_pages;
uint16_t func_id;
func_id = be16toh(req->func_id);
num_pages = be32toh(req->num_pages);
if (num_pages > 0)
return mlx5_vfio_give_pages(ctx, func_id, num_pages, true);
return mlx5_vfio_reclaim_pages(ctx, func_id, -1 * num_pages);
}
static void mlx5_cmd_mbox_status(void *out, uint8_t *status, uint32_t *syndrome)
{
*status = DEVX_GET(mbox_out, out, status);
*syndrome = DEVX_GET(mbox_out, out, syndrome);
}
static int mlx5_vfio_cmd_check(struct mlx5_vfio_context *ctx, void *in, void *out)
{
uint32_t syndrome;
uint8_t status;
uint16_t opcode;
uint16_t op_mod;
mlx5_cmd_mbox_status(out, &status, &syndrome);
if (!status)
return 0;
opcode = DEVX_GET(mbox_in, in, opcode);
op_mod = DEVX_GET(mbox_in, in, op_mod);
mlx5_err(ctx->dbg_fp,
"mlx5_vfio_op_code(0x%x), op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
opcode, op_mod,
cmd_status_str(status), status, syndrome);
errno = mlx5_cmd_status_to_err(status);
return errno;
}
static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size,
struct mlx5_cmd_layout *cmd_lay)
{
struct mlx5_cmd_block *block;
struct mlx5_cmd_mailbox *next;
int copy;
copy = min_t(int, size, sizeof(cmd_lay->out));
memcpy(to, cmd_lay->out, copy);
size -= copy;
to += copy;
next = from->next;
while (size) {
if (!next) {
assert(false);
errno = ENOMEM;
return errno;
}
copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
block = next->buf;
memcpy(to, block->data, copy);
to += copy;
size -= copy;
next = next->next;
}
return 0;
}
static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size,
struct mlx5_cmd_layout *cmd_lay)
{
struct mlx5_cmd_block *block;
struct mlx5_cmd_mailbox *next;
int copy;
copy = min_t(int, size, sizeof(cmd_lay->in));
memcpy(cmd_lay->in, from, copy);
size -= copy;
from += copy;
next = to->next;
while (size) {
if (!next) {
assert(false);
errno = ENOMEM;
return errno;
}
copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
block = next->buf;
memcpy(block->data, from, copy);
from += copy;
size -= copy;
next = next->next;
}
return 0;
}
/* The HCA will think the queue has overflowed if we don't tell it we've been
* processing events.
* We create EQs with MLX5_NUM_SPARE_EQE extra entries,
* so we must update our consumer index at least that often.
*/
static inline uint32_t mlx5_eq_update_cc(struct mlx5_eq *eq, uint32_t cc)
{
if (unlikely(cc >= MLX5_NUM_SPARE_EQE)) {
eq_update_ci(eq, cc, 0);
cc = 0;
}
return cc;
}
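/* Completion handler for the dedicated page-request command slot: copy out
 * the MANAGE_PAGES result, release pages the device returned on a reclaim,
 * and re-post any page-request command that was queued while the slot was
 * busy.
 */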
static int mlx5_vfio_process_page_request_comp(struct mlx5_vfio_context *ctx,
unsigned long slot)
{
struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot];
struct cmd_async_data *cmd_data = &cmd_slot->curr;
int num_claimed;
int ret, i;
ret = mlx5_copy_from_msg(cmd_data->buff_out, &cmd_slot->out,
cmd_data->olen, cmd_slot->lay);
if (ret)
goto end;
ret = mlx5_vfio_cmd_check(ctx, cmd_data->buff_in, cmd_data->buff_out);
if (ret)
goto end;
if (DEVX_GET(manage_pages_in, cmd_data->buff_in, op_mod) == MLX5_PAGES_GIVE)
goto end;
num_claimed = DEVX_GET(manage_pages_out, cmd_data->buff_out, output_num_entries);
if (num_claimed > DEVX_GET(manage_pages_in, cmd_data->buff_in, input_num_entries)) {
ret = EINVAL;
errno = ret;
goto end;
}
for (i = 0; i < num_claimed; i++)
mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_out, cmd_data->buff_out, pas[i]));
end:
free(cmd_data->buff_in);
free(cmd_data->buff_out);
cmd_slot->in_use = false;
if (!ret && cmd_slot->is_pending) {
cmd_data = &cmd_slot->pending;
pthread_mutex_lock(&cmd_slot->lock);
cmd_slot->is_pending = false;
ret = mlx5_vfio_post_cmd(ctx, cmd_data->buff_in, cmd_data->ilen,
cmd_data->buff_out, cmd_data->olen, slot, true);
pthread_mutex_unlock(&cmd_slot->lock);
}
return ret;
}
static int mlx5_vfio_cmd_comp(struct mlx5_vfio_context *ctx, unsigned long slot)
{
uint64_t u = 1;
ssize_t s;
s = write(ctx->cmd.cmds[slot].completion_event_fd, &u,
sizeof(uint64_t));
if (s != sizeof(uint64_t))
return -1;
return 0;
}
static int mlx5_vfio_process_cmd_eqe(struct mlx5_vfio_context *ctx,
struct mlx5_eqe *eqe)
{
struct mlx5_eqe_cmd *cmd_eqe = &eqe->data.cmd;
unsigned long vector = be32toh(cmd_eqe->vector);
unsigned long slot;
int count = 0;
int ret;
for (slot = 0; slot < MLX5_MAX_COMMANDS; slot++) {
if (vector & (1 << slot)) {
assert(ctx->cmd.cmds[slot].comp_func);
ret = ctx->cmd.cmds[slot].comp_func(ctx, slot);
if (ret)
return ret;
vector &= ~(1 << slot);
count++;
}
}
assert(!vector && count);
return 0;
}
static int mlx5_vfio_process_async_events(struct mlx5_vfio_context *ctx)
{
struct mlx5_eqe *eqe;
int ret = 0;
int cc = 0;
pthread_mutex_lock(&ctx->eq_lock);
while ((eqe = mlx5_eq_get_eqe(&ctx->async_eq, cc))) {
switch (eqe->type) {
case MLX5_EVENT_TYPE_CMD:
ret = mlx5_vfio_process_cmd_eqe(ctx, eqe);
break;
case MLX5_EVENT_TYPE_PAGE_REQUEST:
ret = mlx5_vfio_handle_page_req_event(ctx, eqe);
break;
default:
break;
}
cc = mlx5_eq_update_cc(&ctx->async_eq, ++cc);
if (ret)
goto out;
}
out:
eq_update_ci(&ctx->async_eq, cc, 1);
pthread_mutex_unlock(&ctx->eq_lock);
return ret;
}
static int mlx5_vfio_enlarge_cmd_msg(struct mlx5_vfio_context *ctx, struct mlx5_cmd_msg *cmd_msg,
struct mlx5_cmd_layout *cmd_lay, uint32_t len, bool is_in)
{
int err;
mlx5_vfio_free_cmd_msg(ctx, cmd_msg);
err = mlx5_vfio_alloc_cmd_msg(ctx, len, cmd_msg);
if (err)
return err;
if (is_in)
cmd_lay->iptr = htobe64(cmd_msg->next->iova);
else
cmd_lay->optr = htobe64(cmd_msg->next->iova);
return 0;
}
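/* Wait for a command completion on the given slot. Poll both the async EQ
 * eventfd (which drives command and page-request events) and the per-slot
 * completion eventfd, returning once the slot's ownership bit is cleared.
 */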
static int mlx5_vfio_wait_event(struct mlx5_vfio_context *ctx,
unsigned int slot)
{
struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
uint64_t u;
ssize_t s;
int err;
struct pollfd fds[2] = {
{ .fd = ctx->cmd_comp_fd, .events = POLLIN },
{ .fd = ctx->cmd.cmds[slot].completion_event_fd, .events = POLLIN }
};
while (true) {
err = poll(fds, 2, -1);
if (err < 0 && errno != EAGAIN) {
mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, poll failed, errno=%d\n", errno);
return errno;
}
if (fds[0].revents & POLLIN) {
s = read(fds[0].fd, &u, sizeof(uint64_t));
if (s < 0 && errno != EAGAIN) {
mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, errno=%d\n", errno);
return errno;
}
err = mlx5_vfio_process_async_events(ctx);
if (err)
return err;
}
if (fds[1].revents & POLLIN) {
s = read(fds[1].fd, &u, sizeof(uint64_t));
if (s < 0 && errno != EAGAIN) {
mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, slot=%d, errno=%d\n",
slot, errno);
return errno;
}
if (!(mmio_read8(&cmd_lay->status_own) & 0x1))
return 0;
}
}
}
/* One minute for the sake of bringup */
#define MLX5_CMD_TIMEOUT_MSEC (60 * 1000)
static int mlx5_vfio_poll_timeout(struct mlx5_cmd_layout *cmd_lay)
{
static struct timeval start, curr;
uint64_t ms_start, ms_curr;
gettimeofday(&start, NULL);
ms_start = (uint64_t)start.tv_sec * 1000 + start.tv_usec / 1000;
do {
if (!(mmio_read8(&cmd_lay->status_own) & 0x1))
return 0;
sched_yield();
gettimeofday(&curr, NULL);
ms_curr = (uint64_t)curr.tv_sec * 1000 + curr.tv_usec / 1000;
} while (ms_curr - ms_start < MLX5_CMD_TIMEOUT_MSEC);
errno = ETIMEDOUT;
return errno;
}
static int mlx5_vfio_cmd_prep_in(struct mlx5_vfio_context *ctx,
struct mlx5_cmd_msg *cmd_in,
struct mlx5_cmd_layout *cmd_lay,
void *in, int ilen)
{
int err;
if (ilen > cmd_in->len) {
err = mlx5_vfio_enlarge_cmd_msg(ctx, cmd_in, cmd_lay, ilen, true);
if (err)
return err;
}
err = mlx5_copy_to_msg(cmd_in, in, ilen, cmd_lay);
if (err)
return err;
cmd_lay->ilen = htobe32(ilen);
return 0;
}
static int mlx5_vfio_cmd_prep_out(struct mlx5_vfio_context *ctx,
struct mlx5_cmd_msg *cmd_out,
struct mlx5_cmd_layout *cmd_lay, int olen)
{
struct mlx5_cmd_mailbox *tmp;
struct mlx5_cmd_block *block;
cmd_lay->olen = htobe32(olen);
/* zeroing output header */
memset(cmd_lay->out, 0, sizeof(cmd_lay->out));
if (olen > cmd_out->len)
/* The newly allocated output message is already zeroed */
return mlx5_vfio_enlarge_cmd_msg(ctx, cmd_out, cmd_lay, olen, false);
/* zeroing output message */
tmp = cmd_out->next;
olen -= min_t(int, olen, sizeof(cmd_lay->out));
while (olen > 0) {
block = tmp->buf;
memset(block->data, 0, MLX5_CMD_DATA_BLOCK_SIZE);
olen -= MLX5_CMD_DATA_BLOCK_SIZE;
tmp = tmp->next;
assert(tmp || olen <= 0);
}
return 0;
}
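/* Post a command on the given slot: stage the input/output mailboxes, hand
 * ownership of the slot to the device via status_own and ring the command
 * doorbell. For an async post while the slot is still busy, the command is
 * saved as pending and issued later from the completion handler.
 */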
static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in,
int ilen, void *out, int olen,
unsigned int slot, bool async)
{
struct mlx5_init_seg *init_seg = ctx->bar_map;
struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
struct mlx5_cmd_msg *cmd_in = &ctx->cmd.cmds[slot].in;
struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out;
int err;
/* Lock was taken by caller */
if (async && ctx->cmd.cmds[slot].in_use) {
struct cmd_async_data *pending = &ctx->cmd.cmds[slot].pending;
if (ctx->cmd.cmds[slot].is_pending) {
assert(false);
return EINVAL;
}
/* We might get another page request event before the previous command
 * has completed. Save the new work here and post it once the completion
 * for the in-flight command arrives.
 */
pending->buff_in = in;
pending->buff_out = out;
pending->ilen = ilen;
pending->olen = olen;
ctx->cmd.cmds[slot].is_pending = true;
return 0;
}
err = mlx5_vfio_cmd_prep_in(ctx, cmd_in, cmd_lay, in, ilen);
if (err)
return err;
err = mlx5_vfio_cmd_prep_out(ctx, cmd_out, cmd_lay, olen);
if (err)
return err;
if (async) {
ctx->cmd.cmds[slot].in_use = true;
ctx->cmd.cmds[slot].curr.ilen = ilen;
ctx->cmd.cmds[slot].curr.olen = olen;
ctx->cmd.cmds[slot].curr.buff_in = in;
ctx->cmd.cmds[slot].curr.buff_out = out;
}
cmd_lay->status_own = 0x1;
udma_to_device_barrier();
mmio_write32_be(&init_seg->cmd_dbell, htobe32(0x1 << slot));
return 0;
}
static int mlx5_vfio_cmd_do(struct mlx5_vfio_context *ctx, void *in,
int ilen, void *out, int olen,
unsigned int slot)
{
struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out;
int err;
pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
err = mlx5_vfio_post_cmd(ctx, in, ilen, out, olen, slot, false);
if (err)
goto end;
if (ctx->have_eq) {
err = mlx5_vfio_wait_event(ctx, slot);
if (err)
goto end;
} else {
err = mlx5_vfio_poll_timeout(cmd_lay);
if (err)
goto end;
udma_from_device_barrier();
}
err = mlx5_copy_from_msg(out, cmd_out, olen, cmd_lay);
if (err)
goto end;
if (DEVX_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK)
err = EREMOTEIO;
end:
pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock);
return err;
}
static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in,
int ilen, void *out, int olen,
unsigned int slot)
{
int err;
err = mlx5_vfio_cmd_do(ctx, in, ilen, out, olen, slot);
if (err != EREMOTEIO)
return err;
return mlx5_vfio_cmd_check(ctx, in, out);
}
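/* Enable PCI memory space and bus mastering by writing 0x6 to the device's
 * PCI command register through the VFIO config space region.
 */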
static int mlx5_vfio_enable_pci_cmd(struct mlx5_vfio_context *ctx)
{
struct vfio_region_info pci_config_reg = {};
uint16_t pci_com_buf = 0x6;
char buffer[4096];
pci_config_reg.argsz = sizeof(pci_config_reg);
pci_config_reg.index = VFIO_PCI_CONFIG_REGION_INDEX;
if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, &pci_config_reg))
return -1;
if (pwrite(ctx->device_fd, &pci_com_buf, 2, pci_config_reg.offset + 0x4) != 2)
return -1;
if (pread(ctx->device_fd, buffer, pci_config_reg.size, pci_config_reg.offset)
!= pci_config_reg.size)
return -1;
return 0;
}
static void free_cmd_box(struct mlx5_vfio_context *ctx,
struct mlx5_cmd_mailbox *mailbox)
{
mlx5_vfio_unregister_mem(ctx, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE);
iset_insert_range(ctx->iova_alloc, mailbox->iova, MLX5_ADAPTER_PAGE_SIZE);
free(mailbox->buf);
free(mailbox);
}
static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_vfio_context *ctx)
{
struct mlx5_cmd_mailbox *mailbox;
int ret;
mailbox = calloc(1, sizeof(*mailbox));
if (!mailbox) {
errno = ENOMEM;
return NULL;
}
ret = posix_memalign(&mailbox->buf, MLX5_ADAPTER_PAGE_SIZE,
MLX5_ADAPTER_PAGE_SIZE);
if (ret) {
errno = ret;
goto err_free;
}
memset(mailbox->buf, 0, MLX5_ADAPTER_PAGE_SIZE);
ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE,
&mailbox->iova, MLX5_ADAPTER_PAGE_SIZE);
if (ret)
goto err_tree;
ret = mlx5_vfio_register_mem(ctx, mailbox->buf, mailbox->iova,
MLX5_ADAPTER_PAGE_SIZE);
if (ret)
goto err_reg;
return mailbox;
err_reg:
iset_insert_range(ctx->iova_alloc, mailbox->iova,
MLX5_ADAPTER_PAGE_SIZE);
err_tree:
free(mailbox->buf);
err_free:
free(mailbox);
return NULL;
}
static int mlx5_calc_cmd_blocks(uint32_t msg_len)
{
int size = msg_len;
int blen = size - min_t(int, 16, size);
return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE);
}
static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
struct mlx5_cmd_msg *msg)
{
struct mlx5_cmd_mailbox *head = msg->next;
struct mlx5_cmd_mailbox *next;
while (head) {
next = head->next;
free_cmd_box(ctx, head);
head = next;
}
msg->len = 0;
}
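/* Allocate the mailbox chain for a command message. Mailboxes are created
 * last-block-first so that each new block's next pointer and block_num can
 * be filled as it is linked in; the final head of the chain is block 0.
 */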
static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx,
uint32_t size, struct mlx5_cmd_msg *msg)
{
struct mlx5_cmd_mailbox *tmp, *head = NULL;
struct mlx5_cmd_block *block;
int i, num_blocks;
msg->len = size;
num_blocks = mlx5_calc_cmd_blocks(size);
for (i = 0; i < num_blocks; i++) {
tmp = alloc_cmd_box(ctx);
if (!tmp)
goto err_alloc;
block = tmp->buf;
tmp->next = head;
block->next = htobe64(tmp->next ? tmp->next->iova : 0);
block->block_num = htobe32(num_blocks - i - 1);
head = tmp;
}
msg->next = head;
return 0;
err_alloc:
while (head) {
tmp = head->next;
free_cmd_box(ctx, head);
head = tmp;
}
msg->len = 0;
return -1;
}
static void mlx5_vfio_free_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
{
struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot];
mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in);
mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out);
close(cmd_slot->completion_event_fd);
}
static int mlx5_vfio_setup_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
{
struct mlx5_vfio_cmd *cmd = &ctx->cmd;
struct mlx5_vfio_cmd_slot *cmd_slot = &cmd->cmds[slot];
struct mlx5_cmd_layout *cmd_lay;
int ret;
ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->in);
if (ret)
return ret;
ret = mlx5_vfio_alloc_cmd_msg(ctx, 4096, &cmd_slot->out);
if (ret)
goto err;
cmd_lay = cmd->vaddr + (slot * (1 << cmd->log_stride));
cmd_lay->type = MLX5_PCI_CMD_XPORT;
cmd_lay->iptr = htobe64(cmd_slot->in.next->iova);
cmd_lay->optr = htobe64(cmd_slot->out.next->iova);
cmd_slot->lay = cmd_lay;
cmd_slot->completion_event_fd = eventfd(0, EFD_CLOEXEC);
if (cmd_slot->completion_event_fd < 0) {
ret = -1;
goto err_fd;
}
if (slot != MLX5_MAX_COMMANDS - 1)
cmd_slot->comp_func = mlx5_vfio_cmd_comp;
else
cmd_slot->comp_func = mlx5_vfio_process_page_request_comp;
pthread_mutex_init(&cmd_slot->lock, NULL);
return 0;
err_fd:
mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->out);
err:
mlx5_vfio_free_cmd_msg(ctx, &cmd_slot->in);
return ret;
}
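/* Bring up the command interface: validate the command interface revision,
 * read the command queue size and stride from the init segment, allocate and
 * DMA-map a 4K page for the command queue, program its address, then set up
 * slot 0 for regular commands and the last slot for page-request commands.
 */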
static int mlx5_vfio_init_cmd_interface(struct mlx5_vfio_context *ctx)
{
struct mlx5_init_seg *init_seg = ctx->bar_map;
struct mlx5_vfio_cmd *cmd = &ctx->cmd;
uint16_t cmdif_rev;
uint32_t cmd_h, cmd_l;
int ret;
cmdif_rev = be32toh(init_seg->cmdif_rev_fw_sub) >> 16;
if (cmdif_rev != 5) {
errno = EINVAL;
return -1;
}
cmd_l = be32toh(init_seg->cmdq_addr_l_sz) & 0xff;
ctx->cmd.log_sz = cmd_l >> 4 & 0xf;
ctx->cmd.log_stride = cmd_l & 0xf;
if (1 << ctx->cmd.log_sz > MLX5_MAX_COMMANDS) {
errno = EINVAL;
return -1;
}
if (ctx->cmd.log_sz + ctx->cmd.log_stride > MLX5_ADAPTER_PAGE_SHIFT) {
errno = EINVAL;
return -1;
}
/* The initial address must be 4K aligned */
ret = posix_memalign(&cmd->vaddr, MLX5_ADAPTER_PAGE_SIZE,
MLX5_ADAPTER_PAGE_SIZE);
if (ret) {
errno = ret;
return -1;
}
memset(cmd->vaddr, 0, MLX5_ADAPTER_PAGE_SIZE);
ret = iset_alloc_range(ctx->iova_alloc, MLX5_ADAPTER_PAGE_SIZE,
&cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
if (ret)
goto err_free;
ret = mlx5_vfio_register_mem(ctx, cmd->vaddr, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
if (ret)
goto err_reg;
cmd_h = (uint32_t)((uint64_t)(cmd->iova) >> 32);
cmd_l = (uint32_t)(uint64_t)(cmd->iova);
init_seg->cmdq_addr_h = htobe32(cmd_h);
init_seg->cmdq_addr_l_sz = htobe32(cmd_l);
/* Make sure firmware sees the complete address before we proceed */
udma_to_device_barrier();
ret = mlx5_vfio_setup_cmd_slot(ctx, 0);
if (ret)
goto err_slot_0;
ret = mlx5_vfio_setup_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
if (ret)
goto err_slot_1;
ret = mlx5_vfio_enable_pci_cmd(ctx);
if (!ret)
return 0;
mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
err_slot_1:
mlx5_vfio_free_cmd_slot(ctx, 0);
err_slot_0:
mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
err_reg:
iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
err_free:
free(cmd->vaddr);
return ret;
}
static void mlx5_vfio_clean_cmd_interface(struct mlx5_vfio_context *ctx)
{
struct mlx5_vfio_cmd *cmd = &ctx->cmd;
mlx5_vfio_free_cmd_slot(ctx, 0);
mlx5_vfio_free_cmd_slot(ctx, MLX5_MAX_COMMANDS - 1);
mlx5_vfio_unregister_mem(ctx, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
iset_insert_range(ctx->iova_alloc, cmd->iova, MLX5_ADAPTER_PAGE_SIZE);
free(cmd->vaddr);
}
static void set_iova_min_page_size(struct mlx5_vfio_context *ctx,
uint64_t iova_pgsizes)
{
int i;
for (i = MLX5_ADAPTER_PAGE_SHIFT; i < 64; i++) {
if (iova_pgsizes & (1ULL << i)) {
ctx->iova_min_page_size = 1ULL << i;
return;
}
}
assert(false);
}
/* If the kernel does not report usable IOVA regions, fall back to the legacy ranges below */
#define MLX5_VFIO_IOVA_MIN1 0x10000ULL
#define MLX5_VFIO_IOVA_MAX1 0xFEDFFFFFULL
#define MLX5_VFIO_IOVA_MIN2 0xFEF00000ULL
#define MLX5_VFIO_IOVA_MAX2 ((1ULL << 39) - 1)
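/* Query the VFIO type1 IOMMU for its supported page sizes and usable IOVA
 * ranges (via the capability chain) and seed the IOVA allocator with them,
 * falling back to the legacy ranges when none are reported.
 */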
static int mlx5_vfio_get_iommu_info(struct mlx5_vfio_context *ctx)
{
struct vfio_iommu_type1_info *info;
int ret, i;
void *ptr;
uint32_t offset;
info = calloc(1, sizeof(*info));
if (!info) {
errno = ENOMEM;
return -1;
}
info->argsz = sizeof(*info);
ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info);
if (ret)
goto end;
if (info->argsz > sizeof(*info)) {
struct vfio_iommu_type1_info *tmp;
tmp = realloc(info, info->argsz);
if (!tmp) {
errno = ENOMEM;
ret = -1;
goto end;
}
info = tmp;
ret = ioctl(ctx->container_fd, VFIO_IOMMU_GET_INFO, info);
if (ret)
goto end;
}
set_iova_min_page_size(ctx, (info->flags & VFIO_IOMMU_INFO_PGSIZES) ?
info->iova_pgsizes : 4096);
if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
goto set_legacy;
offset = info->cap_offset;
while (offset) {
struct vfio_iommu_type1_info_cap_iova_range *iova_range;
struct vfio_info_cap_header *header;
ptr = (void *)info + offset;
header = ptr;
if (header->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
offset = header->next;
continue;
}
iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)header;
for (i = 0; i < iova_range->nr_iovas; i++) {
ret = iset_insert_range(ctx->iova_alloc, iova_range->iova_ranges[i].start,
iova_range->iova_ranges[i].end -
iova_range->iova_ranges[i].start + 1);
if (ret)
goto end;
}
goto end;
}
set_legacy:
ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN1,
MLX5_VFIO_IOVA_MAX1 - MLX5_VFIO_IOVA_MIN1 + 1);
if (!ret)
ret = iset_insert_range(ctx->iova_alloc, MLX5_VFIO_IOVA_MIN2,
MLX5_VFIO_IOVA_MAX2 - MLX5_VFIO_IOVA_MIN2 + 1);
end:
free(info);
return ret;
}
static void mlx5_vfio_clean_device_dma(struct mlx5_vfio_context *ctx)
{
struct page_block *page_block, *tmp;
list_for_each_safe(&ctx->mem_alloc.block_list, page_block,
tmp, next_block)
mlx5_vfio_free_block(ctx, page_block);
iset_destroy(ctx->iova_alloc);
}
static int mlx5_vfio_init_device_dma(struct mlx5_vfio_context *ctx)
{
ctx->iova_alloc = iset_create();
if (!ctx->iova_alloc)
return -1;
list_head_init(&ctx->mem_alloc.block_list);
pthread_mutex_init(&ctx->mem_alloc.block_list_mutex, NULL);
if (mlx5_vfio_get_iommu_info(ctx))
goto err;
/* create an initial block of DMA memory ready to be used */
if (!mlx5_vfio_new_block(ctx))
goto err;
return 0;
err:
iset_destroy(ctx->iova_alloc);
return -1;
}
static void mlx5_vfio_uninit_bar0(struct mlx5_vfio_context *ctx)
{
munmap(ctx->bar_map, ctx->bar_map_size);
}
static int mlx5_vfio_init_bar0(struct mlx5_vfio_context *ctx)
{
struct vfio_region_info reg = { .argsz = sizeof(reg) };
void *base;
int err;
reg.index = 0;
err = ioctl(ctx->device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
if (err)
return err;
base = mmap(NULL, reg.size, PROT_READ | PROT_WRITE, MAP_SHARED,
ctx->device_fd, reg.offset);
if (base == MAP_FAILED)
return -1;
ctx->bar_map = (struct mlx5_init_seg *)base;
ctx->bar_map_size = reg.size;
return 0;
}
static int mlx5_vfio_msix_set_irqs(struct mlx5_vfio_context *ctx,
int start, int count, void *irq_set_buf)
{
struct vfio_irq_set *irq_set = (struct vfio_irq_set *)irq_set_buf;
irq_set->argsz = sizeof(*irq_set) + sizeof(int) * count;
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
irq_set->start = start;
irq_set->count = count;
return ioctl(ctx->device_fd, VFIO_DEVICE_SET_IRQS, irq_set);
}
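/* Allocate the MSI-X eventfd table and enable the full MSI-X range through
 * VFIO in a single call. Only the command-completion vector is backed by an
 * eventfd at this point; the remaining entries are initialized to -1.
 */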
static int mlx5_vfio_init_async_fd(struct mlx5_vfio_context *ctx)
{
struct vfio_irq_info irq = { .argsz = sizeof(irq) };
struct vfio_irq_set *irq_set_buf;
int fdlen, i;
irq.index = VFIO_PCI_MSIX_IRQ_INDEX;
if (ioctl(ctx->device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
return -1;
/* fail if this vector cannot be used with eventfd */
if ((irq.flags & VFIO_IRQ_INFO_EVENTFD) == 0)
return -1;
fdlen = sizeof(int) * irq.count;
ctx->msix_fds = calloc(1, fdlen);
if (!ctx->msix_fds) {
errno = ENOMEM;
return -1;
}
for (i = 0; i < irq.count; i++)
ctx->msix_fds[i] = -1;
/* set up an eventfd for command completion interrupts */
ctx->cmd_comp_fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
if (ctx->cmd_comp_fd < 0)
goto err_eventfd;
ctx->msix_fds[MLX5_VFIO_CMD_VEC_IDX] = ctx->cmd_comp_fd;
irq_set_buf = calloc(1, sizeof(*irq_set_buf) + fdlen);
if (!irq_set_buf) {
errno = ENOMEM;
goto err_irq_set_buf;
}
/* Enable MSI-X interrupts. The first time VFIO_DEVICE_SET_IRQS is called,
 * the count must be the maximum number of vectors we will ever need.
 */
memcpy(irq_set_buf->data, ctx->msix_fds, fdlen);
if (mlx5_vfio_msix_set_irqs(ctx, 0, irq.count, irq_set_buf))
goto err_msix;
free(irq_set_buf);
pthread_mutex_init(&ctx->msix_fds_lock, NULL);
ctx->vctx.context.num_comp_vectors = irq.count;
return 0;
err_msix:
free(irq_set_buf);
err_irq_set_buf:
close(ctx->cmd_comp_fd);
err_eventfd:
free(ctx->msix_fds);
return -1;
}
static void mlx5_vfio_close_fds(struct mlx5_vfio_context *ctx)
{
int vec;
close(ctx->device_fd);
close(ctx->container_fd);
close(ctx->group_fd);
pthread_mutex_lock(&ctx->msix_fds_lock);
for (vec = 0; vec < ctx->vctx.context.num_comp_vectors; vec++)
if (ctx->msix_fds[vec] >= 0)
close(ctx->msix_fds[vec]);
free(ctx->msix_fds);
pthread_mutex_unlock(&ctx->msix_fds_lock);
}
static int mlx5_vfio_open_fds(struct mlx5_vfio_context *ctx,
struct mlx5_vfio_device *mdev)
{
struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
/* Create a new container */
ctx->container_fd = open("/dev/vfio/vfio", O_RDWR);
if (ctx->container_fd < 0)
return -1;
if (ioctl(ctx->container_fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
goto close_cont;
if (!ioctl(ctx->container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
/* Doesn't support the IOMMU driver we want. */
goto close_cont;
/* Open the group */
ctx->group_fd = open(mdev->vfio_path, O_RDWR);
if (ctx->group_fd < 0)
goto close_cont;
/* Test the group is viable and available */
if (ioctl(ctx->group_fd, VFIO_GROUP_GET_STATUS, &group_status))
goto close_group;
if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
/* Group is not viable (i.e., not all devices in the group are bound to vfio) */
errno = EINVAL;
goto close_group;
}
/* Add the group to the container */
if (ioctl(ctx->group_fd, VFIO_GROUP_SET_CONTAINER, &ctx->container_fd))
goto close_group;
/* Enable the IOMMU model we want */
if (ioctl(ctx->container_fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
goto close_group;
/* Get a file descriptor for the device */
ctx->device_fd = ioctl(ctx->group_fd, VFIO_GROUP_GET_DEVICE_FD,
mdev->pci_name);
if (ctx->device_fd < 0)
goto close_group;
if (mlx5_vfio_init_async_fd(ctx))
goto close_group;
return 0;
close_group:
close(ctx->group_fd);
close_cont:
close(ctx->container_fd);
return -1;
}
enum {
MLX5_EQE_OWNER_INIT_VAL = 0x1,
};
static void init_eq_buf(struct mlx5_eq *eq)
{
struct mlx5_eqe *eqe;
int i;
for (i = 0; i < eq->nent; i++) {
eqe = get_eqe(eq, i);
eqe->owner = MLX5_EQE_OWNER_INIT_VAL;
}
}
static uint64_t uar2iova(struct mlx5_vfio_context *ctx, uint32_t index)
{
return (uint64_t)(uintptr_t)((void *)ctx->bar_map + (index * MLX5_ADAPTER_PAGE_SIZE));
}
static int mlx5_vfio_alloc_uar(struct mlx5_vfio_context *ctx, uint32_t *uarn)
{
uint32_t out[DEVX_ST_SZ_DW(alloc_uar_out)] = {};
uint32_t in[DEVX_ST_SZ_DW(alloc_uar_in)] = {};
int err;
DEVX_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR);
err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (!err)
*uarn = DEVX_GET(alloc_uar_out, out, uar);
return err;
}
static void mlx5_vfio_dealloc_uar(struct mlx5_vfio_context *ctx, uint32_t uarn)
{
uint32_t out[DEVX_ST_SZ_DW(dealloc_uar_out)] = {};
uint32_t in[DEVX_ST_SZ_DW(dealloc_uar_in)] = {};
DEVX_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR);
DEVX_SET(dealloc_uar_in, in, uar, uarn);
mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
}
static void mlx5_vfio_destroy_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq)
{
uint32_t in[DEVX_ST_SZ_DW(destroy_eq_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(destroy_eq_out)] = {};
DEVX_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
DEVX_SET(destroy_eq_in, in, eq_number, eq->eqn);
mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size);
iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size);
free(eq->vaddr);
}
static void destroy_async_eqs(struct mlx5_vfio_context *ctx)
{
ctx->have_eq = false;
mlx5_vfio_destroy_eq(ctx, &ctx->async_eq);
mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn);
}
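/* Create an event queue: allocate and DMA-map the EQE buffer (described by a
 * single PAS entry), initialize the EQE ownership bits, and issue CREATE_EQ
 * bound to the given MSI-X vector and the EQs UAR.
 */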
static int
create_map_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq,
struct mlx5_eq_param *param)
{
uint32_t out[DEVX_ST_SZ_DW(create_eq_out)] = {};
uint8_t vecidx = param->irq_index;
__be64 *pas;
void *eqc;
int inlen;
uint32_t *in;
int err;
int i;
int alloc_size;
pthread_mutex_init(&ctx->eq_lock, NULL);
eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE);
eq->cons_index = 0;
alloc_size = eq->nent * MLX5_EQE_SIZE;
eq->iova_size = max(roundup_pow_of_two(alloc_size), ctx->iova_min_page_size);
inlen = DEVX_ST_SZ_BYTES(create_eq_in) +
DEVX_FLD_SZ_BYTES(create_eq_in, pas[0]) * 1;
in = calloc(1, inlen);
if (!in)
return ENOMEM;
pas = (__be64 *)DEVX_ADDR_OF(create_eq_in, in, pas);
err = posix_memalign(&eq->vaddr, eq->iova_size, alloc_size);
if (err) {
errno = err;
goto end;
}
err = iset_alloc_range(ctx->iova_alloc, eq->iova_size,
&eq->iova, eq->iova_size);
if (err)
goto err_range;
err = mlx5_vfio_register_mem(ctx, eq->vaddr, eq->iova, eq->iova_size);
if (err)
goto err_reg;
pas[0] = htobe64(eq->iova);
init_eq_buf(eq);
DEVX_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
for (i = 0; i < 4; i++)
DEVX_ARRAY_SET64(create_eq_in, in, event_bitmask, i,
param->mask[i]);
eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry);
DEVX_SET(eqc, eqc, log_eq_size, ilog32(eq->nent - 1));
DEVX_SET(eqc, eqc, uar_page, ctx->eqs_uar.uarn);
DEVX_SET(eqc, eqc, intr, vecidx);
DEVX_SET(eqc, eqc, log_page_size, ilog32(eq->iova_size - 1) - MLX5_ADAPTER_PAGE_SHIFT);
err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
if (err)
goto err_cmd;
eq->vecidx = vecidx;
eq->eqn = DEVX_GET(create_eq_out, out, eq_number);
eq->doorbell = (void *)(uintptr_t)ctx->eqs_uar.iova + MLX5_EQ_DOORBEL_OFFSET;
free(in);
return 0;
err_cmd:
mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size);
err_reg:
iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size);
err_range:
free(eq->vaddr);
end:
free(in);
return err;
}
static int
setup_async_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq_param *param,
struct mlx5_eq *eq)
{
int err;
err = create_map_eq(ctx, eq, param);
if (err)
return err;
eq_update_ci(eq, 0, 1);
return 0;
}
static int create_async_eqs(struct mlx5_vfio_context *ctx)
{
struct mlx5_eq_param param = {};
int err;
err = mlx5_vfio_alloc_uar(ctx, &ctx->eqs_uar.uarn);
if (err)
return err;
ctx->eqs_uar.iova = uar2iova(ctx, ctx->eqs_uar.uarn);
param = (struct mlx5_eq_param) {
.irq_index = MLX5_VFIO_CMD_VEC_IDX,
.nent = MLX5_NUM_CMD_EQE,
.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD |
1ull << MLX5_EVENT_TYPE_PAGE_REQUEST,
};
err = setup_async_eq(ctx, &param, &ctx->async_eq);
if (err)
goto err;
ctx->have_eq = true;
return 0;
err:
mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn);
return err;
}
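/* Return pages to the driver on a firmware page request. The MANAGE_PAGES
 * (TAKE) command is posted asynchronously on the dedicated page slot; on
 * success its buffers are freed by the completion handler.
 */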
static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id,
int npages)
{
uint32_t inlen = DEVX_ST_SZ_BYTES(manage_pages_in);
int outlen;
uint32_t *out;
void *in;
int err;
int slot = MLX5_MAX_COMMANDS - 1;
outlen = DEVX_ST_SZ_BYTES(manage_pages_out);
outlen += npages * DEVX_FLD_SZ_BYTES(manage_pages_out, pas[0]);
out = calloc(1, outlen);
if (!out) {
errno = ENOMEM;
return errno;
}
in = calloc(1, inlen);
if (!in) {
err = ENOMEM;
errno = err;
goto out_free;
}
DEVX_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
DEVX_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE);
DEVX_SET(manage_pages_in, in, function_id, func_id);
DEVX_SET(manage_pages_in, in, input_num_entries, npages);
pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
err = mlx5_vfio_post_cmd(ctx, in, inlen, out, outlen, slot, true);
pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock);
if (!err)
return 0;
free(in);
out_free:
free(out);
return err;
}
static int mlx5_vfio_enable_hca(struct mlx5_vfio_context *ctx)
{
uint32_t in[DEVX_ST_SZ_DW(enable_hca_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(enable_hca_out)] = {};
DEVX_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA);
return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
}
static int mlx5_vfio_set_issi(struct mlx5_vfio_context *ctx)
{
uint32_t query_in[DEVX_ST_SZ_DW(query_issi_in)] = {};
uint32_t query_out[DEVX_ST_SZ_DW(query_issi_out)] = {};
uint32_t set_in[DEVX_ST_SZ_DW(set_issi_in)] = {};
uint32_t set_out[DEVX_ST_SZ_DW(set_issi_out)] = {};
uint32_t sup_issi;
int err;
DEVX_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI);
err = mlx5_vfio_cmd_exec(ctx, query_in, sizeof(query_in), query_out,
sizeof(query_out), 0);
if (err)
return err;
sup_issi = DEVX_GET(query_issi_out, query_out, supported_issi_dw0);
if (!(sup_issi & (1 << 1))) {
errno = EOPNOTSUPP;
return errno;
}
DEVX_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI);
DEVX_SET(set_issi_in, set_in, current_issi, 1);
return mlx5_vfio_cmd_exec(ctx, set_in, sizeof(set_in), set_out,
sizeof(set_out), 0);
}
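/* Hand pages to firmware with MANAGE_PAGES (GIVE). When called from the
 * page-request event handler (is_event), the command is posted asynchronously
 * on the page slot and its buffers are freed by the completion handler;
 * otherwise it is executed synchronously.
 */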
static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx,
uint16_t func_id,
int32_t npages,
bool is_event)
{
int32_t out[DEVX_ST_SZ_DW(manage_pages_out)] = {};
int inlen = DEVX_ST_SZ_BYTES(manage_pages_in);
int slot = MLX5_MAX_COMMANDS - 1;
void *outp = out;
int i, err;
int32_t *in;
uint64_t iova;
inlen += npages * DEVX_FLD_SZ_BYTES(manage_pages_in, pas[0]);
in = calloc(1, inlen);
if (!in) {
errno = ENOMEM;
return errno;
}
if (is_event) {
outp = calloc(1, sizeof(out));
if (!outp) {
errno = ENOMEM;
err = errno;
goto end;
}
}
for (i = 0; i < npages; i++) {
err = mlx5_vfio_alloc_page(ctx, &iova);
if (err)
goto err;
DEVX_ARRAY_SET64(manage_pages_in, in, pas, i, iova);
}
DEVX_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
DEVX_SET(manage_pages_in, in, op_mod, MLX5_PAGES_GIVE);
DEVX_SET(manage_pages_in, in, function_id, func_id);
DEVX_SET(manage_pages_in, in, input_num_entries, npages);
if (is_event) {
pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
err = mlx5_vfio_post_cmd(ctx, in, inlen, outp, sizeof(out), slot, true);
pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock);
} else {
err = mlx5_vfio_cmd_exec(ctx, in, inlen, outp, sizeof(out), slot);
}
if (!err) {
if (is_event)
return 0;
goto end;
}
err:
if (is_event)
free(outp);
for (i--; i >= 0; i--)
mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_in, in, pas[i]));
end:
free(in);
return err;
}
static int mlx5_vfio_query_pages(struct mlx5_vfio_context *ctx, int boot,
uint16_t *func_id, int32_t *npages)
{
uint32_t query_pages_in[DEVX_ST_SZ_DW(query_pages_in)] = {};
uint32_t query_pages_out[DEVX_ST_SZ_DW(query_pages_out)] = {};
int ret;
DEVX_SET(query_pages_in, query_pages_in, opcode, MLX5_CMD_OP_QUERY_PAGES);
DEVX_SET(query_pages_in, query_pages_in, op_mod, boot ? 0x01 : 0x02);
ret = mlx5_vfio_cmd_exec(ctx, query_pages_in, sizeof(query_pages_in),
query_pages_out, sizeof(query_pages_out), 0);
if (ret)
return ret;
*npages = DEVX_GET(query_pages_out, query_pages_out, num_pages);
*func_id = DEVX_GET(query_pages_out, query_pages_out, function_id);
return 0;
}
static int mlx5_vfio_satisfy_startup_pages(struct mlx5_vfio_context *ctx,
int boot)
{
uint16_t function_id;
int32_t npages = 0;
int ret;
ret = mlx5_vfio_query_pages(ctx, boot, &function_id, &npages);
if (ret)
return ret;
return mlx5_vfio_give_pages(ctx, function_id, npages, false);
}
static int mlx5_vfio_access_reg(struct mlx5_vfio_context *ctx, void *data_in,
int size_in, void *data_out, int size_out,
uint16_t reg_id, int arg, int write)
{
int outlen = DEVX_ST_SZ_BYTES(access_register_out) + size_out;
int inlen = DEVX_ST_SZ_BYTES(access_register_in) + size_in;
int err = ENOMEM;
uint32_t *out = NULL;
uint32_t *in = NULL;
void *data;
in = calloc(1, inlen);
out = calloc(1, outlen);
if (!in || !out) {
errno = ENOMEM;
goto out;
}
data = DEVX_ADDR_OF(access_register_in, in, register_data);
memcpy(data, data_in, size_in);
DEVX_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG);
DEVX_SET(access_register_in, in, op_mod, !write);
DEVX_SET(access_register_in, in, argument, arg);
DEVX_SET(access_register_in, in, register_id, reg_id);
err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, outlen, 0);
if (err)
goto out;
data = DEVX_ADDR_OF(access_register_out, out, register_data);
memcpy(data_out, data, size_out);
out:
free(out);
free(in);
return err;
}
static int mlx5_vfio_get_caps_mode(struct mlx5_vfio_context *ctx,
enum mlx5_cap_type cap_type,
enum mlx5_cap_mode cap_mode)
{
uint8_t in[DEVX_ST_SZ_BYTES(query_hca_cap_in)] = {};
int out_sz = DEVX_ST_SZ_BYTES(query_hca_cap_out);
void *out, *hca_caps;
uint16_t opmod = (cap_type << 1) | (cap_mode & 0x01);
int err;
out = calloc(1, out_sz);
if (!out) {
errno = ENOMEM;
return errno;
}
DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
DEVX_SET(query_hca_cap_in, in, op_mod, opmod);
err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, out_sz, 0);
if (err)
goto query_ex;
hca_caps = DEVX_ADDR_OF(query_hca_cap_out, out, capability);
switch (cap_mode) {
case HCA_CAP_OPMOD_GET_MAX:
memcpy(ctx->caps.hca_max[cap_type], hca_caps,
DEVX_UN_SZ_BYTES(hca_cap_union));
break;
case HCA_CAP_OPMOD_GET_CUR:
memcpy(ctx->caps.hca_cur[cap_type], hca_caps,
DEVX_UN_SZ_BYTES(hca_cap_union));
break;
default:
err = EINVAL;
assert(false);
break;
}
query_ex:
free(out);
return err;
}
enum mlx5_vport_roce_state {
MLX5_VPORT_ROCE_DISABLED = 0,
MLX5_VPORT_ROCE_ENABLED = 1,
};
static int mlx5_vfio_nic_vport_update_roce_state(struct mlx5_vfio_context *ctx,
enum mlx5_vport_roce_state state)
{
uint32_t out[DEVX_ST_SZ_DW(modify_nic_vport_context_out)] = {};
int inlen = DEVX_ST_SZ_BYTES(modify_nic_vport_context_in);
void *in;
int err;
in = calloc(1, inlen);
if (!in) {
errno = ENOMEM;
return errno;
}
DEVX_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1);
DEVX_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en,
state);
DEVX_SET(modify_nic_vport_context_in, in, opcode,
MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
free(in);
return err;
}
static int mlx5_vfio_get_caps(struct mlx5_vfio_context *ctx, enum mlx5_cap_type cap_type)
{
int ret;
ret = mlx5_vfio_get_caps_mode(ctx, cap_type, HCA_CAP_OPMOD_GET_CUR);
if (ret)
return ret;
return mlx5_vfio_get_caps_mode(ctx, cap_type, HCA_CAP_OPMOD_GET_MAX);
}
static int handle_hca_cap_roce(struct mlx5_vfio_context *ctx, void *set_ctx,
int ctx_size)
{
int err;
uint32_t out[DEVX_ST_SZ_DW(set_hca_cap_out)] = {};
void *set_hca_cap;
if (!MLX5_VFIO_CAP_GEN(ctx, roce))
return 0;
err = mlx5_vfio_get_caps(ctx, MLX5_CAP_ROCE);
if (err)
return err;
if (MLX5_VFIO_CAP_ROCE(ctx, sw_r_roce_src_udp_port) ||
!MLX5_VFIO_CAP_ROCE_MAX(ctx, sw_r_roce_src_udp_port))
return 0;
set_hca_cap = DEVX_ADDR_OF(set_hca_cap_in, set_ctx, capability);
memcpy(set_hca_cap, ctx->caps.hca_cur[MLX5_CAP_ROCE],
DEVX_ST_SZ_BYTES(roce_cap));
DEVX_SET(roce_cap, set_hca_cap, sw_r_roce_src_udp_port, 1);
DEVX_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP);
DEVX_SET(set_hca_cap_in, set_ctx, op_mod, MLX5_SET_HCA_CAP_OP_MOD_ROCE);
return mlx5_vfio_cmd_exec(ctx, set_ctx, ctx_size, out, sizeof(out), 0);
}
static int handle_hca_cap(struct mlx5_vfio_context *ctx, void *set_ctx, int set_sz)
{
struct mlx5_vfio_device *dev = to_mvfio_dev(ctx->vctx.context.device);
int sys_page_shift = ilog32(dev->page_size - 1);
uint32_t out[DEVX_ST_SZ_DW(set_hca_cap_out)] = {};
void *set_hca_cap;
int err;
err = mlx5_vfio_get_caps(ctx, MLX5_CAP_GENERAL);
if (err)
return err;
set_hca_cap = DEVX_ADDR_OF(set_hca_cap_in, set_ctx,
capability);
memcpy(set_hca_cap, ctx->caps.hca_cur[MLX5_CAP_GENERAL],
DEVX_ST_SZ_BYTES(cmd_hca_cap));
/* disable cmdif checksum */
DEVX_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0);
if (dev->flags & MLX5DV_VFIO_CTX_FLAGS_INIT_LINK_DOWN)
DEVX_SET(cmd_hca_cap, set_hca_cap, disable_link_up_by_init_hca, 1);
DEVX_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, sys_page_shift - 12);
if (MLX5_VFIO_CAP_GEN_MAX(ctx, mkey_by_name))
DEVX_SET(cmd_hca_cap, set_hca_cap, mkey_by_name, 1);
DEVX_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP);
DEVX_SET(set_hca_cap_in, set_ctx, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
return mlx5_vfio_cmd_exec(ctx, set_ctx, set_sz, out, sizeof(out), 0);
}
static int set_hca_cap(struct mlx5_vfio_context *ctx)
{
int set_sz = DEVX_ST_SZ_BYTES(set_hca_cap_in);
void *set_ctx;
int err;
set_ctx = calloc(1, set_sz);
if (!set_ctx) {
errno = ENOMEM;
return errno;
}
err = handle_hca_cap(ctx, set_ctx, set_sz);
if (err)
goto out;
memset(set_ctx, 0, set_sz);
err = handle_hca_cap_roce(ctx, set_ctx, set_sz);
out:
free(set_ctx);
return err;
}
static int mlx5_vfio_set_hca_ctrl(struct mlx5_vfio_context *ctx)
{
struct mlx5_reg_host_endianness he_in = {};
struct mlx5_reg_host_endianness he_out = {};
he_in.he = MLX5_SET_HOST_ENDIANNESS;
return mlx5_vfio_access_reg(ctx, &he_in, sizeof(he_in),
&he_out, sizeof(he_out),
MLX5_REG_HOST_ENDIANNESS, 0, 1);
}
static int mlx5_vfio_init_hca(struct mlx5_vfio_context *ctx)
{
uint32_t in[DEVX_ST_SZ_DW(init_hca_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(init_hca_out)] = {};
DEVX_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA);
return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
}
static int fw_initializing(struct mlx5_init_seg *init_seg)
{
return be32toh(init_seg->initializing) >> 31;
}
static int wait_fw_init(struct mlx5_init_seg *init_seg, uint32_t max_wait_mili)
{
int num_loops = max_wait_mili / FW_INIT_WAIT_MS;
int loop = 0;
while (fw_initializing(init_seg)) {
usleep(FW_INIT_WAIT_MS * 1000);
loop++;
if (loop == num_loops) {
errno = EBUSY;
return errno;
}
}
return 0;
}
static int mlx5_vfio_teardown_hca_regular(struct mlx5_vfio_context *ctx)
{
uint32_t in[DEVX_ST_SZ_DW(teardown_hca_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(teardown_hca_out)] = {};
DEVX_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
DEVX_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE);
return mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
}
enum mlx5_cmd_addr_l_sz_offset {
MLX5_NIC_IFC_OFFSET = 8,
};
enum {
MLX5_NIC_IFC_DISABLED = 1,
MLX5_NIC_IFC_SW_RESET = 7,
};
static uint8_t mlx5_vfio_get_nic_state(struct mlx5_vfio_context *ctx)
{
return (be32toh(mmio_read32_be(&ctx->bar_map->cmdq_addr_l_sz)) >> 8) & 7;
}
static void mlx5_vfio_set_nic_state(struct mlx5_vfio_context *ctx, uint8_t state)
{
uint32_t cur_cmdq_addr_l_sz;
cur_cmdq_addr_l_sz = be32toh(mmio_read32_be(&ctx->bar_map->cmdq_addr_l_sz));
mmio_write32_be(&ctx->bar_map->cmdq_addr_l_sz,
htobe32((cur_cmdq_addr_l_sz & 0xFFFFF000) |
state << MLX5_NIC_IFC_OFFSET));
}
#define MLX5_FAST_TEARDOWN_WAIT_MS 3000
#define MLX5_FAST_TEARDOWN_WAIT_ONCE_MS 1
static int mlx5_vfio_teardown_hca_fast(struct mlx5_vfio_context *ctx)
{
uint32_t out[DEVX_ST_SZ_DW(teardown_hca_out)] = {};
uint32_t in[DEVX_ST_SZ_DW(teardown_hca_in)] = {};
int waited = 0, state, ret;
DEVX_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
DEVX_SET(teardown_hca_in, in, profile,
MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN);
ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (ret)
return ret;
state = DEVX_GET(teardown_hca_out, out, state);
if (state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
mlx5_err(ctx->dbg_fp, "teardown with fast mode failed\n");
return EIO;
}
mlx5_vfio_set_nic_state(ctx, MLX5_NIC_IFC_DISABLED);
do {
if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_DISABLED)
break;
usleep(MLX5_FAST_TEARDOWN_WAIT_ONCE_MS * 1000);
waited += MLX5_FAST_TEARDOWN_WAIT_ONCE_MS;
} while (waited < MLX5_FAST_TEARDOWN_WAIT_MS);
if (mlx5_vfio_get_nic_state(ctx) != MLX5_NIC_IFC_DISABLED) {
mlx5_err(ctx->dbg_fp, "NIC IFC still %d after %ums.\n",
mlx5_vfio_get_nic_state(ctx), waited);
return EIO;
}
return 0;
}
static int mlx5_vfio_teardown_hca(struct mlx5_vfio_context *ctx)
{
int err;
if (MLX5_VFIO_CAP_GEN(ctx, fast_teardown)) {
err = mlx5_vfio_teardown_hca_fast(ctx);
if (!err)
return 0;
}
return mlx5_vfio_teardown_hca_regular(ctx);
}
static bool sensor_pci_not_working(struct mlx5_init_seg *init_seg)
{
/* Offline PCI reads return 0xffffffff */
return (be32toh(mmio_read32_be(&init_seg->health.fw_ver)) == 0xffffffff);
}
enum mlx5_fatal_assert_bit_offsets {
MLX5_RFR_OFFSET = 31,
};
static bool sensor_fw_synd_rfr(struct mlx5_init_seg *init_seg)
{
uint32_t rfr = be32toh(mmio_read32_be(&init_seg->health.rfr)) >> MLX5_RFR_OFFSET;
uint8_t synd = mmio_read8(&init_seg->health.synd);
return (rfr && synd);
}
enum {
MLX5_SENSOR_NO_ERR = 0,
MLX5_SENSOR_PCI_COMM_ERR = 1,
MLX5_SENSOR_NIC_DISABLED = 3,
MLX5_SENSOR_NIC_SW_RESET = 4,
MLX5_SENSOR_FW_SYND_RFR = 5,
};
static uint32_t mlx5_health_check_fatal_sensors(struct mlx5_vfio_context *ctx)
{
if (sensor_pci_not_working(ctx->bar_map))
return MLX5_SENSOR_PCI_COMM_ERR;
if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_DISABLED)
return MLX5_SENSOR_NIC_DISABLED;
if (mlx5_vfio_get_nic_state(ctx) == MLX5_NIC_IFC_SW_RESET)
return MLX5_SENSOR_NIC_SW_RESET;
if (sensor_fw_synd_rfr(ctx->bar_map))
return MLX5_SENSOR_FW_SYND_RFR;
return MLX5_SENSOR_NO_ERR;
}
enum {
MLX5_HEALTH_SYNDR_FW_ERR = 0x1,
MLX5_HEALTH_SYNDR_IRISC_ERR = 0x7,
MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR = 0x8,
MLX5_HEALTH_SYNDR_CRC_ERR = 0x9,
MLX5_HEALTH_SYNDR_FETCH_PCI_ERR = 0xa,
MLX5_HEALTH_SYNDR_HW_FTL_ERR = 0xb,
MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR = 0xc,
MLX5_HEALTH_SYNDR_EQ_ERR = 0xd,
MLX5_HEALTH_SYNDR_EQ_INV = 0xe,
MLX5_HEALTH_SYNDR_FFSER_ERR = 0xf,
MLX5_HEALTH_SYNDR_HIGH_TEMP = 0x10,
};
static const char *hsynd_str(uint8_t synd)
{
switch (synd) {
case MLX5_HEALTH_SYNDR_FW_ERR:
return "firmware internal error";
case MLX5_HEALTH_SYNDR_IRISC_ERR:
return "irisc not responding";
case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
return "unrecoverable hardware error";
case MLX5_HEALTH_SYNDR_CRC_ERR:
return "firmware CRC error";
case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
return "ICM fetch PCI error";
case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
return "HW fatal error\n";
case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
return "async EQ buffer overrun";
case MLX5_HEALTH_SYNDR_EQ_ERR:
return "EQ error";
case MLX5_HEALTH_SYNDR_EQ_INV:
return "Invalid EQ referenced";
case MLX5_HEALTH_SYNDR_FFSER_ERR:
return "FFSER error";
case MLX5_HEALTH_SYNDR_HIGH_TEMP:
return "High temperature";
default:
return "unrecognized error";
}
}
static void print_health_info(struct mlx5_vfio_context *ctx)
{
struct mlx5_init_seg *iseg = ctx->bar_map;
struct health_buffer *h = &iseg->health;
char fw_str[18] = {};
int i;
/* If the syndrome is 0, the device is OK and there is no need to print the health buffer */
if (!mmio_read8(&h->synd))
return;
for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
mlx5_err(ctx->dbg_fp, "assert_var[%d] 0x%08x\n",
i, be32toh(mmio_read32_be(h->assert_var + i)));
mlx5_err(ctx->dbg_fp, "assert_exit_ptr 0x%08x\n",
be32toh(mmio_read32_be(&h->assert_exit_ptr)));
mlx5_err(ctx->dbg_fp, "assert_callra 0x%08x\n",
be32toh(mmio_read32_be(&h->assert_callra)));
sprintf(fw_str, "%d.%d.%d",
be32toh(mmio_read32_be(&iseg->fw_rev)) & 0xffff,
be32toh(mmio_read32_be(&iseg->fw_rev)) >> 16,
be32toh(mmio_read32_be(&iseg->cmdif_rev_fw_sub)) & 0xffff);
mlx5_err(ctx->dbg_fp, "fw_ver %s\n", fw_str);
mlx5_err(ctx->dbg_fp, "hw_id 0x%08x\n", be32toh(mmio_read32_be(&h->hw_id)));
mlx5_err(ctx->dbg_fp, "irisc_index %d\n", mmio_read8(&h->irisc_index));
mlx5_err(ctx->dbg_fp, "synd 0x%x: %s\n", mmio_read8(&h->synd),
hsynd_str(mmio_read8(&h->synd)));
mlx5_err(ctx->dbg_fp, "ext_synd 0x%04x\n",
be16toh(mmio_read16_be(&h->ext_synd)));
mlx5_err(ctx->dbg_fp, "raw fw_ver 0x%08x\n",
be32toh(mmio_read32_be(&iseg->fw_rev)));
}
static void mlx5_vfio_poll_health(struct mlx5_vfio_context *ctx)
{
struct mlx5_vfio_health_state *hstate = &ctx->health_state;
uint32_t fatal_error, count;
struct timeval tv;
uint64_t time;
int ret;
ret = gettimeofday(&tv, NULL);
if (ret)
return;
time = (uint64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
if (time - hstate->prev_time < POLL_HEALTH_INTERVAL)
return;
fatal_error = mlx5_health_check_fatal_sensors(ctx);
if (fatal_error) {
mlx5_err(ctx->dbg_fp, "%s: Fatal error %u detected\n",
__func__, fatal_error);
goto err;
}
count = be32toh(mmio_read32_be(&ctx->bar_map->health_counter)) & 0xffffff;
if (count == hstate->prev_count)
++hstate->miss_counter;
else
hstate->miss_counter = 0;
hstate->prev_time = time;
hstate->prev_count = count;
if (hstate->miss_counter == MAX_MISSES) {
mlx5_err(ctx->dbg_fp,
"device's health compromised - reached miss count\n");
goto err;
}
return;
err:
print_health_info(ctx);
abort();
}
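/* Firmware bring-up sequence, mirroring the kernel driver probe flow: wait
 * for firmware init, ENABLE_HCA, negotiate ISSI, supply boot pages, set the
 * HCA control register and capabilities, supply init pages, INIT_HCA and
 * finally enable RoCE on Ethernet ports.
 */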
static int mlx5_vfio_setup_function(struct mlx5_vfio_context *ctx)
{
int err;
err = wait_fw_init(ctx->bar_map, FW_PRE_INIT_TIMEOUT_MILI);
if (err)
return err;
err = mlx5_vfio_enable_hca(ctx);
if (err)
return err;
err = mlx5_vfio_set_issi(ctx);
if (err)
return err;
err = mlx5_vfio_satisfy_startup_pages(ctx, 1);
if (err)
return err;
err = mlx5_vfio_set_hca_ctrl(ctx);
if (err)
return err;
err = set_hca_cap(ctx);
if (err)
return err;
if (!MLX5_VFIO_CAP_GEN(ctx, umem_uid_0)) {
errno = EOPNOTSUPP;
return errno;
}
err = mlx5_vfio_satisfy_startup_pages(ctx, 0);
if (err)
return err;
err = mlx5_vfio_init_hca(ctx);
if (err)
return err;
if (MLX5_VFIO_CAP_GEN(ctx, port_type) == MLX5_CAP_PORT_TYPE_ETH)
err = mlx5_vfio_nic_vport_update_roce_state(ctx, MLX5_VPORT_ROCE_ENABLED);
return err;
}
static struct ibv_pd *mlx5_vfio_alloc_pd(struct ibv_context *ibctx)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
uint32_t in[DEVX_ST_SZ_DW(alloc_pd_in)] = {0};
uint32_t out[DEVX_ST_SZ_DW(alloc_pd_out)] = {0};
int err;
struct mlx5_pd *pd;
pd = calloc(1, sizeof(*pd));
if (!pd)
return NULL;
DEVX_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (err)
goto err;
pd->pdn = DEVX_GET(alloc_pd_out, out, pd);
return &pd->ibv_pd;
err:
free(pd);
return NULL;
}
static int mlx5_vfio_dealloc_pd(struct ibv_pd *pd)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(pd->context);
uint32_t in[DEVX_ST_SZ_DW(dealloc_pd_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(dealloc_pd_out)] = {};
struct mlx5_pd *mpd = to_mpd(pd);
int ret;
DEVX_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
DEVX_SET(dealloc_pd_in, in, pd, mpd->pdn);
ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (ret)
return ret;
free(mpd);
return 0;
}
static size_t calc_num_dma_blocks(uint64_t iova, size_t length,
unsigned long pgsz)
{
return (size_t)((align(iova + length, pgsz) -
align_down(iova, pgsz)) / pgsz);
}
static int get_octo_len(uint64_t addr, uint64_t len, int page_shift)
{
uint64_t page_size = 1ULL << page_shift;
uint64_t offset;
int npages;
offset = addr & (page_size - 1);
npages = align(len + offset, page_size) >> page_shift;
return (npages + 1) / 2;
}
static inline uint32_t mlx5_mkey_to_idx(uint32_t mkey)
{
return mkey >> 8;
}
static inline uint32_t mlx5_idx_to_mkey(uint32_t mkey_idx)
{
return mkey_idx << 8;
}
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, uint64_t start_addr,
struct ibv_pd *pd)
{
struct mlx5_pd *mpd = to_mpd(pd);
DEVX_SET(mkc, mkc, a, !!(acc & IBV_ACCESS_REMOTE_ATOMIC));
DEVX_SET(mkc, mkc, rw, !!(acc & IBV_ACCESS_REMOTE_WRITE));
DEVX_SET(mkc, mkc, rr, !!(acc & IBV_ACCESS_REMOTE_READ));
DEVX_SET(mkc, mkc, lw, !!(acc & IBV_ACCESS_LOCAL_WRITE));
DEVX_SET(mkc, mkc, lr, 1);
/* The application is responsible for setting these based on the reported caps */
DEVX_SET(mkc, mkc, relaxed_ordering_write,
!!(acc & IBV_ACCESS_RELAXED_ORDERING));
DEVX_SET(mkc, mkc, relaxed_ordering_read,
!!(acc & IBV_ACCESS_RELAXED_ORDERING));
DEVX_SET(mkc, mkc, pd, mpd->pdn);
DEVX_SET(mkc, mkc, qpn, 0xffffff);
DEVX_SET64(mkc, mkc, start_addr, start_addr);
}
static int mlx5_vfio_dereg_mr(struct verbs_mr *vmr)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(vmr->ibv_mr.context);
struct mlx5_vfio_mr *mr = to_mvfio_mr(&vmr->ibv_mr);
uint32_t in[DEVX_ST_SZ_DW(destroy_mkey_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(destroy_mkey_out)] = {};
int ret;
DEVX_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
DEVX_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(vmr->ibv_mr.lkey));
ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (ret)
return ret;
mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset,
mr->iova_reg_size);
iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size);
free(vmr);
return 0;
}
static void mlx5_vfio_populate_pas(uint64_t dma_addr, int num_dma, size_t page_size,
__be64 *pas, uint64_t access_flags)
{
int i;
for (i = 0; i < num_dma; i++) {
*pas = htobe64(dma_addr | access_flags);
pas++;
dma_addr += page_size;
}
}
static uint64_t calc_spanning_page_size(uint64_t start, uint64_t length)
{
/* Compute a page_size such that:
* start & (page_size-1) == (start + length) & (page_size - 1)
*/
uint64_t diffs = start ^ (start + length - 1);
uint64_t page_size = roundup_pow_of_two(diffs + 1);
/*
* Don't waste more than 1G of IOVA address space trying to
* minimize MTTs
*/
while (page_size - length > 1024 * 1024 * 1024) {
if (page_size / 2 < length)
break;
page_size /= 2;
}
return page_size;
}
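/* Register a memory region: pick a spanning page size, reserve an IOVA
 * range, DMA-map the user buffer through VFIO, and create an MTT-based mkey
 * whose PAS entries point at the mapped IOVA range.
 */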
static struct ibv_mr *mlx5_vfio_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
uint64_t hca_va, int access)
{
struct mlx5_vfio_device *dev = to_mvfio_dev(pd->context->device);
struct mlx5_vfio_context *ctx = to_mvfio_ctx(pd->context);
uint32_t out[DEVX_ST_SZ_DW(create_mkey_out)] = {};
uint32_t mkey_index;
uint32_t *in;
int inlen, num_pas, ret;
struct mlx5_vfio_mr *mr;
struct verbs_mr *vmr;
int page_shift, iova_min_page_shift;
__be64 *pas;
uint8_t key;
void *mkc;
void *aligned_va;
if (!check_comp_mask(access, MLX5_VFIO_SUPP_MR_ACCESS_FLAGS)) {
errno = EOPNOTSUPP;
return NULL;
}
if (((uint64_t)(uintptr_t)addr & (ctx->iova_min_page_size - 1)) !=
(hca_va & (ctx->iova_min_page_size - 1))) {
errno = EOPNOTSUPP;
return NULL;
}
mr = calloc(1, sizeof(*mr));
if (!mr) {
errno = ENOMEM;
return NULL;
}
aligned_va = (void *)(uintptr_t)((unsigned long)addr &
~(ctx->iova_min_page_size - 1));
iova_min_page_shift = ilog64(ctx->iova_min_page_size - 1);
mr->iova_page_size = max(calc_spanning_page_size(hca_va, length),
ctx->iova_min_page_size);
page_shift = ilog64(mr->iova_page_size - 1);
/* Ensure the low bits of the mkey VA match the low bits of the IOVA
* because the mkc start_addr specifies both the wire VA and the DMA VA.
*/
mr->iova_aligned_offset =
hca_va & GENMASK(page_shift - 1, iova_min_page_shift);
mr->iova_reg_size = align(length + hca_va, ctx->iova_min_page_size) -
align_down(hca_va, ctx->iova_min_page_size);
if (page_shift > MLX5_MAX_PAGE_SHIFT) {
page_shift = MLX5_MAX_PAGE_SHIFT;
mr->iova_page_size = 1ULL << page_shift;
}
ret = iset_alloc_range(ctx->iova_alloc,
mr->iova_aligned_offset + mr->iova_reg_size,
&mr->iova, mr->iova_page_size);
if (ret)
goto end;
/* IOVA must be aligned */
assert(mr->iova % mr->iova_page_size == 0);
ret = mlx5_vfio_register_mem(ctx, aligned_va,
mr->iova + mr->iova_aligned_offset,
mr->iova_reg_size);
if (ret)
goto err_reg;
num_pas = calc_num_dma_blocks(hca_va, length, mr->iova_page_size);
inlen = DEVX_ST_SZ_BYTES(create_mkey_in) + (sizeof(*pas) * align(num_pas, 2));
in = calloc(1, inlen);
if (!in) {
errno = ENOMEM;
goto err_in;
}
pas = (__be64 *)DEVX_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
/* iova_aligned_offset was computed with the unclamped page_shift, so if
 * page_shift was clamped to MLX5_MAX_PAGE_SHIFT above, the starting IOVA
 * may not be aligned to the final page size; align it down here.
 */
mlx5_vfio_populate_pas(align_down(mr->iova + mr->iova_aligned_offset,
mr->iova_page_size),
num_pas, mr->iova_page_size,
pas, MLX5_MTT_PRESENT);
DEVX_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
DEVX_SET(create_mkey_in, in, pg_access, 1);
mkc = DEVX_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
set_mkc_access_pd_addr_fields(mkc, access, hca_va, pd);
DEVX_SET(mkc, mkc, free, 0);
DEVX_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
DEVX_SET64(mkc, mkc, len, length);
DEVX_SET(mkc, mkc, bsf_octword_size, 0);
DEVX_SET(mkc, mkc, translations_octword_size,
get_octo_len(hca_va, length, page_shift));
DEVX_SET(mkc, mkc, log_page_size, page_shift);
DEVX_SET(create_mkey_in, in, translations_octword_actual_size,
get_octo_len(hca_va, length, page_shift));
key = atomic_fetch_add(&dev->mkey_var, 1);
DEVX_SET(mkc, mkc, mkey_7_0, key);
ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
if (ret)
goto err_exec;
free(in);
mkey_index = DEVX_GET(create_mkey_out, out, mkey_index);
vmr = &mr->vmr;
vmr->ibv_mr.lkey = key | mlx5_idx_to_mkey(mkey_index);
vmr->ibv_mr.rkey = vmr->ibv_mr.lkey;
vmr->ibv_mr.context = pd->context;
vmr->mr_type = IBV_MR_TYPE_MR;
vmr->access = access;
vmr->ibv_mr.handle = 0;
return &mr->vmr.ibv_mr;
err_exec:
free(in);
err_in:
mlx5_vfio_unregister_mem(ctx, mr->iova + mr->iova_aligned_offset,
mr->iova_reg_size);
err_reg:
iset_insert_range(ctx->iova_alloc, mr->iova, mr->iova_page_size);
end:
free(mr);
return NULL;
}
static int vfio_devx_query_eqn(struct ibv_context *ibctx, uint32_t vector,
uint32_t *eqn)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
if (vector != MLX5_VFIO_CMD_VEC_IDX)
return EINVAL;
/* For now use the singleton EQN created for async events */
*eqn = ctx->async_eq.eqn;
return 0;
}
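/* Illustrative caller-side use (the cqc handling is the application's own;
 * shown with this provider's DEVX helpers for brevity):
 *
 *	uint32_t eqn;
 *
 *	if (mlx5dv_devx_query_eqn(ibctx, 0, &eqn))
 *		...;  handle error
 *	DEVX_SET(cqc, cqc, c_eqn, eqn);
 */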
static struct mlx5dv_devx_uar *
vfio_devx_alloc_uar(struct ibv_context *ibctx, uint32_t flags)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
struct mlx5_devx_uar *uar;
if (flags != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) {
errno = EOPNOTSUPP;
return NULL;
}
uar = calloc(1, sizeof(*uar));
if (!uar) {
errno = ENOMEM;
return NULL;
}
uar->dv_devx_uar.page_id = ctx->eqs_uar.uarn;
uar->dv_devx_uar.base_addr = (void *)(uintptr_t)ctx->eqs_uar.iova;
uar->dv_devx_uar.reg_addr = uar->dv_devx_uar.base_addr + MLX5_BF_OFFSET;
uar->context = ibctx;
return &uar->dv_devx_uar;
}
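/* Callers reach this through mlx5dv_devx_alloc_uar(ibctx,
 * MLX5DV_UAR_ALLOC_TYPE_NC); note that in this provider every allocation
 * aliases the singleton eqs_uar page rather than a per-call UAR.
 */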
static void vfio_devx_free_uar(struct mlx5dv_devx_uar *dv_devx_uar)
{
free(dv_devx_uar);
}
static struct mlx5dv_devx_umem *
_vfio_devx_umem_reg(struct ibv_context *context,
void *addr, size_t size, uint32_t access,
uint64_t pgsz_bitmap)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(context);
uint32_t out[DEVX_ST_SZ_DW(create_umem_out)] = {};
struct mlx5_vfio_devx_umem *vfio_umem;
int iova_page_shift;
uint64_t iova_size;
int ret;
void *in;
uint32_t inlen;
__be64 *mtt;
void *umem;
bool writeable;
void *aligned_va;
int num_pas;
if (!check_comp_mask(access, MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS)) {
errno = EOPNOTSUPP;
return NULL;
}
if ((access & IBV_ACCESS_REMOTE_WRITE) &&
!(access & IBV_ACCESS_LOCAL_WRITE)) {
errno = EINVAL;
return NULL;
}
/* Page size that encloses the start and end of the umem range */
iova_size = max(roundup_pow_of_two(size + ((uint64_t)(uintptr_t)addr & (ctx->iova_min_page_size - 1))),
ctx->iova_min_page_size);
if (!(iova_size & pgsz_bitmap)) {
/* the caller's pgsz_bitmap must allow the computed IOVA page size */
errno = EOPNOTSUPP;
return NULL;
}
writeable = access &
(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
vfio_umem = calloc(1, sizeof(*vfio_umem));
if (!vfio_umem) {
errno = ENOMEM;
return NULL;
}
vfio_umem->iova_size = iova_size;
if (ibv_dontfork_range(addr, size))
goto err;
ret = iset_alloc_range(ctx->iova_alloc, vfio_umem->iova_size,
&vfio_umem->iova, vfio_umem->iova_size);
if (ret)
goto err_alloc;
/* The registration arguments must reflect the real VA currently mapped into the process */
aligned_va = (void *)(uintptr_t)((unsigned long) addr & ~(ctx->iova_min_page_size - 1));
vfio_umem->iova_reg_size = align((addr + size) - aligned_va, ctx->iova_min_page_size);
ret = mlx5_vfio_register_mem(ctx, aligned_va, vfio_umem->iova, vfio_umem->iova_reg_size);
if (ret)
goto err_reg;
iova_page_shift = ilog64(vfio_umem->iova_size - 1);
num_pas = 1;
if (iova_page_shift > MLX5_MAX_PAGE_SHIFT) {
iova_page_shift = MLX5_MAX_PAGE_SHIFT;
num_pas = DIV_ROUND_UP(vfio_umem->iova_size, (1ULL << iova_page_shift));
}
inlen = DEVX_ST_SZ_BYTES(create_umem_in) + DEVX_ST_SZ_BYTES(mtt) * num_pas;
in = calloc(1, inlen);
if (!in) {
errno = ENOMEM;
goto err_in;
}
umem = DEVX_ADDR_OF(create_umem_in, in, umem);
mtt = (__be64 *)DEVX_ADDR_OF(umem, umem, mtt);
DEVX_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
DEVX_SET64(umem, umem, num_of_mtt, num_pas);
DEVX_SET(umem, umem, log_page_size, iova_page_shift - MLX5_ADAPTER_PAGE_SHIFT);
DEVX_SET(umem, umem, page_offset, addr - aligned_va);
mlx5_vfio_populate_pas(vfio_umem->iova, num_pas, (1ULL << iova_page_shift), mtt,
(writeable ? MLX5_MTT_WRITE : 0) | MLX5_MTT_READ);
ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
if (ret)
goto err_exec;
free(in);
vfio_umem->dv_devx_umem.umem_id = DEVX_GET(create_umem_out, out, umem_id);
vfio_umem->context = context;
vfio_umem->addr = addr;
vfio_umem->size = size;
return &vfio_umem->dv_devx_umem;
err_exec:
free(in);
err_in:
mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size);
err_reg:
iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size);
err_alloc:
ibv_dofork_range(addr, size);
err:
free(vfio_umem);
return NULL;
}
static struct mlx5dv_devx_umem *
vfio_devx_umem_reg(struct ibv_context *context,
void *addr, size_t size, uint32_t access)
{
return _vfio_devx_umem_reg(context, addr, size, access, UINT64_MAX);
}
static struct mlx5dv_devx_umem *
vfio_devx_umem_reg_ex(struct ibv_context *ctx, struct mlx5dv_devx_umem_in *in)
{
if (!check_comp_mask(in->comp_mask, 0)) {
errno = EOPNOTSUPP;
return NULL;
}
return _vfio_devx_umem_reg(ctx, in->addr, in->size, in->access, in->pgsz_bitmap);
}
static int vfio_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem)
{
struct mlx5_vfio_devx_umem *vfio_umem =
container_of(dv_devx_umem, struct mlx5_vfio_devx_umem,
dv_devx_umem);
struct mlx5_vfio_context *ctx = to_mvfio_ctx(vfio_umem->context);
uint32_t in[DEVX_ST_SZ_DW(destroy_umem_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(create_umem_out)] = {};
int ret;
DEVX_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
DEVX_SET(destroy_umem_in, in, umem_id, dv_devx_umem->umem_id);
ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (ret)
return ret;
mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size);
iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size);
ibv_dofork_range(vfio_umem->addr, vfio_umem->size);
free(vfio_umem);
return 0;
}
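/* Illustrative application flow (buffer size and name are arbitrary):
 *
 *	void *buf;
 *	struct mlx5dv_devx_umem *umem;
 *
 *	if (posix_memalign(&buf, 4096, 4096))
 *		...;
 *	umem = mlx5dv_devx_umem_reg(ibctx, buf, 4096, IBV_ACCESS_LOCAL_WRITE);
 *	... reference umem->umem_id in object creation commands ...
 *	mlx5dv_devx_umem_dereg(umem);
 */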
static int vfio_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
{
struct ibv_pd *pd_in = obj->pd.in;
struct mlx5dv_pd *pd_out = obj->pd.out;
struct mlx5_pd *mpd = to_mpd(pd_in);
if (obj_type != MLX5DV_OBJ_PD)
return EOPNOTSUPP;
pd_out->comp_mask = 0;
pd_out->pdn = mpd->pdn;
return 0;
}
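/* Illustrative only: applications retrieve the PDN through the generic DV
 * call, e.g.:
 *
 *	struct mlx5dv_pd dv_pd;
 *	struct mlx5dv_obj dv_obj = { .pd = { .in = pd, .out = &dv_pd } };
 *
 *	if (!mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_PD))
 *		... dv_pd.pdn holds the PD number ...
 */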
static int vfio_devx_general_cmd(struct ibv_context *context, const void *in,
size_t inlen, void *out, size_t outlen)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(context);
return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0);
}
static bool devx_is_obj_create_cmd(const void *in)
{
uint16_t opcode = DEVX_GET(general_obj_in_cmd_hdr, in, opcode);
switch (opcode) {
case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
case MLX5_CMD_OP_CREATE_MKEY:
case MLX5_CMD_OP_CREATE_CQ:
case MLX5_CMD_OP_ALLOC_PD:
case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN:
case MLX5_CMD_OP_CREATE_RMP:
case MLX5_CMD_OP_CREATE_SQ:
case MLX5_CMD_OP_CREATE_RQ:
case MLX5_CMD_OP_CREATE_RQT:
case MLX5_CMD_OP_CREATE_TIR:
case MLX5_CMD_OP_CREATE_TIS:
case MLX5_CMD_OP_ALLOC_Q_COUNTER:
case MLX5_CMD_OP_CREATE_FLOW_TABLE:
case MLX5_CMD_OP_CREATE_FLOW_GROUP:
case MLX5_CMD_OP_CREATE_FLOW_COUNTER:
case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
case MLX5_CMD_OP_CREATE_QP:
case MLX5_CMD_OP_CREATE_SRQ:
case MLX5_CMD_OP_CREATE_XRC_SRQ:
case MLX5_CMD_OP_CREATE_DCT:
case MLX5_CMD_OP_CREATE_XRQ:
case MLX5_CMD_OP_ATTACH_TO_MCG:
case MLX5_CMD_OP_ALLOC_XRCD:
return true;
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
{
uint8_t op_mod = DEVX_GET(set_fte_in, in, op_mod);
if (op_mod == 0)
return true;
return false;
}
case MLX5_CMD_OP_CREATE_PSV:
{
uint8_t num_psv = DEVX_GET(create_psv_in, in, num_psv);
if (num_psv == 1)
return true;
return false;
}
default:
return false;
}
}
static uint32_t devx_get_created_obj_id(const void *in, const void *out,
uint16_t opcode)
{
switch (opcode) {
case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
return DEVX_GET(general_obj_out_cmd_hdr, out, obj_id);
case MLX5_CMD_OP_CREATE_UMEM:
return DEVX_GET(create_umem_out, out, umem_id);
case MLX5_CMD_OP_CREATE_MKEY:
return DEVX_GET(create_mkey_out, out, mkey_index);
case MLX5_CMD_OP_CREATE_CQ:
return DEVX_GET(create_cq_out, out, cqn);
case MLX5_CMD_OP_ALLOC_PD:
return DEVX_GET(alloc_pd_out, out, pd);
case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN:
return DEVX_GET(alloc_transport_domain_out, out,
transport_domain);
case MLX5_CMD_OP_CREATE_RMP:
return DEVX_GET(create_rmp_out, out, rmpn);
case MLX5_CMD_OP_CREATE_SQ:
return DEVX_GET(create_sq_out, out, sqn);
case MLX5_CMD_OP_CREATE_RQ:
return DEVX_GET(create_rq_out, out, rqn);
case MLX5_CMD_OP_CREATE_RQT:
return DEVX_GET(create_rqt_out, out, rqtn);
case MLX5_CMD_OP_CREATE_TIR:
return DEVX_GET(create_tir_out, out, tirn);
case MLX5_CMD_OP_CREATE_TIS:
return DEVX_GET(create_tis_out, out, tisn);
case MLX5_CMD_OP_ALLOC_Q_COUNTER:
return DEVX_GET(alloc_q_counter_out, out, counter_set_id);
case MLX5_CMD_OP_CREATE_FLOW_TABLE:
return DEVX_GET(create_flow_table_out, out, table_id);
case MLX5_CMD_OP_CREATE_FLOW_GROUP:
return DEVX_GET(create_flow_group_out, out, group_id);
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
return DEVX_GET(set_fte_in, in, flow_index);
case MLX5_CMD_OP_CREATE_FLOW_COUNTER:
return DEVX_GET(alloc_flow_counter_out, out, flow_counter_id);
case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
return DEVX_GET(alloc_packet_reformat_context_out, out,
packet_reformat_id);
case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
return DEVX_GET(alloc_modify_header_context_out, out,
modify_header_id);
case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
return DEVX_GET(create_scheduling_element_out, out,
scheduling_element_id);
case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
return DEVX_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port);
case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
return DEVX_GET(set_l2_table_entry_in, in, table_index);
case MLX5_CMD_OP_CREATE_QP:
return DEVX_GET(create_qp_out, out, qpn);
case MLX5_CMD_OP_CREATE_SRQ:
return DEVX_GET(create_srq_out, out, srqn);
case MLX5_CMD_OP_CREATE_XRC_SRQ:
return DEVX_GET(create_xrc_srq_out, out, xrc_srqn);
case MLX5_CMD_OP_CREATE_DCT:
return DEVX_GET(create_dct_out, out, dctn);
case MLX5_CMD_OP_CREATE_XRQ:
return DEVX_GET(create_xrq_out, out, xrqn);
case MLX5_CMD_OP_ATTACH_TO_MCG:
return DEVX_GET(attach_to_mcg_in, in, qpn);
case MLX5_CMD_OP_ALLOC_XRCD:
return DEVX_GET(alloc_xrcd_out, out, xrcd);
case MLX5_CMD_OP_CREATE_PSV:
return DEVX_GET(create_psv_out, out, psv0_index);
default:
/* The opcode must match one of the cases in devx_is_obj_create_cmd() */
assert(false);
return 0;
}
}
static void devx_obj_build_destroy_cmd(const void *in, void *out,
void *din, uint32_t *dinlen,
struct mlx5dv_devx_obj *obj)
{
uint16_t opcode = DEVX_GET(general_obj_in_cmd_hdr, in, opcode);
uint16_t uid = DEVX_GET(general_obj_in_cmd_hdr, in, uid);
uint32_t *obj_id = &obj->object_id;
*obj_id = devx_get_created_obj_id(in, out, opcode);
*dinlen = DEVX_ST_SZ_BYTES(general_obj_in_cmd_hdr);
DEVX_SET(general_obj_in_cmd_hdr, din, uid, uid);
switch (opcode) {
case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
DEVX_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
DEVX_SET(general_obj_in_cmd_hdr, din, obj_id, *obj_id);
DEVX_SET(general_obj_in_cmd_hdr, din, obj_type,
DEVX_GET(general_obj_in_cmd_hdr, in, obj_type));
break;
case MLX5_CMD_OP_CREATE_UMEM:
DEVX_SET(destroy_umem_in, din, opcode,
MLX5_CMD_OP_DESTROY_UMEM);
DEVX_SET(destroy_umem_in, din, umem_id, *obj_id);
break;
case MLX5_CMD_OP_CREATE_MKEY:
DEVX_SET(destroy_mkey_in, din, opcode,
MLX5_CMD_OP_DESTROY_MKEY);
DEVX_SET(destroy_mkey_in, din, mkey_index, *obj_id);
break;
case MLX5_CMD_OP_CREATE_CQ:
DEVX_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ);
DEVX_SET(destroy_cq_in, din, cqn, *obj_id);
break;
case MLX5_CMD_OP_ALLOC_PD:
DEVX_SET(dealloc_pd_in, din, opcode, MLX5_CMD_OP_DEALLOC_PD);
DEVX_SET(dealloc_pd_in, din, pd, *obj_id);
break;
case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN:
DEVX_SET(dealloc_transport_domain_in, din, opcode,
MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
DEVX_SET(dealloc_transport_domain_in, din, transport_domain,
*obj_id);
break;
case MLX5_CMD_OP_CREATE_RMP:
DEVX_SET(destroy_rmp_in, din, opcode, MLX5_CMD_OP_DESTROY_RMP);
DEVX_SET(destroy_rmp_in, din, rmpn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_SQ:
DEVX_SET(destroy_sq_in, din, opcode, MLX5_CMD_OP_DESTROY_SQ);
DEVX_SET(destroy_sq_in, din, sqn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_RQ:
DEVX_SET(destroy_rq_in, din, opcode, MLX5_CMD_OP_DESTROY_RQ);
DEVX_SET(destroy_rq_in, din, rqn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_RQT:
DEVX_SET(destroy_rqt_in, din, opcode, MLX5_CMD_OP_DESTROY_RQT);
DEVX_SET(destroy_rqt_in, din, rqtn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_TIR:
DEVX_SET(destroy_tir_in, din, opcode, MLX5_CMD_OP_DESTROY_TIR);
DEVX_SET(destroy_tir_in, din, tirn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_TIS:
DEVX_SET(destroy_tis_in, din, opcode, MLX5_CMD_OP_DESTROY_TIS);
DEVX_SET(destroy_tis_in, din, tisn, *obj_id);
break;
case MLX5_CMD_OP_ALLOC_Q_COUNTER:
DEVX_SET(dealloc_q_counter_in, din, opcode,
MLX5_CMD_OP_DEALLOC_Q_COUNTER);
DEVX_SET(dealloc_q_counter_in, din, counter_set_id, *obj_id);
break;
case MLX5_CMD_OP_CREATE_FLOW_TABLE:
*dinlen = DEVX_ST_SZ_BYTES(destroy_flow_table_in);
DEVX_SET(destroy_flow_table_in, din, other_vport,
DEVX_GET(create_flow_table_in, in, other_vport));
DEVX_SET(destroy_flow_table_in, din, vport_number,
DEVX_GET(create_flow_table_in, in, vport_number));
DEVX_SET(destroy_flow_table_in, din, table_type,
DEVX_GET(create_flow_table_in, in, table_type));
DEVX_SET(destroy_flow_table_in, din, table_id, *obj_id);
DEVX_SET(destroy_flow_table_in, din, opcode,
MLX5_CMD_OP_DESTROY_FLOW_TABLE);
break;
case MLX5_CMD_OP_CREATE_FLOW_GROUP:
*dinlen = DEVX_ST_SZ_BYTES(destroy_flow_group_in);
DEVX_SET(destroy_flow_group_in, din, other_vport,
DEVX_GET(create_flow_group_in, in, other_vport));
DEVX_SET(destroy_flow_group_in, din, vport_number,
DEVX_GET(create_flow_group_in, in, vport_number));
DEVX_SET(destroy_flow_group_in, din, table_type,
DEVX_GET(create_flow_group_in, in, table_type));
DEVX_SET(destroy_flow_group_in, din, table_id,
DEVX_GET(create_flow_group_in, in, table_id));
DEVX_SET(destroy_flow_group_in, din, group_id, *obj_id);
DEVX_SET(destroy_flow_group_in, din, opcode,
MLX5_CMD_OP_DESTROY_FLOW_GROUP);
break;
case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
*dinlen = DEVX_ST_SZ_BYTES(delete_fte_in);
DEVX_SET(delete_fte_in, din, other_vport,
DEVX_GET(set_fte_in, in, other_vport));
DEVX_SET(delete_fte_in, din, vport_number,
DEVX_GET(set_fte_in, in, vport_number));
DEVX_SET(delete_fte_in, din, table_type,
DEVX_GET(set_fte_in, in, table_type));
DEVX_SET(delete_fte_in, din, table_id,
DEVX_GET(set_fte_in, in, table_id));
DEVX_SET(delete_fte_in, din, flow_index, *obj_id);
DEVX_SET(delete_fte_in, din, opcode,
MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY);
break;
case MLX5_CMD_OP_CREATE_FLOW_COUNTER:
DEVX_SET(dealloc_flow_counter_in, din, opcode,
MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
DEVX_SET(dealloc_flow_counter_in, din, flow_counter_id,
*obj_id);
break;
case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT:
DEVX_SET(dealloc_packet_reformat_context_in, din, opcode,
MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT);
DEVX_SET(dealloc_packet_reformat_context_in, din,
packet_reformat_id, *obj_id);
break;
case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
DEVX_SET(dealloc_modify_header_context_in, din, opcode,
MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT);
DEVX_SET(dealloc_modify_header_context_in, din,
modify_header_id, *obj_id);
break;
case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
*dinlen = DEVX_ST_SZ_BYTES(destroy_scheduling_element_in);
DEVX_SET(destroy_scheduling_element_in, din,
scheduling_hierarchy,
DEVX_GET(create_scheduling_element_in, in,
scheduling_hierarchy));
DEVX_SET(destroy_scheduling_element_in, din,
scheduling_element_id, *obj_id);
DEVX_SET(destroy_scheduling_element_in, din, opcode,
MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT);
break;
case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
*dinlen = DEVX_ST_SZ_BYTES(delete_vxlan_udp_dport_in);
DEVX_SET(delete_vxlan_udp_dport_in, din, vxlan_udp_port, *obj_id);
DEVX_SET(delete_vxlan_udp_dport_in, din, opcode,
MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT);
break;
case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
*dinlen = DEVX_ST_SZ_BYTES(delete_l2_table_entry_in);
DEVX_SET(delete_l2_table_entry_in, din, table_index, *obj_id);
DEVX_SET(delete_l2_table_entry_in, din, opcode,
MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY);
break;
case MLX5_CMD_OP_CREATE_QP:
DEVX_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
DEVX_SET(destroy_qp_in, din, qpn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_SRQ:
DEVX_SET(destroy_srq_in, din, opcode, MLX5_CMD_OP_DESTROY_SRQ);
DEVX_SET(destroy_srq_in, din, srqn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_XRC_SRQ:
DEVX_SET(destroy_xrc_srq_in, din, opcode,
MLX5_CMD_OP_DESTROY_XRC_SRQ);
DEVX_SET(destroy_xrc_srq_in, din, xrc_srqn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_DCT:
DEVX_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT);
DEVX_SET(destroy_dct_in, din, dctn, *obj_id);
break;
case MLX5_CMD_OP_CREATE_XRQ:
DEVX_SET(destroy_xrq_in, din, opcode, MLX5_CMD_OP_DESTROY_XRQ);
DEVX_SET(destroy_xrq_in, din, xrqn, *obj_id);
break;
case MLX5_CMD_OP_ATTACH_TO_MCG:
*dinlen = DEVX_ST_SZ_BYTES(detach_from_mcg_in);
DEVX_SET(detach_from_mcg_in, din, qpn,
DEVX_GET(attach_to_mcg_in, in, qpn));
memcpy(DEVX_ADDR_OF(detach_from_mcg_in, din, multicast_gid),
DEVX_ADDR_OF(attach_to_mcg_in, in, multicast_gid),
DEVX_FLD_SZ_BYTES(attach_to_mcg_in, multicast_gid));
DEVX_SET(detach_from_mcg_in, din, opcode,
MLX5_CMD_OP_DETACH_FROM_MCG);
DEVX_SET(detach_from_mcg_in, din, qpn, *obj_id);
break;
case MLX5_CMD_OP_ALLOC_XRCD:
DEVX_SET(dealloc_xrcd_in, din, opcode,
MLX5_CMD_OP_DEALLOC_XRCD);
DEVX_SET(dealloc_xrcd_in, din, xrcd, *obj_id);
break;
case MLX5_CMD_OP_CREATE_PSV:
DEVX_SET(destroy_psv_in, din, opcode,
MLX5_CMD_OP_DESTROY_PSV);
DEVX_SET(destroy_psv_in, din, psvn, *obj_id);
break;
default:
/* The opcode must match one of the cases in devx_is_obj_create_cmd() */
assert(false);
break;
}
}
static struct mlx5dv_devx_obj *
vfio_devx_obj_create(struct ibv_context *context, const void *in,
size_t inlen, void *out, size_t outlen)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(context);
struct mlx5_devx_obj *obj;
int ret;
if (!devx_is_obj_create_cmd(in)) {
errno = EINVAL;
return NULL;
}
obj = calloc(1, sizeof(*obj));
if (!obj) {
errno = ENOMEM;
return NULL;
}
ret = mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0);
if (ret) {
errno = ret;
goto fail;
}
devx_obj_build_destroy_cmd(in, out, obj->dinbox,
&obj->dinlen, &obj->dv_obj);
obj->dv_obj.context = context;
return &obj->dv_obj;
fail:
free(obj);
return NULL;
}
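/* Illustrative only (mirrors mlx5_vfio_alloc_pd(), using this provider's
 * DEVX helpers; applications typically carry their own PRM layouts):
 *
 *	uint32_t in[DEVX_ST_SZ_DW(alloc_pd_in)] = {};
 *	uint32_t out[DEVX_ST_SZ_DW(alloc_pd_out)] = {};
 *	struct mlx5dv_devx_obj *obj;
 *
 *	DEVX_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
 *	obj = mlx5dv_devx_obj_create(ibctx, in, sizeof(in), out, sizeof(out));
 *	...
 *	mlx5dv_devx_obj_destroy(obj);
 */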
static int vfio_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in,
size_t inlen, void *out, size_t outlen)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context);
return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0);
}
static int vfio_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in,
size_t inlen, void *out, size_t outlen)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context);
return mlx5_vfio_cmd_do(ctx, (void *)in, inlen, out, outlen, 0);
}
static int vfio_devx_obj_destroy(struct mlx5dv_devx_obj *obj)
{
struct mlx5_devx_obj *mobj = container_of(obj,
struct mlx5_devx_obj, dv_obj);
struct mlx5_vfio_context *ctx = to_mvfio_ctx(obj->context);
uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)];
int ret;
ret = mlx5_vfio_cmd_exec(ctx, mobj->dinbox, mobj->dinlen,
out, sizeof(out), 0);
if (ret)
return ret;
free(mobj);
return 0;
}
static struct mlx5dv_devx_msi_vector *
vfio_devx_alloc_msi_vector(struct ibv_context *ibctx)
{
uint8_t buf[sizeof(struct vfio_irq_set) + sizeof(int)] = {};
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
struct mlx5_devx_msi_vector *msi;
int vector, *fd, err;
msi = calloc(1, sizeof(*msi));
if (!msi) {
errno = ENOMEM;
return NULL;
}
pthread_mutex_lock(&ctx->msix_fds_lock);
for (vector = 0; vector < ibctx->num_comp_vectors; vector++)
if (ctx->msix_fds[vector] < 0)
break;
if (vector == ibctx->num_comp_vectors) {
errno = ENOSPC;
goto fail;
}
fd = (int *)(buf + sizeof(struct vfio_irq_set));
*fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
if (*fd < 0)
goto fail;
err = mlx5_vfio_msix_set_irqs(ctx, vector, 1, buf);
if (err)
goto fail_set_irqs;
ctx->msix_fds[vector] = *fd;
msi->dv_msi.vector = vector;
msi->dv_msi.fd = *fd;
msi->ibctx = ibctx;
pthread_mutex_unlock(&ctx->msix_fds_lock);
return &msi->dv_msi;
fail_set_irqs:
close(*fd);
fail:
pthread_mutex_unlock(&ctx->msix_fds_lock);
free(msi);
return NULL;
}
static int vfio_devx_free_msi_vector(struct mlx5dv_devx_msi_vector *msi)
{
struct mlx5_devx_msi_vector *msiv =
container_of(msi, struct mlx5_devx_msi_vector, dv_msi);
uint8_t buf[sizeof(struct vfio_irq_set) + sizeof(int)] = {};
struct mlx5_vfio_context *ctx = to_mvfio_ctx(msiv->ibctx);
int ret;
pthread_mutex_lock(&ctx->msix_fds_lock);
if ((msi->vector >= msiv->ibctx->num_comp_vectors) ||
(msi->vector == MLX5_VFIO_CMD_VEC_IDX) ||
(msi->fd != ctx->msix_fds[msi->vector])) {
ret = EINVAL;
goto out;
}
*(int *)(buf + sizeof(struct vfio_irq_set)) = -1;
ret = mlx5_vfio_msix_set_irqs(ctx, msi->vector, 1, buf);
if (ret) {
ret = errno;
goto out;
}
close(msi->fd);
ctx->msix_fds[msi->vector] = -1;
free(msiv);
out:
pthread_mutex_unlock(&ctx->msix_fds_lock);
return ret;
}
static struct mlx5dv_devx_eq *
vfio_devx_create_eq(struct ibv_context *ibctx, const void *in, size_t inlen,
void *out, size_t outlen)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
struct mlx5_devx_eq *eq;
void *eqc, *in_pas;
size_t inlen_pas;
uint64_t size;
__be64 *pas;
int err;
eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry);
if ((inlen < DEVX_ST_SZ_BYTES(create_eq_in)) ||
(DEVX_GET(create_eq_in, in, opcode) != MLX5_CMD_OP_CREATE_EQ) ||
(DEVX_GET(eqc, eqc, intr) == MLX5_VFIO_CMD_VEC_IDX)) {
errno = EINVAL;
return NULL;
}
size = max(roundup_pow_of_two(
(1ULL << DEVX_GET(eqc, eqc, log_eq_size)) * MLX5_EQE_SIZE),
ctx->iova_min_page_size);
if (size > SIZE_MAX) {
errno = ERANGE;
return NULL;
}
eq = calloc(1, sizeof(*eq));
if (!eq) {
errno = ENOMEM;
return NULL;
}
eq->size = size;
err = posix_memalign(&eq->dv_eq.vaddr,
MLX5_ADAPTER_PAGE_SIZE, eq->size);
if (err) {
errno = err;
goto err_va;
}
err = iset_alloc_range(ctx->iova_alloc, eq->size,
&eq->iova, eq->size);
if (err)
goto err_range;
err = mlx5_vfio_register_mem(ctx, eq->dv_eq.vaddr, eq->iova, eq->size);
if (err)
goto err_reg;
inlen_pas = inlen + DEVX_FLD_SZ_BYTES(create_eq_in, pas[0]) * 1;
in_pas = calloc(1, inlen_pas);
if (!in_pas) {
errno = ENOMEM;
goto err_inpas;
}
memcpy(in_pas, in, inlen);
eqc = DEVX_ADDR_OF(create_eq_in, in_pas, eq_context_entry);
DEVX_SET(eqc, eqc, log_page_size,
ilog32(eq->size - 1) - MLX5_ADAPTER_PAGE_SHIFT);
pas = (__be64 *)DEVX_ADDR_OF(create_eq_in, in_pas, pas);
pas[0] = htobe64(eq->iova);
err = mlx5_vfio_cmd_do(ctx, in_pas, inlen_pas, out, outlen, 0);
if (err) {
errno = err;
goto err_cmd;
}
free(in_pas);
eq->ibctx = ibctx;
eq->eqn = DEVX_GET(create_eq_out, out, eq_number);
return &eq->dv_eq;
err_cmd:
free(in_pas);
err_inpas:
mlx5_vfio_unregister_mem(ctx, eq->iova, eq->size);
err_reg:
iset_insert_range(ctx->iova_alloc, eq->iova, eq->size);
err_range:
free(eq->dv_eq.vaddr);
err_va:
free(eq);
return NULL;
}
static int vfio_devx_destroy_eq(struct mlx5dv_devx_eq *dveq)
{
struct mlx5_devx_eq *eq =
container_of(dveq, struct mlx5_devx_eq, dv_eq);
struct mlx5_vfio_context *ctx = to_mvfio_ctx(eq->ibctx);
uint32_t out[DEVX_ST_SZ_DW(destroy_eq_out)] = {};
uint32_t in[DEVX_ST_SZ_DW(destroy_eq_in)] = {};
int err;
DEVX_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
DEVX_SET(destroy_eq_in, in, eq_number, eq->eqn);
err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
if (err)
return err;
mlx5_vfio_unregister_mem(ctx, eq->iova, eq->size);
iset_insert_range(ctx->iova_alloc, eq->iova, eq->size);
free(eq);
return 0;
}
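/* Illustrative flow for a user-managed EQ (error handling omitted; the
 * log_eq_size value is arbitrary):
 *
 *	struct mlx5dv_devx_msi_vector *msi;
 *	struct mlx5dv_devx_eq *eq;
 *	uint32_t in[DEVX_ST_SZ_DW(create_eq_in)] = {};
 *	uint32_t out[DEVX_ST_SZ_DW(create_eq_out)] = {};
 *	void *eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry);
 *
 *	msi = mlx5dv_devx_alloc_msi_vector(ibctx);
 *	DEVX_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
 *	DEVX_SET(eqc, eqc, log_eq_size, 6);
 *	DEVX_SET(eqc, eqc, intr, msi->vector);
 *	eq = mlx5dv_devx_create_eq(ibctx, in, sizeof(in), out, sizeof(out));
 *	... poll msi->fd, consume EQEs from eq->vaddr ...
 *	mlx5dv_devx_destroy_eq(eq);
 *	mlx5dv_devx_free_msi_vector(msi);
 */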
static struct mlx5_dv_context_ops mlx5_vfio_dv_ctx_ops = {
.devx_general_cmd = vfio_devx_general_cmd,
.devx_obj_create = vfio_devx_obj_create,
.devx_obj_query = vfio_devx_obj_query,
.devx_obj_modify = vfio_devx_obj_modify,
.devx_obj_destroy = vfio_devx_obj_destroy,
.devx_query_eqn = vfio_devx_query_eqn,
.devx_alloc_uar = vfio_devx_alloc_uar,
.devx_free_uar = vfio_devx_free_uar,
.devx_umem_reg = vfio_devx_umem_reg,
.devx_umem_reg_ex = vfio_devx_umem_reg_ex,
.devx_umem_dereg = vfio_devx_umem_dereg,
.init_obj = vfio_init_obj,
.devx_alloc_msi_vector = vfio_devx_alloc_msi_vector,
.devx_free_msi_vector = vfio_devx_free_msi_vector,
.devx_create_eq = vfio_devx_create_eq,
.devx_destroy_eq = vfio_devx_destroy_eq,
};
static void mlx5_vfio_uninit_context(struct mlx5_vfio_context *ctx)
{
mlx5_close_debug_file(ctx->dbg_fp);
verbs_uninit_context(&ctx->vctx);
free(ctx);
}
static void mlx5_vfio_free_context(struct ibv_context *ibctx)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
destroy_async_eqs(ctx);
mlx5_vfio_teardown_hca(ctx);
mlx5_vfio_clean_cmd_interface(ctx);
mlx5_vfio_clean_device_dma(ctx);
mlx5_vfio_uninit_bar0(ctx);
mlx5_vfio_close_fds(ctx);
mlx5_vfio_uninit_context(ctx);
}
static const struct verbs_context_ops mlx5_vfio_common_ops = {
.alloc_pd = mlx5_vfio_alloc_pd,
.dealloc_pd = mlx5_vfio_dealloc_pd,
.reg_mr = mlx5_vfio_reg_mr,
.dereg_mr = mlx5_vfio_dereg_mr,
.free_context = mlx5_vfio_free_context,
};
static struct verbs_context *
mlx5_vfio_alloc_context(struct ibv_device *ibdev,
int cmd_fd, void *private_data)
{
struct mlx5_vfio_device *mdev = to_mvfio_dev(ibdev);
struct mlx5_vfio_context *mctx;
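/* A VFIO device has no kernel uverbs command channel, so ignore the passed cmd_fd */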
cmd_fd = -1;
mctx = verbs_init_and_alloc_context(ibdev, cmd_fd, mctx, vctx,
RDMA_DRIVER_UNKNOWN);
if (!mctx)
return NULL;
mlx5_open_debug_file(&mctx->dbg_fp);
mlx5_set_debug_mask();
if (mlx5_vfio_open_fds(mctx, mdev))
goto err;
if (mlx5_vfio_init_bar0(mctx))
goto close_fds;
if (mlx5_vfio_init_device_dma(mctx))
goto err_bar;
if (mlx5_vfio_init_cmd_interface(mctx))
goto err_dma;
if (mlx5_vfio_setup_function(mctx))
goto clean_cmd;
if (create_async_eqs(mctx))
goto func_teardown;
verbs_set_ops(&mctx->vctx, &mlx5_vfio_common_ops);
mctx->dv_ctx_ops = &mlx5_vfio_dv_ctx_ops;
return &mctx->vctx;
func_teardown:
mlx5_vfio_teardown_hca(mctx);
clean_cmd:
mlx5_vfio_clean_cmd_interface(mctx);
err_dma:
mlx5_vfio_clean_device_dma(mctx);
err_bar:
mlx5_vfio_uninit_bar0(mctx);
close_fds:
mlx5_vfio_close_fds(mctx);
err:
mlx5_vfio_uninit_context(mctx);
return NULL;
}
static void mlx5_vfio_uninit_device(struct verbs_device *verbs_device)
{
struct mlx5_vfio_device *dev = to_mvfio_dev(&verbs_device->device);
free(dev->pci_name);
free(dev);
}
static const struct verbs_device_ops mlx5_vfio_dev_ops = {
.name = "mlx5_vfio",
.alloc_context = mlx5_vfio_alloc_context,
.uninit_device = mlx5_vfio_uninit_device,
};
static bool is_mlx5_pci(const char *pci_path)
{
const struct verbs_match_ent *ent;
uint16_t vendor_id, device_id;
char pci_info_path[256];
char buff[128];
int fd;
ssize_t n;
snprintf(pci_info_path, sizeof(pci_info_path), "%s/vendor", pci_path);
fd = open(pci_info_path, O_RDONLY);
if (fd < 0)
return false;
n = read(fd, buff, sizeof(buff) - 1);
if (n <= 0)
goto err;
buff[n] = '\0';
vendor_id = strtoul(buff, NULL, 0);
close(fd);
snprintf(pci_info_path, sizeof(pci_info_path), "%s/device", pci_path);
fd = open(pci_info_path, O_RDONLY);
if (fd < 0)
return false;
n = read(fd, buff, sizeof(buff) - 1);
if (n <= 0)
goto err;
buff[n] = '\0';
device_id = strtoul(buff, NULL, 0);
close(fd);
for (ent = mlx5_hca_table; ent->kind != VERBS_MATCH_SENTINEL; ent++) {
if (ent->kind != VERBS_MATCH_PCI)
continue;
if (ent->device == device_id && ent->vendor == vendor_id)
return true;
}
return false;
err:
close(fd);
return false;
}
static int mlx5_vfio_get_iommu_group_id(const char *pci_name)
{
int seg, bus, slot, func;
int ret, groupid;
char path[128], iommu_group_path[128], *group_name;
struct stat st;
ssize_t len;
ret = sscanf(pci_name, "%04x:%02x:%02x.%d", &seg, &bus, &slot, &func);
if (ret != 4)
return -1;
snprintf(path, sizeof(path),
"/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
seg, bus, slot, func);
ret = stat(path, &st);
if (ret < 0)
return -1;
if (!is_mlx5_pci(path))
return -1;
strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
len = readlink(path, iommu_group_path, sizeof(iommu_group_path) - 1);
if (len <= 0)
return -1;
iommu_group_path[len] = 0;
group_name = basename(iommu_group_path);
if (sscanf(group_name, "%d", &groupid) != 1)
return -1;
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
ret = stat(path, &st);
if (ret < 0)
return -1;
return groupid;
}
static int mlx5_vfio_get_handle(struct mlx5_vfio_device *vfio_dev,
struct mlx5dv_vfio_context_attr *attr)
{
int iommu_group;
iommu_group = mlx5_vfio_get_iommu_group_id(attr->pci_name);
if (iommu_group < 0)
return -1;
sprintf(vfio_dev->vfio_path, "/dev/vfio/%d", iommu_group);
vfio_dev->pci_name = strdup(attr->pci_name);
return 0;
}
int mlx5dv_vfio_get_events_fd(struct ibv_context *ibctx)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
return ctx->cmd_comp_fd;
}
int mlx5dv_vfio_process_events(struct ibv_context *ibctx)
{
struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
uint64_t u;
ssize_t s;
mlx5_vfio_poll_health(ctx);
/* read to re-arm the FD and process all existing events */
s = read(ctx->cmd_comp_fd, &u, sizeof(uint64_t));
if (s < 0 && errno != EAGAIN) {
mlx5_err(ctx->dbg_fp, "%s, read failed, errno=%d\n",
__func__, errno);
return errno;
}
return mlx5_vfio_process_async_events(ctx);
}
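/* Illustrative event loop for an application driving a VFIO context
 * (loop condition and timeout are the caller's own):
 *
 *	struct pollfd pfd = {
 *		.fd = mlx5dv_vfio_get_events_fd(ibctx),
 *		.events = POLLIN,
 *	};
 *
 *	while (keep_running) {
 *		poll(&pfd, 1, timeout_ms);
 *		mlx5dv_vfio_process_events(ibctx);
 *	}
 */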
struct ibv_device **
mlx5dv_get_vfio_device_list(struct mlx5dv_vfio_context_attr *attr)
{
struct mlx5_vfio_device *vfio_dev;
struct ibv_device **list = NULL;
int err;
if (!check_comp_mask(attr->comp_mask, 0) ||
!check_comp_mask(attr->flags, MLX5DV_VFIO_CTX_FLAGS_INIT_LINK_DOWN)) {
errno = EOPNOTSUPP;
return NULL;
}
list = calloc(2, sizeof(struct ibv_device *));
if (!list) {
errno = ENOMEM;
return NULL;
}
vfio_dev = calloc(1, sizeof(*vfio_dev));
if (!vfio_dev) {
errno = ENOMEM;
goto end;
}
vfio_dev->vdev.ops = &mlx5_vfio_dev_ops;
atomic_init(&vfio_dev->vdev.refcount, 1);
/* Resolve the VFIO group path for attr->pci_name and store it in the mlx5_vfio_device */
err = mlx5_vfio_get_handle(vfio_dev, attr);
if (err)
goto err_get;
vfio_dev->flags = attr->flags;
vfio_dev->page_size = sysconf(_SC_PAGESIZE);
atomic_init(&vfio_dev->mkey_var, 0);
list[0] = &vfio_dev->vdev.device;
return list;
err_get:
free(vfio_dev);
end:
free(list);
return NULL;
}
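/* Illustrative open sequence ("0000:3b:00.0" is a placeholder PCI address):
 *
 *	struct mlx5dv_vfio_context_attr attr = {
 *		.pci_name = "0000:3b:00.0",
 *		.flags = MLX5DV_VFIO_CTX_FLAGS_INIT_LINK_DOWN,
 *	};
 *	struct ibv_device **list = mlx5dv_get_vfio_device_list(&attr);
 *	struct ibv_context *ibctx = list ? ibv_open_device(list[0]) : NULL;
 */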
bool is_mlx5_vfio_dev(struct ibv_device *device)
{
struct verbs_device *verbs_device = verbs_get_device(device);
return verbs_device->ops == &mlx5_vfio_dev_ops;
}