/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2022 Microsoft Corporation
 */
40f5db3c6SLong Li
50f5db3c6SLong Li #include <rte_malloc.h>
60f5db3c6SLong Li #include <ethdev_driver.h>
70f5db3c6SLong Li #include <rte_eal_paging.h>
80f5db3c6SLong Li
90f5db3c6SLong Li #include <infiniband/verbs.h>
100f5db3c6SLong Li
110f5db3c6SLong Li #include "mana.h"
120f5db3c6SLong Li
/*
 * Page-aligned address range describing one mempool memory chunk.
 */
struct mana_range {
	uintptr_t start;	/* Range start, aligned down to the page size */
	uintptr_t end;		/* Range end, aligned up to the page size */
	uint32_t len;		/* end - start, stored in 32 bits */
};
180f5db3c6SLong Li
190f5db3c6SLong Li void
mana_mempool_chunk_cb(struct rte_mempool * mp __rte_unused,void * opaque,struct rte_mempool_memhdr * memhdr,unsigned int idx)200f5db3c6SLong Li mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
210f5db3c6SLong Li struct rte_mempool_memhdr *memhdr, unsigned int idx)
220f5db3c6SLong Li {
230f5db3c6SLong Li struct mana_range *ranges = opaque;
240f5db3c6SLong Li struct mana_range *range = &ranges[idx];
250f5db3c6SLong Li uint64_t page_size = rte_mem_page_size();
260f5db3c6SLong Li
270f5db3c6SLong Li range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
280f5db3c6SLong Li range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
290f5db3c6SLong Li page_size);
300f5db3c6SLong Li range->len = range->end - range->start;
310f5db3c6SLong Li }
320f5db3c6SLong Li
330f5db3c6SLong Li /*
340f5db3c6SLong Li * Register all memory regions from pool.
350f5db3c6SLong Li */
360f5db3c6SLong Li int
mana_new_pmd_mr(struct mana_mr_btree * local_tree,struct mana_priv * priv,struct rte_mempool * pool)370f5db3c6SLong Li mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
380f5db3c6SLong Li struct rte_mempool *pool)
390f5db3c6SLong Li {
400f5db3c6SLong Li struct ibv_mr *ibv_mr;
410f5db3c6SLong Li struct mana_range ranges[pool->nb_mem_chunks];
420f5db3c6SLong Li uint32_t i;
439d61fe41SLong Li struct mana_mr_cache mr;
440f5db3c6SLong Li int ret;
450f5db3c6SLong Li
460f5db3c6SLong Li rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
470f5db3c6SLong Li
480f5db3c6SLong Li for (i = 0; i < pool->nb_mem_chunks; i++) {
490f5db3c6SLong Li if (ranges[i].len > priv->max_mr_size) {
50e2d3a3c0SLong Li DP_LOG(ERR, "memory chunk size %u exceeding max MR",
510f5db3c6SLong Li ranges[i].len);
520f5db3c6SLong Li return -ENOMEM;
530f5db3c6SLong Li }
540f5db3c6SLong Li
55e2d3a3c0SLong Li DP_LOG(DEBUG,
5674decf3bSWei Hu "registering memory chunk start 0x%" PRIxPTR " len %u",
570f5db3c6SLong Li ranges[i].start, ranges[i].len);
580f5db3c6SLong Li
590f5db3c6SLong Li if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
600f5db3c6SLong Li /* Send a message to the primary to do MR */
610f5db3c6SLong Li ret = mana_mp_req_mr_create(priv, ranges[i].start,
620f5db3c6SLong Li ranges[i].len);
630f5db3c6SLong Li if (ret) {
64e2d3a3c0SLong Li DP_LOG(ERR,
6574decf3bSWei Hu "MR failed start 0x%" PRIxPTR " len %u",
660f5db3c6SLong Li ranges[i].start, ranges[i].len);
670f5db3c6SLong Li return ret;
680f5db3c6SLong Li }
690f5db3c6SLong Li continue;
700f5db3c6SLong Li }
710f5db3c6SLong Li
720f5db3c6SLong Li ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
730f5db3c6SLong Li ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
740f5db3c6SLong Li if (ibv_mr) {
7574decf3bSWei Hu DP_LOG(DEBUG, "MR lkey %u addr %p len %zu",
760f5db3c6SLong Li ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
770f5db3c6SLong Li
789d61fe41SLong Li mr.lkey = ibv_mr->lkey;
799d61fe41SLong Li mr.addr = (uintptr_t)ibv_mr->addr;
809d61fe41SLong Li mr.len = ibv_mr->length;
819d61fe41SLong Li mr.verb_obj = ibv_mr;
820f5db3c6SLong Li
830f5db3c6SLong Li rte_spinlock_lock(&priv->mr_btree_lock);
849d61fe41SLong Li ret = mana_mr_btree_insert(&priv->mr_btree, &mr);
850f5db3c6SLong Li rte_spinlock_unlock(&priv->mr_btree_lock);
860f5db3c6SLong Li if (ret) {
870f5db3c6SLong Li ibv_dereg_mr(ibv_mr);
88e2d3a3c0SLong Li DP_LOG(ERR, "Failed to add to global MR btree");
890f5db3c6SLong Li return ret;
900f5db3c6SLong Li }
910f5db3c6SLong Li
929d61fe41SLong Li ret = mana_mr_btree_insert(local_tree, &mr);
930f5db3c6SLong Li if (ret) {
940f5db3c6SLong Li /* Don't need to clean up MR as it's already
950f5db3c6SLong Li * in the global tree
960f5db3c6SLong Li */
97e2d3a3c0SLong Li DP_LOG(ERR, "Failed to add to local MR btree");
980f5db3c6SLong Li return ret;
990f5db3c6SLong Li }
1000f5db3c6SLong Li } else {
10174decf3bSWei Hu DP_LOG(ERR, "MR failed at 0x%" PRIxPTR " len %u",
1020f5db3c6SLong Li ranges[i].start, ranges[i].len);
1030f5db3c6SLong Li return -errno;
1040f5db3c6SLong Li }
1050f5db3c6SLong Li }
1060f5db3c6SLong Li return 0;
1070f5db3c6SLong Li }
1080f5db3c6SLong Li
1090f5db3c6SLong Li /*
1100f5db3c6SLong Li * Deregister a MR.
1110f5db3c6SLong Li */
1120f5db3c6SLong Li void
mana_del_pmd_mr(struct mana_mr_cache * mr)1130f5db3c6SLong Li mana_del_pmd_mr(struct mana_mr_cache *mr)
1140f5db3c6SLong Li {
1150f5db3c6SLong Li int ret;
1160f5db3c6SLong Li struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;
1170f5db3c6SLong Li
1180f5db3c6SLong Li ret = ibv_dereg_mr(ibv_mr);
1190f5db3c6SLong Li if (ret)
120e2d3a3c0SLong Li DP_LOG(ERR, "dereg MR failed ret %d", ret);
1210f5db3c6SLong Li }
1220f5db3c6SLong Li
1230f5db3c6SLong Li /*
1247d79530eSLong Li * Alloc a MR.
1257d79530eSLong Li * Try to find a MR in the cache. If not found, register a new MR.
1260f5db3c6SLong Li */
1270f5db3c6SLong Li struct mana_mr_cache *
mana_alloc_pmd_mr(struct mana_mr_btree * local_mr_btree,struct mana_priv * priv,struct rte_mbuf * mbuf)1287d79530eSLong Li mana_alloc_pmd_mr(struct mana_mr_btree *local_mr_btree, struct mana_priv *priv,
1290f5db3c6SLong Li struct rte_mbuf *mbuf)
1300f5db3c6SLong Li {
1310f5db3c6SLong Li struct rte_mempool *pool = mbuf->pool;
1320f5db3c6SLong Li int ret, second_try = 0;
1330f5db3c6SLong Li struct mana_mr_cache *mr;
1340f5db3c6SLong Li uint16_t idx;
1350f5db3c6SLong Li
136e2d3a3c0SLong Li DP_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
1370f5db3c6SLong Li mbuf->buf_addr, mbuf->buf_len);
1380f5db3c6SLong Li
1390f5db3c6SLong Li try_again:
1400f5db3c6SLong Li /* First try to find the MR in local queue tree */
141*0c7bc26bSLong Li ret = mana_mr_btree_lookup(local_mr_btree, &idx,
142*0c7bc26bSLong Li (uintptr_t)mbuf->buf_addr, mbuf->buf_len,
143*0c7bc26bSLong Li &mr);
144*0c7bc26bSLong Li if (ret)
145*0c7bc26bSLong Li return NULL;
146*0c7bc26bSLong Li
1470f5db3c6SLong Li if (mr) {
14874decf3bSWei Hu DP_LOG(DEBUG, "Local mr lkey %u addr 0x%" PRIxPTR " len %zu",
1490f5db3c6SLong Li mr->lkey, mr->addr, mr->len);
1500f5db3c6SLong Li return mr;
1510f5db3c6SLong Li }
1520f5db3c6SLong Li
1530f5db3c6SLong Li /* If not found, try to find the MR in global tree */
1540f5db3c6SLong Li rte_spinlock_lock(&priv->mr_btree_lock);
155*0c7bc26bSLong Li ret = mana_mr_btree_lookup(&priv->mr_btree, &idx,
1560f5db3c6SLong Li (uintptr_t)mbuf->buf_addr,
157*0c7bc26bSLong Li mbuf->buf_len, &mr);
1580f5db3c6SLong Li rte_spinlock_unlock(&priv->mr_btree_lock);
1590f5db3c6SLong Li
160*0c7bc26bSLong Li if (ret)
161*0c7bc26bSLong Li return NULL;
162*0c7bc26bSLong Li
1630f5db3c6SLong Li /* If found in the global tree, add it to the local tree */
1640f5db3c6SLong Li if (mr) {
1650f5db3c6SLong Li ret = mana_mr_btree_insert(local_mr_btree, mr);
1660f5db3c6SLong Li if (ret) {
167e2d3a3c0SLong Li DP_LOG(ERR, "Failed to add MR to local tree.");
1680f5db3c6SLong Li return NULL;
1690f5db3c6SLong Li }
1700f5db3c6SLong Li
171e2d3a3c0SLong Li DP_LOG(DEBUG,
17274decf3bSWei Hu "Added local MR key %u addr 0x%" PRIxPTR " len %zu",
1730f5db3c6SLong Li mr->lkey, mr->addr, mr->len);
1740f5db3c6SLong Li return mr;
1750f5db3c6SLong Li }
1760f5db3c6SLong Li
1770f5db3c6SLong Li if (second_try) {
178e2d3a3c0SLong Li DP_LOG(ERR, "Internal error second try failed");
1790f5db3c6SLong Li return NULL;
1800f5db3c6SLong Li }
1810f5db3c6SLong Li
1820f5db3c6SLong Li ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
1830f5db3c6SLong Li if (ret) {
184e2d3a3c0SLong Li DP_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
1850f5db3c6SLong Li ret, mbuf->buf_addr, mbuf->buf_len);
1860f5db3c6SLong Li return NULL;
1870f5db3c6SLong Li }
1880f5db3c6SLong Li
1890f5db3c6SLong Li second_try = 1;
1900f5db3c6SLong Li goto try_again;
1910f5db3c6SLong Li }
1920f5db3c6SLong Li
1930f5db3c6SLong Li void
mana_remove_all_mr(struct mana_priv * priv)1940f5db3c6SLong Li mana_remove_all_mr(struct mana_priv *priv)
1950f5db3c6SLong Li {
1960f5db3c6SLong Li struct mana_mr_btree *bt = &priv->mr_btree;
1970f5db3c6SLong Li struct mana_mr_cache *mr;
1980f5db3c6SLong Li struct ibv_mr *ibv_mr;
1990f5db3c6SLong Li uint16_t i;
2000f5db3c6SLong Li
2010f5db3c6SLong Li rte_spinlock_lock(&priv->mr_btree_lock);
2020f5db3c6SLong Li /* Start with index 1 as the 1st entry is always NULL */
2030f5db3c6SLong Li for (i = 1; i < bt->len; i++) {
2040f5db3c6SLong Li mr = &bt->table[i];
2050f5db3c6SLong Li ibv_mr = mr->verb_obj;
2060f5db3c6SLong Li ibv_dereg_mr(ibv_mr);
2070f5db3c6SLong Li }
2080f5db3c6SLong Li bt->len = 1;
2090f5db3c6SLong Li rte_spinlock_unlock(&priv->mr_btree_lock);
2100f5db3c6SLong Li }
2110f5db3c6SLong Li
2120f5db3c6SLong Li /*
2130f5db3c6SLong Li * Expand the MR cache.
2140f5db3c6SLong Li * MR cache is maintained as a btree and expand on demand.
2150f5db3c6SLong Li */
2160f5db3c6SLong Li static int
mana_mr_btree_expand(struct mana_mr_btree * bt,int n)2170f5db3c6SLong Li mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
2180f5db3c6SLong Li {
2190f5db3c6SLong Li void *mem;
2200f5db3c6SLong Li
2210f5db3c6SLong Li mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
2220f5db3c6SLong Li 0, bt->socket);
2230f5db3c6SLong Li if (!mem) {
224e2d3a3c0SLong Li DP_LOG(ERR, "Failed to expand btree size %d", n);
2250f5db3c6SLong Li return -1;
2260f5db3c6SLong Li }
2270f5db3c6SLong Li
228e2d3a3c0SLong Li DP_LOG(ERR, "Expanded btree to size %d", n);
2290f5db3c6SLong Li bt->table = mem;
2300f5db3c6SLong Li bt->size = n;
2310f5db3c6SLong Li
2320f5db3c6SLong Li return 0;
2330f5db3c6SLong Li }
2340f5db3c6SLong Li
2350f5db3c6SLong Li /*
2360f5db3c6SLong Li * Look for a region of memory in MR cache.
2370f5db3c6SLong Li */
mana_mr_btree_lookup(struct mana_mr_btree * bt,uint16_t * idx,uintptr_t addr,size_t len,struct mana_mr_cache ** cache)238*0c7bc26bSLong Li int mana_mr_btree_lookup(struct mana_mr_btree *bt, uint16_t *idx,
239*0c7bc26bSLong Li uintptr_t addr, size_t len,
240*0c7bc26bSLong Li struct mana_mr_cache **cache)
2410f5db3c6SLong Li {
2420f5db3c6SLong Li struct mana_mr_cache *table;
2430f5db3c6SLong Li uint16_t n;
2440f5db3c6SLong Li uint16_t base = 0;
2450f5db3c6SLong Li int ret;
2460f5db3c6SLong Li
247*0c7bc26bSLong Li *cache = NULL;
2480f5db3c6SLong Li
249*0c7bc26bSLong Li n = bt->len;
2500f5db3c6SLong Li /* Try to double the cache if it's full */
2510f5db3c6SLong Li if (n == bt->size) {
2520f5db3c6SLong Li ret = mana_mr_btree_expand(bt, bt->size << 1);
2530f5db3c6SLong Li if (ret)
254*0c7bc26bSLong Li return ret;
2550f5db3c6SLong Li }
2560f5db3c6SLong Li
2570f5db3c6SLong Li table = bt->table;
2580f5db3c6SLong Li
2590f5db3c6SLong Li /* Do binary search on addr */
2600f5db3c6SLong Li do {
2610f5db3c6SLong Li uint16_t delta = n >> 1;
2620f5db3c6SLong Li
2630f5db3c6SLong Li if (addr < table[base + delta].addr) {
2640f5db3c6SLong Li n = delta;
2650f5db3c6SLong Li } else {
2660f5db3c6SLong Li base += delta;
2670f5db3c6SLong Li n -= delta;
2680f5db3c6SLong Li }
2690f5db3c6SLong Li } while (n > 1);
2700f5db3c6SLong Li
2710f5db3c6SLong Li *idx = base;
2720f5db3c6SLong Li
273*0c7bc26bSLong Li if (addr + len <= table[base].addr + table[base].len) {
274*0c7bc26bSLong Li *cache = &table[base];
275*0c7bc26bSLong Li return 0;
276*0c7bc26bSLong Li }
2770f5db3c6SLong Li
278e2d3a3c0SLong Li DP_LOG(DEBUG,
27974decf3bSWei Hu "addr 0x%" PRIxPTR " len %zu idx %u sum 0x%" PRIxPTR " not found",
2800f5db3c6SLong Li addr, len, *idx, addr + len);
2810f5db3c6SLong Li
282*0c7bc26bSLong Li return 0;
2830f5db3c6SLong Li }
2840f5db3c6SLong Li
2850f5db3c6SLong Li int
mana_mr_btree_init(struct mana_mr_btree * bt,int n,int socket)2860f5db3c6SLong Li mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
2870f5db3c6SLong Li {
2880f5db3c6SLong Li memset(bt, 0, sizeof(*bt));
2890f5db3c6SLong Li bt->table = rte_calloc_socket("MANA B-tree table",
2900f5db3c6SLong Li n,
2910f5db3c6SLong Li sizeof(struct mana_mr_cache),
2920f5db3c6SLong Li 0, socket);
2930f5db3c6SLong Li if (!bt->table) {
2940f5db3c6SLong Li DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
2950f5db3c6SLong Li n, socket);
2960f5db3c6SLong Li return -ENOMEM;
2970f5db3c6SLong Li }
2980f5db3c6SLong Li
2990f5db3c6SLong Li bt->socket = socket;
3000f5db3c6SLong Li bt->size = n;
3010f5db3c6SLong Li
3020f5db3c6SLong Li /* First entry must be NULL for binary search to work */
3030f5db3c6SLong Li bt->table[0] = (struct mana_mr_cache) {
3040f5db3c6SLong Li .lkey = UINT32_MAX,
3050f5db3c6SLong Li };
3060f5db3c6SLong Li bt->len = 1;
3070f5db3c6SLong Li
3080f5db3c6SLong Li DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
3090f5db3c6SLong Li bt->table, n, bt->len);
3100f5db3c6SLong Li
3110f5db3c6SLong Li return 0;
3120f5db3c6SLong Li }
3130f5db3c6SLong Li
3140f5db3c6SLong Li void
mana_mr_btree_free(struct mana_mr_btree * bt)3150f5db3c6SLong Li mana_mr_btree_free(struct mana_mr_btree *bt)
3160f5db3c6SLong Li {
3170f5db3c6SLong Li rte_free(bt->table);
3180f5db3c6SLong Li memset(bt, 0, sizeof(*bt));
3190f5db3c6SLong Li }
3200f5db3c6SLong Li
3210f5db3c6SLong Li int
mana_mr_btree_insert(struct mana_mr_btree * bt,struct mana_mr_cache * entry)3220f5db3c6SLong Li mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
3230f5db3c6SLong Li {
3240f5db3c6SLong Li struct mana_mr_cache *table;
3250f5db3c6SLong Li uint16_t idx = 0;
3260f5db3c6SLong Li uint16_t shift;
327*0c7bc26bSLong Li int ret;
3280f5db3c6SLong Li
329*0c7bc26bSLong Li ret = mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len, &table);
330*0c7bc26bSLong Li if (ret)
331*0c7bc26bSLong Li return ret;
332*0c7bc26bSLong Li
333*0c7bc26bSLong Li if (table) {
33474decf3bSWei Hu DP_LOG(DEBUG, "Addr 0x%" PRIxPTR " len %zu exists in btree",
3350f5db3c6SLong Li entry->addr, entry->len);
3360f5db3c6SLong Li return 0;
3370f5db3c6SLong Li }
3380f5db3c6SLong Li
3390f5db3c6SLong Li if (bt->len >= bt->size) {
340*0c7bc26bSLong Li DP_LOG(ERR, "Btree overflow detected len %u size %u",
341*0c7bc26bSLong Li bt->len, bt->size);
3420f5db3c6SLong Li bt->overflow = 1;
3430f5db3c6SLong Li return -1;
3440f5db3c6SLong Li }
3450f5db3c6SLong Li
3460f5db3c6SLong Li table = bt->table;
3470f5db3c6SLong Li
3480f5db3c6SLong Li idx++;
3490f5db3c6SLong Li shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
3500f5db3c6SLong Li if (shift) {
351e2d3a3c0SLong Li DP_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
3520f5db3c6SLong Li shift, idx, idx + 1);
3530f5db3c6SLong Li memmove(&table[idx + 1], &table[idx], shift);
3540f5db3c6SLong Li }
3550f5db3c6SLong Li
3560f5db3c6SLong Li table[idx] = *entry;
3570f5db3c6SLong Li bt->len++;
3580f5db3c6SLong Li
359e2d3a3c0SLong Li DP_LOG(DEBUG,
36074decf3bSWei Hu "Inserted MR b-tree table %p idx %d addr 0x%" PRIxPTR " len %zu",
3610f5db3c6SLong Li table, idx, entry->addr, entry->len);
3620f5db3c6SLong Li
3630f5db3c6SLong Li return 0;
3640f5db3c6SLong Li }
365