/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <rte_eal_memconfig.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>

#include "mlx5_glue.h"
#include "mlx5_common_mp.h"
#include "mlx5_common_mr.h"
#include "mlx5_common_utils.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

/**
 * Expand the B-tree table to a given size. Must not be called while holding
 * memory_hotplug_lock or share_cache.rwlock, because it calls rte_realloc().
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries for expansion.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_expand(struct mlx5_mr_btree *bt, int n)
{
	void *mem;
	int ret = 0;

	if (n <= bt->size)
		return ret;
	/*
	 * The downside of directly using rte_realloc() is that SOCKET_ID_ANY
	 * is used inside if there's no room to expand. Because this is a rare
	 * case on a very slow path, it is acceptable.
	 * Initially cache_bh[] is given practically enough space, and once it
	 * has been expanded, expansion shouldn't be needed again ever.
	 */
	mem = rte_realloc(bt->table, n * sizeof(struct mr_cache_entry), 0);
	if (mem == NULL) {
		/* Not an error, B-tree search will be skipped. */
		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
			(void *)bt);
		ret = -1;
	} else {
		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
		bt->table = mem;
		bt->size = n;
	}
	return ret;
}

/**
 * Look up an LKey in the given B-tree lookup table, store the last-searched
 * index, and return the LKey found.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param[out] idx
 *   Pointer to index. Even on search failure, it returns the index where the
 *   search stopped so that the index can be used when inserting a new entry.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t n;
	uint16_t base = 0;

	MLX5_ASSERT(bt != NULL);
	lkp_tbl = *bt->table;
	n = bt->len;
	/* First entry must be NULL for comparison. */
	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
				    lkp_tbl[0].lkey == UINT32_MAX));
	/* Binary search. */
	do {
		register uint16_t delta = n >> 1;

		if (addr < lkp_tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	MLX5_ASSERT(addr >= lkp_tbl[base].start);
	*idx = base;
	if (addr < lkp_tbl[base].end)
		return lkp_tbl[base].lkey;
	/* Not found. */
	return UINT32_MAX;
}
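
#if 0
/*
 * Illustrative sketch (not compiled): why the sentinel entry matters. With
 * table[0] = {start = 0, end = 0, lkey = UINT32_MAX}, the binary search
 * above can never land below index 0, and a miss in front of the first real
 * entry resolves to the sentinel, whose LKey already means "not found".
 * A minimal standalone rendition of the same loop, assuming a caller-provided
 * table that keeps the sentinel invariant:
 */
static uint32_t
example_btree_lookup(const struct mr_cache_entry *tbl, uint16_t len,
		     uintptr_t addr, uint16_t *idx)
{
	uint16_t n = len;
	uint16_t base = 0;

	do {
		uint16_t delta = n >> 1;

		if (addr < tbl[base + delta].start) {
			n = delta;		/* Go left, keep base. */
		} else {
			base += delta;		/* Go right. */
			n -= delta;
		}
	} while (n > 1);
	*idx = base;
	/* Hit only if addr falls inside the half-open range [start, end). */
	return addr < tbl[base].end ? tbl[base].lkey : UINT32_MAX;
}
#endif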

/**
 * Insert an entry to B-tree lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param entry
 *   Pointer to new entry to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t idx = 0;
	size_t shift;

	MLX5_ASSERT(bt != NULL);
	MLX5_ASSERT(bt->len <= bt->size);
	MLX5_ASSERT(bt->len > 0);
	lkp_tbl = *bt->table;
	/* Find out the slot for insertion. */
	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
		DRV_LOG(DEBUG,
			"abort insertion to B-tree(%p): already exist at"
			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
		/* Already exists, return. */
		return 0;
	}
	/* If table is full, return error. */
	if (unlikely(bt->len == bt->size)) {
		bt->overflow = 1;
		return -1;
	}
	/* Insert entry. */
	++idx;
	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
	if (shift)
		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
	lkp_tbl[idx] = *entry;
	bt->len++;
	DRV_LOG(DEBUG,
		"inserted B-tree(%p)[%u],"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		(void *)bt, idx, entry->start, entry->end, entry->lkey);
	return 0;
}

/**
 * Initialize B-tree and allocate memory for lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries to allocate.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
{
	if (bt == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	MLX5_ASSERT(!bt->table && !bt->size);
	memset(bt, 0, sizeof(*bt));
	bt->table = rte_calloc_socket("B-tree table",
				      n, sizeof(struct mr_cache_entry),
				      0, socket);
	if (bt->table == NULL) {
		rte_errno = ENOMEM;
		DEBUG("failed to allocate memory for btree cache on socket %d",
		      socket);
		return -rte_errno;
	}
	bt->size = n;
	/* First entry must be NULL for binary search. */
	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
		.lkey = UINT32_MAX,
	};
	DEBUG("initialized B-tree %p with table %p",
	      (void *)bt, (void *)bt->table);
	return 0;
}
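
#if 0
/*
 * Illustrative usage sketch (not compiled): the typical lifecycle of a
 * lookup table. mlx5_mr_btree_init() pre-inserts the sentinel, so user
 * entries start at index 1. The entry values below are hypothetical.
 */
static void
example_btree_usage(void)
{
	struct mlx5_mr_btree bt = { 0 };
	struct mr_cache_entry e = {
		.start = 0x100000,
		.end = 0x200000,
		.lkey = 0x1234,	/* Hypothetical LKey. */
	};
	uint16_t idx;

	if (mlx5_mr_btree_init(&bt, 64, SOCKET_ID_ANY) != 0)
		return;
	mr_btree_insert(&bt, &e);
	/* Any address inside [start, end) resolves to the LKey. */
	MLX5_ASSERT(mr_btree_lookup(&bt, &idx, 0x150000) == 0x1234);
	mlx5_mr_btree_free(&bt);
}
#endif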

/**
 * Free B-tree resources.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DEBUG("freeing B-tree %p with table %p",
	      (void *)bt, (void *)bt->table);
	rte_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}

/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	int idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DEBUG("B-tree(%p)[%u],"
		      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		      (void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Find a virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		struct ibv_mr *ibv_mr = mr->ibv_mr;

		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * verbs MR as there's only one chunk.
		 */
		entry->start = (uintptr_t)ibv_mr->addr;
		entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length;
		entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
	}
	return idx;
}
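
#if 0
/*
 * Illustrative sketch (not compiled): the canonical iteration pattern over
 * the (possibly fragmented) chunks of an MR, as used by
 * mlx5_mr_insert_cache() and mlx5_mr_lookup_list() below. The returned index
 * is fed back in until the bitmap is exhausted; an entry with end == 0 means
 * no further chunk was found.
 */
static void
example_iterate_chunks(struct mlx5_mr *mr)
{
	unsigned int n;

	for (n = 0; n < mr->ms_bmp_n; ) {
		struct mr_cache_entry chunk;

		memset(&chunk, 0, sizeof(chunk));
		n = mr_find_next_chunk(mr, &chunk, n);
		if (!chunk.end)
			break;
		/* Process the chunk [chunk.start, chunk.end) here. */
	}
}
#endif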

/**
 * Insert an MR into the global B-tree cache. It may fail due to low memory.
 * In that case, the entry will have to be found again by
 * mlx5_mr_lookup_list() in mlx5_mr_create() on a cache miss.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr
 *   Pointer to MR to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mlx5_mr *mr)
{
	unsigned int n;

	DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
		(void *)mr, (void *)share_cache);
	for (n = 0; n < mr->ms_bmp_n; ) {
		struct mr_cache_entry entry;

		memset(&entry, 0, sizeof(entry));
		/* Find a contiguous chunk and advance the index. */
		n = mr_find_next_chunk(mr, &entry, n);
		if (!entry.end)
			break;
		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
			/*
			 * Overflowed, but the global table cannot be expanded
			 * because of deadlock.
			 */
			return -1;
		}
	}
	return 0;
}

/**
 * Look up address in the original global MR list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Found MR on match, NULL otherwise.
 */
struct mlx5_mr *
mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
		    struct mr_cache_entry *entry, uintptr_t addr)
{
	struct mlx5_mr *mr;

	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret;

			memset(&ret, 0, sizeof(ret));
			n = mr_find_next_chunk(mr, &ret, n);
			if (addr >= ret.start && addr < ret.end) {
				/* Found. */
				*entry = ret;
				return mr;
			}
		}
	}
	return NULL;
}

/**
 * Look up address on the global MR cache.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mr_cache_entry *entry, uintptr_t addr)
{
	uint16_t idx;
	uint32_t lkey = UINT32_MAX;
	struct mlx5_mr *mr;

	/*
	 * If the global cache has overflowed because it failed to expand the
	 * B-tree table, it can't contain all the existing MRs. Then, the
	 * address has to be searched by traversing the original MR list
	 * instead, which is a very slow path. Otherwise, the global cache is
	 * all-inclusive.
	 */
	if (!unlikely(share_cache->cache.overflow)) {
		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
		if (lkey != UINT32_MAX)
			*entry = (*share_cache->cache.table)[idx];
	} else {
		/* Falling back to the slowest path. */
		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
		if (mr != NULL)
			lkey = entry->lkey;
	}
	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
					   addr < entry->end));
	return lkey;
}

/**
 * Free MR resources. The MR lock must not be held to avoid a deadlock:
 * rte_free() can raise a memory free event, and the callback function would
 * spin on the lock.
 *
 * @param mr
 *   Pointer to MR to free.
 */
static void
mr_free(struct mlx5_mr *mr)
{
	if (mr == NULL)
		return;
	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
	if (mr->ibv_mr != NULL)
		claim_zero(mlx5_glue->dereg_mr(mr->ibv_mr));
	if (mr->ms_bmp != NULL)
		rte_bitmap_free(mr->ms_bmp);
	rte_free(mr);
}

void
mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr;

	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
	/* Flush cache to rebuild. */
	share_cache->cache.len = 1;
	share_cache->cache.overflow = 0;
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr)
		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
			return;
}

/**
 * Release resources of detached MR having no online entry.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
static void
mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;
	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);

	/* Must be called from the primary process. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/*
	 * An MR can't be freed while holding the lock because rte_free()
	 * could invoke the memory free callback function, which would be a
	 * deadlock situation.
	 */
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach the whole free list and release it after unlocking. */
	free_list = share_cache->mr_free_list;
	LIST_INIT(&share_cache->mr_free_list);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Release resources. */
	mr_next = LIST_FIRST(&free_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		mr_free(mr);
	}
}

/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
static int
mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
			  const struct rte_memseg *ms, size_t len, void *arg)
{
	struct mr_find_contig_memsegs_data *data = arg;

	if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
		return 0;
	/* Found, save it and stop walking. */
	data->start = ms->addr_64;
	data->end = ms->addr_64 + len;
	data->msl = msl;
	return 1;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This API should be called from a secondary process; a request is then sent
 * to the primary process in order to create an MR for the address. As the
 * global MR list is in shared memory, the following LKey lookup should
 * succeed unless the request fails.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag to enable/disable extension of MR registration to the
 *   whole virtually contiguous memory chunk.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create_secondary(struct ibv_pd *pd __rte_unused,
			 struct mlx5_mp_id *mp_id,
			 struct mlx5_mr_share_cache *share_cache,
			 struct mr_cache_entry *entry, uintptr_t addr,
			 unsigned int mr_ext_memseg_en __rte_unused)
{
	int ret;

	DEBUG("port %u requesting MR creation for address (%p)",
	      mp_id->port_id, (void *)addr);
	ret = mlx5_mp_req_mr_create(mp_id, addr);
	if (ret) {
		DEBUG("Fail to request MR creation for address (%p)",
		      (void *)addr);
		return UINT32_MAX;
	}
	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	DEBUG("MR CREATED by primary process for %p:\n"
	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
	      (void *)addr, entry->start, entry->end, entry->lkey);
	return entry->lkey;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * Register the entire virtually contiguous memory chunk around the address.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag to enable/disable extension of MR registration to the
 *   whole virtually contiguous memory chunk.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_create_primary(struct ibv_pd *pd,
		       struct mlx5_mr_share_cache *share_cache,
		       struct mr_cache_entry *entry, uintptr_t addr,
		       unsigned int mr_ext_memseg_en)
{
	struct mr_find_contig_memsegs_data data = {.addr = addr, };
	struct mr_find_contig_memsegs_data data_re;
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;
	struct mlx5_mr *mr = NULL;
	int ms_idx_shift = -1;
	uint32_t bmp_size;
	void *bmp_mem;
	uint32_t ms_n;
	uint32_t n;
	size_t len;

	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
	/*
	 * Release detached MRs if any. This can't be called while holding
	 * either memory_hotplug_lock or share_cache->rwlock. MRs on the free
	 * list have been detached by the memory free event but couldn't be
	 * released inside the callback due to the deadlock risk. As a result,
	 * releasing resources is quite opportunistic.
	 */
	mlx5_mr_garbage_collect(share_cache);
	/*
	 * If enabled, find out a contiguous virtual address chunk in use, to
	 * which the given address belongs, in order to register the maximum
	 * range. In the best case, where mempools are not dynamically
	 * recreated and '--socket-mem' is specified as an EAL option, it is
	 * very likely to have only one MR (LKey) per socket and per hugepage
	 * size even though the system memory is highly fragmented. As the
	 * whole memory chunk will be pinned by the kernel, it can't be reused
	 * unless the entire chunk is freed from EAL.
	 *
	 * If disabled, just register one memseg (page). Then, memory
	 * consumption will be minimized but it may drop performance if there
	 * are many MRs to look up on the datapath.
	 */
	if (!mr_ext_memseg_en) {
		data.msl = rte_mem_virt2memseg_list((void *)addr);
		data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
		data.end = data.start + data.msl->page_sz;
	} else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
		DRV_LOG(WARNING,
			"Unable to find virtually contiguous"
			" chunk for address (%p)."
			" rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_nolock;
	}
alloc_resources:
	/* Addresses must be page-aligned. */
	MLX5_ASSERT(data.msl);
	MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
	MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
	msl = data.msl;
	ms = rte_mem_virt2memseg((void *)data.start, msl);
	len = data.end - data.start;
	MLX5_ASSERT(ms);
	MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
	/* Number of memsegs in the range. */
	ms_n = len / msl->page_sz;
	DEBUG("Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
	      " page_sz=0x%" PRIx64 ", ms_n=%u",
	      (void *)addr, data.start, data.end, msl->page_sz, ms_n);
	/* Size of memory for bitmap. */
	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
	mr = rte_zmalloc_socket(NULL,
				RTE_ALIGN_CEIL(sizeof(*mr),
					       RTE_CACHE_LINE_SIZE) +
				bmp_size,
				RTE_CACHE_LINE_SIZE, msl->socket_id);
	if (mr == NULL) {
		DEBUG("Unable to allocate memory for a new MR of"
		      " address (%p).", (void *)addr);
		rte_errno = ENOMEM;
		goto err_nolock;
	}
	mr->msl = msl;
	/*
	 * Save the index of the first memseg and initialize memseg bitmap. To
	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
	 *	rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
	 */
	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
	if (mr->ms_bmp == NULL) {
		DEBUG("Unable to initialize bitmap for a new MR of"
		      " address (%p).", (void *)addr);
		rte_errno = EINVAL;
		goto err_nolock;
	}
	/*
	 * Should recheck whether the extended contiguous chunk is still
	 * valid. Because memory_hotplug_lock can't be held if there's any
	 * memory related call in a critical path, resource allocation above
	 * can't be locked. If the memory has been changed at this point, try
	 * again with just a single page. If not, go on with the big chunk
	 * atomically from here.
	 */
	rte_mcfg_mem_read_lock();
	data_re = data;
	if (len > msl->page_sz &&
	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
		DEBUG("Unable to find virtually contiguous"
		      " chunk for address (%p)."
		      " rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_memlock;
	}
	if (data.start != data_re.start || data.end != data_re.end) {
		/*
		 * The extended contiguous chunk has been changed. Try again
		 * with a single memseg instead.
		 */
		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
		data.end = data.start + msl->page_sz;
		rte_mcfg_mem_read_unlock();
		mr_free(mr);
		goto alloc_resources;
	}
	MLX5_ASSERT(data.msl == data_re.msl);
	rte_rwlock_write_lock(&share_cache->rwlock);
	/*
	 * Check that the address is really missing. If another thread has
	 * already created one, or it is not found due to overflow, abort
	 * and return.
	 */
	if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
		/*
		 * Insert to the global cache table. It may fail due to
		 * low memory. Then, this entry will have to be searched
		 * here again.
		 */
		mr_btree_insert(&share_cache->cache, entry);
		DEBUG("Found MR for %p on final lookup, abort", (void *)addr);
		rte_rwlock_write_unlock(&share_cache->rwlock);
		rte_mcfg_mem_read_unlock();
		/*
		 * Must be unlocked before calling rte_free() because
		 * mlx5_mr_mem_event_free_cb() can be called inside.
		 */
		mr_free(mr);
		return entry->lkey;
	}
	/*
	 * Trim start and end addresses for verbs MR. Set bits for registering
	 * memsegs but exclude already registered ones. Bitmap can be
	 * fragmented.
	 */
	for (n = 0; n < ms_n; ++n) {
		uintptr_t start;
		struct mr_cache_entry ret;

		memset(&ret, 0, sizeof(ret));
		start = data_re.start + n * msl->page_sz;
		/* Exclude memsegs already registered by other MRs. */
		if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
		    UINT32_MAX) {
			/*
			 * Start from the first unregistered memseg in the
			 * extended range.
			 */
			if (ms_idx_shift == -1) {
				mr->ms_base_idx += n;
				data.start = start;
				ms_idx_shift = n;
			}
			data.end = start + msl->page_sz;
			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
			++mr->ms_n;
		}
	}
	len = data.end - data.start;
	mr->ms_bmp_n = len / msl->page_sz;
	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
	/*
	 * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
	 * called with holding the memory lock because it doesn't use
	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
	 * through mlx5_alloc_verbs_buf().
	 */
	mr->ibv_mr = mlx5_glue->reg_mr(pd, (void *)data.start, len,
				       IBV_ACCESS_LOCAL_WRITE |
					   IBV_ACCESS_RELAXED_ORDERING);
	if (mr->ibv_mr == NULL) {
		DEBUG("Fail to create a verbs MR for address (%p)",
		      (void *)addr);
		rte_errno = EINVAL;
		goto err_mrlock;
	}
	MLX5_ASSERT((uintptr_t)mr->ibv_mr->addr == data.start);
	MLX5_ASSERT(mr->ibv_mr->length == len);
	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
	DEBUG("MR CREATED (%p) for %p:\n"
	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
	      (void *)mr, (void *)addr, data.start, data.end,
	      rte_cpu_to_be_32(mr->ibv_mr->lkey),
	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(share_cache, mr);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	rte_mcfg_mem_read_unlock();
	return entry->lkey;
err_mrlock:
	rte_rwlock_write_unlock(&share_cache->rwlock);
err_memlock:
	rte_mcfg_mem_read_unlock();
err_nolock:
	/*
	 * In case of error, as this can be called from a datapath, a warning
	 * message per error is preferable. Must be unlocked before calling
	 * rte_free() because mlx5_mr_mem_event_free_cb() can be called
	 * inside.
	 */
	mr_free(mr);
	return UINT32_MAX;
}
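
#if 0
/*
 * Illustrative sketch (not compiled): the skeleton of the optimistic
 * allocate-then-recheck scheme used by mlx5_mr_create_primary() above.
 * All example_* helpers are hypothetical stand-ins for the memseg walk,
 * the resource allocation and the verbs registration. The point is the
 * ordering: snapshot and allocate without the hotplug lock, re-validate
 * under it, and shrink to a single page on any mismatch.
 */
struct example_range {
	uintptr_t start;
	uintptr_t end;
};

static uint32_t
example_create_flow(uintptr_t addr, size_t pgsz)
{
	struct example_range r = example_find_chunk(addr); /* Unlocked. */

	for (;;) {
		void *res = example_alloc(&r);	/* Unlocked, may move memory. */
		struct example_range re;

		rte_mcfg_mem_read_lock();
		re = example_find_chunk(addr);	/* Re-validate under lock. */
		if (r.start == re.start && r.end == re.end) {
			uint32_t lkey = example_register(res, &r);

			rte_mcfg_mem_read_unlock();
			return lkey;
		}
		rte_mcfg_mem_read_unlock();
		example_free(res);
		/* Memory layout changed; retry with just a single page. */
		r.start = RTE_ALIGN_FLOOR(addr, pgsz);
		r.end = r.start + pgsz;
	}
}
#endif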

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This can be called from both the primary and secondary processes.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag to enable/disable extension of MR registration to the
 *   whole virtually contiguous memory chunk.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create(struct ibv_pd *pd, struct mlx5_mp_id *mp_id,
	       struct mlx5_mr_share_cache *share_cache,
	       struct mr_cache_entry *entry, uintptr_t addr,
	       unsigned int mr_ext_memseg_en)
{
	uint32_t ret = 0;

	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		ret = mlx5_mr_create_primary(pd, share_cache, entry,
					     addr, mr_ext_memseg_en);
		break;
	case RTE_PROC_SECONDARY:
		ret = mlx5_mr_create_secondary(pd, mp_id, share_cache, entry,
					       addr, mr_ext_memseg_en);
		break;
	default:
		break;
	}
	return ret;
}

/**
 * Look up the address in the global MR cache table. If not found, create a
 * new MR. Insert the found/created entry into the local bottom-half cache
 * table.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this is not written.
 * @param addr
 *   Search key.
 * @param mr_ext_memseg_en
 *   Configurable flag to enable/disable extension of MR registration to the
 *   whole virtually contiguous memory chunk.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_lookup_caches(struct ibv_pd *pd, struct mlx5_mp_id *mp_id,
		 struct mlx5_mr_share_cache *share_cache,
		 struct mlx5_mr_ctrl *mr_ctrl,
		 struct mr_cache_entry *entry, uintptr_t addr,
		 unsigned int mr_ext_memseg_en)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	uint32_t lkey;
	uint16_t idx;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in the global cache. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
	if (lkey != UINT32_MAX) {
		/* Found. */
		*entry = (*share_cache->cache.table)[idx];
		rte_rwlock_read_unlock(&share_cache->rwlock);
		/*
		 * Update local cache. Even if it fails, return the found entry
		 * to update top-half cache. Next time, this entry will be found
		 * in the global cache.
		 */
		mr_btree_insert(bt, entry);
		return lkey;
	}
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/* First time to see the address? Create a new MR. */
	lkey = mlx5_mr_create(pd, mp_id, share_cache, entry, addr,
			      mr_ext_memseg_en);
	/*
	 * Update the local cache if successfully created a new global MR. Even
	 * if failed to create one, there's no action to take in this datapath
	 * code. As the returned LKey is invalid, this will eventually make
	 * the HW fail.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Bottom half of LKey search on the datapath. First search in cache_bh[];
 * if it misses, search in the global MR cache table and update the new entry
 * in the per-queue local caches.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t mlx5_mr_addr2mr_bh(struct ibv_pd *pd, struct mlx5_mp_id *mp_id,
			    struct mlx5_mr_share_cache *share_cache,
			    struct mlx5_mr_ctrl *mr_ctrl,
			    uintptr_t addr, unsigned int mr_ext_memseg_en)
{
	uint32_t lkey;
	uint16_t bh_idx = 0;
	/* Victim in top-half cache to replace with new entry. */
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		/*
		 * If missed in local lookup table, search in the global cache
		 * and local cache_bh[] will be updated inside if possible.
		 * Top-half cache entry will also be updated.
		 */
		lkey = mr_lookup_caches(pd, mp_id, share_cache, mr_ctrl,
					repl, addr, mr_ext_memseg_en);
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}
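
#if 0
/*
 * Illustrative sketch (not compiled): how the per-queue cache levels relate.
 * The top-half linear search actually lives in the header as an inline
 * helper; this is a hypothetical rendition of it. The datapath scans the
 * tiny cache[] array starting from the MRU slot without taking any lock,
 * and only on a miss falls through to mlx5_mr_addr2mr_bh() above, which
 * refills the victim slot at cache[head].
 */
static uint32_t
example_addr2mr(struct ibv_pd *pd, struct mlx5_mp_id *mp_id,
		struct mlx5_mr_share_cache *share_cache,
		struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr)
{
	unsigned int i = mr_ctrl->mru;	/* Most-recently-used first. */
	unsigned int n = 0;

	do {
		struct mr_cache_entry *e = &mr_ctrl->cache[i];

		if (addr >= e->start && addr < e->end)
			return e->lkey;	/* Top-half hit, lock-free. */
		i = (i + 1) % MLX5_MR_CACHE_N;
	} while (++n < MLX5_MR_CACHE_N);
	/* Top-half miss: B-tree search, then global cache, then creation. */
	return mlx5_mr_addr2mr_bh(pd, mp_id, share_cache, mr_ctrl, addr, 1);
}
#endif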

/**
 * Release all the created MRs and resources on the global MR cache of a
 * device and move them to the free list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;

	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach from MR list and move to free list. */
	mr_next = LIST_FIRST(&share_cache->mr_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		LIST_REMOVE(mr, mr);
		LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
	}
	LIST_INIT(&share_cache->mr_list);
	/* Free global cache. */
	mlx5_mr_btree_free(&share_cache->cache);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Free all remaining MRs. */
	mlx5_mr_garbage_collect(share_cache);
}

/**
 * Flush all of the local cache entries.
 *
 * @param mr_ctrl
 *   Pointer to per-queue MR local cache.
 */
void
mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
{
	/* Reset the most-recently-used index. */
	mr_ctrl->mru = 0;
	/* Reset the linear search array. */
	mr_ctrl->head = 0;
	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
	/* Reset the B-tree table. */
	mr_ctrl->cache_bh.len = 1;
	mr_ctrl->cache_bh.overflow = 0;
	/* Update the generation number. */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}
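
#if 0
/*
 * Illustrative sketch (not compiled): how the generation number is meant to
 * be consumed, as a hypothetical datapath check. The control path bumps the
 * device generation when the global cache is rebuilt; a queue whose cur_gen
 * lags behind flushes its local cache before trusting any cached LKey.
 */
static inline void
example_check_generation(struct mlx5_mr_ctrl *mr_ctrl)
{
	if (unlikely(mr_ctrl->cur_gen != *mr_ctrl->dev_gen_ptr))
		mlx5_mr_flush_local_cache(mr_ctrl);
}
#endif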

/**
 * Create a memory region for external memory, that is, memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to ibv_pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(struct ibv_pd *pd, uintptr_t addr, size_t len, int socket_id)
{
	struct mlx5_mr *mr = NULL;

	mr = rte_zmalloc_socket(NULL,
				RTE_ALIGN_CEIL(sizeof(*mr),
					       RTE_CACHE_LINE_SIZE),
				RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	mr->ibv_mr = mlx5_glue->reg_mr(pd, (void *)addr, len,
				       IBV_ACCESS_LOCAL_WRITE |
					   IBV_ACCESS_RELAXED_ORDERING);
	if (mr->ibv_mr == NULL) {
		DRV_LOG(WARNING,
			"Fail to create a verbs MR for address (%p)",
			(void *)addr);
		rte_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark it is external memory. */
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		"  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}
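
#if 0
/*
 * Illustrative usage sketch (not compiled): registering an externally
 * allocated buffer, assuming a hypothetical buffer obtained outside the
 * DPDK heaps. The resulting MR has no memseg list (mr->msl == NULL), so
 * mr_find_next_chunk() reports it as a single chunk taken straight from
 * the verbs MR. Publishing it in the shared cache mirrors what the DMA
 * mapping code is expected to do; the locking shown is an assumption of
 * this sketch.
 */
static void
example_register_external(struct ibv_pd *pd,
			  struct mlx5_mr_share_cache *share_cache,
			  void *buf, size_t len)
{
	struct mlx5_mr *mr;

	mr = mlx5_create_mr_ext(pd, (uintptr_t)buf, len, SOCKET_ID_ANY);
	if (mr == NULL)
		return;
	/* Publish it so datapath lookups can find the LKey. */
	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
	mlx5_mr_insert_cache(share_cache, mr);
	rte_rwlock_write_unlock(&share_cache->rwlock);
}
#endif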

/**
 * Dump all the created MRs and the global cache entries.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	struct mlx5_mr *mr;
	int mr_n = 0;
	int chunk_n = 0;

	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		DEBUG("MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
		      mr_n++, rte_cpu_to_be_32(mr->ibv_mr->lkey),
		      mr->ms_n, mr->ms_bmp_n);
		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret = { 0, };

			n = mr_find_next_chunk(mr, &ret, n);
			if (!ret.end)
				break;
			DEBUG("  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
			      chunk_n++, ret.start, ret.end);
		}
	}
	DEBUG("Dumping global cache %p", (void *)share_cache);
	mlx5_mr_btree_dump(&share_cache->cache);
	rte_rwlock_read_unlock(&share_cache->rwlock);
#endif
}
1109