xref: /dpdk/drivers/common/mlx5/mlx5_common_mr.c (revision 1dc6665d364b06ad44423f9dfac3818924950593)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2016 6WIND S.A.
3  * Copyright 2020 Mellanox Technologies, Ltd
4  */
5 #include <stddef.h>
6 
7 #include <rte_eal_memconfig.h>
8 #include <rte_eal_paging.h>
9 #include <rte_errno.h>
10 #include <rte_mempool.h>
11 #include <rte_malloc.h>
12 #include <rte_rwlock.h>
13 
14 #include "mlx5_glue.h"
15 #include "mlx5_common_mp.h"
16 #include "mlx5_common_mr.h"
17 #include "mlx5_common_log.h"
18 #include "mlx5_malloc.h"
19 
20 struct mr_find_contig_memsegs_data {
21 	uintptr_t addr;
22 	uintptr_t start;
23 	uintptr_t end;
24 	const struct rte_memseg_list *msl;
25 };
26 
27 /* Virtual memory range. */
28 struct mlx5_range {
29 	uintptr_t start;
30 	uintptr_t end;
31 };
32 
33 /** Memory region for a mempool. */
34 struct mlx5_mempool_mr {
35 	struct mlx5_pmd_mr pmd_mr;
36 	uint32_t refcnt; /**< Number of mempools sharing this MR. */
37 };
38 
39 /* Mempool registration. */
40 struct mlx5_mempool_reg {
41 	LIST_ENTRY(mlx5_mempool_reg) next;
42 	/** Registered mempool, used to designate registrations. */
43 	struct rte_mempool *mp;
44 	/** Memory regions for the address ranges of the mempool. */
45 	struct mlx5_mempool_mr *mrs;
46 	/** Number of memory regions. */
47 	unsigned int mrs_n;
48 };
49 
50 /**
51  * Expand B-tree table to a given size. Must not be called while holding
52  * memory_hotplug_lock or share_cache.rwlock because it uses rte_realloc().
53  *
54  * @param bt
55  *   Pointer to B-tree structure.
56  * @param n
57  *   Number of entries for expansion.
58  *
59  * @return
60  *   0 on success, -1 on failure.
61  */
62 static int
63 mr_btree_expand(struct mlx5_mr_btree *bt, int n)
64 {
65 	void *mem;
66 	int ret = 0;
67 
68 	if (n <= bt->size)
69 		return ret;
70 	/*
71 	 * The downside of using rte_realloc() directly is that SOCKET_ID_ANY
72 	 * is used internally if there is no room to expand in place. Because
73 	 * this is a rare case on a very slow path, it is acceptable.
74 	 * Initially cache_bh[] is given enough space in practice, and once it
75 	 * has been expanded, further expansion should never be needed.
76 	 */
77 	mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
78 			   n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
79 	if (mem == NULL) {
80 		/* Not an error, B-tree search will be skipped. */
81 		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
82 			(void *)bt);
83 		ret = -1;
84 	} else {
85 		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
86 		bt->table = mem;
87 		bt->size = n;
88 	}
89 	return ret;
90 }
91 
92 /**
93  * Look up LKey from given B-tree lookup table, store the last index and return
94  * searched LKey.
95  *
96  * @param bt
97  *   Pointer to B-tree structure.
98  * @param[out] idx
99  *   Pointer to index. Even on search failure, the index where the search
100  *   stopped is stored so that it can be used when inserting a new entry.
101  * @param addr
102  *   Search key.
103  *
104  * @return
105  *   Searched LKey on success, UINT32_MAX on no match.
106  */
107 static uint32_t
108 mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
109 {
110 	struct mr_cache_entry *lkp_tbl;
111 	uint16_t n;
112 	uint16_t base = 0;
113 
114 	MLX5_ASSERT(bt != NULL);
115 	lkp_tbl = *bt->table;
116 	n = bt->len;
117 	/* First entry must be NULL for comparison. */
118 	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
119 				    lkp_tbl[0].lkey == UINT32_MAX));
120 	/* Binary search. */
121 	do {
122 		register uint16_t delta = n >> 1;
123 
124 		if (addr < lkp_tbl[base + delta].start) {
125 			n = delta;
126 		} else {
127 			base += delta;
128 			n -= delta;
129 		}
130 	} while (n > 1);
131 	MLX5_ASSERT(addr >= lkp_tbl[base].start);
132 	*idx = base;
133 	if (addr < lkp_tbl[base].end)
134 		return lkp_tbl[base].lkey;
135 	/* Not found. */
136 	return UINT32_MAX;
137 }
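/*
 * Illustrative example (not part of the driver): given a table built by
 * mr_btree_insert() such as
 *
 *   idx  start    end      lkey
 *   [0]  0x0      0x0      UINT32_MAX   (sentinel NULL entry)
 *   [1]  0x10000  0x20000  0x1234
 *   [2]  0x40000  0x50000  0x5678
 *
 * mr_btree_lookup(bt, &idx, 0x15000) returns 0x1234 with idx == 1, while
 * mr_btree_lookup(bt, &idx, 0x30000) returns UINT32_MAX with idx == 1,
 * i.e. the slot after which a new entry covering 0x30000 would be inserted.
 */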
138 
139 /**
140  * Insert an entry to B-tree lookup table.
141  *
142  * @param bt
143  *   Pointer to B-tree structure.
144  * @param entry
145  *   Pointer to new entry to insert.
146  *
147  * @return
148  *   0 on success, -1 on failure.
149  */
150 static int
151 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
152 {
153 	struct mr_cache_entry *lkp_tbl;
154 	uint16_t idx = 0;
155 	size_t shift;
156 
157 	MLX5_ASSERT(bt != NULL);
158 	MLX5_ASSERT(bt->len <= bt->size);
159 	MLX5_ASSERT(bt->len > 0);
160 	lkp_tbl = *bt->table;
161 	/* Find out the slot for insertion. */
162 	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
163 		DRV_LOG(DEBUG,
164 			"abort insertion to B-tree(%p): already exists at"
165 			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
166 			(void *)bt, idx, entry->start, entry->end, entry->lkey);
167 		/* Already exists, return. */
168 		return 0;
169 	}
170 	/* If table is full, return error. */
171 	if (unlikely(bt->len == bt->size)) {
172 		bt->overflow = 1;
173 		return -1;
174 	}
175 	/* Insert entry. */
176 	++idx;
177 	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
178 	if (shift)
179 		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
180 	lkp_tbl[idx] = *entry;
181 	bt->len++;
182 	DRV_LOG(DEBUG,
183 		"inserted B-tree(%p)[%u],"
184 		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
185 		(void *)bt, idx, entry->start, entry->end, entry->lkey);
186 	return 0;
187 }
188 
189 /**
190  * Initialize B-tree and allocate memory for lookup table.
191  *
192  * @param bt
193  *   Pointer to B-tree structure.
194  * @param n
195  *   Number of entries to allocate.
196  * @param socket
197  *   NUMA socket on which memory must be allocated.
198  *
199  * @return
200  *   0 on success, a negative errno value otherwise and rte_errno is set.
201  */
202 int
203 mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
204 {
205 	if (bt == NULL) {
206 		rte_errno = EINVAL;
207 		return -rte_errno;
208 	}
209 	MLX5_ASSERT(!bt->table && !bt->size);
210 	memset(bt, 0, sizeof(*bt));
211 	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
212 				sizeof(struct mr_cache_entry) * n,
213 				0, socket);
214 	if (bt->table == NULL) {
215 		rte_errno = ENOMEM;
216 		DRV_LOG(DEBUG,
217 			"failed to allocate memory for btree cache on socket "
218 			"%d", socket);
219 		return -rte_errno;
220 	}
221 	bt->size = n;
222 	/* First entry must be NULL for binary search. */
223 	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
224 		.lkey = UINT32_MAX,
225 	};
226 	DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
227 	      (void *)bt, (void *)bt->table);
228 	return 0;
229 }
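/*
 * Illustrative sketch (not part of the driver): a typical initialization of a
 * per-queue bottom-half cache. MLX5_MR_BTREE_CACHE_N is assumed to come from
 * mlx5_common_mr.h and queue_mr_ctrl_init() is a hypothetical helper:
 *
 *	static int
 *	queue_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
 *			   int socket)
 *	{
 *		mr_ctrl->dev_gen_ptr = dev_gen_ptr;
 *		return mlx5_mr_btree_init(&mr_ctrl->cache_bh,
 *					  MLX5_MR_BTREE_CACHE_N, socket);
 *	}
 */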
230 
231 /**
232  * Free B-tree resources.
233  *
234  * @param bt
235  *   Pointer to B-tree structure.
236  */
237 void
238 mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
239 {
240 	if (bt == NULL)
241 		return;
242 	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
243 	      (void *)bt, (void *)bt->table);
244 	mlx5_free(bt->table);
245 	memset(bt, 0, sizeof(*bt));
246 }
247 
248 /**
249  * Dump all the entries in a B-tree.
250  *
251  * @param bt
252  *   Pointer to B-tree structure.
253  */
254 void
255 mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
256 {
257 #ifdef RTE_LIBRTE_MLX5_DEBUG
258 	int idx;
259 	struct mr_cache_entry *lkp_tbl;
260 
261 	if (bt == NULL)
262 		return;
263 	lkp_tbl = *bt->table;
264 	for (idx = 0; idx < bt->len; ++idx) {
265 		struct mr_cache_entry *entry = &lkp_tbl[idx];
266 
267 		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
268 		      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
269 		      (void *)bt, idx, entry->start, entry->end, entry->lkey);
270 	}
271 #endif
272 }
273 
274 /**
275  * Find a virtually contiguous memory chunk in a given MR.
276  *
277  * @param mr
278  *   Pointer to MR structure.
279  * @param[out] entry
280  *   Pointer to returning MR cache entry. If not found, this will not be
281  *   updated.
282  * @param base_idx
283  *   Start index of the memseg bitmap.
284  *
285  * @return
286  *   Next index to continue the lookup from.
287  */
288 static int
289 mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
290 		   int base_idx)
291 {
292 	uintptr_t start = 0;
293 	uintptr_t end = 0;
294 	uint32_t idx = 0;
295 
296 	/* MR for external memory doesn't have memseg list. */
297 	if (mr->msl == NULL) {
298 		MLX5_ASSERT(mr->ms_bmp_n == 1);
299 		MLX5_ASSERT(mr->ms_n == 1);
300 		MLX5_ASSERT(base_idx == 0);
301 		/*
302 		 * Can't search it from memseg list but get it directly from
303 		 * pmd_mr as there's only one chunk.
304 		 */
305 		entry->start = (uintptr_t)mr->pmd_mr.addr;
306 		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
307 		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
308 		/* Returning 1 ends iteration. */
309 		return 1;
310 	}
311 	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
312 		if (rte_bitmap_get(mr->ms_bmp, idx)) {
313 			const struct rte_memseg_list *msl;
314 			const struct rte_memseg *ms;
315 
316 			msl = mr->msl;
317 			ms = rte_fbarray_get(&msl->memseg_arr,
318 					     mr->ms_base_idx + idx);
319 			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
320 			if (!start)
321 				start = ms->addr_64;
322 			end = ms->addr_64 + ms->hugepage_sz;
323 		} else if (start) {
324 			/* Passed the end of a fragment. */
325 			break;
326 		}
327 	}
328 	if (start) {
329 		/* Found one chunk. */
330 		entry->start = start;
331 		entry->end = end;
332 		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
333 	}
334 	return idx;
335 }
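/*
 * Illustrative example (not part of the driver): for an MR whose memseg
 * bitmap is {1, 1, 0, 1} with page size P, the first call with base_idx 0
 * fills the entry with a chunk spanning the first two pages and returns 2;
 * the next call with base_idx 2 fills the entry with the last page and
 * returns 4, ending the iteration.
 */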
336 
337 /**
338  * Insert an MR into the global B-tree cache. It may fail if memory is low.
339  * In that case, the entry will have to be found by mlx5_mr_lookup_list() in
340  * mlx5_mr_create() on a cache miss.
341  *
342  * @param share_cache
343  *   Pointer to a global shared MR cache.
344  * @param mr
345  *   Pointer to MR to insert.
346  *
347  * @return
348  *   0 on success, -1 on failure.
349  */
350 int
351 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
352 		     struct mlx5_mr *mr)
353 {
354 	unsigned int n;
355 
356 	DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
357 		(void *)mr, (void *)share_cache);
358 	for (n = 0; n < mr->ms_bmp_n; ) {
359 		struct mr_cache_entry entry;
360 
361 		memset(&entry, 0, sizeof(entry));
362 		/* Find a contiguous chunk and advance the index. */
363 		n = mr_find_next_chunk(mr, &entry, n);
364 		if (!entry.end)
365 			break;
366 		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
367 			/*
368 			 * Overflowed, but the global table cannot be expanded
369 			 * here because doing so could deadlock.
370 			 */
371 			return -1;
372 		}
373 	}
374 	return 0;
375 }
376 
377 /**
378  * Look up address in the original global MR list.
379  *
380  * @param share_cache
381  *   Pointer to a global shared MR cache.
382  * @param[out] entry
383  *   Pointer to returning MR cache entry. If no match, this will not be updated.
384  * @param addr
385  *   Search key.
386  *
387  * @return
388  *   Found MR on match, NULL otherwise.
389  */
390 struct mlx5_mr *
391 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
392 		    struct mr_cache_entry *entry, uintptr_t addr)
393 {
394 	struct mlx5_mr *mr;
395 
396 	/* Iterate all the existing MRs. */
397 	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
398 		unsigned int n;
399 
400 		if (mr->ms_n == 0)
401 			continue;
402 		for (n = 0; n < mr->ms_bmp_n; ) {
403 			struct mr_cache_entry ret;
404 
405 			memset(&ret, 0, sizeof(ret));
406 			n = mr_find_next_chunk(mr, &ret, n);
407 			if (addr >= ret.start && addr < ret.end) {
408 				/* Found. */
409 				*entry = ret;
410 				return mr;
411 			}
412 		}
413 	}
414 	return NULL;
415 }
416 
417 /**
418  * Look up an address in the global MR cache.
419  *
420  * @param share_cache
421  *   Pointer to a global shared MR cache.
422  * @param[out] entry
423  *   Pointer to returning MR cache entry. If no match, this will not be updated.
424  * @param addr
425  *   Search key.
426  *
427  * @return
428  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
429  */
430 uint32_t
431 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
432 		     struct mr_cache_entry *entry, uintptr_t addr)
433 {
434 	uint16_t idx;
435 	uint32_t lkey = UINT32_MAX;
436 	struct mlx5_mr *mr;
437 
438 	/*
439 	 * If the global cache has overflowed because it failed to expand the
440 	 * B-tree table, it can't contain all the existing MRs. Then the address
441 	 * has to be found by traversing the original MR list instead, which is
442 	 * a very slow path. Otherwise, the global cache is all-inclusive.
443 	 */
444 	if (!unlikely(share_cache->cache.overflow)) {
445 		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
446 		if (lkey != UINT32_MAX)
447 			*entry = (*share_cache->cache.table)[idx];
448 	} else {
449 		/* Falling back to the slowest path. */
450 		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
451 		if (mr != NULL)
452 			lkey = entry->lkey;
453 	}
454 	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
455 					   addr < entry->end));
456 	return lkey;
457 }
458 
459 /**
460  * Free MR resources. The MR lock must not be held to avoid a deadlock, as
461  * rte_free() can raise a memory free event whose callback spins on the lock.
462  *
463  * @param mr
464  *   Pointer to MR to free.
465  */
466 void
467 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
468 {
469 	if (mr == NULL)
470 		return;
471 	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
472 	dereg_mr_cb(&mr->pmd_mr);
473 	if (mr->ms_bmp != NULL)
474 		rte_bitmap_free(mr->ms_bmp);
475 	mlx5_free(mr);
476 }
477 
478 void
479 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
480 {
481 	struct mlx5_mr *mr;
482 
483 	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
484 	/* Flush cache to rebuild. */
485 	share_cache->cache.len = 1;
486 	share_cache->cache.overflow = 0;
487 	/* Iterate all the existing MRs. */
488 	LIST_FOREACH(mr, &share_cache->mr_list, mr)
489 		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
490 			return;
491 }
492 
493 /**
494  * Release the resources of detached MRs that have no online entry.
495  *
496  * @param share_cache
497  *   Pointer to a global shared MR cache.
498  */
499 static void
500 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
501 {
502 	struct mlx5_mr *mr_next;
503 	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
504 
505 	/* Must be called from the primary process. */
506 	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
507 	/*
508 	 * MRs can't be freed while holding the lock because rte_free() could
509 	 * call the memory free callback function, which would deadlock.
510 	 */
511 	rte_rwlock_write_lock(&share_cache->rwlock);
512 	/* Detach the whole free list and release it after unlocking. */
513 	free_list = share_cache->mr_free_list;
514 	LIST_INIT(&share_cache->mr_free_list);
515 	rte_rwlock_write_unlock(&share_cache->rwlock);
516 	/* Release resources. */
517 	mr_next = LIST_FIRST(&free_list);
518 	while (mr_next != NULL) {
519 		struct mlx5_mr *mr = mr_next;
520 
521 		mr_next = LIST_NEXT(mr, mr);
522 		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
523 	}
524 }
525 
526 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
527 static int
528 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
529 			  const struct rte_memseg *ms, size_t len, void *arg)
530 {
531 	struct mr_find_contig_memsegs_data *data = arg;
532 
533 	if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
534 		return 0;
535 	/* Found, save it and stop walking. */
536 	data->start = ms->addr_64;
537 	data->end = ms->addr_64 + len;
538 	data->msl = msl;
539 	return 1;
540 }
541 
542 /**
543  * Create a new global Memory Region (MR) for a missing virtual address.
544  * This API should be called by a secondary process; a request is then sent
545  * to the primary process to create an MR for the address. As the global MR
546  * list is in shared memory, the following LKey lookup should succeed unless
547  * the request fails.
548  *
549  * @param pd
550  *   Pointer to pd of a device (net, regex, vdpa,...).
551  * @param share_cache
552  *   Pointer to a global shared MR cache.
553  * @param[out] entry
554  *   Pointer to returning MR cache entry, found in the global cache or newly
555  *   created. If failed to create one, this will not be updated.
556  * @param addr
557  *   Target virtual address to register.
558  * @param mr_ext_memseg_en
559  *   Configurable flag: if set, register the whole contiguous memseg chunk.
560  *
561  * @return
562  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
563  */
564 static uint32_t
565 mlx5_mr_create_secondary(void *pd __rte_unused,
566 			 struct mlx5_mp_id *mp_id,
567 			 struct mlx5_mr_share_cache *share_cache,
568 			 struct mr_cache_entry *entry, uintptr_t addr,
569 			 unsigned int mr_ext_memseg_en __rte_unused)
570 {
571 	int ret;
572 
573 	DRV_LOG(DEBUG, "port %u requesting MR creation for address (%p)",
574 	      mp_id->port_id, (void *)addr);
575 	ret = mlx5_mp_req_mr_create(mp_id, addr);
576 	if (ret) {
577 		DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)",
578 		      (void *)addr);
579 		return UINT32_MAX;
580 	}
581 	rte_rwlock_read_lock(&share_cache->rwlock);
582 	/* Fill in output data. */
583 	mlx5_mr_lookup_cache(share_cache, entry, addr);
584 	/* Lookup can't fail. */
585 	MLX5_ASSERT(entry->lkey != UINT32_MAX);
586 	rte_rwlock_read_unlock(&share_cache->rwlock);
587 	DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n"
588 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
589 	      (void *)addr, entry->start, entry->end, entry->lkey);
590 	return entry->lkey;
591 }
592 
593 /**
594  * Create a new global Memory Region (MR) for a missing virtual address.
595  * Register the entire virtually contiguous memory chunk around the address.
596  *
597  * @param pd
598  *   Pointer to pd of a device (net, regex, vdpa,...).
599  * @param share_cache
600  *   Pointer to a global shared MR cache.
601  * @param[out] entry
602  *   Pointer to returning MR cache entry, found in the global cache or newly
603  *   created. If failed to create one, this will not be updated.
604  * @param addr
605  *   Target virtual address to register.
606  * @param mr_ext_memseg_en
607  *   Configurable flag: if set, register the whole contiguous memseg chunk.
608  *
609  * @return
610  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
611  */
612 uint32_t
613 mlx5_mr_create_primary(void *pd,
614 		       struct mlx5_mr_share_cache *share_cache,
615 		       struct mr_cache_entry *entry, uintptr_t addr,
616 		       unsigned int mr_ext_memseg_en)
617 {
618 	struct mr_find_contig_memsegs_data data = {.addr = addr, };
619 	struct mr_find_contig_memsegs_data data_re;
620 	const struct rte_memseg_list *msl;
621 	const struct rte_memseg *ms;
622 	struct mlx5_mr *mr = NULL;
623 	int ms_idx_shift = -1;
624 	uint32_t bmp_size;
625 	void *bmp_mem;
626 	uint32_t ms_n;
627 	uint32_t n;
628 	size_t len;
629 
630 	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
631 	/*
632 	 * Release detached MRs, if any. This can't be done while holding either
633 	 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have
634 	 * been detached by the memory free event but could not be released
635 	 * inside the callback because that would deadlock. As a result,
636 	 * releasing resources is opportunistic.
637 	 */
638 	mlx5_mr_garbage_collect(share_cache);
639 	/*
640 	 * If enabled, find the in-use contiguous virtual address chunk to which
641 	 * the given address belongs, in order to register the maximum range.
642 	 * In the best case, where mempools are not dynamically recreated and
643 	 * '--socket-mem' is specified as an EAL option, it is very likely to
644 	 * end up with only one MR (LKey) per socket and per hugepage size, even
645 	 * if the system memory is highly fragmented. As the whole memory chunk
646 	 * will be pinned by the kernel, it can't be reused until the entire
647 	 * chunk is freed from EAL.
648 	 *
649 	 * If disabled, just register one memseg (page). Memory consumption is
650 	 * then minimized, but performance may drop if there are many MRs to
651 	 * look up on the datapath.
652 	 */
653 	if (!mr_ext_memseg_en) {
654 		data.msl = rte_mem_virt2memseg_list((void *)addr);
655 		data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
656 		data.end = data.start + data.msl->page_sz;
657 	} else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
658 		DRV_LOG(WARNING,
659 			"Unable to find virtually contiguous"
660 			" chunk for address (%p)."
661 			" rte_memseg_contig_walk() failed.", (void *)addr);
662 		rte_errno = ENXIO;
663 		goto err_nolock;
664 	}
665 alloc_resources:
666 	/* Addresses must be page-aligned. */
667 	MLX5_ASSERT(data.msl);
668 	MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
669 	MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
670 	msl = data.msl;
671 	ms = rte_mem_virt2memseg((void *)data.start, msl);
672 	len = data.end - data.start;
673 	MLX5_ASSERT(ms);
674 	MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
675 	/* Number of memsegs in the range. */
676 	ms_n = len / msl->page_sz;
677 	DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
678 	      " page_sz=0x%" PRIx64 ", ms_n=%u",
679 	      (void *)addr, data.start, data.end, msl->page_sz, ms_n);
680 	/* Size of memory for bitmap. */
681 	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
682 	mr = mlx5_malloc(MLX5_MEM_RTE |  MLX5_MEM_ZERO,
683 			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
684 			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
685 	if (mr == NULL) {
686 		DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of"
687 		      " address (%p).", (void *)addr);
688 		rte_errno = ENOMEM;
689 		goto err_nolock;
690 	}
691 	mr->msl = msl;
692 	/*
693 	 * Save the index of the first memseg and initialize memseg bitmap. To
694 	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
695 	 *	rte_bitmap_get(mr->ms_bmp, ms_idx - mr->ms_base_idx)
696 	 */
697 	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
698 	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
699 	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
700 	if (mr->ms_bmp == NULL) {
701 		DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of"
702 		      " address (%p).", (void *)addr);
703 		rte_errno = EINVAL;
704 		goto err_nolock;
705 	}
706 	/*
707 	 * Recheck whether the extended contiguous chunk is still valid.
708 	 * Because memory_hotplug_lock can't be held across memory-related
709 	 * calls in a critical path, the resource allocation above couldn't be
710 	 * done under the lock. If the memory has changed at this point, try
711 	 * again with just a single page. If not, go on with the big chunk
712 	 * atomically from here.
713 	 */
714 	rte_mcfg_mem_read_lock();
715 	data_re = data;
716 	if (len > msl->page_sz &&
717 	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
718 		DRV_LOG(DEBUG,
719 			"Unable to find virtually contiguous chunk for address "
720 			"(%p). rte_memseg_contig_walk() failed.", (void *)addr);
721 		rte_errno = ENXIO;
722 		goto err_memlock;
723 	}
724 	if (data.start != data_re.start || data.end != data_re.end) {
725 		/*
726 		 * The extended contiguous chunk has been changed. Try again
727 		 * with a single memseg instead.
728 		 */
729 		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
730 		data.end = data.start + msl->page_sz;
731 		rte_mcfg_mem_read_unlock();
732 		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
733 		goto alloc_resources;
734 	}
735 	MLX5_ASSERT(data.msl == data_re.msl);
736 	rte_rwlock_write_lock(&share_cache->rwlock);
737 	/*
738 	 * Check that the address is really missing. If another thread already
739 	 * created an MR for it, or it is not found due to overflow, abort.
740 	 */
741 	if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
742 		/*
743 		 * Insert into the global cache table. It may fail if memory
744 		 * is low. In that case, this entry will have to be looked up
745 		 * here again.
746 		 */
747 		mr_btree_insert(&share_cache->cache, entry);
748 		DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort",
749 			(void *)addr);
750 		rte_rwlock_write_unlock(&share_cache->rwlock);
751 		rte_mcfg_mem_read_unlock();
752 		/*
753 		 * Must be unlocked before calling rte_free() because
754 		 * mlx5_mr_mem_event_free_cb() can be called inside.
755 		 */
756 		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
757 		return entry->lkey;
758 	}
759 	/*
760 	 * Trim the start and end addresses for the verbs MR. Set bits for the
761 	 * memsegs to be registered, excluding already registered ones. The
762 	 * bitmap can be fragmented.
763 	 */
764 	for (n = 0; n < ms_n; ++n) {
765 		uintptr_t start;
766 		struct mr_cache_entry ret;
767 
768 		memset(&ret, 0, sizeof(ret));
769 		start = data_re.start + n * msl->page_sz;
770 		/* Exclude memsegs already registered by other MRs. */
771 		if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
772 		    UINT32_MAX) {
773 			/*
774 			 * Start from the first unregistered memseg in the
775 			 * extended range.
776 			 */
777 			if (ms_idx_shift == -1) {
778 				mr->ms_base_idx += n;
779 				data.start = start;
780 				ms_idx_shift = n;
781 			}
782 			data.end = start + msl->page_sz;
783 			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
784 			++mr->ms_n;
785 		}
786 	}
787 	len = data.end - data.start;
788 	mr->ms_bmp_n = len / msl->page_sz;
789 	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
790 	/*
791 	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
792 	 * be called while holding the memory lock because it doesn't use
793 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
794 	 * through mlx5_alloc_verbs_buf().
795 	 */
796 	share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
797 	if (mr->pmd_mr.obj == NULL) {
798 		DRV_LOG(DEBUG, "Fail to create an MR for address (%p)",
799 		      (void *)addr);
800 		rte_errno = EINVAL;
801 		goto err_mrlock;
802 	}
803 	MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start);
804 	MLX5_ASSERT(mr->pmd_mr.len);
805 	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
806 	DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n"
807 	      "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
808 	      " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
809 	      (void *)mr, (void *)addr, data.start, data.end,
810 	      rte_cpu_to_be_32(mr->pmd_mr.lkey),
811 	      mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
812 	/* Insert to the global cache table. */
813 	mlx5_mr_insert_cache(share_cache, mr);
814 	/* Fill in output data. */
815 	mlx5_mr_lookup_cache(share_cache, entry, addr);
816 	/* Lookup can't fail. */
817 	MLX5_ASSERT(entry->lkey != UINT32_MAX);
818 	rte_rwlock_write_unlock(&share_cache->rwlock);
819 	rte_mcfg_mem_read_unlock();
820 	return entry->lkey;
821 err_mrlock:
822 	rte_rwlock_write_unlock(&share_cache->rwlock);
823 err_memlock:
824 	rte_mcfg_mem_read_unlock();
825 err_nolock:
826 	/*
827 	 * In case of error, as this can be called on a datapath, a warning
828 	 * message per error is preferable. The locks must not be held when
829 	 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
830 	 * inside.
831 	 */
832 	mlx5_mr_free(mr, share_cache->dereg_mr_cb);
833 	return UINT32_MAX;
834 }
835 
836 /**
837  * Create a new global Memory Region (MR) for a missing virtual address.
838  * This can be called from both primary and secondary processes.
839  *
840  * @param pd
841  *   Pointer to pd handle of a device (net, regex, vdpa,...).
842  * @param share_cache
843  *   Pointer to a global shared MR cache.
844  * @param[out] entry
845  *   Pointer to returning MR cache entry, found in the global cache or newly
846  *   created. If failed to create one, this will not be updated.
847  * @param addr
848  *   Target virtual address to register.
849  *
850  * @return
851  *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
852  */
853 static uint32_t
854 mlx5_mr_create(void *pd, struct mlx5_mp_id *mp_id,
855 	       struct mlx5_mr_share_cache *share_cache,
856 	       struct mr_cache_entry *entry, uintptr_t addr,
857 	       unsigned int mr_ext_memseg_en)
858 {
859 	uint32_t ret = 0;
860 
861 	switch (rte_eal_process_type()) {
862 	case RTE_PROC_PRIMARY:
863 		ret = mlx5_mr_create_primary(pd, share_cache, entry,
864 					     addr, mr_ext_memseg_en);
865 		break;
866 	case RTE_PROC_SECONDARY:
867 		ret = mlx5_mr_create_secondary(pd, mp_id, share_cache, entry,
868 					       addr, mr_ext_memseg_en);
869 		break;
870 	default:
871 		break;
872 	}
873 	return ret;
874 }
875 
876 /**
877  * Look up address in the global MR cache table. If not found, create a new MR.
878  * Insert the found/created entry into the local bottom-half cache table.
879  *
880  * @param pd
881  *   Pointer to pd of a device (net, regex, vdpa,...).
882  * @param share_cache
883  *   Pointer to a global shared MR cache.
884  * @param mr_ctrl
885  *   Pointer to per-queue MR control structure.
886  * @param[out] entry
887  *   Pointer to returning MR cache entry, found in the global cache or newly
888  *   created. If failed to create one, this is not written.
889  * @param addr
890  *   Search key.
891  *
892  * @return
893  *   Searched LKey on success, UINT32_MAX on no match.
894  */
895 static uint32_t
896 mr_lookup_caches(void *pd, struct mlx5_mp_id *mp_id,
897 		 struct mlx5_mr_share_cache *share_cache,
898 		 struct mlx5_mr_ctrl *mr_ctrl,
899 		 struct mr_cache_entry *entry, uintptr_t addr,
900 		 unsigned int mr_ext_memseg_en)
901 {
902 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
903 	uint32_t lkey;
904 	uint16_t idx;
905 
906 	/* If local cache table is full, try to double it. */
907 	if (unlikely(bt->len == bt->size))
908 		mr_btree_expand(bt, bt->size << 1);
909 	/* Look up in the global cache. */
910 	rte_rwlock_read_lock(&share_cache->rwlock);
911 	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
912 	if (lkey != UINT32_MAX) {
913 		/* Found. */
914 		*entry = (*share_cache->cache.table)[idx];
915 		rte_rwlock_read_unlock(&share_cache->rwlock);
916 		/*
917 		 * Update local cache. Even if it fails, return the found entry
918 		 * to update top-half cache. Next time, this entry will be found
919 		 * in the global cache.
920 		 */
921 		mr_btree_insert(bt, entry);
922 		return lkey;
923 	}
924 	rte_rwlock_read_unlock(&share_cache->rwlock);
925 	/* First time to see the address? Create a new MR. */
926 	lkey = mlx5_mr_create(pd, mp_id, share_cache, entry, addr,
927 			      mr_ext_memseg_en);
928 	/*
929 	 * Update the local cache if a new global MR was successfully created.
930 	 * Even if creation failed, there's no action to take in this datapath
931 	 * code: the returned LKey is invalid and will eventually make the HW
932 	 * fail.
933 	 */
934 	if (lkey != UINT32_MAX)
935 		mr_btree_insert(bt, entry);
936 	return lkey;
937 }
938 
939 /**
940  * Bottom half of the LKey search on the datapath. First search cache_bh[];
941  * on a miss, search the global MR cache table and add the new entry to the
942  * per-queue local caches.
943  *
944  * @param pd
945  *   Pointer to pd of a device (net, regex, vdpa,...).
946  * @param share_cache
947  *   Pointer to a global shared MR cache.
948  * @param mr_ctrl
949  *   Pointer to per-queue MR control structure.
950  * @param addr
951  *   Search key.
952  *
953  * @return
954  *   Searched LKey on success, UINT32_MAX on no match.
955  */
956 uint32_t mlx5_mr_addr2mr_bh(void *pd, struct mlx5_mp_id *mp_id,
957 			    struct mlx5_mr_share_cache *share_cache,
958 			    struct mlx5_mr_ctrl *mr_ctrl,
959 			    uintptr_t addr, unsigned int mr_ext_memseg_en)
960 {
961 	uint32_t lkey;
962 	uint16_t bh_idx = 0;
963 	/* Victim in top-half cache to replace with new entry. */
964 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
965 
966 	/* Binary-search MR translation table. */
967 	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
968 	/* Update top-half cache. */
969 	if (likely(lkey != UINT32_MAX)) {
970 		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
971 	} else {
972 		/*
973 		 * If missed in local lookup table, search in the global cache
974 		 * and local cache_bh[] will be updated inside if possible.
975 		 * Top-half cache entry will also be updated.
976 		 */
977 		lkey = mr_lookup_caches(pd, mp_id, share_cache, mr_ctrl,
978 					repl, addr, mr_ext_memseg_en);
979 		if (unlikely(lkey == UINT32_MAX))
980 			return UINT32_MAX;
981 	}
982 	/* Update the most recently used entry. */
983 	mr_ctrl->mru = mr_ctrl->head;
984 	/* Point to the next victim, the oldest. */
985 	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
986 	return lkey;
987 }
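/*
 * Illustrative sketch (not part of the driver): how a datapath caller layers
 * the caches. The hot path first scans the small top-half array
 * (mr_ctrl->cache[], starting from the most recently used slot) and only
 * falls back to this bottom half on a miss. Hypothetical helper, assuming
 * the mr_ctrl layout used in this file:
 *
 *	static inline uint32_t
 *	queue_addr2mr(void *pd, struct mlx5_mp_id *mp_id,
 *		      struct mlx5_mr_share_cache *share_cache,
 *		      struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr,
 *		      unsigned int mr_ext_memseg_en)
 *	{
 *		unsigned int i = mr_ctrl->mru;
 *
 *		do {
 *			struct mr_cache_entry *e = &mr_ctrl->cache[i];
 *
 *			if (likely(e->start <= addr && addr < e->end)) {
 *				mr_ctrl->mru = i;
 *				return e->lkey;
 *			}
 *			i = (i + 1) % MLX5_MR_CACHE_N;
 *		} while (i != mr_ctrl->mru);
 *		return mlx5_mr_addr2mr_bh(pd, mp_id, share_cache, mr_ctrl,
 *					  addr, mr_ext_memseg_en);
 *	}
 */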
988 
989 /**
990  * Release all the created MRs and resources in the global MR cache of a
991  * device.
992  *
993  * @param share_cache
994  *   Pointer to a global shared MR cache.
995  */
996 void
997 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
998 {
999 	struct mlx5_mr *mr_next;
1000 
1001 	rte_rwlock_write_lock(&share_cache->rwlock);
1002 	/* Detach from MR list and move to free list. */
1003 	mr_next = LIST_FIRST(&share_cache->mr_list);
1004 	while (mr_next != NULL) {
1005 		struct mlx5_mr *mr = mr_next;
1006 
1007 		mr_next = LIST_NEXT(mr, mr);
1008 		LIST_REMOVE(mr, mr);
1009 		LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
1010 	}
1011 	LIST_INIT(&share_cache->mr_list);
1012 	/* Free global cache. */
1013 	mlx5_mr_btree_free(&share_cache->cache);
1014 	rte_rwlock_write_unlock(&share_cache->rwlock);
1015 	/* Free all remaining MRs. */
1016 	mlx5_mr_garbage_collect(share_cache);
1017 }
1018 
1019 /**
1020  * Flush all of the local cache entries.
1021  *
1022  * @param mr_ctrl
1023  *   Pointer to per-queue MR local cache.
1024  */
1025 void
1026 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
1027 {
1028 	/* Reset the most-recently-used index. */
1029 	mr_ctrl->mru = 0;
1030 	/* Reset the linear search array. */
1031 	mr_ctrl->head = 0;
1032 	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
1033 	/* Reset the B-tree table. */
1034 	mr_ctrl->cache_bh.len = 1;
1035 	mr_ctrl->cache_bh.overflow = 0;
1036 	/* Update the generation number. */
1037 	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
1038 	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
1039 		(void *)mr_ctrl, mr_ctrl->cur_gen);
1040 }
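/*
 * Illustrative sketch (not part of the driver): how a datapath consumer is
 * expected to detect that the shared cache generation has advanced (see the
 * dev_gen broadcast in mlx5_free_mr_by_addr() below) and flush its local
 * caches before the next lookup:
 *
 *	if (unlikely(mr_ctrl->cur_gen != *mr_ctrl->dev_gen_ptr))
 *		mlx5_mr_flush_local_cache(mr_ctrl);
 */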
1041 
1042 /**
1043  * Create a memory region for external memory, that is, memory which is not
1044  * part of the DPDK memory segments.
1045  *
1046  * @param pd
1047  *   Pointer to pd of a device (net, regex, vdpa,...).
1048  * @param addr
1049  *   Starting virtual address of memory.
1050  * @param len
1051  *   Length of memory segment being mapped.
1052  * @param socket_id
1053  *   Socket to allocate heap memory for the control structures.
1054  *
1055  * @return
1056  *   Pointer to MR structure on success, NULL otherwise.
1057  */
1058 struct mlx5_mr *
1059 mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
1060 		   mlx5_reg_mr_t reg_mr_cb)
1061 {
1062 	struct mlx5_mr *mr = NULL;
1063 
1064 	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
1065 			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
1066 			 RTE_CACHE_LINE_SIZE, socket_id);
1067 	if (mr == NULL)
1068 		return NULL;
1069 	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
1070 	if (mr->pmd_mr.obj == NULL) {
1071 		DRV_LOG(WARNING,
1072 			"Fail to create MR for address (%p)",
1073 			(void *)addr);
1074 		mlx5_free(mr);
1075 		return NULL;
1076 	}
1077 	mr->msl = NULL; /* Mark it as external memory. */
1078 	mr->ms_bmp = NULL;
1079 	mr->ms_n = 1;
1080 	mr->ms_bmp_n = 1;
1081 	DRV_LOG(DEBUG,
1082 		"MR CREATED (%p) for external memory %p:\n"
1083 		"  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
1084 		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
1085 		(void *)mr, (void *)addr,
1086 		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
1087 		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
1088 	return mr;
1089 }
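/*
 * Illustrative sketch (not part of the driver): how a caller typically plugs
 * an externally allocated buffer into the shared cache. Assumes the caller
 * holds no MR locks and that addr/len describe memory outside the DPDK
 * memseg lists:
 *
 *	struct mlx5_mr *mr = mlx5_create_mr_ext(pd, addr, len, socket_id,
 *						share_cache->reg_mr_cb);
 *
 *	if (mr == NULL)
 *		return -1;
 *	rte_rwlock_write_lock(&share_cache->rwlock);
 *	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
 *	mlx5_mr_insert_cache(share_cache, mr);
 *	rte_rwlock_write_unlock(&share_cache->rwlock);
 */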
1090 
1091 /**
1092  * Callback for the memory free event. Iterate over freed memsegs and check
1093  * whether each belongs to an existing MR. If so, clear its bit in the MR's
1094  * bitmap; as a result, the MR may become fragmented. If it becomes empty, the
1095  * MR will be freed later by mlx5_mr_garbage_collect(). Even if this callback
1096  * is called from a secondary process, the garbage collector runs in the
1097  * primary process, as a secondary process can't call mlx5_mr_create().
1098  *
1099  * The global cache must be rebuilt if there's any change and this event has to
1100  * be propagated to dataplane threads to flush the local caches.
1101  *
1102  * @param share_cache
1103  *   Pointer to a global shared MR cache.
1104  * @param ibdev_name
1105  *   Name of ibv device.
1106  * @param addr
1107  *   Address of freed memory.
1108  * @param len
1109  *   Size of freed memory.
1110  */
1111 void
1112 mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
1113 		     const char *ibdev_name, const void *addr, size_t len)
1114 {
1115 	const struct rte_memseg_list *msl;
1116 	struct mlx5_mr *mr;
1117 	int ms_n;
1118 	int i;
1119 	int rebuild = 0;
1120 
1121 	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
1122 		ibdev_name, addr, len);
1123 	msl = rte_mem_virt2memseg_list(addr);
1124 	/* addr and len must be page-aligned. */
1125 	MLX5_ASSERT((uintptr_t)addr ==
1126 		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
1127 	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
1128 	ms_n = len / msl->page_sz;
1129 	rte_rwlock_write_lock(&share_cache->rwlock);
1130 	/* Clear bits of freed memsegs from MR. */
1131 	for (i = 0; i < ms_n; ++i) {
1132 		const struct rte_memseg *ms;
1133 		struct mr_cache_entry entry;
1134 		uintptr_t start;
1135 		int ms_idx;
1136 		uint32_t pos;
1137 
1138 		/* Find MR having this memseg. */
1139 		start = (uintptr_t)addr + i * msl->page_sz;
1140 		mr = mlx5_mr_lookup_list(share_cache, &entry, start);
1141 		if (mr == NULL)
1142 			continue;
1143 		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
1144 		ms = rte_mem_virt2memseg((void *)start, msl);
1145 		MLX5_ASSERT(ms != NULL);
1146 		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
1147 		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
1148 		pos = ms_idx - mr->ms_base_idx;
1149 		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
1150 		MLX5_ASSERT(pos < mr->ms_bmp_n);
1151 		DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
1152 			ibdev_name, (void *)mr, pos, (void *)start);
1153 		rte_bitmap_clear(mr->ms_bmp, pos);
1154 		if (--mr->ms_n == 0) {
1155 			LIST_REMOVE(mr, mr);
1156 			LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
1157 			DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
1158 				ibdev_name, (void *)mr);
1159 		}
1160 		/*
1161 		 * The MR is fragmented or will be freed. The global cache
1162 		 * must be rebuilt.
1163 		 */
1164 		rebuild = 1;
1165 	}
1166 	if (rebuild) {
1167 		mlx5_mr_rebuild_cache(share_cache);
1168 		/*
1169 		 * No explicit wmb is needed after updating dev_gen due to
1170 		 * store-release ordering in unlock that provides the
1171 		 * implicit barrier at the software visible level.
1172 		 */
1173 		++share_cache->dev_gen;
1174 		DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
1175 			share_cache->dev_gen);
1176 	}
1177 	rte_rwlock_write_unlock(&share_cache->rwlock);
1178 }
1179 
1180 /**
1181  * Dump all the created MRs and the global cache entries.
1182  *
1183  * @param share_cache
1184  *   Pointer to a global shared MR cache.
1185  */
1186 void
1187 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
1188 {
1189 #ifdef RTE_LIBRTE_MLX5_DEBUG
1190 	struct mlx5_mr *mr;
1191 	int mr_n = 0;
1192 	int chunk_n = 0;
1193 
1194 	rte_rwlock_read_lock(&share_cache->rwlock);
1195 	/* Iterate all the existing MRs. */
1196 	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
1197 		unsigned int n;
1198 
1199 		DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
1200 		      mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey),
1201 		      mr->ms_n, mr->ms_bmp_n);
1202 		if (mr->ms_n == 0)
1203 			continue;
1204 		for (n = 0; n < mr->ms_bmp_n; ) {
1205 			struct mr_cache_entry ret = { 0, };
1206 
1207 			n = mr_find_next_chunk(mr, &ret, n);
1208 			if (!ret.end)
1209 				break;
1210 			DRV_LOG(DEBUG,
1211 				"  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
1212 				chunk_n++, ret.start, ret.end);
1213 		}
1214 	}
1215 	DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache);
1216 	mlx5_mr_btree_dump(&share_cache->cache);
1217 	rte_rwlock_read_unlock(&share_cache->rwlock);
1218 #endif
1219 }
1220 
1221 static int
1222 mlx5_range_compare_start(const void *lhs, const void *rhs)
1223 {
1224 	const struct mlx5_range *r1 = lhs, *r2 = rhs;
1225 
1226 	if (r1->start > r2->start)
1227 		return 1;
1228 	else if (r1->start < r2->start)
1229 		return -1;
1230 	return 0;
1231 }
1232 
1233 static void
1234 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque,
1235 			      struct rte_mempool_memhdr *memhdr,
1236 			      unsigned int idx)
1237 {
1238 	struct mlx5_range *ranges = opaque, *range = &ranges[idx];
1239 	uint64_t page_size = rte_mem_page_size();
1240 
1241 	RTE_SET_USED(mp);
1242 	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
1243 	range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size);
1244 }
1245 
1246 /**
1247  * Get VA-contiguous ranges of the mempool memory.
1248  * Each range start and end is aligned to the system page size.
1249  *
1250  * @param[in] mp
1251  *   Analyzed mempool.
1252  * @param[out] out
1253  *   Receives the ranges; the caller must release them with free().
1254  * @param[out] out_n
1255  *   Receives the number of @p out elements.
1256  *
1257  * @return
1258  *   0 on success, (-1) on failure.
1259  */
1260 static int
1261 mlx5_get_mempool_ranges(struct rte_mempool *mp, struct mlx5_range **out,
1262 			unsigned int *out_n)
1263 {
1264 	struct mlx5_range *chunks;
1265 	unsigned int chunks_n = mp->nb_mem_chunks, contig_n, i;
1266 
1267 	/* Collect page-aligned memory ranges of the mempool. */
1268 	chunks = calloc(chunks_n, sizeof(chunks[0]));
1269 	if (chunks == NULL)
1270 		return -1;
1271 	rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, chunks);
1272 	/* Merge adjacent chunks and place them at the beginning. */
1273 	qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start);
1274 	contig_n = 1;
1275 	for (i = 1; i < chunks_n; i++)
1276 		if (chunks[i - 1].end != chunks[i].start) {
1277 			chunks[contig_n - 1].end = chunks[i - 1].end;
1278 			chunks[contig_n] = chunks[i];
1279 			contig_n++;
1280 		}
1281 	/* Extend the last contiguous chunk to the end of the mempool. */
1282 	chunks[contig_n - 1].end = chunks[i - 1].end;
1283 	*out = chunks;
1284 	*out_n = contig_n;
1285 	return 0;
1286 }
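/*
 * Illustrative example (not part of the driver): with a 4 KB page size, a
 * mempool made of three chunks at [0x1000, 0x3000), [0x3000, 0x6000) and
 * [0x8000, 0x9000) yields two page-aligned ranges after sorting and merging:
 * [0x1000, 0x6000) and [0x8000, 0x9000).
 */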
1287 
1288 /**
1289  * Analyze mempool memory to select memory ranges to register.
1290  *
1291  * @param[in] mp
1292  *   Mempool to analyze.
1293  * @param[out] out
1294  *   Receives memory ranges to register, aligned to the system page size.
1295  *   The caller must release them with free().
1296  * @param[out] out_n
1297  *   Receives the number of @p out items.
1298  * @param[out] share_hugepage
1299  *   Receives True if the entire pool resides within a single hugepage.
1300  *
1301  * @return
1302  *   0 on success, (-1) on failure.
1303  */
1304 static int
1305 mlx5_mempool_reg_analyze(struct rte_mempool *mp, struct mlx5_range **out,
1306 			 unsigned int *out_n, bool *share_hugepage)
1307 {
1308 	struct mlx5_range *ranges = NULL;
1309 	unsigned int i, ranges_n = 0;
1310 	struct rte_memseg_list *msl;
1311 
1312 	if (mlx5_get_mempool_ranges(mp, &ranges, &ranges_n) < 0) {
1313 		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
1314 			mp->name);
1315 		return -1;
1316 	}
1317 	/* Check if the hugepage of the pool can be shared. */
1318 	*share_hugepage = false;
1319 	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
1320 	if (msl != NULL) {
1321 		uint64_t hugepage_sz = 0;
1322 
1323 		/* Check that all ranges are on pages of the same size. */
1324 		for (i = 0; i < ranges_n; i++) {
1325 			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
1326 				break;
1327 			hugepage_sz = msl->page_sz;
1328 		}
1329 		if (i == ranges_n) {
1330 			/*
1331 			 * If the entire pool is within one hugepage,
1332 			 * combine all ranges into one of the hugepage size.
1333 			 */
1334 			uintptr_t reg_start = ranges[0].start;
1335 			uintptr_t reg_end = ranges[ranges_n - 1].end;
1336 			uintptr_t hugepage_start =
1337 				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
1338 			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
1339 			if (reg_end < hugepage_end) {
1340 				ranges[0].start = hugepage_start;
1341 				ranges[0].end = hugepage_end;
1342 				ranges_n = 1;
1343 				*share_hugepage = true;
1344 			}
1345 		}
1346 	}
1347 	*out = ranges;
1348 	*out_n = ranges_n;
1349 	return 0;
1350 }
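/*
 * Illustrative example (not part of the driver): if a mempool consists of the
 * ranges [0x201000, 0x203000) and [0x280000, 0x300000), both inside a single
 * 2 MB hugepage starting at 0x200000, they are collapsed into the single
 * range [0x200000, 0x400000) and *share_hugepage is set to true, so the
 * resulting MR can be shared with other mempools in the same hugepage.
 */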
1351 
1352 /** Create a registration object for the mempool. */
1353 static struct mlx5_mempool_reg *
1354 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n)
1355 {
1356 	struct mlx5_mempool_reg *mpr = NULL;
1357 
1358 	mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
1359 			  sizeof(*mpr) + mrs_n * sizeof(mpr->mrs[0]),
1360 			  RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
1361 	if (mpr == NULL) {
1362 		DRV_LOG(ERR, "Cannot allocate mempool %s registration object",
1363 			mp->name);
1364 		return NULL;
1365 	}
1366 	mpr->mp = mp;
1367 	mpr->mrs = (struct mlx5_mempool_mr *)(mpr + 1);
1368 	mpr->mrs_n = mrs_n;
1369 	return mpr;
1370 }
1371 
1372 /**
1373  * Destroy a mempool registration object.
1374  *
1375  * @param standalone
1376  *   Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
1377  */
1378 static void
1379 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
1380 			 struct mlx5_mempool_reg *mpr, bool standalone)
1381 {
1382 	if (standalone) {
1383 		unsigned int i;
1384 
1385 		for (i = 0; i < mpr->mrs_n; i++)
1386 			share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
1387 	}
1388 	mlx5_free(mpr);
1389 }
1390 
1391 /** Find registration object of a mempool. */
1392 static struct mlx5_mempool_reg *
1393 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache,
1394 			struct rte_mempool *mp)
1395 {
1396 	struct mlx5_mempool_reg *mpr;
1397 
1398 	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
1399 		if (mpr->mp == mp)
1400 			break;
1401 	return mpr;
1402 }
1403 
1404 /** Increment reference counters of MRs used in the registration. */
1405 static void
1406 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr)
1407 {
1408 	unsigned int i;
1409 
1410 	for (i = 0; i < mpr->mrs_n; i++)
1411 		__atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED);
1412 }
1413 
1414 /**
1415  * Decrement reference counters of MRs used in the registration.
1416  *
1417  * @return True if no more references to @p mpr MRs exist, False otherwise.
1418  */
1419 static bool
1420 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr)
1421 {
1422 	unsigned int i;
1423 	bool ret = false;
1424 
1425 	for (i = 0; i < mpr->mrs_n; i++)
1426 		ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1,
1427 					  __ATOMIC_RELAXED) == 0;
1428 	return ret;
1429 }
1430 
1431 static int
1432 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
1433 				 void *pd, struct rte_mempool *mp)
1434 {
1435 	struct mlx5_range *ranges = NULL;
1436 	struct mlx5_mempool_reg *mpr, *new_mpr;
1437 	unsigned int i, ranges_n;
1438 	bool share_hugepage;
1439 	int ret = -1;
1440 
1441 	/* Early check to avoid unnecessary creation of MRs. */
1442 	rte_rwlock_read_lock(&share_cache->rwlock);
1443 	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
1444 	rte_rwlock_read_unlock(&share_cache->rwlock);
1445 	if (mpr != NULL) {
1446 		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
1447 			mp->name, pd);
1448 		rte_errno = EEXIST;
1449 		goto exit;
1450 	}
1451 	if (mlx5_mempool_reg_analyze(mp, &ranges, &ranges_n,
1452 				     &share_hugepage) < 0) {
1453 		DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name);
1454 		rte_errno = ENOMEM;
1455 		goto exit;
1456 	}
1457 	new_mpr = mlx5_mempool_reg_create(mp, ranges_n);
1458 	if (new_mpr == NULL) {
1459 		DRV_LOG(ERR,
1460 			"Cannot create a registration object for mempool %s in PD %p",
1461 			mp->name, pd);
1462 		rte_errno = ENOMEM;
1463 		goto exit;
1464 	}
1465 	/*
1466 	 * If the entire mempool fits in a single hugepage, the MR for this
1467 	 * hugepage can be shared across mempools that also fit in it.
1468 	 */
1469 	if (share_hugepage) {
1470 		rte_rwlock_write_lock(&share_cache->rwlock);
1471 		LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) {
1472 			if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start)
1473 				break;
1474 		}
1475 		if (mpr != NULL) {
1476 			new_mpr->mrs = mpr->mrs;
1477 			mlx5_mempool_reg_attach(new_mpr);
1478 			LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
1479 					 new_mpr, next);
1480 		}
1481 		rte_rwlock_write_unlock(&share_cache->rwlock);
1482 		if (mpr != NULL) {
1483 			DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s",
1484 				mpr->mrs[0].pmd_mr.lkey, pd, mp->name,
1485 				mpr->mp->name);
1486 			ret = 0;
1487 			goto exit;
1488 		}
1489 	}
1490 	for (i = 0; i < ranges_n; i++) {
1491 		struct mlx5_mempool_mr *mr = &new_mpr->mrs[i];
1492 		const struct mlx5_range *range = &ranges[i];
1493 		size_t len = range->end - range->start;
1494 
1495 		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
1496 		    &mr->pmd_mr) < 0) {
1497 			DRV_LOG(ERR,
1498 				"Failed to create an MR in PD %p for address range "
1499 				"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
1500 				pd, range->start, range->end, len, mp->name);
1501 			break;
1502 		}
1503 		DRV_LOG(DEBUG,
1504 			"Created a new MR %#x in PD %p for address range "
1505 			"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
1506 			mr->pmd_mr.lkey, pd, range->start, range->end, len,
1507 			mp->name);
1508 	}
1509 	if (i != ranges_n) {
1510 		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
1511 		rte_errno = EINVAL;
1512 		goto exit;
1513 	}
1514 	/* Concurrent registration is not supposed to happen. */
1515 	rte_rwlock_write_lock(&share_cache->rwlock);
1516 	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
1517 	if (mpr == NULL) {
1518 		mlx5_mempool_reg_attach(new_mpr);
1519 		LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
1520 				 new_mpr, next);
1521 		ret = 0;
1522 	}
1523 	rte_rwlock_write_unlock(&share_cache->rwlock);
1524 	if (mpr != NULL) {
1525 		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
1526 			mp->name, pd);
1527 		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
1528 		rte_errno = EEXIST;
1529 		goto exit;
1530 	}
1531 exit:
1532 	free(ranges);
1533 	return ret;
1534 }
1535 
1536 static int
1537 mlx5_mr_mempool_register_secondary(struct mlx5_mr_share_cache *share_cache,
1538 				   void *pd, struct rte_mempool *mp,
1539 				   struct mlx5_mp_id *mp_id)
1540 {
1541 	if (mp_id == NULL) {
1542 		rte_errno = EINVAL;
1543 		return -1;
1544 	}
1545 	return mlx5_mp_req_mempool_reg(mp_id, share_cache, pd, mp, true);
1546 }
1547 
1548 /**
1549  * Register the memory of a mempool in the protection domain.
1550  *
1551  * @param share_cache
1552  *   Shared MR cache of the protection domain.
1553  * @param pd
1554  *   Protection domain object.
1555  * @param mp
1556  *   Mempool to register.
1557  * @param mp_id
1558  *   Multi-process identifier, may be NULL for the primary process.
1559  *
1560  * @return
1561  *   0 on success, (-1) on failure and rte_errno is set.
1562  */
1563 int
1564 mlx5_mr_mempool_register(struct mlx5_mr_share_cache *share_cache, void *pd,
1565 			 struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
1566 {
1567 	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
1568 		return 0;
1569 	switch (rte_eal_process_type()) {
1570 	case RTE_PROC_PRIMARY:
1571 		return mlx5_mr_mempool_register_primary(share_cache, pd, mp);
1572 	case RTE_PROC_SECONDARY:
1573 		return mlx5_mr_mempool_register_secondary(share_cache, pd, mp,
1574 							  mp_id);
1575 	default:
1576 		return -1;
1577 	}
1578 }
1579 
1580 static int
1581 mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
1582 				   struct rte_mempool *mp)
1583 {
1584 	struct mlx5_mempool_reg *mpr;
1585 	bool standalone = false;
1586 
1587 	rte_rwlock_write_lock(&share_cache->rwlock);
1588 	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
1589 		if (mpr->mp == mp) {
1590 			LIST_REMOVE(mpr, next);
1591 			standalone = mlx5_mempool_reg_detach(mpr);
1592 			if (standalone)
1593 				/*
1594 				 * The unlock operation below provides a memory
1595 				 * barrier due to its store-release semantics.
1596 				 */
1597 				++share_cache->dev_gen;
1598 			break;
1599 		}
1600 	rte_rwlock_write_unlock(&share_cache->rwlock);
1601 	if (mpr == NULL) {
1602 		rte_errno = ENOENT;
1603 		return -1;
1604 	}
1605 	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
1606 	return 0;
1607 }
1608 
1609 static int
1610 mlx5_mr_mempool_unregister_secondary(struct mlx5_mr_share_cache *share_cache,
1611 				     struct rte_mempool *mp,
1612 				     struct mlx5_mp_id *mp_id)
1613 {
1614 	if (mp_id == NULL) {
1615 		rte_errno = EINVAL;
1616 		return -1;
1617 	}
1618 	return mlx5_mp_req_mempool_reg(mp_id, share_cache, NULL, mp, false);
1619 }
1620 
1621 /**
1622  * Unregister the memory of a mempool from the protection domain.
1623  *
1624  * @param share_cache
1625  *   Shared MR cache of the protection domain.
1626  * @param mp
1627  *   Mempool to unregister.
1628  * @param mp_id
1629  *   Multi-process identifier, may be NULL for the primary process.
1630  *
1631  * @return
1632  *   0 on success, (-1) on failure and rte_errno is set.
1633  */
1634 int
1635 mlx5_mr_mempool_unregister(struct mlx5_mr_share_cache *share_cache,
1636 			   struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
1637 {
1638 	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
1639 		return 0;
1640 	switch (rte_eal_process_type()) {
1641 	case RTE_PROC_PRIMARY:
1642 		return mlx5_mr_mempool_unregister_primary(share_cache, mp);
1643 	case RTE_PROC_SECONDARY:
1644 		return mlx5_mr_mempool_unregister_secondary(share_cache, mp,
1645 							    mp_id);
1646 	default:
1647 		return -1;
1648 	}
1649 }
1650 
1651 /**
1652  * Look up an MR key by an address in a registered mempool.
1653  *
1654  * @param mpr
1655  *   Mempool registration object.
1656  * @param addr
1657  *   Address within the mempool.
1658  * @param entry
1659  *   Bottom-half cache entry to fill.
1660  *
1661  * @return
1662  *   MR key or UINT32_MAX on failure, which can only happen
1663  *   if the address is not from within the mempool.
1664  */
1665 static uint32_t
1666 mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
1667 			 struct mr_cache_entry *entry)
1668 {
1669 	uint32_t lkey = UINT32_MAX;
1670 	unsigned int i;
1671 
1672 	for (i = 0; i < mpr->mrs_n; i++) {
1673 		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
1674 		uintptr_t mr_addr = (uintptr_t)mr->addr;
1675 
1676 		if (mr_addr <= addr) {
1677 			lkey = rte_cpu_to_be_32(mr->lkey);
1678 			entry->start = mr_addr;
1679 			entry->end = mr_addr + mr->len;
1680 			entry->lkey = lkey;
1681 			break;
1682 		}
1683 	}
1684 	return lkey;
1685 }
1686 
1687 /**
1688  * Update bottom-half cache from the list of mempool registrations.
1689  *
1690  * @param share_cache
1691  *   Pointer to a global shared MR cache.
1692  * @param mr_ctrl
1693  *   Per-queue MR control handle.
1694  * @param entry
1695  *   Pointer to an entry in the bottom-half cache to update
1696  *   with the MR lkey looked up.
1697  * @param mp
1698  *   Mempool containing the address.
1699  * @param addr
1700  *   Address to lookup.
1701  * @return
1702  *   MR lkey on success, UINT32_MAX on failure.
1703  */
1704 static uint32_t
1705 mlx5_lookup_mempool_regs(struct mlx5_mr_share_cache *share_cache,
1706 			 struct mlx5_mr_ctrl *mr_ctrl,
1707 			 struct mr_cache_entry *entry,
1708 			 struct rte_mempool *mp, uintptr_t addr)
1709 {
1710 	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
1711 	struct mlx5_mempool_reg *mpr;
1712 	uint32_t lkey = UINT32_MAX;
1713 
1714 	/* If local cache table is full, try to double it. */
1715 	if (unlikely(bt->len == bt->size))
1716 		mr_btree_expand(bt, bt->size << 1);
1717 	/* Look up in mempool registrations. */
1718 	rte_rwlock_read_lock(&share_cache->rwlock);
1719 	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
1720 	if (mpr != NULL)
1721 		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
1722 	rte_rwlock_read_unlock(&share_cache->rwlock);
1723 	/*
1724 	 * Update local cache. Even if it fails, return the found entry
1725 	 * to update top-half cache. Next time, this entry will be found
1726 	 * in the global cache.
1727 	 */
1728 	if (lkey != UINT32_MAX)
1729 		mr_btree_insert(bt, entry);
1730 	return lkey;
1731 }
1732 
1733 /**
1734  * Bottom-half lookup for the address from the mempool.
1735  *
1736  * @param share_cache
1737  *   Pointer to a global shared MR cache.
1738  * @param mr_ctrl
1739  *   Per-queue MR control handle.
1740  * @param mp
1741  *   Mempool containing the address.
1742  * @param addr
1743  *   Address to lookup.
1744  * @return
1745  *   MR lkey on success, UINT32_MAX on failure.
1746  */
1747 uint32_t
1748 mlx5_mr_mempool2mr_bh(struct mlx5_mr_share_cache *share_cache,
1749 		      struct mlx5_mr_ctrl *mr_ctrl,
1750 		      struct rte_mempool *mp, uintptr_t addr)
1751 {
1752 	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
1753 	uint32_t lkey;
1754 	uint16_t bh_idx = 0;
1755 
1756 	/* Binary-search MR translation table. */
1757 	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
1758 	/* Update top-half cache. */
1759 	if (likely(lkey != UINT32_MAX)) {
1760 		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
1761 	} else {
1762 		lkey = mlx5_lookup_mempool_regs(share_cache, mr_ctrl, repl,
1763 						mp, addr);
1764 		/* Can only fail if the address is not from the mempool. */
1765 		if (unlikely(lkey == UINT32_MAX))
1766 			return UINT32_MAX;
1767 	}
1768 	/* Update the most recently used entry. */
1769 	mr_ctrl->mru = mr_ctrl->head;
1770 	/* Point to the next victim, the oldest. */
1771 	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
1772 	return lkey;
1773 }
1774
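/*
 * Illustrative sketch (not part of the driver): the intended flow for
 * mempool-backed queues. At configuration time the mempool is registered
 * once per protection domain; on the datapath the per-queue control
 * structure then resolves LKeys, hitting the shared cache lock only on a
 * local miss. Hypothetical usage, assuming "buf" is an rte_mbuf from "mp":
 *
 *	uint32_t lkey;
 *
 *	if (mlx5_mr_mempool_register(share_cache, pd, mp, mp_id) < 0 &&
 *	    rte_errno != EEXIST)
 *		return -rte_errno;
 *	lkey = mlx5_mr_mempool2mr_bh(share_cache, mr_ctrl, mp,
 *				     (uintptr_t)buf->buf_addr);
 *	if (unlikely(lkey == UINT32_MAX))
 *		return UINT32_MAX;
 */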