1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2016 6WIND S.A.
3 * Copyright 2020 Mellanox Technologies, Ltd
4 */
5 #include <stddef.h>
6
7 #include <rte_eal_memconfig.h>
8 #include <rte_eal_paging.h>
9 #include <rte_errno.h>
10 #include <rte_mempool.h>
11 #include <rte_malloc.h>
12 #include <rte_rwlock.h>
13
14 #include "mlx5_glue.h"
15 #include "mlx5_common.h"
16 #include "mlx5_common_mp.h"
17 #include "mlx5_common_mr.h"
18 #include "mlx5_common_os.h"
19 #include "mlx5_common_log.h"
20 #include "mlx5_malloc.h"
21
22 struct mr_find_contig_memsegs_data {
23 uintptr_t addr;
24 uintptr_t start;
25 uintptr_t end;
26 const struct rte_memseg_list *msl;
27 };
28
29 /* Virtual memory range. */
30 struct mlx5_range {
31 uintptr_t start;
32 uintptr_t end;
33 };
34
35 /** Memory region for a mempool. */
36 struct mlx5_mempool_mr {
37 struct mlx5_pmd_mr pmd_mr;
38 RTE_ATOMIC(uint32_t) refcnt; /**< Number of mempools sharing this MR. */
39 };
40
41 /* Mempool registration. */
42 struct mlx5_mempool_reg {
43 LIST_ENTRY(mlx5_mempool_reg) next;
44 /** Registered mempool, used to designate registrations. */
45 struct rte_mempool *mp;
46 /** Memory regions for the address ranges of the mempool. */
47 struct mlx5_mempool_mr *mrs;
48 /** Number of memory regions. */
49 unsigned int mrs_n;
50 /** Whether the MRs were created for external pinned memory. */
51 bool is_extmem;
52 };
53
54 void
55 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
56 {
57 struct mlx5_mprq_buf *buf = opaque;
58
59 if (rte_atomic_load_explicit(&buf->refcnt, rte_memory_order_relaxed) == 1) {
60 rte_mempool_put(buf->mp, buf);
61 } else if (unlikely(rte_atomic_fetch_sub_explicit(&buf->refcnt, 1,
62 rte_memory_order_relaxed) - 1 == 0)) {
63 rte_atomic_store_explicit(&buf->refcnt, 1, rte_memory_order_relaxed);
64 rte_mempool_put(buf->mp, buf);
65 }
66 }
67
68 /**
69 * Expand B-tree table to a given size. Can't be called while holding
70 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc().
71 *
72 * @param bt
73 * Pointer to B-tree structure.
74 * @param n
75 * Number of entries for expansion.
76 *
77 * @return
78 * 0 on success, -1 on failure.
79 */
80 static int
81 mr_btree_expand(struct mlx5_mr_btree *bt, uint32_t n)
82 {
83 void *mem;
84 int ret = 0;
85
86 if (n <= bt->size)
87 return ret;
88 /*
89 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
90 * used inside if there's no room to expand. Because this is a quite
91 * rare case and part of a very slow path, it is acceptable.
92 * Initially cache_bh[] is given practically enough space, and once it
93 * has been expanded, further expansion should hardly ever be needed.
94 */
95 mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
96 n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
97 if (mem == NULL) {
98 /* Not an error, B-tree search will be skipped. */
99 DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
100 (void *)bt);
101 ret = -1;
102 } else {
103 DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
104 bt->table = mem;
105 bt->size = n;
106 }
107 return ret;
108 }
109
110 /**
111 * Look up LKey from given B-tree lookup table, store the last index and return
112 * searched LKey.
113 *
114 * @param bt
115 * Pointer to B-tree structure.
116 * @param[out] idx
117 * Pointer to index. Even on search failure, the index where the search
118 * stopped is returned so that it can be used when inserting a new entry.
119 * @param addr
120 * Search key.
121 *
122 * @return
123 * Searched LKey on success, UINT32_MAX on no match.
124 */
125 static uint32_t
126 mr_btree_lookup(struct mlx5_mr_btree *bt, uint32_t *idx, uintptr_t addr)
127 {
128 struct mr_cache_entry *lkp_tbl;
129 uint32_t n;
130 uint32_t base = 0;
131
132 MLX5_ASSERT(bt != NULL);
133 lkp_tbl = *bt->table;
134 n = bt->len;
135 /* First entry must be NULL for comparison. */
136 MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
137 lkp_tbl[0].lkey == UINT32_MAX));
138 /* Binary search. */
139 do {
140 register uint32_t delta = n >> 1;
141
142 if (addr < lkp_tbl[base + delta].start) {
143 n = delta;
144 } else {
145 base += delta;
146 n -= delta;
147 }
148 } while (n > 1);
149 MLX5_ASSERT(addr >= lkp_tbl[base].start);
150 *idx = base;
151 if (addr < lkp_tbl[base].end)
152 return lkp_tbl[base].lkey;
153 /* Not found. */
154 return UINT32_MAX;
155 }
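
/*
 * Illustrative note (not part of the driver): thanks to the sentinel at
 * index 0 ([0x0, 0x0) with lkey=UINT32_MAX), the binary search above ends
 * on the last entry whose start address is <= addr. For a table of
 *     [0]: [0x0,    0x0)    lkey=UINT32_MAX   (sentinel)
 *     [1]: [0x1000, 0x3000) lkey=0xa
 *     [2]: [0x5000, 0x6000) lkey=0xb
 * looking up addr=0x2800 stops at base=1 and returns 0xa, while
 * addr=0x4000 also stops at base=1, fails the end-bound check and
 * returns UINT32_MAX with *idx left at the would-be insertion point.
 */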
156
157 /**
158 * Insert an entry to B-tree lookup table.
159 *
160 * @param bt
161 * Pointer to B-tree structure.
162 * @param entry
163 * Pointer to new entry to insert.
164 *
165 * @return
166 * 0 on success, -1 on failure.
167 */
168 static int
169 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
170 {
171 struct mr_cache_entry *lkp_tbl;
172 uint32_t idx = 0;
173 size_t shift;
174
175 MLX5_ASSERT(bt != NULL);
176 MLX5_ASSERT(bt->len <= bt->size);
177 MLX5_ASSERT(bt->len > 0);
178 lkp_tbl = *bt->table;
179 /* Find out the slot for insertion. */
180 if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
181 DRV_LOG(DEBUG,
182 "abort insertion to B-tree(%p): already exist at"
183 " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
184 (void *)bt, idx, entry->start, entry->end, entry->lkey);
185 /* Already exists, return. */
186 return 0;
187 }
188 /* Caller must ensure that there is enough place for a new entry. */
189 MLX5_ASSERT(bt->len < bt->size);
190 /* Insert entry. */
191 ++idx;
192 shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
193 if (shift)
194 memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
195 lkp_tbl[idx] = *entry;
196 bt->len++;
197 DRV_LOG(DEBUG,
198 "inserted B-tree(%p)[%u],"
199 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
200 (void *)bt, idx, entry->start, entry->end, entry->lkey);
201 return 0;
202 }
203
204 /**
205 * Initialize B-tree and allocate memory for lookup table.
206 *
207 * @param bt
208 * Pointer to B-tree structure.
209 * @param n
210 * Number of entries to allocate.
211 * @param socket
212 * NUMA socket on which memory must be allocated.
213 *
214 * @return
215 * 0 on success, a negative errno value otherwise and rte_errno is set.
216 */
217 static int
218 mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
219 {
220 if (bt == NULL) {
221 rte_errno = EINVAL;
222 return -rte_errno;
223 }
224 MLX5_ASSERT(!bt->table && !bt->size);
225 memset(bt, 0, sizeof(*bt));
226 bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
227 sizeof(struct mr_cache_entry) * n,
228 0, socket);
229 if (bt->table == NULL) {
230 rte_errno = ENOMEM;
231 DRV_LOG(DEBUG,
232 "failed to allocate memory for btree cache on socket "
233 "%d", socket);
234 return -rte_errno;
235 }
236 bt->size = n;
237 /* First entry must be NULL for binary search. */
238 (*bt->table)[bt->len++] = (struct mr_cache_entry) {
239 .lkey = UINT32_MAX,
240 };
241 DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
242 (void *)bt, (void *)bt->table);
243 return 0;
244 }
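
/*
 * Usage sketch (illustrative only, never called by the driver): the typical
 * lifecycle of a lookup table built from the helpers above. The address
 * range and LKey values are made up.
 *
 *	struct mlx5_mr_btree bt = { 0 };
 *	struct mr_cache_entry e = {
 *		.start = 0x100000,
 *		.end = 0x200000,
 *		.lkey = rte_cpu_to_be_32(0x1234),
 *	};
 *	uint32_t idx, lkey;
 *
 *	if (mlx5_mr_btree_init(&bt, MLX5_MR_BTREE_CACHE_N, socket) < 0)
 *		return -rte_errno;
 *	(void)mr_btree_insert(&bt, &e);
 *	lkey = mr_btree_lookup(&bt, &idx, 0x150000);   <- returns e.lkey
 *	mlx5_mr_btree_free(&bt);
 */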
245
246 /**
247 * Free B-tree resources.
248 *
249 * @param bt
250 * Pointer to B-tree structure.
251 */
252 void
253 mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
254 {
255 if (bt == NULL)
256 return;
257 DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
258 (void *)bt, (void *)bt->table);
259 mlx5_free(bt->table);
260 memset(bt, 0, sizeof(*bt));
261 }
262
263 /**
264 * Dump all the entries in a B-tree
265 *
266 * @param bt
267 * Pointer to B-tree structure.
268 */
269 void
270 mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
271 {
272 #ifdef RTE_LIBRTE_MLX5_DEBUG
273 uint32_t idx;
274 struct mr_cache_entry *lkp_tbl;
275
276 if (bt == NULL)
277 return;
278 lkp_tbl = *bt->table;
279 for (idx = 0; idx < bt->len; ++idx) {
280 struct mr_cache_entry *entry = &lkp_tbl[idx];
281
282 DRV_LOG(DEBUG, "B-tree(%p)[%u],"
283 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
284 (void *)bt, idx, entry->start, entry->end, entry->lkey);
285 }
286 #endif
287 }
288
289 /**
290 * Initialize per-queue MR control descriptor.
291 *
292 * @param mr_ctrl
293 * Pointer to MR control structure.
294 * @param dev_gen_ptr
295 * Pointer to generation number of global cache.
296 * @param socket
297 * NUMA socket on which memory must be allocated.
298 *
299 * @return
300 * 0 on success, a negative errno value otherwise and rte_errno is set.
301 */
302 int
303 mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
304 int socket)
305 {
306 if (mr_ctrl == NULL) {
307 rte_errno = EINVAL;
308 return -rte_errno;
309 }
310 /* Save pointer of global generation number to check memory event. */
311 mr_ctrl->dev_gen_ptr = dev_gen_ptr;
312 /* Initialize B-tree and allocate memory for bottom-half cache table. */
313 return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N,
314 socket);
315 }
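
/*
 * Usage sketch (illustrative, the queue structure name "rxq" is
 * hypothetical): a driver creating a queue ties the per-queue MR control to
 * the generation number of the device-wide shared cache:
 *
 *	if (mlx5_mr_ctrl_init(&rxq->mr_ctrl, &cdev->mr_scache.dev_gen,
 *			      socket) < 0)
 *		return -rte_errno;
 *
 * The datapath later compares *mr_ctrl->dev_gen_ptr with mr_ctrl->cur_gen
 * to detect that the global cache has been rebuilt and must be re-read.
 */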
316
317 /**
318 * Find virtually contiguous memory chunk in a given MR.
319 *
320 * @param mr
321 * Pointer to MR structure.
322 * @param[out] entry
323 * Pointer to returning MR cache entry. If not found, this will not be
324 * updated.
325 * @param base_idx
326 * Start index of the memseg bitmap.
327 *
328 * @return
329 * Next index to go on lookup.
330 */
331 static int
332 mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
333 int base_idx)
334 {
335 uintptr_t start = 0;
336 uintptr_t end = 0;
337 uint32_t idx = 0;
338
339 /* MR for external memory doesn't have memseg list. */
340 if (mr->msl == NULL) {
341 MLX5_ASSERT(mr->ms_bmp_n == 1);
342 MLX5_ASSERT(mr->ms_n == 1);
343 MLX5_ASSERT(base_idx == 0);
344 /*
345 * Can't search it from memseg list but get it directly from
346 * pmd_mr as there's only one chunk.
347 */
348 entry->start = (uintptr_t)mr->pmd_mr.addr;
349 entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
350 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
351 /* Returning 1 ends iteration. */
352 return 1;
353 }
354 for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
355 if (rte_bitmap_get(mr->ms_bmp, idx)) {
356 const struct rte_memseg_list *msl;
357 const struct rte_memseg *ms;
358
359 msl = mr->msl;
360 ms = rte_fbarray_get(&msl->memseg_arr,
361 mr->ms_base_idx + idx);
362 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
363 if (!start)
364 start = ms->addr_64;
365 end = ms->addr_64 + ms->hugepage_sz;
366 } else if (start) {
367 /* Passed the end of a fragment. */
368 break;
369 }
370 }
371 if (start) {
372 /* Found one chunk. */
373 entry->start = start;
374 entry->end = end;
375 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
376 }
377 return idx;
378 }
379
380 /**
381 * Insert an MR into the global B-tree cache. It may fail due to low memory.
382 * In that case, this entry will have to be found by mlx5_mr_lookup_list() in
383 * mlx5_mr_create() on a cache miss.
384 *
385 * @param share_cache
386 * Pointer to a global shared MR cache.
387 * @param mr
388 * Pointer to MR to insert.
389 *
390 * @return
391 * 0 on success, -1 on failure.
392 */
393 int
394 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
395 struct mlx5_mr *mr)
396 {
397 unsigned int n;
398
399 DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
400 (void *)mr, (void *)share_cache);
401 for (n = 0; n < mr->ms_bmp_n; ) {
402 struct mr_cache_entry entry;
403
404 memset(&entry, 0, sizeof(entry));
405 /* Find a contiguous chunk and advance the index. */
406 n = mr_find_next_chunk(mr, &entry, n);
407 if (!entry.end)
408 break;
409 if (mr_btree_insert(&share_cache->cache, &entry) < 0)
410 return -1;
411 }
412 return 0;
413 }
414
415 /**
416 * Look up address in the original global MR list.
417 *
418 * @param share_cache
419 * Pointer to a global shared MR cache.
420 * @param[out] entry
421 * Pointer to returning MR cache entry. If no match, this will not be updated.
422 * @param addr
423 * Search key.
424 *
425 * @return
426 * Found MR on match, NULL otherwise.
427 */
428 struct mlx5_mr *
429 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
430 struct mr_cache_entry *entry, uintptr_t addr)
431 {
432 struct mlx5_mr *mr;
433
434 /* Iterate all the existing MRs. */
435 LIST_FOREACH(mr, &share_cache->mr_list, mr) {
436 unsigned int n;
437
438 if (mr->ms_n == 0)
439 continue;
440 for (n = 0; n < mr->ms_bmp_n; ) {
441 struct mr_cache_entry ret;
442
443 memset(&ret, 0, sizeof(ret));
444 n = mr_find_next_chunk(mr, &ret, n);
445 if (addr >= ret.start && addr < ret.end) {
446 /* Found. */
447 *entry = ret;
448 return mr;
449 }
450 }
451 }
452 return NULL;
453 }
454
455 /**
456 * Look up address in the global MR cache.
457 *
458 * @param share_cache
459 * Pointer to a global shared MR cache.
460 * @param[out] entry
461 * Pointer to returning MR cache entry. If no match, this will not be updated.
462 * @param addr
463 * Search key.
464 *
465 * @return
466 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
467 */
468 static uint32_t
469 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
470 struct mr_cache_entry *entry, uintptr_t addr)
471 {
472 uint32_t idx;
473 uint32_t lkey;
474
475 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
476 if (lkey != UINT32_MAX)
477 *entry = (*share_cache->cache.table)[idx];
478 MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
479 addr < entry->end));
480 return lkey;
481 }
482
483 /**
484 * Free MR resources. The MR lock must not be held to avoid a deadlock, as
485 * rte_free() can raise a memory free event and the callback will spin on the lock.
486 *
487 * @param mr
488 * Pointer to MR to free.
489 */
490 void
491 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
492 {
493 if (mr == NULL)
494 return;
495 DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
496 dereg_mr_cb(&mr->pmd_mr);
497 rte_bitmap_free(mr->ms_bmp);
498 mlx5_free(mr);
499 }
500
501 void
502 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
503 {
504 struct mlx5_mr *mr;
505
506 DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
507 /* Flush cache to rebuild. */
508 share_cache->cache.len = 1;
509 /* Iterate all the existing MRs. */
510 LIST_FOREACH(mr, &share_cache->mr_list, mr)
511 if (mlx5_mr_insert_cache(share_cache, mr) < 0)
512 return;
513 }
514
515 /**
516 * Release resources of detached MRs having no online entry.
517 *
518 * @param share_cache
519 * Pointer to a global shared MR cache.
520 */
521 static void
522 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
523 {
524 struct mlx5_mr *mr_next;
525 struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
526
527 /* Must be called from the primary process. */
528 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
529 /*
530 * MRs can't be freed while holding the lock, because rte_free() could call
531 * the memory free callback function, which would be a deadlock situation.
532 */
533 rte_rwlock_write_lock(&share_cache->rwlock);
534 /* Detach the whole free list and release it after unlocking. */
535 free_list = share_cache->mr_free_list;
536 LIST_INIT(&share_cache->mr_free_list);
537 rte_rwlock_write_unlock(&share_cache->rwlock);
538 /* Release resources. */
539 mr_next = LIST_FIRST(&free_list);
540 while (mr_next != NULL) {
541 struct mlx5_mr *mr = mr_next;
542
543 mr_next = LIST_NEXT(mr, mr);
544 mlx5_mr_free(mr, share_cache->dereg_mr_cb);
545 }
546 }
547
548 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
549 static int
550 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
551 const struct rte_memseg *ms, size_t len, void *arg)
552 {
553 struct mr_find_contig_memsegs_data *data = arg;
554
555 if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
556 return 0;
557 /* Found, save it and stop walking. */
558 data->start = ms->addr_64;
559 data->end = ms->addr_64 + len;
560 data->msl = msl;
561 return 1;
562 }
563
564 /**
565 * Get the number of virtually-contiguous chunks in the MR.
566 * HW MR does not need to be already created to use this function.
567 *
568 * @param mr
569 * Pointer to the MR.
570 *
571 * @return
572 * Number of chunks.
573 */
574 static uint32_t
575 mr_get_chunk_count(const struct mlx5_mr *mr)
576 {
577 uint32_t i, count = 0;
578 bool was_in_chunk = false;
579 bool is_in_chunk;
580
581 /* There is only one chunk in case of external memory. */
582 if (mr->msl == NULL)
583 return 1;
584 for (i = 0; i < mr->ms_bmp_n; i++) {
585 is_in_chunk = rte_bitmap_get(mr->ms_bmp, i);
586 if (!was_in_chunk && is_in_chunk)
587 count++;
588 was_in_chunk = is_in_chunk;
589 }
590 return count;
591 }
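
/*
 * Illustrative example (not part of the driver): the loop above counts
 * 0->1 transitions in the memseg bitmap, so a bitmap of 1 1 0 1 0 0 1
 * yields 3 virtually-contiguous chunks: {0,1}, {3} and {6}.
 */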
592
593 /**
594 * Thread-safely expand the global MR cache to at least @p size slots.
595 *
596 * @param share_cache
597 * Shared MR cache for locking.
598 * @param size
599 * Desired cache size.
600 * @param socket
601 * NUMA node.
602 *
603 * @return
604 * 0 on success, negative on failure and rte_errno is set.
605 */
606 int
607 mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache,
608 uint32_t size, int socket)
609 {
610 struct mlx5_mr_btree cache = {0};
611 struct mlx5_mr_btree *bt;
612 struct mr_cache_entry *lkp_tbl;
613 int ret;
614
615 size = rte_align32pow2(size);
616 ret = mlx5_mr_btree_init(&cache, size, socket);
617 if (ret < 0)
618 return ret;
619 rte_rwlock_write_lock(&share_cache->rwlock);
620 bt = &share_cache->cache;
621 lkp_tbl = *bt->table;
622 if (cache.size > bt->size) {
623 rte_memcpy(cache.table, lkp_tbl, bt->len * sizeof(lkp_tbl[0]));
624 RTE_SWAP(*bt, cache);
625 DRV_LOG(DEBUG, "Global MR cache expanded to %u slots", size);
626 }
627 rte_rwlock_write_unlock(&share_cache->rwlock);
628 mlx5_mr_btree_free(&cache);
629 return 0;
630 }
631
632 /**
633 * Create a new global Memory Region (MR) for a missing virtual address.
634 * This API should be called from a secondary process; a request is then sent
635 * to the primary process in order to create an MR for the address. As the
636 * global MR list is in shared memory, the following LKey lookup should succeed
637 * unless the request fails.
638 *
639 * @param cdev
640 * Pointer to the mlx5 common device.
641 * @param share_cache
642 * Pointer to a global shared MR cache.
643 * @param[out] entry
644 * Pointer to returning MR cache entry, found in the global cache or newly
645 * created. If failed to create one, this will not be updated.
646 * @param addr
647 * Target virtual address to register.
648 *
649 * @return
650 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
651 */
652 static uint32_t
653 mlx5_mr_create_secondary(struct mlx5_common_device *cdev,
654 struct mlx5_mr_share_cache *share_cache,
655 struct mr_cache_entry *entry, uintptr_t addr)
656 {
657 int ret;
658
659 DRV_LOG(DEBUG, "Requesting MR creation for address (%p)", (void *)addr);
660 ret = mlx5_mp_req_mr_create(cdev, addr);
661 if (ret) {
662 DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)",
663 (void *)addr);
664 return UINT32_MAX;
665 }
666 rte_rwlock_read_lock(&share_cache->rwlock);
667 /* Fill in output data. */
668 mlx5_mr_lookup_cache(share_cache, entry, addr);
669 /* Lookup can't fail. */
670 MLX5_ASSERT(entry->lkey != UINT32_MAX);
671 rte_rwlock_read_unlock(&share_cache->rwlock);
672 DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n"
673 " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
674 (void *)addr, entry->start, entry->end, entry->lkey);
675 return entry->lkey;
676 }
677
678 /**
679 * Create a new global Memory Region (MR) for a missing virtual address.
680 * Register entire virtually contiguous memory chunk around the address.
681 *
682 * @param pd
683 * Pointer to pd of a device (net, regex, vdpa,...).
684 * @param share_cache
685 * Pointer to a global shared MR cache.
686 * @param[out] entry
687 * Pointer to returning MR cache entry, found in the global cache or newly
688 * created. If failed to create one, this will not be updated.
689 * @param addr
690 * Target virtual address to register.
691 * @param mr_ext_memseg_en
692 * Configurable flag indicating whether MR extension over the whole contiguous memseg chunk is enabled.
693 *
694 * @return
695 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
696 */
697 static uint32_t
698 mlx5_mr_create_primary(void *pd,
699 struct mlx5_mr_share_cache *share_cache,
700 struct mr_cache_entry *entry, uintptr_t addr,
701 unsigned int mr_ext_memseg_en)
702 {
703 struct mr_find_contig_memsegs_data data = {.addr = addr, };
704 struct mr_find_contig_memsegs_data data_re;
705 const struct rte_memseg_list *msl;
706 const struct rte_memseg *ms;
707 struct mlx5_mr_btree *bt;
708 struct mlx5_mr *mr = NULL;
709 int ms_idx_shift = -1;
710 uint32_t bmp_size;
711 void *bmp_mem;
712 uint32_t ms_n;
713 uint32_t n;
714 uint32_t chunks_n;
715 size_t len;
716
717 DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
718 /*
719 * Release detached MRs, if any. This can't be called while holding either
720 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have
721 * been detached by the memory free event, but they could not be released
722 * inside the callback due to the deadlock risk. As a result, releasing
723 * resources is quite opportunistic.
724 */
725 mlx5_mr_garbage_collect(share_cache);
726 find_range:
727 /*
728 * If enabled, find the contiguous virtual address chunk in use to which
729 * the given address belongs, in order to register the maximum range.
730 * In the best case, where mempools are not dynamically recreated and
731 * '--socket-mem' is specified as an EAL option, it is very likely to
732 * have only one MR (LKey) per socket and per hugepage size even
733 * though the system memory is highly fragmented. As the whole memory
734 * chunk will be pinned by the kernel, it can't be reused unless the
735 * entire chunk is freed from EAL.
736 *
737 * If disabled, just register one memseg (page). Then, memory
738 * consumption will be minimized, but it may drop performance if there
739 * are many MRs to look up on the datapath.
740 */
741 if (!mr_ext_memseg_en) {
742 data.msl = rte_mem_virt2memseg_list((void *)addr);
743 data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
744 data.end = data.start + data.msl->page_sz;
745 } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
746 DRV_LOG(WARNING,
747 "Unable to find virtually contiguous"
748 " chunk for address (%p)."
749 " rte_memseg_contig_walk() failed.", (void *)addr);
750 rte_errno = ENXIO;
751 goto err_nolock;
752 }
753 alloc_resources:
754 /* Addresses must be page-aligned. */
755 MLX5_ASSERT(data.msl);
756 MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
757 MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
758 msl = data.msl;
759 ms = rte_mem_virt2memseg((void *)data.start, msl);
760 len = data.end - data.start;
761 MLX5_ASSERT(ms);
762 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
763 /* Number of memsegs in the range. */
764 ms_n = len / msl->page_sz;
765 DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
766 " page_sz=0x%" PRIx64 ", ms_n=%u",
767 (void *)addr, data.start, data.end, msl->page_sz, ms_n);
768 /* Size of memory for bitmap. */
769 bmp_size = rte_bitmap_get_memory_footprint(ms_n);
770 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
771 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
772 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
773 if (mr == NULL) {
774 DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of"
775 " address (%p).", (void *)addr);
776 rte_errno = ENOMEM;
777 goto err_nolock;
778 }
779 mr->msl = msl;
780 /*
781 * Save the index of the first memseg and initialize memseg bitmap. To
782 * see if a memseg of ms_idx in the memseg-list is still valid, check:
783 * rte_bitmap_get(mr->ms_bmp, ms_idx - mr->ms_base_idx)
784 */
785 mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
786 bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
787 mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
788 if (mr->ms_bmp == NULL) {
789 DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of"
790 " address (%p).", (void *)addr);
791 rte_errno = EINVAL;
792 goto err_nolock;
793 }
794 /*
795 * Should recheck whether the extended contiguous chunk is still valid.
796 * Because memory_hotplug_lock can't be held if there are any memory-related
797 * calls in a critical path, the resource allocation above can't be
798 * locked. If the memory has been changed at this point, try again with
799 * just a single page. If not, go on with the big chunk atomically from
800 * here.
801 */
802 rte_mcfg_mem_read_lock();
803 data_re = data;
804 if (len > msl->page_sz &&
805 !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
806 DRV_LOG(DEBUG,
807 "Unable to find virtually contiguous chunk for address "
808 "(%p). rte_memseg_contig_walk() failed.", (void *)addr);
809 rte_errno = ENXIO;
810 goto err_memlock;
811 }
812 if (data.start != data_re.start || data.end != data_re.end) {
813 /*
814 * The extended contiguous chunk has been changed. Try again
815 * with a single memseg instead.
816 */
817 data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
818 data.end = data.start + msl->page_sz;
819 rte_mcfg_mem_read_unlock();
820 mlx5_mr_free(mr, share_cache->dereg_mr_cb);
821 goto alloc_resources;
822 }
823 MLX5_ASSERT(data.msl == data_re.msl);
824 rte_rwlock_write_lock(&share_cache->rwlock);
825 /*
826 * Check that the address is really missing. If another thread already
827 * created one or it is not found due to overflow, abort and return.
828 */
829 if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
830 /*
831 * Insert into the global cache table. It may fail due to
832 * low memory. In that case, this entry will have to be looked up
833 * here again.
834 */
835 mr_btree_insert(&share_cache->cache, entry);
836 DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort",
837 (void *)addr);
838 rte_rwlock_write_unlock(&share_cache->rwlock);
839 rte_mcfg_mem_read_unlock();
840 /*
841 * Must be unlocked before calling rte_free() because
842 * mlx5_mr_mem_event_free_cb() can be called inside.
843 */
844 mlx5_mr_free(mr, share_cache->dereg_mr_cb);
845 return entry->lkey;
846 }
847 /*
848 * Trim start and end addresses for verbs MR. Set bits for registering
849 * memsegs but exclude already registered ones. Bitmap can be
850 * fragmented.
851 */
852 for (n = 0; n < ms_n; ++n) {
853 uintptr_t start;
854 struct mr_cache_entry ret;
855
856 memset(&ret, 0, sizeof(ret));
857 start = data_re.start + n * msl->page_sz;
858 /* Exclude memsegs already registered by other MRs. */
859 if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
860 UINT32_MAX) {
861 /*
862 * Start from the first unregistered memseg in the
863 * extended range.
864 */
865 if (ms_idx_shift == -1) {
866 mr->ms_base_idx += n;
867 data.start = start;
868 ms_idx_shift = n;
869 }
870 data.end = start + msl->page_sz;
871 rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
872 ++mr->ms_n;
873 }
874 }
875 len = data.end - data.start;
876 mr->ms_bmp_n = len / msl->page_sz;
877 MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
878 /*
879 * It is now known how many entries will be used in the global cache.
880 * If there is not enough, expand the cache.
881 * This cannot be done while holding the memory hotplug lock.
882 * While it is released, memory layout may change,
883 * so the process must be repeated from the beginning.
884 */
885 bt = &share_cache->cache;
886 chunks_n = mr_get_chunk_count(mr);
887 if (bt->len + chunks_n > bt->size) {
888 struct mlx5_common_device *cdev;
889 uint32_t size;
890
891 size = bt->size + chunks_n;
892 MLX5_ASSERT(size > bt->size);
893 cdev = container_of(share_cache, struct mlx5_common_device,
894 mr_scache);
895 rte_rwlock_write_unlock(&share_cache->rwlock);
896 rte_mcfg_mem_read_unlock();
897 if (mlx5_mr_expand_cache(share_cache, size,
898 cdev->dev->numa_node) < 0) {
899 DRV_LOG(ERR, "Failed to expand global MR cache to %u slots",
900 size);
901 goto err_nolock;
902 }
903 goto find_range;
904 }
905 /*
906 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
907 * be called while holding the memory lock because it doesn't use
908 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
909 * through mlx5_alloc_verbs_buf().
910 */
911 share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
912 if (mr->pmd_mr.obj == NULL) {
913 DRV_LOG(DEBUG, "Fail to create an MR for address (%p)",
914 (void *)addr);
915 rte_errno = EINVAL;
916 goto err_mrlock;
917 }
918 MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start);
919 MLX5_ASSERT(mr->pmd_mr.len);
920 LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
921 DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n"
922 " [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
923 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
924 (void *)mr, (void *)addr, data.start, data.end,
925 rte_cpu_to_be_32(mr->pmd_mr.lkey),
926 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
927 /* Insert to the global cache table. */
928 mlx5_mr_insert_cache(share_cache, mr);
929 /* Fill in output data. */
930 mlx5_mr_lookup_cache(share_cache, entry, addr);
931 /* Lookup can't fail. */
932 MLX5_ASSERT(entry->lkey != UINT32_MAX);
933 rte_rwlock_write_unlock(&share_cache->rwlock);
934 rte_mcfg_mem_read_unlock();
935 return entry->lkey;
936 err_mrlock:
937 rte_rwlock_write_unlock(&share_cache->rwlock);
938 err_memlock:
939 rte_mcfg_mem_read_unlock();
940 err_nolock:
941 /*
942 * In case of error, as this can be called from a datapath, a warning
943 * message per error is preferable instead. Must be unlocked before
944 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
945 * inside.
946 */
947 mlx5_mr_free(mr, share_cache->dereg_mr_cb);
948 return UINT32_MAX;
949 }
950
951 /**
952 * Create a new global Memory Region (MR) for a missing virtual address.
953 * This can be called from primary and secondary process.
954 *
955 * @param cdev
956 * Pointer to the mlx5 common device.
957 * @param share_cache
958 * Pointer to a global shared MR cache.
959 * @param[out] entry
960 * Pointer to returning MR cache entry, found in the global cache or newly
961 * created. If failed to create one, this will not be updated.
962 * @param addr
963 * Target virtual address to register.
964 *
965 * @return
966 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
967 */
968 uint32_t
969 mlx5_mr_create(struct mlx5_common_device *cdev,
970 struct mlx5_mr_share_cache *share_cache,
971 struct mr_cache_entry *entry, uintptr_t addr)
972 {
973 uint32_t ret = 0;
974
975 switch (rte_eal_process_type()) {
976 case RTE_PROC_PRIMARY:
977 ret = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr,
978 cdev->config.mr_ext_memseg_en);
979 break;
980 case RTE_PROC_SECONDARY:
981 ret = mlx5_mr_create_secondary(cdev, share_cache, entry, addr);
982 break;
983 default:
984 break;
985 }
986 return ret;
987 }
988
989 /**
990 * Look up address in the global MR cache table. If not found, create a new MR.
991 * Insert the found/created entry into the local bottom-half cache table.
992 *
993 * @param mr_ctrl
994 * Pointer to per-queue MR control structure.
995 * @param[out] entry
996 * Pointer to returning MR cache entry, found in the global cache or newly
997 * created. If failed to create one, this is not written.
998 * @param addr
999 * Search key.
1000 *
1001 * @return
1002 * Searched LKey on success, UINT32_MAX on no match.
1003 */
1004 static uint32_t
1005 mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl,
1006 struct mr_cache_entry *entry, uintptr_t addr)
1007 {
1008 struct mlx5_mr_share_cache *share_cache =
1009 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
1010 dev_gen);
1011 struct mlx5_common_device *cdev =
1012 container_of(share_cache, struct mlx5_common_device, mr_scache);
1013 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
1014 uint32_t lkey;
1015 uint32_t idx;
1016
1017 /* If local cache table is full, try to double it. */
1018 if (unlikely(bt->len == bt->size))
1019 mr_btree_expand(bt, bt->size << 1);
1020 /* Look up in the global cache. */
1021 rte_rwlock_read_lock(&share_cache->rwlock);
1022 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
1023 if (lkey != UINT32_MAX) {
1024 /* Found. */
1025 *entry = (*share_cache->cache.table)[idx];
1026 rte_rwlock_read_unlock(&share_cache->rwlock);
1027 /*
1028 * Update local cache. Even if it fails, return the found entry
1029 * to update top-half cache. Next time, this entry will be found
1030 * in the global cache.
1031 */
1032 mr_btree_insert(bt, entry);
1033 return lkey;
1034 }
1035 rte_rwlock_read_unlock(&share_cache->rwlock);
1036 /* First time to see the address? Create a new MR. */
1037 lkey = mlx5_mr_create(cdev, share_cache, entry, addr);
1038 /*
1039 * Update the local cache if a new global MR was successfully created. Even
1040 * if creation failed, there's no action to take in this datapath
1041 * code, as the returned LKey is invalid and will eventually make the HW
1042 * fail.
1043 */
1044 if (lkey != UINT32_MAX)
1045 mr_btree_insert(bt, entry);
1046 return lkey;
1047 }
1048
1049 /**
1050 * Bottom-half of LKey search on datapath. First search in cache_bh[] and, if
1051 * that misses, search the global MR cache table and propagate the new entry to
1052 * the per-queue local caches.
1053 *
1054 * @param mr_ctrl
1055 * Pointer to per-queue MR control structure.
1056 * @param addr
1057 * Search key.
1058 *
1059 * @return
1060 * Searched LKey on success, UINT32_MAX on no match.
1061 */
1062 uint32_t
1063 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr)
1064 {
1065 uint32_t lkey;
1066 uint32_t bh_idx = 0;
1067 /* Victim in top-half cache to replace with new entry. */
1068 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
1069
1070 /* Binary-search MR translation table. */
1071 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
1072 /* Update top-half cache. */
1073 if (likely(lkey != UINT32_MAX)) {
1074 *repl = (*mr_ctrl->cache_bh.table)[bh_idx];
1075 } else {
1076 /*
1077 * If missed in local lookup table, search in the global cache
1078 * and local cache_bh[] will be updated inside if possible.
1079 * Top-half cache entry will also be updated.
1080 */
1081 lkey = mr_lookup_caches(mr_ctrl, repl, addr);
1082 if (unlikely(lkey == UINT32_MAX))
1083 return UINT32_MAX;
1084 }
1085 /* Update the most recently used entry. */
1086 mr_ctrl->mru = mr_ctrl->head;
1087 /* Point to the next victim, the oldest. */
1088 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
1089 return lkey;
1090 }
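
/*
 * Usage sketch (illustrative, helper name is hypothetical): datapath code
 * normally probes the linear top-half cache first and only falls back to
 * this bottom-half search on a miss, roughly:
 *
 *	static __rte_always_inline uint32_t
 *	example_addr2mr(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr)
 *	{
 *		struct mr_cache_entry *mru = &mr_ctrl->cache[mr_ctrl->mru];
 *
 *		if (likely(addr >= mru->start && addr < mru->end))
 *			return mru->lkey;
 *		return mlx5_mr_addr2mr_bh(mr_ctrl, addr);
 *	}
 */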
1091
1092 /**
1093 * Release all the created MRs and resources of the global MR cache of a
1094 * device.
1095 *
1096 * @param share_cache
1097 * Pointer to a global shared MR cache.
1098 */
1099 void
1100 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
1101 {
1102 struct mlx5_mr *mr_next;
1103
1104 rte_rwlock_write_lock(&share_cache->rwlock);
1105 /* Detach from MR list and move to free list. */
1106 mr_next = LIST_FIRST(&share_cache->mr_list);
1107 while (mr_next != NULL) {
1108 struct mlx5_mr *mr = mr_next;
1109
1110 mr_next = LIST_NEXT(mr, mr);
1111 LIST_REMOVE(mr, mr);
1112 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
1113 }
1114 LIST_INIT(&share_cache->mr_list);
1115 /* Free global cache. */
1116 mlx5_mr_btree_free(&share_cache->cache);
1117 rte_rwlock_write_unlock(&share_cache->rwlock);
1118 /* Free all remaining MRs. */
1119 mlx5_mr_garbage_collect(share_cache);
1120 }
1121
1122 /**
1123 * Initialize global MR cache of a device.
1124 *
1125 * @param share_cache
1126 * Pointer to a global shared MR cache.
1127 * @param socket
1128 * NUMA socket on which memory must be allocated.
1129 *
1130 * @return
1131 * 0 on success, a negative errno value otherwise and rte_errno is set.
1132 */
1133 int
1134 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket)
1135 {
1136 /* Set the reg_mr and dereg_mr callback functions */
1137 mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb,
1138 &share_cache->dereg_mr_cb);
1139 rte_rwlock_init(&share_cache->rwlock);
1140 rte_rwlock_init(&share_cache->mprwlock);
1141 /* Initialize B-tree and allocate memory for global MR cache table. */
1142 return mlx5_mr_btree_init(&share_cache->cache,
1143 MLX5_MR_BTREE_CACHE_N * 2, socket);
1144 }
1145
1146 /**
1147 * Flush all of the local cache entries.
1148 *
1149 * @param mr_ctrl
1150 * Pointer to per-queue MR local cache.
1151 */
1152 void
1153 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
1154 {
1155 /* Reset the most-recently-used index. */
1156 mr_ctrl->mru = 0;
1157 /* Reset the linear search array. */
1158 mr_ctrl->head = 0;
1159 memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
1160 /* Reset the B-tree table. */
1161 mr_ctrl->cache_bh.len = 1;
1162 /* Update the generation number. */
1163 mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
1164 DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
1165 (void *)mr_ctrl, mr_ctrl->cur_gen);
1166 }
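
/*
 * Usage sketch (illustrative): callers are expected to compare generation
 * numbers before a bottom-half lookup and flush the local caches once the
 * device-wide cache has been rebuilt, e.g.:
 *
 *	if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
 *		mlx5_mr_flush_local_cache(mr_ctrl);
 *	lkey = mlx5_mr_addr2mr_bh(mr_ctrl, addr);
 */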
1167
1168 /**
1169 * Creates a memory region for external memory, that is, memory which is not
1170 * part of the DPDK memory segments.
1171 *
1172 * @param pd
1173 * Pointer to pd of a device (net, regex, vdpa,...).
1174 * @param addr
1175 * Starting virtual address of memory.
1176 * @param len
1177 * Length of memory segment being mapped.
1178 * @param socket_id
1179 * Socket to allocate heap memory for the control structures.
1180 *
1181 * @return
1182 * Pointer to MR structure on success, NULL otherwise.
1183 */
1184 struct mlx5_mr *
1185 mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
1186 mlx5_reg_mr_t reg_mr_cb)
1187 {
1188 struct mlx5_mr *mr = NULL;
1189
1190 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
1191 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
1192 RTE_CACHE_LINE_SIZE, socket_id);
1193 if (mr == NULL)
1194 return NULL;
1195 reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
1196 if (mr->pmd_mr.obj == NULL) {
1197 DRV_LOG(WARNING,
1198 "Fail to create MR for address (%p)",
1199 (void *)addr);
1200 mlx5_free(mr);
1201 return NULL;
1202 }
1203 mr->msl = NULL; /* Mark it is external memory. */
1204 mr->ms_bmp = NULL;
1205 mr->ms_n = 1;
1206 mr->ms_bmp_n = 1;
1207 DRV_LOG(DEBUG,
1208 "MR CREATED (%p) for external memory %p:\n"
1209 " [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
1210 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
1211 (void *)mr, (void *)addr,
1212 addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
1213 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
1214 return mr;
1215 }
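
/*
 * Usage sketch (illustrative): a caller registering externally allocated
 * memory typically creates the MR and then publishes it in the shared cache
 * under the write lock, roughly:
 *
 *	mr = mlx5_create_mr_ext(cdev->pd, (uintptr_t)addr, len,
 *				SOCKET_ID_ANY, cdev->mr_scache.reg_mr_cb);
 *	if (mr == NULL)
 *		return -1;
 *	rte_rwlock_write_lock(&cdev->mr_scache.rwlock);
 *	LIST_INSERT_HEAD(&cdev->mr_scache.mr_list, mr, mr);
 *	mlx5_mr_insert_cache(&cdev->mr_scache, mr);
 *	rte_rwlock_write_unlock(&cdev->mr_scache.rwlock);
 */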
1216
1217 /**
1218 * Callback for memory free event. Iterate over freed memsegs and check whether
1219 * each belongs to an existing MR. If so, clear the corresponding bit in the MR
1220 * bitmap. As a result, the MR becomes fragmented. If it becomes empty, the MR
1221 * will be freed later by mlx5_mr_garbage_collect(). Even if this callback is
1222 * called from a secondary process, the garbage collector will run in the
1223 * primary process, as a secondary process can't call mlx5_mr_create().
1224 *
1225 * The global cache must be rebuilt if there's any change and this event has to
1226 * be propagated to dataplane threads to flush the local caches.
1227 *
1228 * @param share_cache
1229 * Pointer to a global shared MR cache.
1230 * @param ibdev_name
1231 * Name of ibv device.
1232 * @param addr
1233 * Address of freed memory.
1234 * @param len
1235 * Size of freed memory.
1236 */
1237 void
1238 mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
1239 const char *ibdev_name, const void *addr, size_t len)
1240 {
1241 const struct rte_memseg_list *msl;
1242 struct mlx5_mr *mr;
1243 int ms_n;
1244 int i;
1245 int rebuild = 0;
1246
1247 DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
1248 ibdev_name, addr, len);
1249 msl = rte_mem_virt2memseg_list(addr);
1250 /* addr and len must be page-aligned. */
1251 MLX5_ASSERT((uintptr_t)addr ==
1252 RTE_ALIGN((uintptr_t)addr, msl->page_sz));
1253 MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
1254 ms_n = len / msl->page_sz;
1255 rte_rwlock_write_lock(&share_cache->rwlock);
1256 /* Clear bits of freed memsegs from MR. */
1257 for (i = 0; i < ms_n; ++i) {
1258 const struct rte_memseg *ms;
1259 struct mr_cache_entry entry;
1260 uintptr_t start;
1261 int ms_idx;
1262 uint32_t pos;
1263
1264 /* Find MR having this memseg. */
1265 start = (uintptr_t)addr + i * msl->page_sz;
1266 mr = mlx5_mr_lookup_list(share_cache, &entry, start);
1267 if (mr == NULL)
1268 continue;
1269 MLX5_ASSERT(mr->msl); /* Can't be external memory. */
1270 ms = rte_mem_virt2memseg((void *)start, msl);
1271 MLX5_ASSERT(ms != NULL);
1272 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
1273 ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
1274 pos = ms_idx - mr->ms_base_idx;
1275 MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
1276 MLX5_ASSERT(pos < mr->ms_bmp_n);
1277 DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
1278 ibdev_name, (void *)mr, pos, (void *)start);
1279 rte_bitmap_clear(mr->ms_bmp, pos);
1280 if (--mr->ms_n == 0) {
1281 LIST_REMOVE(mr, mr);
1282 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
1283 DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
1284 ibdev_name, (void *)mr);
1285 }
1286 /*
1287 * The MR is fragmented or will be freed. The global cache must be
1288 * rebuilt.
1289 */
1290 rebuild = 1;
1291 }
1292 if (rebuild) {
1293 mlx5_mr_rebuild_cache(share_cache);
1294 /*
1295 * No explicit wmb is needed after updating dev_gen due to
1296 * store-release ordering in unlock that provides the
1297 * implicit barrier at the software visible level.
1298 */
1299 ++share_cache->dev_gen;
1300 DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
1301 share_cache->dev_gen);
1302 }
1303 rte_rwlock_write_unlock(&share_cache->rwlock);
1304 }
1305
1306 /**
1307 * Dump all the created MRs and the global cache entries.
1308 *
1309 * @param share_cache
1310 * Pointer to a global shared MR cache.
1311 */
1312 void
1313 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
1314 {
1315 #ifdef RTE_LIBRTE_MLX5_DEBUG
1316 struct mlx5_mr *mr;
1317 int mr_n = 0;
1318 int chunk_n = 0;
1319
1320 rte_rwlock_read_lock(&share_cache->rwlock);
1321 /* Iterate all the existing MRs. */
1322 LIST_FOREACH(mr, &share_cache->mr_list, mr) {
1323 unsigned int n;
1324
1325 DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
1326 mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey),
1327 mr->ms_n, mr->ms_bmp_n);
1328 if (mr->ms_n == 0)
1329 continue;
1330 for (n = 0; n < mr->ms_bmp_n; ) {
1331 struct mr_cache_entry ret = { 0, };
1332
1333 n = mr_find_next_chunk(mr, &ret, n);
1334 if (!ret.end)
1335 break;
1336 DRV_LOG(DEBUG,
1337 " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
1338 chunk_n++, ret.start, ret.end);
1339 }
1340 }
1341 DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache);
1342 mlx5_mr_btree_dump(&share_cache->cache);
1343 rte_rwlock_read_unlock(&share_cache->rwlock);
1344 #endif
1345 }
1346
1347 static int
1348 mlx5_range_compare_start(const void *lhs, const void *rhs)
1349 {
1350 const struct mlx5_range *r1 = lhs, *r2 = rhs;
1351
1352 if (r1->start > r2->start)
1353 return 1;
1354 else if (r1->start < r2->start)
1355 return -1;
1356 return 0;
1357 }
1358
1359 static void
1360 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque,
1361 struct rte_mempool_memhdr *memhdr,
1362 unsigned int idx)
1363 {
1364 struct mlx5_range *ranges = opaque, *range = &ranges[idx];
1365 uintptr_t start = (uintptr_t)memhdr->addr;
1366 uint64_t page_size = rte_mem_page_size();
1367
1368 RTE_SET_USED(mp);
1369 range->start = RTE_ALIGN_FLOOR(start, page_size);
1370 range->end = RTE_ALIGN_CEIL(start + memhdr->len, page_size);
1371 }
1372
1373 /**
1374 * Collect page-aligned memory ranges of the mempool.
1375 */
1376 static int
1377 mlx5_mempool_get_chunks(struct rte_mempool *mp, struct mlx5_range **out,
1378 unsigned int *out_n)
1379 {
1380 unsigned int n;
1381
1382 DRV_LOG(DEBUG, "Collecting chunks of regular mempool %s", mp->name);
1383 n = mp->nb_mem_chunks;
1384 *out = calloc(n, sizeof(**out));
1385 if (*out == NULL)
1386 return -1;
1387 rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, *out);
1388 *out_n = n;
1389 return 0;
1390 }
1391
1392 struct mlx5_mempool_get_extmem_data {
1393 struct mlx5_range *heap;
1394 unsigned int heap_size;
1395 int ret;
1396 };
1397
1398 static void
1399 mlx5_mempool_get_extmem_cb(struct rte_mempool *mp, void *opaque,
1400 void *obj, unsigned int obj_idx)
1401 {
1402 struct mlx5_mempool_get_extmem_data *data = opaque;
1403 struct rte_mbuf *mbuf = obj;
1404 uintptr_t addr = (uintptr_t)mbuf->buf_addr;
1405 struct mlx5_range *seg, *heap;
1406 struct rte_memseg_list *msl;
1407 size_t page_size;
1408 uintptr_t page_start;
1409 unsigned int pos = 0, len = data->heap_size, delta;
1410
1411 RTE_SET_USED(mp);
1412 RTE_SET_USED(obj_idx);
1413 if (data->ret < 0)
1414 return;
1415 /* Binary search for an already visited page. */
1416 while (len > 1) {
1417 delta = len / 2;
1418 if (addr < data->heap[pos + delta].start) {
1419 len = delta;
1420 } else {
1421 pos += delta;
1422 len -= delta;
1423 }
1424 }
1425 if (data->heap != NULL) {
1426 seg = &data->heap[pos];
1427 if (seg->start <= addr && addr < seg->end)
1428 return;
1429 }
1430 /* Determine the page boundaries and remember them. */
1431 heap = realloc(data->heap, sizeof(heap[0]) * (data->heap_size + 1));
1432 if (heap == NULL) {
1433 free(data->heap);
1434 data->heap = NULL;
1435 data->ret = -1;
1436 return;
1437 }
1438 data->heap = heap;
1439 data->heap_size++;
1440 seg = &heap[data->heap_size - 1];
1441 msl = rte_mem_virt2memseg_list((void *)addr);
1442 page_size = msl != NULL ? msl->page_sz : rte_mem_page_size();
1443 page_start = RTE_PTR_ALIGN_FLOOR(addr, page_size);
1444 seg->start = page_start;
1445 seg->end = page_start + page_size;
1446 /* Maintain the heap order. */
1447 qsort(data->heap, data->heap_size, sizeof(heap[0]),
1448 mlx5_range_compare_start);
1449 }
1450
1451 /**
1452 * Recover, as closely as possible, the pages of external memory backing
1453 * a mempool with RTE_PKTMBUF_POOL_PINNED_EXT_BUF.
1454 * Pages are stored in a heap for efficient search, since mbufs are many.
1455 */
1456 static int
1457 mlx5_mempool_get_extmem(struct rte_mempool *mp, struct mlx5_range **out,
1458 unsigned int *out_n)
1459 {
1460 struct mlx5_mempool_get_extmem_data data;
1461
1462 DRV_LOG(DEBUG, "Recovering external pinned pages of mempool %s",
1463 mp->name);
1464 memset(&data, 0, sizeof(data));
1465 rte_mempool_obj_iter(mp, mlx5_mempool_get_extmem_cb, &data);
1466 *out = data.heap;
1467 *out_n = data.heap_size;
1468 return data.ret;
1469 }
1470
1471 /**
1472 * Get VA-contiguous ranges of the mempool memory.
1473 * Each range start and end is aligned to the system page size.
1474 *
1475 * @param[in] mp
1476 * Analyzed mempool.
1477 * @param[in] is_extmem
1478 * Whether the pool contains only external pinned buffers.
1479 * @param[out] out
1480 * Receives the ranges, caller must release it with free().
1481 * @param[out] out_n
1482 * Receives the number of @p out elements.
1483 *
1484 * @return
1485 * 0 on success, (-1) on failure.
1486 */
1487 static int
1488 mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem,
1489 struct mlx5_range **out, unsigned int *out_n)
1490 {
1491 struct mlx5_range *chunks;
1492 unsigned int chunks_n, contig_n, i;
1493 int ret;
1494
1495 /* Collect the pool underlying memory. */
1496 ret = is_extmem ? mlx5_mempool_get_extmem(mp, &chunks, &chunks_n) :
1497 mlx5_mempool_get_chunks(mp, &chunks, &chunks_n);
1498 if (ret < 0)
1499 return ret;
1500 /* Merge adjacent chunks and place them at the beginning. */
1501 qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start);
1502 contig_n = 1;
1503 for (i = 1; i < chunks_n; i++)
1504 if (chunks[i - 1].end != chunks[i].start) {
1505 chunks[contig_n - 1].end = chunks[i - 1].end;
1506 chunks[contig_n] = chunks[i];
1507 contig_n++;
1508 }
1509 /* Extend the last contiguous chunk to the end of the mempool. */
1510 chunks[contig_n - 1].end = chunks[i - 1].end;
1511 *out = chunks;
1512 *out_n = contig_n;
1513 return 0;
1514 }
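
/*
 * Illustrative example (not part of the driver): after sorting by start
 * address, adjacent page-aligned chunks are coalesced in place. The chunks
 * [0x1000, 0x3000), [0x3000, 0x5000) and [0x8000, 0x9000) collapse into the
 * two ranges [0x1000, 0x5000) and [0x8000, 0x9000); only the first contig_n
 * elements of the array remain meaningful.
 */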
1515
1516 /**
1517 * Analyze mempool memory to select memory ranges to register.
1518 *
1519 * @param[in] mp
1520 * Mempool to analyze.
1521 * @param[in] is_extmem
1522 * Whether the pool contains only external pinned buffers.
1523 * @param[out] out
1524 * Receives memory ranges to register, aligned to the system page size.
1525 * The caller must release them with free().
1526 * @param[out] out_n
1527 * Receives the number of @p out items.
1528 * @param[out] share_hugepage
1529 * Receives True if the entire pool resides within a single hugepage.
1530 *
1531 * @return
1532 * 0 on success, (-1) on failure.
1533 */
1534 static int
1535 mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
1536 struct mlx5_range **out, unsigned int *out_n,
1537 bool *share_hugepage)
1538 {
1539 struct mlx5_range *ranges = NULL;
1540 unsigned int i, ranges_n = 0;
1541 struct rte_memseg_list *msl;
1542
1543 if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) {
1544 DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
1545 mp->name);
1546 return -1;
1547 }
1548 /* Check if the hugepage of the pool can be shared. */
1549 *share_hugepage = false;
1550 msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
1551 if (msl != NULL) {
1552 uint64_t hugepage_sz = 0;
1553
1554 /* Check that all ranges are on pages of the same size. */
1555 for (i = 0; i < ranges_n; i++) {
1556 if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
1557 break;
1558 hugepage_sz = msl->page_sz;
1559 }
1560 if (i == ranges_n) {
1561 /*
1562 * If the entire pool is within one hugepage,
1563 * combine all ranges into one of the hugepage size.
1564 */
1565 uintptr_t reg_start = ranges[0].start;
1566 uintptr_t reg_end = ranges[ranges_n - 1].end;
1567 uintptr_t hugepage_start =
1568 RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
1569 uintptr_t hugepage_end = hugepage_start + hugepage_sz;
1570 if (reg_end < hugepage_end) {
1571 ranges[0].start = hugepage_start;
1572 ranges[0].end = hugepage_end;
1573 ranges_n = 1;
1574 *share_hugepage = true;
1575 }
1576 }
1577 }
1578 *out = ranges;
1579 *out_n = ranges_n;
1580 return 0;
1581 }
1582
1583 /** Create a registration object for the mempool. */
1584 static struct mlx5_mempool_reg *
1585 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n,
1586 bool is_extmem)
1587 {
1588 struct mlx5_mempool_reg *mpr = NULL;
1589
1590 mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
1591 sizeof(struct mlx5_mempool_reg),
1592 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
1593 if (mpr == NULL) {
1594 DRV_LOG(ERR, "Cannot allocate mempool %s registration object",
1595 mp->name);
1596 return NULL;
1597 }
1598 mpr->mrs = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
1599 mrs_n * sizeof(struct mlx5_mempool_mr),
1600 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
1601 if (!mpr->mrs) {
1602 DRV_LOG(ERR, "Cannot allocate mempool %s registration MRs",
1603 mp->name);
1604 mlx5_free(mpr);
1605 return NULL;
1606 }
1607 mpr->mp = mp;
1608 mpr->mrs_n = mrs_n;
1609 mpr->is_extmem = is_extmem;
1610 return mpr;
1611 }
1612
1613 /**
1614 * Destroy a mempool registration object.
1615 *
1616 * @param standalone
1617 * Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
1618 */
1619 static void
1620 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
1621 struct mlx5_mempool_reg *mpr, bool standalone)
1622 {
1623 if (standalone) {
1624 unsigned int i;
1625
1626 for (i = 0; i < mpr->mrs_n; i++)
1627 share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
1628 mlx5_free(mpr->mrs);
1629 }
1630 mlx5_free(mpr);
1631 }
1632
1633 /** Find registration object of a mempool. */
1634 static struct mlx5_mempool_reg *
1635 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache,
1636 struct rte_mempool *mp)
1637 {
1638 struct mlx5_mempool_reg *mpr;
1639
1640 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
1641 if (mpr->mp == mp)
1642 break;
1643 return mpr;
1644 }
1645
1646 /** Increment reference counters of MRs used in the registration. */
1647 static void
1648 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr)
1649 {
1650 unsigned int i;
1651
1652 for (i = 0; i < mpr->mrs_n; i++)
1653 rte_atomic_fetch_add_explicit(&mpr->mrs[i].refcnt, 1, rte_memory_order_relaxed);
1654 }
1655
1656 /**
1657 * Decrement reference counters of MRs used in the registration.
1658 *
1659 * @return True if no more references to @p mpr MRs exist, False otherwise.
1660 */
1661 static bool
1662 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr)
1663 {
1664 unsigned int i;
1665 bool ret = false;
1666
1667 for (i = 0; i < mpr->mrs_n; i++)
1668 ret |= rte_atomic_fetch_sub_explicit(&mpr->mrs[i].refcnt, 1,
1669 rte_memory_order_relaxed) - 1 == 0;
1670 return ret;
1671 }
1672
1673 static int
1674 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
1675 void *pd, struct rte_mempool *mp,
1676 bool is_extmem)
1677 {
1678 struct mlx5_range *ranges = NULL;
1679 struct mlx5_mempool_reg *mpr, *old_mpr, *new_mpr;
1680 unsigned int i, ranges_n;
1681 bool share_hugepage, standalone = false;
1682 int ret = -1;
1683
1684 /* Early check to avoid unnecessary creation of MRs. */
1685 rte_rwlock_read_lock(&share_cache->rwlock);
1686 old_mpr = mlx5_mempool_reg_lookup(share_cache, mp);
1687 rte_rwlock_read_unlock(&share_cache->rwlock);
1688 if (old_mpr != NULL && (!is_extmem || old_mpr->is_extmem)) {
1689 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
1690 mp->name, pd);
1691 rte_errno = EEXIST;
1692 goto exit;
1693 }
1694 if (mlx5_mempool_reg_analyze(mp, is_extmem, &ranges, &ranges_n,
1695 &share_hugepage) < 0) {
1696 DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name);
1697 rte_errno = ENOMEM;
1698 goto exit;
1699 }
1700 new_mpr = mlx5_mempool_reg_create(mp, ranges_n, is_extmem);
1701 if (new_mpr == NULL) {
1702 DRV_LOG(ERR,
1703 "Cannot create a registration object for mempool %s in PD %p",
1704 mp->name, pd);
1705 rte_errno = ENOMEM;
1706 goto exit;
1707 }
1708 /*
1709 * If the entire mempool fits in a single hugepage, the MR for this
1710 * hugepage can be shared across mempools that also fit in it.
1711 */
1712 if (share_hugepage) {
1713 rte_rwlock_write_lock(&share_cache->rwlock);
1714 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) {
1715 if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start)
1716 break;
1717 }
1718 if (mpr != NULL) {
1719 new_mpr->mrs = mpr->mrs;
1720 mlx5_mempool_reg_attach(new_mpr);
1721 LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
1722 new_mpr, next);
1723 }
1724 rte_rwlock_write_unlock(&share_cache->rwlock);
1725 if (mpr != NULL) {
1726 DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s",
1727 mpr->mrs[0].pmd_mr.lkey, pd, mp->name,
1728 mpr->mp->name);
1729 ret = 0;
1730 goto exit;
1731 }
1732 }
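	/* Create a dedicated MR for each contiguous virtual range. */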
	for (i = 0; i < ranges_n; i++) {
		struct mlx5_mempool_mr *mr = &new_mpr->mrs[i];
		const struct mlx5_range *range = &ranges[i];
		size_t len = range->end - range->start;

		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
					   &mr->pmd_mr) < 0) {
			DRV_LOG(ERR,
				"Failed to create an MR in PD %p for address range "
				"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
				pd, range->start, range->end, len, mp->name);
			break;
		}
		DRV_LOG(DEBUG,
			"Created a new MR %#x in PD %p for address range "
			"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
			mr->pmd_mr.lkey, pd, range->start, range->end, len,
			mp->name);
	}
	if (i != ranges_n) {
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EINVAL;
		goto exit;
	}
	/* Concurrent registration is not supposed to happen. */
	rte_rwlock_write_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr == old_mpr && old_mpr != NULL) {
		LIST_REMOVE(old_mpr, next);
		standalone = mlx5_mempool_reg_detach(mpr);
		/* No need to flush the cache: old MRs cannot be in use. */
		mpr = NULL;
	}
	if (mpr == NULL) {
		mlx5_mempool_reg_attach(new_mpr);
		LIST_INSERT_HEAD(&share_cache->mempool_reg_list, new_mpr, next);
		ret = 0;
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EEXIST;
		goto exit;
	} else if (old_mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory",
			mp->name, pd);
		mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone);
	}
exit:
	free(ranges);
	return ret;
}

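/** Ask the primary process to register the mempool over the MP IPC channel. */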
static int
mlx5_mr_mempool_register_secondary(struct mlx5_common_device *cdev,
				   struct rte_mempool *mp, bool is_extmem)
{
	return mlx5_mp_req_mempool_reg(cdev, mp, true, is_extmem);
}

/**
 * Register the memory of a mempool in the protection domain.
 *
 * @param cdev
 *   Pointer to the mlx5 common device.
 * @param mp
 *   Mempool to register.
 * @param is_extmem
 *   Whether the mempool memory is external and pinned.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_register(struct mlx5_common_device *cdev,
			 struct rte_mempool *mp, bool is_extmem)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_register_primary(&cdev->mr_scache,
							cdev->pd, mp,
							is_extmem);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_register_secondary(cdev, mp, is_extmem);
	default:
		return -1;
	}
}
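
/*
 * Illustrative usage sketch (not part of the driver): a PMD Rx setup path
 * would typically call, with hypothetical "cdev" and "rxq_mp" variables:
 *
 *	if (mlx5_mr_mempool_register(cdev, rxq_mp, false) < 0 &&
 *	    rte_errno != EEXIST)
 *		return -rte_errno;
 *
 * EEXIST is not treated as a failure here: the mempool may already have been
 * registered for this device by another queue.
 */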

static int
mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
				   struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;
	bool standalone = false;

	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp) {
			LIST_REMOVE(mpr, next);
			standalone = mlx5_mempool_reg_detach(mpr);
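			/*
			 * Bump dev_gen only when the MRs are destroyed below
			 * (no other mempool shares them), so that per-queue
			 * caches drop the stale entries.
			 */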
			if (standalone)
				/*
				 * The unlock operation below provides a memory
				 * barrier due to its store-release semantics.
				 */
				++share_cache->dev_gen;
			break;
		}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		rte_errno = ENOENT;
		return -1;
	}
	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
	return 0;
}

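/** Ask the primary process to unregister the mempool over the MP IPC channel. */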
static int
mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev,
				     struct rte_mempool *mp)
{
	return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */);
}

/**
 * Unregister the memory of a mempool from the protection domain.
 *
 * @param cdev
 *   Pointer to the mlx5 common device.
 * @param mp
 *   Mempool to unregister.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev,
			   struct rte_mempool *mp)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_unregister_secondary(cdev, mp);
	default:
		return -1;
	}
}
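
/*
 * Illustrative usage sketch (hypothetical "cdev" and "rxq_mp" names): a queue
 * or device teardown path would drop the registration taken above:
 *
 *	if (mlx5_mr_mempool_unregister(cdev, rxq_mp) < 0 &&
 *	    rte_errno != ENOENT)
 *		DRV_LOG(WARNING, "Cannot unregister mempool %s", rxq_mp->name);
 *
 * ENOENT only means the mempool was never registered for this device.
 */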

/**
 * Look up an MR key by address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_start = (uintptr_t)mr->addr;
		uintptr_t mr_end = mr_start + mr->len;

		if (mr_start <= addr && addr < mr_end) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_start;
			entry->end = mr_end;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update bottom-half cache from the list of mempool registrations.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update local cache. Even if it fails, return the found entry
	 * to update top-half cache. Next time, this entry will be found
	 * in the global cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Populate cache with LKeys of all MRs used by the mempool.
 * It is intended to be used to register Rx mempools in advance.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Registered memory pool.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_populate_cache(struct mlx5_mr_ctrl *mr_ctrl,
			       struct rte_mempool *mp)
{
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	unsigned int i;

	/*
	 * The registration remains valid after the lock is released,
	 * because this function is only called for an already
	 * registered mempool.
	 */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		DRV_LOG(ERR, "Mempool %s is not registered", mp->name);
		rte_errno = ENOENT;
		return -1;
	}
	for (i = 0; i < mpr->mrs_n; i++) {
		struct mlx5_mempool_mr *mr = &mpr->mrs[i];
		struct mr_cache_entry entry;
		uint32_t lkey;
		uint32_t idx;

		lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr);
		if (lkey != UINT32_MAX)
			continue;
		if (bt->len == bt->size)
			mr_btree_expand(bt, bt->size << 1);
		entry.start = (uintptr_t)mr->pmd_mr.addr;
		entry.end = entry.start + mr->pmd_mr.len;
		entry.lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		if (mr_btree_insert(bt, &entry) < 0) {
			DRV_LOG(ERR, "Cannot insert cache entry for mempool %s MR %08x",
				mp->name, entry.lkey);
			rte_errno = EINVAL;
			return -1;
		}
	}
	return 0;
}
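
/*
 * Illustrative usage sketch (hypothetical "rxq" and "rxq_mp" names): after
 * registering an Rx mempool, a queue start path can warm up the per-queue
 * cache so that the first packets avoid the bottom-half lookup:
 *
 *	if (mlx5_mr_mempool_populate_cache(&rxq->mr_ctrl, rxq_mp) < 0)
 *		DRV_LOG(WARNING, "Cannot pre-populate MR cache for %s",
 *			rxq_mp->name);
 */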

/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint32_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}

uint32_t
mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb)
{
	struct rte_mempool *mp;
	struct mlx5_mprq_buf *buf;
	uint32_t lkey;
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_common_device *cdev =
		container_of(share_cache, struct mlx5_common_device, mr_scache);
	bool external, mprq, pinned = false;

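	/*
	 * Classify the buffer: MPRQ buffers and mbufs from mempools with
	 * pinned external buffers can be resolved through the mempool
	 * registration; other external buffers are handled by the generic
	 * per-address lookup at the end of this function.
	 */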
	/* Recover MPRQ mempool. */
	external = RTE_MBUF_HAS_EXTBUF(mb);
	if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) {
		mprq = true;
		buf = mb->shinfo->fcb_opaque;
		mp = buf->mp;
	} else {
		mprq = false;
		mp = mlx5_mb2mp(mb);
		pinned = rte_pktmbuf_priv_flags(mp) &
			 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
	}
	if (!external || mprq || pinned) {
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		if (lkey != UINT32_MAX)
			return lkey;
		/* MPRQ is always registered. */
		MLX5_ASSERT(!mprq);
	}
	/* Register pinned external memory if the mempool is not used for Rx. */
	if (cdev->config.mr_mempool_reg_en && pinned) {
		if (mlx5_mr_mempool_register(cdev, mp, true) < 0)
			return UINT32_MAX;
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		MLX5_ASSERT(lkey != UINT32_MAX);
		return lkey;
	}
	/* Fallback to generic mechanism in corner cases. */
	return mlx5_mr_addr2mr_bh(mr_ctrl, addr);
}
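
/*
 * Illustrative fast-path sketch: datapath code typically probes the per-queue
 * linear (top-half) cache first and calls the bottom half above only on a
 * miss. probe_linear_cache() below is a hypothetical placeholder for that
 * inline lookup:
 *
 *	lkey = probe_linear_cache(mr_ctrl, (uintptr_t)mb->buf_addr);
 *	if (unlikely(lkey == UINT32_MAX))
 *		lkey = mlx5_mr_mb2mr_bh(mr_ctrl, mb);
 */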