1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2016 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 #include <stddef.h> 6 7 #include <rte_eal_memconfig.h> 8 #include <rte_eal_paging.h> 9 #include <rte_errno.h> 10 #include <rte_mempool.h> 11 #include <rte_malloc.h> 12 #include <rte_rwlock.h> 13 14 #include "mlx5_glue.h" 15 #include "mlx5_common.h" 16 #include "mlx5_common_mp.h" 17 #include "mlx5_common_mr.h" 18 #include "mlx5_common_os.h" 19 #include "mlx5_common_log.h" 20 #include "mlx5_malloc.h" 21 22 struct mr_find_contig_memsegs_data { 23 uintptr_t addr; 24 uintptr_t start; 25 uintptr_t end; 26 const struct rte_memseg_list *msl; 27 }; 28 29 /* Virtual memory range. */ 30 struct mlx5_range { 31 uintptr_t start; 32 uintptr_t end; 33 }; 34 35 /** Memory region for a mempool. */ 36 struct mlx5_mempool_mr { 37 struct mlx5_pmd_mr pmd_mr; 38 uint32_t refcnt; /**< Number of mempools sharing this MR. */ 39 }; 40 41 /* Mempool registration. */ 42 struct mlx5_mempool_reg { 43 LIST_ENTRY(mlx5_mempool_reg) next; 44 /** Registered mempool, used to designate registrations. */ 45 struct rte_mempool *mp; 46 /** Memory regions for the address ranges of the mempool. */ 47 struct mlx5_mempool_mr *mrs; 48 /** Number of memory regions. */ 49 unsigned int mrs_n; 50 /** Whether the MR were created for external pinned memory. */ 51 bool is_extmem; 52 }; 53 54 void 55 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 56 { 57 struct mlx5_mprq_buf *buf = opaque; 58 59 if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) { 60 rte_mempool_put(buf->mp, buf); 61 } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1, 62 __ATOMIC_RELAXED) == 0)) { 63 __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED); 64 rte_mempool_put(buf->mp, buf); 65 } 66 } 67 68 /** 69 * Expand B-tree table to a given size. Can't be called with holding 70 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc(). 71 * 72 * @param bt 73 * Pointer to B-tree structure. 74 * @param n 75 * Number of entries for expansion. 76 * 77 * @return 78 * 0 on success, -1 on failure. 79 */ 80 static int 81 mr_btree_expand(struct mlx5_mr_btree *bt, int n) 82 { 83 void *mem; 84 int ret = 0; 85 86 if (n <= bt->size) 87 return ret; 88 /* 89 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is 90 * used inside if there's no room to expand. Because this is a quite 91 * rare case and a part of very slow path, it is very acceptable. 92 * Initially cache_bh[] will be given practically enough space and once 93 * it is expanded, expansion wouldn't be needed again ever. 94 */ 95 mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO, 96 n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY); 97 if (mem == NULL) { 98 /* Not an error, B-tree search will be skipped. */ 99 DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table", 100 (void *)bt); 101 ret = -1; 102 } else { 103 DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n); 104 bt->table = mem; 105 bt->size = n; 106 } 107 return ret; 108 } 109 110 /** 111 * Look up LKey from given B-tree lookup table, store the last index and return 112 * searched LKey. 113 * 114 * @param bt 115 * Pointer to B-tree structure. 116 * @param[out] idx 117 * Pointer to index. Even on search failure, returns index where it stops 118 * searching so that index can be used when inserting a new entry. 119 * @param addr 120 * Search key. 121 * 122 * @return 123 * Searched LKey on success, UINT32_MAX on no match. 
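 *
 * A minimal lookup sketch, mirroring how mlx5_mr_addr2mr_bh() below
 * consumes the result (the returned index is meaningful even on a miss):
 * @code
 *   struct mr_cache_entry entry;
 *   uint16_t idx = 0;
 *   uint32_t lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &idx, addr);
 *
 *   if (lkey != UINT32_MAX)
 *       entry = (*mr_ctrl->cache_bh.table)[idx]; // Hit: copy the cached entry.
 *   // On a miss, idx is where the search stopped and the slow path may
 *   // insert a new entry there.
 * @endcode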
124 */ 125 static uint32_t 126 mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr) 127 { 128 struct mr_cache_entry *lkp_tbl; 129 uint16_t n; 130 uint16_t base = 0; 131 132 MLX5_ASSERT(bt != NULL); 133 lkp_tbl = *bt->table; 134 n = bt->len; 135 /* First entry must be NULL for comparison. */ 136 MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 && 137 lkp_tbl[0].lkey == UINT32_MAX)); 138 /* Binary search. */ 139 do { 140 register uint16_t delta = n >> 1; 141 142 if (addr < lkp_tbl[base + delta].start) { 143 n = delta; 144 } else { 145 base += delta; 146 n -= delta; 147 } 148 } while (n > 1); 149 MLX5_ASSERT(addr >= lkp_tbl[base].start); 150 *idx = base; 151 if (addr < lkp_tbl[base].end) 152 return lkp_tbl[base].lkey; 153 /* Not found. */ 154 return UINT32_MAX; 155 } 156 157 /** 158 * Insert an entry to B-tree lookup table. 159 * 160 * @param bt 161 * Pointer to B-tree structure. 162 * @param entry 163 * Pointer to new entry to insert. 164 * 165 * @return 166 * 0 on success, -1 on failure. 167 */ 168 static int 169 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry) 170 { 171 struct mr_cache_entry *lkp_tbl; 172 uint16_t idx = 0; 173 size_t shift; 174 175 MLX5_ASSERT(bt != NULL); 176 MLX5_ASSERT(bt->len <= bt->size); 177 MLX5_ASSERT(bt->len > 0); 178 lkp_tbl = *bt->table; 179 /* Find out the slot for insertion. */ 180 if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { 181 DRV_LOG(DEBUG, 182 "abort insertion to B-tree(%p): already exist at" 183 " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 184 (void *)bt, idx, entry->start, entry->end, entry->lkey); 185 /* Already exist, return. */ 186 return 0; 187 } 188 /* If table is full, return error. */ 189 if (unlikely(bt->len == bt->size)) { 190 bt->overflow = 1; 191 return -1; 192 } 193 /* Insert entry. */ 194 ++idx; 195 shift = (bt->len - idx) * sizeof(struct mr_cache_entry); 196 if (shift) 197 memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); 198 lkp_tbl[idx] = *entry; 199 bt->len++; 200 DRV_LOG(DEBUG, 201 "inserted B-tree(%p)[%u]," 202 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 203 (void *)bt, idx, entry->start, entry->end, entry->lkey); 204 return 0; 205 } 206 207 /** 208 * Initialize B-tree and allocate memory for lookup table. 209 * 210 * @param bt 211 * Pointer to B-tree structure. 212 * @param n 213 * Number of entries to allocate. 214 * @param socket 215 * NUMA socket on which memory must be allocated. 216 * 217 * @return 218 * 0 on success, a negative errno value otherwise and rte_errno is set. 219 */ 220 static int 221 mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket) 222 { 223 if (bt == NULL) { 224 rte_errno = EINVAL; 225 return -rte_errno; 226 } 227 MLX5_ASSERT(!bt->table && !bt->size); 228 memset(bt, 0, sizeof(*bt)); 229 bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 230 sizeof(struct mr_cache_entry) * n, 231 0, socket); 232 if (bt->table == NULL) { 233 rte_errno = ENOMEM; 234 DRV_LOG(DEBUG, 235 "failed to allocate memory for btree cache on socket " 236 "%d", socket); 237 return -rte_errno; 238 } 239 bt->size = n; 240 /* First entry must be NULL for binary search. */ 241 (*bt->table)[bt->len++] = (struct mr_cache_entry) { 242 .lkey = UINT32_MAX, 243 }; 244 DRV_LOG(DEBUG, "initialized B-tree %p with table %p", 245 (void *)bt, (void *)bt->table); 246 return 0; 247 } 248 249 /** 250 * Free B-tree resources. 251 * 252 * @param bt 253 * Pointer to B-tree structure. 
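 *
 * A lifecycle sketch using only names from this file (socket stands for the
 * target NUMA node; per-queue bottom-half caches are sized with
 * MLX5_MR_BTREE_CACHE_N):
 * @code
 *   struct mlx5_mr_btree bt;
 *
 *   memset(&bt, 0, sizeof(bt));
 *   if (mlx5_mr_btree_init(&bt, MLX5_MR_BTREE_CACHE_N, socket) == 0) {
 *       // ... mr_btree_insert() / mr_btree_lookup() on &bt ...
 *       mlx5_mr_btree_free(&bt);
 *   }
 * @endcode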
254 */ 255 void 256 mlx5_mr_btree_free(struct mlx5_mr_btree *bt) 257 { 258 if (bt == NULL) 259 return; 260 DRV_LOG(DEBUG, "freeing B-tree %p with table %p", 261 (void *)bt, (void *)bt->table); 262 mlx5_free(bt->table); 263 memset(bt, 0, sizeof(*bt)); 264 } 265 266 /** 267 * Dump all the entries in a B-tree 268 * 269 * @param bt 270 * Pointer to B-tree structure. 271 */ 272 void 273 mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused) 274 { 275 #ifdef RTE_LIBRTE_MLX5_DEBUG 276 int idx; 277 struct mr_cache_entry *lkp_tbl; 278 279 if (bt == NULL) 280 return; 281 lkp_tbl = *bt->table; 282 for (idx = 0; idx < bt->len; ++idx) { 283 struct mr_cache_entry *entry = &lkp_tbl[idx]; 284 285 DRV_LOG(DEBUG, "B-tree(%p)[%u]," 286 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 287 (void *)bt, idx, entry->start, entry->end, entry->lkey); 288 } 289 #endif 290 } 291 292 /** 293 * Initialize per-queue MR control descriptor. 294 * 295 * @param mr_ctrl 296 * Pointer to MR control structure. 297 * @param dev_gen_ptr 298 * Pointer to generation number of global cache. 299 * @param socket 300 * NUMA socket on which memory must be allocated. 301 * 302 * @return 303 * 0 on success, a negative errno value otherwise and rte_errno is set. 304 */ 305 int 306 mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr, 307 int socket) 308 { 309 if (mr_ctrl == NULL) { 310 rte_errno = EINVAL; 311 return -rte_errno; 312 } 313 /* Save pointer of global generation number to check memory event. */ 314 mr_ctrl->dev_gen_ptr = dev_gen_ptr; 315 /* Initialize B-tree and allocate memory for bottom-half cache table. */ 316 return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N, 317 socket); 318 } 319 320 /** 321 * Find virtually contiguous memory chunk in a given MR. 322 * 323 * @param dev 324 * Pointer to MR structure. 325 * @param[out] entry 326 * Pointer to returning MR cache entry. If not found, this will not be 327 * updated. 328 * @param start_idx 329 * Start index of the memseg bitmap. 330 * 331 * @return 332 * Next index to go on lookup. 333 */ 334 static int 335 mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry, 336 int base_idx) 337 { 338 uintptr_t start = 0; 339 uintptr_t end = 0; 340 uint32_t idx = 0; 341 342 /* MR for external memory doesn't have memseg list. */ 343 if (mr->msl == NULL) { 344 MLX5_ASSERT(mr->ms_bmp_n == 1); 345 MLX5_ASSERT(mr->ms_n == 1); 346 MLX5_ASSERT(base_idx == 0); 347 /* 348 * Can't search it from memseg list but get it directly from 349 * pmd_mr as there's only one chunk. 350 */ 351 entry->start = (uintptr_t)mr->pmd_mr.addr; 352 entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len; 353 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 354 /* Returning 1 ends iteration. */ 355 return 1; 356 } 357 for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { 358 if (rte_bitmap_get(mr->ms_bmp, idx)) { 359 const struct rte_memseg_list *msl; 360 const struct rte_memseg *ms; 361 362 msl = mr->msl; 363 ms = rte_fbarray_get(&msl->memseg_arr, 364 mr->ms_base_idx + idx); 365 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 366 if (!start) 367 start = ms->addr_64; 368 end = ms->addr_64 + ms->hugepage_sz; 369 } else if (start) { 370 /* Passed the end of a fragment. */ 371 break; 372 } 373 } 374 if (start) { 375 /* Found one chunk. */ 376 entry->start = start; 377 entry->end = end; 378 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 379 } 380 return idx; 381 } 382 383 /** 384 * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. 
385 * Then, this entry will have to be searched by mr_lookup_list() in 386 * mlx5_mr_create() on miss. 387 * 388 * @param share_cache 389 * Pointer to a global shared MR cache. 390 * @param mr 391 * Pointer to MR to insert. 392 * 393 * @return 394 * 0 on success, -1 on failure. 395 */ 396 int 397 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache, 398 struct mlx5_mr *mr) 399 { 400 unsigned int n; 401 402 DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)", 403 (void *)mr, (void *)share_cache); 404 for (n = 0; n < mr->ms_bmp_n; ) { 405 struct mr_cache_entry entry; 406 407 memset(&entry, 0, sizeof(entry)); 408 /* Find a contiguous chunk and advance the index. */ 409 n = mr_find_next_chunk(mr, &entry, n); 410 if (!entry.end) 411 break; 412 if (mr_btree_insert(&share_cache->cache, &entry) < 0) { 413 /* 414 * Overflowed, but the global table cannot be expanded 415 * because of deadlock. 416 */ 417 return -1; 418 } 419 } 420 return 0; 421 } 422 423 /** 424 * Look up address in the original global MR list. 425 * 426 * @param share_cache 427 * Pointer to a global shared MR cache. 428 * @param[out] entry 429 * Pointer to returning MR cache entry. If no match, this will not be updated. 430 * @param addr 431 * Search key. 432 * 433 * @return 434 * Found MR on match, NULL otherwise. 435 */ 436 struct mlx5_mr * 437 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache, 438 struct mr_cache_entry *entry, uintptr_t addr) 439 { 440 struct mlx5_mr *mr; 441 442 /* Iterate all the existing MRs. */ 443 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 444 unsigned int n; 445 446 if (mr->ms_n == 0) 447 continue; 448 for (n = 0; n < mr->ms_bmp_n; ) { 449 struct mr_cache_entry ret; 450 451 memset(&ret, 0, sizeof(ret)); 452 n = mr_find_next_chunk(mr, &ret, n); 453 if (addr >= ret.start && addr < ret.end) { 454 /* Found. */ 455 *entry = ret; 456 return mr; 457 } 458 } 459 } 460 return NULL; 461 } 462 463 /** 464 * Look up address on global MR cache. 465 * 466 * @param share_cache 467 * Pointer to a global shared MR cache. 468 * @param[out] entry 469 * Pointer to returning MR cache entry. If no match, this will not be updated. 470 * @param addr 471 * Search key. 472 * 473 * @return 474 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 475 */ 476 static uint32_t 477 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache, 478 struct mr_cache_entry *entry, uintptr_t addr) 479 { 480 uint16_t idx; 481 uint32_t lkey = UINT32_MAX; 482 struct mlx5_mr *mr; 483 484 /* 485 * If the global cache has overflowed since it failed to expand the 486 * B-tree table, it can't have all the existing MRs. Then, the address 487 * has to be searched by traversing the original MR list instead, which 488 * is very slow path. Otherwise, the global cache is all inclusive. 489 */ 490 if (!unlikely(share_cache->cache.overflow)) { 491 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 492 if (lkey != UINT32_MAX) 493 *entry = (*share_cache->cache.table)[idx]; 494 } else { 495 /* Falling back to the slowest path. */ 496 mr = mlx5_mr_lookup_list(share_cache, entry, addr); 497 if (mr != NULL) 498 lkey = entry->lkey; 499 } 500 MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start && 501 addr < entry->end)); 502 return lkey; 503 } 504 505 /** 506 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() 507 * can raise memory free event and the callback function will spin on the lock. 508 * 509 * @param mr 510 * Pointer to MR to free. 
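 * @param dereg_mr_cb
 *   Callback used to deregister the underlying pmd_mr of @p mr before the
 *   MR structure and its memseg bitmap are released, typically invoked as:
 * @code
 *   mlx5_mr_free(mr, share_cache->dereg_mr_cb);
 * @endcode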
511 */ 512 void 513 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb) 514 { 515 if (mr == NULL) 516 return; 517 DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); 518 dereg_mr_cb(&mr->pmd_mr); 519 if (mr->ms_bmp != NULL) 520 rte_bitmap_free(mr->ms_bmp); 521 mlx5_free(mr); 522 } 523 524 void 525 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache) 526 { 527 struct mlx5_mr *mr; 528 529 DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache); 530 /* Flush cache to rebuild. */ 531 share_cache->cache.len = 1; 532 share_cache->cache.overflow = 0; 533 /* Iterate all the existing MRs. */ 534 LIST_FOREACH(mr, &share_cache->mr_list, mr) 535 if (mlx5_mr_insert_cache(share_cache, mr) < 0) 536 return; 537 } 538 539 /** 540 * Release resources of detached MR having no online entry. 541 * 542 * @param share_cache 543 * Pointer to a global shared MR cache. 544 */ 545 static void 546 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache) 547 { 548 struct mlx5_mr *mr_next; 549 struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); 550 551 /* Must be called from the primary process. */ 552 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 553 /* 554 * MR can't be freed with holding the lock because rte_free() could call 555 * memory free callback function. This will be a deadlock situation. 556 */ 557 rte_rwlock_write_lock(&share_cache->rwlock); 558 /* Detach the whole free list and release it after unlocking. */ 559 free_list = share_cache->mr_free_list; 560 LIST_INIT(&share_cache->mr_free_list); 561 rte_rwlock_write_unlock(&share_cache->rwlock); 562 /* Release resources. */ 563 mr_next = LIST_FIRST(&free_list); 564 while (mr_next != NULL) { 565 struct mlx5_mr *mr = mr_next; 566 567 mr_next = LIST_NEXT(mr, mr); 568 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 569 } 570 } 571 572 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ 573 static int 574 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, 575 const struct rte_memseg *ms, size_t len, void *arg) 576 { 577 struct mr_find_contig_memsegs_data *data = arg; 578 579 if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) 580 return 0; 581 /* Found, save it and stop walking. */ 582 data->start = ms->addr_64; 583 data->end = ms->addr_64 + len; 584 data->msl = msl; 585 return 1; 586 } 587 588 /** 589 * Create a new global Memory Region (MR) for a missing virtual address. 590 * This API should be called on a secondary process, then a request is sent to 591 * the primary process in order to create a MR for the address. As the global MR 592 * list is on the shared memory, following LKey lookup should succeed unless the 593 * request fails. 594 * 595 * @param cdev 596 * Pointer to the mlx5 common device. 597 * @param share_cache 598 * Pointer to a global shared MR cache. 599 * @param[out] entry 600 * Pointer to returning MR cache entry, found in the global cache or newly 601 * created. If failed to create one, this will not be updated. 602 * @param addr 603 * Target virtual address to register. 604 * 605 * @return 606 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 
607 */ 608 static uint32_t 609 mlx5_mr_create_secondary(struct mlx5_common_device *cdev, 610 struct mlx5_mr_share_cache *share_cache, 611 struct mr_cache_entry *entry, uintptr_t addr) 612 { 613 int ret; 614 615 DRV_LOG(DEBUG, "Requesting MR creation for address (%p)", (void *)addr); 616 ret = mlx5_mp_req_mr_create(cdev, addr); 617 if (ret) { 618 DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)", 619 (void *)addr); 620 return UINT32_MAX; 621 } 622 rte_rwlock_read_lock(&share_cache->rwlock); 623 /* Fill in output data. */ 624 mlx5_mr_lookup_cache(share_cache, entry, addr); 625 /* Lookup can't fail. */ 626 MLX5_ASSERT(entry->lkey != UINT32_MAX); 627 rte_rwlock_read_unlock(&share_cache->rwlock); 628 DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n" 629 " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", 630 (void *)addr, entry->start, entry->end, entry->lkey); 631 return entry->lkey; 632 } 633 634 /** 635 * Create a new global Memory Region (MR) for a missing virtual address. 636 * Register entire virtually contiguous memory chunk around the address. 637 * 638 * @param pd 639 * Pointer to pd of a device (net, regex, vdpa,...). 640 * @param share_cache 641 * Pointer to a global shared MR cache. 642 * @param[out] entry 643 * Pointer to returning MR cache entry, found in the global cache or newly 644 * created. If failed to create one, this will not be updated. 645 * @param addr 646 * Target virtual address to register. 647 * @param mr_ext_memseg_en 648 * Configurable flag about external memory segment enable or not. 649 * 650 * @return 651 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 652 */ 653 static uint32_t 654 mlx5_mr_create_primary(void *pd, 655 struct mlx5_mr_share_cache *share_cache, 656 struct mr_cache_entry *entry, uintptr_t addr, 657 unsigned int mr_ext_memseg_en) 658 { 659 struct mr_find_contig_memsegs_data data = {.addr = addr, }; 660 struct mr_find_contig_memsegs_data data_re; 661 const struct rte_memseg_list *msl; 662 const struct rte_memseg *ms; 663 struct mlx5_mr *mr = NULL; 664 int ms_idx_shift = -1; 665 uint32_t bmp_size; 666 void *bmp_mem; 667 uint32_t ms_n; 668 uint32_t n; 669 size_t len; 670 671 DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr); 672 /* 673 * Release detached MRs if any. This can't be called with holding either 674 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have 675 * been detached by the memory free event but it couldn't be released 676 * inside the callback due to deadlock. As a result, releasing resources 677 * is quite opportunistic. 678 */ 679 mlx5_mr_garbage_collect(share_cache); 680 /* 681 * If enabled, find out a contiguous virtual address chunk in use, to 682 * which the given address belongs, in order to register maximum range. 683 * In the best case where mempools are not dynamically recreated and 684 * '--socket-mem' is specified as an EAL option, it is very likely to 685 * have only one MR(LKey) per a socket and per a hugepage-size even 686 * though the system memory is highly fragmented. As the whole memory 687 * chunk will be pinned by kernel, it can't be reused unless entire 688 * chunk is freed from EAL. 689 * 690 * If disabled, just register one memseg (page). Then, memory 691 * consumption will be minimized but it may drop performance if there 692 * are many MRs to lookup on the datapath. 
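 *
 * For example, a 1 GiB virtually contiguous region backed by 2 MiB
 * hugepages becomes a single MR (one LKey) when enabled, but may end up
 * split into as many as 512 per-page MRs created on demand when disabled.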
693 */ 694 if (!mr_ext_memseg_en) { 695 data.msl = rte_mem_virt2memseg_list((void *)addr); 696 data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); 697 data.end = data.start + data.msl->page_sz; 698 } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { 699 DRV_LOG(WARNING, 700 "Unable to find virtually contiguous" 701 " chunk for address (%p)." 702 " rte_memseg_contig_walk() failed.", (void *)addr); 703 rte_errno = ENXIO; 704 goto err_nolock; 705 } 706 alloc_resources: 707 /* Addresses must be page-aligned. */ 708 MLX5_ASSERT(data.msl); 709 MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz)); 710 MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz)); 711 msl = data.msl; 712 ms = rte_mem_virt2memseg((void *)data.start, msl); 713 len = data.end - data.start; 714 MLX5_ASSERT(ms); 715 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 716 /* Number of memsegs in the range. */ 717 ms_n = len / msl->page_sz; 718 DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 719 " page_sz=0x%" PRIx64 ", ms_n=%u", 720 (void *)addr, data.start, data.end, msl->page_sz, ms_n); 721 /* Size of memory for bitmap. */ 722 bmp_size = rte_bitmap_get_memory_footprint(ms_n); 723 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 724 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) + 725 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id); 726 if (mr == NULL) { 727 DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of" 728 " address (%p).", (void *)addr); 729 rte_errno = ENOMEM; 730 goto err_nolock; 731 } 732 mr->msl = msl; 733 /* 734 * Save the index of the first memseg and initialize memseg bitmap. To 735 * see if a memseg of ms_idx in the memseg-list is still valid, check: 736 * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) 737 */ 738 mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 739 bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); 740 mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); 741 if (mr->ms_bmp == NULL) { 742 DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of" 743 " address (%p).", (void *)addr); 744 rte_errno = EINVAL; 745 goto err_nolock; 746 } 747 /* 748 * Should recheck whether the extended contiguous chunk is still valid. 749 * Because memory_hotplug_lock can't be held if there's any memory 750 * related calls in a critical path, resource allocation above can't be 751 * locked. If the memory has been changed at this point, try again with 752 * just single page. If not, go on with the big chunk atomically from 753 * here. 754 */ 755 rte_mcfg_mem_read_lock(); 756 data_re = data; 757 if (len > msl->page_sz && 758 !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { 759 DRV_LOG(DEBUG, 760 "Unable to find virtually contiguous chunk for address " 761 "(%p). rte_memseg_contig_walk() failed.", (void *)addr); 762 rte_errno = ENXIO; 763 goto err_memlock; 764 } 765 if (data.start != data_re.start || data.end != data_re.end) { 766 /* 767 * The extended contiguous chunk has been changed. Try again 768 * with single memseg instead. 769 */ 770 data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); 771 data.end = data.start + msl->page_sz; 772 rte_mcfg_mem_read_unlock(); 773 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 774 goto alloc_resources; 775 } 776 MLX5_ASSERT(data.msl == data_re.msl); 777 rte_rwlock_write_lock(&share_cache->rwlock); 778 /* 779 * Check the address is really missing. If other thread already created 780 * one or it is not found due to overflow, abort and return. 
781 */ 782 if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) { 783 /* 784 * Insert to the global cache table. It may fail due to 785 * low-on-memory. Then, this entry will have to be searched 786 * here again. 787 */ 788 mr_btree_insert(&share_cache->cache, entry); 789 DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort", 790 (void *)addr); 791 rte_rwlock_write_unlock(&share_cache->rwlock); 792 rte_mcfg_mem_read_unlock(); 793 /* 794 * Must be unlocked before calling rte_free() because 795 * mlx5_mr_mem_event_free_cb() can be called inside. 796 */ 797 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 798 return entry->lkey; 799 } 800 /* 801 * Trim start and end addresses for verbs MR. Set bits for registering 802 * memsegs but exclude already registered ones. Bitmap can be 803 * fragmented. 804 */ 805 for (n = 0; n < ms_n; ++n) { 806 uintptr_t start; 807 struct mr_cache_entry ret; 808 809 memset(&ret, 0, sizeof(ret)); 810 start = data_re.start + n * msl->page_sz; 811 /* Exclude memsegs already registered by other MRs. */ 812 if (mlx5_mr_lookup_cache(share_cache, &ret, start) == 813 UINT32_MAX) { 814 /* 815 * Start from the first unregistered memseg in the 816 * extended range. 817 */ 818 if (ms_idx_shift == -1) { 819 mr->ms_base_idx += n; 820 data.start = start; 821 ms_idx_shift = n; 822 } 823 data.end = start + msl->page_sz; 824 rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); 825 ++mr->ms_n; 826 } 827 } 828 len = data.end - data.start; 829 mr->ms_bmp_n = len / msl->page_sz; 830 MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n); 831 /* 832 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can 833 * be called with holding the memory lock because it doesn't use 834 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() 835 * through mlx5_alloc_verbs_buf(). 836 */ 837 share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr); 838 if (mr->pmd_mr.obj == NULL) { 839 DRV_LOG(DEBUG, "Fail to create an MR for address (%p)", 840 (void *)addr); 841 rte_errno = EINVAL; 842 goto err_mrlock; 843 } 844 MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start); 845 MLX5_ASSERT(mr->pmd_mr.len); 846 LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr); 847 DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n" 848 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 849 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 850 (void *)mr, (void *)addr, data.start, data.end, 851 rte_cpu_to_be_32(mr->pmd_mr.lkey), 852 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 853 /* Insert to the global cache table. */ 854 mlx5_mr_insert_cache(share_cache, mr); 855 /* Fill in output data. */ 856 mlx5_mr_lookup_cache(share_cache, entry, addr); 857 /* Lookup can't fail. */ 858 MLX5_ASSERT(entry->lkey != UINT32_MAX); 859 rte_rwlock_write_unlock(&share_cache->rwlock); 860 rte_mcfg_mem_read_unlock(); 861 return entry->lkey; 862 err_mrlock: 863 rte_rwlock_write_unlock(&share_cache->rwlock); 864 err_memlock: 865 rte_mcfg_mem_read_unlock(); 866 err_nolock: 867 /* 868 * In case of error, as this can be called in a datapath, a warning 869 * message per an error is preferable instead. Must be unlocked before 870 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called 871 * inside. 872 */ 873 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 874 return UINT32_MAX; 875 } 876 877 /** 878 * Create a new global Memory Region (MR) for a missing virtual address. 879 * This can be called from primary and secondary process. 880 * 881 * @param cdev 882 * Pointer to the mlx5 common device. 
883 * @param share_cache 884 * Pointer to a global shared MR cache. 885 * @param[out] entry 886 * Pointer to returning MR cache entry, found in the global cache or newly 887 * created. If failed to create one, this will not be updated. 888 * @param addr 889 * Target virtual address to register. 890 * 891 * @return 892 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 893 */ 894 uint32_t 895 mlx5_mr_create(struct mlx5_common_device *cdev, 896 struct mlx5_mr_share_cache *share_cache, 897 struct mr_cache_entry *entry, uintptr_t addr) 898 { 899 uint32_t ret = 0; 900 901 switch (rte_eal_process_type()) { 902 case RTE_PROC_PRIMARY: 903 ret = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr, 904 cdev->config.mr_ext_memseg_en); 905 break; 906 case RTE_PROC_SECONDARY: 907 ret = mlx5_mr_create_secondary(cdev, share_cache, entry, addr); 908 break; 909 default: 910 break; 911 } 912 return ret; 913 } 914 915 /** 916 * Look up address in the global MR cache table. If not found, create a new MR. 917 * Insert the found/created entry to local bottom-half cache table. 918 * 919 * @param mr_ctrl 920 * Pointer to per-queue MR control structure. 921 * @param[out] entry 922 * Pointer to returning MR cache entry, found in the global cache or newly 923 * created. If failed to create one, this is not written. 924 * @param addr 925 * Search key. 926 * 927 * @return 928 * Searched LKey on success, UINT32_MAX on no match. 929 */ 930 static uint32_t 931 mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl, 932 struct mr_cache_entry *entry, uintptr_t addr) 933 { 934 struct mlx5_mr_share_cache *share_cache = 935 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 936 dev_gen); 937 struct mlx5_common_device *cdev = 938 container_of(share_cache, struct mlx5_common_device, mr_scache); 939 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 940 uint32_t lkey; 941 uint16_t idx; 942 943 /* If local cache table is full, try to double it. */ 944 if (unlikely(bt->len == bt->size)) 945 mr_btree_expand(bt, bt->size << 1); 946 /* Look up in the global cache. */ 947 rte_rwlock_read_lock(&share_cache->rwlock); 948 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 949 if (lkey != UINT32_MAX) { 950 /* Found. */ 951 *entry = (*share_cache->cache.table)[idx]; 952 rte_rwlock_read_unlock(&share_cache->rwlock); 953 /* 954 * Update local cache. Even if it fails, return the found entry 955 * to update top-half cache. Next time, this entry will be found 956 * in the global cache. 957 */ 958 mr_btree_insert(bt, entry); 959 return lkey; 960 } 961 rte_rwlock_read_unlock(&share_cache->rwlock); 962 /* First time to see the address? Create a new MR. */ 963 lkey = mlx5_mr_create(cdev, share_cache, entry, addr); 964 /* 965 * Update the local cache if successfully created a new global MR. Even 966 * if failed to create one, there's no action to take in this datapath 967 * code. As returning LKey is invalid, this will eventually make HW 968 * fail. 969 */ 970 if (lkey != UINT32_MAX) 971 mr_btree_insert(bt, entry); 972 return lkey; 973 } 974 975 /** 976 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if 977 * misses, search in the global MR cache table and update the new entry to 978 * per-queue local caches. 979 * 980 * @param mr_ctrl 981 * Pointer to per-queue MR control structure. 982 * @param addr 983 * Search key. 984 * 985 * @return 986 * Searched LKey on success, UINT32_MAX on no match. 
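 *
 * The linear top-half search over mr_ctrl->cache[] lives in the per-driver
 * datapath helpers; a simplified sketch of the usual calling pattern (not
 * the exact helper, which also walks the remaining linear entries):
 * @code
 *   struct mr_cache_entry *mru = &mr_ctrl->cache[mr_ctrl->mru];
 *
 *   if (likely(addr >= mru->start && addr < mru->end))
 *       return mru->lkey;                          // Top-half hit.
 *   return mlx5_mr_addr2mr_bh(mr_ctrl, addr);      // Take the bottom half.
 * @endcode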
987 */ 988 static uint32_t 989 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr) 990 { 991 uint32_t lkey; 992 uint16_t bh_idx = 0; 993 /* Victim in top-half cache to replace with new entry. */ 994 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 995 996 /* Binary-search MR translation table. */ 997 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 998 /* Update top-half cache. */ 999 if (likely(lkey != UINT32_MAX)) { 1000 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1001 } else { 1002 /* 1003 * If missed in local lookup table, search in the global cache 1004 * and local cache_bh[] will be updated inside if possible. 1005 * Top-half cache entry will also be updated. 1006 */ 1007 lkey = mr_lookup_caches(mr_ctrl, repl, addr); 1008 if (unlikely(lkey == UINT32_MAX)) 1009 return UINT32_MAX; 1010 } 1011 /* Update the most recently used entry. */ 1012 mr_ctrl->mru = mr_ctrl->head; 1013 /* Point to the next victim, the oldest. */ 1014 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1015 return lkey; 1016 } 1017 1018 /** 1019 * Release all the created MRs and resources on global MR cache of a device 1020 * list. 1021 * 1022 * @param share_cache 1023 * Pointer to a global shared MR cache. 1024 */ 1025 void 1026 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache) 1027 { 1028 struct mlx5_mr *mr_next; 1029 1030 rte_rwlock_write_lock(&share_cache->rwlock); 1031 /* Detach from MR list and move to free list. */ 1032 mr_next = LIST_FIRST(&share_cache->mr_list); 1033 while (mr_next != NULL) { 1034 struct mlx5_mr *mr = mr_next; 1035 1036 mr_next = LIST_NEXT(mr, mr); 1037 LIST_REMOVE(mr, mr); 1038 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1039 } 1040 LIST_INIT(&share_cache->mr_list); 1041 /* Free global cache. */ 1042 mlx5_mr_btree_free(&share_cache->cache); 1043 rte_rwlock_write_unlock(&share_cache->rwlock); 1044 /* Free all remaining MRs. */ 1045 mlx5_mr_garbage_collect(share_cache); 1046 } 1047 1048 /** 1049 * Initialize global MR cache of a device. 1050 * 1051 * @param share_cache 1052 * Pointer to a global shared MR cache. 1053 * @param socket 1054 * NUMA socket on which memory must be allocated. 1055 * 1056 * @return 1057 * 0 on success, a negative errno value otherwise and rte_errno is set. 1058 */ 1059 int 1060 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket) 1061 { 1062 /* Set the reg_mr and dereg_mr callback functions */ 1063 mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb, 1064 &share_cache->dereg_mr_cb); 1065 rte_rwlock_init(&share_cache->rwlock); 1066 rte_rwlock_init(&share_cache->mprwlock); 1067 share_cache->mp_cb_registered = 0; 1068 /* Initialize B-tree and allocate memory for global MR cache table. */ 1069 return mlx5_mr_btree_init(&share_cache->cache, 1070 MLX5_MR_BTREE_CACHE_N * 2, socket); 1071 } 1072 1073 /** 1074 * Flush all of the local cache entries. 1075 * 1076 * @param mr_ctrl 1077 * Pointer to per-queue MR local cache. 1078 */ 1079 void 1080 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) 1081 { 1082 /* Reset the most-recently-used index. */ 1083 mr_ctrl->mru = 0; 1084 /* Reset the linear search array. */ 1085 mr_ctrl->head = 0; 1086 memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); 1087 /* Reset the B-tree table. */ 1088 mr_ctrl->cache_bh.len = 1; 1089 mr_ctrl->cache_bh.overflow = 0; 1090 /* Update the generation number. 
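	 * The datapath helpers (in the driver headers) compare
	 * mr_ctrl->cur_gen with *mr_ctrl->dev_gen_ptr and call this flush
	 * when the two differ, so bumping dev_gen after a global cache
	 * rebuild (memory free, mempool unregistration) eventually
	 * invalidates every per-queue cache.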
	 */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}

/**
 * Create a memory region for external memory, that is, memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
		   mlx5_reg_mr_t reg_mr_cb)
{
	struct mlx5_mr *mr = NULL;

	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
			 RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(WARNING,
			"Fail to create MR for address (%p)",
			(void *)addr);
		mlx5_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark it as external memory. */
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}

/**
 * Callback for memory free event. Iterate over freed memsegs and check
 * whether each belongs to an existing MR. If found, clear the corresponding
 * bit in the MR bitmap; as a result, the MR becomes fragmented. If it becomes
 * empty, the MR will be freed later by mlx5_mr_garbage_collect(). Even if this
 * callback is called from a secondary process, the garbage collector will run
 * in the primary process, as the secondary process can't call mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has to
 * be propagated to dataplane threads to flush the local caches.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param ibdev_name
 *   Name of ibv device.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
void
mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
		     const char *ibdev_name, const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg.
*/ 1193 start = (uintptr_t)addr + i * msl->page_sz; 1194 mr = mlx5_mr_lookup_list(share_cache, &entry, start); 1195 if (mr == NULL) 1196 continue; 1197 MLX5_ASSERT(mr->msl); /* Can't be external memory. */ 1198 ms = rte_mem_virt2memseg((void *)start, msl); 1199 MLX5_ASSERT(ms != NULL); 1200 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 1201 ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 1202 pos = ms_idx - mr->ms_base_idx; 1203 MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); 1204 MLX5_ASSERT(pos < mr->ms_bmp_n); 1205 DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p", 1206 ibdev_name, (void *)mr, pos, (void *)start); 1207 rte_bitmap_clear(mr->ms_bmp, pos); 1208 if (--mr->ms_n == 0) { 1209 LIST_REMOVE(mr, mr); 1210 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1211 DRV_LOG(DEBUG, "device %s remove MR(%p) from list", 1212 ibdev_name, (void *)mr); 1213 } 1214 /* 1215 * MR is fragmented or will be freed. the global cache must be 1216 * rebuilt. 1217 */ 1218 rebuild = 1; 1219 } 1220 if (rebuild) { 1221 mlx5_mr_rebuild_cache(share_cache); 1222 /* 1223 * No explicit wmb is needed after updating dev_gen due to 1224 * store-release ordering in unlock that provides the 1225 * implicit barrier at the software visible level. 1226 */ 1227 ++share_cache->dev_gen; 1228 DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d", 1229 share_cache->dev_gen); 1230 } 1231 rte_rwlock_write_unlock(&share_cache->rwlock); 1232 } 1233 1234 /** 1235 * Dump all the created MRs and the global cache entries. 1236 * 1237 * @param share_cache 1238 * Pointer to a global shared MR cache. 1239 */ 1240 void 1241 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused) 1242 { 1243 #ifdef RTE_LIBRTE_MLX5_DEBUG 1244 struct mlx5_mr *mr; 1245 int mr_n = 0; 1246 int chunk_n = 0; 1247 1248 rte_rwlock_read_lock(&share_cache->rwlock); 1249 /* Iterate all the existing MRs. */ 1250 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 1251 unsigned int n; 1252 1253 DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", 1254 mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1255 mr->ms_n, mr->ms_bmp_n); 1256 if (mr->ms_n == 0) 1257 continue; 1258 for (n = 0; n < mr->ms_bmp_n; ) { 1259 struct mr_cache_entry ret = { 0, }; 1260 1261 n = mr_find_next_chunk(mr, &ret, n); 1262 if (!ret.end) 1263 break; 1264 DRV_LOG(DEBUG, 1265 " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", 1266 chunk_n++, ret.start, ret.end); 1267 } 1268 } 1269 DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache); 1270 mlx5_mr_btree_dump(&share_cache->cache); 1271 rte_rwlock_read_unlock(&share_cache->rwlock); 1272 #endif 1273 } 1274 1275 static int 1276 mlx5_range_compare_start(const void *lhs, const void *rhs) 1277 { 1278 const struct mlx5_range *r1 = lhs, *r2 = rhs; 1279 1280 if (r1->start > r2->start) 1281 return 1; 1282 else if (r1->start < r2->start) 1283 return -1; 1284 return 0; 1285 } 1286 1287 static void 1288 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque, 1289 struct rte_mempool_memhdr *memhdr, 1290 unsigned int idx) 1291 { 1292 struct mlx5_range *ranges = opaque, *range = &ranges[idx]; 1293 uint64_t page_size = rte_mem_page_size(); 1294 1295 RTE_SET_USED(mp); 1296 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); 1297 range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size); 1298 } 1299 1300 /** 1301 * Collect page-aligned memory ranges of the mempool. 
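 *
 * @param mp
 *   Mempool to analyze.
 * @param[out] out
 *   Receives one range per memory chunk of the mempool; the caller must
 *   release it with free().
 * @param[out] out_n
 *   Receives the number of @p out elements.
 *
 * @return
 *   0 on success, (-1) on allocation failure.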
1302 */ 1303 static int 1304 mlx5_mempool_get_chunks(struct rte_mempool *mp, struct mlx5_range **out, 1305 unsigned int *out_n) 1306 { 1307 unsigned int n; 1308 1309 DRV_LOG(DEBUG, "Collecting chunks of regular mempool %s", mp->name); 1310 n = mp->nb_mem_chunks; 1311 *out = calloc(sizeof(**out), n); 1312 if (*out == NULL) 1313 return -1; 1314 rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, *out); 1315 *out_n = n; 1316 return 0; 1317 } 1318 1319 struct mlx5_mempool_get_extmem_data { 1320 struct mlx5_range *heap; 1321 unsigned int heap_size; 1322 int ret; 1323 }; 1324 1325 static void 1326 mlx5_mempool_get_extmem_cb(struct rte_mempool *mp, void *opaque, 1327 void *obj, unsigned int obj_idx) 1328 { 1329 struct mlx5_mempool_get_extmem_data *data = opaque; 1330 struct rte_mbuf *mbuf = obj; 1331 uintptr_t addr = (uintptr_t)mbuf->buf_addr; 1332 struct mlx5_range *seg, *heap; 1333 struct rte_memseg_list *msl; 1334 size_t page_size; 1335 uintptr_t page_start; 1336 unsigned int pos = 0, len = data->heap_size, delta; 1337 1338 RTE_SET_USED(mp); 1339 RTE_SET_USED(obj_idx); 1340 if (data->ret < 0) 1341 return; 1342 /* Binary search for an already visited page. */ 1343 while (len > 1) { 1344 delta = len / 2; 1345 if (addr < data->heap[pos + delta].start) { 1346 len = delta; 1347 } else { 1348 pos += delta; 1349 len -= delta; 1350 } 1351 } 1352 if (data->heap != NULL) { 1353 seg = &data->heap[pos]; 1354 if (seg->start <= addr && addr < seg->end) 1355 return; 1356 } 1357 /* Determine the page boundaries and remember them. */ 1358 heap = realloc(data->heap, sizeof(heap[0]) * (data->heap_size + 1)); 1359 if (heap == NULL) { 1360 free(data->heap); 1361 data->heap = NULL; 1362 data->ret = -1; 1363 return; 1364 } 1365 data->heap = heap; 1366 data->heap_size++; 1367 seg = &heap[data->heap_size - 1]; 1368 msl = rte_mem_virt2memseg_list((void *)addr); 1369 page_size = msl != NULL ? msl->page_sz : rte_mem_page_size(); 1370 page_start = RTE_PTR_ALIGN_FLOOR(addr, page_size); 1371 seg->start = page_start; 1372 seg->end = page_start + page_size; 1373 /* Maintain the heap order. */ 1374 qsort(data->heap, data->heap_size, sizeof(heap[0]), 1375 mlx5_range_compare_start); 1376 } 1377 1378 /** 1379 * Recover pages of external memory as close as possible 1380 * for a mempool with RTE_PKTMBUF_POOL_PINNED_EXT_BUF. 1381 * Pages are stored in a heap for efficient search, for mbufs are many. 1382 */ 1383 static int 1384 mlx5_mempool_get_extmem(struct rte_mempool *mp, struct mlx5_range **out, 1385 unsigned int *out_n) 1386 { 1387 struct mlx5_mempool_get_extmem_data data; 1388 1389 DRV_LOG(DEBUG, "Recovering external pinned pages of mempool %s", 1390 mp->name); 1391 memset(&data, 0, sizeof(data)); 1392 rte_mempool_obj_iter(mp, mlx5_mempool_get_extmem_cb, &data); 1393 *out = data.heap; 1394 *out_n = data.heap_size; 1395 return data.ret; 1396 } 1397 1398 /** 1399 * Get VA-contiguous ranges of the mempool memory. 1400 * Each range start and end is aligned to the system page size. 1401 * 1402 * @param[in] mp 1403 * Analyzed mempool. 1404 * @param[in] is_extmem 1405 * Whether the pool is contains only external pinned buffers. 1406 * @param[out] out 1407 * Receives the ranges, caller must release it with free(). 1408 * @param[out] out_n 1409 * Receives the number of @p out elements. 1410 * 1411 * @return 1412 * 0 on success, (-1) on failure. 
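 *
 * For example, page-aligned chunks [A, B) and [B, C) are merged into the
 * single range [A, C); chunks separated by a gap remain distinct ranges.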
1413 */ 1414 static int 1415 mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem, 1416 struct mlx5_range **out, unsigned int *out_n) 1417 { 1418 struct mlx5_range *chunks; 1419 unsigned int chunks_n, contig_n, i; 1420 int ret; 1421 1422 /* Collect the pool underlying memory. */ 1423 ret = is_extmem ? mlx5_mempool_get_extmem(mp, &chunks, &chunks_n) : 1424 mlx5_mempool_get_chunks(mp, &chunks, &chunks_n); 1425 if (ret < 0) 1426 return ret; 1427 /* Merge adjacent chunks and place them at the beginning. */ 1428 qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start); 1429 contig_n = 1; 1430 for (i = 1; i < chunks_n; i++) 1431 if (chunks[i - 1].end != chunks[i].start) { 1432 chunks[contig_n - 1].end = chunks[i - 1].end; 1433 chunks[contig_n] = chunks[i]; 1434 contig_n++; 1435 } 1436 /* Extend the last contiguous chunk to the end of the mempool. */ 1437 chunks[contig_n - 1].end = chunks[i - 1].end; 1438 *out = chunks; 1439 *out_n = contig_n; 1440 return 0; 1441 } 1442 1443 /** 1444 * Analyze mempool memory to select memory ranges to register. 1445 * 1446 * @param[in] mp 1447 * Mempool to analyze. 1448 * @param[in] is_extmem 1449 * Whether the pool is contains only external pinned buffers. 1450 * @param[out] out 1451 * Receives memory ranges to register, aligned to the system page size. 1452 * The caller must release them with free(). 1453 * @param[out] out_n 1454 * Receives the number of @p out items. 1455 * @param[out] share_hugepage 1456 * Receives True if the entire pool resides within a single hugepage. 1457 * 1458 * @return 1459 * 0 on success, (-1) on failure. 1460 */ 1461 static int 1462 mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem, 1463 struct mlx5_range **out, unsigned int *out_n, 1464 bool *share_hugepage) 1465 { 1466 struct mlx5_range *ranges = NULL; 1467 unsigned int i, ranges_n = 0; 1468 struct rte_memseg_list *msl; 1469 1470 if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) { 1471 DRV_LOG(ERR, "Cannot get address ranges for mempool %s", 1472 mp->name); 1473 return -1; 1474 } 1475 /* Check if the hugepage of the pool can be shared. */ 1476 *share_hugepage = false; 1477 msl = rte_mem_virt2memseg_list((void *)ranges[0].start); 1478 if (msl != NULL) { 1479 uint64_t hugepage_sz = 0; 1480 1481 /* Check that all ranges are on pages of the same size. */ 1482 for (i = 0; i < ranges_n; i++) { 1483 if (hugepage_sz != 0 && hugepage_sz != msl->page_sz) 1484 break; 1485 hugepage_sz = msl->page_sz; 1486 } 1487 if (i == ranges_n) { 1488 /* 1489 * If the entire pool is within one hugepage, 1490 * combine all ranges into one of the hugepage size. 1491 */ 1492 uintptr_t reg_start = ranges[0].start; 1493 uintptr_t reg_end = ranges[ranges_n - 1].end; 1494 uintptr_t hugepage_start = 1495 RTE_ALIGN_FLOOR(reg_start, hugepage_sz); 1496 uintptr_t hugepage_end = hugepage_start + hugepage_sz; 1497 if (reg_end < hugepage_end) { 1498 ranges[0].start = hugepage_start; 1499 ranges[0].end = hugepage_end; 1500 ranges_n = 1; 1501 *share_hugepage = true; 1502 } 1503 } 1504 } 1505 *out = ranges; 1506 *out_n = ranges_n; 1507 return 0; 1508 } 1509 1510 /** Create a registration object for the mempool. 
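 *
 * @param mp
 *   Mempool being registered.
 * @param mrs_n
 *   Number of MR slots to allocate for the address ranges of the mempool.
 * @param is_extmem
 *   True if the registration is for external pinned memory.
 *
 * @return
 *   Pointer to the registration object on success, NULL on allocation
 *   failure.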
*/ 1511 static struct mlx5_mempool_reg * 1512 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n, 1513 bool is_extmem) 1514 { 1515 struct mlx5_mempool_reg *mpr = NULL; 1516 1517 mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1518 sizeof(struct mlx5_mempool_reg), 1519 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1520 if (mpr == NULL) { 1521 DRV_LOG(ERR, "Cannot allocate mempool %s registration object", 1522 mp->name); 1523 return NULL; 1524 } 1525 mpr->mrs = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1526 mrs_n * sizeof(struct mlx5_mempool_mr), 1527 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1528 if (!mpr->mrs) { 1529 DRV_LOG(ERR, "Cannot allocate mempool %s registration MRs", 1530 mp->name); 1531 mlx5_free(mpr); 1532 return NULL; 1533 } 1534 mpr->mp = mp; 1535 mpr->mrs_n = mrs_n; 1536 mpr->is_extmem = is_extmem; 1537 return mpr; 1538 } 1539 1540 /** 1541 * Destroy a mempool registration object. 1542 * 1543 * @param standalone 1544 * Whether @p mpr owns its MRs exclusively, i.e. they are not shared. 1545 */ 1546 static void 1547 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache, 1548 struct mlx5_mempool_reg *mpr, bool standalone) 1549 { 1550 if (standalone) { 1551 unsigned int i; 1552 1553 for (i = 0; i < mpr->mrs_n; i++) 1554 share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr); 1555 mlx5_free(mpr->mrs); 1556 } 1557 mlx5_free(mpr); 1558 } 1559 1560 /** Find registration object of a mempool. */ 1561 static struct mlx5_mempool_reg * 1562 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache, 1563 struct rte_mempool *mp) 1564 { 1565 struct mlx5_mempool_reg *mpr; 1566 1567 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1568 if (mpr->mp == mp) 1569 break; 1570 return mpr; 1571 } 1572 1573 /** Increment reference counters of MRs used in the registration. */ 1574 static void 1575 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr) 1576 { 1577 unsigned int i; 1578 1579 for (i = 0; i < mpr->mrs_n; i++) 1580 __atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED); 1581 } 1582 1583 /** 1584 * Decrement reference counters of MRs used in the registration. 1585 * 1586 * @return True if no more references to @p mpr MRs exist, False otherwise. 1587 */ 1588 static bool 1589 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr) 1590 { 1591 unsigned int i; 1592 bool ret = false; 1593 1594 for (i = 0; i < mpr->mrs_n; i++) 1595 ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1, 1596 __ATOMIC_RELAXED) == 0; 1597 return ret; 1598 } 1599 1600 static int 1601 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache, 1602 void *pd, struct rte_mempool *mp, 1603 bool is_extmem) 1604 { 1605 struct mlx5_range *ranges = NULL; 1606 struct mlx5_mempool_reg *mpr, *old_mpr, *new_mpr; 1607 unsigned int i, ranges_n; 1608 bool share_hugepage, standalone = false; 1609 int ret = -1; 1610 1611 /* Early check to avoid unnecessary creation of MRs. 
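	 * A mempool that is already registered is reused as-is; the code
	 * below re-registers only when a generic registration must be
	 * upgraded to an external-memory (is_extmem) one.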
*/ 1612 rte_rwlock_read_lock(&share_cache->rwlock); 1613 old_mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1614 rte_rwlock_read_unlock(&share_cache->rwlock); 1615 if (old_mpr != NULL && (!is_extmem || old_mpr->is_extmem)) { 1616 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1617 mp->name, pd); 1618 rte_errno = EEXIST; 1619 goto exit; 1620 } 1621 if (mlx5_mempool_reg_analyze(mp, is_extmem, &ranges, &ranges_n, 1622 &share_hugepage) < 0) { 1623 DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name); 1624 rte_errno = ENOMEM; 1625 goto exit; 1626 } 1627 new_mpr = mlx5_mempool_reg_create(mp, ranges_n, is_extmem); 1628 if (new_mpr == NULL) { 1629 DRV_LOG(ERR, 1630 "Cannot create a registration object for mempool %s in PD %p", 1631 mp->name, pd); 1632 rte_errno = ENOMEM; 1633 goto exit; 1634 } 1635 /* 1636 * If the entire mempool fits in a single hugepage, the MR for this 1637 * hugepage can be shared across mempools that also fit in it. 1638 */ 1639 if (share_hugepage) { 1640 rte_rwlock_write_lock(&share_cache->rwlock); 1641 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) { 1642 if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start) 1643 break; 1644 } 1645 if (mpr != NULL) { 1646 new_mpr->mrs = mpr->mrs; 1647 mlx5_mempool_reg_attach(new_mpr); 1648 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, 1649 new_mpr, next); 1650 } 1651 rte_rwlock_write_unlock(&share_cache->rwlock); 1652 if (mpr != NULL) { 1653 DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s", 1654 mpr->mrs[0].pmd_mr.lkey, pd, mp->name, 1655 mpr->mp->name); 1656 ret = 0; 1657 goto exit; 1658 } 1659 } 1660 for (i = 0; i < ranges_n; i++) { 1661 struct mlx5_mempool_mr *mr = &new_mpr->mrs[i]; 1662 const struct mlx5_range *range = &ranges[i]; 1663 size_t len = range->end - range->start; 1664 1665 if (share_cache->reg_mr_cb(pd, (void *)range->start, len, 1666 &mr->pmd_mr) < 0) { 1667 DRV_LOG(ERR, 1668 "Failed to create an MR in PD %p for address range " 1669 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1670 pd, range->start, range->end, len, mp->name); 1671 break; 1672 } 1673 DRV_LOG(DEBUG, 1674 "Created a new MR %#x in PD %p for address range " 1675 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1676 mr->pmd_mr.lkey, pd, range->start, range->end, len, 1677 mp->name); 1678 } 1679 if (i != ranges_n) { 1680 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1681 rte_errno = EINVAL; 1682 goto exit; 1683 } 1684 /* Concurrent registration is not supposed to happen. */ 1685 rte_rwlock_write_lock(&share_cache->rwlock); 1686 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1687 if (mpr == old_mpr && old_mpr != NULL) { 1688 LIST_REMOVE(old_mpr, next); 1689 standalone = mlx5_mempool_reg_detach(mpr); 1690 /* No need to flush the cache: old MRs cannot be in use. 
*/ 1691 mpr = NULL; 1692 } 1693 if (mpr == NULL) { 1694 mlx5_mempool_reg_attach(new_mpr); 1695 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, new_mpr, next); 1696 ret = 0; 1697 } 1698 rte_rwlock_write_unlock(&share_cache->rwlock); 1699 if (mpr != NULL) { 1700 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1701 mp->name, pd); 1702 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1703 rte_errno = EEXIST; 1704 goto exit; 1705 } else if (old_mpr != NULL) { 1706 DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory", 1707 mp->name, pd); 1708 mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone); 1709 } 1710 exit: 1711 free(ranges); 1712 return ret; 1713 } 1714 1715 static int 1716 mlx5_mr_mempool_register_secondary(struct mlx5_common_device *cdev, 1717 struct rte_mempool *mp, bool is_extmem) 1718 { 1719 return mlx5_mp_req_mempool_reg(cdev, mp, true, is_extmem); 1720 } 1721 1722 /** 1723 * Register the memory of a mempool in the protection domain. 1724 * 1725 * @param cdev 1726 * Pointer to the mlx5 common device. 1727 * @param mp 1728 * Mempool to register. 1729 * 1730 * @return 1731 * 0 on success, (-1) on failure and rte_errno is set. 1732 */ 1733 int 1734 mlx5_mr_mempool_register(struct mlx5_common_device *cdev, 1735 struct rte_mempool *mp, bool is_extmem) 1736 { 1737 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1738 return 0; 1739 switch (rte_eal_process_type()) { 1740 case RTE_PROC_PRIMARY: 1741 return mlx5_mr_mempool_register_primary(&cdev->mr_scache, 1742 cdev->pd, mp, 1743 is_extmem); 1744 case RTE_PROC_SECONDARY: 1745 return mlx5_mr_mempool_register_secondary(cdev, mp, is_extmem); 1746 default: 1747 return -1; 1748 } 1749 } 1750 1751 static int 1752 mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache, 1753 struct rte_mempool *mp) 1754 { 1755 struct mlx5_mempool_reg *mpr; 1756 bool standalone = false; 1757 1758 rte_rwlock_write_lock(&share_cache->rwlock); 1759 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1760 if (mpr->mp == mp) { 1761 LIST_REMOVE(mpr, next); 1762 standalone = mlx5_mempool_reg_detach(mpr); 1763 if (standalone) 1764 /* 1765 * The unlock operation below provides a memory 1766 * barrier due to its store-release semantics. 1767 */ 1768 ++share_cache->dev_gen; 1769 break; 1770 } 1771 rte_rwlock_write_unlock(&share_cache->rwlock); 1772 if (mpr == NULL) { 1773 rte_errno = ENOENT; 1774 return -1; 1775 } 1776 mlx5_mempool_reg_destroy(share_cache, mpr, standalone); 1777 return 0; 1778 } 1779 1780 static int 1781 mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev, 1782 struct rte_mempool *mp) 1783 { 1784 return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */); 1785 } 1786 1787 /** 1788 * Unregister the memory of a mempool from the protection domain. 1789 * 1790 * @param cdev 1791 * Pointer to the mlx5 common device. 1792 * @param mp 1793 * Mempool to unregister. 1794 * 1795 * @return 1796 * 0 on success, (-1) on failure and rte_errno is set. 1797 */ 1798 int 1799 mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev, 1800 struct rte_mempool *mp) 1801 { 1802 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1803 return 0; 1804 switch (rte_eal_process_type()) { 1805 case RTE_PROC_PRIMARY: 1806 return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp); 1807 case RTE_PROC_SECONDARY: 1808 return mlx5_mr_mempool_unregister_secondary(cdev, mp); 1809 default: 1810 return -1; 1811 } 1812 } 1813 1814 /** 1815 * Lookup a MR key by and address in a registered mempool. 
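 * A registration may consist of several MRs, one per registered address
 * range; they are scanned linearly, as the number of ranges per mempool is
 * expected to be small.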

/**
 * Look up an MR key by address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_start = (uintptr_t)mr->addr;
		uintptr_t mr_end = mr_start + mr->len;

		if (mr_start <= addr && addr < mr_end) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_start;
			entry->end = mr_end;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update bottom-half cache from the list of mempool registrations.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to look up.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update local cache. Even if it fails, return the found entry
	 * to update top-half cache. Next time, this entry will be found
	 * in the local cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Populate cache with LKeys of all MRs used by the mempool.
 * It is intended to be used to register Rx mempools in advance.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Registered memory pool.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_populate_cache(struct mlx5_mr_ctrl *mr_ctrl,
			       struct rte_mempool *mp)
{
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	unsigned int i;

	/*
	 * Registration is valid after the lock is released,
	 * because the function is called after the mempool is registered.
	 */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		DRV_LOG(ERR, "Mempool %s is not registered", mp->name);
		rte_errno = ENOENT;
		return -1;
	}
	for (i = 0; i < mpr->mrs_n; i++) {
		struct mlx5_mempool_mr *mr = &mpr->mrs[i];
		struct mr_cache_entry entry;
		uint32_t lkey;
		uint16_t idx;

		lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr);
		if (lkey != UINT32_MAX)
			continue;
		if (bt->len == bt->size)
			mr_btree_expand(bt, bt->size << 1);
		entry.start = (uintptr_t)mr->pmd_mr.addr;
		entry.end = entry.start + mr->pmd_mr.len;
		entry.lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		if (mr_btree_insert(bt, &entry) < 0) {
			DRV_LOG(ERR, "Cannot insert cache entry for mempool %s MR %08x",
				mp->name, entry.lkey);
			rte_errno = EINVAL;
			return -1;
		}
	}
	return 0;
}

/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to look up.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint16_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}
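
/*
 * Illustrative datapath sketch (the queue layout is a placeholder): a queue
 * that knows which mempool its buffers come from resolves an address through
 * the bottom half only when its linear top-half cache misses.
 *
 *	uintptr_t addr = (uintptr_t)rte_pktmbuf_mtod(mb, void *);
 *	uint32_t lkey = mlx5_mr_mempool2mr_bh(&rxq->mr_ctrl, mb->pool, addr);
 *
 *	if (unlikely(lkey == UINT32_MAX))
 *		-- the address does not belong to the mempool
 */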

/**
 * Bottom-half lookup of the LKey for the buffer of an mbuf.
 * Handles MPRQ buffers and mempools with pinned external memory.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mb
 *   Mbuf whose buffer address is looked up.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb)
{
	struct rte_mempool *mp;
	struct mlx5_mprq_buf *buf;
	uint32_t lkey;
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_common_device *cdev =
		container_of(share_cache, struct mlx5_common_device, mr_scache);
	bool external, mprq, pinned = false;

	/* Recover MPRQ mempool. */
	external = RTE_MBUF_HAS_EXTBUF(mb);
	if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) {
		mprq = true;
		buf = mb->shinfo->fcb_opaque;
		mp = buf->mp;
	} else {
		mprq = false;
		mp = mlx5_mb2mp(mb);
		pinned = rte_pktmbuf_priv_flags(mp) &
			 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
	}
	if (!external || mprq || pinned) {
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		if (lkey != UINT32_MAX)
			return lkey;
		/* MPRQ is always registered. */
		MLX5_ASSERT(!mprq);
	}
	/* Register pinned external memory if the mempool is not used for Rx. */
	if (cdev->config.mr_mempool_reg_en && pinned) {
		if (mlx5_mr_mempool_register(cdev, mp, true) < 0)
			return UINT32_MAX;
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		MLX5_ASSERT(lkey != UINT32_MAX);
		return lkey;
	}
	/* Fallback to generic mechanism in corner cases. */
	return mlx5_mr_addr2mr_bh(mr_ctrl, addr);
}
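
/*
 * Illustrative Tx-side sketch (the queue layout and descriptor field are
 * placeholders): when the top-half cache misses for an arbitrary mbuf, the
 * Tx burst falls back to this helper, which also covers MPRQ buffers and
 * pinned external memory.
 *
 *	uint32_t lkey = mlx5_mr_mb2mr_bh(&txq->mr_ctrl, mb);
 *
 *	if (unlikely(lkey == UINT32_MAX))
 *		break;		-- the address cannot be mapped, stop the burst
 *	dseg->lkey = lkey;	-- fill the data segment of the WQE being built
 */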