/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <stddef.h>

#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>

#include "mlx5_glue.h"
#include "mlx5_common.h"
#include "mlx5_common_mp.h"
#include "mlx5_common_mr.h"
#include "mlx5_common_os.h"
#include "mlx5_common_log.h"
#include "mlx5_malloc.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

/* Virtual memory range. */
struct mlx5_range {
	uintptr_t start;
	uintptr_t end;
};

/** Memory region for a mempool. */
struct mlx5_mempool_mr {
	struct mlx5_pmd_mr pmd_mr;
	uint32_t refcnt; /**< Number of mempools sharing this MR. */
};

/* Mempool registration. */
struct mlx5_mempool_reg {
	LIST_ENTRY(mlx5_mempool_reg) next;
	/** Registered mempool, used to designate registrations. */
	struct rte_mempool *mp;
	/** Memory regions for the address ranges of the mempool. */
	struct mlx5_mempool_mr *mrs;
	/** Number of memory regions. */
	unsigned int mrs_n;
	/** Whether the MRs were created for external pinned memory. */
	bool is_extmem;
};

void
mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
{
	struct mlx5_mprq_buf *buf = opaque;

	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
		rte_mempool_put(buf->mp, buf);
	} else if (unlikely(__atomic_fetch_sub(&buf->refcnt, 1,
					       __ATOMIC_RELAXED) - 1 == 0)) {
		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
		rte_mempool_put(buf->mp, buf);
	}
}

/**
 * Expand B-tree table to a given size. Can't be called while holding
 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc().
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries for expansion.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_expand(struct mlx5_mr_btree *bt, uint32_t n)
{
	void *mem;
	int ret = 0;

	if (n <= bt->size)
		return ret;
	/*
	 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
	 * used inside if there's no room to expand. Because this is quite a
	 * rare case and part of a very slow path, it is acceptable.
	 * Initially cache_bh[] will be given practically enough space and once
	 * it is expanded, expansion wouldn't be needed again ever.
	 */
	mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
			   n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
	if (mem == NULL) {
		/* Not an error, B-tree search will be skipped. */
		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
			(void *)bt);
		ret = -1;
	} else {
		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
		bt->table = mem;
		bt->size = n;
	}
	return ret;
}

/**
 * Look up LKey from given B-tree lookup table, store the last index and return
 * searched LKey.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param[out] idx
 *   Pointer to index. Even on search failure, returns index where it stops
 *   searching so that index can be used when inserting a new entry.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
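 *
 * A minimal usage sketch (illustrative only, assuming a per-queue "mr_ctrl"
 * is in scope); on a miss, *idx is the slot after which mr_btree_insert()
 * would place a new entry:
 *
 * @code
 * uint32_t idx = 0;
 * uint32_t lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &idx, (uintptr_t)addr);
 *
 * if (lkey == UINT32_MAX) {
 *	// Miss: a new entry covering "addr" would be inserted after "idx".
 * }
 * @endcode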
 */
static uint32_t
mr_btree_lookup(struct mlx5_mr_btree *bt, uint32_t *idx, uintptr_t addr)
{
	struct mr_cache_entry *lkp_tbl;
	uint32_t n;
	uint32_t base = 0;

	MLX5_ASSERT(bt != NULL);
	lkp_tbl = *bt->table;
	n = bt->len;
	/* First entry must be NULL for comparison. */
	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
				    lkp_tbl[0].lkey == UINT32_MAX));
	/* Binary search. */
	do {
		register uint32_t delta = n >> 1;

		if (addr < lkp_tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	MLX5_ASSERT(addr >= lkp_tbl[base].start);
	*idx = base;
	if (addr < lkp_tbl[base].end)
		return lkp_tbl[base].lkey;
	/* Not found. */
	return UINT32_MAX;
}

/**
 * Insert an entry to B-tree lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param entry
 *   Pointer to new entry to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
{
	struct mr_cache_entry *lkp_tbl;
	uint32_t idx = 0;
	size_t shift;

	MLX5_ASSERT(bt != NULL);
	MLX5_ASSERT(bt->len <= bt->size);
	MLX5_ASSERT(bt->len > 0);
	lkp_tbl = *bt->table;
	/* Find out the slot for insertion. */
	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
		DRV_LOG(DEBUG,
			"abort insertion to B-tree(%p): already exist at"
			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
		/* Already exists, return. */
		return 0;
	}
	/* Caller must ensure that there is enough room for a new entry. */
	MLX5_ASSERT(bt->len < bt->size);
	/* Insert entry. */
	++idx;
	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
	if (shift)
		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
	lkp_tbl[idx] = *entry;
	bt->len++;
	DRV_LOG(DEBUG,
		"inserted B-tree(%p)[%u],"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		(void *)bt, idx, entry->start, entry->end, entry->lkey);
	return 0;
}

/**
 * Initialize B-tree and allocate memory for lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries to allocate.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
{
	if (bt == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	MLX5_ASSERT(!bt->table && !bt->size);
	memset(bt, 0, sizeof(*bt));
	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
				sizeof(struct mr_cache_entry) * n,
				0, socket);
	if (bt->table == NULL) {
		rte_errno = ENOMEM;
		DRV_LOG(DEBUG,
			"failed to allocate memory for btree cache on socket "
			"%d", socket);
		return -rte_errno;
	}
	bt->size = n;
	/* First entry must be NULL for binary search. */
	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
		.lkey = UINT32_MAX,
	};
	DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	return 0;
}

/**
 * Free B-tree resources.
 *
 * @param bt
 *   Pointer to B-tree structure.
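 *
 * A minimal sketch of the expected init/free pairing (illustrative only;
 * "socket" is assumed to be a valid NUMA socket id):
 *
 * @code
 * struct mlx5_mr_btree bt;
 *
 * memset(&bt, 0, sizeof(bt));
 * if (mlx5_mr_btree_init(&bt, MLX5_MR_BTREE_CACHE_N, socket) == 0) {
 *	// ... mr_btree_lookup() / mr_btree_insert() on the table ...
 *	mlx5_mr_btree_free(&bt);
 * }
 * @endcode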
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	mlx5_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}

/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	uint32_t idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
			" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Initialize per-queue MR control descriptor.
 *
 * @param mr_ctrl
 *   Pointer to MR control structure.
 * @param dev_gen_ptr
 *   Pointer to generation number of global cache.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
		  int socket)
{
	if (mr_ctrl == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Save pointer of global generation number to check memory event. */
	mr_ctrl->dev_gen_ptr = dev_gen_ptr;
	/* Initialize B-tree and allocate memory for bottom-half cache table. */
	return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N,
				  socket);
}

/**
 * Find virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * pmd_mr as there's only one chunk.
		 */
		entry->start = (uintptr_t)mr->pmd_mr.addr;
		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
	}
	return idx;
}

/**
 * Insert a MR to the global B-tree cache. It may fail due to low-on-memory.
382 * Then, this entry will have to be searched by mr_lookup_list() in 383 * mlx5_mr_create() on miss. 384 * 385 * @param share_cache 386 * Pointer to a global shared MR cache. 387 * @param mr 388 * Pointer to MR to insert. 389 * 390 * @return 391 * 0 on success, -1 on failure. 392 */ 393 int 394 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache, 395 struct mlx5_mr *mr) 396 { 397 unsigned int n; 398 399 DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)", 400 (void *)mr, (void *)share_cache); 401 for (n = 0; n < mr->ms_bmp_n; ) { 402 struct mr_cache_entry entry; 403 404 memset(&entry, 0, sizeof(entry)); 405 /* Find a contiguous chunk and advance the index. */ 406 n = mr_find_next_chunk(mr, &entry, n); 407 if (!entry.end) 408 break; 409 if (mr_btree_insert(&share_cache->cache, &entry) < 0) 410 return -1; 411 } 412 return 0; 413 } 414 415 /** 416 * Look up address in the original global MR list. 417 * 418 * @param share_cache 419 * Pointer to a global shared MR cache. 420 * @param[out] entry 421 * Pointer to returning MR cache entry. If no match, this will not be updated. 422 * @param addr 423 * Search key. 424 * 425 * @return 426 * Found MR on match, NULL otherwise. 427 */ 428 struct mlx5_mr * 429 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache, 430 struct mr_cache_entry *entry, uintptr_t addr) 431 { 432 struct mlx5_mr *mr; 433 434 /* Iterate all the existing MRs. */ 435 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 436 unsigned int n; 437 438 if (mr->ms_n == 0) 439 continue; 440 for (n = 0; n < mr->ms_bmp_n; ) { 441 struct mr_cache_entry ret; 442 443 memset(&ret, 0, sizeof(ret)); 444 n = mr_find_next_chunk(mr, &ret, n); 445 if (addr >= ret.start && addr < ret.end) { 446 /* Found. */ 447 *entry = ret; 448 return mr; 449 } 450 } 451 } 452 return NULL; 453 } 454 455 /** 456 * Look up address on global MR cache. 457 * 458 * @param share_cache 459 * Pointer to a global shared MR cache. 460 * @param[out] entry 461 * Pointer to returning MR cache entry. If no match, this will not be updated. 462 * @param addr 463 * Search key. 464 * 465 * @return 466 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 467 */ 468 static uint32_t 469 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache, 470 struct mr_cache_entry *entry, uintptr_t addr) 471 { 472 uint32_t idx; 473 uint32_t lkey; 474 475 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 476 if (lkey != UINT32_MAX) 477 *entry = (*share_cache->cache.table)[idx]; 478 MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start && 479 addr < entry->end)); 480 return lkey; 481 } 482 483 /** 484 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() 485 * can raise memory free event and the callback function will spin on the lock. 486 * 487 * @param mr 488 * Pointer to MR to free. 489 */ 490 void 491 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb) 492 { 493 if (mr == NULL) 494 return; 495 DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); 496 dereg_mr_cb(&mr->pmd_mr); 497 rte_bitmap_free(mr->ms_bmp); 498 mlx5_free(mr); 499 } 500 501 void 502 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache) 503 { 504 struct mlx5_mr *mr; 505 506 DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache); 507 /* Flush cache to rebuild. */ 508 share_cache->cache.len = 1; 509 /* Iterate all the existing MRs. 
*/ 510 LIST_FOREACH(mr, &share_cache->mr_list, mr) 511 if (mlx5_mr_insert_cache(share_cache, mr) < 0) 512 return; 513 } 514 515 /** 516 * Release resources of detached MR having no online entry. 517 * 518 * @param share_cache 519 * Pointer to a global shared MR cache. 520 */ 521 static void 522 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache) 523 { 524 struct mlx5_mr *mr_next; 525 struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); 526 527 /* Must be called from the primary process. */ 528 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 529 /* 530 * MR can't be freed with holding the lock because rte_free() could call 531 * memory free callback function. This will be a deadlock situation. 532 */ 533 rte_rwlock_write_lock(&share_cache->rwlock); 534 /* Detach the whole free list and release it after unlocking. */ 535 free_list = share_cache->mr_free_list; 536 LIST_INIT(&share_cache->mr_free_list); 537 rte_rwlock_write_unlock(&share_cache->rwlock); 538 /* Release resources. */ 539 mr_next = LIST_FIRST(&free_list); 540 while (mr_next != NULL) { 541 struct mlx5_mr *mr = mr_next; 542 543 mr_next = LIST_NEXT(mr, mr); 544 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 545 } 546 } 547 548 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ 549 static int 550 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, 551 const struct rte_memseg *ms, size_t len, void *arg) 552 { 553 struct mr_find_contig_memsegs_data *data = arg; 554 555 if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) 556 return 0; 557 /* Found, save it and stop walking. */ 558 data->start = ms->addr_64; 559 data->end = ms->addr_64 + len; 560 data->msl = msl; 561 return 1; 562 } 563 564 /** 565 * Get the number of virtually-contiguous chunks in the MR. 566 * HW MR does not need to be already created to use this function. 567 * 568 * @param mr 569 * Pointer to the MR. 570 * 571 * @return 572 * Number of chunks. 573 */ 574 static uint32_t 575 mr_get_chunk_count(const struct mlx5_mr *mr) 576 { 577 uint32_t i, count = 0; 578 bool was_in_chunk = false; 579 bool is_in_chunk; 580 581 /* There is only one chunk in case of external memory. */ 582 if (mr->msl == NULL) 583 return 1; 584 for (i = 0; i < mr->ms_bmp_n; i++) { 585 is_in_chunk = rte_bitmap_get(mr->ms_bmp, i); 586 if (!was_in_chunk && is_in_chunk) 587 count++; 588 was_in_chunk = is_in_chunk; 589 } 590 return count; 591 } 592 593 /** 594 * Thread-safely expand the global MR cache to at least @p new_size slots. 595 * 596 * @param share_cache 597 * Shared MR cache for locking. 598 * @param new_size 599 * Desired cache size. 600 * @param socket 601 * NUMA node. 602 * 603 * @return 604 * 0 in success, negative on failure and rte_errno is set. 
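 *
 * A minimal caller sketch (illustrative only; "bt", "chunks_n" and "socket"
 * are placeholders). The function takes share_cache->rwlock internally, so
 * the caller must not hold that lock:
 *
 * @code
 * // Make room for "chunks_n" more entries before a bulk insertion.
 * if (bt->len + chunks_n > bt->size &&
 *     mlx5_mr_expand_cache(share_cache, bt->size + chunks_n, socket) < 0)
 *	return -1; // rte_errno is set by the callee.
 * @endcode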
605 */ 606 int 607 mlx5_mr_expand_cache(struct mlx5_mr_share_cache *share_cache, 608 uint32_t size, int socket) 609 { 610 struct mlx5_mr_btree cache = {0}; 611 struct mlx5_mr_btree *bt; 612 struct mr_cache_entry *lkp_tbl; 613 int ret; 614 615 size = rte_align32pow2(size); 616 ret = mlx5_mr_btree_init(&cache, size, socket); 617 if (ret < 0) 618 return ret; 619 rte_rwlock_write_lock(&share_cache->rwlock); 620 bt = &share_cache->cache; 621 lkp_tbl = *bt->table; 622 if (cache.size > bt->size) { 623 rte_memcpy(cache.table, lkp_tbl, bt->len * sizeof(lkp_tbl[0])); 624 RTE_SWAP(*bt, cache); 625 DRV_LOG(DEBUG, "Global MR cache expanded to %u slots", size); 626 } 627 rte_rwlock_write_unlock(&share_cache->rwlock); 628 mlx5_mr_btree_free(&cache); 629 return 0; 630 } 631 632 /** 633 * Create a new global Memory Region (MR) for a missing virtual address. 634 * This API should be called on a secondary process, then a request is sent to 635 * the primary process in order to create a MR for the address. As the global MR 636 * list is on the shared memory, following LKey lookup should succeed unless the 637 * request fails. 638 * 639 * @param cdev 640 * Pointer to the mlx5 common device. 641 * @param share_cache 642 * Pointer to a global shared MR cache. 643 * @param[out] entry 644 * Pointer to returning MR cache entry, found in the global cache or newly 645 * created. If failed to create one, this will not be updated. 646 * @param addr 647 * Target virtual address to register. 648 * 649 * @return 650 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 651 */ 652 static uint32_t 653 mlx5_mr_create_secondary(struct mlx5_common_device *cdev, 654 struct mlx5_mr_share_cache *share_cache, 655 struct mr_cache_entry *entry, uintptr_t addr) 656 { 657 int ret; 658 659 DRV_LOG(DEBUG, "Requesting MR creation for address (%p)", (void *)addr); 660 ret = mlx5_mp_req_mr_create(cdev, addr); 661 if (ret) { 662 DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)", 663 (void *)addr); 664 return UINT32_MAX; 665 } 666 rte_rwlock_read_lock(&share_cache->rwlock); 667 /* Fill in output data. */ 668 mlx5_mr_lookup_cache(share_cache, entry, addr); 669 /* Lookup can't fail. */ 670 MLX5_ASSERT(entry->lkey != UINT32_MAX); 671 rte_rwlock_read_unlock(&share_cache->rwlock); 672 DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n" 673 " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", 674 (void *)addr, entry->start, entry->end, entry->lkey); 675 return entry->lkey; 676 } 677 678 /** 679 * Create a new global Memory Region (MR) for a missing virtual address. 680 * Register entire virtually contiguous memory chunk around the address. 681 * 682 * @param pd 683 * Pointer to pd of a device (net, regex, vdpa,...). 684 * @param share_cache 685 * Pointer to a global shared MR cache. 686 * @param[out] entry 687 * Pointer to returning MR cache entry, found in the global cache or newly 688 * created. If failed to create one, this will not be updated. 689 * @param addr 690 * Target virtual address to register. 691 * @param mr_ext_memseg_en 692 * Configurable flag about external memory segment enable or not. 693 * 694 * @return 695 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 
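 *
 * Typical invocation (illustrative only), mirroring mlx5_mr_create() in the
 * primary process, where "entry" is a struct mr_cache_entry pointer:
 *
 * @code
 * lkey = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr,
 *				 cdev->config.mr_ext_memseg_en);
 * @endcode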
696 */ 697 static uint32_t 698 mlx5_mr_create_primary(void *pd, 699 struct mlx5_mr_share_cache *share_cache, 700 struct mr_cache_entry *entry, uintptr_t addr, 701 unsigned int mr_ext_memseg_en) 702 { 703 struct mr_find_contig_memsegs_data data = {.addr = addr, }; 704 struct mr_find_contig_memsegs_data data_re; 705 const struct rte_memseg_list *msl; 706 const struct rte_memseg *ms; 707 struct mlx5_mr_btree *bt; 708 struct mlx5_mr *mr = NULL; 709 int ms_idx_shift = -1; 710 uint32_t bmp_size; 711 void *bmp_mem; 712 uint32_t ms_n; 713 uint32_t n; 714 uint32_t chunks_n; 715 size_t len; 716 717 DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr); 718 /* 719 * Release detached MRs if any. This can't be called with holding either 720 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have 721 * been detached by the memory free event but it couldn't be released 722 * inside the callback due to deadlock. As a result, releasing resources 723 * is quite opportunistic. 724 */ 725 mlx5_mr_garbage_collect(share_cache); 726 find_range: 727 /* 728 * If enabled, find out a contiguous virtual address chunk in use, to 729 * which the given address belongs, in order to register maximum range. 730 * In the best case where mempools are not dynamically recreated and 731 * '--socket-mem' is specified as an EAL option, it is very likely to 732 * have only one MR(LKey) per a socket and per a hugepage-size even 733 * though the system memory is highly fragmented. As the whole memory 734 * chunk will be pinned by kernel, it can't be reused unless entire 735 * chunk is freed from EAL. 736 * 737 * If disabled, just register one memseg (page). Then, memory 738 * consumption will be minimized but it may drop performance if there 739 * are many MRs to lookup on the datapath. 740 */ 741 if (!mr_ext_memseg_en) { 742 data.msl = rte_mem_virt2memseg_list((void *)addr); 743 data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); 744 data.end = data.start + data.msl->page_sz; 745 } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { 746 DRV_LOG(WARNING, 747 "Unable to find virtually contiguous" 748 " chunk for address (%p)." 749 " rte_memseg_contig_walk() failed.", (void *)addr); 750 rte_errno = ENXIO; 751 goto err_nolock; 752 } 753 alloc_resources: 754 /* Addresses must be page-aligned. */ 755 MLX5_ASSERT(data.msl); 756 MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz)); 757 MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz)); 758 msl = data.msl; 759 ms = rte_mem_virt2memseg((void *)data.start, msl); 760 len = data.end - data.start; 761 MLX5_ASSERT(ms); 762 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 763 /* Number of memsegs in the range. */ 764 ms_n = len / msl->page_sz; 765 DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 766 " page_sz=0x%" PRIx64 ", ms_n=%u", 767 (void *)addr, data.start, data.end, msl->page_sz, ms_n); 768 /* Size of memory for bitmap. */ 769 bmp_size = rte_bitmap_get_memory_footprint(ms_n); 770 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 771 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) + 772 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id); 773 if (mr == NULL) { 774 DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of" 775 " address (%p).", (void *)addr); 776 rte_errno = ENOMEM; 777 goto err_nolock; 778 } 779 mr->msl = msl; 780 /* 781 * Save the index of the first memseg and initialize memseg bitmap. 
To 782 * see if a memseg of ms_idx in the memseg-list is still valid, check: 783 * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) 784 */ 785 mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 786 bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); 787 mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); 788 if (mr->ms_bmp == NULL) { 789 DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of" 790 " address (%p).", (void *)addr); 791 rte_errno = EINVAL; 792 goto err_nolock; 793 } 794 /* 795 * Should recheck whether the extended contiguous chunk is still valid. 796 * Because memory_hotplug_lock can't be held if there's any memory 797 * related calls in a critical path, resource allocation above can't be 798 * locked. If the memory has been changed at this point, try again with 799 * just single page. If not, go on with the big chunk atomically from 800 * here. 801 */ 802 rte_mcfg_mem_read_lock(); 803 data_re = data; 804 if (len > msl->page_sz && 805 !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { 806 DRV_LOG(DEBUG, 807 "Unable to find virtually contiguous chunk for address " 808 "(%p). rte_memseg_contig_walk() failed.", (void *)addr); 809 rte_errno = ENXIO; 810 goto err_memlock; 811 } 812 if (data.start != data_re.start || data.end != data_re.end) { 813 /* 814 * The extended contiguous chunk has been changed. Try again 815 * with single memseg instead. 816 */ 817 data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); 818 data.end = data.start + msl->page_sz; 819 rte_mcfg_mem_read_unlock(); 820 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 821 goto alloc_resources; 822 } 823 MLX5_ASSERT(data.msl == data_re.msl); 824 rte_rwlock_write_lock(&share_cache->rwlock); 825 /* 826 * Check the address is really missing. If other thread already created 827 * one or it is not found due to overflow, abort and return. 828 */ 829 if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) { 830 /* 831 * Insert to the global cache table. It may fail due to 832 * low-on-memory. Then, this entry will have to be searched 833 * here again. 834 */ 835 mr_btree_insert(&share_cache->cache, entry); 836 DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort", 837 (void *)addr); 838 rte_rwlock_write_unlock(&share_cache->rwlock); 839 rte_mcfg_mem_read_unlock(); 840 /* 841 * Must be unlocked before calling rte_free() because 842 * mlx5_mr_mem_event_free_cb() can be called inside. 843 */ 844 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 845 return entry->lkey; 846 } 847 /* 848 * Trim start and end addresses for verbs MR. Set bits for registering 849 * memsegs but exclude already registered ones. Bitmap can be 850 * fragmented. 851 */ 852 for (n = 0; n < ms_n; ++n) { 853 uintptr_t start; 854 struct mr_cache_entry ret; 855 856 memset(&ret, 0, sizeof(ret)); 857 start = data_re.start + n * msl->page_sz; 858 /* Exclude memsegs already registered by other MRs. */ 859 if (mlx5_mr_lookup_cache(share_cache, &ret, start) == 860 UINT32_MAX) { 861 /* 862 * Start from the first unregistered memseg in the 863 * extended range. 864 */ 865 if (ms_idx_shift == -1) { 866 mr->ms_base_idx += n; 867 data.start = start; 868 ms_idx_shift = n; 869 } 870 data.end = start + msl->page_sz; 871 rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); 872 ++mr->ms_n; 873 } 874 } 875 len = data.end - data.start; 876 mr->ms_bmp_n = len / msl->page_sz; 877 MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n); 878 /* 879 * It is now known how many entries will be used in the global cache. 
880 * If there is not enough, expand the cache. 881 * This cannot be done while holding the memory hotplug lock. 882 * While it is released, memory layout may change, 883 * so the process must be repeated from the beginning. 884 */ 885 bt = &share_cache->cache; 886 chunks_n = mr_get_chunk_count(mr); 887 if (bt->len + chunks_n > bt->size) { 888 struct mlx5_common_device *cdev; 889 uint32_t size; 890 891 size = bt->size + chunks_n; 892 MLX5_ASSERT(size > bt->size); 893 cdev = container_of(share_cache, struct mlx5_common_device, 894 mr_scache); 895 rte_rwlock_write_unlock(&share_cache->rwlock); 896 rte_mcfg_mem_read_unlock(); 897 if (mlx5_mr_expand_cache(share_cache, size, 898 cdev->dev->numa_node) < 0) { 899 DRV_LOG(ERR, "Failed to expand global MR cache to %u slots", 900 size); 901 goto err_nolock; 902 } 903 goto find_range; 904 } 905 /* 906 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can 907 * be called with holding the memory lock because it doesn't use 908 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() 909 * through mlx5_alloc_verbs_buf(). 910 */ 911 share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr); 912 if (mr->pmd_mr.obj == NULL) { 913 DRV_LOG(DEBUG, "Fail to create an MR for address (%p)", 914 (void *)addr); 915 rte_errno = EINVAL; 916 goto err_mrlock; 917 } 918 MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start); 919 MLX5_ASSERT(mr->pmd_mr.len); 920 LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr); 921 DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n" 922 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 923 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 924 (void *)mr, (void *)addr, data.start, data.end, 925 rte_cpu_to_be_32(mr->pmd_mr.lkey), 926 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 927 /* Insert to the global cache table. */ 928 mlx5_mr_insert_cache(share_cache, mr); 929 /* Fill in output data. */ 930 mlx5_mr_lookup_cache(share_cache, entry, addr); 931 /* Lookup can't fail. */ 932 MLX5_ASSERT(entry->lkey != UINT32_MAX); 933 rte_rwlock_write_unlock(&share_cache->rwlock); 934 rte_mcfg_mem_read_unlock(); 935 return entry->lkey; 936 err_mrlock: 937 rte_rwlock_write_unlock(&share_cache->rwlock); 938 err_memlock: 939 rte_mcfg_mem_read_unlock(); 940 err_nolock: 941 /* 942 * In case of error, as this can be called in a datapath, a warning 943 * message per an error is preferable instead. Must be unlocked before 944 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called 945 * inside. 946 */ 947 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 948 return UINT32_MAX; 949 } 950 951 /** 952 * Create a new global Memory Region (MR) for a missing virtual address. 953 * This can be called from primary and secondary process. 954 * 955 * @param cdev 956 * Pointer to the mlx5 common device. 957 * @param share_cache 958 * Pointer to a global shared MR cache. 959 * @param[out] entry 960 * Pointer to returning MR cache entry, found in the global cache or newly 961 * created. If failed to create one, this will not be updated. 962 * @param addr 963 * Target virtual address to register. 964 * 965 * @return 966 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 
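 *
 * A minimal caller sketch (illustrative only), mirroring the bottom-half
 * path in mr_lookup_caches():
 *
 * @code
 * struct mr_cache_entry entry;
 * uint32_t lkey = mlx5_mr_create(cdev, share_cache, &entry, addr);
 *
 * if (lkey != UINT32_MAX)
 *	mr_btree_insert(&mr_ctrl->cache_bh, &entry);
 * @endcode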
967 */ 968 uint32_t 969 mlx5_mr_create(struct mlx5_common_device *cdev, 970 struct mlx5_mr_share_cache *share_cache, 971 struct mr_cache_entry *entry, uintptr_t addr) 972 { 973 uint32_t ret = 0; 974 975 switch (rte_eal_process_type()) { 976 case RTE_PROC_PRIMARY: 977 ret = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr, 978 cdev->config.mr_ext_memseg_en); 979 break; 980 case RTE_PROC_SECONDARY: 981 ret = mlx5_mr_create_secondary(cdev, share_cache, entry, addr); 982 break; 983 default: 984 break; 985 } 986 return ret; 987 } 988 989 /** 990 * Look up address in the global MR cache table. If not found, create a new MR. 991 * Insert the found/created entry to local bottom-half cache table. 992 * 993 * @param mr_ctrl 994 * Pointer to per-queue MR control structure. 995 * @param[out] entry 996 * Pointer to returning MR cache entry, found in the global cache or newly 997 * created. If failed to create one, this is not written. 998 * @param addr 999 * Search key. 1000 * 1001 * @return 1002 * Searched LKey on success, UINT32_MAX on no match. 1003 */ 1004 static uint32_t 1005 mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl, 1006 struct mr_cache_entry *entry, uintptr_t addr) 1007 { 1008 struct mlx5_mr_share_cache *share_cache = 1009 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 1010 dev_gen); 1011 struct mlx5_common_device *cdev = 1012 container_of(share_cache, struct mlx5_common_device, mr_scache); 1013 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 1014 uint32_t lkey; 1015 uint32_t idx; 1016 1017 /* If local cache table is full, try to double it. */ 1018 if (unlikely(bt->len == bt->size)) 1019 mr_btree_expand(bt, bt->size << 1); 1020 /* Look up in the global cache. */ 1021 rte_rwlock_read_lock(&share_cache->rwlock); 1022 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 1023 if (lkey != UINT32_MAX) { 1024 /* Found. */ 1025 *entry = (*share_cache->cache.table)[idx]; 1026 rte_rwlock_read_unlock(&share_cache->rwlock); 1027 /* 1028 * Update local cache. Even if it fails, return the found entry 1029 * to update top-half cache. Next time, this entry will be found 1030 * in the global cache. 1031 */ 1032 mr_btree_insert(bt, entry); 1033 return lkey; 1034 } 1035 rte_rwlock_read_unlock(&share_cache->rwlock); 1036 /* First time to see the address? Create a new MR. */ 1037 lkey = mlx5_mr_create(cdev, share_cache, entry, addr); 1038 /* 1039 * Update the local cache if successfully created a new global MR. Even 1040 * if failed to create one, there's no action to take in this datapath 1041 * code. As returning LKey is invalid, this will eventually make HW 1042 * fail. 1043 */ 1044 if (lkey != UINT32_MAX) 1045 mr_btree_insert(bt, entry); 1046 return lkey; 1047 } 1048 1049 /** 1050 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if 1051 * misses, search in the global MR cache table and update the new entry to 1052 * per-queue local caches. 1053 * 1054 * @param mr_ctrl 1055 * Pointer to per-queue MR control structure. 1056 * @param addr 1057 * Search key. 1058 * 1059 * @return 1060 * Searched LKey on success, UINT32_MAX on no match. 1061 */ 1062 uint32_t 1063 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr) 1064 { 1065 uint32_t lkey; 1066 uint32_t bh_idx = 0; 1067 /* Victim in top-half cache to replace with new entry. */ 1068 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 1069 1070 /* Binary-search MR translation table. */ 1071 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 1072 /* Update top-half cache. 
*/ 1073 if (likely(lkey != UINT32_MAX)) { 1074 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1075 } else { 1076 /* 1077 * If missed in local lookup table, search in the global cache 1078 * and local cache_bh[] will be updated inside if possible. 1079 * Top-half cache entry will also be updated. 1080 */ 1081 lkey = mr_lookup_caches(mr_ctrl, repl, addr); 1082 if (unlikely(lkey == UINT32_MAX)) 1083 return UINT32_MAX; 1084 } 1085 /* Update the most recently used entry. */ 1086 mr_ctrl->mru = mr_ctrl->head; 1087 /* Point to the next victim, the oldest. */ 1088 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1089 return lkey; 1090 } 1091 1092 /** 1093 * Release all the created MRs and resources on global MR cache of a device 1094 * list. 1095 * 1096 * @param share_cache 1097 * Pointer to a global shared MR cache. 1098 */ 1099 void 1100 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache) 1101 { 1102 struct mlx5_mr *mr_next; 1103 1104 rte_rwlock_write_lock(&share_cache->rwlock); 1105 /* Detach from MR list and move to free list. */ 1106 mr_next = LIST_FIRST(&share_cache->mr_list); 1107 while (mr_next != NULL) { 1108 struct mlx5_mr *mr = mr_next; 1109 1110 mr_next = LIST_NEXT(mr, mr); 1111 LIST_REMOVE(mr, mr); 1112 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1113 } 1114 LIST_INIT(&share_cache->mr_list); 1115 /* Free global cache. */ 1116 mlx5_mr_btree_free(&share_cache->cache); 1117 rte_rwlock_write_unlock(&share_cache->rwlock); 1118 /* Free all remaining MRs. */ 1119 mlx5_mr_garbage_collect(share_cache); 1120 } 1121 1122 /** 1123 * Initialize global MR cache of a device. 1124 * 1125 * @param share_cache 1126 * Pointer to a global shared MR cache. 1127 * @param socket 1128 * NUMA socket on which memory must be allocated. 1129 * 1130 * @return 1131 * 0 on success, a negative errno value otherwise and rte_errno is set. 1132 */ 1133 int 1134 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket) 1135 { 1136 /* Set the reg_mr and dereg_mr callback functions */ 1137 mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb, 1138 &share_cache->dereg_mr_cb); 1139 rte_rwlock_init(&share_cache->rwlock); 1140 rte_rwlock_init(&share_cache->mprwlock); 1141 /* Initialize B-tree and allocate memory for global MR cache table. */ 1142 return mlx5_mr_btree_init(&share_cache->cache, 1143 MLX5_MR_BTREE_CACHE_N * 2, socket); 1144 } 1145 1146 /** 1147 * Flush all of the local cache entries. 1148 * 1149 * @param mr_ctrl 1150 * Pointer to per-queue MR local cache. 1151 */ 1152 void 1153 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) 1154 { 1155 /* Reset the most-recently-used index. */ 1156 mr_ctrl->mru = 0; 1157 /* Reset the linear search array. */ 1158 mr_ctrl->head = 0; 1159 memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); 1160 /* Reset the B-tree table. */ 1161 mr_ctrl->cache_bh.len = 1; 1162 /* Update the generation number. */ 1163 mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; 1164 DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d", 1165 (void *)mr_ctrl, mr_ctrl->cur_gen); 1166 } 1167 1168 /** 1169 * Creates a memory region for external memory, that is memory which is not 1170 * part of the DPDK memory segments. 1171 * 1172 * @param pd 1173 * Pointer to pd of a device (net, regex, vdpa,...). 1174 * @param addr 1175 * Starting virtual address of memory. 1176 * @param len 1177 * Length of memory segment being mapped. 1178 * @param socked_id 1179 * Socket to allocate heap memory for the control structures. 
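 * @param reg_mr_cb
 *   Callback used to register the memory chunk with the device.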
1180 * 1181 * @return 1182 * Pointer to MR structure on success, NULL otherwise. 1183 */ 1184 struct mlx5_mr * 1185 mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id, 1186 mlx5_reg_mr_t reg_mr_cb) 1187 { 1188 struct mlx5_mr *mr = NULL; 1189 1190 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1191 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE), 1192 RTE_CACHE_LINE_SIZE, socket_id); 1193 if (mr == NULL) 1194 return NULL; 1195 reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr); 1196 if (mr->pmd_mr.obj == NULL) { 1197 DRV_LOG(WARNING, 1198 "Fail to create MR for address (%p)", 1199 (void *)addr); 1200 mlx5_free(mr); 1201 return NULL; 1202 } 1203 mr->msl = NULL; /* Mark it is external memory. */ 1204 mr->ms_bmp = NULL; 1205 mr->ms_n = 1; 1206 mr->ms_bmp_n = 1; 1207 DRV_LOG(DEBUG, 1208 "MR CREATED (%p) for external memory %p:\n" 1209 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 1210 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 1211 (void *)mr, (void *)addr, 1212 addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1213 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 1214 return mr; 1215 } 1216 1217 /** 1218 * Callback for memory free event. Iterate freed memsegs and check whether it 1219 * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a 1220 * result, the MR would be fragmented. If it becomes empty, the MR will be freed 1221 * later by mlx5_mr_garbage_collect(). Even if this callback is called from a 1222 * secondary process, the garbage collector will be called in primary process 1223 * as the secondary process can't call mlx5_mr_create(). 1224 * 1225 * The global cache must be rebuilt if there's any change and this event has to 1226 * be propagated to dataplane threads to flush the local caches. 1227 * 1228 * @param share_cache 1229 * Pointer to a global shared MR cache. 1230 * @param ibdev_name 1231 * Name of ibv device. 1232 * @param addr 1233 * Address of freed memory. 1234 * @param len 1235 * Size of freed memory. 1236 */ 1237 void 1238 mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache, 1239 const char *ibdev_name, const void *addr, size_t len) 1240 { 1241 const struct rte_memseg_list *msl; 1242 struct mlx5_mr *mr; 1243 int ms_n; 1244 int i; 1245 int rebuild = 0; 1246 1247 DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu", 1248 ibdev_name, addr, len); 1249 msl = rte_mem_virt2memseg_list(addr); 1250 /* addr and len must be page-aligned. */ 1251 MLX5_ASSERT((uintptr_t)addr == 1252 RTE_ALIGN((uintptr_t)addr, msl->page_sz)); 1253 MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz)); 1254 ms_n = len / msl->page_sz; 1255 rte_rwlock_write_lock(&share_cache->rwlock); 1256 /* Clear bits of freed memsegs from MR. */ 1257 for (i = 0; i < ms_n; ++i) { 1258 const struct rte_memseg *ms; 1259 struct mr_cache_entry entry; 1260 uintptr_t start; 1261 int ms_idx; 1262 uint32_t pos; 1263 1264 /* Find MR having this memseg. */ 1265 start = (uintptr_t)addr + i * msl->page_sz; 1266 mr = mlx5_mr_lookup_list(share_cache, &entry, start); 1267 if (mr == NULL) 1268 continue; 1269 MLX5_ASSERT(mr->msl); /* Can't be external memory. 
*/ 1270 ms = rte_mem_virt2memseg((void *)start, msl); 1271 MLX5_ASSERT(ms != NULL); 1272 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 1273 ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 1274 pos = ms_idx - mr->ms_base_idx; 1275 MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); 1276 MLX5_ASSERT(pos < mr->ms_bmp_n); 1277 DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p", 1278 ibdev_name, (void *)mr, pos, (void *)start); 1279 rte_bitmap_clear(mr->ms_bmp, pos); 1280 if (--mr->ms_n == 0) { 1281 LIST_REMOVE(mr, mr); 1282 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1283 DRV_LOG(DEBUG, "device %s remove MR(%p) from list", 1284 ibdev_name, (void *)mr); 1285 } 1286 /* 1287 * MR is fragmented or will be freed. the global cache must be 1288 * rebuilt. 1289 */ 1290 rebuild = 1; 1291 } 1292 if (rebuild) { 1293 mlx5_mr_rebuild_cache(share_cache); 1294 /* 1295 * No explicit wmb is needed after updating dev_gen due to 1296 * store-release ordering in unlock that provides the 1297 * implicit barrier at the software visible level. 1298 */ 1299 ++share_cache->dev_gen; 1300 DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d", 1301 share_cache->dev_gen); 1302 } 1303 rte_rwlock_write_unlock(&share_cache->rwlock); 1304 } 1305 1306 /** 1307 * Dump all the created MRs and the global cache entries. 1308 * 1309 * @param share_cache 1310 * Pointer to a global shared MR cache. 1311 */ 1312 void 1313 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused) 1314 { 1315 #ifdef RTE_LIBRTE_MLX5_DEBUG 1316 struct mlx5_mr *mr; 1317 int mr_n = 0; 1318 int chunk_n = 0; 1319 1320 rte_rwlock_read_lock(&share_cache->rwlock); 1321 /* Iterate all the existing MRs. */ 1322 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 1323 unsigned int n; 1324 1325 DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", 1326 mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1327 mr->ms_n, mr->ms_bmp_n); 1328 if (mr->ms_n == 0) 1329 continue; 1330 for (n = 0; n < mr->ms_bmp_n; ) { 1331 struct mr_cache_entry ret = { 0, }; 1332 1333 n = mr_find_next_chunk(mr, &ret, n); 1334 if (!ret.end) 1335 break; 1336 DRV_LOG(DEBUG, 1337 " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", 1338 chunk_n++, ret.start, ret.end); 1339 } 1340 } 1341 DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache); 1342 mlx5_mr_btree_dump(&share_cache->cache); 1343 rte_rwlock_read_unlock(&share_cache->rwlock); 1344 #endif 1345 } 1346 1347 static int 1348 mlx5_range_compare_start(const void *lhs, const void *rhs) 1349 { 1350 const struct mlx5_range *r1 = lhs, *r2 = rhs; 1351 1352 if (r1->start > r2->start) 1353 return 1; 1354 else if (r1->start < r2->start) 1355 return -1; 1356 return 0; 1357 } 1358 1359 static void 1360 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque, 1361 struct rte_mempool_memhdr *memhdr, 1362 unsigned int idx) 1363 { 1364 struct mlx5_range *ranges = opaque, *range = &ranges[idx]; 1365 uintptr_t start = (uintptr_t)memhdr->addr; 1366 uint64_t page_size = rte_mem_page_size(); 1367 1368 RTE_SET_USED(mp); 1369 range->start = RTE_ALIGN_FLOOR(start, page_size); 1370 range->end = RTE_ALIGN_CEIL(start + memhdr->len, page_size); 1371 } 1372 1373 /** 1374 * Collect page-aligned memory ranges of the mempool. 
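 * Each chunk [addr, addr + len) is widened to system page boundaries by
 * mlx5_range_from_mempool_chunk(); for example, with a 4 KiB page size a
 * chunk [0x101100, 0x102f00) is reported as the range [0x101000, 0x103000).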
1375 */ 1376 static int 1377 mlx5_mempool_get_chunks(struct rte_mempool *mp, struct mlx5_range **out, 1378 unsigned int *out_n) 1379 { 1380 unsigned int n; 1381 1382 DRV_LOG(DEBUG, "Collecting chunks of regular mempool %s", mp->name); 1383 n = mp->nb_mem_chunks; 1384 *out = calloc(n, sizeof(**out)); 1385 if (*out == NULL) 1386 return -1; 1387 rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, *out); 1388 *out_n = n; 1389 return 0; 1390 } 1391 1392 struct mlx5_mempool_get_extmem_data { 1393 struct mlx5_range *heap; 1394 unsigned int heap_size; 1395 int ret; 1396 }; 1397 1398 static void 1399 mlx5_mempool_get_extmem_cb(struct rte_mempool *mp, void *opaque, 1400 void *obj, unsigned int obj_idx) 1401 { 1402 struct mlx5_mempool_get_extmem_data *data = opaque; 1403 struct rte_mbuf *mbuf = obj; 1404 uintptr_t addr = (uintptr_t)mbuf->buf_addr; 1405 struct mlx5_range *seg, *heap; 1406 struct rte_memseg_list *msl; 1407 size_t page_size; 1408 uintptr_t page_start; 1409 unsigned int pos = 0, len = data->heap_size, delta; 1410 1411 RTE_SET_USED(mp); 1412 RTE_SET_USED(obj_idx); 1413 if (data->ret < 0) 1414 return; 1415 /* Binary search for an already visited page. */ 1416 while (len > 1) { 1417 delta = len / 2; 1418 if (addr < data->heap[pos + delta].start) { 1419 len = delta; 1420 } else { 1421 pos += delta; 1422 len -= delta; 1423 } 1424 } 1425 if (data->heap != NULL) { 1426 seg = &data->heap[pos]; 1427 if (seg->start <= addr && addr < seg->end) 1428 return; 1429 } 1430 /* Determine the page boundaries and remember them. */ 1431 heap = realloc(data->heap, sizeof(heap[0]) * (data->heap_size + 1)); 1432 if (heap == NULL) { 1433 free(data->heap); 1434 data->heap = NULL; 1435 data->ret = -1; 1436 return; 1437 } 1438 data->heap = heap; 1439 data->heap_size++; 1440 seg = &heap[data->heap_size - 1]; 1441 msl = rte_mem_virt2memseg_list((void *)addr); 1442 page_size = msl != NULL ? msl->page_sz : rte_mem_page_size(); 1443 page_start = RTE_PTR_ALIGN_FLOOR(addr, page_size); 1444 seg->start = page_start; 1445 seg->end = page_start + page_size; 1446 /* Maintain the heap order. */ 1447 qsort(data->heap, data->heap_size, sizeof(heap[0]), 1448 mlx5_range_compare_start); 1449 } 1450 1451 /** 1452 * Recover pages of external memory as close as possible 1453 * for a mempool with RTE_PKTMBUF_POOL_PINNED_EXT_BUF. 1454 * Pages are stored in a heap for efficient search, for mbufs are many. 1455 */ 1456 static int 1457 mlx5_mempool_get_extmem(struct rte_mempool *mp, struct mlx5_range **out, 1458 unsigned int *out_n) 1459 { 1460 struct mlx5_mempool_get_extmem_data data; 1461 1462 DRV_LOG(DEBUG, "Recovering external pinned pages of mempool %s", 1463 mp->name); 1464 memset(&data, 0, sizeof(data)); 1465 rte_mempool_obj_iter(mp, mlx5_mempool_get_extmem_cb, &data); 1466 *out = data.heap; 1467 *out_n = data.heap_size; 1468 return data.ret; 1469 } 1470 1471 /** 1472 * Get VA-contiguous ranges of the mempool memory. 1473 * Each range start and end is aligned to the system page size. 1474 * 1475 * @param[in] mp 1476 * Analyzed mempool. 1477 * @param[in] is_extmem 1478 * Whether the pool is contains only external pinned buffers. 1479 * @param[out] out 1480 * Receives the ranges, caller must release it with free(). 1481 * @param[out] out_n 1482 * Receives the number of @p out elements. 1483 * 1484 * @return 1485 * 0 on success, (-1) on failure. 
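 *
 * For example, two page-aligned chunks [0x200000, 0x400000) and
 * [0x400000, 0x600000) are merged into the single range [0x200000, 0x600000),
 * while any gap between chunks starts a new range.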
1486 */ 1487 static int 1488 mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem, 1489 struct mlx5_range **out, unsigned int *out_n) 1490 { 1491 struct mlx5_range *chunks; 1492 unsigned int chunks_n, contig_n, i; 1493 int ret; 1494 1495 /* Collect the pool underlying memory. */ 1496 ret = is_extmem ? mlx5_mempool_get_extmem(mp, &chunks, &chunks_n) : 1497 mlx5_mempool_get_chunks(mp, &chunks, &chunks_n); 1498 if (ret < 0) 1499 return ret; 1500 /* Merge adjacent chunks and place them at the beginning. */ 1501 qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start); 1502 contig_n = 1; 1503 for (i = 1; i < chunks_n; i++) 1504 if (chunks[i - 1].end != chunks[i].start) { 1505 chunks[contig_n - 1].end = chunks[i - 1].end; 1506 chunks[contig_n] = chunks[i]; 1507 contig_n++; 1508 } 1509 /* Extend the last contiguous chunk to the end of the mempool. */ 1510 chunks[contig_n - 1].end = chunks[i - 1].end; 1511 *out = chunks; 1512 *out_n = contig_n; 1513 return 0; 1514 } 1515 1516 /** 1517 * Analyze mempool memory to select memory ranges to register. 1518 * 1519 * @param[in] mp 1520 * Mempool to analyze. 1521 * @param[in] is_extmem 1522 * Whether the pool is contains only external pinned buffers. 1523 * @param[out] out 1524 * Receives memory ranges to register, aligned to the system page size. 1525 * The caller must release them with free(). 1526 * @param[out] out_n 1527 * Receives the number of @p out items. 1528 * @param[out] share_hugepage 1529 * Receives True if the entire pool resides within a single hugepage. 1530 * 1531 * @return 1532 * 0 on success, (-1) on failure. 1533 */ 1534 static int 1535 mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem, 1536 struct mlx5_range **out, unsigned int *out_n, 1537 bool *share_hugepage) 1538 { 1539 struct mlx5_range *ranges = NULL; 1540 unsigned int i, ranges_n = 0; 1541 struct rte_memseg_list *msl; 1542 1543 if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) { 1544 DRV_LOG(ERR, "Cannot get address ranges for mempool %s", 1545 mp->name); 1546 return -1; 1547 } 1548 /* Check if the hugepage of the pool can be shared. */ 1549 *share_hugepage = false; 1550 msl = rte_mem_virt2memseg_list((void *)ranges[0].start); 1551 if (msl != NULL) { 1552 uint64_t hugepage_sz = 0; 1553 1554 /* Check that all ranges are on pages of the same size. */ 1555 for (i = 0; i < ranges_n; i++) { 1556 if (hugepage_sz != 0 && hugepage_sz != msl->page_sz) 1557 break; 1558 hugepage_sz = msl->page_sz; 1559 } 1560 if (i == ranges_n) { 1561 /* 1562 * If the entire pool is within one hugepage, 1563 * combine all ranges into one of the hugepage size. 1564 */ 1565 uintptr_t reg_start = ranges[0].start; 1566 uintptr_t reg_end = ranges[ranges_n - 1].end; 1567 uintptr_t hugepage_start = 1568 RTE_ALIGN_FLOOR(reg_start, hugepage_sz); 1569 uintptr_t hugepage_end = hugepage_start + hugepage_sz; 1570 if (reg_end < hugepage_end) { 1571 ranges[0].start = hugepage_start; 1572 ranges[0].end = hugepage_end; 1573 ranges_n = 1; 1574 *share_hugepage = true; 1575 } 1576 } 1577 } 1578 *out = ranges; 1579 *out_n = ranges_n; 1580 return 0; 1581 } 1582 1583 /** Create a registration object for the mempool. 
*/ 1584 static struct mlx5_mempool_reg * 1585 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n, 1586 bool is_extmem) 1587 { 1588 struct mlx5_mempool_reg *mpr = NULL; 1589 1590 mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1591 sizeof(struct mlx5_mempool_reg), 1592 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1593 if (mpr == NULL) { 1594 DRV_LOG(ERR, "Cannot allocate mempool %s registration object", 1595 mp->name); 1596 return NULL; 1597 } 1598 mpr->mrs = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1599 mrs_n * sizeof(struct mlx5_mempool_mr), 1600 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1601 if (!mpr->mrs) { 1602 DRV_LOG(ERR, "Cannot allocate mempool %s registration MRs", 1603 mp->name); 1604 mlx5_free(mpr); 1605 return NULL; 1606 } 1607 mpr->mp = mp; 1608 mpr->mrs_n = mrs_n; 1609 mpr->is_extmem = is_extmem; 1610 return mpr; 1611 } 1612 1613 /** 1614 * Destroy a mempool registration object. 1615 * 1616 * @param standalone 1617 * Whether @p mpr owns its MRs exclusively, i.e. they are not shared. 1618 */ 1619 static void 1620 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache, 1621 struct mlx5_mempool_reg *mpr, bool standalone) 1622 { 1623 if (standalone) { 1624 unsigned int i; 1625 1626 for (i = 0; i < mpr->mrs_n; i++) 1627 share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr); 1628 mlx5_free(mpr->mrs); 1629 } 1630 mlx5_free(mpr); 1631 } 1632 1633 /** Find registration object of a mempool. */ 1634 static struct mlx5_mempool_reg * 1635 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache, 1636 struct rte_mempool *mp) 1637 { 1638 struct mlx5_mempool_reg *mpr; 1639 1640 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1641 if (mpr->mp == mp) 1642 break; 1643 return mpr; 1644 } 1645 1646 /** Increment reference counters of MRs used in the registration. */ 1647 static void 1648 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr) 1649 { 1650 unsigned int i; 1651 1652 for (i = 0; i < mpr->mrs_n; i++) 1653 __atomic_fetch_add(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED); 1654 } 1655 1656 /** 1657 * Decrement reference counters of MRs used in the registration. 1658 * 1659 * @return True if no more references to @p mpr MRs exist, False otherwise. 1660 */ 1661 static bool 1662 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr) 1663 { 1664 unsigned int i; 1665 bool ret = false; 1666 1667 for (i = 0; i < mpr->mrs_n; i++) 1668 ret |= __atomic_fetch_sub(&mpr->mrs[i].refcnt, 1, 1669 __ATOMIC_RELAXED) - 1 == 0; 1670 return ret; 1671 } 1672 1673 static int 1674 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache, 1675 void *pd, struct rte_mempool *mp, 1676 bool is_extmem) 1677 { 1678 struct mlx5_range *ranges = NULL; 1679 struct mlx5_mempool_reg *mpr, *old_mpr, *new_mpr; 1680 unsigned int i, ranges_n; 1681 bool share_hugepage, standalone = false; 1682 int ret = -1; 1683 1684 /* Early check to avoid unnecessary creation of MRs. 
*/ 1685 rte_rwlock_read_lock(&share_cache->rwlock); 1686 old_mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1687 rte_rwlock_read_unlock(&share_cache->rwlock); 1688 if (old_mpr != NULL && (!is_extmem || old_mpr->is_extmem)) { 1689 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1690 mp->name, pd); 1691 rte_errno = EEXIST; 1692 goto exit; 1693 } 1694 if (mlx5_mempool_reg_analyze(mp, is_extmem, &ranges, &ranges_n, 1695 &share_hugepage) < 0) { 1696 DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name); 1697 rte_errno = ENOMEM; 1698 goto exit; 1699 } 1700 new_mpr = mlx5_mempool_reg_create(mp, ranges_n, is_extmem); 1701 if (new_mpr == NULL) { 1702 DRV_LOG(ERR, 1703 "Cannot create a registration object for mempool %s in PD %p", 1704 mp->name, pd); 1705 rte_errno = ENOMEM; 1706 goto exit; 1707 } 1708 /* 1709 * If the entire mempool fits in a single hugepage, the MR for this 1710 * hugepage can be shared across mempools that also fit in it. 1711 */ 1712 if (share_hugepage) { 1713 rte_rwlock_write_lock(&share_cache->rwlock); 1714 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) { 1715 if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start) 1716 break; 1717 } 1718 if (mpr != NULL) { 1719 new_mpr->mrs = mpr->mrs; 1720 mlx5_mempool_reg_attach(new_mpr); 1721 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, 1722 new_mpr, next); 1723 } 1724 rte_rwlock_write_unlock(&share_cache->rwlock); 1725 if (mpr != NULL) { 1726 DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s", 1727 mpr->mrs[0].pmd_mr.lkey, pd, mp->name, 1728 mpr->mp->name); 1729 ret = 0; 1730 goto exit; 1731 } 1732 } 1733 for (i = 0; i < ranges_n; i++) { 1734 struct mlx5_mempool_mr *mr = &new_mpr->mrs[i]; 1735 const struct mlx5_range *range = &ranges[i]; 1736 size_t len = range->end - range->start; 1737 1738 if (share_cache->reg_mr_cb(pd, (void *)range->start, len, 1739 &mr->pmd_mr) < 0) { 1740 DRV_LOG(ERR, 1741 "Failed to create an MR in PD %p for address range " 1742 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1743 pd, range->start, range->end, len, mp->name); 1744 break; 1745 } 1746 DRV_LOG(DEBUG, 1747 "Created a new MR %#x in PD %p for address range " 1748 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1749 mr->pmd_mr.lkey, pd, range->start, range->end, len, 1750 mp->name); 1751 } 1752 if (i != ranges_n) { 1753 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1754 rte_errno = EINVAL; 1755 goto exit; 1756 } 1757 /* Concurrent registration is not supposed to happen. */ 1758 rte_rwlock_write_lock(&share_cache->rwlock); 1759 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1760 if (mpr == old_mpr && old_mpr != NULL) { 1761 LIST_REMOVE(old_mpr, next); 1762 standalone = mlx5_mempool_reg_detach(mpr); 1763 /* No need to flush the cache: old MRs cannot be in use. 
*/ 1764 mpr = NULL; 1765 } 1766 if (mpr == NULL) { 1767 mlx5_mempool_reg_attach(new_mpr); 1768 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, new_mpr, next); 1769 ret = 0; 1770 } 1771 rte_rwlock_write_unlock(&share_cache->rwlock); 1772 if (mpr != NULL) { 1773 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1774 mp->name, pd); 1775 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1776 rte_errno = EEXIST; 1777 goto exit; 1778 } else if (old_mpr != NULL) { 1779 DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory", 1780 mp->name, pd); 1781 mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone); 1782 } 1783 exit: 1784 free(ranges); 1785 return ret; 1786 } 1787 1788 static int 1789 mlx5_mr_mempool_register_secondary(struct mlx5_common_device *cdev, 1790 struct rte_mempool *mp, bool is_extmem) 1791 { 1792 return mlx5_mp_req_mempool_reg(cdev, mp, true, is_extmem); 1793 } 1794 1795 /** 1796 * Register the memory of a mempool in the protection domain. 1797 * 1798 * @param cdev 1799 * Pointer to the mlx5 common device. 1800 * @param mp 1801 * Mempool to register. 1802 * 1803 * @return 1804 * 0 on success, (-1) on failure and rte_errno is set. 1805 */ 1806 int 1807 mlx5_mr_mempool_register(struct mlx5_common_device *cdev, 1808 struct rte_mempool *mp, bool is_extmem) 1809 { 1810 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1811 return 0; 1812 switch (rte_eal_process_type()) { 1813 case RTE_PROC_PRIMARY: 1814 return mlx5_mr_mempool_register_primary(&cdev->mr_scache, 1815 cdev->pd, mp, 1816 is_extmem); 1817 case RTE_PROC_SECONDARY: 1818 return mlx5_mr_mempool_register_secondary(cdev, mp, is_extmem); 1819 default: 1820 return -1; 1821 } 1822 } 1823 1824 static int 1825 mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache, 1826 struct rte_mempool *mp) 1827 { 1828 struct mlx5_mempool_reg *mpr; 1829 bool standalone = false; 1830 1831 rte_rwlock_write_lock(&share_cache->rwlock); 1832 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1833 if (mpr->mp == mp) { 1834 LIST_REMOVE(mpr, next); 1835 standalone = mlx5_mempool_reg_detach(mpr); 1836 if (standalone) 1837 /* 1838 * The unlock operation below provides a memory 1839 * barrier due to its store-release semantics. 1840 */ 1841 ++share_cache->dev_gen; 1842 break; 1843 } 1844 rte_rwlock_write_unlock(&share_cache->rwlock); 1845 if (mpr == NULL) { 1846 rte_errno = ENOENT; 1847 return -1; 1848 } 1849 mlx5_mempool_reg_destroy(share_cache, mpr, standalone); 1850 return 0; 1851 } 1852 1853 static int 1854 mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev, 1855 struct rte_mempool *mp) 1856 { 1857 return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */); 1858 } 1859 1860 /** 1861 * Unregister the memory of a mempool from the protection domain. 1862 * 1863 * @param cdev 1864 * Pointer to the mlx5 common device. 1865 * @param mp 1866 * Mempool to unregister. 1867 * 1868 * @return 1869 * 0 on success, (-1) on failure and rte_errno is set. 1870 */ 1871 int 1872 mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev, 1873 struct rte_mempool *mp) 1874 { 1875 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1876 return 0; 1877 switch (rte_eal_process_type()) { 1878 case RTE_PROC_PRIMARY: 1879 return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp); 1880 case RTE_PROC_SECONDARY: 1881 return mlx5_mr_mempool_unregister_secondary(cdev, mp); 1882 default: 1883 return -1; 1884 } 1885 } 1886 1887 /** 1888 * Lookup a MR key by and address in a registered mempool. 

static int
mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
				   struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;
	bool standalone = false;

	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp) {
			LIST_REMOVE(mpr, next);
			standalone = mlx5_mempool_reg_detach(mpr);
			if (standalone)
				/*
				 * The unlock operation below provides a memory
				 * barrier due to its store-release semantics.
				 */
				++share_cache->dev_gen;
			break;
		}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		rte_errno = ENOENT;
		return -1;
	}
	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
	return 0;
}

static int
mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev,
				     struct rte_mempool *mp)
{
	return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */);
}

/**
 * Unregister the memory of a mempool from the protection domain.
 *
 * @param cdev
 *   Pointer to the mlx5 common device.
 * @param mp
 *   Mempool to unregister.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev,
			   struct rte_mempool *mp)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_unregister_secondary(cdev, mp);
	default:
		return -1;
	}
}
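
/*
 * Usage sketch (illustrative only; the callback below is an assumed PMD-side
 * integration, not defined in this file): unregistration is normally driven
 * by mempool destruction events, so MRs are released as soon as the memory
 * can no longer be used for I/O:
 *
 *	static void
 *	mempool_event_cb(enum rte_mempool_event event, struct rte_mempool *mp,
 *			 void *arg)
 *	{
 *		struct mlx5_common_device *cdev = arg;
 *
 *		if (event == RTE_MEMPOOL_EVENT_DESTROY)
 *			mlx5_mr_mempool_unregister(cdev, mp);
 *	}
 */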
1996 */ 1997 rte_rwlock_read_lock(&share_cache->rwlock); 1998 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1999 rte_rwlock_read_unlock(&share_cache->rwlock); 2000 if (mpr == NULL) { 2001 DRV_LOG(ERR, "Mempool %s is not registered", mp->name); 2002 rte_errno = ENOENT; 2003 return -1; 2004 } 2005 for (i = 0; i < mpr->mrs_n; i++) { 2006 struct mlx5_mempool_mr *mr = &mpr->mrs[i]; 2007 struct mr_cache_entry entry; 2008 uint32_t lkey; 2009 uint32_t idx; 2010 2011 lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr); 2012 if (lkey != UINT32_MAX) 2013 continue; 2014 if (bt->len == bt->size) 2015 mr_btree_expand(bt, bt->size << 1); 2016 entry.start = (uintptr_t)mr->pmd_mr.addr; 2017 entry.end = entry.start + mr->pmd_mr.len; 2018 entry.lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 2019 if (mr_btree_insert(bt, &entry) < 0) { 2020 DRV_LOG(ERR, "Cannot insert cache entry for mempool %s MR %08x", 2021 mp->name, entry.lkey); 2022 rte_errno = EINVAL; 2023 return -1; 2024 } 2025 } 2026 return 0; 2027 } 2028 2029 /** 2030 * Bottom-half lookup for the address from the mempool. 2031 * 2032 * @param mr_ctrl 2033 * Per-queue MR control handle. 2034 * @param mp 2035 * Mempool containing the address. 2036 * @param addr 2037 * Address to lookup. 2038 * @return 2039 * MR lkey on success, UINT32_MAX on failure. 2040 */ 2041 uint32_t 2042 mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, 2043 struct rte_mempool *mp, uintptr_t addr) 2044 { 2045 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 2046 uint32_t lkey; 2047 uint32_t bh_idx = 0; 2048 2049 /* Binary-search MR translation table. */ 2050 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 2051 /* Update top-half cache. */ 2052 if (likely(lkey != UINT32_MAX)) { 2053 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 2054 } else { 2055 lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr); 2056 /* Can only fail if the address is not from the mempool. */ 2057 if (unlikely(lkey == UINT32_MAX)) 2058 return UINT32_MAX; 2059 } 2060 /* Update the most recently used entry. */ 2061 mr_ctrl->mru = mr_ctrl->head; 2062 /* Point to the next victim, the oldest. */ 2063 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 2064 return lkey; 2065 } 2066 2067 uint32_t 2068 mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb) 2069 { 2070 struct rte_mempool *mp; 2071 struct mlx5_mprq_buf *buf; 2072 uint32_t lkey; 2073 uintptr_t addr = (uintptr_t)mb->buf_addr; 2074 struct mlx5_mr_share_cache *share_cache = 2075 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 2076 dev_gen); 2077 struct mlx5_common_device *cdev = 2078 container_of(share_cache, struct mlx5_common_device, mr_scache); 2079 bool external, mprq, pinned = false; 2080 2081 /* Recover MPRQ mempool. */ 2082 external = RTE_MBUF_HAS_EXTBUF(mb); 2083 if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) { 2084 mprq = true; 2085 buf = mb->shinfo->fcb_opaque; 2086 mp = buf->mp; 2087 } else { 2088 mprq = false; 2089 mp = mlx5_mb2mp(mb); 2090 pinned = rte_pktmbuf_priv_flags(mp) & 2091 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF; 2092 } 2093 if (!external || mprq || pinned) { 2094 lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr); 2095 if (lkey != UINT32_MAX) 2096 return lkey; 2097 /* MPRQ is always registered. */ 2098 MLX5_ASSERT(!mprq); 2099 } 2100 /* Register pinned external memory if the mempool is not used for Rx. 

/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to look up.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint32_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}

/**
 * Bottom-half lookup of the LKey for an mbuf buffer address.
 * Handles MPRQ buffers and mempools with pinned external memory,
 * falling back to the generic address lookup otherwise.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mb
 *   Pointer to the mbuf whose buffer address is looked up.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb)
{
	struct rte_mempool *mp;
	struct mlx5_mprq_buf *buf;
	uint32_t lkey;
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_common_device *cdev =
		container_of(share_cache, struct mlx5_common_device, mr_scache);
	bool external, mprq, pinned = false;

	/* Recover MPRQ mempool. */
	external = RTE_MBUF_HAS_EXTBUF(mb);
	if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) {
		mprq = true;
		buf = mb->shinfo->fcb_opaque;
		mp = buf->mp;
	} else {
		mprq = false;
		mp = mlx5_mb2mp(mb);
		pinned = rte_pktmbuf_priv_flags(mp) &
			 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
	}
	if (!external || mprq || pinned) {
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		if (lkey != UINT32_MAX)
			return lkey;
		/* MPRQ is always registered. */
		MLX5_ASSERT(!mprq);
	}
	/* Register pinned external memory if the mempool is not used for Rx. */
	if (cdev->config.mr_mempool_reg_en && pinned) {
		if (mlx5_mr_mempool_register(cdev, mp, true) < 0)
			return UINT32_MAX;
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		MLX5_ASSERT(lkey != UINT32_MAX);
		return lkey;
	}
	/* Fallback to generic mechanism in corner cases. */
	return mlx5_mr_addr2mr_bh(mr_ctrl, addr);
}
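
/*
 * Datapath sketch (illustrative only; "txq" and "dseg" are assumed names):
 * the functions above are the slow path. A burst routine normally checks its
 * per-queue linear cache first and only calls the bottom half on a miss; the
 * returned LKey is already in big-endian byte order (see rte_cpu_to_be_32()
 * above), ready to be written into a WQE data segment:
 *
 *	uint32_t lkey = mlx5_mr_mb2mr_bh(&txq->mr_ctrl, mb);
 *
 *	if (unlikely(lkey == UINT32_MAX))
 *		return 0;	// buffer cannot be made addressable by HW
 *	dseg->lkey = lkey;
 */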