1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2016 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 #include <stddef.h> 6 7 #include <rte_eal_memconfig.h> 8 #include <rte_eal_paging.h> 9 #include <rte_errno.h> 10 #include <rte_mempool.h> 11 #include <rte_malloc.h> 12 #include <rte_rwlock.h> 13 14 #include "mlx5_glue.h" 15 #include "mlx5_common.h" 16 #include "mlx5_common_mp.h" 17 #include "mlx5_common_mr.h" 18 #include "mlx5_common_os.h" 19 #include "mlx5_common_log.h" 20 #include "mlx5_malloc.h" 21 22 struct mr_find_contig_memsegs_data { 23 uintptr_t addr; 24 uintptr_t start; 25 uintptr_t end; 26 const struct rte_memseg_list *msl; 27 }; 28 29 /* Virtual memory range. */ 30 struct mlx5_range { 31 uintptr_t start; 32 uintptr_t end; 33 }; 34 35 /** Memory region for a mempool. */ 36 struct mlx5_mempool_mr { 37 struct mlx5_pmd_mr pmd_mr; 38 uint32_t refcnt; /**< Number of mempools sharing this MR. */ 39 }; 40 41 /* Mempool registration. */ 42 struct mlx5_mempool_reg { 43 LIST_ENTRY(mlx5_mempool_reg) next; 44 /** Registered mempool, used to designate registrations. */ 45 struct rte_mempool *mp; 46 /** Memory regions for the address ranges of the mempool. */ 47 struct mlx5_mempool_mr *mrs; 48 /** Number of memory regions. */ 49 unsigned int mrs_n; 50 /** Whether the MR were created for external pinned memory. */ 51 bool is_extmem; 52 }; 53 54 void 55 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 56 { 57 struct mlx5_mprq_buf *buf = opaque; 58 59 if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) { 60 rte_mempool_put(buf->mp, buf); 61 } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1, 62 __ATOMIC_RELAXED) == 0)) { 63 __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED); 64 rte_mempool_put(buf->mp, buf); 65 } 66 } 67 68 /** 69 * Expand B-tree table to a given size. Can't be called with holding 70 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc(). 71 * 72 * @param bt 73 * Pointer to B-tree structure. 74 * @param n 75 * Number of entries for expansion. 76 * 77 * @return 78 * 0 on success, -1 on failure. 79 */ 80 static int 81 mr_btree_expand(struct mlx5_mr_btree *bt, int n) 82 { 83 void *mem; 84 int ret = 0; 85 86 if (n <= bt->size) 87 return ret; 88 /* 89 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is 90 * used inside if there's no room to expand. Because this is a quite 91 * rare case and a part of very slow path, it is very acceptable. 92 * Initially cache_bh[] will be given practically enough space and once 93 * it is expanded, expansion wouldn't be needed again ever. 94 */ 95 mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO, 96 n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY); 97 if (mem == NULL) { 98 /* Not an error, B-tree search will be skipped. */ 99 DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table", 100 (void *)bt); 101 ret = -1; 102 } else { 103 DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n); 104 bt->table = mem; 105 bt->size = n; 106 } 107 return ret; 108 } 109 110 /** 111 * Look up LKey from given B-tree lookup table, store the last index and return 112 * searched LKey. 113 * 114 * @param bt 115 * Pointer to B-tree structure. 116 * @param[out] idx 117 * Pointer to index. Even on search failure, returns index where it stops 118 * searching so that index can be used when inserting a new entry. 119 * @param addr 120 * Search key. 121 * 122 * @return 123 * Searched LKey on success, UINT32_MAX on no match. 124 */ 125 static uint32_t 126 mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr) 127 { 128 struct mr_cache_entry *lkp_tbl; 129 uint16_t n; 130 uint16_t base = 0; 131 132 MLX5_ASSERT(bt != NULL); 133 lkp_tbl = *bt->table; 134 n = bt->len; 135 /* First entry must be NULL for comparison. */ 136 MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 && 137 lkp_tbl[0].lkey == UINT32_MAX)); 138 /* Binary search. */ 139 do { 140 register uint16_t delta = n >> 1; 141 142 if (addr < lkp_tbl[base + delta].start) { 143 n = delta; 144 } else { 145 base += delta; 146 n -= delta; 147 } 148 } while (n > 1); 149 MLX5_ASSERT(addr >= lkp_tbl[base].start); 150 *idx = base; 151 if (addr < lkp_tbl[base].end) 152 return lkp_tbl[base].lkey; 153 /* Not found. */ 154 return UINT32_MAX; 155 } 156 157 /** 158 * Insert an entry to B-tree lookup table. 159 * 160 * @param bt 161 * Pointer to B-tree structure. 162 * @param entry 163 * Pointer to new entry to insert. 164 * 165 * @return 166 * 0 on success, -1 on failure. 167 */ 168 static int 169 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry) 170 { 171 struct mr_cache_entry *lkp_tbl; 172 uint16_t idx = 0; 173 size_t shift; 174 175 MLX5_ASSERT(bt != NULL); 176 MLX5_ASSERT(bt->len <= bt->size); 177 MLX5_ASSERT(bt->len > 0); 178 lkp_tbl = *bt->table; 179 /* Find out the slot for insertion. */ 180 if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { 181 DRV_LOG(DEBUG, 182 "abort insertion to B-tree(%p): already exist at" 183 " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 184 (void *)bt, idx, entry->start, entry->end, entry->lkey); 185 /* Already exist, return. */ 186 return 0; 187 } 188 /* If table is full, return error. */ 189 if (unlikely(bt->len == bt->size)) { 190 bt->overflow = 1; 191 return -1; 192 } 193 /* Insert entry. */ 194 ++idx; 195 shift = (bt->len - idx) * sizeof(struct mr_cache_entry); 196 if (shift) 197 memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); 198 lkp_tbl[idx] = *entry; 199 bt->len++; 200 DRV_LOG(DEBUG, 201 "inserted B-tree(%p)[%u]," 202 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 203 (void *)bt, idx, entry->start, entry->end, entry->lkey); 204 return 0; 205 } 206 207 /** 208 * Initialize B-tree and allocate memory for lookup table. 209 * 210 * @param bt 211 * Pointer to B-tree structure. 212 * @param n 213 * Number of entries to allocate. 214 * @param socket 215 * NUMA socket on which memory must be allocated. 216 * 217 * @return 218 * 0 on success, a negative errno value otherwise and rte_errno is set. 219 */ 220 static int 221 mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket) 222 { 223 if (bt == NULL) { 224 rte_errno = EINVAL; 225 return -rte_errno; 226 } 227 MLX5_ASSERT(!bt->table && !bt->size); 228 memset(bt, 0, sizeof(*bt)); 229 bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 230 sizeof(struct mr_cache_entry) * n, 231 0, socket); 232 if (bt->table == NULL) { 233 rte_errno = ENOMEM; 234 DRV_LOG(DEBUG, 235 "failed to allocate memory for btree cache on socket " 236 "%d", socket); 237 return -rte_errno; 238 } 239 bt->size = n; 240 /* First entry must be NULL for binary search. */ 241 (*bt->table)[bt->len++] = (struct mr_cache_entry) { 242 .lkey = UINT32_MAX, 243 }; 244 DRV_LOG(DEBUG, "initialized B-tree %p with table %p", 245 (void *)bt, (void *)bt->table); 246 return 0; 247 } 248 249 /** 250 * Free B-tree resources. 251 * 252 * @param bt 253 * Pointer to B-tree structure. 254 */ 255 void 256 mlx5_mr_btree_free(struct mlx5_mr_btree *bt) 257 { 258 if (bt == NULL) 259 return; 260 DRV_LOG(DEBUG, "freeing B-tree %p with table %p", 261 (void *)bt, (void *)bt->table); 262 mlx5_free(bt->table); 263 memset(bt, 0, sizeof(*bt)); 264 } 265 266 /** 267 * Dump all the entries in a B-tree 268 * 269 * @param bt 270 * Pointer to B-tree structure. 271 */ 272 void 273 mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused) 274 { 275 #ifdef RTE_LIBRTE_MLX5_DEBUG 276 int idx; 277 struct mr_cache_entry *lkp_tbl; 278 279 if (bt == NULL) 280 return; 281 lkp_tbl = *bt->table; 282 for (idx = 0; idx < bt->len; ++idx) { 283 struct mr_cache_entry *entry = &lkp_tbl[idx]; 284 285 DRV_LOG(DEBUG, "B-tree(%p)[%u]," 286 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 287 (void *)bt, idx, entry->start, entry->end, entry->lkey); 288 } 289 #endif 290 } 291 292 /** 293 * Initialize per-queue MR control descriptor. 294 * 295 * @param mr_ctrl 296 * Pointer to MR control structure. 297 * @param dev_gen_ptr 298 * Pointer to generation number of global cache. 299 * @param socket 300 * NUMA socket on which memory must be allocated. 301 * 302 * @return 303 * 0 on success, a negative errno value otherwise and rte_errno is set. 304 */ 305 int 306 mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr, 307 int socket) 308 { 309 if (mr_ctrl == NULL) { 310 rte_errno = EINVAL; 311 return -rte_errno; 312 } 313 /* Save pointer of global generation number to check memory event. */ 314 mr_ctrl->dev_gen_ptr = dev_gen_ptr; 315 /* Initialize B-tree and allocate memory for bottom-half cache table. */ 316 return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N, 317 socket); 318 } 319 320 /** 321 * Find virtually contiguous memory chunk in a given MR. 322 * 323 * @param dev 324 * Pointer to MR structure. 325 * @param[out] entry 326 * Pointer to returning MR cache entry. If not found, this will not be 327 * updated. 328 * @param start_idx 329 * Start index of the memseg bitmap. 330 * 331 * @return 332 * Next index to go on lookup. 333 */ 334 static int 335 mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry, 336 int base_idx) 337 { 338 uintptr_t start = 0; 339 uintptr_t end = 0; 340 uint32_t idx = 0; 341 342 /* MR for external memory doesn't have memseg list. */ 343 if (mr->msl == NULL) { 344 MLX5_ASSERT(mr->ms_bmp_n == 1); 345 MLX5_ASSERT(mr->ms_n == 1); 346 MLX5_ASSERT(base_idx == 0); 347 /* 348 * Can't search it from memseg list but get it directly from 349 * pmd_mr as there's only one chunk. 350 */ 351 entry->start = (uintptr_t)mr->pmd_mr.addr; 352 entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len; 353 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 354 /* Returning 1 ends iteration. */ 355 return 1; 356 } 357 for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { 358 if (rte_bitmap_get(mr->ms_bmp, idx)) { 359 const struct rte_memseg_list *msl; 360 const struct rte_memseg *ms; 361 362 msl = mr->msl; 363 ms = rte_fbarray_get(&msl->memseg_arr, 364 mr->ms_base_idx + idx); 365 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 366 if (!start) 367 start = ms->addr_64; 368 end = ms->addr_64 + ms->hugepage_sz; 369 } else if (start) { 370 /* Passed the end of a fragment. */ 371 break; 372 } 373 } 374 if (start) { 375 /* Found one chunk. */ 376 entry->start = start; 377 entry->end = end; 378 entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 379 } 380 return idx; 381 } 382 383 /** 384 * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. 385 * Then, this entry will have to be searched by mr_lookup_list() in 386 * mlx5_mr_create() on miss. 387 * 388 * @param share_cache 389 * Pointer to a global shared MR cache. 390 * @param mr 391 * Pointer to MR to insert. 392 * 393 * @return 394 * 0 on success, -1 on failure. 395 */ 396 int 397 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache, 398 struct mlx5_mr *mr) 399 { 400 unsigned int n; 401 402 DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)", 403 (void *)mr, (void *)share_cache); 404 for (n = 0; n < mr->ms_bmp_n; ) { 405 struct mr_cache_entry entry; 406 407 memset(&entry, 0, sizeof(entry)); 408 /* Find a contiguous chunk and advance the index. */ 409 n = mr_find_next_chunk(mr, &entry, n); 410 if (!entry.end) 411 break; 412 if (mr_btree_insert(&share_cache->cache, &entry) < 0) { 413 /* 414 * Overflowed, but the global table cannot be expanded 415 * because of deadlock. 416 */ 417 return -1; 418 } 419 } 420 return 0; 421 } 422 423 /** 424 * Look up address in the original global MR list. 425 * 426 * @param share_cache 427 * Pointer to a global shared MR cache. 428 * @param[out] entry 429 * Pointer to returning MR cache entry. If no match, this will not be updated. 430 * @param addr 431 * Search key. 432 * 433 * @return 434 * Found MR on match, NULL otherwise. 435 */ 436 struct mlx5_mr * 437 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache, 438 struct mr_cache_entry *entry, uintptr_t addr) 439 { 440 struct mlx5_mr *mr; 441 442 /* Iterate all the existing MRs. */ 443 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 444 unsigned int n; 445 446 if (mr->ms_n == 0) 447 continue; 448 for (n = 0; n < mr->ms_bmp_n; ) { 449 struct mr_cache_entry ret; 450 451 memset(&ret, 0, sizeof(ret)); 452 n = mr_find_next_chunk(mr, &ret, n); 453 if (addr >= ret.start && addr < ret.end) { 454 /* Found. */ 455 *entry = ret; 456 return mr; 457 } 458 } 459 } 460 return NULL; 461 } 462 463 /** 464 * Look up address on global MR cache. 465 * 466 * @param share_cache 467 * Pointer to a global shared MR cache. 468 * @param[out] entry 469 * Pointer to returning MR cache entry. If no match, this will not be updated. 470 * @param addr 471 * Search key. 472 * 473 * @return 474 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 475 */ 476 static uint32_t 477 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache, 478 struct mr_cache_entry *entry, uintptr_t addr) 479 { 480 uint16_t idx; 481 uint32_t lkey = UINT32_MAX; 482 struct mlx5_mr *mr; 483 484 /* 485 * If the global cache has overflowed since it failed to expand the 486 * B-tree table, it can't have all the existing MRs. Then, the address 487 * has to be searched by traversing the original MR list instead, which 488 * is very slow path. Otherwise, the global cache is all inclusive. 489 */ 490 if (!unlikely(share_cache->cache.overflow)) { 491 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 492 if (lkey != UINT32_MAX) 493 *entry = (*share_cache->cache.table)[idx]; 494 } else { 495 /* Falling back to the slowest path. */ 496 mr = mlx5_mr_lookup_list(share_cache, entry, addr); 497 if (mr != NULL) 498 lkey = entry->lkey; 499 } 500 MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start && 501 addr < entry->end)); 502 return lkey; 503 } 504 505 /** 506 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() 507 * can raise memory free event and the callback function will spin on the lock. 508 * 509 * @param mr 510 * Pointer to MR to free. 511 */ 512 void 513 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb) 514 { 515 if (mr == NULL) 516 return; 517 DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); 518 dereg_mr_cb(&mr->pmd_mr); 519 if (mr->ms_bmp != NULL) 520 rte_bitmap_free(mr->ms_bmp); 521 mlx5_free(mr); 522 } 523 524 void 525 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache) 526 { 527 struct mlx5_mr *mr; 528 529 DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache); 530 /* Flush cache to rebuild. */ 531 share_cache->cache.len = 1; 532 share_cache->cache.overflow = 0; 533 /* Iterate all the existing MRs. */ 534 LIST_FOREACH(mr, &share_cache->mr_list, mr) 535 if (mlx5_mr_insert_cache(share_cache, mr) < 0) 536 return; 537 } 538 539 /** 540 * Release resources of detached MR having no online entry. 541 * 542 * @param share_cache 543 * Pointer to a global shared MR cache. 544 */ 545 static void 546 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache) 547 { 548 struct mlx5_mr *mr_next; 549 struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); 550 551 /* Must be called from the primary process. */ 552 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 553 /* 554 * MR can't be freed with holding the lock because rte_free() could call 555 * memory free callback function. This will be a deadlock situation. 556 */ 557 rte_rwlock_write_lock(&share_cache->rwlock); 558 /* Detach the whole free list and release it after unlocking. */ 559 free_list = share_cache->mr_free_list; 560 LIST_INIT(&share_cache->mr_free_list); 561 rte_rwlock_write_unlock(&share_cache->rwlock); 562 /* Release resources. */ 563 mr_next = LIST_FIRST(&free_list); 564 while (mr_next != NULL) { 565 struct mlx5_mr *mr = mr_next; 566 567 mr_next = LIST_NEXT(mr, mr); 568 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 569 } 570 } 571 572 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ 573 static int 574 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, 575 const struct rte_memseg *ms, size_t len, void *arg) 576 { 577 struct mr_find_contig_memsegs_data *data = arg; 578 579 if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) 580 return 0; 581 /* Found, save it and stop walking. */ 582 data->start = ms->addr_64; 583 data->end = ms->addr_64 + len; 584 data->msl = msl; 585 return 1; 586 } 587 588 /** 589 * Create a new global Memory Region (MR) for a missing virtual address. 590 * This API should be called on a secondary process, then a request is sent to 591 * the primary process in order to create a MR for the address. As the global MR 592 * list is on the shared memory, following LKey lookup should succeed unless the 593 * request fails. 594 * 595 * @param cdev 596 * Pointer to the mlx5 common device. 597 * @param share_cache 598 * Pointer to a global shared MR cache. 599 * @param[out] entry 600 * Pointer to returning MR cache entry, found in the global cache or newly 601 * created. If failed to create one, this will not be updated. 602 * @param addr 603 * Target virtual address to register. 604 * 605 * @return 606 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 607 */ 608 static uint32_t 609 mlx5_mr_create_secondary(struct mlx5_common_device *cdev, 610 struct mlx5_mr_share_cache *share_cache, 611 struct mr_cache_entry *entry, uintptr_t addr) 612 { 613 int ret; 614 615 DRV_LOG(DEBUG, "Requesting MR creation for address (%p)", (void *)addr); 616 ret = mlx5_mp_req_mr_create(cdev, addr); 617 if (ret) { 618 DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)", 619 (void *)addr); 620 return UINT32_MAX; 621 } 622 rte_rwlock_read_lock(&share_cache->rwlock); 623 /* Fill in output data. */ 624 mlx5_mr_lookup_cache(share_cache, entry, addr); 625 /* Lookup can't fail. */ 626 MLX5_ASSERT(entry->lkey != UINT32_MAX); 627 rte_rwlock_read_unlock(&share_cache->rwlock); 628 DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n" 629 " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", 630 (void *)addr, entry->start, entry->end, entry->lkey); 631 return entry->lkey; 632 } 633 634 /** 635 * Create a new global Memory Region (MR) for a missing virtual address. 636 * Register entire virtually contiguous memory chunk around the address. 637 * 638 * @param pd 639 * Pointer to pd of a device (net, regex, vdpa,...). 640 * @param share_cache 641 * Pointer to a global shared MR cache. 642 * @param[out] entry 643 * Pointer to returning MR cache entry, found in the global cache or newly 644 * created. If failed to create one, this will not be updated. 645 * @param addr 646 * Target virtual address to register. 647 * @param mr_ext_memseg_en 648 * Configurable flag about external memory segment enable or not. 649 * 650 * @return 651 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 652 */ 653 static uint32_t 654 mlx5_mr_create_primary(void *pd, 655 struct mlx5_mr_share_cache *share_cache, 656 struct mr_cache_entry *entry, uintptr_t addr, 657 unsigned int mr_ext_memseg_en) 658 { 659 struct mr_find_contig_memsegs_data data = {.addr = addr, }; 660 struct mr_find_contig_memsegs_data data_re; 661 const struct rte_memseg_list *msl; 662 const struct rte_memseg *ms; 663 struct mlx5_mr *mr = NULL; 664 int ms_idx_shift = -1; 665 uint32_t bmp_size; 666 void *bmp_mem; 667 uint32_t ms_n; 668 uint32_t n; 669 size_t len; 670 671 DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr); 672 /* 673 * Release detached MRs if any. This can't be called with holding either 674 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have 675 * been detached by the memory free event but it couldn't be released 676 * inside the callback due to deadlock. As a result, releasing resources 677 * is quite opportunistic. 678 */ 679 mlx5_mr_garbage_collect(share_cache); 680 /* 681 * If enabled, find out a contiguous virtual address chunk in use, to 682 * which the given address belongs, in order to register maximum range. 683 * In the best case where mempools are not dynamically recreated and 684 * '--socket-mem' is specified as an EAL option, it is very likely to 685 * have only one MR(LKey) per a socket and per a hugepage-size even 686 * though the system memory is highly fragmented. As the whole memory 687 * chunk will be pinned by kernel, it can't be reused unless entire 688 * chunk is freed from EAL. 689 * 690 * If disabled, just register one memseg (page). Then, memory 691 * consumption will be minimized but it may drop performance if there 692 * are many MRs to lookup on the datapath. 693 */ 694 if (!mr_ext_memseg_en) { 695 data.msl = rte_mem_virt2memseg_list((void *)addr); 696 data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); 697 data.end = data.start + data.msl->page_sz; 698 } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { 699 DRV_LOG(WARNING, 700 "Unable to find virtually contiguous" 701 " chunk for address (%p)." 702 " rte_memseg_contig_walk() failed.", (void *)addr); 703 rte_errno = ENXIO; 704 goto err_nolock; 705 } 706 alloc_resources: 707 /* Addresses must be page-aligned. */ 708 MLX5_ASSERT(data.msl); 709 MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz)); 710 MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz)); 711 msl = data.msl; 712 ms = rte_mem_virt2memseg((void *)data.start, msl); 713 len = data.end - data.start; 714 MLX5_ASSERT(ms); 715 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 716 /* Number of memsegs in the range. */ 717 ms_n = len / msl->page_sz; 718 DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 719 " page_sz=0x%" PRIx64 ", ms_n=%u", 720 (void *)addr, data.start, data.end, msl->page_sz, ms_n); 721 /* Size of memory for bitmap. */ 722 bmp_size = rte_bitmap_get_memory_footprint(ms_n); 723 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 724 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) + 725 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id); 726 if (mr == NULL) { 727 DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of" 728 " address (%p).", (void *)addr); 729 rte_errno = ENOMEM; 730 goto err_nolock; 731 } 732 mr->msl = msl; 733 /* 734 * Save the index of the first memseg and initialize memseg bitmap. To 735 * see if a memseg of ms_idx in the memseg-list is still valid, check: 736 * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) 737 */ 738 mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 739 bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); 740 mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); 741 if (mr->ms_bmp == NULL) { 742 DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of" 743 " address (%p).", (void *)addr); 744 rte_errno = EINVAL; 745 goto err_nolock; 746 } 747 /* 748 * Should recheck whether the extended contiguous chunk is still valid. 749 * Because memory_hotplug_lock can't be held if there's any memory 750 * related calls in a critical path, resource allocation above can't be 751 * locked. If the memory has been changed at this point, try again with 752 * just single page. If not, go on with the big chunk atomically from 753 * here. 754 */ 755 rte_mcfg_mem_read_lock(); 756 data_re = data; 757 if (len > msl->page_sz && 758 !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { 759 DRV_LOG(DEBUG, 760 "Unable to find virtually contiguous chunk for address " 761 "(%p). rte_memseg_contig_walk() failed.", (void *)addr); 762 rte_errno = ENXIO; 763 goto err_memlock; 764 } 765 if (data.start != data_re.start || data.end != data_re.end) { 766 /* 767 * The extended contiguous chunk has been changed. Try again 768 * with single memseg instead. 769 */ 770 data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); 771 data.end = data.start + msl->page_sz; 772 rte_mcfg_mem_read_unlock(); 773 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 774 goto alloc_resources; 775 } 776 MLX5_ASSERT(data.msl == data_re.msl); 777 rte_rwlock_write_lock(&share_cache->rwlock); 778 /* 779 * Check the address is really missing. If other thread already created 780 * one or it is not found due to overflow, abort and return. 781 */ 782 if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) { 783 /* 784 * Insert to the global cache table. It may fail due to 785 * low-on-memory. Then, this entry will have to be searched 786 * here again. 787 */ 788 mr_btree_insert(&share_cache->cache, entry); 789 DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort", 790 (void *)addr); 791 rte_rwlock_write_unlock(&share_cache->rwlock); 792 rte_mcfg_mem_read_unlock(); 793 /* 794 * Must be unlocked before calling rte_free() because 795 * mlx5_mr_mem_event_free_cb() can be called inside. 796 */ 797 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 798 return entry->lkey; 799 } 800 /* 801 * Trim start and end addresses for verbs MR. Set bits for registering 802 * memsegs but exclude already registered ones. Bitmap can be 803 * fragmented. 804 */ 805 for (n = 0; n < ms_n; ++n) { 806 uintptr_t start; 807 struct mr_cache_entry ret; 808 809 memset(&ret, 0, sizeof(ret)); 810 start = data_re.start + n * msl->page_sz; 811 /* Exclude memsegs already registered by other MRs. */ 812 if (mlx5_mr_lookup_cache(share_cache, &ret, start) == 813 UINT32_MAX) { 814 /* 815 * Start from the first unregistered memseg in the 816 * extended range. 817 */ 818 if (ms_idx_shift == -1) { 819 mr->ms_base_idx += n; 820 data.start = start; 821 ms_idx_shift = n; 822 } 823 data.end = start + msl->page_sz; 824 rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); 825 ++mr->ms_n; 826 } 827 } 828 len = data.end - data.start; 829 mr->ms_bmp_n = len / msl->page_sz; 830 MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n); 831 /* 832 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can 833 * be called with holding the memory lock because it doesn't use 834 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() 835 * through mlx5_alloc_verbs_buf(). 836 */ 837 share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr); 838 if (mr->pmd_mr.obj == NULL) { 839 DRV_LOG(DEBUG, "Fail to create an MR for address (%p)", 840 (void *)addr); 841 rte_errno = EINVAL; 842 goto err_mrlock; 843 } 844 MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start); 845 MLX5_ASSERT(mr->pmd_mr.len); 846 LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr); 847 DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n" 848 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 849 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 850 (void *)mr, (void *)addr, data.start, data.end, 851 rte_cpu_to_be_32(mr->pmd_mr.lkey), 852 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 853 /* Insert to the global cache table. */ 854 mlx5_mr_insert_cache(share_cache, mr); 855 /* Fill in output data. */ 856 mlx5_mr_lookup_cache(share_cache, entry, addr); 857 /* Lookup can't fail. */ 858 MLX5_ASSERT(entry->lkey != UINT32_MAX); 859 rte_rwlock_write_unlock(&share_cache->rwlock); 860 rte_mcfg_mem_read_unlock(); 861 return entry->lkey; 862 err_mrlock: 863 rte_rwlock_write_unlock(&share_cache->rwlock); 864 err_memlock: 865 rte_mcfg_mem_read_unlock(); 866 err_nolock: 867 /* 868 * In case of error, as this can be called in a datapath, a warning 869 * message per an error is preferable instead. Must be unlocked before 870 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called 871 * inside. 872 */ 873 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 874 return UINT32_MAX; 875 } 876 877 /** 878 * Create a new global Memory Region (MR) for a missing virtual address. 879 * This can be called from primary and secondary process. 880 * 881 * @param cdev 882 * Pointer to the mlx5 common device. 883 * @param share_cache 884 * Pointer to a global shared MR cache. 885 * @param[out] entry 886 * Pointer to returning MR cache entry, found in the global cache or newly 887 * created. If failed to create one, this will not be updated. 888 * @param addr 889 * Target virtual address to register. 890 * 891 * @return 892 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 893 */ 894 uint32_t 895 mlx5_mr_create(struct mlx5_common_device *cdev, 896 struct mlx5_mr_share_cache *share_cache, 897 struct mr_cache_entry *entry, uintptr_t addr) 898 { 899 uint32_t ret = 0; 900 901 switch (rte_eal_process_type()) { 902 case RTE_PROC_PRIMARY: 903 ret = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr, 904 cdev->config.mr_ext_memseg_en); 905 break; 906 case RTE_PROC_SECONDARY: 907 ret = mlx5_mr_create_secondary(cdev, share_cache, entry, addr); 908 break; 909 default: 910 break; 911 } 912 return ret; 913 } 914 915 /** 916 * Look up address in the global MR cache table. If not found, create a new MR. 917 * Insert the found/created entry to local bottom-half cache table. 918 * 919 * @param mr_ctrl 920 * Pointer to per-queue MR control structure. 921 * @param[out] entry 922 * Pointer to returning MR cache entry, found in the global cache or newly 923 * created. If failed to create one, this is not written. 924 * @param addr 925 * Search key. 926 * 927 * @return 928 * Searched LKey on success, UINT32_MAX on no match. 929 */ 930 static uint32_t 931 mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl, 932 struct mr_cache_entry *entry, uintptr_t addr) 933 { 934 struct mlx5_mr_share_cache *share_cache = 935 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 936 dev_gen); 937 struct mlx5_common_device *cdev = 938 container_of(share_cache, struct mlx5_common_device, mr_scache); 939 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 940 uint32_t lkey; 941 uint16_t idx; 942 943 /* If local cache table is full, try to double it. */ 944 if (unlikely(bt->len == bt->size)) 945 mr_btree_expand(bt, bt->size << 1); 946 /* Look up in the global cache. */ 947 rte_rwlock_read_lock(&share_cache->rwlock); 948 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 949 if (lkey != UINT32_MAX) { 950 /* Found. */ 951 *entry = (*share_cache->cache.table)[idx]; 952 rte_rwlock_read_unlock(&share_cache->rwlock); 953 /* 954 * Update local cache. Even if it fails, return the found entry 955 * to update top-half cache. Next time, this entry will be found 956 * in the global cache. 957 */ 958 mr_btree_insert(bt, entry); 959 return lkey; 960 } 961 rte_rwlock_read_unlock(&share_cache->rwlock); 962 /* First time to see the address? Create a new MR. */ 963 lkey = mlx5_mr_create(cdev, share_cache, entry, addr); 964 /* 965 * Update the local cache if successfully created a new global MR. Even 966 * if failed to create one, there's no action to take in this datapath 967 * code. As returning LKey is invalid, this will eventually make HW 968 * fail. 969 */ 970 if (lkey != UINT32_MAX) 971 mr_btree_insert(bt, entry); 972 return lkey; 973 } 974 975 /** 976 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if 977 * misses, search in the global MR cache table and update the new entry to 978 * per-queue local caches. 979 * 980 * @param mr_ctrl 981 * Pointer to per-queue MR control structure. 982 * @param addr 983 * Search key. 984 * 985 * @return 986 * Searched LKey on success, UINT32_MAX on no match. 987 */ 988 static uint32_t 989 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr) 990 { 991 uint32_t lkey; 992 uint16_t bh_idx = 0; 993 /* Victim in top-half cache to replace with new entry. */ 994 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 995 996 /* Binary-search MR translation table. */ 997 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 998 /* Update top-half cache. */ 999 if (likely(lkey != UINT32_MAX)) { 1000 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1001 } else { 1002 /* 1003 * If missed in local lookup table, search in the global cache 1004 * and local cache_bh[] will be updated inside if possible. 1005 * Top-half cache entry will also be updated. 1006 */ 1007 lkey = mr_lookup_caches(mr_ctrl, repl, addr); 1008 if (unlikely(lkey == UINT32_MAX)) 1009 return UINT32_MAX; 1010 } 1011 /* Update the most recently used entry. */ 1012 mr_ctrl->mru = mr_ctrl->head; 1013 /* Point to the next victim, the oldest. */ 1014 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1015 return lkey; 1016 } 1017 1018 /** 1019 * Release all the created MRs and resources on global MR cache of a device 1020 * list. 1021 * 1022 * @param share_cache 1023 * Pointer to a global shared MR cache. 1024 */ 1025 void 1026 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache) 1027 { 1028 struct mlx5_mr *mr_next; 1029 1030 rte_rwlock_write_lock(&share_cache->rwlock); 1031 /* Detach from MR list and move to free list. */ 1032 mr_next = LIST_FIRST(&share_cache->mr_list); 1033 while (mr_next != NULL) { 1034 struct mlx5_mr *mr = mr_next; 1035 1036 mr_next = LIST_NEXT(mr, mr); 1037 LIST_REMOVE(mr, mr); 1038 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1039 } 1040 LIST_INIT(&share_cache->mr_list); 1041 /* Free global cache. */ 1042 mlx5_mr_btree_free(&share_cache->cache); 1043 rte_rwlock_write_unlock(&share_cache->rwlock); 1044 /* Free all remaining MRs. */ 1045 mlx5_mr_garbage_collect(share_cache); 1046 } 1047 1048 /** 1049 * Initialize global MR cache of a device. 1050 * 1051 * @param share_cache 1052 * Pointer to a global shared MR cache. 1053 * @param socket 1054 * NUMA socket on which memory must be allocated. 1055 * 1056 * @return 1057 * 0 on success, a negative errno value otherwise and rte_errno is set. 1058 */ 1059 int 1060 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket) 1061 { 1062 /* Set the reg_mr and dereg_mr callback functions */ 1063 mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb, 1064 &share_cache->dereg_mr_cb); 1065 rte_rwlock_init(&share_cache->rwlock); 1066 rte_rwlock_init(&share_cache->mprwlock); 1067 share_cache->mp_cb_registered = 0; 1068 /* Initialize B-tree and allocate memory for global MR cache table. */ 1069 return mlx5_mr_btree_init(&share_cache->cache, 1070 MLX5_MR_BTREE_CACHE_N * 2, socket); 1071 } 1072 1073 /** 1074 * Flush all of the local cache entries. 1075 * 1076 * @param mr_ctrl 1077 * Pointer to per-queue MR local cache. 1078 */ 1079 void 1080 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) 1081 { 1082 /* Reset the most-recently-used index. */ 1083 mr_ctrl->mru = 0; 1084 /* Reset the linear search array. */ 1085 mr_ctrl->head = 0; 1086 memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); 1087 /* Reset the B-tree table. */ 1088 mr_ctrl->cache_bh.len = 1; 1089 mr_ctrl->cache_bh.overflow = 0; 1090 /* Update the generation number. */ 1091 mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; 1092 DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d", 1093 (void *)mr_ctrl, mr_ctrl->cur_gen); 1094 } 1095 1096 /** 1097 * Creates a memory region for external memory, that is memory which is not 1098 * part of the DPDK memory segments. 1099 * 1100 * @param pd 1101 * Pointer to pd of a device (net, regex, vdpa,...). 1102 * @param addr 1103 * Starting virtual address of memory. 1104 * @param len 1105 * Length of memory segment being mapped. 1106 * @param socked_id 1107 * Socket to allocate heap memory for the control structures. 1108 * 1109 * @return 1110 * Pointer to MR structure on success, NULL otherwise. 1111 */ 1112 struct mlx5_mr * 1113 mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id, 1114 mlx5_reg_mr_t reg_mr_cb) 1115 { 1116 struct mlx5_mr *mr = NULL; 1117 1118 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1119 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE), 1120 RTE_CACHE_LINE_SIZE, socket_id); 1121 if (mr == NULL) 1122 return NULL; 1123 reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr); 1124 if (mr->pmd_mr.obj == NULL) { 1125 DRV_LOG(WARNING, 1126 "Fail to create MR for address (%p)", 1127 (void *)addr); 1128 mlx5_free(mr); 1129 return NULL; 1130 } 1131 mr->msl = NULL; /* Mark it is external memory. */ 1132 mr->ms_bmp = NULL; 1133 mr->ms_n = 1; 1134 mr->ms_bmp_n = 1; 1135 DRV_LOG(DEBUG, 1136 "MR CREATED (%p) for external memory %p:\n" 1137 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 1138 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 1139 (void *)mr, (void *)addr, 1140 addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1141 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 1142 return mr; 1143 } 1144 1145 /** 1146 * Callback for memory free event. Iterate freed memsegs and check whether it 1147 * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a 1148 * result, the MR would be fragmented. If it becomes empty, the MR will be freed 1149 * later by mlx5_mr_garbage_collect(). Even if this callback is called from a 1150 * secondary process, the garbage collector will be called in primary process 1151 * as the secondary process can't call mlx5_mr_create(). 1152 * 1153 * The global cache must be rebuilt if there's any change and this event has to 1154 * be propagated to dataplane threads to flush the local caches. 1155 * 1156 * @param share_cache 1157 * Pointer to a global shared MR cache. 1158 * @param ibdev_name 1159 * Name of ibv device. 1160 * @param addr 1161 * Address of freed memory. 1162 * @param len 1163 * Size of freed memory. 1164 */ 1165 void 1166 mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache, 1167 const char *ibdev_name, const void *addr, size_t len) 1168 { 1169 const struct rte_memseg_list *msl; 1170 struct mlx5_mr *mr; 1171 int ms_n; 1172 int i; 1173 int rebuild = 0; 1174 1175 DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu", 1176 ibdev_name, addr, len); 1177 msl = rte_mem_virt2memseg_list(addr); 1178 /* addr and len must be page-aligned. */ 1179 MLX5_ASSERT((uintptr_t)addr == 1180 RTE_ALIGN((uintptr_t)addr, msl->page_sz)); 1181 MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz)); 1182 ms_n = len / msl->page_sz; 1183 rte_rwlock_write_lock(&share_cache->rwlock); 1184 /* Clear bits of freed memsegs from MR. */ 1185 for (i = 0; i < ms_n; ++i) { 1186 const struct rte_memseg *ms; 1187 struct mr_cache_entry entry; 1188 uintptr_t start; 1189 int ms_idx; 1190 uint32_t pos; 1191 1192 /* Find MR having this memseg. */ 1193 start = (uintptr_t)addr + i * msl->page_sz; 1194 mr = mlx5_mr_lookup_list(share_cache, &entry, start); 1195 if (mr == NULL) 1196 continue; 1197 MLX5_ASSERT(mr->msl); /* Can't be external memory. */ 1198 ms = rte_mem_virt2memseg((void *)start, msl); 1199 MLX5_ASSERT(ms != NULL); 1200 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 1201 ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 1202 pos = ms_idx - mr->ms_base_idx; 1203 MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); 1204 MLX5_ASSERT(pos < mr->ms_bmp_n); 1205 DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p", 1206 ibdev_name, (void *)mr, pos, (void *)start); 1207 rte_bitmap_clear(mr->ms_bmp, pos); 1208 if (--mr->ms_n == 0) { 1209 LIST_REMOVE(mr, mr); 1210 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1211 DRV_LOG(DEBUG, "device %s remove MR(%p) from list", 1212 ibdev_name, (void *)mr); 1213 } 1214 /* 1215 * MR is fragmented or will be freed. the global cache must be 1216 * rebuilt. 1217 */ 1218 rebuild = 1; 1219 } 1220 if (rebuild) { 1221 mlx5_mr_rebuild_cache(share_cache); 1222 /* 1223 * No explicit wmb is needed after updating dev_gen due to 1224 * store-release ordering in unlock that provides the 1225 * implicit barrier at the software visible level. 1226 */ 1227 ++share_cache->dev_gen; 1228 DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d", 1229 share_cache->dev_gen); 1230 } 1231 rte_rwlock_write_unlock(&share_cache->rwlock); 1232 } 1233 1234 /** 1235 * Dump all the created MRs and the global cache entries. 1236 * 1237 * @param share_cache 1238 * Pointer to a global shared MR cache. 1239 */ 1240 void 1241 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused) 1242 { 1243 #ifdef RTE_LIBRTE_MLX5_DEBUG 1244 struct mlx5_mr *mr; 1245 int mr_n = 0; 1246 int chunk_n = 0; 1247 1248 rte_rwlock_read_lock(&share_cache->rwlock); 1249 /* Iterate all the existing MRs. */ 1250 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 1251 unsigned int n; 1252 1253 DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", 1254 mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1255 mr->ms_n, mr->ms_bmp_n); 1256 if (mr->ms_n == 0) 1257 continue; 1258 for (n = 0; n < mr->ms_bmp_n; ) { 1259 struct mr_cache_entry ret = { 0, }; 1260 1261 n = mr_find_next_chunk(mr, &ret, n); 1262 if (!ret.end) 1263 break; 1264 DRV_LOG(DEBUG, 1265 " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", 1266 chunk_n++, ret.start, ret.end); 1267 } 1268 } 1269 DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache); 1270 mlx5_mr_btree_dump(&share_cache->cache); 1271 rte_rwlock_read_unlock(&share_cache->rwlock); 1272 #endif 1273 } 1274 1275 static int 1276 mlx5_range_compare_start(const void *lhs, const void *rhs) 1277 { 1278 const struct mlx5_range *r1 = lhs, *r2 = rhs; 1279 1280 if (r1->start > r2->start) 1281 return 1; 1282 else if (r1->start < r2->start) 1283 return -1; 1284 return 0; 1285 } 1286 1287 static void 1288 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque, 1289 struct rte_mempool_memhdr *memhdr, 1290 unsigned int idx) 1291 { 1292 struct mlx5_range *ranges = opaque, *range = &ranges[idx]; 1293 uint64_t page_size = rte_mem_page_size(); 1294 1295 RTE_SET_USED(mp); 1296 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); 1297 range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size); 1298 } 1299 1300 /** 1301 * Collect page-aligned memory ranges of the mempool. 1302 */ 1303 static int 1304 mlx5_mempool_get_chunks(struct rte_mempool *mp, struct mlx5_range **out, 1305 unsigned int *out_n) 1306 { 1307 unsigned int n; 1308 1309 DRV_LOG(DEBUG, "Collecting chunks of regular mempool %s", mp->name); 1310 n = mp->nb_mem_chunks; 1311 *out = calloc(sizeof(**out), n); 1312 if (*out == NULL) 1313 return -1; 1314 rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, *out); 1315 *out_n = n; 1316 return 0; 1317 } 1318 1319 struct mlx5_mempool_get_extmem_data { 1320 struct mlx5_range *heap; 1321 unsigned int heap_size; 1322 int ret; 1323 }; 1324 1325 static void 1326 mlx5_mempool_get_extmem_cb(struct rte_mempool *mp, void *opaque, 1327 void *obj, unsigned int obj_idx) 1328 { 1329 struct mlx5_mempool_get_extmem_data *data = opaque; 1330 struct rte_mbuf *mbuf = obj; 1331 uintptr_t addr = (uintptr_t)mbuf->buf_addr; 1332 struct mlx5_range *seg, *heap; 1333 struct rte_memseg_list *msl; 1334 size_t page_size; 1335 uintptr_t page_start; 1336 unsigned int pos = 0, len = data->heap_size, delta; 1337 1338 RTE_SET_USED(mp); 1339 RTE_SET_USED(obj_idx); 1340 if (data->ret < 0) 1341 return; 1342 /* Binary search for an already visited page. */ 1343 while (len > 1) { 1344 delta = len / 2; 1345 if (addr < data->heap[pos + delta].start) { 1346 len = delta; 1347 } else { 1348 pos += delta; 1349 len -= delta; 1350 } 1351 } 1352 if (data->heap != NULL) { 1353 seg = &data->heap[pos]; 1354 if (seg->start <= addr && addr < seg->end) 1355 return; 1356 } 1357 /* Determine the page boundaries and remember them. */ 1358 heap = realloc(data->heap, sizeof(heap[0]) * (data->heap_size + 1)); 1359 if (heap == NULL) { 1360 free(data->heap); 1361 data->heap = NULL; 1362 data->ret = -1; 1363 return; 1364 } 1365 data->heap = heap; 1366 data->heap_size++; 1367 seg = &heap[data->heap_size - 1]; 1368 msl = rte_mem_virt2memseg_list((void *)addr); 1369 page_size = msl != NULL ? msl->page_sz : rte_mem_page_size(); 1370 page_start = RTE_PTR_ALIGN_FLOOR(addr, page_size); 1371 seg->start = page_start; 1372 seg->end = page_start + page_size; 1373 /* Maintain the heap order. */ 1374 qsort(data->heap, data->heap_size, sizeof(heap[0]), 1375 mlx5_range_compare_start); 1376 } 1377 1378 /** 1379 * Recover pages of external memory as close as possible 1380 * for a mempool with RTE_PKTMBUF_POOL_PINNED_EXT_BUF. 1381 * Pages are stored in a heap for efficient search, for mbufs are many. 1382 */ 1383 static int 1384 mlx5_mempool_get_extmem(struct rte_mempool *mp, struct mlx5_range **out, 1385 unsigned int *out_n) 1386 { 1387 struct mlx5_mempool_get_extmem_data data; 1388 1389 DRV_LOG(DEBUG, "Recovering external pinned pages of mempool %s", 1390 mp->name); 1391 memset(&data, 0, sizeof(data)); 1392 rte_mempool_obj_iter(mp, mlx5_mempool_get_extmem_cb, &data); 1393 *out = data.heap; 1394 *out_n = data.heap_size; 1395 return data.ret; 1396 } 1397 1398 /** 1399 * Get VA-contiguous ranges of the mempool memory. 1400 * Each range start and end is aligned to the system page size. 1401 * 1402 * @param[in] mp 1403 * Analyzed mempool. 1404 * @param[in] is_extmem 1405 * Whether the pool is contains only external pinned buffers. 1406 * @param[out] out 1407 * Receives the ranges, caller must release it with free(). 1408 * @param[out] out_n 1409 * Receives the number of @p out elements. 1410 * 1411 * @return 1412 * 0 on success, (-1) on failure. 1413 */ 1414 static int 1415 mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem, 1416 struct mlx5_range **out, unsigned int *out_n) 1417 { 1418 struct mlx5_range *chunks; 1419 unsigned int chunks_n, contig_n, i; 1420 int ret; 1421 1422 /* Collect the pool underlying memory. */ 1423 ret = is_extmem ? mlx5_mempool_get_extmem(mp, &chunks, &chunks_n) : 1424 mlx5_mempool_get_chunks(mp, &chunks, &chunks_n); 1425 if (ret < 0) 1426 return ret; 1427 /* Merge adjacent chunks and place them at the beginning. */ 1428 qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start); 1429 contig_n = 1; 1430 for (i = 1; i < chunks_n; i++) 1431 if (chunks[i - 1].end != chunks[i].start) { 1432 chunks[contig_n - 1].end = chunks[i - 1].end; 1433 chunks[contig_n] = chunks[i]; 1434 contig_n++; 1435 } 1436 /* Extend the last contiguous chunk to the end of the mempool. */ 1437 chunks[contig_n - 1].end = chunks[i - 1].end; 1438 *out = chunks; 1439 *out_n = contig_n; 1440 return 0; 1441 } 1442 1443 /** 1444 * Analyze mempool memory to select memory ranges to register. 1445 * 1446 * @param[in] mp 1447 * Mempool to analyze. 1448 * @param[in] is_extmem 1449 * Whether the pool is contains only external pinned buffers. 1450 * @param[out] out 1451 * Receives memory ranges to register, aligned to the system page size. 1452 * The caller must release them with free(). 1453 * @param[out] out_n 1454 * Receives the number of @p out items. 1455 * @param[out] share_hugepage 1456 * Receives True if the entire pool resides within a single hugepage. 1457 * 1458 * @return 1459 * 0 on success, (-1) on failure. 1460 */ 1461 static int 1462 mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem, 1463 struct mlx5_range **out, unsigned int *out_n, 1464 bool *share_hugepage) 1465 { 1466 struct mlx5_range *ranges = NULL; 1467 unsigned int i, ranges_n = 0; 1468 struct rte_memseg_list *msl; 1469 1470 if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) { 1471 DRV_LOG(ERR, "Cannot get address ranges for mempool %s", 1472 mp->name); 1473 return -1; 1474 } 1475 /* Check if the hugepage of the pool can be shared. */ 1476 *share_hugepage = false; 1477 msl = rte_mem_virt2memseg_list((void *)ranges[0].start); 1478 if (msl != NULL) { 1479 uint64_t hugepage_sz = 0; 1480 1481 /* Check that all ranges are on pages of the same size. */ 1482 for (i = 0; i < ranges_n; i++) { 1483 if (hugepage_sz != 0 && hugepage_sz != msl->page_sz) 1484 break; 1485 hugepage_sz = msl->page_sz; 1486 } 1487 if (i == ranges_n) { 1488 /* 1489 * If the entire pool is within one hugepage, 1490 * combine all ranges into one of the hugepage size. 1491 */ 1492 uintptr_t reg_start = ranges[0].start; 1493 uintptr_t reg_end = ranges[ranges_n - 1].end; 1494 uintptr_t hugepage_start = 1495 RTE_ALIGN_FLOOR(reg_start, hugepage_sz); 1496 uintptr_t hugepage_end = hugepage_start + hugepage_sz; 1497 if (reg_end < hugepage_end) { 1498 ranges[0].start = hugepage_start; 1499 ranges[0].end = hugepage_end; 1500 ranges_n = 1; 1501 *share_hugepage = true; 1502 } 1503 } 1504 } 1505 *out = ranges; 1506 *out_n = ranges_n; 1507 return 0; 1508 } 1509 1510 /** Create a registration object for the mempool. */ 1511 static struct mlx5_mempool_reg * 1512 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n, 1513 bool is_extmem) 1514 { 1515 struct mlx5_mempool_reg *mpr = NULL; 1516 1517 mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1518 sizeof(*mpr) + mrs_n * sizeof(mpr->mrs[0]), 1519 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1520 if (mpr == NULL) { 1521 DRV_LOG(ERR, "Cannot allocate mempool %s registration object", 1522 mp->name); 1523 return NULL; 1524 } 1525 mpr->mp = mp; 1526 mpr->mrs = (struct mlx5_mempool_mr *)(mpr + 1); 1527 mpr->mrs_n = mrs_n; 1528 mpr->is_extmem = is_extmem; 1529 return mpr; 1530 } 1531 1532 /** 1533 * Destroy a mempool registration object. 1534 * 1535 * @param standalone 1536 * Whether @p mpr owns its MRs excludively, i.e. they are not shared. 1537 */ 1538 static void 1539 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache, 1540 struct mlx5_mempool_reg *mpr, bool standalone) 1541 { 1542 if (standalone) { 1543 unsigned int i; 1544 1545 for (i = 0; i < mpr->mrs_n; i++) 1546 share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr); 1547 } 1548 mlx5_free(mpr); 1549 } 1550 1551 /** Find registration object of a mempool. */ 1552 static struct mlx5_mempool_reg * 1553 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache, 1554 struct rte_mempool *mp) 1555 { 1556 struct mlx5_mempool_reg *mpr; 1557 1558 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1559 if (mpr->mp == mp) 1560 break; 1561 return mpr; 1562 } 1563 1564 /** Increment reference counters of MRs used in the registration. */ 1565 static void 1566 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr) 1567 { 1568 unsigned int i; 1569 1570 for (i = 0; i < mpr->mrs_n; i++) 1571 __atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED); 1572 } 1573 1574 /** 1575 * Decrement reference counters of MRs used in the registration. 1576 * 1577 * @return True if no more references to @p mpr MRs exist, False otherwise. 1578 */ 1579 static bool 1580 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr) 1581 { 1582 unsigned int i; 1583 bool ret = false; 1584 1585 for (i = 0; i < mpr->mrs_n; i++) 1586 ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1, 1587 __ATOMIC_RELAXED) == 0; 1588 return ret; 1589 } 1590 1591 static int 1592 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache, 1593 void *pd, struct rte_mempool *mp, 1594 bool is_extmem) 1595 { 1596 struct mlx5_range *ranges = NULL; 1597 struct mlx5_mempool_reg *mpr, *old_mpr, *new_mpr; 1598 unsigned int i, ranges_n; 1599 bool share_hugepage, standalone = false; 1600 int ret = -1; 1601 1602 /* Early check to avoid unnecessary creation of MRs. */ 1603 rte_rwlock_read_lock(&share_cache->rwlock); 1604 old_mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1605 rte_rwlock_read_unlock(&share_cache->rwlock); 1606 if (old_mpr != NULL && (!is_extmem || old_mpr->is_extmem)) { 1607 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1608 mp->name, pd); 1609 rte_errno = EEXIST; 1610 goto exit; 1611 } 1612 if (mlx5_mempool_reg_analyze(mp, is_extmem, &ranges, &ranges_n, 1613 &share_hugepage) < 0) { 1614 DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name); 1615 rte_errno = ENOMEM; 1616 goto exit; 1617 } 1618 new_mpr = mlx5_mempool_reg_create(mp, ranges_n, is_extmem); 1619 if (new_mpr == NULL) { 1620 DRV_LOG(ERR, 1621 "Cannot create a registration object for mempool %s in PD %p", 1622 mp->name, pd); 1623 rte_errno = ENOMEM; 1624 goto exit; 1625 } 1626 /* 1627 * If the entire mempool fits in a single hugepage, the MR for this 1628 * hugepage can be shared across mempools that also fit in it. 1629 */ 1630 if (share_hugepage) { 1631 rte_rwlock_write_lock(&share_cache->rwlock); 1632 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) { 1633 if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start) 1634 break; 1635 } 1636 if (mpr != NULL) { 1637 new_mpr->mrs = mpr->mrs; 1638 mlx5_mempool_reg_attach(new_mpr); 1639 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, 1640 new_mpr, next); 1641 } 1642 rte_rwlock_write_unlock(&share_cache->rwlock); 1643 if (mpr != NULL) { 1644 DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s", 1645 mpr->mrs[0].pmd_mr.lkey, pd, mp->name, 1646 mpr->mp->name); 1647 ret = 0; 1648 goto exit; 1649 } 1650 } 1651 for (i = 0; i < ranges_n; i++) { 1652 struct mlx5_mempool_mr *mr = &new_mpr->mrs[i]; 1653 const struct mlx5_range *range = &ranges[i]; 1654 size_t len = range->end - range->start; 1655 1656 if (share_cache->reg_mr_cb(pd, (void *)range->start, len, 1657 &mr->pmd_mr) < 0) { 1658 DRV_LOG(ERR, 1659 "Failed to create an MR in PD %p for address range " 1660 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1661 pd, range->start, range->end, len, mp->name); 1662 break; 1663 } 1664 DRV_LOG(DEBUG, 1665 "Created a new MR %#x in PD %p for address range " 1666 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1667 mr->pmd_mr.lkey, pd, range->start, range->end, len, 1668 mp->name); 1669 } 1670 if (i != ranges_n) { 1671 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1672 rte_errno = EINVAL; 1673 goto exit; 1674 } 1675 /* Concurrent registration is not supposed to happen. */ 1676 rte_rwlock_write_lock(&share_cache->rwlock); 1677 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1678 if (mpr == old_mpr && old_mpr != NULL) { 1679 LIST_REMOVE(old_mpr, next); 1680 standalone = mlx5_mempool_reg_detach(mpr); 1681 /* No need to flush the cache: old MRs cannot be in use. */ 1682 mpr = NULL; 1683 } 1684 if (mpr == NULL) { 1685 mlx5_mempool_reg_attach(new_mpr); 1686 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, new_mpr, next); 1687 ret = 0; 1688 } 1689 rte_rwlock_write_unlock(&share_cache->rwlock); 1690 if (mpr != NULL) { 1691 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1692 mp->name, pd); 1693 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1694 rte_errno = EEXIST; 1695 goto exit; 1696 } else if (old_mpr != NULL) { 1697 DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory", 1698 mp->name, pd); 1699 mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone); 1700 } 1701 exit: 1702 free(ranges); 1703 return ret; 1704 } 1705 1706 static int 1707 mlx5_mr_mempool_register_secondary(struct mlx5_common_device *cdev, 1708 struct rte_mempool *mp, bool is_extmem) 1709 { 1710 return mlx5_mp_req_mempool_reg(cdev, mp, true, is_extmem); 1711 } 1712 1713 /** 1714 * Register the memory of a mempool in the protection domain. 1715 * 1716 * @param cdev 1717 * Pointer to the mlx5 common device. 1718 * @param mp 1719 * Mempool to register. 1720 * 1721 * @return 1722 * 0 on success, (-1) on failure and rte_errno is set. 1723 */ 1724 int 1725 mlx5_mr_mempool_register(struct mlx5_common_device *cdev, 1726 struct rte_mempool *mp, bool is_extmem) 1727 { 1728 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1729 return 0; 1730 switch (rte_eal_process_type()) { 1731 case RTE_PROC_PRIMARY: 1732 return mlx5_mr_mempool_register_primary(&cdev->mr_scache, 1733 cdev->pd, mp, 1734 is_extmem); 1735 case RTE_PROC_SECONDARY: 1736 return mlx5_mr_mempool_register_secondary(cdev, mp, is_extmem); 1737 default: 1738 return -1; 1739 } 1740 } 1741 1742 static int 1743 mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache, 1744 struct rte_mempool *mp) 1745 { 1746 struct mlx5_mempool_reg *mpr; 1747 bool standalone = false; 1748 1749 rte_rwlock_write_lock(&share_cache->rwlock); 1750 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1751 if (mpr->mp == mp) { 1752 LIST_REMOVE(mpr, next); 1753 standalone = mlx5_mempool_reg_detach(mpr); 1754 if (standalone) 1755 /* 1756 * The unlock operation below provides a memory 1757 * barrier due to its store-release semantics. 1758 */ 1759 ++share_cache->dev_gen; 1760 break; 1761 } 1762 rte_rwlock_write_unlock(&share_cache->rwlock); 1763 if (mpr == NULL) { 1764 rte_errno = ENOENT; 1765 return -1; 1766 } 1767 mlx5_mempool_reg_destroy(share_cache, mpr, standalone); 1768 return 0; 1769 } 1770 1771 static int 1772 mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev, 1773 struct rte_mempool *mp) 1774 { 1775 return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */); 1776 } 1777 1778 /** 1779 * Unregister the memory of a mempool from the protection domain. 1780 * 1781 * @param cdev 1782 * Pointer to the mlx5 common device. 1783 * @param mp 1784 * Mempool to unregister. 1785 * 1786 * @return 1787 * 0 on success, (-1) on failure and rte_errno is set. 1788 */ 1789 int 1790 mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev, 1791 struct rte_mempool *mp) 1792 { 1793 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1794 return 0; 1795 switch (rte_eal_process_type()) { 1796 case RTE_PROC_PRIMARY: 1797 return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp); 1798 case RTE_PROC_SECONDARY: 1799 return mlx5_mr_mempool_unregister_secondary(cdev, mp); 1800 default: 1801 return -1; 1802 } 1803 } 1804 1805 /** 1806 * Lookup a MR key by and address in a registered mempool. 1807 * 1808 * @param mpr 1809 * Mempool registration object. 1810 * @param addr 1811 * Address within the mempool. 1812 * @param entry 1813 * Bottom-half cache entry to fill. 1814 * 1815 * @return 1816 * MR key or UINT32_MAX on failure, which can only happen 1817 * if the address is not from within the mempool. 1818 */ 1819 static uint32_t 1820 mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr, 1821 struct mr_cache_entry *entry) 1822 { 1823 uint32_t lkey = UINT32_MAX; 1824 unsigned int i; 1825 1826 for (i = 0; i < mpr->mrs_n; i++) { 1827 const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr; 1828 uintptr_t mr_addr = (uintptr_t)mr->addr; 1829 1830 if (mr_addr <= addr) { 1831 lkey = rte_cpu_to_be_32(mr->lkey); 1832 entry->start = mr_addr; 1833 entry->end = mr_addr + mr->len; 1834 entry->lkey = lkey; 1835 break; 1836 } 1837 } 1838 return lkey; 1839 } 1840 1841 /** 1842 * Update bottom-half cache from the list of mempool registrations. 1843 * 1844 * @param mr_ctrl 1845 * Per-queue MR control handle. 1846 * @param entry 1847 * Pointer to an entry in the bottom-half cache to update 1848 * with the MR lkey looked up. 1849 * @param mp 1850 * Mempool containing the address. 1851 * @param addr 1852 * Address to lookup. 1853 * @return 1854 * MR lkey on success, UINT32_MAX on failure. 1855 */ 1856 static uint32_t 1857 mlx5_lookup_mempool_regs(struct mlx5_mr_ctrl *mr_ctrl, 1858 struct mr_cache_entry *entry, 1859 struct rte_mempool *mp, uintptr_t addr) 1860 { 1861 struct mlx5_mr_share_cache *share_cache = 1862 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 1863 dev_gen); 1864 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 1865 struct mlx5_mempool_reg *mpr; 1866 uint32_t lkey = UINT32_MAX; 1867 1868 /* If local cache table is full, try to double it. */ 1869 if (unlikely(bt->len == bt->size)) 1870 mr_btree_expand(bt, bt->size << 1); 1871 /* Look up in mempool registrations. */ 1872 rte_rwlock_read_lock(&share_cache->rwlock); 1873 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1874 if (mpr != NULL) 1875 lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry); 1876 rte_rwlock_read_unlock(&share_cache->rwlock); 1877 /* 1878 * Update local cache. Even if it fails, return the found entry 1879 * to update top-half cache. Next time, this entry will be found 1880 * in the global cache. 1881 */ 1882 if (lkey != UINT32_MAX) 1883 mr_btree_insert(bt, entry); 1884 return lkey; 1885 } 1886 1887 /** 1888 * Populate cache with LKeys of all MRs used by the mempool. 1889 * It is intended to be used to register Rx mempools in advance. 1890 * 1891 * @param mr_ctrl 1892 * Per-queue MR control handle. 1893 * @param mp 1894 * Registered memory pool. 1895 * 1896 * @return 1897 * 0 on success, (-1) on failure and rte_errno is set. 1898 */ 1899 int 1900 mlx5_mr_mempool_populate_cache(struct mlx5_mr_ctrl *mr_ctrl, 1901 struct rte_mempool *mp) 1902 { 1903 struct mlx5_mr_share_cache *share_cache = 1904 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 1905 dev_gen); 1906 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 1907 struct mlx5_mempool_reg *mpr; 1908 unsigned int i; 1909 1910 /* 1911 * Registration is valid after the lock is released, 1912 * because the function is called after the mempool is registered. 1913 */ 1914 rte_rwlock_read_lock(&share_cache->rwlock); 1915 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1916 rte_rwlock_read_unlock(&share_cache->rwlock); 1917 if (mpr == NULL) { 1918 DRV_LOG(ERR, "Mempool %s is not registered", mp->name); 1919 rte_errno = ENOENT; 1920 return -1; 1921 } 1922 for (i = 0; i < mpr->mrs_n; i++) { 1923 struct mlx5_mempool_mr *mr = &mpr->mrs[i]; 1924 struct mr_cache_entry entry; 1925 uint32_t lkey; 1926 uint16_t idx; 1927 1928 lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr); 1929 if (lkey != UINT32_MAX) 1930 continue; 1931 if (bt->len == bt->size) 1932 mr_btree_expand(bt, bt->size << 1); 1933 entry.start = (uintptr_t)mr->pmd_mr.addr; 1934 entry.end = entry.start + mr->pmd_mr.len; 1935 entry.lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 1936 if (mr_btree_insert(bt, &entry) < 0) { 1937 DRV_LOG(ERR, "Cannot insert cache entry for mempool %s MR %08x", 1938 mp->name, entry.lkey); 1939 rte_errno = EINVAL; 1940 return -1; 1941 } 1942 } 1943 return 0; 1944 } 1945 1946 /** 1947 * Bottom-half lookup for the address from the mempool. 1948 * 1949 * @param mr_ctrl 1950 * Per-queue MR control handle. 1951 * @param mp 1952 * Mempool containing the address. 1953 * @param addr 1954 * Address to lookup. 1955 * @return 1956 * MR lkey on success, UINT32_MAX on failure. 1957 */ 1958 uint32_t 1959 mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, 1960 struct rte_mempool *mp, uintptr_t addr) 1961 { 1962 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 1963 uint32_t lkey; 1964 uint16_t bh_idx = 0; 1965 1966 /* Binary-search MR translation table. */ 1967 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 1968 /* Update top-half cache. */ 1969 if (likely(lkey != UINT32_MAX)) { 1970 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1971 } else { 1972 lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr); 1973 /* Can only fail if the address is not from the mempool. */ 1974 if (unlikely(lkey == UINT32_MAX)) 1975 return UINT32_MAX; 1976 } 1977 /* Update the most recently used entry. */ 1978 mr_ctrl->mru = mr_ctrl->head; 1979 /* Point to the next victim, the oldest. */ 1980 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1981 return lkey; 1982 } 1983 1984 uint32_t 1985 mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb) 1986 { 1987 struct rte_mempool *mp; 1988 struct mlx5_mprq_buf *buf; 1989 uint32_t lkey; 1990 uintptr_t addr = (uintptr_t)mb->buf_addr; 1991 struct mlx5_mr_share_cache *share_cache = 1992 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 1993 dev_gen); 1994 struct mlx5_common_device *cdev = 1995 container_of(share_cache, struct mlx5_common_device, mr_scache); 1996 1997 /* Recover MPRQ mempool. */ 1998 if (RTE_MBUF_HAS_EXTBUF(mb) && 1999 mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) { 2000 buf = mb->shinfo->fcb_opaque; 2001 mp = buf->mp; 2002 } else { 2003 mp = mlx5_mb2mp(mb); 2004 } 2005 lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr); 2006 if (lkey != UINT32_MAX) 2007 return lkey; 2008 /* Register pinned external memory if the mempool is not used for Rx. */ 2009 if (cdev->config.mr_mempool_reg_en && 2010 (rte_pktmbuf_priv_flags(mp) & RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF)) { 2011 if (mlx5_mr_mempool_register(cdev, mp, true) < 0) 2012 return UINT32_MAX; 2013 lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr); 2014 MLX5_ASSERT(lkey != UINT32_MAX); 2015 return lkey; 2016 } 2017 /* Fallback to generic mechanism in corner cases. */ 2018 return mlx5_mr_addr2mr_bh(mr_ctrl, addr); 2019 } 2020