1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright 2016 6WIND S.A. 3 * Copyright 2020 Mellanox Technologies, Ltd 4 */ 5 #include <stddef.h> 6 7 #include <rte_eal_memconfig.h> 8 #include <rte_eal_paging.h> 9 #include <rte_errno.h> 10 #include <rte_mempool.h> 11 #include <rte_malloc.h> 12 #include <rte_rwlock.h> 13 14 #include "mlx5_glue.h" 15 #include "mlx5_common.h" 16 #include "mlx5_common_mp.h" 17 #include "mlx5_common_mr.h" 18 #include "mlx5_common_os.h" 19 #include "mlx5_common_log.h" 20 #include "mlx5_malloc.h" 21 22 struct mr_find_contig_memsegs_data { 23 uintptr_t addr; 24 uintptr_t start; 25 uintptr_t end; 26 const struct rte_memseg_list *msl; 27 }; 28 29 /* Virtual memory range. */ 30 struct mlx5_range { 31 uintptr_t start; 32 uintptr_t end; 33 }; 34 35 /** Memory region for a mempool. */ 36 struct mlx5_mempool_mr { 37 struct mlx5_pmd_mr pmd_mr; 38 uint32_t refcnt; /**< Number of mempools sharing this MR. */ 39 }; 40 41 /* Mempool registration. */ 42 struct mlx5_mempool_reg { 43 LIST_ENTRY(mlx5_mempool_reg) next; 44 /** Registered mempool, used to designate registrations. */ 45 struct rte_mempool *mp; 46 /** Memory regions for the address ranges of the mempool. */ 47 struct mlx5_mempool_mr *mrs; 48 /** Number of memory regions. */ 49 unsigned int mrs_n; 50 /** Whether the MR were created for external pinned memory. */ 51 bool is_extmem; 52 }; 53 54 void 55 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque) 56 { 57 struct mlx5_mprq_buf *buf = opaque; 58 59 if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) { 60 rte_mempool_put(buf->mp, buf); 61 } else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1, 62 __ATOMIC_RELAXED) == 0)) { 63 __atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED); 64 rte_mempool_put(buf->mp, buf); 65 } 66 } 67 68 /** 69 * Expand B-tree table to a given size. Can't be called with holding 70 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc(). 71 * 72 * @param bt 73 * Pointer to B-tree structure. 74 * @param n 75 * Number of entries for expansion. 76 * 77 * @return 78 * 0 on success, -1 on failure. 79 */ 80 static int 81 mr_btree_expand(struct mlx5_mr_btree *bt, int n) 82 { 83 void *mem; 84 int ret = 0; 85 86 if (n <= bt->size) 87 return ret; 88 /* 89 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is 90 * used inside if there's no room to expand. Because this is a quite 91 * rare case and a part of very slow path, it is very acceptable. 92 * Initially cache_bh[] will be given practically enough space and once 93 * it is expanded, expansion wouldn't be needed again ever. 94 */ 95 mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO, 96 n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY); 97 if (mem == NULL) { 98 /* Not an error, B-tree search will be skipped. */ 99 DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table", 100 (void *)bt); 101 ret = -1; 102 } else { 103 DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n); 104 bt->table = mem; 105 bt->size = n; 106 } 107 return ret; 108 } 109 110 /** 111 * Look up LKey from given B-tree lookup table, store the last index and return 112 * searched LKey. 113 * 114 * @param bt 115 * Pointer to B-tree structure. 116 * @param[out] idx 117 * Pointer to index. Even on search failure, returns index where it stops 118 * searching so that index can be used when inserting a new entry. 119 * @param addr 120 * Search key. 121 * 122 * @return 123 * Searched LKey on success, UINT32_MAX on no match. 
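 *
 * The lookup is a binary search over entries sorted by start address; the
 * first entry is a sentinel with lkey == UINT32_MAX, so even a miss leaves
 * *idx at a usable insertion slot. A minimal illustration of the calling
 * convention (bt and buf are assumed to exist, illustrative only):
 *
 * @code
 * uint16_t idx = 0;
 * uint32_t lkey = mr_btree_lookup(bt, &idx, (uintptr_t)buf);
 *
 * // On a miss (lkey == UINT32_MAX), idx still points at the slot after
 * // which a new entry for this address would be inserted.
 * @endcode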
124 */ 125 static uint32_t 126 mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr) 127 { 128 struct mr_cache_entry *lkp_tbl; 129 uint16_t n; 130 uint16_t base = 0; 131 132 MLX5_ASSERT(bt != NULL); 133 lkp_tbl = *bt->table; 134 n = bt->len; 135 /* First entry must be NULL for comparison. */ 136 MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 && 137 lkp_tbl[0].lkey == UINT32_MAX)); 138 /* Binary search. */ 139 do { 140 register uint16_t delta = n >> 1; 141 142 if (addr < lkp_tbl[base + delta].start) { 143 n = delta; 144 } else { 145 base += delta; 146 n -= delta; 147 } 148 } while (n > 1); 149 MLX5_ASSERT(addr >= lkp_tbl[base].start); 150 *idx = base; 151 if (addr < lkp_tbl[base].end) 152 return lkp_tbl[base].lkey; 153 /* Not found. */ 154 return UINT32_MAX; 155 } 156 157 /** 158 * Insert an entry to B-tree lookup table. 159 * 160 * @param bt 161 * Pointer to B-tree structure. 162 * @param entry 163 * Pointer to new entry to insert. 164 * 165 * @return 166 * 0 on success, -1 on failure. 167 */ 168 static int 169 mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry) 170 { 171 struct mr_cache_entry *lkp_tbl; 172 uint16_t idx = 0; 173 size_t shift; 174 175 MLX5_ASSERT(bt != NULL); 176 MLX5_ASSERT(bt->len <= bt->size); 177 MLX5_ASSERT(bt->len > 0); 178 lkp_tbl = *bt->table; 179 /* Find out the slot for insertion. */ 180 if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { 181 DRV_LOG(DEBUG, 182 "abort insertion to B-tree(%p): already exist at" 183 " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 184 (void *)bt, idx, entry->start, entry->end, entry->lkey); 185 /* Already exist, return. */ 186 return 0; 187 } 188 /* If table is full, return error. */ 189 if (unlikely(bt->len == bt->size)) { 190 bt->overflow = 1; 191 return -1; 192 } 193 /* Insert entry. */ 194 ++idx; 195 shift = (bt->len - idx) * sizeof(struct mr_cache_entry); 196 if (shift) 197 memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); 198 lkp_tbl[idx] = *entry; 199 bt->len++; 200 DRV_LOG(DEBUG, 201 "inserted B-tree(%p)[%u]," 202 " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", 203 (void *)bt, idx, entry->start, entry->end, entry->lkey); 204 return 0; 205 } 206 207 /** 208 * Initialize B-tree and allocate memory for lookup table. 209 * 210 * @param bt 211 * Pointer to B-tree structure. 212 * @param n 213 * Number of entries to allocate. 214 * @param socket 215 * NUMA socket on which memory must be allocated. 216 * 217 * @return 218 * 0 on success, a negative errno value otherwise and rte_errno is set. 219 */ 220 static int 221 mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket) 222 { 223 if (bt == NULL) { 224 rte_errno = EINVAL; 225 return -rte_errno; 226 } 227 MLX5_ASSERT(!bt->table && !bt->size); 228 memset(bt, 0, sizeof(*bt)); 229 bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 230 sizeof(struct mr_cache_entry) * n, 231 0, socket); 232 if (bt->table == NULL) { 233 rte_errno = ENOMEM; 234 DRV_LOG(DEBUG, 235 "failed to allocate memory for btree cache on socket " 236 "%d", socket); 237 return -rte_errno; 238 } 239 bt->size = n; 240 /* First entry must be NULL for binary search. */ 241 (*bt->table)[bt->len++] = (struct mr_cache_entry) { 242 .lkey = UINT32_MAX, 243 }; 244 DRV_LOG(DEBUG, "initialized B-tree %p with table %p", 245 (void *)bt, (void *)bt->table); 246 return 0; 247 } 248 249 /** 250 * Free B-tree resources. 251 * 252 * @param bt 253 * Pointer to B-tree structure. 
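 *
 * Illustrative lifecycle of a bottom-half table (sketch only; error handling
 * is simplified and addr is an assumed variable):
 *
 * @code
 * struct mlx5_mr_btree bt = { 0 };
 *
 * if (mlx5_mr_btree_init(&bt, MLX5_MR_BTREE_CACHE_N, SOCKET_ID_ANY) == 0) {
 *         uint16_t idx = 0;
 *         uint32_t lkey = mr_btree_lookup(&bt, &idx, addr);
 *
 *         mlx5_mr_btree_free(&bt);
 * }
 * @endcode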
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	mlx5_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}

/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	int idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
			" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Initialize per-queue MR control descriptor.
 *
 * @param mr_ctrl
 *   Pointer to MR control structure.
 * @param dev_gen_ptr
 *   Pointer to generation number of global cache.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
		  int socket)
{
	if (mr_ctrl == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Save pointer of global generation number to check memory event. */
	mr_ctrl->dev_gen_ptr = dev_gen_ptr;
	/* Initialize B-tree and allocate memory for bottom-half cache table. */
	return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N,
				  socket);
}

/**
 * Find virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * pmd_mr as there's only one chunk.
		 */
		entry->start = (uintptr_t)mr->pmd_mr.addr;
		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
	}
	return idx;
}

/**
 * Insert an MR into the global B-tree cache. It may fail due to low-on-memory.
385 * Then, this entry will have to be searched by mr_lookup_list() in 386 * mlx5_mr_create() on miss. 387 * 388 * @param share_cache 389 * Pointer to a global shared MR cache. 390 * @param mr 391 * Pointer to MR to insert. 392 * 393 * @return 394 * 0 on success, -1 on failure. 395 */ 396 int 397 mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache, 398 struct mlx5_mr *mr) 399 { 400 unsigned int n; 401 402 DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)", 403 (void *)mr, (void *)share_cache); 404 for (n = 0; n < mr->ms_bmp_n; ) { 405 struct mr_cache_entry entry; 406 407 memset(&entry, 0, sizeof(entry)); 408 /* Find a contiguous chunk and advance the index. */ 409 n = mr_find_next_chunk(mr, &entry, n); 410 if (!entry.end) 411 break; 412 if (mr_btree_insert(&share_cache->cache, &entry) < 0) { 413 /* 414 * Overflowed, but the global table cannot be expanded 415 * because of deadlock. 416 */ 417 return -1; 418 } 419 } 420 return 0; 421 } 422 423 /** 424 * Look up address in the original global MR list. 425 * 426 * @param share_cache 427 * Pointer to a global shared MR cache. 428 * @param[out] entry 429 * Pointer to returning MR cache entry. If no match, this will not be updated. 430 * @param addr 431 * Search key. 432 * 433 * @return 434 * Found MR on match, NULL otherwise. 435 */ 436 struct mlx5_mr * 437 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache, 438 struct mr_cache_entry *entry, uintptr_t addr) 439 { 440 struct mlx5_mr *mr; 441 442 /* Iterate all the existing MRs. */ 443 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 444 unsigned int n; 445 446 if (mr->ms_n == 0) 447 continue; 448 for (n = 0; n < mr->ms_bmp_n; ) { 449 struct mr_cache_entry ret; 450 451 memset(&ret, 0, sizeof(ret)); 452 n = mr_find_next_chunk(mr, &ret, n); 453 if (addr >= ret.start && addr < ret.end) { 454 /* Found. */ 455 *entry = ret; 456 return mr; 457 } 458 } 459 } 460 return NULL; 461 } 462 463 /** 464 * Look up address on global MR cache. 465 * 466 * @param share_cache 467 * Pointer to a global shared MR cache. 468 * @param[out] entry 469 * Pointer to returning MR cache entry. If no match, this will not be updated. 470 * @param addr 471 * Search key. 472 * 473 * @return 474 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 475 */ 476 static uint32_t 477 mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache, 478 struct mr_cache_entry *entry, uintptr_t addr) 479 { 480 uint16_t idx; 481 uint32_t lkey = UINT32_MAX; 482 struct mlx5_mr *mr; 483 484 /* 485 * If the global cache has overflowed since it failed to expand the 486 * B-tree table, it can't have all the existing MRs. Then, the address 487 * has to be searched by traversing the original MR list instead, which 488 * is very slow path. Otherwise, the global cache is all inclusive. 489 */ 490 if (!unlikely(share_cache->cache.overflow)) { 491 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 492 if (lkey != UINT32_MAX) 493 *entry = (*share_cache->cache.table)[idx]; 494 } else { 495 /* Falling back to the slowest path. */ 496 mr = mlx5_mr_lookup_list(share_cache, entry, addr); 497 if (mr != NULL) 498 lkey = entry->lkey; 499 } 500 MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start && 501 addr < entry->end)); 502 return lkey; 503 } 504 505 /** 506 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() 507 * can raise memory free event and the callback function will spin on the lock. 508 * 509 * @param mr 510 * Pointer to MR to free. 
511 */ 512 void 513 mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb) 514 { 515 if (mr == NULL) 516 return; 517 DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr); 518 dereg_mr_cb(&mr->pmd_mr); 519 rte_bitmap_free(mr->ms_bmp); 520 mlx5_free(mr); 521 } 522 523 void 524 mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache) 525 { 526 struct mlx5_mr *mr; 527 528 DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache); 529 /* Flush cache to rebuild. */ 530 share_cache->cache.len = 1; 531 share_cache->cache.overflow = 0; 532 /* Iterate all the existing MRs. */ 533 LIST_FOREACH(mr, &share_cache->mr_list, mr) 534 if (mlx5_mr_insert_cache(share_cache, mr) < 0) 535 return; 536 } 537 538 /** 539 * Release resources of detached MR having no online entry. 540 * 541 * @param share_cache 542 * Pointer to a global shared MR cache. 543 */ 544 static void 545 mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache) 546 { 547 struct mlx5_mr *mr_next; 548 struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); 549 550 /* Must be called from the primary process. */ 551 MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY); 552 /* 553 * MR can't be freed with holding the lock because rte_free() could call 554 * memory free callback function. This will be a deadlock situation. 555 */ 556 rte_rwlock_write_lock(&share_cache->rwlock); 557 /* Detach the whole free list and release it after unlocking. */ 558 free_list = share_cache->mr_free_list; 559 LIST_INIT(&share_cache->mr_free_list); 560 rte_rwlock_write_unlock(&share_cache->rwlock); 561 /* Release resources. */ 562 mr_next = LIST_FIRST(&free_list); 563 while (mr_next != NULL) { 564 struct mlx5_mr *mr = mr_next; 565 566 mr_next = LIST_NEXT(mr, mr); 567 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 568 } 569 } 570 571 /* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */ 572 static int 573 mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, 574 const struct rte_memseg *ms, size_t len, void *arg) 575 { 576 struct mr_find_contig_memsegs_data *data = arg; 577 578 if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) 579 return 0; 580 /* Found, save it and stop walking. */ 581 data->start = ms->addr_64; 582 data->end = ms->addr_64 + len; 583 data->msl = msl; 584 return 1; 585 } 586 587 /** 588 * Create a new global Memory Region (MR) for a missing virtual address. 589 * This API should be called on a secondary process, then a request is sent to 590 * the primary process in order to create a MR for the address. As the global MR 591 * list is on the shared memory, following LKey lookup should succeed unless the 592 * request fails. 593 * 594 * @param cdev 595 * Pointer to the mlx5 common device. 596 * @param share_cache 597 * Pointer to a global shared MR cache. 598 * @param[out] entry 599 * Pointer to returning MR cache entry, found in the global cache or newly 600 * created. If failed to create one, this will not be updated. 601 * @param addr 602 * Target virtual address to register. 603 * 604 * @return 605 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 
606 */ 607 static uint32_t 608 mlx5_mr_create_secondary(struct mlx5_common_device *cdev, 609 struct mlx5_mr_share_cache *share_cache, 610 struct mr_cache_entry *entry, uintptr_t addr) 611 { 612 int ret; 613 614 DRV_LOG(DEBUG, "Requesting MR creation for address (%p)", (void *)addr); 615 ret = mlx5_mp_req_mr_create(cdev, addr); 616 if (ret) { 617 DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)", 618 (void *)addr); 619 return UINT32_MAX; 620 } 621 rte_rwlock_read_lock(&share_cache->rwlock); 622 /* Fill in output data. */ 623 mlx5_mr_lookup_cache(share_cache, entry, addr); 624 /* Lookup can't fail. */ 625 MLX5_ASSERT(entry->lkey != UINT32_MAX); 626 rte_rwlock_read_unlock(&share_cache->rwlock); 627 DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n" 628 " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", 629 (void *)addr, entry->start, entry->end, entry->lkey); 630 return entry->lkey; 631 } 632 633 /** 634 * Create a new global Memory Region (MR) for a missing virtual address. 635 * Register entire virtually contiguous memory chunk around the address. 636 * 637 * @param pd 638 * Pointer to pd of a device (net, regex, vdpa,...). 639 * @param share_cache 640 * Pointer to a global shared MR cache. 641 * @param[out] entry 642 * Pointer to returning MR cache entry, found in the global cache or newly 643 * created. If failed to create one, this will not be updated. 644 * @param addr 645 * Target virtual address to register. 646 * @param mr_ext_memseg_en 647 * Configurable flag about external memory segment enable or not. 648 * 649 * @return 650 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 651 */ 652 static uint32_t 653 mlx5_mr_create_primary(void *pd, 654 struct mlx5_mr_share_cache *share_cache, 655 struct mr_cache_entry *entry, uintptr_t addr, 656 unsigned int mr_ext_memseg_en) 657 { 658 struct mr_find_contig_memsegs_data data = {.addr = addr, }; 659 struct mr_find_contig_memsegs_data data_re; 660 const struct rte_memseg_list *msl; 661 const struct rte_memseg *ms; 662 struct mlx5_mr *mr = NULL; 663 int ms_idx_shift = -1; 664 uint32_t bmp_size; 665 void *bmp_mem; 666 uint32_t ms_n; 667 uint32_t n; 668 size_t len; 669 670 DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr); 671 /* 672 * Release detached MRs if any. This can't be called with holding either 673 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have 674 * been detached by the memory free event but it couldn't be released 675 * inside the callback due to deadlock. As a result, releasing resources 676 * is quite opportunistic. 677 */ 678 mlx5_mr_garbage_collect(share_cache); 679 /* 680 * If enabled, find out a contiguous virtual address chunk in use, to 681 * which the given address belongs, in order to register maximum range. 682 * In the best case where mempools are not dynamically recreated and 683 * '--socket-mem' is specified as an EAL option, it is very likely to 684 * have only one MR(LKey) per a socket and per a hugepage-size even 685 * though the system memory is highly fragmented. As the whole memory 686 * chunk will be pinned by kernel, it can't be reused unless entire 687 * chunk is freed from EAL. 688 * 689 * If disabled, just register one memseg (page). Then, memory 690 * consumption will be minimized but it may drop performance if there 691 * are many MRs to lookup on the datapath. 
692 */ 693 if (!mr_ext_memseg_en) { 694 data.msl = rte_mem_virt2memseg_list((void *)addr); 695 data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); 696 data.end = data.start + data.msl->page_sz; 697 } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { 698 DRV_LOG(WARNING, 699 "Unable to find virtually contiguous" 700 " chunk for address (%p)." 701 " rte_memseg_contig_walk() failed.", (void *)addr); 702 rte_errno = ENXIO; 703 goto err_nolock; 704 } 705 alloc_resources: 706 /* Addresses must be page-aligned. */ 707 MLX5_ASSERT(data.msl); 708 MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz)); 709 MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz)); 710 msl = data.msl; 711 ms = rte_mem_virt2memseg((void *)data.start, msl); 712 len = data.end - data.start; 713 MLX5_ASSERT(ms); 714 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 715 /* Number of memsegs in the range. */ 716 ms_n = len / msl->page_sz; 717 DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 718 " page_sz=0x%" PRIx64 ", ms_n=%u", 719 (void *)addr, data.start, data.end, msl->page_sz, ms_n); 720 /* Size of memory for bitmap. */ 721 bmp_size = rte_bitmap_get_memory_footprint(ms_n); 722 mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 723 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) + 724 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id); 725 if (mr == NULL) { 726 DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of" 727 " address (%p).", (void *)addr); 728 rte_errno = ENOMEM; 729 goto err_nolock; 730 } 731 mr->msl = msl; 732 /* 733 * Save the index of the first memseg and initialize memseg bitmap. To 734 * see if a memseg of ms_idx in the memseg-list is still valid, check: 735 * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) 736 */ 737 mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 738 bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); 739 mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); 740 if (mr->ms_bmp == NULL) { 741 DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of" 742 " address (%p).", (void *)addr); 743 rte_errno = EINVAL; 744 goto err_nolock; 745 } 746 /* 747 * Should recheck whether the extended contiguous chunk is still valid. 748 * Because memory_hotplug_lock can't be held if there's any memory 749 * related calls in a critical path, resource allocation above can't be 750 * locked. If the memory has been changed at this point, try again with 751 * just single page. If not, go on with the big chunk atomically from 752 * here. 753 */ 754 rte_mcfg_mem_read_lock(); 755 data_re = data; 756 if (len > msl->page_sz && 757 !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { 758 DRV_LOG(DEBUG, 759 "Unable to find virtually contiguous chunk for address " 760 "(%p). rte_memseg_contig_walk() failed.", (void *)addr); 761 rte_errno = ENXIO; 762 goto err_memlock; 763 } 764 if (data.start != data_re.start || data.end != data_re.end) { 765 /* 766 * The extended contiguous chunk has been changed. Try again 767 * with single memseg instead. 768 */ 769 data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); 770 data.end = data.start + msl->page_sz; 771 rte_mcfg_mem_read_unlock(); 772 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 773 goto alloc_resources; 774 } 775 MLX5_ASSERT(data.msl == data_re.msl); 776 rte_rwlock_write_lock(&share_cache->rwlock); 777 /* 778 * Check the address is really missing. If other thread already created 779 * one or it is not found due to overflow, abort and return. 
780 */ 781 if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) { 782 /* 783 * Insert to the global cache table. It may fail due to 784 * low-on-memory. Then, this entry will have to be searched 785 * here again. 786 */ 787 mr_btree_insert(&share_cache->cache, entry); 788 DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort", 789 (void *)addr); 790 rte_rwlock_write_unlock(&share_cache->rwlock); 791 rte_mcfg_mem_read_unlock(); 792 /* 793 * Must be unlocked before calling rte_free() because 794 * mlx5_mr_mem_event_free_cb() can be called inside. 795 */ 796 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 797 return entry->lkey; 798 } 799 /* 800 * Trim start and end addresses for verbs MR. Set bits for registering 801 * memsegs but exclude already registered ones. Bitmap can be 802 * fragmented. 803 */ 804 for (n = 0; n < ms_n; ++n) { 805 uintptr_t start; 806 struct mr_cache_entry ret; 807 808 memset(&ret, 0, sizeof(ret)); 809 start = data_re.start + n * msl->page_sz; 810 /* Exclude memsegs already registered by other MRs. */ 811 if (mlx5_mr_lookup_cache(share_cache, &ret, start) == 812 UINT32_MAX) { 813 /* 814 * Start from the first unregistered memseg in the 815 * extended range. 816 */ 817 if (ms_idx_shift == -1) { 818 mr->ms_base_idx += n; 819 data.start = start; 820 ms_idx_shift = n; 821 } 822 data.end = start + msl->page_sz; 823 rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); 824 ++mr->ms_n; 825 } 826 } 827 len = data.end - data.start; 828 mr->ms_bmp_n = len / msl->page_sz; 829 MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n); 830 /* 831 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can 832 * be called with holding the memory lock because it doesn't use 833 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket() 834 * through mlx5_alloc_verbs_buf(). 835 */ 836 share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr); 837 if (mr->pmd_mr.obj == NULL) { 838 DRV_LOG(DEBUG, "Fail to create an MR for address (%p)", 839 (void *)addr); 840 rte_errno = EINVAL; 841 goto err_mrlock; 842 } 843 MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start); 844 MLX5_ASSERT(mr->pmd_mr.len); 845 LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr); 846 DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n" 847 " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," 848 " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", 849 (void *)mr, (void *)addr, data.start, data.end, 850 rte_cpu_to_be_32(mr->pmd_mr.lkey), 851 mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); 852 /* Insert to the global cache table. */ 853 mlx5_mr_insert_cache(share_cache, mr); 854 /* Fill in output data. */ 855 mlx5_mr_lookup_cache(share_cache, entry, addr); 856 /* Lookup can't fail. */ 857 MLX5_ASSERT(entry->lkey != UINT32_MAX); 858 rte_rwlock_write_unlock(&share_cache->rwlock); 859 rte_mcfg_mem_read_unlock(); 860 return entry->lkey; 861 err_mrlock: 862 rte_rwlock_write_unlock(&share_cache->rwlock); 863 err_memlock: 864 rte_mcfg_mem_read_unlock(); 865 err_nolock: 866 /* 867 * In case of error, as this can be called in a datapath, a warning 868 * message per an error is preferable instead. Must be unlocked before 869 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called 870 * inside. 871 */ 872 mlx5_mr_free(mr, share_cache->dereg_mr_cb); 873 return UINT32_MAX; 874 } 875 876 /** 877 * Create a new global Memory Region (MR) for a missing virtual address. 878 * This can be called from primary and secondary process. 879 * 880 * @param cdev 881 * Pointer to the mlx5 common device. 
882 * @param share_cache 883 * Pointer to a global shared MR cache. 884 * @param[out] entry 885 * Pointer to returning MR cache entry, found in the global cache or newly 886 * created. If failed to create one, this will not be updated. 887 * @param addr 888 * Target virtual address to register. 889 * 890 * @return 891 * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. 892 */ 893 uint32_t 894 mlx5_mr_create(struct mlx5_common_device *cdev, 895 struct mlx5_mr_share_cache *share_cache, 896 struct mr_cache_entry *entry, uintptr_t addr) 897 { 898 uint32_t ret = 0; 899 900 switch (rte_eal_process_type()) { 901 case RTE_PROC_PRIMARY: 902 ret = mlx5_mr_create_primary(cdev->pd, share_cache, entry, addr, 903 cdev->config.mr_ext_memseg_en); 904 break; 905 case RTE_PROC_SECONDARY: 906 ret = mlx5_mr_create_secondary(cdev, share_cache, entry, addr); 907 break; 908 default: 909 break; 910 } 911 return ret; 912 } 913 914 /** 915 * Look up address in the global MR cache table. If not found, create a new MR. 916 * Insert the found/created entry to local bottom-half cache table. 917 * 918 * @param mr_ctrl 919 * Pointer to per-queue MR control structure. 920 * @param[out] entry 921 * Pointer to returning MR cache entry, found in the global cache or newly 922 * created. If failed to create one, this is not written. 923 * @param addr 924 * Search key. 925 * 926 * @return 927 * Searched LKey on success, UINT32_MAX on no match. 928 */ 929 static uint32_t 930 mr_lookup_caches(struct mlx5_mr_ctrl *mr_ctrl, 931 struct mr_cache_entry *entry, uintptr_t addr) 932 { 933 struct mlx5_mr_share_cache *share_cache = 934 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 935 dev_gen); 936 struct mlx5_common_device *cdev = 937 container_of(share_cache, struct mlx5_common_device, mr_scache); 938 struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh; 939 uint32_t lkey; 940 uint16_t idx; 941 942 /* If local cache table is full, try to double it. */ 943 if (unlikely(bt->len == bt->size)) 944 mr_btree_expand(bt, bt->size << 1); 945 /* Look up in the global cache. */ 946 rte_rwlock_read_lock(&share_cache->rwlock); 947 lkey = mr_btree_lookup(&share_cache->cache, &idx, addr); 948 if (lkey != UINT32_MAX) { 949 /* Found. */ 950 *entry = (*share_cache->cache.table)[idx]; 951 rte_rwlock_read_unlock(&share_cache->rwlock); 952 /* 953 * Update local cache. Even if it fails, return the found entry 954 * to update top-half cache. Next time, this entry will be found 955 * in the global cache. 956 */ 957 mr_btree_insert(bt, entry); 958 return lkey; 959 } 960 rte_rwlock_read_unlock(&share_cache->rwlock); 961 /* First time to see the address? Create a new MR. */ 962 lkey = mlx5_mr_create(cdev, share_cache, entry, addr); 963 /* 964 * Update the local cache if successfully created a new global MR. Even 965 * if failed to create one, there's no action to take in this datapath 966 * code. As returning LKey is invalid, this will eventually make HW 967 * fail. 968 */ 969 if (lkey != UINT32_MAX) 970 mr_btree_insert(bt, entry); 971 return lkey; 972 } 973 974 /** 975 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if 976 * misses, search in the global MR cache table and update the new entry to 977 * per-queue local caches. 978 * 979 * @param mr_ctrl 980 * Pointer to per-queue MR control structure. 981 * @param addr 982 * Search key. 983 * 984 * @return 985 * Searched LKey on success, UINT32_MAX on no match. 
986 */ 987 static uint32_t 988 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr) 989 { 990 uint32_t lkey; 991 uint16_t bh_idx = 0; 992 /* Victim in top-half cache to replace with new entry. */ 993 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 994 995 /* Binary-search MR translation table. */ 996 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 997 /* Update top-half cache. */ 998 if (likely(lkey != UINT32_MAX)) { 999 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1000 } else { 1001 /* 1002 * If missed in local lookup table, search in the global cache 1003 * and local cache_bh[] will be updated inside if possible. 1004 * Top-half cache entry will also be updated. 1005 */ 1006 lkey = mr_lookup_caches(mr_ctrl, repl, addr); 1007 if (unlikely(lkey == UINT32_MAX)) 1008 return UINT32_MAX; 1009 } 1010 /* Update the most recently used entry. */ 1011 mr_ctrl->mru = mr_ctrl->head; 1012 /* Point to the next victim, the oldest. */ 1013 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1014 return lkey; 1015 } 1016 1017 /** 1018 * Release all the created MRs and resources on global MR cache of a device 1019 * list. 1020 * 1021 * @param share_cache 1022 * Pointer to a global shared MR cache. 1023 */ 1024 void 1025 mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache) 1026 { 1027 struct mlx5_mr *mr_next; 1028 1029 rte_rwlock_write_lock(&share_cache->rwlock); 1030 /* Detach from MR list and move to free list. */ 1031 mr_next = LIST_FIRST(&share_cache->mr_list); 1032 while (mr_next != NULL) { 1033 struct mlx5_mr *mr = mr_next; 1034 1035 mr_next = LIST_NEXT(mr, mr); 1036 LIST_REMOVE(mr, mr); 1037 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1038 } 1039 LIST_INIT(&share_cache->mr_list); 1040 /* Free global cache. */ 1041 mlx5_mr_btree_free(&share_cache->cache); 1042 rte_rwlock_write_unlock(&share_cache->rwlock); 1043 /* Free all remaining MRs. */ 1044 mlx5_mr_garbage_collect(share_cache); 1045 } 1046 1047 /** 1048 * Initialize global MR cache of a device. 1049 * 1050 * @param share_cache 1051 * Pointer to a global shared MR cache. 1052 * @param socket 1053 * NUMA socket on which memory must be allocated. 1054 * 1055 * @return 1056 * 0 on success, a negative errno value otherwise and rte_errno is set. 1057 */ 1058 int 1059 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket) 1060 { 1061 /* Set the reg_mr and dereg_mr callback functions */ 1062 mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb, 1063 &share_cache->dereg_mr_cb); 1064 rte_rwlock_init(&share_cache->rwlock); 1065 rte_rwlock_init(&share_cache->mprwlock); 1066 share_cache->mp_cb_registered = 0; 1067 /* Initialize B-tree and allocate memory for global MR cache table. */ 1068 return mlx5_mr_btree_init(&share_cache->cache, 1069 MLX5_MR_BTREE_CACHE_N * 2, socket); 1070 } 1071 1072 /** 1073 * Flush all of the local cache entries. 1074 * 1075 * @param mr_ctrl 1076 * Pointer to per-queue MR local cache. 1077 */ 1078 void 1079 mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl) 1080 { 1081 /* Reset the most-recently-used index. */ 1082 mr_ctrl->mru = 0; 1083 /* Reset the linear search array. */ 1084 mr_ctrl->head = 0; 1085 memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); 1086 /* Reset the B-tree table. */ 1087 mr_ctrl->cache_bh.len = 1; 1088 mr_ctrl->cache_bh.overflow = 0; 1089 /* Update the generation number. 
 */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}

/**
 * Creates a memory region for external memory, that is memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 * @param reg_mr_cb
 *   Callback used to register the memory region.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
		   mlx5_reg_mr_t reg_mr_cb)
{
	struct mlx5_mr *mr = NULL;

	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
			 RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(WARNING,
			"Fail to create MR for address (%p)",
			(void *)addr);
		mlx5_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark as external memory. */
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}

/**
 * Callback for memory free event. Iterate freed memsegs and check whether each
 * belongs to an existing MR. If found, clear the bit from the bitmap of the MR.
 * As a result, the MR would be fragmented. If it becomes empty, the MR will be
 * freed later by mlx5_mr_garbage_collect(). Even if this callback is called
 * from a secondary process, the garbage collector will be called in the
 * primary process as the secondary process can't call mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has to
 * be propagated to dataplane threads to flush the local caches.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param ibdev_name
 *   Name of ibv device.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
void
mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
		     const char *ibdev_name, const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg.
*/ 1192 start = (uintptr_t)addr + i * msl->page_sz; 1193 mr = mlx5_mr_lookup_list(share_cache, &entry, start); 1194 if (mr == NULL) 1195 continue; 1196 MLX5_ASSERT(mr->msl); /* Can't be external memory. */ 1197 ms = rte_mem_virt2memseg((void *)start, msl); 1198 MLX5_ASSERT(ms != NULL); 1199 MLX5_ASSERT(msl->page_sz == ms->hugepage_sz); 1200 ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); 1201 pos = ms_idx - mr->ms_base_idx; 1202 MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos)); 1203 MLX5_ASSERT(pos < mr->ms_bmp_n); 1204 DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p", 1205 ibdev_name, (void *)mr, pos, (void *)start); 1206 rte_bitmap_clear(mr->ms_bmp, pos); 1207 if (--mr->ms_n == 0) { 1208 LIST_REMOVE(mr, mr); 1209 LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr); 1210 DRV_LOG(DEBUG, "device %s remove MR(%p) from list", 1211 ibdev_name, (void *)mr); 1212 } 1213 /* 1214 * MR is fragmented or will be freed. the global cache must be 1215 * rebuilt. 1216 */ 1217 rebuild = 1; 1218 } 1219 if (rebuild) { 1220 mlx5_mr_rebuild_cache(share_cache); 1221 /* 1222 * No explicit wmb is needed after updating dev_gen due to 1223 * store-release ordering in unlock that provides the 1224 * implicit barrier at the software visible level. 1225 */ 1226 ++share_cache->dev_gen; 1227 DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d", 1228 share_cache->dev_gen); 1229 } 1230 rte_rwlock_write_unlock(&share_cache->rwlock); 1231 } 1232 1233 /** 1234 * Dump all the created MRs and the global cache entries. 1235 * 1236 * @param share_cache 1237 * Pointer to a global shared MR cache. 1238 */ 1239 void 1240 mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused) 1241 { 1242 #ifdef RTE_LIBRTE_MLX5_DEBUG 1243 struct mlx5_mr *mr; 1244 int mr_n = 0; 1245 int chunk_n = 0; 1246 1247 rte_rwlock_read_lock(&share_cache->rwlock); 1248 /* Iterate all the existing MRs. */ 1249 LIST_FOREACH(mr, &share_cache->mr_list, mr) { 1250 unsigned int n; 1251 1252 DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", 1253 mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey), 1254 mr->ms_n, mr->ms_bmp_n); 1255 if (mr->ms_n == 0) 1256 continue; 1257 for (n = 0; n < mr->ms_bmp_n; ) { 1258 struct mr_cache_entry ret = { 0, }; 1259 1260 n = mr_find_next_chunk(mr, &ret, n); 1261 if (!ret.end) 1262 break; 1263 DRV_LOG(DEBUG, 1264 " chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", 1265 chunk_n++, ret.start, ret.end); 1266 } 1267 } 1268 DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache); 1269 mlx5_mr_btree_dump(&share_cache->cache); 1270 rte_rwlock_read_unlock(&share_cache->rwlock); 1271 #endif 1272 } 1273 1274 static int 1275 mlx5_range_compare_start(const void *lhs, const void *rhs) 1276 { 1277 const struct mlx5_range *r1 = lhs, *r2 = rhs; 1278 1279 if (r1->start > r2->start) 1280 return 1; 1281 else if (r1->start < r2->start) 1282 return -1; 1283 return 0; 1284 } 1285 1286 static void 1287 mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque, 1288 struct rte_mempool_memhdr *memhdr, 1289 unsigned int idx) 1290 { 1291 struct mlx5_range *ranges = opaque, *range = &ranges[idx]; 1292 uint64_t page_size = rte_mem_page_size(); 1293 1294 RTE_SET_USED(mp); 1295 range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size); 1296 range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size); 1297 } 1298 1299 /** 1300 * Collect page-aligned memory ranges of the mempool. 
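 *
 * Each chunk is expanded to full page boundaries, for example with a 4 KiB
 * page size (illustrative numbers only):
 *
 * @code
 * // memhdr->addr = 0x1100, memhdr->len = 0x1e00, page_size = 0x1000
 * range->start = RTE_ALIGN_FLOOR(0x1100, 0x1000);         // 0x1000
 * range->end = RTE_ALIGN_CEIL(0x1000 + 0x1e00, 0x1000);   // 0x3000
 * @endcode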
 */
static int
mlx5_mempool_get_chunks(struct rte_mempool *mp, struct mlx5_range **out,
			unsigned int *out_n)
{
	unsigned int n;

	DRV_LOG(DEBUG, "Collecting chunks of regular mempool %s", mp->name);
	n = mp->nb_mem_chunks;
	*out = calloc(n, sizeof(**out));
	if (*out == NULL)
		return -1;
	rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, *out);
	*out_n = n;
	return 0;
}

struct mlx5_mempool_get_extmem_data {
	struct mlx5_range *heap;
	unsigned int heap_size;
	int ret;
};

static void
mlx5_mempool_get_extmem_cb(struct rte_mempool *mp, void *opaque,
			   void *obj, unsigned int obj_idx)
{
	struct mlx5_mempool_get_extmem_data *data = opaque;
	struct rte_mbuf *mbuf = obj;
	uintptr_t addr = (uintptr_t)mbuf->buf_addr;
	struct mlx5_range *seg, *heap;
	struct rte_memseg_list *msl;
	size_t page_size;
	uintptr_t page_start;
	unsigned int pos = 0, len = data->heap_size, delta;

	RTE_SET_USED(mp);
	RTE_SET_USED(obj_idx);
	if (data->ret < 0)
		return;
	/* Binary search for an already visited page. */
	while (len > 1) {
		delta = len / 2;
		if (addr < data->heap[pos + delta].start) {
			len = delta;
		} else {
			pos += delta;
			len -= delta;
		}
	}
	if (data->heap != NULL) {
		seg = &data->heap[pos];
		if (seg->start <= addr && addr < seg->end)
			return;
	}
	/* Determine the page boundaries and remember them. */
	heap = realloc(data->heap, sizeof(heap[0]) * (data->heap_size + 1));
	if (heap == NULL) {
		free(data->heap);
		data->heap = NULL;
		data->ret = -1;
		return;
	}
	data->heap = heap;
	data->heap_size++;
	seg = &heap[data->heap_size - 1];
	msl = rte_mem_virt2memseg_list((void *)addr);
	page_size = msl != NULL ? msl->page_sz : rte_mem_page_size();
	page_start = RTE_PTR_ALIGN_FLOOR(addr, page_size);
	seg->start = page_start;
	seg->end = page_start + page_size;
	/* Maintain the heap order. */
	qsort(data->heap, data->heap_size, sizeof(heap[0]),
	      mlx5_range_compare_start);
}

/**
 * Recover pages of external memory as close as possible
 * for a mempool with RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF.
 * Pages are stored in a heap for efficient search, because mbufs are many.
 */
static int
mlx5_mempool_get_extmem(struct rte_mempool *mp, struct mlx5_range **out,
			unsigned int *out_n)
{
	struct mlx5_mempool_get_extmem_data data;

	DRV_LOG(DEBUG, "Recovering external pinned pages of mempool %s",
		mp->name);
	memset(&data, 0, sizeof(data));
	rte_mempool_obj_iter(mp, mlx5_mempool_get_extmem_cb, &data);
	*out = data.heap;
	*out_n = data.heap_size;
	return data.ret;
}

/**
 * Get VA-contiguous ranges of the mempool memory.
 * Each range start and end is aligned to the system page size.
 *
 * @param[in] mp
 *   Analyzed mempool.
 * @param[in] is_extmem
 *   Whether the pool contains only external pinned buffers.
 * @param[out] out
 *   Receives the ranges, caller must release it with free().
 * @param[out] out_n
 *   Receives the number of @p out elements.
 *
 * @return
 *   0 on success, (-1) on failure.
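 *
 * After sorting by start address, adjacent page-aligned ranges are merged in
 * place and compacted to the beginning of the array, e.g. (hypothetical
 * page-sized ranges):
 *
 * @code
 * // input (sorted): [0x1000,0x2000) [0x2000,0x3000) [0x5000,0x6000)
 * // output:         [0x1000,0x3000) [0x5000,0x6000), *out_n == 2
 * @endcode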
1412 */ 1413 static int 1414 mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem, 1415 struct mlx5_range **out, unsigned int *out_n) 1416 { 1417 struct mlx5_range *chunks; 1418 unsigned int chunks_n, contig_n, i; 1419 int ret; 1420 1421 /* Collect the pool underlying memory. */ 1422 ret = is_extmem ? mlx5_mempool_get_extmem(mp, &chunks, &chunks_n) : 1423 mlx5_mempool_get_chunks(mp, &chunks, &chunks_n); 1424 if (ret < 0) 1425 return ret; 1426 /* Merge adjacent chunks and place them at the beginning. */ 1427 qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start); 1428 contig_n = 1; 1429 for (i = 1; i < chunks_n; i++) 1430 if (chunks[i - 1].end != chunks[i].start) { 1431 chunks[contig_n - 1].end = chunks[i - 1].end; 1432 chunks[contig_n] = chunks[i]; 1433 contig_n++; 1434 } 1435 /* Extend the last contiguous chunk to the end of the mempool. */ 1436 chunks[contig_n - 1].end = chunks[i - 1].end; 1437 *out = chunks; 1438 *out_n = contig_n; 1439 return 0; 1440 } 1441 1442 /** 1443 * Analyze mempool memory to select memory ranges to register. 1444 * 1445 * @param[in] mp 1446 * Mempool to analyze. 1447 * @param[in] is_extmem 1448 * Whether the pool is contains only external pinned buffers. 1449 * @param[out] out 1450 * Receives memory ranges to register, aligned to the system page size. 1451 * The caller must release them with free(). 1452 * @param[out] out_n 1453 * Receives the number of @p out items. 1454 * @param[out] share_hugepage 1455 * Receives True if the entire pool resides within a single hugepage. 1456 * 1457 * @return 1458 * 0 on success, (-1) on failure. 1459 */ 1460 static int 1461 mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem, 1462 struct mlx5_range **out, unsigned int *out_n, 1463 bool *share_hugepage) 1464 { 1465 struct mlx5_range *ranges = NULL; 1466 unsigned int i, ranges_n = 0; 1467 struct rte_memseg_list *msl; 1468 1469 if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) { 1470 DRV_LOG(ERR, "Cannot get address ranges for mempool %s", 1471 mp->name); 1472 return -1; 1473 } 1474 /* Check if the hugepage of the pool can be shared. */ 1475 *share_hugepage = false; 1476 msl = rte_mem_virt2memseg_list((void *)ranges[0].start); 1477 if (msl != NULL) { 1478 uint64_t hugepage_sz = 0; 1479 1480 /* Check that all ranges are on pages of the same size. */ 1481 for (i = 0; i < ranges_n; i++) { 1482 if (hugepage_sz != 0 && hugepage_sz != msl->page_sz) 1483 break; 1484 hugepage_sz = msl->page_sz; 1485 } 1486 if (i == ranges_n) { 1487 /* 1488 * If the entire pool is within one hugepage, 1489 * combine all ranges into one of the hugepage size. 1490 */ 1491 uintptr_t reg_start = ranges[0].start; 1492 uintptr_t reg_end = ranges[ranges_n - 1].end; 1493 uintptr_t hugepage_start = 1494 RTE_ALIGN_FLOOR(reg_start, hugepage_sz); 1495 uintptr_t hugepage_end = hugepage_start + hugepage_sz; 1496 if (reg_end < hugepage_end) { 1497 ranges[0].start = hugepage_start; 1498 ranges[0].end = hugepage_end; 1499 ranges_n = 1; 1500 *share_hugepage = true; 1501 } 1502 } 1503 } 1504 *out = ranges; 1505 *out_n = ranges_n; 1506 return 0; 1507 } 1508 1509 /** Create a registration object for the mempool. 
*/ 1510 static struct mlx5_mempool_reg * 1511 mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n, 1512 bool is_extmem) 1513 { 1514 struct mlx5_mempool_reg *mpr = NULL; 1515 1516 mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1517 sizeof(struct mlx5_mempool_reg), 1518 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1519 if (mpr == NULL) { 1520 DRV_LOG(ERR, "Cannot allocate mempool %s registration object", 1521 mp->name); 1522 return NULL; 1523 } 1524 mpr->mrs = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO, 1525 mrs_n * sizeof(struct mlx5_mempool_mr), 1526 RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY); 1527 if (!mpr->mrs) { 1528 DRV_LOG(ERR, "Cannot allocate mempool %s registration MRs", 1529 mp->name); 1530 mlx5_free(mpr); 1531 return NULL; 1532 } 1533 mpr->mp = mp; 1534 mpr->mrs_n = mrs_n; 1535 mpr->is_extmem = is_extmem; 1536 return mpr; 1537 } 1538 1539 /** 1540 * Destroy a mempool registration object. 1541 * 1542 * @param standalone 1543 * Whether @p mpr owns its MRs exclusively, i.e. they are not shared. 1544 */ 1545 static void 1546 mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache, 1547 struct mlx5_mempool_reg *mpr, bool standalone) 1548 { 1549 if (standalone) { 1550 unsigned int i; 1551 1552 for (i = 0; i < mpr->mrs_n; i++) 1553 share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr); 1554 mlx5_free(mpr->mrs); 1555 } 1556 mlx5_free(mpr); 1557 } 1558 1559 /** Find registration object of a mempool. */ 1560 static struct mlx5_mempool_reg * 1561 mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache, 1562 struct rte_mempool *mp) 1563 { 1564 struct mlx5_mempool_reg *mpr; 1565 1566 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1567 if (mpr->mp == mp) 1568 break; 1569 return mpr; 1570 } 1571 1572 /** Increment reference counters of MRs used in the registration. */ 1573 static void 1574 mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr) 1575 { 1576 unsigned int i; 1577 1578 for (i = 0; i < mpr->mrs_n; i++) 1579 __atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED); 1580 } 1581 1582 /** 1583 * Decrement reference counters of MRs used in the registration. 1584 * 1585 * @return True if no more references to @p mpr MRs exist, False otherwise. 1586 */ 1587 static bool 1588 mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr) 1589 { 1590 unsigned int i; 1591 bool ret = false; 1592 1593 for (i = 0; i < mpr->mrs_n; i++) 1594 ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1, 1595 __ATOMIC_RELAXED) == 0; 1596 return ret; 1597 } 1598 1599 static int 1600 mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache, 1601 void *pd, struct rte_mempool *mp, 1602 bool is_extmem) 1603 { 1604 struct mlx5_range *ranges = NULL; 1605 struct mlx5_mempool_reg *mpr, *old_mpr, *new_mpr; 1606 unsigned int i, ranges_n; 1607 bool share_hugepage, standalone = false; 1608 int ret = -1; 1609 1610 /* Early check to avoid unnecessary creation of MRs. 
*/ 1611 rte_rwlock_read_lock(&share_cache->rwlock); 1612 old_mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1613 rte_rwlock_read_unlock(&share_cache->rwlock); 1614 if (old_mpr != NULL && (!is_extmem || old_mpr->is_extmem)) { 1615 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1616 mp->name, pd); 1617 rte_errno = EEXIST; 1618 goto exit; 1619 } 1620 if (mlx5_mempool_reg_analyze(mp, is_extmem, &ranges, &ranges_n, 1621 &share_hugepage) < 0) { 1622 DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name); 1623 rte_errno = ENOMEM; 1624 goto exit; 1625 } 1626 new_mpr = mlx5_mempool_reg_create(mp, ranges_n, is_extmem); 1627 if (new_mpr == NULL) { 1628 DRV_LOG(ERR, 1629 "Cannot create a registration object for mempool %s in PD %p", 1630 mp->name, pd); 1631 rte_errno = ENOMEM; 1632 goto exit; 1633 } 1634 /* 1635 * If the entire mempool fits in a single hugepage, the MR for this 1636 * hugepage can be shared across mempools that also fit in it. 1637 */ 1638 if (share_hugepage) { 1639 rte_rwlock_write_lock(&share_cache->rwlock); 1640 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) { 1641 if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start) 1642 break; 1643 } 1644 if (mpr != NULL) { 1645 new_mpr->mrs = mpr->mrs; 1646 mlx5_mempool_reg_attach(new_mpr); 1647 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, 1648 new_mpr, next); 1649 } 1650 rte_rwlock_write_unlock(&share_cache->rwlock); 1651 if (mpr != NULL) { 1652 DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s", 1653 mpr->mrs[0].pmd_mr.lkey, pd, mp->name, 1654 mpr->mp->name); 1655 ret = 0; 1656 goto exit; 1657 } 1658 } 1659 for (i = 0; i < ranges_n; i++) { 1660 struct mlx5_mempool_mr *mr = &new_mpr->mrs[i]; 1661 const struct mlx5_range *range = &ranges[i]; 1662 size_t len = range->end - range->start; 1663 1664 if (share_cache->reg_mr_cb(pd, (void *)range->start, len, 1665 &mr->pmd_mr) < 0) { 1666 DRV_LOG(ERR, 1667 "Failed to create an MR in PD %p for address range " 1668 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1669 pd, range->start, range->end, len, mp->name); 1670 break; 1671 } 1672 DRV_LOG(DEBUG, 1673 "Created a new MR %#x in PD %p for address range " 1674 "[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s", 1675 mr->pmd_mr.lkey, pd, range->start, range->end, len, 1676 mp->name); 1677 } 1678 if (i != ranges_n) { 1679 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1680 rte_errno = EINVAL; 1681 goto exit; 1682 } 1683 /* Concurrent registration is not supposed to happen. */ 1684 rte_rwlock_write_lock(&share_cache->rwlock); 1685 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1686 if (mpr == old_mpr && old_mpr != NULL) { 1687 LIST_REMOVE(old_mpr, next); 1688 standalone = mlx5_mempool_reg_detach(mpr); 1689 /* No need to flush the cache: old MRs cannot be in use. 
*/ 1690 mpr = NULL; 1691 } 1692 if (mpr == NULL) { 1693 mlx5_mempool_reg_attach(new_mpr); 1694 LIST_INSERT_HEAD(&share_cache->mempool_reg_list, new_mpr, next); 1695 ret = 0; 1696 } 1697 rte_rwlock_write_unlock(&share_cache->rwlock); 1698 if (mpr != NULL) { 1699 DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p", 1700 mp->name, pd); 1701 mlx5_mempool_reg_destroy(share_cache, new_mpr, true); 1702 rte_errno = EEXIST; 1703 goto exit; 1704 } else if (old_mpr != NULL) { 1705 DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory", 1706 mp->name, pd); 1707 mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone); 1708 } 1709 exit: 1710 free(ranges); 1711 return ret; 1712 } 1713 1714 static int 1715 mlx5_mr_mempool_register_secondary(struct mlx5_common_device *cdev, 1716 struct rte_mempool *mp, bool is_extmem) 1717 { 1718 return mlx5_mp_req_mempool_reg(cdev, mp, true, is_extmem); 1719 } 1720 1721 /** 1722 * Register the memory of a mempool in the protection domain. 1723 * 1724 * @param cdev 1725 * Pointer to the mlx5 common device. 1726 * @param mp 1727 * Mempool to register. 1728 * 1729 * @return 1730 * 0 on success, (-1) on failure and rte_errno is set. 1731 */ 1732 int 1733 mlx5_mr_mempool_register(struct mlx5_common_device *cdev, 1734 struct rte_mempool *mp, bool is_extmem) 1735 { 1736 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1737 return 0; 1738 switch (rte_eal_process_type()) { 1739 case RTE_PROC_PRIMARY: 1740 return mlx5_mr_mempool_register_primary(&cdev->mr_scache, 1741 cdev->pd, mp, 1742 is_extmem); 1743 case RTE_PROC_SECONDARY: 1744 return mlx5_mr_mempool_register_secondary(cdev, mp, is_extmem); 1745 default: 1746 return -1; 1747 } 1748 } 1749 1750 static int 1751 mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache, 1752 struct rte_mempool *mp) 1753 { 1754 struct mlx5_mempool_reg *mpr; 1755 bool standalone = false; 1756 1757 rte_rwlock_write_lock(&share_cache->rwlock); 1758 LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) 1759 if (mpr->mp == mp) { 1760 LIST_REMOVE(mpr, next); 1761 standalone = mlx5_mempool_reg_detach(mpr); 1762 if (standalone) 1763 /* 1764 * The unlock operation below provides a memory 1765 * barrier due to its store-release semantics. 1766 */ 1767 ++share_cache->dev_gen; 1768 break; 1769 } 1770 rte_rwlock_write_unlock(&share_cache->rwlock); 1771 if (mpr == NULL) { 1772 rte_errno = ENOENT; 1773 return -1; 1774 } 1775 mlx5_mempool_reg_destroy(share_cache, mpr, standalone); 1776 return 0; 1777 } 1778 1779 static int 1780 mlx5_mr_mempool_unregister_secondary(struct mlx5_common_device *cdev, 1781 struct rte_mempool *mp) 1782 { 1783 return mlx5_mp_req_mempool_reg(cdev, mp, false, false /* is_extmem */); 1784 } 1785 1786 /** 1787 * Unregister the memory of a mempool from the protection domain. 1788 * 1789 * @param cdev 1790 * Pointer to the mlx5 common device. 1791 * @param mp 1792 * Mempool to unregister. 1793 * 1794 * @return 1795 * 0 on success, (-1) on failure and rte_errno is set. 1796 */ 1797 int 1798 mlx5_mr_mempool_unregister(struct mlx5_common_device *cdev, 1799 struct rte_mempool *mp) 1800 { 1801 if (mp->flags & RTE_MEMPOOL_F_NON_IO) 1802 return 0; 1803 switch (rte_eal_process_type()) { 1804 case RTE_PROC_PRIMARY: 1805 return mlx5_mr_mempool_unregister_primary(&cdev->mr_scache, mp); 1806 case RTE_PROC_SECONDARY: 1807 return mlx5_mr_mempool_unregister_secondary(cdev, mp); 1808 default: 1809 return -1; 1810 } 1811 } 1812 1813 /** 1814 * Lookup a MR key by and address in a registered mempool. 
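 * The registration holds one MR per VA-contiguous range, so the lookup is a
 * linear scan over mpr->mrs[]; a mempool typically spans only a few ranges.
 * Illustrative outcome for a registration whose single MR covers
 * [0x100000, 0x140000) (hypothetical values):
 *
 * @code
 * struct mr_cache_entry e;
 * uint32_t lkey = mlx5_mempool_reg_addr2mr(mpr, 0x120000, &e);
 *
 * // e.start == 0x100000, e.end == 0x140000, lkey == e.lkey
 * @endcode
 *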
/**
 * Look up an MR key by an address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_start = (uintptr_t)mr->addr;
		uintptr_t mr_end = mr_start + mr->len;

		if (mr_start <= addr && addr < mr_end) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_start;
			entry->end = mr_end;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update the bottom-half cache from the list of mempool registrations.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to look up.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If the local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update the local cache. Even if it fails, return the found entry
	 * to update the top-half cache. Next time, this entry will be found
	 * in the global cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}
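
/*
 * Note on the lookup helpers above: the lkey they return is already
 * converted with rte_cpu_to_be_32(), i.e. it is in the byte order the
 * hardware expects in WQE data segments, so data-path callers can store
 * it as-is. The address match itself is a half-open interval test; a
 * minimal sketch (variable names are illustrative only):
 *
 *	uintptr_t start = (uintptr_t)pmd_mr->addr;
 *	uintptr_t end = start + pmd_mr->len;
 *	bool hit = start <= addr && addr < end;	// [start, end)
 */
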
1922 */ 1923 rte_rwlock_read_lock(&share_cache->rwlock); 1924 mpr = mlx5_mempool_reg_lookup(share_cache, mp); 1925 rte_rwlock_read_unlock(&share_cache->rwlock); 1926 if (mpr == NULL) { 1927 DRV_LOG(ERR, "Mempool %s is not registered", mp->name); 1928 rte_errno = ENOENT; 1929 return -1; 1930 } 1931 for (i = 0; i < mpr->mrs_n; i++) { 1932 struct mlx5_mempool_mr *mr = &mpr->mrs[i]; 1933 struct mr_cache_entry entry; 1934 uint32_t lkey; 1935 uint16_t idx; 1936 1937 lkey = mr_btree_lookup(bt, &idx, (uintptr_t)mr->pmd_mr.addr); 1938 if (lkey != UINT32_MAX) 1939 continue; 1940 if (bt->len == bt->size) 1941 mr_btree_expand(bt, bt->size << 1); 1942 entry.start = (uintptr_t)mr->pmd_mr.addr; 1943 entry.end = entry.start + mr->pmd_mr.len; 1944 entry.lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey); 1945 if (mr_btree_insert(bt, &entry) < 0) { 1946 DRV_LOG(ERR, "Cannot insert cache entry for mempool %s MR %08x", 1947 mp->name, entry.lkey); 1948 rte_errno = EINVAL; 1949 return -1; 1950 } 1951 } 1952 return 0; 1953 } 1954 1955 /** 1956 * Bottom-half lookup for the address from the mempool. 1957 * 1958 * @param mr_ctrl 1959 * Per-queue MR control handle. 1960 * @param mp 1961 * Mempool containing the address. 1962 * @param addr 1963 * Address to lookup. 1964 * @return 1965 * MR lkey on success, UINT32_MAX on failure. 1966 */ 1967 uint32_t 1968 mlx5_mr_mempool2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, 1969 struct rte_mempool *mp, uintptr_t addr) 1970 { 1971 struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head]; 1972 uint32_t lkey; 1973 uint16_t bh_idx = 0; 1974 1975 /* Binary-search MR translation table. */ 1976 lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); 1977 /* Update top-half cache. */ 1978 if (likely(lkey != UINT32_MAX)) { 1979 *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; 1980 } else { 1981 lkey = mlx5_lookup_mempool_regs(mr_ctrl, repl, mp, addr); 1982 /* Can only fail if the address is not from the mempool. */ 1983 if (unlikely(lkey == UINT32_MAX)) 1984 return UINT32_MAX; 1985 } 1986 /* Update the most recently used entry. */ 1987 mr_ctrl->mru = mr_ctrl->head; 1988 /* Point to the next victim, the oldest. */ 1989 mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N; 1990 return lkey; 1991 } 1992 1993 uint32_t 1994 mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb) 1995 { 1996 struct rte_mempool *mp; 1997 struct mlx5_mprq_buf *buf; 1998 uint32_t lkey; 1999 uintptr_t addr = (uintptr_t)mb->buf_addr; 2000 struct mlx5_mr_share_cache *share_cache = 2001 container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache, 2002 dev_gen); 2003 struct mlx5_common_device *cdev = 2004 container_of(share_cache, struct mlx5_common_device, mr_scache); 2005 bool external, mprq, pinned = false; 2006 2007 /* Recover MPRQ mempool. */ 2008 external = RTE_MBUF_HAS_EXTBUF(mb); 2009 if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) { 2010 mprq = true; 2011 buf = mb->shinfo->fcb_opaque; 2012 mp = buf->mp; 2013 } else { 2014 mprq = false; 2015 mp = mlx5_mb2mp(mb); 2016 pinned = rte_pktmbuf_priv_flags(mp) & 2017 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF; 2018 } 2019 if (!external || mprq || pinned) { 2020 lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr); 2021 if (lkey != UINT32_MAX) 2022 return lkey; 2023 /* MPRQ is always registered. */ 2024 MLX5_ASSERT(!mprq); 2025 } 2026 /* Register pinned external memory if the mempool is not used for Rx. 
/**
 * Bottom-half lookup for the LKey of an mbuf data buffer.
 *
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mb
 *   Buffer to search the address of.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mb2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb)
{
	struct rte_mempool *mp;
	struct mlx5_mprq_buf *buf;
	uint32_t lkey;
	uintptr_t addr = (uintptr_t)mb->buf_addr;
	struct mlx5_mr_share_cache *share_cache =
		container_of(mr_ctrl->dev_gen_ptr, struct mlx5_mr_share_cache,
			     dev_gen);
	struct mlx5_common_device *cdev =
		container_of(share_cache, struct mlx5_common_device, mr_scache);
	bool external, mprq, pinned = false;

	/* Recover MPRQ mempool. */
	external = RTE_MBUF_HAS_EXTBUF(mb);
	if (external && mb->shinfo->free_cb == mlx5_mprq_buf_free_cb) {
		mprq = true;
		buf = mb->shinfo->fcb_opaque;
		mp = buf->mp;
	} else {
		mprq = false;
		mp = mlx5_mb2mp(mb);
		pinned = rte_pktmbuf_priv_flags(mp) &
			 RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF;
	}
	if (!external || mprq || pinned) {
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		if (lkey != UINT32_MAX)
			return lkey;
		/* MPRQ is always registered. */
		MLX5_ASSERT(!mprq);
	}
	/* Register pinned external memory if the mempool is not used for Rx. */
	if (cdev->config.mr_mempool_reg_en && pinned) {
		if (mlx5_mr_mempool_register(cdev, mp, true) < 0)
			return UINT32_MAX;
		lkey = mlx5_mr_mempool2mr_bh(mr_ctrl, mp, addr);
		MLX5_ASSERT(lkey != UINT32_MAX);
		return lkey;
	}
	/* Fall back to the generic mechanism in corner cases. */
	return mlx5_mr_addr2mr_bh(mr_ctrl, addr);
}
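
/*
 * Illustrative data-path usage (assumption, not a driver code path): a Tx
 * burst routine resolves the lkey for each mbuf it posts and drops the
 * packet if no memory region covers the buffer; "dseg" stands for a WQE
 * data segment and is a placeholder here.
 *
 *	uint32_t lkey = mlx5_mr_mb2mr_bh(mr_ctrl, mb);
 *
 *	if (unlikely(lkey == UINT32_MAX)) {
 *		// No MR covers the buffer and it cannot be registered
 *		// on the fly: drop or reject the packet.
 *		rte_pktmbuf_free(mb);
 *		return 0;
 *	}
 *	dseg->lkey = lkey;	// already big-endian, store as-is
 */
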