/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <stddef.h>

#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>

#include "mlx5_glue.h"
#include "mlx5_common_mp.h"
#include "mlx5_common_mr.h"
#include "mlx5_common_log.h"
#include "mlx5_malloc.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

/* Virtual memory range. */
struct mlx5_range {
	uintptr_t start;
	uintptr_t end;
};

/** Memory region for a mempool. */
struct mlx5_mempool_mr {
	struct mlx5_pmd_mr pmd_mr;
	uint32_t refcnt; /**< Number of mempools sharing this MR. */
};

/* Mempool registration. */
struct mlx5_mempool_reg {
	LIST_ENTRY(mlx5_mempool_reg) next;
	/** Registered mempool, used to designate registrations. */
	struct rte_mempool *mp;
	/** Memory regions for the address ranges of the mempool. */
	struct mlx5_mempool_mr *mrs;
	/** Number of memory regions. */
	unsigned int mrs_n;
};

/**
 * Expand B-tree table to a given size. Must not be called while holding
 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc().
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries for expansion.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_expand(struct mlx5_mr_btree *bt, int n)
{
	void *mem;
	int ret = 0;

	if (n <= bt->size)
		return ret;
	/*
	 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
	 * used inside if there's no room to expand. Because this is a quite
	 * rare case and a part of a very slow path, it is acceptable.
	 * Initially cache_bh[] will be given practically enough space and once
	 * it is expanded, expansion wouldn't be needed again ever.
	 */
	mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
			   n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
	if (mem == NULL) {
		/* Not an error, B-tree search will be skipped. */
		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
			(void *)bt);
		ret = -1;
	} else {
		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
		bt->table = mem;
		bt->size = n;
	}
	return ret;
}

/**
 * Look up LKey from given B-tree lookup table, store the last index and return
 * searched LKey.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param[out] idx
 *   Pointer to index. Even on search failure, returns index where it stops
 *   searching so that index can be used when inserting a new entry.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t n;
	uint16_t base = 0;

	MLX5_ASSERT(bt != NULL);
	lkp_tbl = *bt->table;
	n = bt->len;
	/* First entry must be NULL for comparison. */
	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
				    lkp_tbl[0].lkey == UINT32_MAX));
	/* Binary search. */
	do {
		register uint16_t delta = n >> 1;

		if (addr < lkp_tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	MLX5_ASSERT(addr >= lkp_tbl[base].start);
	*idx = base;
	if (addr < lkp_tbl[base].end)
		return lkp_tbl[base].lkey;
	/* Not found. */
	return UINT32_MAX;
}
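/*
 * Illustrative note (not part of the driver logic): assume a lookup table
 * with len = 4, e.g.
 *     idx 0: [0x0,    0x0)    lkey = UINT32_MAX  (sentinel)
 *     idx 1: [0x1000, 0x3000) lkey = 0x11
 *     idx 2: [0x5000, 0x6000) lkey = 0x22
 *     idx 3: [0x8000, 0x9000) lkey = 0x33
 * Looking up addr = 0x5800 narrows [base, base + n) from [0, 4) to [2, 4)
 * to [2, 3) and returns lkey 0x22 with *idx = 2. Looking up addr = 0x4000
 * stops at base = 1; since 0x4000 >= lkp_tbl[1].end, UINT32_MAX is returned
 * and *idx = 1, so mr_btree_insert() below would place the new entry at
 * slot *idx + 1 = 2.
 */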
/**
 * Insert an entry to B-tree lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param entry
 *   Pointer to new entry to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t idx = 0;
	size_t shift;

	MLX5_ASSERT(bt != NULL);
	MLX5_ASSERT(bt->len <= bt->size);
	MLX5_ASSERT(bt->len > 0);
	lkp_tbl = *bt->table;
	/* Find out the slot for insertion. */
	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
		DRV_LOG(DEBUG,
			"abort insertion to B-tree(%p): already exist at"
			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
		/* Already exist, return. */
		return 0;
	}
	/* If table is full, return error. */
	if (unlikely(bt->len == bt->size)) {
		bt->overflow = 1;
		return -1;
	}
	/* Insert entry. */
	++idx;
	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
	if (shift)
		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
	lkp_tbl[idx] = *entry;
	bt->len++;
	DRV_LOG(DEBUG,
		"inserted B-tree(%p)[%u],"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		(void *)bt, idx, entry->start, entry->end, entry->lkey);
	return 0;
}

/**
 * Initialize B-tree and allocate memory for lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries to allocate.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
{
	if (bt == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	MLX5_ASSERT(!bt->table && !bt->size);
	memset(bt, 0, sizeof(*bt));
	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
				sizeof(struct mr_cache_entry) * n,
				0, socket);
	if (bt->table == NULL) {
		rte_errno = ENOMEM;
		DRV_LOG(DEBUG,
			"failed to allocate memory for btree cache on socket "
			"%d", socket);
		return -rte_errno;
	}
	bt->size = n;
	/* First entry must be NULL for binary search. */
	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
		.lkey = UINT32_MAX,
	};
	DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	return 0;
}

/**
 * Free B-tree resources.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	mlx5_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}
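/*
 * Minimal usage sketch of the B-tree cache API above (illustrative only,
 * error handling trimmed; mr_btree_lookup()/mr_btree_insert() are local to
 * this file):
 *
 *	struct mlx5_mr_btree bt = { 0 };
 *	struct mr_cache_entry e = {
 *		.start = 0x1000, .end = 0x3000, .lkey = 0x11,
 *	};
 *	uint16_t idx;
 *
 *	mlx5_mr_btree_init(&bt, MLX5_MR_BTREE_CACHE_N, SOCKET_ID_ANY);
 *	mr_btree_insert(&bt, &e);
 *	mr_btree_lookup(&bt, &idx, 0x2000);	// returns 0x11
 *	mlx5_mr_btree_free(&bt);
 */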
/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	int idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
			" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Initialize per-queue MR control descriptor.
 *
 * @param mr_ctrl
 *   Pointer to MR control structure.
 * @param dev_gen_ptr
 *   Pointer to generation number of global cache.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
		  int socket)
{
	if (mr_ctrl == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Save pointer of global generation number to check memory event. */
	mr_ctrl->dev_gen_ptr = dev_gen_ptr;
	/* Initialize B-tree and allocate memory for bottom-half cache table. */
	return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N,
				  socket);
}

/**
 * Find virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * pmd_mr as there's only one chunk.
		 */
		entry->start = (uintptr_t)mr->pmd_mr.addr;
		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
	}
	return idx;
}
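/*
 * Illustrative sketch (queue structure is hypothetical): a PMD typically
 * embeds struct mlx5_mr_ctrl in each Rx/Tx queue and wires it to the shared
 * cache generation counter at queue setup time using mlx5_mr_ctrl_init()
 * above:
 *
 *	if (mlx5_mr_ctrl_init(&rxq->mr_ctrl, &share_cache->dev_gen,
 *			      socket) < 0)
 *		return -rte_errno;
 *
 * The datapath later compares mr_ctrl->cur_gen with *dev_gen_ptr to detect
 * a global cache flush request (see mlx5_mr_flush_local_cache() below).
 */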
/**
 * Insert an MR to the global B-tree cache. It may fail due to low-on-memory.
 * Then, this entry will have to be searched by mr_lookup_list() in
 * mlx5_mr_create() on miss.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr
 *   Pointer to MR to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mlx5_mr *mr)
{
	unsigned int n;

	DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
		(void *)mr, (void *)share_cache);
	for (n = 0; n < mr->ms_bmp_n; ) {
		struct mr_cache_entry entry;

		memset(&entry, 0, sizeof(entry));
		/* Find a contiguous chunk and advance the index. */
		n = mr_find_next_chunk(mr, &entry, n);
		if (!entry.end)
			break;
		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
			/*
			 * Overflowed, but the global table cannot be expanded
			 * because of deadlock.
			 */
			return -1;
		}
	}
	return 0;
}

/**
 * Look up address in the original global MR list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Found MR on match, NULL otherwise.
 */
struct mlx5_mr *
mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
		    struct mr_cache_entry *entry, uintptr_t addr)
{
	struct mlx5_mr *mr;

	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret;

			memset(&ret, 0, sizeof(ret));
			n = mr_find_next_chunk(mr, &ret, n);
			if (addr >= ret.start && addr < ret.end) {
				/* Found. */
				*entry = ret;
				return mr;
			}
		}
	}
	return NULL;
}

/**
 * Look up address on the global MR cache.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mr_cache_entry *entry, uintptr_t addr)
{
	uint16_t idx;
	uint32_t lkey = UINT32_MAX;
	struct mlx5_mr *mr;

	/*
	 * If the global cache has overflowed since it failed to expand the
	 * B-tree table, it can't have all the existing MRs. Then, the address
	 * has to be searched by traversing the original MR list instead, which
	 * is very slow path. Otherwise, the global cache is all inclusive.
	 */
	if (!unlikely(share_cache->cache.overflow)) {
		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
		if (lkey != UINT32_MAX)
			*entry = (*share_cache->cache.table)[idx];
	} else {
		/* Falling back to the slowest path. */
		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
		if (mr != NULL)
			lkey = entry->lkey;
	}
	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
					   addr < entry->end));
	return lkey;
}
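/*
 * Illustrative sketch: control-path callers resolve an address against the
 * global cache under the shared read lock, as mlx5_mr_create_secondary()
 * does later in this file:
 *
 *	struct mr_cache_entry entry;
 *	uint32_t lkey;
 *
 *	rte_rwlock_read_lock(&share_cache->rwlock);
 *	lkey = mlx5_mr_lookup_cache(share_cache, &entry, addr);
 *	rte_rwlock_read_unlock(&share_cache->rwlock);
 *	if (lkey == UINT32_MAX)
 *		... the address is not registered yet ...
 */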
/**
 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
 * can raise memory free event and the callback function will spin on the lock.
 *
 * @param mr
 *   Pointer to MR to free.
 */
void
mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
{
	if (mr == NULL)
		return;
	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
	dereg_mr_cb(&mr->pmd_mr);
	if (mr->ms_bmp != NULL)
		rte_bitmap_free(mr->ms_bmp);
	mlx5_free(mr);
}

void
mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr;

	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
	/* Flush cache to rebuild. */
	share_cache->cache.len = 1;
	share_cache->cache.overflow = 0;
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr)
		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
			return;
}

/**
 * Release resources of detached MRs having no online entry.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
static void
mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;
	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);

	/* Must be called from the primary process. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/*
	 * MR can't be freed with holding the lock because rte_free() could call
	 * memory free callback function. This will be a deadlock situation.
	 */
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach the whole free list and release it after unlocking. */
	free_list = share_cache->mr_free_list;
	LIST_INIT(&share_cache->mr_free_list);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Release resources. */
	mr_next = LIST_FIRST(&free_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	}
}

/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
static int
mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
			  const struct rte_memseg *ms, size_t len, void *arg)
{
	struct mr_find_contig_memsegs_data *data = arg;

	if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
		return 0;
	/* Found, save it and stop walking. */
	data->start = ms->addr_64;
	data->end = ms->addr_64 + len;
	data->msl = msl;
	return 1;
}
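/*
 * Illustrative sketch: the callback above is driven by
 * rte_memseg_contig_walk(), which stops walking as soon as the callback
 * returns a non-zero value. mlx5_mr_create_primary() below uses it as:
 *
 *	struct mr_find_contig_memsegs_data data = { .addr = addr, };
 *
 *	if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data))
 *		... no VA-contiguous chunk contains addr ...
 *	... otherwise [data.start, data.end) bounds the chunk and
 *	    data.msl is the owning memseg list ...
 */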
/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This API should be called from a secondary process, then a request is sent
 * to the primary process in order to create an MR for the address. As the
 * global MR list is on the shared memory, the following LKey lookup should
 * succeed unless the request fails.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create_secondary(void *pd __rte_unused,
			 struct mlx5_mp_id *mp_id,
			 struct mlx5_mr_share_cache *share_cache,
			 struct mr_cache_entry *entry, uintptr_t addr,
			 unsigned int mr_ext_memseg_en __rte_unused)
{
	int ret;

	DRV_LOG(DEBUG, "port %u requesting MR creation for address (%p)",
		mp_id->port_id, (void *)addr);
	ret = mlx5_mp_req_mr_create(mp_id, addr);
	if (ret) {
		DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)",
			(void *)addr);
		return UINT32_MAX;
	}
	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n"
		"  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
		(void *)addr, entry->start, entry->end, entry->lkey);
	return entry->lkey;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * Register entire virtually contiguous memory chunk around the address.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_create_primary(void *pd,
		       struct mlx5_mr_share_cache *share_cache,
		       struct mr_cache_entry *entry, uintptr_t addr,
		       unsigned int mr_ext_memseg_en)
{
	struct mr_find_contig_memsegs_data data = {.addr = addr, };
	struct mr_find_contig_memsegs_data data_re;
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;
	struct mlx5_mr *mr = NULL;
	int ms_idx_shift = -1;
	uint32_t bmp_size;
	void *bmp_mem;
	uint32_t ms_n;
	uint32_t n;
	size_t len;

	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
	/*
	 * Release detached MRs if any. This can't be called with holding either
	 * memory_hotplug_lock or share_cache->rwlock. MRs on the free list have
	 * been detached by the memory free event but it couldn't be released
	 * inside the callback due to deadlock. As a result, releasing resources
	 * is quite opportunistic.
	 */
	mlx5_mr_garbage_collect(share_cache);
	/*
	 * If enabled, find out a contiguous virtual address chunk in use, to
	 * which the given address belongs, in order to register maximum range.
	 * In the best case where mempools are not dynamically recreated and
	 * '--socket-mem' is specified as an EAL option, it is very likely to
	 * have only one MR(LKey) per a socket and per a hugepage-size even
	 * though the system memory is highly fragmented. As the whole memory
	 * chunk will be pinned by kernel, it can't be reused unless entire
	 * chunk is freed from EAL.
	 *
	 * If disabled, just register one memseg (page). Then, memory
	 * consumption will be minimized but it may drop performance if there
	 * are many MRs to lookup on the datapath.
	 */
	if (!mr_ext_memseg_en) {
		data.msl = rte_mem_virt2memseg_list((void *)addr);
		data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
		data.end = data.start + data.msl->page_sz;
	} else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
		DRV_LOG(WARNING,
			"Unable to find virtually contiguous"
			" chunk for address (%p)."
			" rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_nolock;
	}
alloc_resources:
	/* Addresses must be page-aligned. */
	MLX5_ASSERT(data.msl);
	MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
	MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
	msl = data.msl;
	ms = rte_mem_virt2memseg((void *)data.start, msl);
	len = data.end - data.start;
	MLX5_ASSERT(ms);
	MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
	/* Number of memsegs in the range. */
	ms_n = len / msl->page_sz;
	DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" page_sz=0x%" PRIx64 ", ms_n=%u",
		(void *)addr, data.start, data.end, msl->page_sz, ms_n);
	/* Size of memory for bitmap. */
	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
	if (mr == NULL) {
		DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = ENOMEM;
		goto err_nolock;
	}
	mr->msl = msl;
	/*
	 * Save the index of the first memseg and initialize memseg bitmap. To
	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
	 *	rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
	 */
	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
	if (mr->ms_bmp == NULL) {
		DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = EINVAL;
		goto err_nolock;
	}
	/*
	 * Should recheck whether the extended contiguous chunk is still valid.
	 * Because memory_hotplug_lock can't be held if there's any memory
	 * related calls in a critical path, resource allocation above can't be
	 * locked. If the memory has been changed at this point, try again with
	 * just single page. If not, go on with the big chunk atomically from
	 * here.
	 */
	rte_mcfg_mem_read_lock();
	data_re = data;
	if (len > msl->page_sz &&
	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
		DRV_LOG(DEBUG,
			"Unable to find virtually contiguous chunk for address "
			"(%p). rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_memlock;
	}
	if (data.start != data_re.start || data.end != data_re.end) {
		/*
		 * The extended contiguous chunk has been changed. Try again
		 * with single memseg instead.
		 */
		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
		data.end = data.start + msl->page_sz;
		rte_mcfg_mem_read_unlock();
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		goto alloc_resources;
	}
	MLX5_ASSERT(data.msl == data_re.msl);
	rte_rwlock_write_lock(&share_cache->rwlock);
	/*
	 * Check the address is really missing. If other thread already created
	 * one or it is not found due to overflow, abort and return.
	 */
	if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
		/*
		 * Insert to the global cache table. It may fail due to
		 * low-on-memory. Then, this entry will have to be searched
		 * here again.
		 */
		mr_btree_insert(&share_cache->cache, entry);
		DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort",
			(void *)addr);
		rte_rwlock_write_unlock(&share_cache->rwlock);
		rte_mcfg_mem_read_unlock();
		/*
		 * Must be unlocked before calling rte_free() because
		 * mlx5_mr_mem_event_free_cb() can be called inside.
		 */
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		return entry->lkey;
	}
	/*
	 * Trim start and end addresses for verbs MR. Set bits for registering
	 * memsegs but exclude already registered ones. Bitmap can be
	 * fragmented.
	 */
	for (n = 0; n < ms_n; ++n) {
		uintptr_t start;
		struct mr_cache_entry ret;

		memset(&ret, 0, sizeof(ret));
		start = data_re.start + n * msl->page_sz;
		/* Exclude memsegs already registered by other MRs. */
		if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
		    UINT32_MAX) {
			/*
			 * Start from the first unregistered memseg in the
			 * extended range.
			 */
			if (ms_idx_shift == -1) {
				mr->ms_base_idx += n;
				data.start = start;
				ms_idx_shift = n;
			}
			data.end = start + msl->page_sz;
			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
			++mr->ms_n;
		}
	}
	len = data.end - data.start;
	mr->ms_bmp_n = len / msl->page_sz;
	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
	/*
	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
	 * be called with holding the memory lock because it doesn't use
	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
	 * through mlx5_alloc_verbs_buf().
	 */
	share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(DEBUG, "Fail to create an MR for address (%p)",
			(void *)addr);
		rte_errno = EINVAL;
		goto err_mrlock;
	}
	MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start);
	MLX5_ASSERT(mr->pmd_mr.len);
	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
	DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n"
		"  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr, data.start, data.end,
		rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(share_cache, mr);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	rte_mcfg_mem_read_unlock();
	return entry->lkey;
err_mrlock:
	rte_rwlock_write_unlock(&share_cache->rwlock);
err_memlock:
	rte_mcfg_mem_read_unlock();
err_nolock:
	/*
	 * In case of error, as this can be called in a datapath, a warning
	 * message per an error is preferable instead. Must be unlocked before
	 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
	 * inside.
	 */
	mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	return UINT32_MAX;
}
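/*
 * Illustrative note on the trimming loop in mlx5_mr_create_primary() above
 * (not part of the driver): assume the extended range covers ms_n = 4 pages
 * and pages 0 and 2 are already covered by other MRs while pages 1 and 3
 * are not. The first unregistered page (n = 1) sets ms_idx_shift = 1,
 * advances mr->ms_base_idx by 1 and moves data.start to page 1. Bits 0
 * (page 1) and 2 (page 3) are set in mr->ms_bmp, mr->ms_n becomes 2,
 * data.end is the end of page 3 and mr->ms_bmp_n = 3. The resulting MR
 * spans pages 1..3 with a hole at page 2, which is why mr_find_next_chunk()
 * consults the bitmap when the MR is inserted into the global cache.
 */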
/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This can be called from primary and secondary processes.
 *
 * @param pd
 *   Pointer to pd handle of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create(void *pd, struct mlx5_mp_id *mp_id,
	       struct mlx5_mr_share_cache *share_cache,
	       struct mr_cache_entry *entry, uintptr_t addr,
	       unsigned int mr_ext_memseg_en)
{
	uint32_t ret = 0;

	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		ret = mlx5_mr_create_primary(pd, share_cache, entry,
					     addr, mr_ext_memseg_en);
		break;
	case RTE_PROC_SECONDARY:
		ret = mlx5_mr_create_secondary(pd, mp_id, share_cache, entry,
					       addr, mr_ext_memseg_en);
		break;
	default:
		break;
	}
	return ret;
}

/**
 * Look up address in the global MR cache table. If not found, create a new MR.
 * Insert the found/created entry to local bottom-half cache table.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this is not written.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_lookup_caches(void *pd, struct mlx5_mp_id *mp_id,
		 struct mlx5_mr_share_cache *share_cache,
		 struct mlx5_mr_ctrl *mr_ctrl,
		 struct mr_cache_entry *entry, uintptr_t addr,
		 unsigned int mr_ext_memseg_en)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	uint32_t lkey;
	uint16_t idx;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in the global cache. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
	if (lkey != UINT32_MAX) {
		/* Found. */
		*entry = (*share_cache->cache.table)[idx];
		rte_rwlock_read_unlock(&share_cache->rwlock);
		/*
		 * Update local cache. Even if it fails, return the found entry
		 * to update top-half cache. Next time, this entry will be found
		 * in the global cache.
		 */
		mr_btree_insert(bt, entry);
		return lkey;
	}
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/* First time to see the address? Create a new MR. */
	lkey = mlx5_mr_create(pd, mp_id, share_cache, entry, addr,
			      mr_ext_memseg_en);
	/*
	 * Update the local cache if successfully created a new global MR. Even
	 * if failed to create one, there's no action to take in this datapath
	 * code. As returning LKey is invalid, this will eventually make HW
	 * fail.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}
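/*
 * Illustrative sketch (helper name is hypothetical): datapath code keeps a
 * small linearly-searched top-half cache in mr_ctrl->cache[] and only falls
 * back to the bottom half defined below on a miss, roughly:
 *
 *	static __rte_always_inline uint32_t
 *	example_addr2mr(void *pd, struct mlx5_mp_id *mp_id,
 *			struct mlx5_mr_share_cache *share_cache,
 *			struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr,
 *			unsigned int mr_ext_memseg_en)
 *	{
 *		struct mr_cache_entry *mru = &mr_ctrl->cache[mr_ctrl->mru];
 *
 *		if (likely(addr >= mru->start && addr < mru->end))
 *			return mru->lkey;	// hit in the MRU entry
 *		return mlx5_mr_addr2mr_bh(pd, mp_id, share_cache, mr_ctrl,
 *					  addr, mr_ext_memseg_en);
 *	}
 *
 * The real fast path also scans the remaining cache[] entries before
 * calling the bottom half.
 */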
/**
 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if
 * misses, search in the global MR cache table and update the new entry to
 * per-queue local caches.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t mlx5_mr_addr2mr_bh(void *pd, struct mlx5_mp_id *mp_id,
			    struct mlx5_mr_share_cache *share_cache,
			    struct mlx5_mr_ctrl *mr_ctrl,
			    uintptr_t addr, unsigned int mr_ext_memseg_en)
{
	uint32_t lkey;
	uint16_t bh_idx = 0;
	/* Victim in top-half cache to replace with new entry. */
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		/*
		 * If missed in local lookup table, search in the global cache
		 * and local cache_bh[] will be updated inside if possible.
		 * Top-half cache entry will also be updated.
		 */
		lkey = mr_lookup_caches(pd, mp_id, share_cache, mr_ctrl,
					repl, addr, mr_ext_memseg_en);
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}

/**
 * Release all the created MRs and resources on global MR cache of a device
 * list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;

	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach from MR list and move to free list. */
	mr_next = LIST_FIRST(&share_cache->mr_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		LIST_REMOVE(mr, mr);
		LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
	}
	LIST_INIT(&share_cache->mr_list);
	/* Free global cache. */
	mlx5_mr_btree_free(&share_cache->cache);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Free all remaining MRs. */
	mlx5_mr_garbage_collect(share_cache);
}

/**
 * Flush all of the local cache entries.
 *
 * @param mr_ctrl
 *   Pointer to per-queue MR local cache.
 */
void
mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
{
	/* Reset the most-recently-used index. */
	mr_ctrl->mru = 0;
	/* Reset the linear search array. */
	mr_ctrl->head = 0;
	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
	/* Reset the B-tree table. */
	mr_ctrl->cache_bh.len = 1;
	mr_ctrl->cache_bh.overflow = 0;
	/* Update the generation number. */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}
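/*
 * Illustrative sketch: the generation number is how a global cache rebuild
 * (see mlx5_free_mr_by_addr() below) is propagated to per-queue caches. A
 * datapath lookup wrapper typically starts with:
 *
 *	if (unlikely(*mr_ctrl->dev_gen_ptr != mr_ctrl->cur_gen))
 *		mlx5_mr_flush_local_cache(mr_ctrl);
 *
 * so stale per-queue entries are dropped and the next bottom-half lookup
 * repopulates them from the already rebuilt global cache.
 */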
/**
 * Creates a memory region for external memory, that is, memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
		   mlx5_reg_mr_t reg_mr_cb)
{
	struct mlx5_mr *mr = NULL;

	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
			 RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(WARNING,
			"Fail to create MR for address (%p)",
			(void *)addr);
		mlx5_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark it is external memory. */
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		"  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}
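/*
 * Illustrative sketch (error handling trimmed): a driver's DMA-map path can
 * register an externally allocated buffer and publish it through the shared
 * cache roughly as follows:
 *
 *	struct mlx5_mr *mr;
 *
 *	mr = mlx5_create_mr_ext(pd, (uintptr_t)addr, len, SOCKET_ID_ANY,
 *				share_cache->reg_mr_cb);
 *	if (mr == NULL)
 *		return -1;
 *	rte_rwlock_write_lock(&share_cache->rwlock);
 *	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
 *	mlx5_mr_insert_cache(share_cache, mr);	// make it visible to lookups
 *	rte_rwlock_write_unlock(&share_cache->rwlock);
 */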
/**
 * Callback for memory free event. Iterate freed memsegs and check whether they
 * belong to an existing MR. If found, clear the bit from bitmap of MR. As a
 * result, the MR would be fragmented. If it becomes empty, the MR will be freed
 * later by mlx5_mr_garbage_collect(). Even if this callback is called from a
 * secondary process, the garbage collector will be called in primary process
 * as the secondary process can't call mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has to
 * be propagated to dataplane threads to flush the local caches.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param ibdev_name
 *   Name of ibv device.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
void
mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
		     const char *ibdev_name, const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg. */
		start = (uintptr_t)addr + i * msl->page_sz;
		mr = mlx5_mr_lookup_list(share_cache, &entry, start);
		if (mr == NULL)
			continue;
		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
		ms = rte_mem_virt2memseg((void *)start, msl);
		MLX5_ASSERT(ms != NULL);
		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		pos = ms_idx - mr->ms_base_idx;
		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
		MLX5_ASSERT(pos < mr->ms_bmp_n);
		DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
			ibdev_name, (void *)mr, pos, (void *)start);
		rte_bitmap_clear(mr->ms_bmp, pos);
		if (--mr->ms_n == 0) {
			LIST_REMOVE(mr, mr);
			LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
			DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
				ibdev_name, (void *)mr);
		}
		/*
		 * MR is fragmented or will be freed. The global cache must be
		 * rebuilt.
		 */
		rebuild = 1;
	}
	if (rebuild) {
		mlx5_mr_rebuild_cache(share_cache);
		/*
		 * No explicit wmb is needed after updating dev_gen due to
		 * store-release ordering in unlock that provides the
		 * implicit barrier at the software visible level.
		 */
		++share_cache->dev_gen;
		DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
			share_cache->dev_gen);
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
}

/**
 * Dump all the created MRs and the global cache entries.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	struct mlx5_mr *mr;
	int mr_n = 0;
	int chunk_n = 0;

	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
			mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey),
			mr->ms_n, mr->ms_bmp_n);
		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret = { 0, };

			n = mr_find_next_chunk(mr, &ret, n);
			if (!ret.end)
				break;
			DRV_LOG(DEBUG,
				"  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
				chunk_n++, ret.start, ret.end);
		}
	}
	DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache);
	mlx5_mr_btree_dump(&share_cache->cache);
	rte_rwlock_read_unlock(&share_cache->rwlock);
#endif
}

static int
mlx5_range_compare_start(const void *lhs, const void *rhs)
{
	const struct mlx5_range *r1 = lhs, *r2 = rhs;

	if (r1->start > r2->start)
		return 1;
	else if (r1->start < r2->start)
		return -1;
	return 0;
}

static void
mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque,
			      struct rte_mempool_memhdr *memhdr,
			      unsigned int idx)
{
	struct mlx5_range *ranges = opaque, *range = &ranges[idx];
	uint64_t page_size = rte_mem_page_size();

	RTE_SET_USED(mp);
	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
	range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size);
}
/**
 * Get VA-contiguous ranges of the mempool memory.
 * Each range start and end is aligned to the system page size.
 *
 * @param[in] mp
 *   Analyzed mempool.
 * @param[out] out
 *   Receives the ranges, caller must release it with free().
 * @param[out] out_n
 *   Receives the number of @p out elements.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_get_mempool_ranges(struct rte_mempool *mp, struct mlx5_range **out,
			unsigned int *out_n)
{
	struct mlx5_range *chunks;
	unsigned int chunks_n = mp->nb_mem_chunks, contig_n, i;

	/* Collect page-aligned memory ranges of the mempool. */
	chunks = calloc(sizeof(chunks[0]), chunks_n);
	if (chunks == NULL)
		return -1;
	rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, chunks);
	/* Merge adjacent chunks and place them at the beginning. */
	qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start);
	contig_n = 1;
	for (i = 1; i < chunks_n; i++)
		if (chunks[i - 1].end != chunks[i].start) {
			chunks[contig_n - 1].end = chunks[i - 1].end;
			chunks[contig_n] = chunks[i];
			contig_n++;
		}
	/* Extend the last contiguous chunk to the end of the mempool. */
	chunks[contig_n - 1].end = chunks[i - 1].end;
	*out = chunks;
	*out_n = contig_n;
	return 0;
}

/**
 * Analyze mempool memory to select memory ranges to register.
 *
 * @param[in] mp
 *   Mempool to analyze.
 * @param[out] out
 *   Receives memory ranges to register, aligned to the system page size.
 *   The caller must release them with free().
 * @param[out] out_n
 *   Receives the number of @p out items.
 * @param[out] share_hugepage
 *   Receives True if the entire pool resides within a single hugepage.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_mempool_reg_analyze(struct rte_mempool *mp, struct mlx5_range **out,
			 unsigned int *out_n, bool *share_hugepage)
{
	struct mlx5_range *ranges = NULL;
	unsigned int i, ranges_n = 0;
	struct rte_memseg_list *msl;

	if (mlx5_get_mempool_ranges(mp, &ranges, &ranges_n) < 0) {
		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
			mp->name);
		return -1;
	}
	/* Check if the hugepage of the pool can be shared. */
	*share_hugepage = false;
	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
	if (msl != NULL) {
		uint64_t hugepage_sz = 0;

		/* Check that all ranges are on pages of the same size. */
		for (i = 0; i < ranges_n; i++) {
			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
				break;
			hugepage_sz = msl->page_sz;
		}
		if (i == ranges_n) {
			/*
			 * If the entire pool is within one hugepage,
			 * combine all ranges into one of the hugepage size.
			 */
			uintptr_t reg_start = ranges[0].start;
			uintptr_t reg_end = ranges[ranges_n - 1].end;
			uintptr_t hugepage_start =
				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
			if (reg_end < hugepage_end) {
				ranges[0].start = hugepage_start;
				ranges[0].end = hugepage_end;
				ranges_n = 1;
				*share_hugepage = true;
			}
		}
	}
	*out = ranges;
	*out_n = ranges_n;
	return 0;
}
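/*
 * Illustrative note on mlx5_get_mempool_ranges() above (not part of the
 * driver): with 4 KiB pages and three mempool chunks whose page-aligned
 * ranges sort to
 *     [0x100000, 0x103000), [0x103000, 0x105000), [0x200000, 0x201000)
 * the first two are adjacent and get merged, so the function returns two
 * ranges: [0x100000, 0x105000) and [0x200000, 0x201000). Each output range
 * then becomes one MR candidate in mlx5_mr_mempool_register_primary(),
 * unless mlx5_mempool_reg_analyze() collapses everything into a single
 * hugepage-sized range.
 */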
/** Create a registration object for the mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n)
{
	struct mlx5_mempool_reg *mpr = NULL;

	mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			  sizeof(*mpr) + mrs_n * sizeof(mpr->mrs[0]),
			  RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (mpr == NULL) {
		DRV_LOG(ERR, "Cannot allocate mempool %s registration object",
			mp->name);
		return NULL;
	}
	mpr->mp = mp;
	mpr->mrs = (struct mlx5_mempool_mr *)(mpr + 1);
	mpr->mrs_n = mrs_n;
	return mpr;
}

/**
 * Destroy a mempool registration object.
 *
 * @param standalone
 *   Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
 */
static void
mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mempool_reg *mpr, bool standalone)
{
	if (standalone) {
		unsigned int i;

		for (i = 0; i < mpr->mrs_n; i++)
			share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
	}
	mlx5_free(mpr);
}

/** Find registration object of a mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache,
			struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;

	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp)
			break;
	return mpr;
}

/** Increment reference counters of MRs used in the registration. */
static void
mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++)
		__atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED);
}

/**
 * Decrement reference counters of MRs used in the registration.
 *
 * @return True if no more references to @p mpr MRs exist, False otherwise.
 */
static bool
mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;
	bool ret = false;

	for (i = 0; i < mpr->mrs_n; i++)
		ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1,
					  __ATOMIC_RELAXED) == 0;
	return ret;
}

static int
mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
				 void *pd, struct rte_mempool *mp)
{
	struct mlx5_range *ranges = NULL;
	struct mlx5_mempool_reg *mpr, *new_mpr;
	unsigned int i, ranges_n;
	bool share_hugepage;
	int ret = -1;

	/* Early check to avoid unnecessary creation of MRs. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		rte_errno = EEXIST;
		goto exit;
	}
	if (mlx5_mempool_reg_analyze(mp, &ranges, &ranges_n,
				     &share_hugepage) < 0) {
		DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name);
		rte_errno = ENOMEM;
		goto exit;
	}
	new_mpr = mlx5_mempool_reg_create(mp, ranges_n);
	if (new_mpr == NULL) {
		DRV_LOG(ERR,
			"Cannot create a registration object for mempool %s in PD %p",
			mp->name, pd);
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * If the entire mempool fits in a single hugepage, the MR for this
	 * hugepage can be shared across mempools that also fit in it.
	 */
	if (share_hugepage) {
		rte_rwlock_write_lock(&share_cache->rwlock);
		LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) {
			if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start)
				break;
		}
		if (mpr != NULL) {
			new_mpr->mrs = mpr->mrs;
			mlx5_mempool_reg_attach(new_mpr);
			LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
					 new_mpr, next);
		}
		rte_rwlock_write_unlock(&share_cache->rwlock);
		if (mpr != NULL) {
			DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s",
				mpr->mrs[0].pmd_mr.lkey, pd, mp->name,
				mpr->mp->name);
			ret = 0;
			goto exit;
		}
	}
	for (i = 0; i < ranges_n; i++) {
		struct mlx5_mempool_mr *mr = &new_mpr->mrs[i];
		const struct mlx5_range *range = &ranges[i];
		size_t len = range->end - range->start;

		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
					   &mr->pmd_mr) < 0) {
			DRV_LOG(ERR,
				"Failed to create an MR in PD %p for address range "
				"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
				pd, range->start, range->end, len, mp->name);
			break;
		}
		DRV_LOG(DEBUG,
			"Created a new MR %#x in PD %p for address range "
			"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
			mr->pmd_mr.lkey, pd, range->start, range->end, len,
			mp->name);
	}
	if (i != ranges_n) {
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EINVAL;
		goto exit;
	}
	/* Concurrent registration is not supposed to happen. */
	rte_rwlock_write_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr == NULL) {
		mlx5_mempool_reg_attach(new_mpr);
		LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
				 new_mpr, next);
		ret = 0;
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EEXIST;
		goto exit;
	}
exit:
	free(ranges);
	return ret;
}

static int
mlx5_mr_mempool_register_secondary(struct mlx5_mr_share_cache *share_cache,
				   void *pd, struct rte_mempool *mp,
				   struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, pd, mp, true);
}
/**
 * Register the memory of a mempool in the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param pd
 *   Protection domain object.
 * @param mp
 *   Mempool to register.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_register(struct mlx5_mr_share_cache *share_cache, void *pd,
			 struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_register_primary(share_cache, pd, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_register_secondary(share_cache, pd, mp,
							  mp_id);
	default:
		return -1;
	}
}

static int
mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
				   struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;
	bool standalone = false;

	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp) {
			LIST_REMOVE(mpr, next);
			standalone = mlx5_mempool_reg_detach(mpr);
			if (standalone)
				/*
				 * The unlock operation below provides a memory
				 * barrier due to its store-release semantics.
				 */
				++share_cache->dev_gen;
			break;
		}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		rte_errno = ENOENT;
		return -1;
	}
	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
	return 0;
}

static int
mlx5_mr_mempool_unregister_secondary(struct mlx5_mr_share_cache *share_cache,
				     struct rte_mempool *mp,
				     struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, NULL, mp, false);
}

/**
 * Unregister the memory of a mempool from the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param mp
 *   Mempool to unregister.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_unregister(struct mlx5_mr_share_cache *share_cache,
			   struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_unregister_primary(share_cache, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_unregister_secondary(share_cache, mp,
							    mp_id);
	default:
		return -1;
	}
}
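/*
 * Illustrative sketch (error handling trimmed): a PMD typically registers
 * the mempools it uses for Rx at device/queue start and unregisters them on
 * stop, tolerating repeated registrations:
 *
 *	if (mlx5_mr_mempool_register(share_cache, pd, mp, mp_id) < 0 &&
 *	    rte_errno != EEXIST)
 *		return -rte_errno;	// EEXIST means "already registered"
 *	...
 *	mlx5_mr_mempool_unregister(share_cache, mp, mp_id);
 *
 * Registration is refcounted per MR, so mempools sharing one hugepage MR
 * can be registered and unregistered independently.
 */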
/**
 * Look up an MR key by an address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_addr = (uintptr_t)mr->addr;

		if (mr_addr <= addr) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_addr;
			entry->end = mr_addr + mr->len;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update bottom-half cache from the list of mempool registrations.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update local cache. Even if it fails, return the found entry
	 * to update top-half cache. Next time, this entry will be found
	 * in the global cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_share_cache *share_cache,
		      struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint16_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(share_cache, mr_ctrl, repl,
						mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}
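/*
 * Illustrative sketch (hypothetical helper): when mempool registration is
 * enabled, a datapath can resolve an mbuf's LKey through the bottom half
 * above using the mbuf's pool:
 *
 *	static uint32_t
 *	example_mb2mr_bh(struct mlx5_mr_share_cache *share_cache,
 *			 struct mlx5_mr_ctrl *mr_ctrl, struct rte_mbuf *mb)
 *	{
 *		return mlx5_mr_mempool2mr_bh(share_cache, mr_ctrl, mb->pool,
 *					     (uintptr_t)mb->buf_addr);
 *	}
 */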