/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <stddef.h>

#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>

#include "mlx5_glue.h"
#include "mlx5_common_mp.h"
#include "mlx5_common_mr.h"
#include "mlx5_common_log.h"
#include "mlx5_malloc.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

/* Virtual memory range. */
struct mlx5_range {
	uintptr_t start;
	uintptr_t end;
};

/** Memory region for a mempool. */
struct mlx5_mempool_mr {
	struct mlx5_pmd_mr pmd_mr;
	uint32_t refcnt; /**< Number of mempools sharing this MR. */
};

/* Mempool registration. */
struct mlx5_mempool_reg {
	LIST_ENTRY(mlx5_mempool_reg) next;
	/** Registered mempool, used to designate registrations. */
	struct rte_mempool *mp;
	/** Memory regions for the address ranges of the mempool. */
	struct mlx5_mempool_mr *mrs;
	/** Number of memory regions. */
	unsigned int mrs_n;
};

/**
 * Expand B-tree table to a given size. Can't be called while holding
 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc().
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries for expansion.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_expand(struct mlx5_mr_btree *bt, int n)
{
	void *mem;
	int ret = 0;

	if (n <= bt->size)
		return ret;
	/*
	 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
	 * used inside if there's no room to expand. Because this is a quite
	 * rare case and part of a very slow path, it is acceptable.
	 * Initially cache_bh[] will be given practically enough space and once
	 * it is expanded, expansion wouldn't be needed again ever.
	 */
	mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
			   n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
	if (mem == NULL) {
		/* Not an error, B-tree search will be skipped. */
		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
			(void *)bt);
		ret = -1;
	} else {
		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
		bt->table = mem;
		bt->size = n;
	}
	return ret;
}

/**
 * Look up LKey from given B-tree lookup table, store the last index and return
 * searched LKey.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param[out] idx
 *   Pointer to index. Even on search failure, returns index where it stops
 *   searching so that index can be used when inserting a new entry.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t n;
	uint16_t base = 0;

	MLX5_ASSERT(bt != NULL);
	lkp_tbl = *bt->table;
	n = bt->len;
	/* First entry must be NULL for comparison. */
	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
				    lkp_tbl[0].lkey == UINT32_MAX));
	/* Binary search. */
	do {
		register uint16_t delta = n >> 1;

		if (addr < lkp_tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	MLX5_ASSERT(addr >= lkp_tbl[base].start);
	*idx = base;
	if (addr < lkp_tbl[base].end)
		return lkp_tbl[base].lkey;
	/* Not found. */
	return UINT32_MAX;
}
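/*
 * Illustration of the lookup table semantics (hypothetical values, not part
 * of the driver logic): entry 0 is a sentinel with start == 0 and
 * lkey == UINT32_MAX, the remaining entries are sorted by 'start' and cover
 * half-open [start, end) ranges. Assuming a table such as
 *
 *	idx  start    end      lkey
 *	0    0x0      0x0      UINT32_MAX   (sentinel)
 *	1    0x10000  0x20000  0x11
 *	2    0x30000  0x40000  0x22
 *
 * mr_btree_lookup(bt, &idx, 0x15000) returns 0x11 with idx == 1, while
 * mr_btree_lookup(bt, &idx, 0x25000) returns UINT32_MAX with idx == 1,
 * the slot after which a new entry covering 0x25000 would be inserted.
 */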
/**
 * Insert an entry to B-tree lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param entry
 *   Pointer to new entry to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t idx = 0;
	size_t shift;

	MLX5_ASSERT(bt != NULL);
	MLX5_ASSERT(bt->len <= bt->size);
	MLX5_ASSERT(bt->len > 0);
	lkp_tbl = *bt->table;
	/* Find out the slot for insertion. */
	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
		DRV_LOG(DEBUG,
			"abort insertion to B-tree(%p): already exist at"
			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
		/* Already exists, return. */
		return 0;
	}
	/* If table is full, return error. */
	if (unlikely(bt->len == bt->size)) {
		bt->overflow = 1;
		return -1;
	}
	/* Insert entry. */
	++idx;
	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
	if (shift)
		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
	lkp_tbl[idx] = *entry;
	bt->len++;
	DRV_LOG(DEBUG,
		"inserted B-tree(%p)[%u],"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		(void *)bt, idx, entry->start, entry->end, entry->lkey);
	return 0;
}

/**
 * Initialize B-tree and allocate memory for lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries to allocate.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
{
	if (bt == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	MLX5_ASSERT(!bt->table && !bt->size);
	memset(bt, 0, sizeof(*bt));
	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
				sizeof(struct mr_cache_entry) * n,
				0, socket);
	if (bt->table == NULL) {
		rte_errno = ENOMEM;
		DRV_LOG(DEBUG,
			"failed to allocate memory for btree cache on socket "
			"%d", socket);
		return -rte_errno;
	}
	bt->size = n;
	/* First entry must be NULL for binary search. */
	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
		.lkey = UINT32_MAX,
	};
	DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	return 0;
}

/**
 * Free B-tree resources.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	mlx5_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}
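/*
 * A minimal sketch of how these helpers fit together (illustrative only;
 * 'n', 'socket', 'base', 'len', 'lkey' and 'addr' are placeholders and error
 * handling is elided). A lookup miss returns UINT32_MAX and leaves 'idx' at
 * the slot after which a matching entry would be inserted:
 *
 *	struct mlx5_mr_btree bt = { 0 };
 *	struct mr_cache_entry e = { .start = base, .end = base + len,
 *				    .lkey = rte_cpu_to_be_32(lkey) };
 *	uint16_t idx;
 *
 *	mlx5_mr_btree_init(&bt, n, socket);
 *	mr_btree_insert(&bt, &e);
 *	uint32_t found = mr_btree_lookup(&bt, &idx, addr);
 *	mlx5_mr_btree_free(&bt);
 */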
/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	int idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
			" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Initialize per-queue MR control descriptor.
 *
 * @param mr_ctrl
 *   Pointer to MR control structure.
 * @param dev_gen_ptr
 *   Pointer to generation number of global cache.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_ctrl_init(struct mlx5_mr_ctrl *mr_ctrl, uint32_t *dev_gen_ptr,
		  int socket)
{
	if (mr_ctrl == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	/* Save pointer of global generation number to check memory event. */
	mr_ctrl->dev_gen_ptr = dev_gen_ptr;
	/* Initialize B-tree and allocate memory for bottom-half cache table. */
	return mlx5_mr_btree_init(&mr_ctrl->cache_bh, MLX5_MR_BTREE_CACHE_N,
				  socket);
}

/**
 * Find virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * pmd_mr as there's only one chunk.
		 */
		entry->start = (uintptr_t)mr->pmd_mr.addr;
		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
	}
	return idx;
}
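/*
 * An illustrative walk of the loop above (hypothetical MR): with a memseg
 * bitmap of 1 1 0 1, the first call with base_idx == 0 reports the chunk
 * covering memsegs 0-1 and returns 2; calling again with base_idx == 2 skips
 * the cleared bit, reports the chunk made of memseg 3 alone and returns 4,
 * which ends the iteration in callers such as mlx5_mr_insert_cache().
 */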
/**
 * Insert an MR to the global B-tree cache. It may fail due to low-on-memory.
 * Then, this entry will have to be searched by mlx5_mr_lookup_list() in
 * mlx5_mr_create() on miss.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr
 *   Pointer to MR to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mlx5_mr *mr)
{
	unsigned int n;

	DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
		(void *)mr, (void *)share_cache);
	for (n = 0; n < mr->ms_bmp_n; ) {
		struct mr_cache_entry entry;

		memset(&entry, 0, sizeof(entry));
		/* Find a contiguous chunk and advance the index. */
		n = mr_find_next_chunk(mr, &entry, n);
		if (!entry.end)
			break;
		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
			/*
			 * Overflowed, but the global table cannot be expanded
			 * because of deadlock.
			 */
			return -1;
		}
	}
	return 0;
}

/**
 * Look up address in the original global MR list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Found MR on match, NULL otherwise.
 */
struct mlx5_mr *
mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
		    struct mr_cache_entry *entry, uintptr_t addr)
{
	struct mlx5_mr *mr;

	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret;

			memset(&ret, 0, sizeof(ret));
			n = mr_find_next_chunk(mr, &ret, n);
			if (addr >= ret.start && addr < ret.end) {
				/* Found. */
				*entry = ret;
				return mr;
			}
		}
	}
	return NULL;
}

/**
 * Look up address in the global MR cache.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mr_cache_entry *entry, uintptr_t addr)
{
	uint16_t idx;
	uint32_t lkey = UINT32_MAX;
	struct mlx5_mr *mr;

	/*
	 * If the global cache has overflowed since it failed to expand the
	 * B-tree table, it can't have all the existing MRs. Then, the address
	 * has to be searched by traversing the original MR list instead, which
	 * is a very slow path. Otherwise, the global cache is all inclusive.
	 */
	if (!unlikely(share_cache->cache.overflow)) {
		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
		if (lkey != UINT32_MAX)
			*entry = (*share_cache->cache.table)[idx];
	} else {
		/* Falling back to the slowest path. */
		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
		if (mr != NULL)
			lkey = entry->lkey;
	}
	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
					   addr < entry->end));
	return lkey;
}
/**
 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
 * can raise memory free event and the callback function will spin on the lock.
 *
 * @param mr
 *   Pointer to MR to free.
 */
void
mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
{
	if (mr == NULL)
		return;
	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
	dereg_mr_cb(&mr->pmd_mr);
	if (mr->ms_bmp != NULL)
		rte_bitmap_free(mr->ms_bmp);
	mlx5_free(mr);
}

void
mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr;

	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
	/* Flush cache to rebuild. */
	share_cache->cache.len = 1;
	share_cache->cache.overflow = 0;
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr)
		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
			return;
}

/**
 * Release resources of detached MR having no online entry.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
static void
mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;
	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);

	/* Must be called from the primary process. */
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/*
	 * MR can't be freed while holding the lock because rte_free() could
	 * call the memory free callback function. This would be a deadlock
	 * situation.
	 */
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach the whole free list and release it after unlocking. */
	free_list = share_cache->mr_free_list;
	LIST_INIT(&share_cache->mr_free_list);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Release resources. */
	mr_next = LIST_FIRST(&free_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	}
}

/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
static int
mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
			  const struct rte_memseg *ms, size_t len, void *arg)
{
	struct mr_find_contig_memsegs_data *data = arg;

	if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
		return 0;
	/* Found, save it and stop walking. */
	data->start = ms->addr_64;
	data->end = ms->addr_64 + len;
	data->msl = msl;
	return 1;
}
/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This API should be called from a secondary process; a request is then sent
 * to the primary process in order to create an MR for the address. As the
 * global MR list is in shared memory, the following LKey lookup should
 * succeed unless the request fails.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create_secondary(void *pd __rte_unused,
			 struct mlx5_mp_id *mp_id,
			 struct mlx5_mr_share_cache *share_cache,
			 struct mr_cache_entry *entry, uintptr_t addr,
			 unsigned int mr_ext_memseg_en __rte_unused)
{
	int ret;

	DRV_LOG(DEBUG, "port %u requesting MR creation for address (%p)",
		mp_id->port_id, (void *)addr);
	ret = mlx5_mp_req_mr_create(mp_id, addr);
	if (ret) {
		DRV_LOG(DEBUG, "Fail to request MR creation for address (%p)",
			(void *)addr);
		return UINT32_MAX;
	}
	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
		(void *)addr, entry->start, entry->end, entry->lkey);
	return entry->lkey;
}
/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * Register entire virtually contiguous memory chunk around the address.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_create_primary(void *pd,
		       struct mlx5_mr_share_cache *share_cache,
		       struct mr_cache_entry *entry, uintptr_t addr,
		       unsigned int mr_ext_memseg_en)
{
	struct mr_find_contig_memsegs_data data = {.addr = addr, };
	struct mr_find_contig_memsegs_data data_re;
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;
	struct mlx5_mr *mr = NULL;
	int ms_idx_shift = -1;
	uint32_t bmp_size;
	void *bmp_mem;
	uint32_t ms_n;
	uint32_t n;
	size_t len;

	DRV_LOG(DEBUG, "Creating a MR using address (%p)", (void *)addr);
	/*
	 * Release detached MRs if any. This can't be called while holding
	 * either memory_hotplug_lock or share_cache->rwlock. MRs on the free
	 * list have been detached by the memory free event but couldn't be
	 * released inside the callback due to deadlock. As a result, releasing
	 * resources is quite opportunistic.
	 */
	mlx5_mr_garbage_collect(share_cache);
	/*
	 * If enabled, find out a contiguous virtual address chunk in use, to
	 * which the given address belongs, in order to register maximum range.
	 * In the best case where mempools are not dynamically recreated and
	 * '--socket-mem' is specified as an EAL option, it is very likely to
	 * have only one MR(LKey) per a socket and per a hugepage-size even
	 * though the system memory is highly fragmented. As the whole memory
	 * chunk will be pinned by kernel, it can't be reused unless entire
	 * chunk is freed from EAL.
	 *
	 * If disabled, just register one memseg (page). Then, memory
	 * consumption will be minimized but it may drop performance if there
	 * are many MRs to lookup on the datapath.
	 */
	if (!mr_ext_memseg_en) {
		data.msl = rte_mem_virt2memseg_list((void *)addr);
		data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
		data.end = data.start + data.msl->page_sz;
	} else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
		DRV_LOG(WARNING,
			"Unable to find virtually contiguous"
			" chunk for address (%p)."
			" rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_nolock;
	}
alloc_resources:
	/* Addresses must be page-aligned. */
	MLX5_ASSERT(data.msl);
	MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
	MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
	msl = data.msl;
	ms = rte_mem_virt2memseg((void *)data.start, msl);
	len = data.end - data.start;
	MLX5_ASSERT(ms);
	MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
	/* Number of memsegs in the range. */
	ms_n = len / msl->page_sz;
	DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" page_sz=0x%" PRIx64 ", ms_n=%u",
		(void *)addr, data.start, data.end, msl->page_sz, ms_n);
	/* Size of memory for bitmap. */
	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
	if (mr == NULL) {
		DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = ENOMEM;
		goto err_nolock;
	}
	mr->msl = msl;
	/*
	 * Save the index of the first memseg and initialize memseg bitmap. To
	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
	 *	rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
	 */
	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
	if (mr->ms_bmp == NULL) {
		DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = EINVAL;
		goto err_nolock;
	}
	/*
	 * Should recheck whether the extended contiguous chunk is still valid.
	 * Because memory_hotplug_lock can't be held if there's any memory
	 * related calls in a critical path, resource allocation above can't be
	 * locked. If the memory has been changed at this point, try again with
	 * just single page. If not, go on with the big chunk atomically from
	 * here.
	 */
	rte_mcfg_mem_read_lock();
	data_re = data;
	if (len > msl->page_sz &&
	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
		DRV_LOG(DEBUG,
			"Unable to find virtually contiguous chunk for address "
			"(%p). rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_memlock;
	}
	if (data.start != data_re.start || data.end != data_re.end) {
		/*
		 * The extended contiguous chunk has been changed. Try again
		 * with single memseg instead.
		 */
		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
		data.end = data.start + msl->page_sz;
		rte_mcfg_mem_read_unlock();
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		goto alloc_resources;
	}
	MLX5_ASSERT(data.msl == data_re.msl);
	rte_rwlock_write_lock(&share_cache->rwlock);
	/*
	 * Check the address is really missing. If another thread already
	 * created one or it is not found due to overflow, abort and return.
	 */
	if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
		/*
		 * Insert to the global cache table. It may fail due to
		 * low-on-memory. Then, this entry will have to be searched
		 * here again.
		 */
		mr_btree_insert(&share_cache->cache, entry);
		DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort",
			(void *)addr);
		rte_rwlock_write_unlock(&share_cache->rwlock);
		rte_mcfg_mem_read_unlock();
		/*
		 * Must be unlocked before calling rte_free() because
		 * mlx5_mr_mem_event_free_cb() can be called inside.
		 */
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		return entry->lkey;
	}
	/*
	 * Trim start and end addresses for verbs MR. Set bits for registering
	 * memsegs but exclude already registered ones. Bitmap can be
	 * fragmented.
	 */
	for (n = 0; n < ms_n; ++n) {
		uintptr_t start;
		struct mr_cache_entry ret;

		memset(&ret, 0, sizeof(ret));
		start = data_re.start + n * msl->page_sz;
		/* Exclude memsegs already registered by other MRs. */
		if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
		    UINT32_MAX) {
			/*
			 * Start from the first unregistered memseg in the
			 * extended range.
			 */
			if (ms_idx_shift == -1) {
				mr->ms_base_idx += n;
				data.start = start;
				ms_idx_shift = n;
			}
			data.end = start + msl->page_sz;
			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
			++mr->ms_n;
		}
	}
	len = data.end - data.start;
	mr->ms_bmp_n = len / msl->page_sz;
	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
	/*
	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
	 * be called while holding the memory lock because it doesn't use
	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
	 * through mlx5_alloc_verbs_buf().
	 */
	share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(DEBUG, "Fail to create an MR for address (%p)",
			(void *)addr);
		rte_errno = EINVAL;
		goto err_mrlock;
	}
	MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start);
	MLX5_ASSERT(mr->pmd_mr.len);
	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
	DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr, data.start, data.end,
		rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(share_cache, mr);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	rte_mcfg_mem_read_unlock();
	return entry->lkey;
err_mrlock:
	rte_rwlock_write_unlock(&share_cache->rwlock);
err_memlock:
	rte_mcfg_mem_read_unlock();
err_nolock:
	/*
	 * In case of error, as this can be called in a datapath, a warning
	 * message per error is preferable instead. Must be unlocked before
	 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
	 * inside.
	 */
	mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	return UINT32_MAX;
}
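/*
 * Illustrative scenario for the function above (hypothetical addresses, not
 * derived from the code): with mr_ext_memseg_en enabled, a miss on address
 * 0x7f0000201000 lying inside a virtually contiguous run of 2 MB memsegs
 * spanning [0x7f0000000000, 0x7f0000800000) results in one MR covering the
 * whole run (minus any memsegs already owned by other MRs), so later misses
 * in that range are resolved from the cache. With the flag disabled, only
 * the single 2 MB page [0x7f0000200000, 0x7f0000400000) is registered.
 */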
/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This can be called from primary and secondary process.
 *
 * @param pd
 *   Pointer to pd handle of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create(void *pd, struct mlx5_mp_id *mp_id,
	       struct mlx5_mr_share_cache *share_cache,
	       struct mr_cache_entry *entry, uintptr_t addr,
	       unsigned int mr_ext_memseg_en)
{
	uint32_t ret = 0;

	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		ret = mlx5_mr_create_primary(pd, share_cache, entry,
					     addr, mr_ext_memseg_en);
		break;
	case RTE_PROC_SECONDARY:
		ret = mlx5_mr_create_secondary(pd, mp_id, share_cache, entry,
					       addr, mr_ext_memseg_en);
		break;
	default:
		break;
	}
	return ret;
}

/**
 * Look up address in the global MR cache table. If not found, create a new MR.
 * Insert the found/created entry to local bottom-half cache table.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this is not written.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_lookup_caches(void *pd, struct mlx5_mp_id *mp_id,
		 struct mlx5_mr_share_cache *share_cache,
		 struct mlx5_mr_ctrl *mr_ctrl,
		 struct mr_cache_entry *entry, uintptr_t addr,
		 unsigned int mr_ext_memseg_en)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	uint32_t lkey;
	uint16_t idx;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in the global cache. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
	if (lkey != UINT32_MAX) {
		/* Found. */
		*entry = (*share_cache->cache.table)[idx];
		rte_rwlock_read_unlock(&share_cache->rwlock);
		/*
		 * Update local cache. Even if it fails, return the found entry
		 * to update top-half cache. Next time, this entry will be found
		 * in the global cache.
		 */
		mr_btree_insert(bt, entry);
		return lkey;
	}
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/* First time to see the address? Create a new MR. */
	lkey = mlx5_mr_create(pd, mp_id, share_cache, entry, addr,
			      mr_ext_memseg_en);
	/*
	 * Update the local cache if successfully created a new global MR. Even
	 * if failed to create one, there's no action to take in this datapath
	 * code. As returning LKey is invalid, this will eventually make HW
	 * fail.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}
/**
 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if
 * it misses, search in the global MR cache table and update the new entry to
 * per-queue local caches.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t mlx5_mr_addr2mr_bh(void *pd, struct mlx5_mp_id *mp_id,
			    struct mlx5_mr_share_cache *share_cache,
			    struct mlx5_mr_ctrl *mr_ctrl,
			    uintptr_t addr, unsigned int mr_ext_memseg_en)
{
	uint32_t lkey;
	uint16_t bh_idx = 0;
	/* Victim in top-half cache to replace with new entry. */
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		/*
		 * If missed in local lookup table, search in the global cache
		 * and local cache_bh[] will be updated inside if possible.
		 * Top-half cache entry will also be updated.
		 */
		lkey = mr_lookup_caches(pd, mp_id, share_cache, mr_ctrl,
					repl, addr, mr_ext_memseg_en);
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}

/**
 * Release all the created MRs and resources of the global MR cache of a
 * device.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;

	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach from MR list and move to free list. */
	mr_next = LIST_FIRST(&share_cache->mr_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		LIST_REMOVE(mr, mr);
		LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
	}
	LIST_INIT(&share_cache->mr_list);
	/* Free global cache. */
	mlx5_mr_btree_free(&share_cache->cache);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Free all remaining MRs. */
	mlx5_mr_garbage_collect(share_cache);
}

/**
 * Initialize global MR cache of a device.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket)
{
	/* Set the reg_mr and dereg_mr callback functions. */
	mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb,
			      &share_cache->dereg_mr_cb);
	rte_rwlock_init(&share_cache->rwlock);
	/* Initialize B-tree and allocate memory for global MR cache table. */
	return mlx5_mr_btree_init(&share_cache->cache,
				  MLX5_MR_BTREE_CACHE_N * 2, socket);
}
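/*
 * A minimal usage sketch of the pieces above (illustrative only; 'priv' and
 * 'rxq' are placeholders and error handling is elided):
 *
 *	// Control path: one shared cache per device/PD, one control
 *	// structure per queue.
 *	mlx5_mr_create_cache(&priv->share_cache, socket);
 *	mlx5_mr_ctrl_init(&rxq->mr_ctrl, &priv->share_cache.dev_gen, socket);
 *
 *	// Datapath slow path: resolve an LKey on a top-half cache miss.
 *	uint32_t lkey = mlx5_mr_addr2mr_bh(priv->pd, &priv->mp_id,
 *					   &priv->share_cache, &rxq->mr_ctrl,
 *					   addr, priv->mr_ext_memseg_en);
 */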
/**
 * Flush all of the local cache entries.
 *
 * @param mr_ctrl
 *   Pointer to per-queue MR local cache.
 */
void
mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
{
	/* Reset the most-recently-used index. */
	mr_ctrl->mru = 0;
	/* Reset the linear search array. */
	mr_ctrl->head = 0;
	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
	/* Reset the B-tree table. */
	mr_ctrl->cache_bh.len = 1;
	mr_ctrl->cache_bh.overflow = 0;
	/* Update the generation number. */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}

/**
 * Creates a memory region for external memory, that is memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
		   mlx5_reg_mr_t reg_mr_cb)
{
	struct mlx5_mr *mr = NULL;

	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
			 RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(WARNING,
			"Fail to create MR for address (%p)",
			(void *)addr);
		mlx5_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark it as external memory. */
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}

/**
 * Callback for memory free event. Iterate freed memsegs and check whether they
 * belong to an existing MR. If found, clear the bit from the bitmap of the MR.
 * As a result, the MR would be fragmented. If it becomes empty, the MR will be
 * freed later by mlx5_mr_garbage_collect(). Even if this callback is called
 * from a secondary process, the garbage collector will be called in the
 * primary process as the secondary process can't call mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has to
 * be propagated to dataplane threads to flush the local caches.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param ibdev_name
 *   Name of ibv device.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
void
mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
		     const char *ibdev_name, const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg. */
		start = (uintptr_t)addr + i * msl->page_sz;
		mr = mlx5_mr_lookup_list(share_cache, &entry, start);
		if (mr == NULL)
			continue;
		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
		ms = rte_mem_virt2memseg((void *)start, msl);
		MLX5_ASSERT(ms != NULL);
		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		pos = ms_idx - mr->ms_base_idx;
		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
		MLX5_ASSERT(pos < mr->ms_bmp_n);
		DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
			ibdev_name, (void *)mr, pos, (void *)start);
		rte_bitmap_clear(mr->ms_bmp, pos);
		if (--mr->ms_n == 0) {
			LIST_REMOVE(mr, mr);
			LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
			DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
				ibdev_name, (void *)mr);
		}
		/*
		 * MR is fragmented or will be freed. The global cache must be
		 * rebuilt.
		 */
		rebuild = 1;
	}
	if (rebuild) {
		mlx5_mr_rebuild_cache(share_cache);
		/*
		 * No explicit wmb is needed after updating dev_gen due to
		 * store-release ordering in unlock that provides the
		 * implicit barrier at the software visible level.
		 */
		++share_cache->dev_gen;
		DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
			share_cache->dev_gen);
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
}

/**
 * Dump all the created MRs and the global cache entries.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	struct mlx5_mr *mr;
	int mr_n = 0;
	int chunk_n = 0;

	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
			mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey),
			mr->ms_n, mr->ms_bmp_n);
		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret = { 0, };

			n = mr_find_next_chunk(mr, &ret, n);
			if (!ret.end)
				break;
			DRV_LOG(DEBUG,
				"  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
				chunk_n++, ret.start, ret.end);
		}
	}
	DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache);
	mlx5_mr_btree_dump(&share_cache->cache);
	rte_rwlock_read_unlock(&share_cache->rwlock);
#endif
}

static int
mlx5_range_compare_start(const void *lhs, const void *rhs)
{
	const struct mlx5_range *r1 = lhs, *r2 = rhs;

	if (r1->start > r2->start)
		return 1;
	else if (r1->start < r2->start)
		return -1;
	return 0;
}

static void
mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque,
			      struct rte_mempool_memhdr *memhdr,
			      unsigned int idx)
{
	struct mlx5_range *ranges = opaque, *range = &ranges[idx];
	uint64_t page_size = rte_mem_page_size();

	RTE_SET_USED(mp);
	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
	range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size);
}

/**
 * Get VA-contiguous ranges of the mempool memory.
 * Each range start and end is aligned to the system page size.
 *
 * @param[in] mp
 *   Analyzed mempool.
 * @param[out] out
 *   Receives the ranges, caller must release it with free().
 * @param[out] out_n
 *   Receives the number of @p out elements.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_get_mempool_ranges(struct rte_mempool *mp, struct mlx5_range **out,
			unsigned int *out_n)
{
	struct mlx5_range *chunks;
	unsigned int chunks_n = mp->nb_mem_chunks, contig_n, i;

	/* Collect page-aligned memory ranges of the mempool. */
	chunks = calloc(sizeof(chunks[0]), chunks_n);
	if (chunks == NULL)
		return -1;
	rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, chunks);
	/* Merge adjacent chunks and place them at the beginning. */
	qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start);
	contig_n = 1;
	for (i = 1; i < chunks_n; i++)
		if (chunks[i - 1].end != chunks[i].start) {
			chunks[contig_n - 1].end = chunks[i - 1].end;
			chunks[contig_n] = chunks[i];
			contig_n++;
		}
	/* Extend the last contiguous chunk to the end of the mempool. */
	chunks[contig_n - 1].end = chunks[i - 1].end;
	*out = chunks;
	*out_n = contig_n;
	return 0;
}
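/*
 * A worked example of the merge above (hypothetical page-aligned ranges):
 * chunks [0x1000, 0x3000), [0x3000, 0x5000) and [0x9000, 0xa000) are sorted
 * by start address, the first two are coalesced because the end of one equals
 * the start of the next, and the result is two contiguous ranges
 * [0x1000, 0x5000) and [0x9000, 0xa000), so *out_n == 2.
 */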
/**
 * Analyze mempool memory to select memory ranges to register.
 *
 * @param[in] mp
 *   Mempool to analyze.
 * @param[out] out
 *   Receives memory ranges to register, aligned to the system page size.
 *   The caller must release them with free().
 * @param[out] out_n
 *   Receives the number of @p out items.
 * @param[out] share_hugepage
 *   Receives True if the entire pool resides within a single hugepage.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_mempool_reg_analyze(struct rte_mempool *mp, struct mlx5_range **out,
			 unsigned int *out_n, bool *share_hugepage)
{
	struct mlx5_range *ranges = NULL;
	unsigned int i, ranges_n = 0;
	struct rte_memseg_list *msl;

	if (mlx5_get_mempool_ranges(mp, &ranges, &ranges_n) < 0) {
		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
			mp->name);
		return -1;
	}
	/* Check if the hugepage of the pool can be shared. */
	*share_hugepage = false;
	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
	if (msl != NULL) {
		uint64_t hugepage_sz = 0;

		/* Check that all ranges are on pages of the same size. */
		for (i = 0; i < ranges_n; i++) {
			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
				break;
			hugepage_sz = msl->page_sz;
		}
		if (i == ranges_n) {
			/*
			 * If the entire pool is within one hugepage,
			 * combine all ranges into one of the hugepage size.
			 */
			uintptr_t reg_start = ranges[0].start;
			uintptr_t reg_end = ranges[ranges_n - 1].end;
			uintptr_t hugepage_start =
				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
			if (reg_end < hugepage_end) {
				ranges[0].start = hugepage_start;
				ranges[0].end = hugepage_end;
				ranges_n = 1;
				*share_hugepage = true;
			}
		}
	}
	*out = ranges;
	*out_n = ranges_n;
	return 0;
}

/** Create a registration object for the mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n)
{
	struct mlx5_mempool_reg *mpr = NULL;

	mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			  sizeof(*mpr) + mrs_n * sizeof(mpr->mrs[0]),
			  RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (mpr == NULL) {
		DRV_LOG(ERR, "Cannot allocate mempool %s registration object",
			mp->name);
		return NULL;
	}
	mpr->mp = mp;
	mpr->mrs = (struct mlx5_mempool_mr *)(mpr + 1);
	mpr->mrs_n = mrs_n;
	return mpr;
}

/**
 * Destroy a mempool registration object.
 *
 * @param standalone
 *   Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
 */
static void
mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mempool_reg *mpr, bool standalone)
{
	if (standalone) {
		unsigned int i;

		for (i = 0; i < mpr->mrs_n; i++)
			share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
	}
	mlx5_free(mpr);
}

/** Find registration object of a mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache,
			struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;

	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp)
			break;
	return mpr;
}

/** Increment reference counters of MRs used in the registration. */
static void
mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++)
		__atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED);
}
/**
 * Decrement reference counters of MRs used in the registration.
 *
 * @return True if no more references to @p mpr MRs exist, False otherwise.
 */
static bool
mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;
	bool ret = false;

	for (i = 0; i < mpr->mrs_n; i++)
		ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1,
					  __ATOMIC_RELAXED) == 0;
	return ret;
}

static int
mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
				 void *pd, struct rte_mempool *mp)
{
	struct mlx5_range *ranges = NULL;
	struct mlx5_mempool_reg *mpr, *new_mpr;
	unsigned int i, ranges_n;
	bool share_hugepage;
	int ret = -1;

	/* Early check to avoid unnecessary creation of MRs. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		rte_errno = EEXIST;
		goto exit;
	}
	if (mlx5_mempool_reg_analyze(mp, &ranges, &ranges_n,
				     &share_hugepage) < 0) {
		DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name);
		rte_errno = ENOMEM;
		goto exit;
	}
	new_mpr = mlx5_mempool_reg_create(mp, ranges_n);
	if (new_mpr == NULL) {
		DRV_LOG(ERR,
			"Cannot create a registration object for mempool %s in PD %p",
			mp->name, pd);
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * If the entire mempool fits in a single hugepage, the MR for this
	 * hugepage can be shared across mempools that also fit in it.
	 */
	if (share_hugepage) {
		rte_rwlock_write_lock(&share_cache->rwlock);
		LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) {
			if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start)
				break;
		}
		if (mpr != NULL) {
			new_mpr->mrs = mpr->mrs;
			mlx5_mempool_reg_attach(new_mpr);
			LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
					 new_mpr, next);
		}
		rte_rwlock_write_unlock(&share_cache->rwlock);
		if (mpr != NULL) {
			DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s",
				mpr->mrs[0].pmd_mr.lkey, pd, mp->name,
				mpr->mp->name);
			ret = 0;
			goto exit;
		}
	}
	for (i = 0; i < ranges_n; i++) {
		struct mlx5_mempool_mr *mr = &new_mpr->mrs[i];
		const struct mlx5_range *range = &ranges[i];
		size_t len = range->end - range->start;

		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
					   &mr->pmd_mr) < 0) {
			DRV_LOG(ERR,
				"Failed to create an MR in PD %p for address range "
				"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
				pd, range->start, range->end, len, mp->name);
			break;
		}
		DRV_LOG(DEBUG,
			"Created a new MR %#x in PD %p for address range "
			"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
			mr->pmd_mr.lkey, pd, range->start, range->end, len,
			mp->name);
	}
	if (i != ranges_n) {
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EINVAL;
		goto exit;
	}
	/* Concurrent registration is not supposed to happen. */
	rte_rwlock_write_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr == NULL) {
		mlx5_mempool_reg_attach(new_mpr);
		LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
				 new_mpr, next);
		ret = 0;
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EEXIST;
		goto exit;
	}
exit:
	free(ranges);
	return ret;
}

static int
mlx5_mr_mempool_register_secondary(struct mlx5_mr_share_cache *share_cache,
				   void *pd, struct rte_mempool *mp,
				   struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, pd, mp, true);
}

/**
 * Register the memory of a mempool in the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param pd
 *   Protection domain object.
 * @param mp
 *   Mempool to register.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_register(struct mlx5_mr_share_cache *share_cache, void *pd,
			 struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_register_primary(share_cache, pd, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_register_secondary(share_cache, pd, mp,
							  mp_id);
	default:
		return -1;
	}
}

static int
mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
				   struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;
	bool standalone = false;

	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp) {
			LIST_REMOVE(mpr, next);
			standalone = mlx5_mempool_reg_detach(mpr);
			if (standalone)
				/*
				 * The unlock operation below provides a memory
				 * barrier due to its store-release semantics.
				 */
				++share_cache->dev_gen;
			break;
		}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		rte_errno = ENOENT;
		return -1;
	}
	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
	return 0;
}

static int
mlx5_mr_mempool_unregister_secondary(struct mlx5_mr_share_cache *share_cache,
				     struct rte_mempool *mp,
				     struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, NULL, mp, false);
}
/**
 * Unregister the memory of a mempool from the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param mp
 *   Mempool to unregister.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_unregister(struct mlx5_mr_share_cache *share_cache,
			   struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_unregister_primary(share_cache, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_unregister_secondary(share_cache, mp,
							    mp_id);
	default:
		return -1;
	}
}

/**
 * Look up an MR key by an address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_addr = (uintptr_t)mr->addr;

		if (mr_addr <= addr) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_addr;
			entry->end = mr_addr + mr->len;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update bottom-half cache from the list of mempool registrations.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update local cache. Even if it fails, return the found entry
	 * to update top-half cache. Next time, this entry will be found
	 * in the global cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}
/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_share_cache *share_cache,
		      struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint16_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(share_cache, mr_ctrl, repl,
						mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}
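/*
 * A minimal usage sketch of the mempool registration path (illustrative only;
 * 'priv', 'rxq' and 'mp_id' are placeholders and error handling is elided):
 *
 *	// Control path: register the mempool once per protection domain.
 *	mlx5_mr_mempool_register(&priv->share_cache, priv->pd, mp,
 *				 &priv->mp_id);
 *
 *	// Datapath slow path: resolve an LKey for a buffer of that mempool.
 *	uint32_t lkey = mlx5_mr_mempool2mr_bh(&priv->share_cache,
 *					      &rxq->mr_ctrl, mp, addr);
 *
 *	// Teardown: drop the registration; MRs shared through a common
 *	// hugepage are freed only when the last referencing mempool is
 *	// unregistered.
 *	mlx5_mr_mempool_unregister(&priv->share_cache, mp, &priv->mp_id);
 */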