/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2016 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */
#include <stddef.h>

#include <rte_eal_memconfig.h>
#include <rte_eal_paging.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_malloc.h>
#include <rte_rwlock.h>

#include "mlx5_glue.h"
#include "mlx5_common_mp.h"
#include "mlx5_common_mr.h"
#include "mlx5_common_log.h"
#include "mlx5_malloc.h"

struct mr_find_contig_memsegs_data {
	uintptr_t addr;
	uintptr_t start;
	uintptr_t end;
	const struct rte_memseg_list *msl;
};

/* Virtual memory range. */
struct mlx5_range {
	uintptr_t start;
	uintptr_t end;
};

/** Memory region for a mempool. */
struct mlx5_mempool_mr {
	struct mlx5_pmd_mr pmd_mr;
	uint32_t refcnt; /**< Number of mempools sharing this MR. */
};

/* Mempool registration. */
struct mlx5_mempool_reg {
	LIST_ENTRY(mlx5_mempool_reg) next;
	/** Registered mempool, used to designate registrations. */
	struct rte_mempool *mp;
	/** Memory regions for the address ranges of the mempool. */
	struct mlx5_mempool_mr *mrs;
	/** Number of memory regions. */
	unsigned int mrs_n;
};

/**
 * Expand B-tree table to a given size. Can't be called while holding
 * memory_hotplug_lock or share_cache.rwlock due to rte_realloc().
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries for expansion.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_expand(struct mlx5_mr_btree *bt, int n)
{
	void *mem;
	int ret = 0;

	if (n <= bt->size)
		return ret;
	/*
	 * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is
	 * used inside if there's no room to expand. Because this is a rare
	 * case and part of a very slow path, it is acceptable.
	 * Initially cache_bh[] will be given practically enough space and once
	 * it is expanded, expansion wouldn't be needed again ever.
	 */
	mem = mlx5_realloc(bt->table, MLX5_MEM_RTE | MLX5_MEM_ZERO,
			   n * sizeof(struct mr_cache_entry), 0, SOCKET_ID_ANY);
	if (mem == NULL) {
		/* Not an error, B-tree search will be skipped. */
		DRV_LOG(WARNING, "failed to expand MR B-tree (%p) table",
			(void *)bt);
		ret = -1;
	} else {
		DRV_LOG(DEBUG, "expanded MR B-tree table (size=%u)", n);
		bt->table = mem;
		bt->size = n;
	}
	return ret;
}

/**
 * Look up LKey from the given B-tree lookup table, store the last index and
 * return the searched LKey.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param[out] idx
 *   Pointer to index. Even on search failure, returns the index where it
 *   stopped searching so that the index can be used when inserting a new
 *   entry.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_btree_lookup(struct mlx5_mr_btree *bt, uint16_t *idx, uintptr_t addr)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t n;
	uint16_t base = 0;

	MLX5_ASSERT(bt != NULL);
	lkp_tbl = *bt->table;
	n = bt->len;
	/* First entry must be NULL for comparison. */
	MLX5_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
				    lkp_tbl[0].lkey == UINT32_MAX));
	/* Binary search. */
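	/*
	 * Entries are kept sorted by start address, with a sentinel
	 * (start = 0, lkey = UINT32_MAX) at index 0. The loop below halves
	 * [base, base + n) until it converges on the right-most entry whose
	 * start is not greater than addr; whether addr actually falls inside
	 * that entry is checked against its end afterwards.
	 */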
	do {
		register uint16_t delta = n >> 1;

		if (addr < lkp_tbl[base + delta].start) {
			n = delta;
		} else {
			base += delta;
			n -= delta;
		}
	} while (n > 1);
	MLX5_ASSERT(addr >= lkp_tbl[base].start);
	*idx = base;
	if (addr < lkp_tbl[base].end)
		return lkp_tbl[base].lkey;
	/* Not found. */
	return UINT32_MAX;
}

/**
 * Insert an entry into the B-tree lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param entry
 *   Pointer to new entry to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
mr_btree_insert(struct mlx5_mr_btree *bt, struct mr_cache_entry *entry)
{
	struct mr_cache_entry *lkp_tbl;
	uint16_t idx = 0;
	size_t shift;

	MLX5_ASSERT(bt != NULL);
	MLX5_ASSERT(bt->len <= bt->size);
	MLX5_ASSERT(bt->len > 0);
	lkp_tbl = *bt->table;
	/* Find out the slot for insertion. */
	if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
		DRV_LOG(DEBUG,
			"abort insertion to B-tree(%p): already exists at"
			" idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
		/* Already exists, return. */
		return 0;
	}
	/* If table is full, return error. */
	if (unlikely(bt->len == bt->size)) {
		bt->overflow = 1;
		return -1;
	}
	/* Insert entry. */
	++idx;
	shift = (bt->len - idx) * sizeof(struct mr_cache_entry);
	if (shift)
		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
	lkp_tbl[idx] = *entry;
	bt->len++;
	DRV_LOG(DEBUG,
		"inserted B-tree(%p)[%u],"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
		(void *)bt, idx, entry->start, entry->end, entry->lkey);
	return 0;
}

/**
 * Initialize B-tree and allocate memory for lookup table.
 *
 * @param bt
 *   Pointer to B-tree structure.
 * @param n
 *   Number of entries to allocate.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_mr_btree_init(struct mlx5_mr_btree *bt, int n, int socket)
{
	if (bt == NULL) {
		rte_errno = EINVAL;
		return -rte_errno;
	}
	MLX5_ASSERT(!bt->table && !bt->size);
	memset(bt, 0, sizeof(*bt));
	bt->table = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
				sizeof(struct mr_cache_entry) * n,
				0, socket);
	if (bt->table == NULL) {
		rte_errno = ENOMEM;
		DRV_LOG(DEBUG,
			"failed to allocate memory for btree cache on socket "
			"%d", socket);
		return -rte_errno;
	}
	bt->size = n;
	/* First entry must be NULL for binary search. */
	(*bt->table)[bt->len++] = (struct mr_cache_entry) {
		.lkey = UINT32_MAX,
	};
	DRV_LOG(DEBUG, "initialized B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	return 0;
}

/**
 * Free B-tree resources.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_free(struct mlx5_mr_btree *bt)
{
	if (bt == NULL)
		return;
	DRV_LOG(DEBUG, "freeing B-tree %p with table %p",
		(void *)bt, (void *)bt->table);
	mlx5_free(bt->table);
	memset(bt, 0, sizeof(*bt));
}

/**
 * Dump all the entries in a B-tree.
 *
 * @param bt
 *   Pointer to B-tree structure.
 */
void
mlx5_mr_btree_dump(struct mlx5_mr_btree *bt __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	int idx;
	struct mr_cache_entry *lkp_tbl;

	if (bt == NULL)
		return;
	lkp_tbl = *bt->table;
	for (idx = 0; idx < bt->len; ++idx) {
		struct mr_cache_entry *entry = &lkp_tbl[idx];

		DRV_LOG(DEBUG, "B-tree(%p)[%u],"
			" [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
			(void *)bt, idx, entry->start, entry->end, entry->lkey);
	}
#endif
}

/**
 * Find a virtually contiguous memory chunk in a given MR.
 *
 * @param mr
 *   Pointer to MR structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If not found, this will not be
 *   updated.
 * @param base_idx
 *   Start index of the memseg bitmap.
 *
 * @return
 *   Next index to go on lookup.
 */
static int
mr_find_next_chunk(struct mlx5_mr *mr, struct mr_cache_entry *entry,
		   int base_idx)
{
	uintptr_t start = 0;
	uintptr_t end = 0;
	uint32_t idx = 0;

	/* MR for external memory doesn't have memseg list. */
	if (mr->msl == NULL) {
		MLX5_ASSERT(mr->ms_bmp_n == 1);
		MLX5_ASSERT(mr->ms_n == 1);
		MLX5_ASSERT(base_idx == 0);
		/*
		 * Can't search it from memseg list but get it directly from
		 * pmd_mr as there's only one chunk.
		 */
		entry->start = (uintptr_t)mr->pmd_mr.addr;
		entry->end = (uintptr_t)mr->pmd_mr.addr + mr->pmd_mr.len;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
		/* Returning 1 ends iteration. */
		return 1;
	}
	for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
		if (rte_bitmap_get(mr->ms_bmp, idx)) {
			const struct rte_memseg_list *msl;
			const struct rte_memseg *ms;

			msl = mr->msl;
			ms = rte_fbarray_get(&msl->memseg_arr,
					     mr->ms_base_idx + idx);
			MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
			if (!start)
				start = ms->addr_64;
			end = ms->addr_64 + ms->hugepage_sz;
		} else if (start) {
			/* Passed the end of a fragment. */
			break;
		}
	}
	if (start) {
		/* Found one chunk. */
		entry->start = start;
		entry->end = end;
		entry->lkey = rte_cpu_to_be_32(mr->pmd_mr.lkey);
	}
	return idx;
}

/**
 * Insert an MR into the global B-tree cache. It may fail due to low memory.
 * Then, this entry will have to be searched by mlx5_mr_lookup_list() in
 * mlx5_mr_create() on miss.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr
 *   Pointer to MR to insert.
 *
 * @return
 *   0 on success, -1 on failure.
 */
int
mlx5_mr_insert_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mlx5_mr *mr)
{
	unsigned int n;

	DRV_LOG(DEBUG, "Inserting MR(%p) to global cache(%p)",
		(void *)mr, (void *)share_cache);
	for (n = 0; n < mr->ms_bmp_n; ) {
		struct mr_cache_entry entry;

		memset(&entry, 0, sizeof(entry));
		/* Find a contiguous chunk and advance the index. */
		n = mr_find_next_chunk(mr, &entry, n);
		if (!entry.end)
			break;
		if (mr_btree_insert(&share_cache->cache, &entry) < 0) {
			/*
			 * Overflowed, but the global table cannot be expanded
			 * because of deadlock.
			 */
			return -1;
		}
	}
	return 0;
}

/**
 * Look up address in the original global MR list.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Found MR on match, NULL otherwise.
 */
struct mlx5_mr *
mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
		    struct mr_cache_entry *entry, uintptr_t addr)
{
	struct mlx5_mr *mr;

	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret;

			memset(&ret, 0, sizeof(ret));
			n = mr_find_next_chunk(mr, &ret, n);
			if (addr >= ret.start && addr < ret.end) {
				/* Found. */
				*entry = ret;
				return mr;
			}
		}
	}
	return NULL;
}

/**
 * Look up address in the global MR cache.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t
mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
		     struct mr_cache_entry *entry, uintptr_t addr)
{
	uint16_t idx;
	uint32_t lkey = UINT32_MAX;
	struct mlx5_mr *mr;

	/*
	 * If the global cache has overflowed since it failed to expand the
	 * B-tree table, it can't have all the existing MRs. Then, the address
	 * has to be searched by traversing the original MR list instead, which
	 * is a very slow path. Otherwise, the global cache is all inclusive.
	 */
	if (!unlikely(share_cache->cache.overflow)) {
		lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
		if (lkey != UINT32_MAX)
			*entry = (*share_cache->cache.table)[idx];
	} else {
		/* Falling back to the slowest path. */
		mr = mlx5_mr_lookup_list(share_cache, entry, addr);
		if (mr != NULL)
			lkey = entry->lkey;
	}
	MLX5_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
					   addr < entry->end));
	return lkey;
}

/**
 * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free()
 * can raise a memory free event and the callback function will spin on the
 * lock.
 *
 * @param mr
 *   Pointer to MR to free.
 */
void
mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
{
	if (mr == NULL)
		return;
	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
	dereg_mr_cb(&mr->pmd_mr);
	if (mr->ms_bmp != NULL)
		rte_bitmap_free(mr->ms_bmp);
	mlx5_free(mr);
}

void
mlx5_mr_rebuild_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr;

	DRV_LOG(DEBUG, "Rebuild dev cache[] %p", (void *)share_cache);
	/* Flush cache to rebuild. */
	share_cache->cache.len = 1;
	share_cache->cache.overflow = 0;
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr)
		if (mlx5_mr_insert_cache(share_cache, mr) < 0)
			return;
}

/**
 * Release resources of detached MRs having no online entry.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
static void
mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;
	struct mlx5_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);

	/* Must be called from the primary process. */
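	/*
	 * Only the primary process creates MRs (see mlx5_mr_create_primary()),
	 * so it is also the only process expected to drain the free list of
	 * detached MRs.
	 */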
	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
	/*
	 * MR can't be freed while holding the lock because rte_free() could
	 * call the memory free callback function. This would be a deadlock.
	 */
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach the whole free list and release it after unlocking. */
	free_list = share_cache->mr_free_list;
	LIST_INIT(&share_cache->mr_free_list);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Release resources. */
	mr_next = LIST_FIRST(&free_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	}
}

/* Called during rte_memseg_contig_walk() by mlx5_mr_create(). */
static int
mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
			  const struct rte_memseg *ms, size_t len, void *arg)
{
	struct mr_find_contig_memsegs_data *data = arg;

	if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
		return 0;
	/* Found, save it and stop walking. */
	data->start = ms->addr_64;
	data->end = ms->addr_64 + len;
	data->msl = msl;
	return 1;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This API should be called on a secondary process, then a request is sent to
 * the primary process in order to create an MR for the address. As the global
 * MR list is on the shared memory, the following LKey lookup should succeed
 * unless the request fails.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create_secondary(void *pd __rte_unused,
			 struct mlx5_mp_id *mp_id,
			 struct mlx5_mr_share_cache *share_cache,
			 struct mr_cache_entry *entry, uintptr_t addr,
			 unsigned int mr_ext_memseg_en __rte_unused)
{
	int ret;

	DRV_LOG(DEBUG, "port %u requesting MR creation for address (%p)",
		mp_id->port_id, (void *)addr);
	ret = mlx5_mp_req_mr_create(mp_id, addr);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to request MR creation for address (%p)",
			(void *)addr);
		return UINT32_MAX;
	}
	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	DRV_LOG(DEBUG, "MR CREATED by primary process for %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
		(void *)addr, entry->start, entry->end, entry->lkey);
	return entry->lkey;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * Register the entire virtually contiguous memory chunk around the address.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 * @param mr_ext_memseg_en
 *   Configurable flag about external memory segment enable or not.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
uint32_t
mlx5_mr_create_primary(void *pd,
		       struct mlx5_mr_share_cache *share_cache,
		       struct mr_cache_entry *entry, uintptr_t addr,
		       unsigned int mr_ext_memseg_en)
{
	struct mr_find_contig_memsegs_data data = {.addr = addr, };
	struct mr_find_contig_memsegs_data data_re;
	const struct rte_memseg_list *msl;
	const struct rte_memseg *ms;
	struct mlx5_mr *mr = NULL;
	int ms_idx_shift = -1;
	uint32_t bmp_size;
	void *bmp_mem;
	uint32_t ms_n;
	uint32_t n;
	size_t len;

	DRV_LOG(DEBUG, "Creating an MR using address (%p)", (void *)addr);
	/*
	 * Release detached MRs if any. This can't be called while holding
	 * either memory_hotplug_lock or share_cache->rwlock. MRs on the free
	 * list have been detached by the memory free event but couldn't be
	 * released inside the callback due to deadlock. As a result, releasing
	 * resources is quite opportunistic.
	 */
	mlx5_mr_garbage_collect(share_cache);
	/*
	 * If enabled, find out a contiguous virtual address chunk in use, to
	 * which the given address belongs, in order to register maximum range.
	 * In the best case where mempools are not dynamically recreated and
	 * '--socket-mem' is specified as an EAL option, it is very likely to
	 * have only one MR(LKey) per socket and per hugepage size even
	 * though the system memory is highly fragmented. As the whole memory
	 * chunk will be pinned by the kernel, it can't be reused unless the
	 * entire chunk is freed from EAL.
	 *
	 * If disabled, just register one memseg (page). Then, memory
	 * consumption will be minimized but it may drop performance if there
	 * are many MRs to look up on the datapath.
	 */
	if (!mr_ext_memseg_en) {
		data.msl = rte_mem_virt2memseg_list((void *)addr);
		data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
		data.end = data.start + data.msl->page_sz;
	} else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
		DRV_LOG(WARNING,
			"Unable to find virtually contiguous"
			" chunk for address (%p)."
			" rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_nolock;
	}
alloc_resources:
	/* Addresses must be page-aligned. */
	MLX5_ASSERT(data.msl);
	MLX5_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
	MLX5_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
	msl = data.msl;
	ms = rte_mem_virt2memseg((void *)data.start, msl);
	len = data.end - data.start;
	MLX5_ASSERT(ms);
	MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
	/* Number of memsegs in the range. */
	ms_n = len / msl->page_sz;
	DRV_LOG(DEBUG, "Extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" page_sz=0x%" PRIx64 ", ms_n=%u",
		(void *)addr, data.start, data.end, msl->page_sz, ms_n);
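	/*
	 * The MR control structure and its memseg bitmap are carved out of a
	 * single allocation: the bitmap memory starts at the first cache-line
	 * aligned address after the structure (see bmp_mem below).
	 */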
	/* Size of memory for bitmap. */
	bmp_size = rte_bitmap_get_memory_footprint(ms_n);
	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE) +
			 bmp_size, RTE_CACHE_LINE_SIZE, msl->socket_id);
	if (mr == NULL) {
		DRV_LOG(DEBUG, "Unable to allocate memory for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = ENOMEM;
		goto err_nolock;
	}
	mr->msl = msl;
	/*
	 * Save the index of the first memseg and initialize memseg bitmap. To
	 * see if a memseg of ms_idx in the memseg-list is still valid, check:
	 *	rte_bitmap_get(mr->ms_bmp, ms_idx - mr->ms_base_idx)
	 */
	mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
	bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
	mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
	if (mr->ms_bmp == NULL) {
		DRV_LOG(DEBUG, "Unable to initialize bitmap for a new MR of"
			" address (%p).", (void *)addr);
		rte_errno = EINVAL;
		goto err_nolock;
	}
	/*
	 * Should recheck whether the extended contiguous chunk is still valid.
	 * Because memory_hotplug_lock can't be held if there's any memory
	 * related calls in a critical path, resource allocation above can't be
	 * locked. If the memory has been changed at this point, try again with
	 * just a single page. If not, go on with the big chunk atomically from
	 * here.
	 */
	rte_mcfg_mem_read_lock();
	data_re = data;
	if (len > msl->page_sz &&
	    !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
		DRV_LOG(DEBUG,
			"Unable to find virtually contiguous chunk for address "
			"(%p). rte_memseg_contig_walk() failed.", (void *)addr);
		rte_errno = ENXIO;
		goto err_memlock;
	}
	if (data.start != data_re.start || data.end != data_re.end) {
		/*
		 * The extended contiguous chunk has been changed. Try again
		 * with a single memseg instead.
		 */
		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
		data.end = data.start + msl->page_sz;
		rte_mcfg_mem_read_unlock();
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		goto alloc_resources;
	}
	MLX5_ASSERT(data.msl == data_re.msl);
	rte_rwlock_write_lock(&share_cache->rwlock);
	/*
	 * Check the address is really missing. If another thread already
	 * created one or it is not found due to overflow, abort and return.
	 */
	if (mlx5_mr_lookup_cache(share_cache, entry, addr) != UINT32_MAX) {
		/*
		 * Insert to the global cache table. It may fail due to
		 * low-on-memory. Then, this entry will have to be searched
		 * here again.
		 */
		mr_btree_insert(&share_cache->cache, entry);
		DRV_LOG(DEBUG, "Found MR for %p on final lookup, abort",
			(void *)addr);
		rte_rwlock_write_unlock(&share_cache->rwlock);
		rte_mcfg_mem_read_unlock();
		/*
		 * Must be unlocked before calling rte_free() because
		 * mlx5_mr_mem_event_free_cb() can be called inside.
		 */
		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
		return entry->lkey;
	}
	/*
	 * Trim start and end addresses for verbs MR. Set bits for registering
	 * memsegs but exclude already registered ones. Bitmap can be
	 * fragmented.
	 */
	for (n = 0; n < ms_n; ++n) {
		uintptr_t start;
		struct mr_cache_entry ret;

		memset(&ret, 0, sizeof(ret));
		start = data_re.start + n * msl->page_sz;
		/* Exclude memsegs already registered by other MRs. */
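		/*
		 * A hit in the global cache means another MR already covers
		 * this memseg; leave its bit clear so it is skipped here.
		 */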
		if (mlx5_mr_lookup_cache(share_cache, &ret, start) ==
		    UINT32_MAX) {
			/*
			 * Start from the first unregistered memseg in the
			 * extended range.
			 */
			if (ms_idx_shift == -1) {
				mr->ms_base_idx += n;
				data.start = start;
				ms_idx_shift = n;
			}
			data.end = start + msl->page_sz;
			rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
			++mr->ms_n;
		}
	}
	len = data.end - data.start;
	mr->ms_bmp_n = len / msl->page_sz;
	MLX5_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
	/*
	 * Finally create an MR for the memory chunk. Verbs: ibv_reg_mr() can
	 * be called while holding the memory lock because it doesn't use
	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
	 * through mlx5_alloc_verbs_buf().
	 */
	share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(DEBUG, "Failed to create an MR for address (%p)",
			(void *)addr);
		rte_errno = EINVAL;
		goto err_mrlock;
	}
	MLX5_ASSERT((uintptr_t)mr->pmd_mr.addr == data.start);
	MLX5_ASSERT(mr->pmd_mr.len);
	LIST_INSERT_HEAD(&share_cache->mr_list, mr, mr);
	DRV_LOG(DEBUG, "MR CREATED (%p) for %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr, data.start, data.end,
		rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	/* Insert to the global cache table. */
	mlx5_mr_insert_cache(share_cache, mr);
	/* Fill in output data. */
	mlx5_mr_lookup_cache(share_cache, entry, addr);
	/* Lookup can't fail. */
	MLX5_ASSERT(entry->lkey != UINT32_MAX);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	rte_mcfg_mem_read_unlock();
	return entry->lkey;
err_mrlock:
	rte_rwlock_write_unlock(&share_cache->rwlock);
err_memlock:
	rte_mcfg_mem_read_unlock();
err_nolock:
	/*
	 * In case of error, as this can be called in a datapath, a warning
	 * message per error is preferable instead. Must be unlocked before
	 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
	 * inside.
	 */
	mlx5_mr_free(mr, share_cache->dereg_mr_cb);
	return UINT32_MAX;
}

/**
 * Create a new global Memory Region (MR) for a missing virtual address.
 * This can be called from primary and secondary process.
 *
 * @param pd
 *   Pointer to pd handle of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this will not be updated.
 * @param addr
 *   Target virtual address to register.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 */
static uint32_t
mlx5_mr_create(void *pd, struct mlx5_mp_id *mp_id,
	       struct mlx5_mr_share_cache *share_cache,
	       struct mr_cache_entry *entry, uintptr_t addr,
	       unsigned int mr_ext_memseg_en)
{
	uint32_t ret = 0;

	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		ret = mlx5_mr_create_primary(pd, share_cache, entry,
					     addr, mr_ext_memseg_en);
		break;
	case RTE_PROC_SECONDARY:
		ret = mlx5_mr_create_secondary(pd, mp_id, share_cache, entry,
					       addr, mr_ext_memseg_en);
		break;
	default:
		break;
	}
	return ret;
}

/**
 * Look up address in the global MR cache table. If not found, create a new MR.
 * Insert the found/created entry into the local bottom-half cache table.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param[out] entry
 *   Pointer to returning MR cache entry, found in the global cache or newly
 *   created. If failed to create one, this is not written.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
static uint32_t
mr_lookup_caches(void *pd, struct mlx5_mp_id *mp_id,
		 struct mlx5_mr_share_cache *share_cache,
		 struct mlx5_mr_ctrl *mr_ctrl,
		 struct mr_cache_entry *entry, uintptr_t addr,
		 unsigned int mr_ext_memseg_en)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	uint32_t lkey;
	uint16_t idx;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in the global cache. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	lkey = mr_btree_lookup(&share_cache->cache, &idx, addr);
	if (lkey != UINT32_MAX) {
		/* Found. */
		*entry = (*share_cache->cache.table)[idx];
		rte_rwlock_read_unlock(&share_cache->rwlock);
		/*
		 * Update local cache. Even if it fails, return the found entry
		 * to update top-half cache. Next time, this entry will be found
		 * in the global cache.
		 */
		mr_btree_insert(bt, entry);
		return lkey;
	}
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/* First time to see the address? Create a new MR. */
	lkey = mlx5_mr_create(pd, mp_id, share_cache, entry, addr,
			      mr_ext_memseg_en);
	/*
	 * Update the local cache if successfully created a new global MR. Even
	 * if failed to create one, there's no action to take in this datapath
	 * code. As the returned LKey is invalid, this will eventually make HW
	 * fail.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Bottom-half of LKey search on datapath. First search in cache_bh[] and if
 * it misses, search in the global MR cache table and add the new entry to the
 * per-queue local caches.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param mp_id
 *   Multi-process identifier.
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Pointer to per-queue MR control structure.
 * @param addr
 *   Search key.
 *
 * @return
 *   Searched LKey on success, UINT32_MAX on no match.
 */
uint32_t mlx5_mr_addr2mr_bh(void *pd, struct mlx5_mp_id *mp_id,
			    struct mlx5_mr_share_cache *share_cache,
			    struct mlx5_mr_ctrl *mr_ctrl,
			    uintptr_t addr, unsigned int mr_ext_memseg_en)
{
	uint32_t lkey;
	uint16_t bh_idx = 0;
	/* Victim in top-half cache to replace with new entry. */
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		/*
		 * If missed in local lookup table, search in the global cache
		 * and local cache_bh[] will be updated inside if possible.
		 * Top-half cache entry will also be updated.
		 */
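		/*
		 * repl points at the current victim slot of the top-half
		 * array; mr_lookup_caches() fills it in directly on success.
		 */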
		lkey = mr_lookup_caches(pd, mp_id, share_cache, mr_ctrl,
					repl, addr, mr_ext_memseg_en);
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}

/**
 * Release all the created MRs and resources on the global MR cache of a
 * device.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
{
	struct mlx5_mr *mr_next;

	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Detach from MR list and move to free list. */
	mr_next = LIST_FIRST(&share_cache->mr_list);
	while (mr_next != NULL) {
		struct mlx5_mr *mr = mr_next;

		mr_next = LIST_NEXT(mr, mr);
		LIST_REMOVE(mr, mr);
		LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
	}
	LIST_INIT(&share_cache->mr_list);
	/* Free global cache. */
	mlx5_mr_btree_free(&share_cache->cache);
	rte_rwlock_write_unlock(&share_cache->rwlock);
	/* Free all remaining MRs. */
	mlx5_mr_garbage_collect(share_cache);
}

/**
 * Flush all of the local cache entries.
 *
 * @param mr_ctrl
 *   Pointer to per-queue MR local cache.
 */
void
mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
{
	/* Reset the most-recently-used index. */
	mr_ctrl->mru = 0;
	/* Reset the linear search array. */
	mr_ctrl->head = 0;
	memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
	/* Reset the B-tree table. */
	mr_ctrl->cache_bh.len = 1;
	mr_ctrl->cache_bh.overflow = 0;
	/* Update the generation number. */
	mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
	DRV_LOG(DEBUG, "mr_ctrl(%p): flushed, cur_gen=%d",
		(void *)mr_ctrl, mr_ctrl->cur_gen);
}

/**
 * Create a memory region for external memory, that is, memory which is not
 * part of the DPDK memory segments.
 *
 * @param pd
 *   Pointer to pd of a device (net, regex, vdpa,...).
 * @param addr
 *   Starting virtual address of memory.
 * @param len
 *   Length of memory segment being mapped.
 * @param socket_id
 *   Socket to allocate heap memory for the control structures.
 *
 * @return
 *   Pointer to MR structure on success, NULL otherwise.
 */
struct mlx5_mr *
mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
		   mlx5_reg_mr_t reg_mr_cb)
{
	struct mlx5_mr *mr = NULL;

	mr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			 RTE_ALIGN_CEIL(sizeof(*mr), RTE_CACHE_LINE_SIZE),
			 RTE_CACHE_LINE_SIZE, socket_id);
	if (mr == NULL)
		return NULL;
	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
	if (mr->pmd_mr.obj == NULL) {
		DRV_LOG(WARNING,
			"Failed to create MR for address (%p)",
			(void *)addr);
		mlx5_free(mr);
		return NULL;
	}
	mr->msl = NULL; /* Mark it as external memory. */
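	/*
	 * Without a memseg list or bitmap, ms_n and ms_bmp_n are set to 1 so
	 * that mr_find_next_chunk() reports the whole pmd_mr as one chunk.
	 */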
	mr->ms_bmp = NULL;
	mr->ms_n = 1;
	mr->ms_bmp_n = 1;
	DRV_LOG(DEBUG,
		"MR CREATED (%p) for external memory %p:\n"
		" [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
		" lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
		(void *)mr, (void *)addr,
		addr, addr + len, rte_cpu_to_be_32(mr->pmd_mr.lkey),
		mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
	return mr;
}

/**
 * Callback for memory free event. Iterate freed memsegs and check whether each
 * belongs to an existing MR. If found, clear the bit from the bitmap of the
 * MR. As a result, the MR would be fragmented. If it becomes empty, the MR
 * will be freed later by mlx5_mr_garbage_collect(). Even if this callback is
 * called from a secondary process, the garbage collector will be called in the
 * primary process, as the secondary process can't call mlx5_mr_create().
 *
 * The global cache must be rebuilt if there's any change and this event has to
 * be propagated to dataplane threads to flush the local caches.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param ibdev_name
 *   Name of ibv device.
 * @param addr
 *   Address of freed memory.
 * @param len
 *   Size of freed memory.
 */
void
mlx5_free_mr_by_addr(struct mlx5_mr_share_cache *share_cache,
		     const char *ibdev_name, const void *addr, size_t len)
{
	const struct rte_memseg_list *msl;
	struct mlx5_mr *mr;
	int ms_n;
	int i;
	int rebuild = 0;

	DRV_LOG(DEBUG, "device %s free callback: addr=%p, len=%zu",
		ibdev_name, addr, len);
	msl = rte_mem_virt2memseg_list(addr);
	/* addr and len must be page-aligned. */
	MLX5_ASSERT((uintptr_t)addr ==
		    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
	MLX5_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
	ms_n = len / msl->page_sz;
	rte_rwlock_write_lock(&share_cache->rwlock);
	/* Clear bits of freed memsegs from MR. */
	for (i = 0; i < ms_n; ++i) {
		const struct rte_memseg *ms;
		struct mr_cache_entry entry;
		uintptr_t start;
		int ms_idx;
		uint32_t pos;

		/* Find MR having this memseg. */
		start = (uintptr_t)addr + i * msl->page_sz;
		mr = mlx5_mr_lookup_list(share_cache, &entry, start);
		if (mr == NULL)
			continue;
		MLX5_ASSERT(mr->msl); /* Can't be external memory. */
		ms = rte_mem_virt2memseg((void *)start, msl);
		MLX5_ASSERT(ms != NULL);
		MLX5_ASSERT(msl->page_sz == ms->hugepage_sz);
		ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
		pos = ms_idx - mr->ms_base_idx;
		MLX5_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
		MLX5_ASSERT(pos < mr->ms_bmp_n);
		DRV_LOG(DEBUG, "device %s MR(%p): clear bitmap[%u] for addr %p",
			ibdev_name, (void *)mr, pos, (void *)start);
		rte_bitmap_clear(mr->ms_bmp, pos);
		if (--mr->ms_n == 0) {
			LIST_REMOVE(mr, mr);
			LIST_INSERT_HEAD(&share_cache->mr_free_list, mr, mr);
			DRV_LOG(DEBUG, "device %s remove MR(%p) from list",
				ibdev_name, (void *)mr);
		}
		/*
		 * MR is fragmented or will be freed. The global cache must be
		 * rebuilt.
		 */
		rebuild = 1;
	}
	if (rebuild) {
		mlx5_mr_rebuild_cache(share_cache);
		/*
		 * No explicit wmb is needed after updating dev_gen due to
		 * store-release ordering in unlock that provides the
		 * implicit barrier at the software visible level.
		 */
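		/*
		 * Bumping dev_gen marks every per-queue local cache as stale;
		 * mlx5_mr_flush_local_cache() later resynchronizes cur_gen
		 * with it.
		 */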
		++share_cache->dev_gen;
		DRV_LOG(DEBUG, "broadcasting local cache flush, gen=%d",
			share_cache->dev_gen);
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
}

/**
 * Dump all the created MRs and the global cache entries.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 */
void
mlx5_mr_dump_cache(struct mlx5_mr_share_cache *share_cache __rte_unused)
{
#ifdef RTE_LIBRTE_MLX5_DEBUG
	struct mlx5_mr *mr;
	int mr_n = 0;
	int chunk_n = 0;

	rte_rwlock_read_lock(&share_cache->rwlock);
	/* Iterate all the existing MRs. */
	LIST_FOREACH(mr, &share_cache->mr_list, mr) {
		unsigned int n;

		DRV_LOG(DEBUG, "MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
			mr_n++, rte_cpu_to_be_32(mr->pmd_mr.lkey),
			mr->ms_n, mr->ms_bmp_n);
		if (mr->ms_n == 0)
			continue;
		for (n = 0; n < mr->ms_bmp_n; ) {
			struct mr_cache_entry ret = { 0, };

			n = mr_find_next_chunk(mr, &ret, n);
			if (!ret.end)
				break;
			DRV_LOG(DEBUG,
				" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
				chunk_n++, ret.start, ret.end);
		}
	}
	DRV_LOG(DEBUG, "Dumping global cache %p", (void *)share_cache);
	mlx5_mr_btree_dump(&share_cache->cache);
	rte_rwlock_read_unlock(&share_cache->rwlock);
#endif
}

static int
mlx5_range_compare_start(const void *lhs, const void *rhs)
{
	const struct mlx5_range *r1 = lhs, *r2 = rhs;

	if (r1->start > r2->start)
		return 1;
	else if (r1->start < r2->start)
		return -1;
	return 0;
}

static void
mlx5_range_from_mempool_chunk(struct rte_mempool *mp, void *opaque,
			      struct rte_mempool_memhdr *memhdr,
			      unsigned int idx)
{
	struct mlx5_range *ranges = opaque, *range = &ranges[idx];
	uint64_t page_size = rte_mem_page_size();

	RTE_SET_USED(mp);
	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
	range->end = RTE_ALIGN_CEIL(range->start + memhdr->len, page_size);
}

/**
 * Get VA-contiguous ranges of the mempool memory.
 * Each range start and end is aligned to the system page size.
 *
 * @param[in] mp
 *   Analyzed mempool.
 * @param[out] out
 *   Receives the ranges, caller must release it with free().
 * @param[out] out_n
 *   Receives the number of @p out elements.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_get_mempool_ranges(struct rte_mempool *mp, struct mlx5_range **out,
			unsigned int *out_n)
{
	struct mlx5_range *chunks;
	unsigned int chunks_n = mp->nb_mem_chunks, contig_n, i;

	/* Collect page-aligned memory ranges of the mempool. */
	chunks = calloc(sizeof(chunks[0]), chunks_n);
	if (chunks == NULL)
		return -1;
	rte_mempool_mem_iter(mp, mlx5_range_from_mempool_chunk, chunks);
	/* Merge adjacent chunks and place them at the beginning. */
	qsort(chunks, chunks_n, sizeof(chunks[0]), mlx5_range_compare_start);
	contig_n = 1;
	for (i = 1; i < chunks_n; i++)
		if (chunks[i - 1].end != chunks[i].start) {
			chunks[contig_n - 1].end = chunks[i - 1].end;
			chunks[contig_n] = chunks[i];
			contig_n++;
		}
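	/*
	 * Example: sorted chunks [A, B), [B, C), [D, E) collapse into
	 * [A, C), [D, E) with contig_n == 2.
	 */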
	/* Extend the last contiguous chunk to the end of the mempool. */
	chunks[contig_n - 1].end = chunks[i - 1].end;
	*out = chunks;
	*out_n = contig_n;
	return 0;
}

/**
 * Analyze mempool memory to select memory ranges to register.
 *
 * @param[in] mp
 *   Mempool to analyze.
 * @param[out] out
 *   Receives memory ranges to register, aligned to the system page size.
 *   The caller must release them with free().
 * @param[out] out_n
 *   Receives the number of @p out items.
 * @param[out] share_hugepage
 *   Receives True if the entire pool resides within a single hugepage.
 *
 * @return
 *   0 on success, (-1) on failure.
 */
static int
mlx5_mempool_reg_analyze(struct rte_mempool *mp, struct mlx5_range **out,
			 unsigned int *out_n, bool *share_hugepage)
{
	struct mlx5_range *ranges = NULL;
	unsigned int i, ranges_n = 0;
	struct rte_memseg_list *msl;

	if (mlx5_get_mempool_ranges(mp, &ranges, &ranges_n) < 0) {
		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
			mp->name);
		return -1;
	}
	/* Check if the hugepage of the pool can be shared. */
	*share_hugepage = false;
	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
	if (msl != NULL) {
		uint64_t hugepage_sz = 0;

		/* Check that all ranges are on pages of the same size. */
		for (i = 0; i < ranges_n; i++) {
			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
				break;
			hugepage_sz = msl->page_sz;
		}
		if (i == ranges_n) {
			/*
			 * If the entire pool is within one hugepage,
			 * combine all ranges into one of the hugepage size.
			 */
			uintptr_t reg_start = ranges[0].start;
			uintptr_t reg_end = ranges[ranges_n - 1].end;
			uintptr_t hugepage_start =
				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
			if (reg_end < hugepage_end) {
				ranges[0].start = hugepage_start;
				ranges[0].end = hugepage_end;
				ranges_n = 1;
				*share_hugepage = true;
			}
		}
	}
	*out = ranges;
	*out_n = ranges_n;
	return 0;
}

/** Create a registration object for the mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n)
{
	struct mlx5_mempool_reg *mpr = NULL;

	mpr = mlx5_malloc(MLX5_MEM_RTE | MLX5_MEM_ZERO,
			  sizeof(*mpr) + mrs_n * sizeof(mpr->mrs[0]),
			  RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
	if (mpr == NULL) {
		DRV_LOG(ERR, "Cannot allocate mempool %s registration object",
			mp->name);
		return NULL;
	}
	mpr->mp = mp;
	mpr->mrs = (struct mlx5_mempool_mr *)(mpr + 1);
	mpr->mrs_n = mrs_n;
	return mpr;
}

/**
 * Destroy a mempool registration object.
 *
 * @param standalone
 *   Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
 */
static void
mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mempool_reg *mpr, bool standalone)
{
	if (standalone) {
		unsigned int i;

		for (i = 0; i < mpr->mrs_n; i++)
			share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
	}
	mlx5_free(mpr);
}
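
/*
 * The registrations below live on share_cache->mempool_reg_list; every walk
 * or update of that list in this file is done under share_cache->rwlock.
 */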
/** Find registration object of a mempool. */
static struct mlx5_mempool_reg *
mlx5_mempool_reg_lookup(struct mlx5_mr_share_cache *share_cache,
			struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;

	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp)
			break;
	return mpr;
}

/** Increment reference counters of MRs used in the registration. */
static void
mlx5_mempool_reg_attach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++)
		__atomic_add_fetch(&mpr->mrs[i].refcnt, 1, __ATOMIC_RELAXED);
}

/**
 * Decrement reference counters of MRs used in the registration.
 *
 * @return True if no more references to @p mpr MRs exist, False otherwise.
 */
static bool
mlx5_mempool_reg_detach(struct mlx5_mempool_reg *mpr)
{
	unsigned int i;
	bool ret = false;

	for (i = 0; i < mpr->mrs_n; i++)
		ret |= __atomic_sub_fetch(&mpr->mrs[i].refcnt, 1,
					  __ATOMIC_RELAXED) == 0;
	return ret;
}

static int
mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
				 void *pd, struct rte_mempool *mp)
{
	struct mlx5_range *ranges = NULL;
	struct mlx5_mempool_reg *mpr, *new_mpr;
	unsigned int i, ranges_n;
	bool share_hugepage;
	int ret = -1;

	/* Early check to avoid unnecessary creation of MRs. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		rte_errno = EEXIST;
		goto exit;
	}
	if (mlx5_mempool_reg_analyze(mp, &ranges, &ranges_n,
				     &share_hugepage) < 0) {
		DRV_LOG(ERR, "Cannot get mempool %s memory ranges", mp->name);
		rte_errno = ENOMEM;
		goto exit;
	}
	new_mpr = mlx5_mempool_reg_create(mp, ranges_n);
	if (new_mpr == NULL) {
		DRV_LOG(ERR,
			"Cannot create a registration object for mempool %s in PD %p",
			mp->name, pd);
		rte_errno = ENOMEM;
		goto exit;
	}
	/*
	 * If the entire mempool fits in a single hugepage, the MR for this
	 * hugepage can be shared across mempools that also fit in it.
	 */
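	/*
	 * Sharing is detected by comparing the first MR's start address with
	 * the start of our single hugepage-sized range; on a match the
	 * existing MR array is reused and its reference counters are bumped.
	 */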
	if (share_hugepage) {
		rte_rwlock_write_lock(&share_cache->rwlock);
		LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next) {
			if (mpr->mrs[0].pmd_mr.addr == (void *)ranges[0].start)
				break;
		}
		if (mpr != NULL) {
			new_mpr->mrs = mpr->mrs;
			mlx5_mempool_reg_attach(new_mpr);
			LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
					 new_mpr, next);
		}
		rte_rwlock_write_unlock(&share_cache->rwlock);
		if (mpr != NULL) {
			DRV_LOG(DEBUG, "Shared MR %#x in PD %p for mempool %s with mempool %s",
				mpr->mrs[0].pmd_mr.lkey, pd, mp->name,
				mpr->mp->name);
			ret = 0;
			goto exit;
		}
	}
	for (i = 0; i < ranges_n; i++) {
		struct mlx5_mempool_mr *mr = &new_mpr->mrs[i];
		const struct mlx5_range *range = &ranges[i];
		size_t len = range->end - range->start;

		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
					   &mr->pmd_mr) < 0) {
			DRV_LOG(ERR,
				"Failed to create an MR in PD %p for address range "
				"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
				pd, range->start, range->end, len, mp->name);
			break;
		}
		DRV_LOG(DEBUG,
			"Created a new MR %#x in PD %p for address range "
			"[0x%" PRIxPTR ", 0x%" PRIxPTR "] (%zu bytes) for mempool %s",
			mr->pmd_mr.lkey, pd, range->start, range->end, len,
			mp->name);
	}
	if (i != ranges_n) {
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EINVAL;
		goto exit;
	}
	/* Concurrent registration is not supposed to happen. */
	rte_rwlock_write_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr == NULL) {
		mlx5_mempool_reg_attach(new_mpr);
		LIST_INSERT_HEAD(&share_cache->mempool_reg_list,
				 new_mpr, next);
		ret = 0;
	}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr != NULL) {
		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
			mp->name, pd);
		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
		rte_errno = EEXIST;
		goto exit;
	}
exit:
	free(ranges);
	return ret;
}

static int
mlx5_mr_mempool_register_secondary(struct mlx5_mr_share_cache *share_cache,
				   void *pd, struct rte_mempool *mp,
				   struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, pd, mp, true);
}

/**
 * Register the memory of a mempool in the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param pd
 *   Protection domain object.
 * @param mp
 *   Mempool to register.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_register(struct mlx5_mr_share_cache *share_cache, void *pd,
			 struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_register_primary(share_cache, pd, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_register_secondary(share_cache, pd, mp,
							  mp_id);
	default:
		return -1;
	}
}

static int
mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
				   struct rte_mempool *mp)
{
	struct mlx5_mempool_reg *mpr;
	bool standalone = false;

	rte_rwlock_write_lock(&share_cache->rwlock);
	LIST_FOREACH(mpr, &share_cache->mempool_reg_list, next)
		if (mpr->mp == mp) {
			LIST_REMOVE(mpr, next);
			standalone = mlx5_mempool_reg_detach(mpr);
			if (standalone)
				/*
				 * The unlock operation below provides a memory
				 * barrier due to its store-release semantics.
				 */
				++share_cache->dev_gen;
			break;
		}
	rte_rwlock_write_unlock(&share_cache->rwlock);
	if (mpr == NULL) {
		rte_errno = ENOENT;
		return -1;
	}
	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
	return 0;
}

static int
mlx5_mr_mempool_unregister_secondary(struct mlx5_mr_share_cache *share_cache,
				     struct rte_mempool *mp,
				     struct mlx5_mp_id *mp_id)
{
	if (mp_id == NULL) {
		rte_errno = EINVAL;
		return -1;
	}
	return mlx5_mp_req_mempool_reg(mp_id, share_cache, NULL, mp, false);
}

/**
 * Unregister the memory of a mempool from the protection domain.
 *
 * @param share_cache
 *   Shared MR cache of the protection domain.
 * @param mp
 *   Mempool to unregister.
 * @param mp_id
 *   Multi-process identifier, may be NULL for the primary process.
 *
 * @return
 *   0 on success, (-1) on failure and rte_errno is set.
 */
int
mlx5_mr_mempool_unregister(struct mlx5_mr_share_cache *share_cache,
			   struct rte_mempool *mp, struct mlx5_mp_id *mp_id)
{
	if (mp->flags & RTE_MEMPOOL_F_NON_IO)
		return 0;
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		return mlx5_mr_mempool_unregister_primary(share_cache, mp);
	case RTE_PROC_SECONDARY:
		return mlx5_mr_mempool_unregister_secondary(share_cache, mp,
							    mp_id);
	default:
		return -1;
	}
}

/**
 * Look up an MR key for an address in a registered mempool.
 *
 * @param mpr
 *   Mempool registration object.
 * @param addr
 *   Address within the mempool.
 * @param entry
 *   Bottom-half cache entry to fill.
 *
 * @return
 *   MR key or UINT32_MAX on failure, which can only happen
 *   if the address is not from within the mempool.
 */
static uint32_t
mlx5_mempool_reg_addr2mr(struct mlx5_mempool_reg *mpr, uintptr_t addr,
			 struct mr_cache_entry *entry)
{
	uint32_t lkey = UINT32_MAX;
	unsigned int i;

	for (i = 0; i < mpr->mrs_n; i++) {
		const struct mlx5_pmd_mr *mr = &mpr->mrs[i].pmd_mr;
		uintptr_t mr_addr = (uintptr_t)mr->addr;

		if (mr_addr <= addr) {
			lkey = rte_cpu_to_be_32(mr->lkey);
			entry->start = mr_addr;
			entry->end = mr_addr + mr->len;
			entry->lkey = lkey;
			break;
		}
	}
	return lkey;
}

/**
 * Update bottom-half cache from the list of mempool registrations.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param entry
 *   Pointer to an entry in the bottom-half cache to update
 *   with the MR lkey looked up.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
static uint32_t
mlx5_lookup_mempool_regs(struct mlx5_mr_share_cache *share_cache,
			 struct mlx5_mr_ctrl *mr_ctrl,
			 struct mr_cache_entry *entry,
			 struct rte_mempool *mp, uintptr_t addr)
{
	struct mlx5_mr_btree *bt = &mr_ctrl->cache_bh;
	struct mlx5_mempool_reg *mpr;
	uint32_t lkey = UINT32_MAX;

	/* If local cache table is full, try to double it. */
	if (unlikely(bt->len == bt->size))
		mr_btree_expand(bt, bt->size << 1);
	/* Look up in mempool registrations. */
	rte_rwlock_read_lock(&share_cache->rwlock);
	mpr = mlx5_mempool_reg_lookup(share_cache, mp);
	if (mpr != NULL)
		lkey = mlx5_mempool_reg_addr2mr(mpr, addr, entry);
	rte_rwlock_read_unlock(&share_cache->rwlock);
	/*
	 * Update local cache. Even if it fails, return the found entry
	 * to update top-half cache. Next time, this entry will be found
	 * in the global cache.
	 */
	if (lkey != UINT32_MAX)
		mr_btree_insert(bt, entry);
	return lkey;
}

/**
 * Bottom-half lookup for the address from the mempool.
 *
 * @param share_cache
 *   Pointer to a global shared MR cache.
 * @param mr_ctrl
 *   Per-queue MR control handle.
 * @param mp
 *   Mempool containing the address.
 * @param addr
 *   Address to lookup.
 *
 * @return
 *   MR lkey on success, UINT32_MAX on failure.
 */
uint32_t
mlx5_mr_mempool2mr_bh(struct mlx5_mr_share_cache *share_cache,
		      struct mlx5_mr_ctrl *mr_ctrl,
		      struct rte_mempool *mp, uintptr_t addr)
{
	struct mr_cache_entry *repl = &mr_ctrl->cache[mr_ctrl->head];
	uint32_t lkey;
	uint16_t bh_idx = 0;

	/* Binary-search MR translation table. */
	lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
	/* Update top-half cache. */
	if (likely(lkey != UINT32_MAX)) {
		*repl = (*mr_ctrl->cache_bh.table)[bh_idx];
	} else {
		lkey = mlx5_lookup_mempool_regs(share_cache, mr_ctrl, repl,
						mp, addr);
		/* Can only fail if the address is not from the mempool. */
		if (unlikely(lkey == UINT32_MAX))
			return UINT32_MAX;
	}
	/* Update the most recently used entry. */
	mr_ctrl->mru = mr_ctrl->head;
	/* Point to the next victim, the oldest. */
	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
	return lkey;
}