/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"
#include "spdk/net.h"
#include "spdk/file.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd *pd;
	struct ibv_context *context;
	int ref;
	bool removed;
	TAILQ_ENTRY(rdma_utils_device) tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map *map;
	struct ibv_pd *pd;
	struct spdk_nvme_rdma_hooks *hooks;
	uint32_t ref_count;
	uint32_t access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map) link;
};

struct rdma_utils_memory_domain {
	TAILQ_ENTRY(rdma_utils_memory_domain) link;
	uint32_t ref;
	enum spdk_dma_device_type type;
	struct ibv_pd *pd;
	struct spdk_memory_domain *domain;
	struct spdk_memory_domain_rdma_ctx rdma_ctx;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
		&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static TAILQ_HEAD(, rdma_utils_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
		g_memory_domains);
static pthread_mutex_t g_memory_domains_lock = PTHREAD_MUTEX_INITIALIZER;

static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};
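
/*
 * Release a mem map object created by spdk_rdma_utils_create_mem_map(). Maps
 * created with caller-provided hooks are allocated with spdk_zmalloc() from
 * DMA-capable memory and must be released with spdk_free(); maps without
 * hooks come from plain calloc()/free().
 */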
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}
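
/*
 * Device/PD tracking: each ibv_context reported by rdma_get_devices() gets an
 * rdma_utils_device entry in g_dev_list with its own protection domain. When a
 * device disappears from the verbs device list it is only marked as removed;
 * its protection domain is deallocated once the last reference is dropped.
 */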
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}
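
/*
 * Return the protection domain associated with the given verbs context and
 * take a reference on the underlying device; release it with
 * spdk_rdma_utils_put_pd() when the PD is no longer needed. A minimal usage
 * sketch (assuming the caller already holds a valid cm_id from librdmacm):
 *
 *	struct ibv_pd *pd = spdk_rdma_utils_get_pd(cm_id->verbs);
 *	if (pd != NULL) {
 *		... create a QP or mem map against pd ...
 *		spdk_rdma_utils_put_pd(pd);
 *	}
 */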
struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}

struct spdk_memory_domain *
spdk_rdma_utils_get_memory_domain(struct ibv_pd *pd)
{
	struct rdma_utils_memory_domain *domain = NULL;
	struct spdk_memory_domain_ctx ctx;
	int rc;

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->pd == pd) {
			domain->ref++;
			pthread_mutex_unlock(&g_memory_domains_lock);
			return domain->domain;
		}
	}

	domain = calloc(1, sizeof(*domain));
	if (!domain) {
		SPDK_ERRLOG("Memory allocation failed\n");
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->rdma_ctx.size = sizeof(domain->rdma_ctx);
	domain->rdma_ctx.ibv_pd = pd;
	ctx.size = sizeof(ctx);
	ctx.user_ctx = &domain->rdma_ctx;

	rc = spdk_memory_domain_create(&domain->domain, SPDK_DMA_DEVICE_TYPE_RDMA, &ctx,
				       SPDK_RDMA_DMA_DEVICE);
	if (rc) {
		SPDK_ERRLOG("Failed to create memory domain\n");
		free(domain);
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->pd = pd;
	domain->ref = 1;
	TAILQ_INSERT_TAIL(&g_memory_domains, domain, link);

	pthread_mutex_unlock(&g_memory_domains_lock);

	return domain->domain;
}

int
spdk_rdma_utils_put_memory_domain(struct spdk_memory_domain *_domain)
{
	struct rdma_utils_memory_domain *domain = NULL;

	if (!_domain) {
		return 0;
	}

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->domain == _domain) {
			break;
		}
	}

	if (!domain) {
		pthread_mutex_unlock(&g_memory_domains_lock);
		return -ENODEV;
	}
	assert(domain->ref > 0);

	domain->ref--;

	if (domain->ref == 0) {
		spdk_memory_domain_destroy(domain->domain);
		TAILQ_REMOVE(&g_memory_domains, domain, link);
		free(domain);
	}

	pthread_mutex_unlock(&g_memory_domains_lock);

	return 0;
}
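
/*
 * Best-effort NUMA lookup for an RDMA connection: resolve the cm_id's local
 * address to a network interface name and read that interface's NUMA node
 * from sysfs. Any failure along the way falls back to SPDK_ENV_NUMA_ID_ANY.
 */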
int32_t
spdk_rdma_cm_id_get_numa_id(struct rdma_cm_id *cm_id)
{
	struct sockaddr *sa;
	char addr[64];
	char ifc[64];
	uint32_t numa_id;
	int rc;

	sa = rdma_get_local_addr(cm_id);
	if (sa == NULL) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_net_get_address_string(sa, addr, sizeof(addr));
	if (rc) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_net_get_interface_name(addr, ifc, sizeof(ifc));
	if (rc) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_read_sysfs_attribute_uint32(&numa_id,
					      "/sys/class/net/%s/device/numa_node", ifc);
	if (rc || numa_id > INT32_MAX) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	return (int32_t)numa_id;
}