/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

struct rdma_utils_memory_domain {
	TAILQ_ENTRY(rdma_utils_memory_domain)	link;
	uint32_t				ref;
	enum spdk_dma_device_type		type;
	struct ibv_pd				*pd;
	struct spdk_memory_domain		*domain;
	struct spdk_memory_domain_rdma_ctx	rdma_ctx;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static TAILQ_HEAD(, rdma_utils_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
			g_memory_domains);
static pthread_mutex_t g_memory_domains_lock = PTHREAD_MUTEX_INITIALIZER;

static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};
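
/*
 * Note on how the callback above is driven: a map created with
 * spdk_mem_map_alloc() (see spdk_rdma_utils_create_mem_map() below) receives
 * SPDK_MEM_MAP_NOTIFY_REGISTER for every region tracked by the SPDK env layer,
 * which is where ibv_reg_mr() runs. Hugepage memory from spdk_dma_zmalloc() is
 * tracked automatically; application-owned buffers can be added explicitly.
 * A minimal sketch, assuming an external buffer sized and aligned per the SPDK
 * env registration rules (typically 2 MiB granularity); values are illustrative:
 *
 *	void *ext_buf = NULL;
 *	size_t ext_len = 2 * 1024 * 1024;
 *
 *	if (posix_memalign(&ext_buf, ext_len, ext_len) == 0) {
 *		spdk_mem_register(ext_buf, ext_len);    // NOTIFY_REGISTER fires here
 *		... RDMA I/O using ext_buf ...
 *		spdk_mem_unregister(ext_buf, ext_len);  // NOTIFY_UNREGISTER fires here
 *		free(ext_buf);
 *	}
 */
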
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}
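
/*
 * Example usage of the translation API above: a minimal, illustrative sketch of
 * how a transport could resolve the lkey for a buffer before posting a work
 * request. The function name is hypothetical, `pd` is assumed to come from
 * spdk_rdma_utils_get_pd() and `buf` is assumed to be memory already tracked by
 * SPDK (e.g. from spdk_dma_zmalloc()). A real transport would typically create
 * the map once per PD and keep it for its lifetime rather than per call.
 *
 *	static int
 *	example_fill_sge(struct ibv_pd *pd, void *buf, size_t len, struct ibv_sge *sge)
 *	{
 *		struct spdk_rdma_utils_mem_map *map;
 *		struct spdk_rdma_utils_memory_translation t;
 *		int rc;
 *
 *		map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *						     IBV_ACCESS_LOCAL_WRITE |
 *						     IBV_ACCESS_REMOTE_READ |
 *						     IBV_ACCESS_REMOTE_WRITE);
 *		if (map == NULL) {
 *			return -ENOMEM;
 *		}
 *
 *		rc = spdk_rdma_utils_get_translation(map, buf, len, &t);
 *		if (rc == 0 && t.translation_type == SPDK_RDMA_UTILS_TRANSLATION_MR) {
 *			sge->addr = (uint64_t)buf;
 *			sge->length = (uint32_t)len;
 *			sge->lkey = t.mr_or_key.mr->lkey;
 *		}
 *
 *		spdk_rdma_utils_free_mem_map(&map);
 *		return rc;
 *	}
 */
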
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}

struct spdk_memory_domain *
spdk_rdma_utils_get_memory_domain(struct ibv_pd *pd)
{
	struct rdma_utils_memory_domain *domain = NULL;
	struct spdk_memory_domain_ctx ctx;
	int rc;

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->pd == pd) {
			domain->ref++;
			pthread_mutex_unlock(&g_memory_domains_lock);
			return domain->domain;
		}
	}

	domain = calloc(1, sizeof(*domain));
	if (!domain) {
		SPDK_ERRLOG("Memory allocation failed\n");
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->rdma_ctx.size = sizeof(domain->rdma_ctx);
	domain->rdma_ctx.ibv_pd = pd;
	ctx.size = sizeof(ctx);
	ctx.user_ctx = &domain->rdma_ctx;

	rc = spdk_memory_domain_create(&domain->domain, SPDK_DMA_DEVICE_TYPE_RDMA, &ctx,
				       SPDK_RDMA_DMA_DEVICE);
	if (rc) {
		SPDK_ERRLOG("Failed to create memory domain\n");
		free(domain);
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->pd = pd;
	domain->ref = 1;
	TAILQ_INSERT_TAIL(&g_memory_domains, domain, link);

	pthread_mutex_unlock(&g_memory_domains_lock);

	return domain->domain;
}

int
spdk_rdma_utils_put_memory_domain(struct spdk_memory_domain *_domain)
{
	struct rdma_utils_memory_domain *domain = NULL;

	if (!_domain) {
		return 0;
	}

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->domain == _domain) {
			break;
		}
	}

	if (!domain) {
		pthread_mutex_unlock(&g_memory_domains_lock);
		return -ENODEV;
	}
	assert(domain->ref > 0);

	domain->ref--;

	if (domain->ref == 0) {
		spdk_memory_domain_destroy(domain->domain);
		TAILQ_REMOVE(&g_memory_domains, domain, link);
		free(domain);
	}

	pthread_mutex_unlock(&g_memory_domains_lock);

	return 0;
}
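
/*
 * Example usage of the PD and memory-domain helpers above: an illustrative
 * sketch (hypothetical function, not used by this file) that attaches to a
 * verbs context by looking up the shared, reference-counted PD and exposing it
 * to other SPDK components as a memory domain. `ctx` is assumed to come from
 * the RDMA CM (e.g. cm_id->verbs).
 *
 *	static int
 *	example_attach_device(struct ibv_context *ctx, struct ibv_pd **_pd,
 *			      struct spdk_memory_domain **_domain)
 *	{
 *		struct ibv_pd *pd;
 *		struct spdk_memory_domain *domain;
 *
 *		pd = spdk_rdma_utils_get_pd(ctx);
 *		if (pd == NULL) {
 *			return -ENODEV;
 *		}
 *
 *		domain = spdk_rdma_utils_get_memory_domain(pd);
 *		if (domain == NULL) {
 *			spdk_rdma_utils_put_pd(pd);
 *			return -ENOMEM;
 *		}
 *
 *		*_pd = pd;
 *		*_domain = domain;
 *		return 0;
 *	}
 *
 * Teardown is the mirror image: spdk_rdma_utils_put_memory_domain() followed by
 * spdk_rdma_utils_put_pd(). Both helpers are reference-counted, so repeated
 * lookups for the same device or PD return the existing objects.
 */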