/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
		&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}
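
/*
 * Callbacks wired into spdk_mem_map_alloc() below: notify_cb mirrors SPDK
 * memory (un)registration events into ibv_reg_mr()/ibv_dereg_mr() on the
 * map's protection domain (or into the rkey supplied by the hooks), and
 * are_contiguous only merges adjacent regions that resolve to the same MR.
 */
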
const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}
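
/*
 * Minimal usage sketch for the memory map API above. The "pd", "buf" and
 * "len" names are hypothetical placeholders for values a transport would
 * normally obtain during connection setup:
 *
 *	struct spdk_rdma_utils_mem_map *map;
 *	struct spdk_rdma_utils_memory_translation tr;
 *
 *	map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
 *	if (map != NULL && spdk_rdma_utils_get_translation(map, buf, len, &tr) == 0) {
 *		// Without hooks the translation is an MR; its lkey/rkey can be
 *		// used to fill SGEs for work requests.
 *		uint32_t lkey = tr.mr_or_key.mr->lkey;
 *	}
 *	spdk_rdma_utils_free_mem_map(&map);
 */
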
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}
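
/*
 * The functions below hand out the protection domains cached by
 * rdma_sync_dev_list(). A minimal sketch of the expected pairing, where
 * "cm_id" is a hypothetical struct rdma_cm_id obtained by the caller:
 *
 *	struct ibv_pd *pd = spdk_rdma_utils_get_pd(cm_id->verbs);
 *
 *	if (pd != NULL) {
 *		... create QPs and register memory against pd ...
 *		spdk_rdma_utils_put_pd(pd);
 *	}
 *
 * Each successful get increments the device's reference count; put drops it,
 * and the PD is deallocated only once the device has also been removed.
 */
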
353 */ 354 g_ctx_list = new_ctx_list; 355 return 0; 356 } 357 358 struct ibv_pd * 359 spdk_rdma_utils_get_pd(struct ibv_context *context) 360 { 361 struct rdma_utils_device *dev; 362 int rc; 363 364 pthread_mutex_lock(&g_dev_mutex); 365 366 rc = rdma_sync_dev_list(); 367 if (rc != 0) { 368 pthread_mutex_unlock(&g_dev_mutex); 369 370 SPDK_ERRLOG("Failed to sync RDMA device list\n"); 371 return NULL; 372 } 373 374 TAILQ_FOREACH(dev, &g_dev_list, tailq) { 375 if (dev->context == context && !dev->removed) { 376 dev->ref++; 377 pthread_mutex_unlock(&g_dev_mutex); 378 379 return dev->pd; 380 } 381 } 382 383 pthread_mutex_unlock(&g_dev_mutex); 384 385 SPDK_ERRLOG("Failed to get PD\n"); 386 return NULL; 387 } 388 389 void 390 spdk_rdma_utils_put_pd(struct ibv_pd *pd) 391 { 392 struct rdma_utils_device *dev, *tmp; 393 394 pthread_mutex_lock(&g_dev_mutex); 395 396 TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) { 397 if (dev->pd == pd) { 398 assert(dev->ref > 0); 399 dev->ref--; 400 401 rdma_remove_dev(dev); 402 } 403 } 404 405 rdma_sync_dev_list(); 406 407 pthread_mutex_unlock(&g_dev_mutex); 408 } 409 410 __attribute__((destructor)) static void 411 _rdma_utils_fini(void) 412 { 413 struct rdma_utils_device *dev, *tmp; 414 415 TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) { 416 dev->removed = true; 417 dev->ref = 0; 418 rdma_remove_dev(dev); 419 } 420 421 if (g_ctx_list != NULL) { 422 rdma_free_devices(g_ctx_list); 423 g_ctx_list = NULL; 424 } 425 } 426