xref: /spdk/lib/rdma_utils/rdma_utils.c (revision 8a4b722644b813f499cc1ee74dfd5b8f50eedf94)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"
#include "spdk/net.h"
#include "spdk/file.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

struct rdma_utils_memory_domain {
	TAILQ_ENTRY(rdma_utils_memory_domain) link;
	uint32_t ref;
	enum spdk_dma_device_type type;
	struct ibv_pd *pd;
	struct spdk_memory_domain *domain;
	struct spdk_memory_domain_rdma_ctx rdma_ctx;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static TAILQ_HEAD(, rdma_utils_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
			g_memory_domains);
static pthread_mutex_t g_memory_domains_lock = PTHREAD_MUTEX_INITIALIZER;

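/*
 * Memory map notification callback. On REGISTER, either store the rkey returned
 * by the caller-provided hooks or register a new ibv_mr on the map's protection
 * domain and store the MR pointer as the translation. On UNREGISTER, deregister
 * the MR (when no rkey hook is in use) and clear the translation.
 */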
static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address, which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

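/* Free a map object with the allocator that created it (see spdk_rdma_utils_create_mem_map()). */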
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

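/*
 * Return a reference-counted memory map for the given protection domain and
 * access flags, creating a new one and adding it to the global list if no
 * matching map exists yet.
 */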
struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

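/* Drop one reference to a memory map; the map is destroyed when the last reference is released. */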
void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

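/*
 * Translate a buffer to either a remote key (when rkey hooks are in use) or an
 * ibv_mr registered by the notification callback above.
 */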
int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}

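/* Allocate a device object and its protection domain and append it to g_dev_list. */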
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

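/* No-op unless the device has been removed and its reference count has dropped to zero. */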
static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

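/*
 * Reconcile g_dev_list with the set of contexts currently reported by
 * rdma_get_devices(): add devices for new contexts and mark devices whose
 * context has disappeared as removed. Caller must hold g_dev_mutex.
 */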
static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL-terminated array of opened RDMA devices,
	 * and sets num_devs to the number of returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by address so it can be compared against the old list
	 * in a single pass.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

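/* Get the shared protection domain for a device context, taking a reference on the owning device. */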
struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

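/* Release a reference taken by spdk_rdma_utils_get_pd() and clean up removed devices. */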
void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

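/* Library destructor: force-release all remaining devices and the cached context array. */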
__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}

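/*
 * Return a reference-counted SPDK memory domain bound to the given protection
 * domain, creating it on first use.
 */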
struct spdk_memory_domain *
spdk_rdma_utils_get_memory_domain(struct ibv_pd *pd)
{
	struct rdma_utils_memory_domain *domain = NULL;
	struct spdk_memory_domain_ctx ctx;
	int rc;

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->pd == pd) {
			domain->ref++;
			pthread_mutex_unlock(&g_memory_domains_lock);
			return domain->domain;
		}
	}

	domain = calloc(1, sizeof(*domain));
	if (!domain) {
		SPDK_ERRLOG("Memory allocation failed\n");
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->rdma_ctx.size = sizeof(domain->rdma_ctx);
	domain->rdma_ctx.ibv_pd = pd;
	ctx.size = sizeof(ctx);
	ctx.user_ctx = &domain->rdma_ctx;

	rc = spdk_memory_domain_create(&domain->domain, SPDK_DMA_DEVICE_TYPE_RDMA, &ctx,
				       SPDK_RDMA_DMA_DEVICE);
	if (rc) {
		SPDK_ERRLOG("Failed to create memory domain\n");
		free(domain);
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->pd = pd;
	domain->ref = 1;
	TAILQ_INSERT_TAIL(&g_memory_domains, domain, link);

	pthread_mutex_unlock(&g_memory_domains_lock);

	return domain->domain;
}

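/* Drop a reference to a memory domain obtained from spdk_rdma_utils_get_memory_domain(). */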
int
spdk_rdma_utils_put_memory_domain(struct spdk_memory_domain *_domain)
{
	struct rdma_utils_memory_domain *domain = NULL;

	if (!_domain) {
		return 0;
	}

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->domain == _domain) {
			break;
		}
	}

	if (!domain) {
		pthread_mutex_unlock(&g_memory_domains_lock);
		return -ENODEV;
	}
	assert(domain->ref > 0);

	domain->ref--;

	if (domain->ref == 0) {
		spdk_memory_domain_destroy(domain->domain);
		TAILQ_REMOVE(&g_memory_domains, domain, link);
		free(domain);
	}

	pthread_mutex_unlock(&g_memory_domains_lock);

	return 0;
}

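/*
 * Derive the NUMA node of the network interface that backs the cm_id's local
 * address by reading /sys/class/net/<ifc>/device/numa_node. Falls back to
 * SPDK_ENV_NUMA_ID_ANY if any step of the lookup fails.
 */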
int32_t
spdk_rdma_cm_id_get_numa_id(struct rdma_cm_id *cm_id)
{
	struct sockaddr	*sa;
	char		addr[64];
	char		ifc[64];
	uint32_t	numa_id;
	int		rc;

	sa = rdma_get_local_addr(cm_id);
	if (sa == NULL) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_net_get_address_string(sa, addr, sizeof(addr));
	if (rc) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_net_get_interface_name(addr, ifc, sizeof(ifc));
	if (rc) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	rc = spdk_read_sysfs_attribute_uint32(&numa_id,
					      "/sys/class/net/%s/device/numa_node", ifc);
	if (rc || numa_id > INT32_MAX) {
		return SPDK_ENV_NUMA_ID_ANY;
	}
	return (int32_t)numa_id;
}
553