xref: /spdk/lib/rdma_utils/rdma_utils.c (revision 8afdeef3becfe9409cc9e7372bd0bc10e8b7d46d)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

struct rdma_utils_memory_domain {
	TAILQ_ENTRY(rdma_utils_memory_domain) link;
	uint32_t ref;
	enum spdk_dma_device_type type;
	struct ibv_pd *pd;
	struct spdk_memory_domain *domain;
	struct spdk_memory_domain_rdma_ctx rdma_ctx;
};

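/*
 * Module-global state: a list of opened RDMA devices and their protection
 * domains, a list of per-PD memory maps, and a list of per-PD memory domains.
 * Each registry is protected by its own mutex, so the three caches can be
 * used independently of each other.
 */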
static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static TAILQ_HEAD(, rdma_utils_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
			g_memory_domains);
static pthread_mutex_t g_memory_domains_lock = PTHREAD_MUTEX_INITIALIZER;

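/*
 * Memory map notification callback. On SPDK_MEM_MAP_NOTIFY_REGISTER the new
 * region is registered with ibv_reg_mr() (or translated to a raw key via the
 * get_rkey hook, if one is provided) and the result is stored as the map
 * translation. On SPDK_MEM_MAP_NOTIFY_UNREGISTER the MR is deregistered and
 * the translation is cleared.
 */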
static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address, which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

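/*
 * Release a mem map object. Maps created with hooks are allocated with
 * spdk_zmalloc() in spdk_rdma_utils_create_mem_map(), so they must be freed
 * with spdk_free(); maps allocated with calloc() are freed with free().
 */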
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* iWARP requires REMOTE_WRITE permission for RDMA_READ operations */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up an existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

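/*
 * Translate a virtual address range into the RDMA handle needed to describe
 * it to the NIC: either the ibv_mr registered for the region or, when a
 * get_rkey hook is in use, the raw key stored in the map. Returns -EINVAL in
 * the MR case if no translation exists for the address.
 */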
int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}


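/*
 * A minimal usage sketch for the mem map API (illustrative only; "pd", "buf"
 * and "len" are hypothetical, buf must come from SPDK-registered memory such
 * as spdk_dma_malloc(), and error handling is trimmed):
 *
 *	struct spdk_rdma_utils_mem_map *map;
 *	struct spdk_rdma_utils_memory_translation t;
 *	struct ibv_sge sge;
 *
 *	map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
 *	if (map && spdk_rdma_utils_get_translation(map, buf, len, &t) == 0 &&
 *	    t.translation_type == SPDK_RDMA_UTILS_TRANSLATION_MR) {
 *		sge.addr = (uint64_t)buf;
 *		sge.length = len;
 *		sge.lkey = t.mr_or_key.mr->lkey;
 *	}
 *	spdk_rdma_utils_free_mem_map(&map);
 */
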
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

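/*
 * Refresh g_dev_list against the device array reported by rdma_get_devices().
 * Both the previous and the new arrays are sorted by context pointer, so a
 * single merge pass can detect contexts that appeared (allocate a PD for them)
 * or disappeared (mark the device removed and free it once unreferenced).
 */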
static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL-terminated array of opened RDMA devices
	 * and sets num_devs to the number of returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by address so the old and new arrays can be compared
	 * with a single merge pass.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it;
		 * if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

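/*
 * A minimal usage sketch for the PD cache (illustrative only; "cm_id" is a
 * hypothetical rdma_cm_id obtained elsewhere). Every spdk_rdma_utils_get_pd()
 * must be balanced by spdk_rdma_utils_put_pd():
 *
 *	struct ibv_pd *pd = spdk_rdma_utils_get_pd(cm_id->verbs);
 *
 *	if (pd != NULL) {
 *		... create QPs / register memory against pd ...
 *		spdk_rdma_utils_put_pd(pd);
 *	}
 */
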
__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}

struct spdk_memory_domain *
spdk_rdma_utils_get_memory_domain(struct ibv_pd *pd)
{
	struct rdma_utils_memory_domain *domain = NULL;
	struct spdk_memory_domain_ctx ctx;
	int rc;

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->pd == pd) {
			domain->ref++;
			pthread_mutex_unlock(&g_memory_domains_lock);
			return domain->domain;
		}
	}

	domain = calloc(1, sizeof(*domain));
	if (!domain) {
		SPDK_ERRLOG("Memory allocation failed\n");
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->rdma_ctx.size = sizeof(domain->rdma_ctx);
	domain->rdma_ctx.ibv_pd = pd;
	ctx.size = sizeof(ctx);
	ctx.user_ctx = &domain->rdma_ctx;

	rc = spdk_memory_domain_create(&domain->domain, SPDK_DMA_DEVICE_TYPE_RDMA, &ctx,
				       SPDK_RDMA_DMA_DEVICE);
	if (rc) {
		SPDK_ERRLOG("Failed to create memory domain\n");
		free(domain);
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->pd = pd;
	domain->ref = 1;
	TAILQ_INSERT_TAIL(&g_memory_domains, domain, link);

	pthread_mutex_unlock(&g_memory_domains_lock);

	return domain->domain;
}

int
spdk_rdma_utils_put_memory_domain(struct spdk_memory_domain *_domain)
{
	struct rdma_utils_memory_domain *domain = NULL;

	if (!_domain) {
		return 0;
	}

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->domain == _domain) {
			break;
		}
	}

	if (!domain) {
		pthread_mutex_unlock(&g_memory_domains_lock);
		return -ENODEV;
	}
	assert(domain->ref > 0);

	domain->ref--;

	if (domain->ref == 0) {
		spdk_memory_domain_destroy(domain->domain);
		TAILQ_REMOVE(&g_memory_domains, domain, link);
		free(domain);
	}

	pthread_mutex_unlock(&g_memory_domains_lock);

	return 0;
}

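/*
 * A minimal usage sketch for the memory domain cache (illustrative only; "pd"
 * is a hypothetical protection domain, e.g. from spdk_rdma_utils_get_pd()).
 * Domains are reference counted per PD, so each get must be paired with a put:
 *
 *	struct spdk_memory_domain *dma_domain;
 *
 *	dma_domain = spdk_rdma_utils_get_memory_domain(pd);
 *	if (dma_domain != NULL) {
 *		... advertise dma_domain to upper layers ...
 *		spdk_rdma_utils_put_memory_domain(dma_domain);
 *	}
 */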