xref: /spdk/lib/rdma_utils/rdma_utils.c (revision 8ffb2c09ef6fbf68b189dda05ed8fd479c3ba144)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

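/*
 * spdk_mem_map notification callback. On REGISTER, either store the
 * caller-provided rkey (when get_rkey hooks are set) or register the region
 * with ibv_reg_mr() and store the resulting ibv_mr as the translation.
 * On UNREGISTER, deregister the MR (if any) and clear the translation.
 */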
static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed: %s (%d)\n", spdk_strerror(errno), errno);
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address, which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

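/*
 * Callback table handed to spdk_mem_map_alloc() in
 * spdk_rdma_utils_create_mem_map() below; SPDK invokes notify_cb whenever
 * memory is registered with or unregistered from the env layer.
 */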
const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* iWARP requires the REMOTE_WRITE permission for RDMA READ operations */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}
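
/*
 * Typical usage (illustrative sketch only; "pd" is assumed to be a valid
 * protection domain, e.g. from spdk_rdma_utils_get_pd(), and error handling
 * is abbreviated):
 *
 *     struct spdk_rdma_utils_mem_map *map;
 *
 *     map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *                                          IBV_ACCESS_LOCAL_WRITE |
 *                                          IBV_ACCESS_REMOTE_READ);
 *     if (map == NULL) {
 *         return -ENOMEM;
 *     }
 *     ...
 *     spdk_rdma_utils_free_mem_map(&map);
 */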

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}
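
/*
 * Example (illustrative sketch; assumes the map was created without get_rkey
 * hooks, so the translation carries a struct ibv_mr from which the local key
 * can be taken):
 *
 *     struct spdk_rdma_utils_memory_translation translation;
 *     struct ibv_sge sge;
 *
 *     if (spdk_rdma_utils_get_translation(map, buf, len, &translation) == 0) {
 *         sge.addr = (uint64_t)buf;
 *         sge.length = len;
 *         sge.lkey = translation.mr_or_key.mr->lkey;
 *     }
 */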

static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	/* Deallocate the protection domain only if the device has already been
	 * removed and there are no remaining references.
	 */
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

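/*
 * qsort() comparator: order ibv_context pointers by ascending address.
 * Returns -1, 0 or 1.
 */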
static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

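/*
 * Reconcile g_dev_list with the devices currently reported by rdma-cm.
 * Both the old and new context arrays are sorted by address, so they can be
 * walked in lockstep: contexts only in the new array are added, and contexts
 * only in the old array are marked removed.
 */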
static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL-terminated array of opened RDMA devices
	 * and sets num_devs to the number of devices returned.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by address so that it can be walked in lockstep with
	 * the previously sorted g_ctx_list.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}
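
/*
 * Illustrative sketch: spdk_rdma_utils_get_pd() and spdk_rdma_utils_put_pd()
 * must be paired so that the protection domain of a hot-removed device can
 * eventually be released ("cm_id" is an assumed rdma_cm identifier):
 *
 *     struct ibv_pd *pd = spdk_rdma_utils_get_pd(cm_id->verbs);
 *     if (pd == NULL) {
 *         return -ENODEV;
 *     }
 *     ...
 *     spdk_rdma_utils_put_pd(pd);
 */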

__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}