xref: /spdk/lib/rdma_utils/rdma_utils.c (revision 8a01b4d6366393ba157b7c42f076389b9cebaafa)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

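/*
 * Tracks one opened RDMA device and the protection domain allocated for it.
 * 'ref' counts outstanding spdk_rdma_utils_get_pd() users; 'removed' marks a
 * device that is no longer reported by rdma_get_devices() but may still be
 * referenced.
 */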
struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

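/*
 * Reference-counted memory map that translates registered virtual addresses
 * either to ibv_mr pointers (local registration through ibv_reg_mr()) or to
 * remote keys supplied by the hooks->get_rkey() callback. Maps are shared per
 * (pd, role) pair via the g_rdma_utils_mr_maps list.
 */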
struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	enum spdk_rdma_utils_memory_map_role	role;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

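/*
 * spdk_mem_map notification callback. On REGISTER the region is either mapped
 * to a remote key obtained from hooks->get_rkey(), or registered with
 * ibv_reg_mr() using access flags derived from the map's role. On UNREGISTER
 * the MR (if one was created) is deregistered and the translation is cleared.
 */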
static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags = 0;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			switch (rmap->role) {
			case SPDK_RDMA_UTILS_MEMORY_MAP_ROLE_TARGET:
				access_flags = IBV_ACCESS_LOCAL_WRITE;
				if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
					/* IWARP requires REMOTE_WRITE permission for RDMA_READ operation */
					access_flags |= IBV_ACCESS_REMOTE_WRITE;
				}
				break;
			case SPDK_RDMA_UTILS_MEMORY_MAP_ROLE_INITIATOR:
				access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE;
				break;
			default:
				SPDK_UNREACHABLE();
			}
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

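/*
 * Free a map object with the allocator that created it: spdk_zmalloc() is used
 * when hooks are provided (see spdk_rdma_utils_create_mem_map()), plain
 * calloc() otherwise.
 */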
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

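/*
 * Return a memory map for the given protection domain and role. An existing
 * map created for the same (pd, role) pair is reused and its reference count
 * is bumped; otherwise a new map is allocated and inserted into the global
 * list.
 */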
struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       enum spdk_rdma_utils_memory_map_role role)
{
	struct spdk_rdma_utils_mem_map *map;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->role == role) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->role = role;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

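/*
 * Drop a reference to a memory map and destroy it once the last user is gone.
 * The caller's pointer is cleared in all cases.
 */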
void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

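/*
 * Translate a registered buffer into either an ibv_mr pointer or a remote key,
 * depending on whether the map was created with get_rkey hooks. Returns
 * -EINVAL if no MR translation exists for the address.
 */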
int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}

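/*
 * Illustrative sketch only (not part of this library): one way a transport
 * could combine spdk_rdma_utils_get_pd(), spdk_rdma_utils_create_mem_map() and
 * spdk_rdma_utils_get_translation() to obtain the lkey of a registered buffer.
 * The verbs_ctx, buf and len names below are hypothetical, error handling is
 * reduced to the minimum, and buf is assumed to be memory already known to the
 * SPDK memory map (e.g. allocated with spdk_dma_malloc()).
 *
 *	struct spdk_rdma_utils_mem_map *map;
 *	struct spdk_rdma_utils_memory_translation translation;
 *	struct ibv_pd *pd;
 *	uint32_t lkey = 0;
 *
 *	pd = spdk_rdma_utils_get_pd(verbs_ctx);
 *	map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *					     SPDK_RDMA_UTILS_MEMORY_MAP_ROLE_INITIATOR);
 *	if (spdk_rdma_utils_get_translation(map, buf, len, &translation) == 0 &&
 *	    translation.translation_type == SPDK_RDMA_UTILS_TRANSLATION_MR) {
 *		lkey = translation.mr_or_key.mr->lkey;
 *	}
 *
 *	spdk_rdma_utils_free_mem_map(&map);
 *	spdk_rdma_utils_put_pd(pd);
 */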
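/*
 * Allocate a protection domain for a newly discovered device and append it to
 * g_dev_list. Called with g_dev_mutex held.
 */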
static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	/* Deallocate protection domain only if the device is already removed and
	 * there is no reference.
	 */
	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

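/* qsort() comparator: orders ibv_context pointers by address. */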
static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

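/*
 * Reconcile g_dev_list with the devices currently reported by
 * rdma_get_devices(). Both the new array and the cached one are sorted by
 * context pointer, so a single merge pass detects contexts that were added or
 * removed since the previous call. Called with g_dev_mutex held.
 */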
static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL terminated array of opened RDMA devices,
	 * and sets num_devs to the number of the returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by addresses to update devices easily.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * or if a context exists only in the old array, try removing the
		 * corresponding device.
		 */

		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

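/*
 * Look up (or create, on first use) the shared protection domain for the given
 * verbs context and take a reference on it. Each successful call is expected
 * to be paired with spdk_rdma_utils_put_pd().
 */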
struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

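/*
 * Release a reference taken by spdk_rdma_utils_get_pd(). The protection domain
 * is deallocated only after the device has been removed and the last reference
 * has been dropped.
 */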
void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

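/*
 * Library destructor: force-release any remaining devices and the cached
 * context array when the process (or shared library) is torn down.
 */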
__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}