xref: /spdk/lib/rdma_utils/rdma_utils.c (revision 4586880f596e61c5a599d0766bb47c004bbd2dd6)
/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk_internal/rdma_utils.h"

#include "spdk/log.h"
#include "spdk/string.h"
#include "spdk/likely.h"

#include "spdk_internal/assert.h"

#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

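/*
 * Per-ibv_context bookkeeping: a protection domain allocated for the device,
 * a reference count of spdk_rdma_utils_get_pd() users, and a removed flag set
 * once the context no longer appears in rdma_get_devices().
 */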
struct rdma_utils_device {
	struct ibv_pd			*pd;
	struct ibv_context		*context;
	int				ref;
	bool				removed;
	TAILQ_ENTRY(rdma_utils_device)	tailq;
};

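/*
 * A reference-counted memory map tied to one protection domain and one set of
 * access flags. Translations map registered regions to ibv_mr pointers, or to
 * raw rkeys when user-supplied hooks are in use.
 */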
struct spdk_rdma_utils_mem_map {
	struct spdk_mem_map			*map;
	struct ibv_pd				*pd;
	struct spdk_nvme_rdma_hooks		*hooks;
	uint32_t				ref_count;
	uint32_t				access_flags;
	LIST_ENTRY(spdk_rdma_utils_mem_map)	link;
};

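/* A reference-counted spdk_memory_domain wrapper, keyed by protection domain. */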
struct rdma_utils_memory_domain {
	TAILQ_ENTRY(rdma_utils_memory_domain) link;
	uint32_t ref;
	enum spdk_dma_device_type type;
	struct ibv_pd *pd;
	struct spdk_memory_domain *domain;
	struct spdk_memory_domain_rdma_ctx rdma_ctx;
};

static pthread_mutex_t g_dev_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct ibv_context **g_ctx_list = NULL;
static TAILQ_HEAD(, rdma_utils_device) g_dev_list = TAILQ_HEAD_INITIALIZER(g_dev_list);

static LIST_HEAD(, spdk_rdma_utils_mem_map) g_rdma_utils_mr_maps = LIST_HEAD_INITIALIZER(
			&g_rdma_utils_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static TAILQ_HEAD(, rdma_utils_memory_domain) g_memory_domains = TAILQ_HEAD_INITIALIZER(
			g_memory_domains);
static pthread_mutex_t g_memory_domains_lock = PTHREAD_MUTEX_INITIALIZER;

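/*
 * Memory map notify callback: registers memory with the protection domain when
 * it enters the SPDK address map and deregisters it on removal. When user
 * hooks are present, the stored translation is the rkey returned by
 * get_rkey() rather than an ibv_mr pointer.
 */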
static int
rdma_utils_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
		      enum spdk_mem_map_notify_action action,
		      void *vaddr, size_t size)
{
	struct spdk_rdma_utils_mem_map *rmap = cb_ctx;
	struct ibv_pd *pd = rmap->pd;
	struct ibv_mr *mr;
	uint32_t access_flags;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (rmap->hooks && rmap->hooks->get_rkey) {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  rmap->hooks->get_rkey(pd, vaddr, size));
		} else {
			access_flags = rmap->access_flags;
#ifdef IBV_ACCESS_OPTIONAL_FIRST
			access_flags |= IBV_ACCESS_RELAXED_ORDERING;
#endif
			mr = ibv_reg_mr(pd, vaddr, size, access_flags);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -1;
			} else {
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (rmap->hooks == NULL || rmap->hooks->get_rkey == NULL) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}

static int
rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}

const struct spdk_mem_map_ops g_rdma_map_ops = {
	.notify_cb = rdma_utils_mem_notify,
	.are_contiguous = rdma_check_contiguous_entries
};

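/*
 * Release a map with the allocator that created it: maps created with hooks
 * come from spdk_zmalloc(), all others from calloc().
 */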
static void
_rdma_free_mem_map(struct spdk_rdma_utils_mem_map *map)
{
	assert(map);

	if (map->hooks) {
		spdk_free(map);
	} else {
		free(map);
	}
}

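/*
 * Find or create a memory map for the given PD and access flags. An existing
 * map with the same PD and flags is reused with its reference count bumped.
 *
 * Illustrative usage (a sketch, not part of this file; error handling omitted):
 *
 *	struct spdk_rdma_utils_mem_map *map;
 *
 *	map = spdk_rdma_utils_create_mem_map(pd, NULL,
 *					     IBV_ACCESS_LOCAL_WRITE |
 *					     IBV_ACCESS_REMOTE_READ |
 *					     IBV_ACCESS_REMOTE_WRITE);
 *	...
 *	spdk_rdma_utils_free_mem_map(&map);
 */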
struct spdk_rdma_utils_mem_map *
spdk_rdma_utils_create_mem_map(struct ibv_pd *pd, struct spdk_nvme_rdma_hooks *hooks,
			       uint32_t access_flags)
{
	struct spdk_rdma_utils_mem_map *map;

	if (pd->context->device->transport_type == IBV_TRANSPORT_IWARP) {
		/* iWARP requires REMOTE_WRITE permission for RDMA_READ operations */
		access_flags |= IBV_ACCESS_REMOTE_WRITE;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	/* Look up an existing mem map registration for this pd */
	LIST_FOREACH(map, &g_rdma_utils_mr_maps, link) {
		if (map->pd == pd && map->access_flags == access_flags) {
			map->ref_count++;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return map;
		}
	}

	if (hooks) {
		map = spdk_zmalloc(sizeof(*map), 0, NULL, SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
	} else {
		map = calloc(1, sizeof(*map));
	}
	if (!map) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		SPDK_ERRLOG("Memory allocation failed\n");
		return NULL;
	}
	map->pd = pd;
	map->ref_count = 1;
	map->hooks = hooks;
	map->access_flags = access_flags;
	map->map = spdk_mem_map_alloc(0, &g_rdma_map_ops, map);
	if (!map->map) {
		SPDK_ERRLOG("Unable to create memory map\n");
		_rdma_free_mem_map(map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return NULL;
	}
	LIST_INSERT_HEAD(&g_rdma_utils_mr_maps, map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return map;
}

void
spdk_rdma_utils_free_mem_map(struct spdk_rdma_utils_mem_map **_map)
{
	struct spdk_rdma_utils_mem_map *map;

	if (!_map) {
		return;
	}

	map = *_map;
	if (!map) {
		return;
	}
	*_map = NULL;

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);
	assert(map->ref_count > 0);
	map->ref_count--;
	if (map->ref_count != 0) {
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return;
	}

	LIST_REMOVE(map, link);
	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
	if (map->map) {
		spdk_mem_map_free(&map->map);
	}
	_rdma_free_mem_map(map);
}

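/*
 * Translate a virtual address range to its registered ibv_mr, or to a raw key
 * when hooks are in use. Returns -EINVAL if the range was never registered.
 *
 * Illustrative usage (a sketch, not part of this file; assumes the map was
 * created without hooks, so the MR branch is taken, and "sge" is a caller's
 * ibv_sge):
 *
 *	struct spdk_rdma_utils_memory_translation t;
 *
 *	if (spdk_rdma_utils_get_translation(map, buf, len, &t) == 0 &&
 *	    t.translation_type == SPDK_RDMA_UTILS_TRANSLATION_MR) {
 *		sge.lkey = t.mr_or_key.mr->lkey;
 *	}
 */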
int
spdk_rdma_utils_get_translation(struct spdk_rdma_utils_mem_map *map, void *address,
				size_t length, struct spdk_rdma_utils_memory_translation *translation)
{
	uint64_t real_length = length;

	assert(map);
	assert(address);
	assert(translation);

	if (map->hooks && map->hooks->get_rkey) {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_KEY;
		translation->mr_or_key.key = spdk_mem_map_translate(map->map, (uint64_t)address, &real_length);
	} else {
		translation->translation_type = SPDK_RDMA_UTILS_TRANSLATION_MR;
		translation->mr_or_key.mr = (struct ibv_mr *)spdk_mem_map_translate(map->map, (uint64_t)address,
					    &real_length);
		if (spdk_unlikely(!translation->mr_or_key.mr)) {
			SPDK_ERRLOG("No translation for ptr %p, size %zu\n", address, length);
			return -EINVAL;
		}
	}

	assert(real_length >= length);

	return 0;
}

static struct rdma_utils_device *
rdma_add_dev(struct ibv_context *context)
{
	struct rdma_utils_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		SPDK_ERRLOG("Failed to allocate RDMA device object.\n");
		return NULL;
	}

	dev->pd = ibv_alloc_pd(context);
	if (dev->pd == NULL) {
		SPDK_ERRLOG("ibv_alloc_pd() failed: %s (%d)\n", spdk_strerror(errno), errno);
		free(dev);
		return NULL;
	}

	dev->context = context;
	TAILQ_INSERT_TAIL(&g_dev_list, dev, tailq);

	return dev;
}

static void
rdma_remove_dev(struct rdma_utils_device *dev)
{
	/* Deallocate the protection domain only if the device has been removed
	 * and there are no remaining references.
	 */
	if (!dev->removed || dev->ref > 0) {
		return;
	}

	TAILQ_REMOVE(&g_dev_list, dev, tailq);
	ibv_dealloc_pd(dev->pd);
	free(dev);
}

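/* qsort() comparator: orders ibv_context pointers by address, returning -1, 0, or 1. */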
static int
ctx_cmp(const void *_c1, const void *_c2)
{
	struct ibv_context *c1 = *(struct ibv_context **)_c1;
	struct ibv_context *c2 = *(struct ibv_context **)_c2;

	return c1 < c2 ? -1 : c1 > c2;
}

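/*
 * Reconcile g_dev_list with the devices currently reported by
 * rdma_get_devices(). Both the old and new context arrays are sorted by
 * pointer value, so a single two-pointer merge pass finds contexts present
 * only in the new array (add a device) or only in the old array (mark the
 * device removed and free it once unreferenced).
 */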
static int
rdma_sync_dev_list(void)
{
	struct ibv_context **new_ctx_list;
	int i, j;
	int num_devs = 0;

	/*
	 * rdma_get_devices() returns a NULL-terminated array of opened RDMA devices,
	 * and sets num_devs to the number of returned devices.
	 */
	new_ctx_list = rdma_get_devices(&num_devs);
	if (new_ctx_list == NULL) {
		SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
		return -ENODEV;
	}

	if (num_devs == 0) {
		rdma_free_devices(new_ctx_list);
		SPDK_ERRLOG("Returned RDMA device array was empty\n");
		return -ENODEV;
	}

	/*
	 * Sort new_ctx_list by address so that the old and new arrays can be
	 * compared in a single linear pass.
	 */
	qsort(new_ctx_list, num_devs, sizeof(struct ibv_context *), ctx_cmp);

	if (g_ctx_list == NULL) {
		/* If there is no old array, this is the first call. Add all devices. */
		for (i = 0; new_ctx_list[i] != NULL; i++) {
			rdma_add_dev(new_ctx_list[i]);
		}

		goto exit;
	}

	for (i = j = 0; new_ctx_list[i] != NULL || g_ctx_list[j] != NULL;) {
		struct ibv_context *new_ctx = new_ctx_list[i];
		struct ibv_context *old_ctx = g_ctx_list[j];
		bool add = false, remove = false;

		/*
		 * If a context exists only in the new array, create a device for it,
		 * and if a context exists only in the old array, try removing the
		 * corresponding device.
		 */
		if (old_ctx == NULL) {
			add = true;
		} else if (new_ctx == NULL) {
			remove = true;
		} else if (new_ctx < old_ctx) {
			add = true;
		} else if (old_ctx < new_ctx) {
			remove = true;
		}

		if (add) {
			rdma_add_dev(new_ctx_list[i]);
			i++;
		} else if (remove) {
			struct rdma_utils_device *dev, *tmp;

			TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
				if (dev->context == g_ctx_list[j]) {
					dev->removed = true;
					rdma_remove_dev(dev);
				}
			}
			j++;
		} else {
			i++;
			j++;
		}
	}

	/* Free the old array. */
	rdma_free_devices(g_ctx_list);

exit:
	/*
	 * Keep the newly returned array so that the allocated protection domains
	 * are not freed unexpectedly.
	 */
	g_ctx_list = new_ctx_list;
	return 0;
}

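/*
 * Return the cached protection domain for a context, taking a reference.
 * Pair each successful call with spdk_rdma_utils_put_pd().
 *
 * Illustrative usage (a sketch, not part of this file; "cm_id" is a caller's
 * rdma_cm_id):
 *
 *	struct ibv_pd *pd = spdk_rdma_utils_get_pd(cm_id->verbs);
 *
 *	if (pd != NULL) {
 *		...
 *		spdk_rdma_utils_put_pd(pd);
 *	}
 */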
struct ibv_pd *
spdk_rdma_utils_get_pd(struct ibv_context *context)
{
	struct rdma_utils_device *dev;
	int rc;

	pthread_mutex_lock(&g_dev_mutex);

	rc = rdma_sync_dev_list();
	if (rc != 0) {
		pthread_mutex_unlock(&g_dev_mutex);

		SPDK_ERRLOG("Failed to sync RDMA device list\n");
		return NULL;
	}

	TAILQ_FOREACH(dev, &g_dev_list, tailq) {
		if (dev->context == context && !dev->removed) {
			dev->ref++;
			pthread_mutex_unlock(&g_dev_mutex);

			return dev->pd;
		}
	}

	pthread_mutex_unlock(&g_dev_mutex);

	SPDK_ERRLOG("Failed to get PD\n");
	return NULL;
}

void
spdk_rdma_utils_put_pd(struct ibv_pd *pd)
{
	struct rdma_utils_device *dev, *tmp;

	pthread_mutex_lock(&g_dev_mutex);

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		if (dev->pd == pd) {
			assert(dev->ref > 0);
			dev->ref--;

			rdma_remove_dev(dev);
		}
	}

	rdma_sync_dev_list();

	pthread_mutex_unlock(&g_dev_mutex);
}

__attribute__((destructor)) static void
_rdma_utils_fini(void)
{
	struct rdma_utils_device *dev, *tmp;

	TAILQ_FOREACH_SAFE(dev, &g_dev_list, tailq, tmp) {
		dev->removed = true;
		dev->ref = 0;
		rdma_remove_dev(dev);
	}

	if (g_ctx_list != NULL) {
		rdma_free_devices(g_ctx_list);
		g_ctx_list = NULL;
	}
}

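/*
 * Find or create the spdk_memory_domain associated with a protection domain,
 * taking a reference. Pair each successful call with
 * spdk_rdma_utils_put_memory_domain().
 *
 * Illustrative usage (a sketch, not part of this file):
 *
 *	struct spdk_memory_domain *domain = spdk_rdma_utils_get_memory_domain(pd);
 *
 *	if (domain != NULL) {
 *		...
 *		spdk_rdma_utils_put_memory_domain(domain);
 *	}
 */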
struct spdk_memory_domain *
spdk_rdma_utils_get_memory_domain(struct ibv_pd *pd)
{
	struct rdma_utils_memory_domain *domain = NULL;
	struct spdk_memory_domain_ctx ctx;
	int rc;

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->pd == pd) {
			domain->ref++;
			pthread_mutex_unlock(&g_memory_domains_lock);
			return domain->domain;
		}
	}

	domain = calloc(1, sizeof(*domain));
	if (!domain) {
		SPDK_ERRLOG("Memory allocation failed\n");
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->rdma_ctx.size = sizeof(domain->rdma_ctx);
	domain->rdma_ctx.ibv_pd = pd;
	ctx.size = sizeof(ctx);
	ctx.user_ctx = &domain->rdma_ctx;

	rc = spdk_memory_domain_create(&domain->domain, SPDK_DMA_DEVICE_TYPE_RDMA, &ctx,
				       SPDK_RDMA_DMA_DEVICE);
	if (rc) {
		SPDK_ERRLOG("Failed to create memory domain\n");
		free(domain);
		pthread_mutex_unlock(&g_memory_domains_lock);
		return NULL;
	}

	domain->pd = pd;
	domain->ref = 1;
	TAILQ_INSERT_TAIL(&g_memory_domains, domain, link);

	pthread_mutex_unlock(&g_memory_domains_lock);

	return domain->domain;
}

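/*
 * Drop a reference taken by spdk_rdma_utils_get_memory_domain(); the domain is
 * destroyed when the last reference is released. Returns -ENODEV if the domain
 * is not found in the global list.
 */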
int
spdk_rdma_utils_put_memory_domain(struct spdk_memory_domain *_domain)
{
	struct rdma_utils_memory_domain *domain = NULL;

	if (!_domain) {
		return 0;
	}

	pthread_mutex_lock(&g_memory_domains_lock);

	TAILQ_FOREACH(domain, &g_memory_domains, link) {
		if (domain->domain == _domain) {
			break;
		}
	}

	if (!domain) {
		pthread_mutex_unlock(&g_memory_domains_lock);
		return -ENODEV;
	}
	assert(domain->ref > 0);

	domain->ref--;

	if (domain->ref == 0) {
		spdk_memory_domain_destroy(domain->domain);
		TAILQ_REMOVE(&g_memory_domains, domain, link);
		free(domain);
	}

	pthread_mutex_unlock(&g_memory_domains_lock);

	return 0;
}
522