/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

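/* Helpers to convert between 2 MB and 4 KB page-frame numbers by shifting
 * across the 9-bit difference between the two page shifts (SHIFT_2MB - SHIFT_4KB).
 */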
#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry contains the address translation or error for entries that haven't
 * been retrieved yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};
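
/*
 * Illustrative decomposition (a sketch; assumes SHIFT_2MB == 21 and
 * SHIFT_1GB == 30, consistent with the bit ranges noted above). For a
 * virtual address vaddr:
 *
 *	vfn_2mb   = vaddr >> SHIFT_2MB;        2 MB virtual frame number
 *	idx_256tb = MAP_256TB_IDX(vfn_2mb);    bits [30..47] of vaddr
 *	idx_1gb   = MAP_1GB_IDX(vfn_2mb);      bits [21..29] of vaddr
 *
 * All 512 2 MB pages inside one naturally aligned 1 GB region therefore share
 * a single top-level entry and occupy idx_1gb slots 0..511 of its map_1gb.
 */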

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	spdk_mem_map_notify_cb notify_cb;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

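/*
 * Global state: g_mem_reg_map tracks every registered 2 MB page; its
 * per-page "translation" value is reused as a reference count (see
 * spdk_mem_register()). g_spdk_mem_maps holds each map created with a
 * notify callback, and g_spdk_mem_map_mutex serializes registration and
 * unregistration against that list.
 */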
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static void
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t contig_start = 0;
	uint64_t contig_end = 0;

#define END_RANGE()										\
	do {											\
		if (contig_start != 0) {							\
			/* End of a virtually contiguous range */				\
			map->notify_cb(map->cb_ctx, map, action,				\
				       (void *)contig_start,					\
				       contig_end - contig_start + 2 * 1024 * 1024);		\
		}										\
		contig_start = 0;								\
	} while (0)
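
	/* Note: contig_start == 0 doubles as the "no range in progress" sentinel,
	 * so a registration that began at virtual address 0 would not be reported;
	 * that address is assumed never to be registered.
	 */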

	if (!g_mem_reg_map) {
		return;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		const struct map_1gb *map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
		uint64_t idx_1gb;

		if (!map_1gb) {
			END_RANGE();
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if (map_1gb->map[idx_1gb].translation_2mb != 0) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == 0) {
					contig_start = vaddr;
				}
				contig_end = vaddr;
			} else {
				END_RANGE();
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, spdk_mem_map_notify_cb notify_cb, void *cb_ctx)
{
	struct spdk_mem_map *map;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->notify_cb = notify_cb;
	map->cb_ctx = cb_ctx;

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	if (notify_cb) {
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);

	return map;
}
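
/*
 * Note: a map allocated without a notify callback (such as g_mem_reg_map
 * itself) is never linked into g_spdk_mem_maps, so it only stores
 * translations and is not notified about later register/unregister calls.
 */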

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);
	spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
	TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
	pthread_mutex_unlock(&g_spdk_mem_map_mutex);

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		uint64_t ref_count;

		/* In g_mem_reg_map, the "translation" is the reference count */
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count + 1);

		if (ref_count > 0) {
			if (seg_len > 0) {
				TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
					rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
					if (rc != 0) {
						pthread_mutex_unlock(&g_spdk_mem_map_mutex);
						return rc;
					}
				}
			}

			seg_vaddr = vaddr + VALUE_2MB;
			seg_len = 0;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t ref_count;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr);
		if (ref_count == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		/* In g_mem_reg_map, the "translation" is the reference count */
		ref_count = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, ref_count - 1);

		if (ref_count > 1) {
			if (seg_len > 0) {
				TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
					rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
					if (rc != 0) {
						pthread_mutex_unlock(&g_spdk_mem_map_mutex);
						return rc;
					}
				}
			}

			seg_vaddr = vaddr + VALUE_2MB;
			seg_len = 0;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
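
/*
 * Usage sketch (illustrative only, not taken from this file): registration
 * operates on 2 MB-aligned virtual ranges, so any suitably aligned buffer can
 * be passed, although in SPDK the registered memory normally comes from the
 * DPDK hugepage segments walked in spdk_mem_map_init() below.
 *
 *	void *buf = NULL;
 *
 *	if (posix_memalign(&buf, VALUE_2MB, 4 * VALUE_2MB) == 0) {
 *		spdk_mem_register(buf, 4 * VALUE_2MB);
 *		... use the buffer with consumers of the memory maps ...
 *		spdk_mem_unregister(buf, 4 * VALUE_2MB);
 *		free(buf);
 *	}
 */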

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	/* For now, only 2 MB-aligned registrations are supported */
	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	/* For now, only 2 MB-aligned registrations are supported */
	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	map_2mb = &map_1gb->map[idx_1gb];

	return map_2mb->translation_2mb;
}
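
/*
 * Note: translation takes no lock; addresses with no second-level table simply
 * return the map's default_translation. For g_mem_reg_map the value returned
 * here is the current reference count rather than an address translation.
 */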

int
spdk_mem_map_init(void)
{
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
	mcfg = rte_eal_get_configuration()->mem_config;

	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
	return 0;
}