xref: /spdk/lib/env_dpdk/memory.c (revision 552e21cce6cccbf833ed9109827e08337377d7ce)
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "env_internal.h"

#include <rte_config.h>
#include <rte_eal_memconfig.h>

#include "spdk_internal/assert.h"
#include "spdk_internal/memory.h"

#include "spdk/assert.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/util.h"

#if DEBUG
#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG_PRINT(...)
#endif

#define FN_2MB_TO_4KB(fn)	(fn << (SHIFT_2MB - SHIFT_4KB))
#define FN_4KB_TO_2MB(fn)	(fn >> (SHIFT_2MB - SHIFT_4KB))

#define MAP_256TB_IDX(vfn_2mb)	((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
#define MAP_1GB_IDX(vfn_2mb)	((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
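
/*
 * Worked example of the index math above (assuming SHIFT_2MB = 21 and
 * SHIFT_1GB = 30 from spdk_internal/memory.h): for vaddr = 0xC0400000
 * (3 GB + 4 MB), vfn_2mb = vaddr >> SHIFT_2MB = 1538. Then
 * MAP_256TB_IDX(1538) = 1538 >> 9 = 3 selects the fourth 1 GB map, and
 * MAP_1GB_IDX(1538) = 1538 & 511 = 2 selects the third 2 MB entry within
 * it, i.e. bits [30..47] and [21..29] of the virtual address respectively.
 */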

/* Page is registered */
#define REG_MAP_REGISTERED	(1ULL << 62)

/* A notification region barrier. The 2MB translation entry that's marked
 * with this flag must be unregistered separately. This allows contiguous
 * regions to be unregistered in the same chunks in which they were registered.
 */
#define REG_MAP_NOTIFY_START	(1ULL << 63)
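
/*
 * For example, registering a 6 MB region at vaddr marks its three 2 MB pages
 * in g_mem_reg_map as follows (see spdk_mem_register()):
 *
 *   vaddr + 0 MB: REG_MAP_REGISTERED | REG_MAP_NOTIFY_START
 *   vaddr + 2 MB: REG_MAP_REGISTERED
 *   vaddr + 4 MB: REG_MAP_REGISTERED
 *
 * A later spdk_mem_unregister() call must therefore start at vaddr (or at the
 * start of another registered region); anything else returns -ERANGE.
 */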

/* Translation of a single 2MB page. */
struct map_2mb {
	uint64_t translation_2mb;
};

/* Second-level map table indexed by bits [21..29] of the virtual address.
 * Each entry holds the address translation, or the map's default translation
 * for entries that haven't been set yet.
 */
struct map_1gb {
	struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
};

/* Top-level map table indexed by bits [30..47] of the virtual address.
 * Each entry points to a second-level map table or NULL.
 */
struct map_256tb {
	struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
};

/* Page-granularity memory address translation */
struct spdk_mem_map {
	struct map_256tb map_256tb;
	pthread_mutex_t mutex;
	uint64_t default_translation;
	struct spdk_mem_map_ops ops;
	void *cb_ctx;
	TAILQ_ENTRY(spdk_mem_map) tailq;
};

/* Registrations map. The 64 bit translations are bit fields with the
 * following layout (starting with the low bits):
 *    0 - 61 : reserved
 *   62 - 63 : flags
 */
static struct spdk_mem_map *g_mem_reg_map;
static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Walk the currently registered memory via the main memory registration map
 * and call the new map's notify callback for each virtually contiguous region.
 */
static int
spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
{
	size_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t contig_start = UINT64_MAX;
	uint64_t contig_end = UINT64_MAX;
	struct map_1gb *map_1gb;
	int rc;

	if (!g_mem_reg_map) {
		return -EINVAL;
	}

	/* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
	pthread_mutex_lock(&g_mem_reg_map->mutex);

	for (idx_256tb = 0;
	     idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
	     idx_256tb++) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_start != UINT64_MAX) {
				/* End of a virtually contiguous range */
				rc = map->ops.notify_cb(map->cb_ctx, map, action,
							(void *)contig_start,
							contig_end - contig_start + VALUE_2MB);
				/* Don't bother handling unregister failures. It can't be any worse */
				if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
					goto err_unregister;
				}
			}
			contig_start = UINT64_MAX;
			continue;
		}

		for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_start == UINT64_MAX ||
			     (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_start == UINT64_MAX) {
					contig_start = vaddr;
				}

				contig_end = vaddr;
			} else {
				if (contig_start != UINT64_MAX) {
					/* End of a virtually contiguous range */
					rc = map->ops.notify_cb(map->cb_ctx, map, action,
								(void *)contig_start,
								contig_end - contig_start + VALUE_2MB);
					/* Don't bother handling unregister failures. It can't be any worse */
					if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
						goto err_unregister;
					}

					/* This page might be a part of a neighbour region, so process
					 * it again. The idx_1gb will be incremented immediately.
					 */
					idx_1gb--;
				}
				contig_start = UINT64_MAX;
			}
		}
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return 0;

err_unregister:
	/* Unwind to the first empty translation so we don't unregister
	 * a region that just failed to register.
	 */
	idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
	idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
	contig_start = UINT64_MAX;
	contig_end = UINT64_MAX;

	/* Unregister any memory we managed to register before the failure */
	for (; idx_256tb < SIZE_MAX; idx_256tb--) {
		map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];

		if (!map_1gb) {
			if (contig_end != UINT64_MAX) {
				/* End of a virtually contiguous range */
				map->ops.notify_cb(map->cb_ctx, map,
						   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
						   (void *)contig_start,
						   contig_end - contig_start + VALUE_2MB);
			}
			contig_end = UINT64_MAX;
			continue;
		}

		for (; idx_1gb < UINT64_MAX; idx_1gb--) {
			if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
			    (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
				/* Rebuild the virtual address from the indexes */
				uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);

				if (contig_end == UINT64_MAX) {
					contig_end = vaddr;
				}
				contig_start = vaddr;
			} else {
				if (contig_end != UINT64_MAX) {
					/* End of a virtually contiguous range */
					map->ops.notify_cb(map->cb_ctx, map,
							   SPDK_MEM_MAP_NOTIFY_UNREGISTER,
							   (void *)contig_start,
							   contig_end - contig_start + VALUE_2MB);
					idx_1gb++;
				}
				contig_end = UINT64_MAX;
			}
		}
		idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
	}

	pthread_mutex_unlock(&g_mem_reg_map->mutex);
	return rc;
}

struct spdk_mem_map *
spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
{
	struct spdk_mem_map *map;
	int rc;

	map = calloc(1, sizeof(*map));
	if (map == NULL) {
		return NULL;
	}

	if (pthread_mutex_init(&map->mutex, NULL)) {
		free(map);
		return NULL;
	}

	map->default_translation = default_translation;
	map->cb_ctx = cb_ctx;
	if (ops) {
		map->ops = *ops;
	}

	if (ops && ops->notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			DEBUG_PRINT("Initial mem_map notify failed\n");
			pthread_mutex_destroy(&map->mutex);
			free(map);
			return NULL;
		}
		TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	return map;
}
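
/*
 * Usage sketch (illustrative only; my_notify, my_ops and my_ctx are
 * hypothetical names): a consumer supplies a notify callback that is invoked
 * once per virtually contiguous registered region, both for memory that is
 * already registered (via the initial notify walk above) and for future
 * (un)registrations, and typically stores its own translations in the map:
 *
 *	static int
 *	my_notify(void *cb_ctx, struct spdk_mem_map *map,
 *		  enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
 *	{
 *		uint64_t translation = (uint64_t)vaddr;	// e.g. an identity mapping
 *
 *		if (action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
 *			return spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, translation);
 *		}
 *		return spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
 *	}
 *
 *	const struct spdk_mem_map_ops my_ops = { .notify_cb = my_notify, .are_contiguous = NULL };
 *	struct spdk_mem_map *my_map = spdk_mem_map_alloc(0, &my_ops, my_ctx);
 */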

void
spdk_mem_map_free(struct spdk_mem_map **pmap)
{
	struct spdk_mem_map *map;
	size_t i;

	if (!pmap) {
		return;
	}

	map = *pmap;

	if (!map) {
		return;
	}

	if (map->ops.notify_cb) {
		pthread_mutex_lock(&g_spdk_mem_map_mutex);
		spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
		TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	}

	for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
		free(map->map_256tb.map[i]);
	}

	pthread_mutex_destroy(&map->mutex);

	free(map);
	*pmap = NULL;
}

int
spdk_mem_register(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	if (len == 0) {
		return 0;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if (reg & REG_MAP_REGISTERED) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EBUSY;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	seg_vaddr = vaddr;
	seg_len = 0;
	while (len > 0) {
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
					     seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
		seg_len += VALUE_2MB;
		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
		rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
		if (rc != 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return rc;
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
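
/*
 * Usage sketch (illustrative; region and region_len are hypothetical and must
 * both be 2 MB aligned): registering memory the env layer doesn't already
 * know about (for example an mmap()ed guest memory region) notifies every
 * allocated mem map, and the same range is later unregistered as a whole:
 *
 *	if (spdk_mem_register(region, region_len) == 0) {
 *		... translate and do I/O on addresses inside [region, region + region_len) ...
 *		spdk_mem_unregister(region, region_len);
 *	}
 */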

int
spdk_mem_unregister(void *vaddr, size_t len)
{
	struct spdk_mem_map *map;
	int rc;
	void *seg_vaddr;
	size_t seg_len;
	uint64_t reg, newreg;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
		return -EINVAL;
	}

	if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
			    __func__, vaddr, len);
		return -EINVAL;
	}

	pthread_mutex_lock(&g_spdk_mem_map_mutex);

	/* The first page must be a start of a region. Also check if it's
	 * registered to make sure we don't return -ERANGE for non-registered
	 * regions.
	 */
	reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
	if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}

	seg_vaddr = vaddr;
	seg_len = len;
	while (seg_len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
		if ((reg & REG_MAP_REGISTERED) == 0) {
			pthread_mutex_unlock(&g_spdk_mem_map_mutex);
			return -EINVAL;
		}
		seg_vaddr += VALUE_2MB;
		seg_len -= VALUE_2MB;
	}

	newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
	/* If the next page is registered, it must be a start of a region as well;
	 * otherwise we'd be unregistering only a part of a region.
	 */
	if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
		pthread_mutex_unlock(&g_spdk_mem_map_mutex);
		return -ERANGE;
	}
	seg_vaddr = vaddr;
	seg_len = 0;

	while (len > 0) {
		reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
		spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);

		if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
			TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
				rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
				if (rc != 0) {
					pthread_mutex_unlock(&g_spdk_mem_map_mutex);
					return rc;
				}
			}

			seg_vaddr = vaddr;
			seg_len = VALUE_2MB;
		} else {
			seg_len += VALUE_2MB;
		}

		vaddr += VALUE_2MB;
		len -= VALUE_2MB;
	}

	if (seg_len > 0) {
		TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
			rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
			if (rc != 0) {
				pthread_mutex_unlock(&g_spdk_mem_map_mutex);
				return rc;
			}
		}
	}

	pthread_mutex_unlock(&g_spdk_mem_map_mutex);
	return 0;
}
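
/*
 * Example of the notification granularity implemented above (addresses are
 * illustrative): if [A, A + 2 MB) and [A + 2 MB, A + 6 MB) were registered by
 * two separate spdk_mem_register() calls, then spdk_mem_unregister(A, 6 MB)
 * succeeds but issues two UNREGISTER notifications, one per originally
 * registered chunk, because the REG_MAP_NOTIFY_START marker on the second
 * region splits the walk.
 */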

static struct map_1gb *
spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
{
	struct map_1gb *map_1gb;
	uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
	size_t i;

	if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
		return NULL;
	}

	map_1gb = map->map_256tb.map[idx_256tb];

	if (!map_1gb) {
		pthread_mutex_lock(&map->mutex);

		/* Recheck to make sure nobody else got the mutex first. */
		map_1gb = map->map_256tb.map[idx_256tb];
		if (!map_1gb) {
			map_1gb = malloc(sizeof(struct map_1gb));
			if (map_1gb) {
				/* initialize all entries to default translation */
				for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
					map_1gb->map[i].translation_2mb = map->default_translation;
				}
				map->map_256tb.map[idx_256tb] = map_1gb;
			}
		}

		pthread_mutex_unlock(&map->mutex);

		if (!map_1gb) {
			DEBUG_PRINT("allocation failed\n");
			return NULL;
		}
	}

	return map_1gb;
}

int
spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
			     uint64_t translation)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}
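
/*
 * Note that every 2 MB page in [vaddr, vaddr + size) receives the same
 * translation value. A consumer that needs a distinct value per page (for
 * example a physical-address map) therefore calls this once per VALUE_2MB
 * chunk, e.g. (sketch; paddr_of() is a hypothetical helper):
 *
 *	for (uint64_t off = 0; off < size; off += VALUE_2MB) {
 *		spdk_mem_map_set_translation(map, vaddr + off, VALUE_2MB,
 *					     paddr_of(vaddr + off));
 *	}
 */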

int
spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
{
	uint64_t vfn_2mb;
	struct map_1gb *map_1gb;
	uint64_t idx_1gb;
	struct map_2mb *map_2mb;

	if ((uintptr_t)vaddr & ~MASK_256TB) {
		DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
		return -EINVAL;
	}

	/* For now, only 2 MB-aligned registrations are supported */
	if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
		DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
			    __func__, vaddr, size);
		return -EINVAL;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;

	while (size) {
		map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb);
		if (!map_1gb) {
			DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
			return -ENOMEM;
		}

		idx_1gb = MAP_1GB_IDX(vfn_2mb);
		map_2mb = &map_1gb->map[idx_1gb];
		map_2mb->translation_2mb = map->default_translation;

		size -= VALUE_2MB;
		vfn_2mb++;
	}

	return 0;
}

uint64_t
spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
{
	const struct map_1gb *map_1gb;
	const struct map_2mb *map_2mb;
	uint64_t idx_256tb;
	uint64_t idx_1gb;
	uint64_t vfn_2mb;
	uint64_t cur_size;
	uint64_t prev_translation;
	uint64_t orig_translation;

	if (spdk_unlikely(vaddr & ~MASK_256TB)) {
		DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
		return map->default_translation;
	}

	vfn_2mb = vaddr >> SHIFT_2MB;
	idx_256tb = MAP_256TB_IDX(vfn_2mb);
	idx_1gb = MAP_1GB_IDX(vfn_2mb);

	map_1gb = map->map_256tb.map[idx_256tb];
	if (spdk_unlikely(!map_1gb)) {
		return map->default_translation;
	}

	cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
	map_2mb = &map_1gb->map[idx_1gb];
	if (size == NULL || map->ops.are_contiguous == NULL ||
	    map_2mb->translation_2mb == map->default_translation) {
		if (size != NULL) {
			*size = spdk_min(*size, cur_size);
		}
		return map_2mb->translation_2mb;
	}

	orig_translation = map_2mb->translation_2mb;
	prev_translation = orig_translation;
	while (cur_size < *size) {
		vfn_2mb++;
		idx_256tb = MAP_256TB_IDX(vfn_2mb);
		idx_1gb = MAP_1GB_IDX(vfn_2mb);

		map_1gb = map->map_256tb.map[idx_256tb];
		if (spdk_unlikely(!map_1gb)) {
			break;
		}

		map_2mb = &map_1gb->map[idx_1gb];
		if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
			break;
		}

		cur_size += VALUE_2MB;
		prev_translation = map_2mb->translation_2mb;
	}

	*size = spdk_min(*size, cur_size);
	return orig_translation;
}
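
/*
 * Usage sketch (illustrative; buf, io_size, DEFAULT_TRANSLATION and start_io()
 * are hypothetical): *size is an in/out parameter. On input it holds the
 * number of bytes the caller wants to access starting at vaddr; on output it
 * is clamped to how far the translation stays contiguous (per the map's
 * are_contiguous callback, or to the end of the current 2 MB page). The
 * return value is the translation stored for the 2 MB page containing vaddr,
 * so a vtophys-style caller adds the offset within that page itself:
 *
 *	uint64_t len = io_size;
 *	uint64_t t = spdk_mem_map_translate(map, (uint64_t)buf, &len);
 *	if (t != DEFAULT_TRANSLATION) {
 *		start_io(t + ((uint64_t)buf & MASK_2MB), len);
 *	}
 */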

#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
static void
memory_hotplug_cb(enum rte_mem_event event_type,
		  const void *addr, size_t len, void *arg)
{
	if (event_type == RTE_MEM_EVENT_ALLOC) {
		spdk_mem_register((void *)addr, len);

		/* Now mark each segment so that DPDK won't later free it.
		 * This ensures we don't have to deal with the memory
		 * getting freed in different units than it was allocated.
		 */
		while (len > 0) {
			struct rte_memseg *seg;

			seg = rte_mem_virt2memseg(addr, NULL);
			assert(seg != NULL);
			seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
			addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
			len -= seg->hugepage_sz;
		}
	} else if (event_type == RTE_MEM_EVENT_FREE) {
		spdk_mem_unregister((void *)addr, len);
	}
}

static int
memory_iter_cb(const struct rte_memseg_list *msl,
	       const struct rte_memseg *ms, size_t len, void *arg)
{
	return spdk_mem_register(ms->addr, len);
}
#endif

int
spdk_mem_map_init(void)
{
	g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
	if (g_mem_reg_map == NULL) {
		DEBUG_PRINT("memory registration map allocation failed\n");
		return -1;
	}

	/*
	 * Walk all DPDK memory segments and register them
	 * with the master memory map
	 */
#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0)
	rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
	rte_memseg_contig_walk(memory_iter_cb, NULL);
#else
	struct rte_mem_config *mcfg;
	size_t seg_idx;

	mcfg = rte_eal_get_configuration()->mem_config;
	for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) {
		struct rte_memseg *seg = &mcfg->memseg[seg_idx];

		if (seg->addr == NULL) {
			break;
		}

		spdk_mem_register(seg->addr, seg->len);
	}
#endif
	return 0;
}