1 /*	$OpenBSD: uvm_addr.c,v 1.16 2016/06/02 18:48:01 tedu Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 /* #define DEBUG */
20 
21 #include <sys/param.h>
22 #include <uvm/uvm.h>
23 #include <uvm/uvm_addr.h>
24 #include <sys/pool.h>
25 
26 /* Max gap between hint allocations. */
27 #define UADDR_HINT_MAXGAP	(4 * PAGE_SIZE)
28 /* Number of pivots in pivot allocator. */
29 #define NUM_PIVOTS		16
30 /*
31  * Max number (inclusive) of pages the pivot allocator
32  * will place between allocations.
33  *
34  * The uaddr_pivot_random() function attempts to bias towards
35  * small space between allocations, so putting a large number here is fine.
36  */
37 #define PIVOT_RND		8
38 /*
39  * Number of allocations that a pivot can supply before expiring.
40  * When a pivot expires, a new pivot has to be found.
41  *
42  * Must be at least 1.
43  */
44 #define PIVOT_EXPIRE		1024
45 
46 
47 /* Pool with uvm_addr_state structures. */
48 struct pool uaddr_pool;
49 struct pool uaddr_hint_pool;
50 struct pool uaddr_bestfit_pool;
51 struct pool uaddr_pivot_pool;
52 struct pool uaddr_rnd_pool;
53 
54 /* uvm_addr state for hint based selector. */
55 struct uaddr_hint_state {
56 	struct uvm_addr_state		 uaddr;
57 	vsize_t				 max_dist;
58 };
59 
60 /* uvm_addr state for bestfit selector. */
61 struct uaddr_bestfit_state {
62 	struct uvm_addr_state		 ubf_uaddr;
63 	struct uaddr_free_rbtree	 ubf_free;
64 };
65 
66 /* uvm_addr state for rnd selector. */
67 struct uaddr_rnd_state {
68 	struct uvm_addr_state		 ur_uaddr;
69 #if 0
70 	TAILQ_HEAD(, vm_map_entry)	 ur_free;
71 #endif
72 };
73 
74 /* Definition of a pivot in pivot selector. */
75 struct uaddr_pivot {
76 	vaddr_t				 addr;	/* End of prev. allocation. */
77 	int				 expire;/* Best before date. */
78 	int				 dir;	/* Direction. */
79 	struct vm_map_entry		*entry; /* Will contain next alloc. */
80 };
81 /* uvm_addr state for pivot selector. */
82 struct uaddr_pivot_state {
83 	struct uvm_addr_state		 up_uaddr;
84 
85 	/* Free space tree, for fast pivot selection. */
86 	struct uaddr_free_rbtree	 up_free;
87 
88 	/* List of pivots. The pointers point to after the last allocation. */
89 	struct uaddr_pivot		 up_pivots[NUM_PIVOTS];
90 };
91 
92 /*
93  * Free space comparison.
94  * Compares smaller free-space before larger free-space.
95  */
96 static __inline int
97 uvm_mapent_fspace_cmp(struct vm_map_entry *e1, struct vm_map_entry *e2)
98 {
99 	if (e1->fspace != e2->fspace)
100 		return (e1->fspace < e2->fspace ? -1 : 1);
101 	return (e1->start < e2->start ? -1 : e1->start > e2->start);
102 }
103 
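/*
 * Illustrative sketch (not compiled): the ordering uvm_mapent_fspace_cmp()
 * imposes on a free-space tree.  The entries and the example function are
 * hypothetical; only the fields the comparator reads are filled in.
 */
#if 0
static void
uvm_mapent_fspace_cmp_example(void)
{
	struct vm_map_entry a, b;

	/* Equal free space: the entry with the lower start sorts first. */
	a.fspace = b.fspace = 2 * PAGE_SIZE;
	a.start = 0x4000;
	b.start = 0x9000;
	KASSERT(uvm_mapent_fspace_cmp(&a, &b) == -1);

	/* Different free space: the smaller free space sorts first. */
	a.fspace = PAGE_SIZE;
	KASSERT(uvm_mapent_fspace_cmp(&a, &b) == -1);
	KASSERT(uvm_mapent_fspace_cmp(&b, &a) == 1);
}
#endif
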
104 /* Forward declaration (see below). */
105 extern const struct uvm_addr_functions uaddr_kernel_functions;
106 struct uvm_addr_state uaddr_kbootstrap;
107 
108 /* Support functions. */
109 #ifndef SMALL_KERNEL
110 struct vm_map_entry	*uvm_addr_entrybyspace(struct uaddr_free_rbtree*,
111 			    vsize_t);
112 #endif /* !SMALL_KERNEL */
113 void			 uaddr_kinsert(struct vm_map *,
114 			    struct uvm_addr_state *, struct vm_map_entry *);
115 void			 uaddr_kremove(struct vm_map *,
116 			    struct uvm_addr_state *, struct vm_map_entry *);
117 void			 uaddr_kbootstrapdestroy(struct uvm_addr_state *);
118 
119 void			 uaddr_destroy(struct uvm_addr_state *);
120 void			 uaddr_hint_destroy(struct uvm_addr_state *);
121 void			 uaddr_kbootstrap_destroy(struct uvm_addr_state *);
122 void			 uaddr_rnd_destroy(struct uvm_addr_state *);
123 void			 uaddr_bestfit_destroy(struct uvm_addr_state *);
124 void			 uaddr_pivot_destroy(struct uvm_addr_state *);
125 
126 #if 0
127 int			 uaddr_lin_select(struct vm_map *,
128 			    struct uvm_addr_state *, struct vm_map_entry **,
129 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
130 			    vaddr_t);
131 #endif
132 int			 uaddr_kbootstrap_select(struct vm_map *,
133 			    struct uvm_addr_state *, struct vm_map_entry **,
134 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
135 			    vaddr_t);
136 int			 uaddr_rnd_select(struct vm_map *,
137 			    struct uvm_addr_state *, struct vm_map_entry **,
138 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
139 			    vaddr_t);
140 int			 uaddr_hint_select(struct vm_map *,
141 			    struct uvm_addr_state*, struct vm_map_entry **,
142 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
143 			    vaddr_t);
144 int			 uaddr_bestfit_select(struct vm_map *,
145 			    struct uvm_addr_state*, struct vm_map_entry **,
146 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
147 			    vaddr_t);
148 #ifndef SMALL_KERNEL
149 int			 uaddr_pivot_select(struct vm_map *,
150 			    struct uvm_addr_state *, struct vm_map_entry **,
151 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
152 			    vaddr_t);
153 int			 uaddr_stack_brk_select(struct vm_map *,
154 			    struct uvm_addr_state *, struct vm_map_entry **,
155 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
156 			    vaddr_t);
157 #endif /* !SMALL_KERNEL */
158 
159 void			 uaddr_rnd_insert(struct vm_map *,
160 			    struct uvm_addr_state *, struct vm_map_entry *);
161 void			 uaddr_rnd_remove(struct vm_map *,
162 			    struct uvm_addr_state *, struct vm_map_entry *);
163 void			 uaddr_bestfit_insert(struct vm_map *,
164 			    struct uvm_addr_state *, struct vm_map_entry *);
165 void			 uaddr_bestfit_remove(struct vm_map *,
166 			    struct uvm_addr_state *, struct vm_map_entry *);
167 void			 uaddr_pivot_insert(struct vm_map *,
168 			    struct uvm_addr_state *, struct vm_map_entry *);
169 void			 uaddr_pivot_remove(struct vm_map *,
170 			    struct uvm_addr_state *, struct vm_map_entry *);
171 
172 #ifndef SMALL_KERNEL
173 vsize_t			 uaddr_pivot_random(void);
174 int			 uaddr_pivot_newpivot(struct vm_map *,
175 			    struct uaddr_pivot_state *, struct uaddr_pivot *,
176 			    struct vm_map_entry **, vaddr_t *,
177 			    vsize_t, vaddr_t, vaddr_t, vsize_t, vsize_t);
178 #endif /* !SMALL_KERNEL */
179 
180 #if defined(DEBUG) || defined(DDB)
181 void			 uaddr_pivot_print(struct uvm_addr_state *, boolean_t,
182 			    int (*)(const char *, ...));
183 #if 0
184 void			 uaddr_rnd_print(struct uvm_addr_state *, boolean_t,
185 			    int (*)(const char *, ...));
186 #endif
187 #endif /* DEBUG || DDB */
188 
189 
190 #ifndef SMALL_KERNEL
191 /*
192  * Find smallest entry in tree that will fit sz bytes.
193  */
194 struct vm_map_entry *
195 uvm_addr_entrybyspace(struct uaddr_free_rbtree *free, vsize_t sz)
196 {
197 	struct vm_map_entry	*tmp, *res;
198 
199 	tmp = RB_ROOT(free);
200 	res = NULL;
201 	while (tmp) {
202 		if (tmp->fspace >= sz) {
203 			res = tmp;
204 			tmp = RB_LEFT(tmp, dfree.rbtree);
205 		} else if (tmp->fspace < sz)
206 			tmp = RB_RIGHT(tmp, dfree.rbtree);
207 	}
208 	return res;
209 }
210 #endif /* !SMALL_KERNEL */
211 
212 static __inline vaddr_t
213 uvm_addr_align_forward(vaddr_t addr, vaddr_t align, vaddr_t offset)
214 {
215 	vaddr_t adjusted;
216 
217 	KASSERT(offset < align || (align == 0 && offset == 0));
218 	KASSERT((align & (align - 1)) == 0);
219 	KASSERT((offset & PAGE_MASK) == 0);
220 
221 	align = MAX(align, PAGE_SIZE);
222 	adjusted = addr & ~(align - 1);
223 	adjusted += offset;
224 	return (adjusted < addr ? adjusted + align : adjusted);
225 }
226 
227 static __inline vaddr_t
228 uvm_addr_align_backward(vaddr_t addr, vaddr_t align, vaddr_t offset)
229 {
230 	vaddr_t adjusted;
231 
232 	KASSERT(offset < align || (align == 0 && offset == 0));
233 	KASSERT((align & (align - 1)) == 0);
234 	KASSERT((offset & PAGE_MASK) == 0);
235 
236 	align = MAX(align, PAGE_SIZE);
237 	adjusted = addr & ~(align - 1);
238 	adjusted += offset;
239 	return (adjusted > addr ? adjusted - align : adjusted);
240 }
241 
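/*
 * Illustrative sketch (not compiled), assuming PAGE_SIZE is 0x1000:
 * uvm_addr_align_forward() returns the lowest address >= addr whose
 * position within an align-sized block equals the requested offset;
 * uvm_addr_align_backward() returns the highest such address <= addr.
 * The function and the numbers below are hypothetical examples.
 */
#if 0
static void
uvm_addr_align_example(void)
{
	/* 0x13000 rounded up to the next 64KB boundary plus 0x2000. */
	KASSERT(uvm_addr_align_forward(0x13000, 0x10000, 0x2000) == 0x22000);
	/* ...and rounded down to the previous such address. */
	KASSERT(uvm_addr_align_backward(0x13000, 0x10000, 0x2000) == 0x12000);
	/* An address that already satisfies align/offset is unchanged. */
	KASSERT(uvm_addr_align_forward(0x12000, 0x10000, 0x2000) == 0x12000);
}
#endif
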
242 /*
243  * Try to fit the requested space into the entry.
244  */
245 int
246 uvm_addr_fitspace(vaddr_t *min_result, vaddr_t *max_result,
247     vaddr_t low_addr, vaddr_t high_addr, vsize_t sz,
248     vaddr_t align, vaddr_t offset,
249     vsize_t before_gap, vsize_t after_gap)
250 {
251 	vaddr_t	tmp;
252 	vsize_t	fspace;
253 
254 	if (low_addr > high_addr)
255 		return ENOMEM;
256 	fspace = high_addr - low_addr;
257 	if (fspace < sz + before_gap + after_gap)
258 		return ENOMEM;
259 
260 	/* Calculate lowest address. */
261 	low_addr += before_gap;
262 	low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset);
263 	if (low_addr < tmp)	/* Overflow during alignment. */
264 		return ENOMEM;
265 	if (high_addr - after_gap - sz < low_addr)
266 		return ENOMEM;
267 
268 	/* Calculate highest address. */
269 	high_addr -= after_gap + sz;
270 	high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset);
271 	if (high_addr > tmp)	/* Overflow during alignment. */
272 		return ENOMEM;
273 	if (low_addr > high_addr)
274 		return ENOMEM;
275 
276 	*min_result = low_addr;
277 	*max_result = high_addr;
278 	return 0;
279 }
280 
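/*
 * Illustrative sketch (not compiled), assuming PAGE_SIZE is 0x1000:
 * fit a 3-page allocation, aligned to 0x4000, into the hypothetical free
 * range [0x1000, 0x20000) while keeping one page free on either side.
 * min and max receive the lowest and highest usable start addresses.
 */
#if 0
static void
uvm_addr_fitspace_example(void)
{
	vaddr_t min, max;

	if (uvm_addr_fitspace(&min, &max, 0x1000, 0x20000,
	    3 * PAGE_SIZE, 0x4000, 0, PAGE_SIZE, PAGE_SIZE) == 0) {
		KASSERT(min == 0x4000);		/* 0x2000 aligned up. */
		KASSERT(max == 0x1c000);	/* 0x20000 - gap - size. */
	}
}
#endif
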
281 
282 /*
283  * Initialize uvm_addr.
284  */
285 void
286 uvm_addr_init(void)
287 {
288 	pool_init(&uaddr_pool, sizeof(struct uvm_addr_state),
289 	    0, 0, PR_WAITOK, "uaddr", NULL);
290 	pool_setipl(&uaddr_pool, IPL_VM);
291 	pool_init(&uaddr_hint_pool, sizeof(struct uaddr_hint_state),
292 	    0, 0, PR_WAITOK, "uaddrhint", NULL);
293 	pool_setipl(&uaddr_hint_pool, IPL_VM);
294 	pool_init(&uaddr_bestfit_pool, sizeof(struct uaddr_bestfit_state),
295 	    0, 0, PR_WAITOK, "uaddrbest", NULL);
296 	pool_setipl(&uaddr_bestfit_pool, IPL_VM);
297 	pool_init(&uaddr_pivot_pool, sizeof(struct uaddr_pivot_state),
298 	    0, 0, PR_WAITOK, "uaddrpivot", NULL);
299 	pool_setipl(&uaddr_pivot_pool, IPL_VM);
300 	pool_init(&uaddr_rnd_pool, sizeof(struct uaddr_rnd_state),
301 	    0, 0, PR_WAITOK, "uaddrrnd", NULL);
302 	pool_setipl(&uaddr_rnd_pool, IPL_VM);
303 
304 	uaddr_kbootstrap.uaddr_minaddr = PAGE_SIZE;
305 	uaddr_kbootstrap.uaddr_maxaddr = -(vaddr_t)PAGE_SIZE;
306 	uaddr_kbootstrap.uaddr_functions = &uaddr_kernel_functions;
307 }
308 
309 /*
310  * Invoke destructor function of uaddr.
311  */
312 void
313 uvm_addr_destroy(struct uvm_addr_state *uaddr)
314 {
315 	if (uaddr)
316 		(*uaddr->uaddr_functions->uaddr_destroy)(uaddr);
317 }
318 
319 /*
320  * Move address forward to satisfy align, offset.
321  */
322 vaddr_t
323 uvm_addr_align(vaddr_t addr, vaddr_t align, vaddr_t offset)
324 {
325 	vaddr_t result = (addr & ~(align - 1)) + offset;
326 	if (result < addr)
327 		result += align;
328 	return result;
329 }
330 
331 /*
332  * Move address backwards to satisfy align, offset.
333  */
334 vaddr_t
335 uvm_addr_align_back(vaddr_t addr, vaddr_t align, vaddr_t offset)
336 {
337 	vaddr_t result = (addr & ~(align - 1)) + offset;
338 	if (result > addr)
339 		result -= align;
340 	return result;
341 }
342 
343 /*
344  * Directional first fit.
345  *
346  * Do a linear search for free space, starting at the entry containing hint.
347  * direction ==  1: search forward
348  * direction == -1: search backward
349  *
350  * Output: low <= addr <= high and entry will contain addr.
351  * Returns 0 on success, ENOMEM if no space is available.
352  *
353  * before_gap and after_gap: space to keep free around the allocation.
354  */
355 int
356 uvm_addr_linsearch(struct vm_map *map, struct uvm_addr_state *uaddr,
357     struct vm_map_entry **entry_out, vaddr_t *addr_out,
358     vaddr_t hint, vsize_t sz, vaddr_t align, vaddr_t offset,
359     int direction, vaddr_t low, vaddr_t high,
360     vsize_t before_gap, vsize_t after_gap)
361 {
362 	struct vm_map_entry	*entry;
363 	vaddr_t			 low_addr, high_addr;
364 
365 	KASSERT(entry_out != NULL && addr_out != NULL);
366 	KASSERT(direction == -1 || direction == 1);
367 	KASSERT((hint & PAGE_MASK) == 0 && (high & PAGE_MASK) == 0 &&
368 	    (low & PAGE_MASK) == 0 &&
369 	    (before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0);
370 	KASSERT(high + sz > high); /* Check for overflow. */
371 
372 	/* Hint magic. */
373 	if (hint == 0)
374 		hint = (direction == 1 ? low : high);
375 	else if (hint > high) {
376 		if (direction != -1)
377 			return ENOMEM;
378 		hint = high;
379 	} else if (hint < low) {
380 		if (direction != 1)
381 			return ENOMEM;
382 		hint = low;
383 	}
384 
385 	for (entry = uvm_map_entrybyaddr(&map->addr,
386 	    hint - (direction == -1 ? 1 : 0)); entry != NULL;
387 	    entry = (direction == 1 ?
388 	    RB_NEXT(uvm_map_addr, &map->addr, entry) :
389 	    RB_PREV(uvm_map_addr, &map->addr, entry))) {
390 		if (VMMAP_FREE_START(entry) > high ||
391 		    VMMAP_FREE_END(entry) < low) {
392 			break;
393 		}
394 
395 		if (uvm_addr_fitspace(&low_addr, &high_addr,
396 		    MAX(low, VMMAP_FREE_START(entry)),
397 		    MIN(high, VMMAP_FREE_END(entry)),
398 		    sz, align, offset, before_gap, after_gap) == 0) {
399 			*entry_out = entry;
400 			if (hint >= low_addr && hint <= high_addr) {
401 				*addr_out = hint;
402 			} else {
403 				*addr_out = (direction == 1 ?
404 				    low_addr : high_addr);
405 			}
406 			return 0;
407 		}
408 	}
409 
410 	return ENOMEM;
411 }
412 
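/*
 * Illustrative sketch (not compiled): how a selector might call
 * uvm_addr_linsearch() to search forward from a page-aligned hint,
 * keeping one guard page before and after the allocation (compare
 * uaddr_hint_select() below, which searches in both directions).
 * The wrapper function is hypothetical.
 */
#if 0
static int
uvm_addr_linsearch_example(struct vm_map *map, struct uvm_addr_state *uaddr,
    struct vm_map_entry **entry_out, vaddr_t *addr_out, vaddr_t hint,
    vsize_t sz)
{
	return uvm_addr_linsearch(map, uaddr, entry_out, addr_out,
	    hint, sz, 0, 0, 1 /* forward */, uaddr->uaddr_minaddr,
	    uaddr->uaddr_maxaddr - sz, PAGE_SIZE, PAGE_SIZE);
}
#endif
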
413 /*
414  * Invoke address selector of uaddr.
415  * uaddr may be NULL, in which case the algorithm will fail with ENOMEM.
416  *
417  * Will invoke uvm_map_isavail to fill in last_out.
418  */
419 int
420 uvm_addr_invoke(struct vm_map *map, struct uvm_addr_state *uaddr,
421     struct vm_map_entry **entry_out, struct vm_map_entry **last_out,
422     vaddr_t *addr_out,
423     vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
424 {
425 	int error;
426 
427 	if (uaddr == NULL)
428 		return ENOMEM;
429 
430 	hint &= ~((vaddr_t)PAGE_MASK);
431 	if (hint != 0 &&
432 	    !(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
433 		return ENOMEM;
434 
435 	error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
436 	    entry_out, addr_out, sz, align, offset, prot, hint);
437 
438 	if (error == 0) {
439 		KASSERT(*entry_out != NULL);
440 		*last_out = NULL;
441 		if (!uvm_map_isavail(map, uaddr, entry_out, last_out,
442 		    *addr_out, sz)) {
443 			panic("uvm_addr_invoke: address selector %p "
444 			    "(%s 0x%lx-0x%lx) "
445 			    "returned unavailable address 0x%lx sz 0x%lx",
446 			    uaddr, uaddr->uaddr_functions->uaddr_name,
447 			    uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr,
448 			    *addr_out, sz);
449 		}
450 	}
451 
452 	return error;
453 }
454 
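/*
 * Illustrative sketch (not compiled): asking a map's selector for sz
 * bytes, in the style of the callers in uvm_map.c.  On success (0),
 * addr is filled in and first/last bound the chosen range; map, uaddr,
 * sz, align, offset, prot and hint are assumed to come from the caller.
 */
#if 0
	struct vm_map_entry	*first, *last;
	vaddr_t			 addr;
	int			 error;

	error = uvm_addr_invoke(map, uaddr, &first, &last, &addr,
	    sz, align, offset, prot, hint);
	if (error == 0) {
		/* [addr, addr + sz) is free space within first..last. */
	}
#endif
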
455 #if defined(DEBUG) || defined(DDB)
456 void
457 uvm_addr_print(struct uvm_addr_state *uaddr, const char *slot, boolean_t full,
458     int (*pr)(const char *, ...))
459 {
460 	if (uaddr == NULL) {
461 		(*pr)("- uvm_addr %s: NULL\n", slot);
462 		return;
463 	}
464 
465 	(*pr)("- uvm_addr %s: %p (%s 0x%lx-0x%lx)\n", slot, uaddr,
466 	    uaddr->uaddr_functions->uaddr_name,
467 	    uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr);
468 	if (uaddr->uaddr_functions->uaddr_print == NULL)
469 		return;
470 
471 	(*uaddr->uaddr_functions->uaddr_print)(uaddr, full, pr);
472 }
473 #endif /* DEBUG || DDB */
474 
475 /*
476  * Destroy a uvm_addr_state structure.
477  * The uaddr must have been previously allocated from uaddr_state_pool.
478  */
479 void
480 uaddr_destroy(struct uvm_addr_state *uaddr)
481 {
482 	pool_put(&uaddr_pool, uaddr);
483 }
484 
485 
486 #if 0
487 /*
488  * Linear allocator.
489  * This allocator uses a first-fit algorithm.
490  *
491  * If hint is set, search will start at the hint position.
492  * Only searches forward.
493  */
494 const struct uvm_addr_functions uaddr_lin_functions = {
495 	.uaddr_select = &uaddr_lin_select,
496 	.uaddr_destroy = &uaddr_destroy,
497 	.uaddr_name = "uaddr_lin"
498 };
499 
500 struct uvm_addr_state *
501 uaddr_lin_create(vaddr_t minaddr, vaddr_t maxaddr)
502 {
503 	struct uvm_addr_state *uaddr;
504 
505 	uaddr = pool_get(&uaddr_pool, PR_WAITOK);
506 	uaddr->uaddr_minaddr = minaddr;
507 	uaddr->uaddr_maxaddr = maxaddr;
508 	uaddr->uaddr_functions = &uaddr_lin_functions;
509 	return uaddr;
510 }
511 
512 int
513 uaddr_lin_select(struct vm_map *map, struct uvm_addr_state *uaddr,
514     struct vm_map_entry **entry_out, vaddr_t *addr_out,
515     vsize_t sz, vaddr_t align, vaddr_t offset,
516     vm_prot_t prot, vaddr_t hint)
517 {
518 	vaddr_t guard_sz;
519 
520 	/* Deal with guardpages: search for space with one extra page. */
521 	guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
522 
523 	if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr < sz + guard_sz)
524 		return ENOMEM;
525 	return uvm_addr_linsearch(map, uaddr, entry_out, addr_out, 0, sz,
526 	    align, offset, 1, uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr - sz,
527 	    0, guard_sz);
528 }
529 #endif
530 
531 /*
532  * Randomized allocator.
533  * This allocator uses uvm_map_hint to acquire a random address and searches
534  * from there.
535  */
536 
537 const struct uvm_addr_functions uaddr_rnd_functions = {
538 	.uaddr_select = &uaddr_rnd_select,
539 	.uaddr_free_insert = &uaddr_rnd_insert,
540 	.uaddr_free_remove = &uaddr_rnd_remove,
541 	.uaddr_destroy = &uaddr_rnd_destroy,
542 #if defined(DEBUG) || defined(DDB)
543 #if 0
544 	.uaddr_print = &uaddr_rnd_print,
545 #endif
546 #endif /* DEBUG || DDB */
547 	.uaddr_name = "uaddr_rnd"
548 };
549 
550 struct uvm_addr_state *
551 uaddr_rnd_create(vaddr_t minaddr, vaddr_t maxaddr)
552 {
553 	struct uaddr_rnd_state *uaddr;
554 
555 	uaddr = pool_get(&uaddr_rnd_pool, PR_WAITOK);
556 	uaddr->ur_uaddr.uaddr_minaddr = minaddr;
557 	uaddr->ur_uaddr.uaddr_maxaddr = maxaddr;
558 	uaddr->ur_uaddr.uaddr_functions = &uaddr_rnd_functions;
559 #if 0
560 	TAILQ_INIT(&uaddr->ur_free);
561 #endif
562 	return &uaddr->ur_uaddr;
563 }
564 
565 int
566 uaddr_rnd_select(struct vm_map *map, struct uvm_addr_state *uaddr,
567     struct vm_map_entry **entry_out, vaddr_t *addr_out,
568     vsize_t sz, vaddr_t align, vaddr_t offset,
569     vm_prot_t prot, vaddr_t hint)
570 {
571 	struct vmspace		*vm;
572 	vaddr_t			 minaddr, maxaddr;
573 	vaddr_t			 guard_sz;
574 	vaddr_t			 low_addr, high_addr;
575 	struct vm_map_entry	*entry, *next;
576 	vsize_t			 before_gap, after_gap;
577 	vaddr_t			 tmp;
578 
579 	KASSERT((map->flags & VM_MAP_ISVMSPACE) != 0);
580 	vm = (struct vmspace *)map;
581 
582 	/* Deal with guardpages: search for space with one extra page. */
583 	guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
584 
585 	minaddr = uvm_addr_align_forward(uaddr->uaddr_minaddr, align, offset);
586 	maxaddr = uvm_addr_align_backward(uaddr->uaddr_maxaddr - sz - guard_sz,
587 	    align, offset);
588 
589 	/* Quick fail if the allocation won't fit. */
590 	if (minaddr >= maxaddr)
591 		return ENOMEM;
592 
593 	/* Select a hint. */
594 	if (hint == 0)
595 		hint = uvm_map_hint(vm, prot, minaddr, maxaddr);
596 	/* Clamp hint to uaddr range. */
597 	hint = MIN(MAX(hint, minaddr), maxaddr);
598 
599 	/* Align hint to align,offset parameters. */
600 	tmp = hint;
601 	hint = uvm_addr_align_forward(tmp, align, offset);
602 	/* Check for overflow during alignment. */
603 	if (hint < tmp || hint > maxaddr)
604 		return ENOMEM; /* Compatibility mode: never look backwards. */
605 
606 	before_gap = 0;
607 	after_gap = guard_sz;
608 	hint -= MIN(hint, before_gap);
609 
610 	/*
611 	 * Use the augmented address tree to look up the first entry
612 	 * at or after hint with sufficient space.
613 	 *
614 	 * This code is the original optimized code, but will fail if the
615 	 * subtree it looks at has sufficient raw space, but no position in
616 	 * it satisfies the align constraint.
617 	 *
618 	 * Guard: subtree is not exhausted and max(fspace) >= required.
619 	 */
620 	entry = uvm_map_entrybyaddr(&map->addr, hint);
621 
622 	/* Walk up the tree, until there is at least sufficient space. */
623 	while (entry != NULL &&
624 	    entry->fspace_augment < before_gap + after_gap + sz)
625 		entry = RB_PARENT(entry, daddrs.addr_entry);
626 
627 	while (entry != NULL) {
628 		/* Test if this fits. */
629 		if (VMMAP_FREE_END(entry) > hint &&
630 		    uvm_map_uaddr_e(map, entry) == uaddr &&
631 		    uvm_addr_fitspace(&low_addr, &high_addr,
632 		    MAX(uaddr->uaddr_minaddr, VMMAP_FREE_START(entry)),
633 		    MIN(uaddr->uaddr_maxaddr, VMMAP_FREE_END(entry)),
634 		    sz, align, offset, before_gap, after_gap) == 0) {
635 			*entry_out = entry;
636 			if (hint >= low_addr && hint <= high_addr)
637 				*addr_out = hint;
638 			else
639 				*addr_out = low_addr;
640 			return 0;
641 		}
642 
643 		/* RB_NEXT, but skip subtrees that cannot possibly fit. */
644 		next = RB_RIGHT(entry, daddrs.addr_entry);
645 		if (next != NULL &&
646 		    next->fspace_augment >= before_gap + after_gap + sz) {
647 			entry = next;
648 			while ((next = RB_LEFT(entry, daddrs.addr_entry)) !=
649 			    NULL)
650 				entry = next;
651 		} else {
652 do_parent:
653 			next = RB_PARENT(entry, daddrs.addr_entry);
654 			if (next == NULL)
655 				entry = NULL;
656 			else if (RB_LEFT(next, daddrs.addr_entry) == entry)
657 				entry = next;
658 			else {
659 				entry = next;
660 				goto do_parent;
661 			}
662 		}
663 	}
664 
665 	/* Lookup failed. */
666 	return ENOMEM;
667 }
668 
669 /*
670  * Destroy a uaddr_rnd_state structure.
671  */
672 void
673 uaddr_rnd_destroy(struct uvm_addr_state *uaddr)
674 {
675 	pool_put(&uaddr_rnd_pool, uaddr);
676 }
677 
678 /*
679  * Add entry to tailq (disabled; nothing to do).
680  */
681 void
682 uaddr_rnd_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
683     struct vm_map_entry *entry)
684 {
685 	return;
686 }
687 
688 /*
689  * Remove entry from tailq (disabled; nothing to do).
690  */
691 void
692 uaddr_rnd_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
693     struct vm_map_entry *entry)
694 {
695 	return;
696 }
697 
698 #if 0
699 #if defined(DEBUG) || defined(DDB)
700 void
701 uaddr_rnd_print(struct uvm_addr_state *uaddr_p, boolean_t full,
702     int (*pr)(const char*, ...))
703 {
704 	struct vm_map_entry	*entry;
705 	struct uaddr_rnd_state	*uaddr;
706 	vaddr_t			 addr;
707 	size_t			 count;
708 	vsize_t			 space;
709 
710 	uaddr = (struct uaddr_rnd_state *)uaddr_p;
711 	addr = 0;
712 	count = 0;
713 	space = 0;
714 	TAILQ_FOREACH(entry, &uaddr->ur_free, dfree.tailq) {
715 		count++;
716 		space += entry->fspace;
717 
718 		if (full) {
719 			(*pr)("\tentry %p: 0x%lx-0x%lx G=0x%lx F=0x%lx\n",
720 			    entry, entry->start, entry->end,
721 			    entry->guard, entry->fspace);
722 			(*pr)("\t\tfree: 0x%lx-0x%lx\n",
723 			    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
724 		}
725 		if (entry->start < addr) {
726 			if (!full)
727 				(*pr)("\tentry %p: 0x%lx-0x%lx "
728 				    "G=0x%lx F=0x%lx\n",
729 				    entry, entry->start, entry->end,
730 				    entry->guard, entry->fspace);
731 			(*pr)("\t\tstart=0x%lx, expected at least 0x%lx\n",
732 			    entry->start, addr);
733 		}
734 
735 		addr = VMMAP_FREE_END(entry);
736 	}
737 	(*pr)("\t0x%lu entries, 0x%lx free bytes\n", count, space);
738 }
739 #endif /* DEBUG || DDB */
740 #endif
741 
742 /*
743  * An allocator that selects an address within max_dist of the hint.
744  *
745  * If no hint is given, the allocator refuses to allocate.
746  */
747 const struct uvm_addr_functions uaddr_hint_functions = {
748 	.uaddr_select = &uaddr_hint_select,
749 	.uaddr_destroy = &uaddr_hint_destroy,
750 	.uaddr_name = "uaddr_hint"
751 };
752 
753 /*
754  * Create uaddr_hint state.
755  */
756 struct uvm_addr_state *
757 uaddr_hint_create(vaddr_t minaddr, vaddr_t maxaddr, vsize_t max_dist)
758 {
759 	struct uaddr_hint_state *ua_hint;
760 
761 	KASSERT(uaddr_hint_pool.pr_size == sizeof(*ua_hint));
762 
763 	ua_hint = pool_get(&uaddr_hint_pool, PR_WAITOK);
764 	ua_hint->uaddr.uaddr_minaddr = minaddr;
765 	ua_hint->uaddr.uaddr_maxaddr = maxaddr;
766 	ua_hint->uaddr.uaddr_functions = &uaddr_hint_functions;
767 	ua_hint->max_dist = max_dist;
768 	return &ua_hint->uaddr;
769 }
770 
771 /*
772  * Destroy uaddr_hint state.
773  */
774 void
775 uaddr_hint_destroy(struct uvm_addr_state *uaddr)
776 {
777 	pool_put(&uaddr_hint_pool, uaddr);
778 }
779 
780 /*
781  * Hint selector.
782  *
783  * Attempts to find an address that is within max_dist of the hint.
784  */
785 int
786 uaddr_hint_select(struct vm_map *map, struct uvm_addr_state *uaddr_param,
787     struct vm_map_entry **entry_out, vaddr_t *addr_out,
788     vsize_t sz, vaddr_t align, vaddr_t offset,
789     vm_prot_t prot, vaddr_t hint)
790 {
791 	struct uaddr_hint_state	*uaddr =
792 	    (struct uaddr_hint_state *)uaddr_param;
793 	vsize_t			 before_gap, after_gap;
794 	vaddr_t			 low, high;
795 	int			 dir;
796 
797 	if (hint == 0)
798 		return ENOMEM;
799 
800 	/* Calculate upper bound for selected address. */
801 	high = hint + uaddr->max_dist;
802 	if (high < hint)	/* overflow */
803 		high = map->max_offset;
804 	high = MIN(high, uaddr->uaddr.uaddr_maxaddr);
805 	if (high < sz)
806 		return ENOMEM;	/* Protect against underflow. */
807 	high -= sz;
808 
809 	/* Calculate lower bound for selected address. */
810 	low = hint - uaddr->max_dist;
811 	if (low > hint)		/* underflow */
812 		low = map->min_offset;
813 	low = MAX(low, uaddr->uaddr.uaddr_minaddr);
814 
815 	/* Search strategy setup. */
816 	before_gap = PAGE_SIZE +
817 	    (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK);
818 	after_gap = PAGE_SIZE +
819 	    (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK);
820 	dir = (arc4random() & 0x01) ? 1 : -1;
821 
822 	/*
823 	 * Try to search:
824 	 * - forward,  with gap
825 	 * - backward, with gap
826 	 * - forward,  without gap
827 	 * - backward, without gap
828 	 * (Where forward is in the direction specified by dir and
829 	 * backward is in the direction specified by -dir).
830 	 */
831 	if (uvm_addr_linsearch(map, uaddr_param,
832 	    entry_out, addr_out, hint, sz, align, offset,
833 	    dir, low, high, before_gap, after_gap) == 0)
834 		return 0;
835 	if (uvm_addr_linsearch(map, uaddr_param,
836 	    entry_out, addr_out, hint, sz, align, offset,
837 	    -dir, low, high, before_gap, after_gap) == 0)
838 		return 0;
839 
840 	if (uvm_addr_linsearch(map, uaddr_param,
841 	    entry_out, addr_out, hint, sz, align, offset,
842 	    dir, low, high, 0, 0) == 0)
843 		return 0;
844 	if (uvm_addr_linsearch(map, uaddr_param,
845 	    entry_out, addr_out, hint, sz, align, offset,
846 	    -dir, low, high, 0, 0) == 0)
847 		return 0;
848 
849 	return ENOMEM;
850 }
851 
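/*
 * Illustrative sketch (not compiled): creating a hint selector that only
 * hands out addresses within 16 pages of the caller's hint.  The
 * minaddr/maxaddr values are placeholders, not real map boundaries.
 */
#if 0
	struct uvm_addr_state *ua_hint;

	ua_hint = uaddr_hint_create(PAGE_SIZE, 0x70000000UL, 16 * PAGE_SIZE);
	/* ... install ua_hint as one of the map's selectors ... */
	uvm_addr_destroy(ua_hint);
#endif
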
852 /*
853  * Kernel allocation bootstrap logic.
854  */
855 const struct uvm_addr_functions uaddr_kernel_functions = {
856 	.uaddr_select = &uaddr_kbootstrap_select,
857 	.uaddr_destroy = &uaddr_kbootstrap_destroy,
858 	.uaddr_name = "uaddr_kbootstrap"
859 };
860 
861 /*
862  * Select an address from the map.
863  *
864  * This function ignores the uaddr spec and instead uses the map directly.
865  * Because of that property, the uaddr algorithm can be shared across all
866  * kernel maps.
867  */
868 int
869 uaddr_kbootstrap_select(struct vm_map *map, struct uvm_addr_state *uaddr,
870     struct vm_map_entry **entry_out, vaddr_t *addr_out,
871     vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
872 {
873 	vaddr_t tmp;
874 
875 	RB_FOREACH(*entry_out, uvm_map_addr, &map->addr) {
876 		if (VMMAP_FREE_END(*entry_out) <= uvm_maxkaddr &&
877 		    uvm_addr_fitspace(addr_out, &tmp,
878 		    VMMAP_FREE_START(*entry_out), VMMAP_FREE_END(*entry_out),
879 		    sz, align, offset, 0, 0) == 0)
880 			return 0;
881 	}
882 
883 	return ENOMEM;
884 }
885 
886 /*
887  * Don't destroy the kernel bootstrap allocator.
888  */
889 void
890 uaddr_kbootstrap_destroy(struct uvm_addr_state *uaddr)
891 {
892 	KASSERT(uaddr == (struct uvm_addr_state *)&uaddr_kbootstrap);
893 }
894 
895 #ifndef SMALL_KERNEL
896 /*
897  * Best fit algorithm.
898  */
899 
900 const struct uvm_addr_functions uaddr_bestfit_functions = {
901 	.uaddr_select = &uaddr_bestfit_select,
902 	.uaddr_free_insert = &uaddr_bestfit_insert,
903 	.uaddr_free_remove = &uaddr_bestfit_remove,
904 	.uaddr_destroy = &uaddr_bestfit_destroy,
905 	.uaddr_name = "uaddr_bestfit"
906 };
907 
908 struct uvm_addr_state *
909 uaddr_bestfit_create(vaddr_t minaddr, vaddr_t maxaddr)
910 {
911 	struct uaddr_bestfit_state *uaddr;
912 
913 	uaddr = pool_get(&uaddr_bestfit_pool, PR_WAITOK);
914 	uaddr->ubf_uaddr.uaddr_minaddr = minaddr;
915 	uaddr->ubf_uaddr.uaddr_maxaddr = maxaddr;
916 	uaddr->ubf_uaddr.uaddr_functions = &uaddr_bestfit_functions;
917 	RB_INIT(&uaddr->ubf_free);
918 	return &uaddr->ubf_uaddr;
919 }
920 
921 void
922 uaddr_bestfit_destroy(struct uvm_addr_state *uaddr)
923 {
924 	pool_put(&uaddr_bestfit_pool, uaddr);
925 }
926 
927 void
928 uaddr_bestfit_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
929     struct vm_map_entry *entry)
930 {
931 	struct uaddr_bestfit_state	*uaddr;
932 	struct vm_map_entry		*rb_rv;
933 
934 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
935 	if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->ubf_free, entry)) !=
936 	    NULL) {
937 		panic("%s: duplicate insertion: state %p "
938 		    "inserting %p, colliding with %p", __func__,
939 		    uaddr, entry, rb_rv);
940 	}
941 }
942 
943 void
944 uaddr_bestfit_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
945     struct vm_map_entry *entry)
946 {
947 	struct uaddr_bestfit_state	*uaddr;
948 
949 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
950 	if (RB_REMOVE(uaddr_free_rbtree, &uaddr->ubf_free, entry) != entry)
951 		panic("%s: entry was not in tree", __func__);
952 }
953 
954 int
955 uaddr_bestfit_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
956     struct vm_map_entry **entry_out, vaddr_t *addr_out,
957     vsize_t sz, vaddr_t align, vaddr_t offset,
958     vm_prot_t prot, vaddr_t hint)
959 {
960 	vaddr_t				 min, max;
961 	struct uaddr_bestfit_state	*uaddr;
962 	struct vm_map_entry		*entry;
963 	vsize_t				 guardsz;
964 
965 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
966 	guardsz = ((map->flags & VM_MAP_GUARDPAGES) ? PAGE_SIZE : 0);
967 
968 	/*
969 	 * Find smallest item on freelist capable of holding item.
970 	 * Deal with guardpages: search for space with one extra page.
971 	 */
972 	entry = uvm_addr_entrybyspace(&uaddr->ubf_free, sz + guardsz);
973 	if (entry == NULL)
974 		return ENOMEM;
975 
976 	/* Walk the tree until we find an entry that fits.  */
977 	while (uvm_addr_fitspace(&min, &max,
978 	    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
979 	    sz, align, offset, 0, guardsz) != 0) {
980 		entry = RB_NEXT(uaddr_free_rbtree, &uaddr->ubf_free, entry);
981 		if (entry == NULL)
982 			return ENOMEM;
983 	}
984 
985 	/* Return the address that generates the least fragmentation. */
986 	*entry_out = entry;
987 	*addr_out = (min - VMMAP_FREE_START(entry) <=
988 	    VMMAP_FREE_END(entry) - guardsz - sz - max ?
989 	    min : max);
990 	return 0;
991 }
992 #endif /* !SMALL_KERNEL */
993 
994 
995 #ifndef SMALL_KERNEL
996 /*
997  * A userspace allocator based on pivots.
998  */
999 
1000 const struct uvm_addr_functions uaddr_pivot_functions = {
1001 	.uaddr_select = &uaddr_pivot_select,
1002 	.uaddr_free_insert = &uaddr_pivot_insert,
1003 	.uaddr_free_remove = &uaddr_pivot_remove,
1004 	.uaddr_destroy = &uaddr_pivot_destroy,
1005 #if defined(DEBUG) || defined(DDB)
1006 	.uaddr_print = &uaddr_pivot_print,
1007 #endif /* DEBUG || DDB */
1008 	.uaddr_name = "uaddr_pivot"
1009 };
1010 
1011 /*
1012  * A special random function for pivots.
1013  *
1014  * This function will return:
1015  * - a random number
1016  * - a multiple of PAGE_SIZE
1017  * - at least PAGE_SIZE
1018  *
1019  * The random function has a slightly higher chance to return a small number.
1020  */
1021 vsize_t
1022 uaddr_pivot_random(void)
1023 {
1024 	int			r;
1025 
1026 	/*
1027 	 * The sum of two uniform random numbers has a triangular
1028 	 * distribution; we fold that curve at its peak, PIVOT_RND - 1
1029 	 * (think of a graph on a piece of paper, that you fold).
1030 	 *
1031 	 * Because of the fold, small results are far more likely than
1032 	 * large ones, which keeps the gaps between allocations small.
1033 	 */
1034 	r = arc4random_uniform(PIVOT_RND) + arc4random_uniform(PIVOT_RND) -
1035 	    (PIVOT_RND - 1);
1036 	if (r < 0)
1037 		r = -r;
1038 
1039 	/*
1040 	 * Make the returned value at least PAGE_SIZE and a multiple of
1041 	 * PAGE_SIZE.
1042 	 */
1043 	return (vaddr_t)(1 + r) << PAGE_SHIFT;
1044 }
1045 
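/*
 * Illustrative sketch (not compiled): with PIVOT_RND == 8, the folded
 * value lies in [0, PIVOT_RND - 1], so uaddr_pivot_random() always
 * returns a whole number of pages between PAGE_SIZE and
 * PIVOT_RND * PAGE_SIZE, biased towards the small end of that range.
 * The example function is hypothetical.
 */
#if 0
static void
uaddr_pivot_random_example(void)
{
	vsize_t gap = uaddr_pivot_random();

	KASSERT(gap >= PAGE_SIZE && gap <= PIVOT_RND * PAGE_SIZE);
	KASSERT((gap & PAGE_MASK) == 0);
}
#endif
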
1046 /*
1047  * Select a new pivot.
1048  *
1049  * A pivot must:
1050  * - be chosen at random
1051  * - have a randomly chosen gap between it and the start of the uaddr_state
1052  * - have a randomly chosen gap between it and the end of the uaddr_state
1053  *
1054  * Furthermore, the pivot must provide sufficient space for the allocation.
1055  * The addr will be set to the selected address.
1056  *
1057  * Returns ENOMEM on failure.
1058  */
1059 int
1060 uaddr_pivot_newpivot(struct vm_map *map, struct uaddr_pivot_state *uaddr,
1061     struct uaddr_pivot *pivot,
1062     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1063     vsize_t sz, vaddr_t align, vaddr_t offset,
1064     vsize_t before_gap, vsize_t after_gap)
1065 {
1066 	struct vm_map_entry		*entry, *found;
1067 	vaddr_t				 minaddr, maxaddr;
1068 	vsize_t				 dist;
1069 	vaddr_t				 found_minaddr, found_maxaddr;
1070 	vaddr_t				 min, max;
1071 	vsize_t				 arc4_arg;
1072 	int				 fit_error;
1073 	u_int32_t			 path;
1074 
1075 	minaddr = uaddr->up_uaddr.uaddr_minaddr;
1076 	maxaddr = uaddr->up_uaddr.uaddr_maxaddr;
1077 	KASSERT(minaddr < maxaddr);
1078 #ifdef DIAGNOSTIC
1079 	if (minaddr + 2 * PAGE_SIZE > maxaddr) {
1080 		panic("uaddr_pivot_newpivot: cannot grant random pivot "
1081 		    "in area less than 2 pages (size = 0x%lx)",
1082 		    maxaddr - minaddr);
1083 	}
1084 #endif /* DIAGNOSTIC */
1085 
1086 	/*
1087 	 * Gap calculation: 1/32 of the size of the managed area.
1088 	 *
1089 	 * At most: sufficient to not get truncated at arc4random.
1090 	 * At least: 2 PAGE_SIZE
1091 	 *
1092 	 * minaddr and maxaddr will be changed according to arc4random.
1093 	 */
1094 	dist = MAX((maxaddr - minaddr) / 32, 2 * (vaddr_t)PAGE_SIZE);
1095 	if (dist >> PAGE_SHIFT > 0xffffffff) {
1096 		minaddr += (vsize_t)arc4random() << PAGE_SHIFT;
1097 		maxaddr -= (vsize_t)arc4random() << PAGE_SHIFT;
1098 	} else {
1099 		minaddr += (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
1100 		    PAGE_SHIFT;
1101 		maxaddr -= (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
1102 		    PAGE_SHIFT;
1103 	}
1104 
1105 	/*
1106 	 * A very fast way to find an entry that will be large enough
1107 	 * to hold the allocation, but still is found more or less
1108 	 * randomly: the tree path selector has a 50% chance to go for
1109 	 * a bigger or smaller entry.
1110 	 *
1111 	 * Note that the memory may actually be available,
1112 	 * but the fragmentation may be so bad and the gaps chosen
1113 	 * so unfortunately, that the allocation will not succeed.
1114 	 * Or the alignment can only be satisfied by an entry that
1115 	 * is not visited in the randomly selected path.
1116 	 *
1117 	 * This code finds an entry with sufficient space in O(log n) time.
1118 	 */
1119 	path = arc4random();
1120 	found = NULL;
1121 	entry = RB_ROOT(&uaddr->up_free);
1122 	while (entry != NULL) {
1123 		fit_error = uvm_addr_fitspace(&min, &max,
1124 		    MAX(VMMAP_FREE_START(entry), minaddr),
1125 		    MIN(VMMAP_FREE_END(entry), maxaddr),
1126 		    sz, align, offset, before_gap, after_gap);
1127 
1128 		/* It fits, save this entry. */
1129 		if (fit_error == 0) {
1130 			found = entry;
1131 			found_minaddr = min;
1132 			found_maxaddr = max;
1133 		}
1134 
1135 		/* Next. */
1136 		if (fit_error != 0)
1137 			entry = RB_RIGHT(entry, dfree.rbtree);
1138 		else if	((path & 0x1) == 0) {
1139 			path >>= 1;
1140 			entry = RB_RIGHT(entry, dfree.rbtree);
1141 		} else {
1142 			path >>= 1;
1143 			entry = RB_LEFT(entry, dfree.rbtree);
1144 		}
1145 	}
1146 	if (found == NULL)
1147 		return ENOMEM;	/* Did not find a large enough region. */
1148 
1149 	/*
1150 	 * Calculate a random address within found.
1151 	 *
1152 	 * found_minaddr and found_maxaddr are already aligned, so be sure
1153 	 * to select a multiple of align as the offset in the entry.
1154 	 * Preferably, arc4random_uniform is used to provide no bias within
1155 	 * the entry.
1156 	 * However, if the size of the entry exceeds arc4random_uniform's
1157 	 * argument limit, we simply use arc4random (thus limiting ourselves
1158 	 * to 4G * PAGE_SIZE bytes offset).
1159 	 */
1160 	if (found_maxaddr == found_minaddr)
1161 		*addr_out = found_minaddr;
1162 	else {
1163 		KASSERT(align >= PAGE_SIZE && (align & (align - 1)) == 0);
1164 		arc4_arg = found_maxaddr - found_minaddr;
1165 		if (arc4_arg > 0xffffffff) {
1166 			*addr_out = found_minaddr +
1167 			    (arc4random() & (align - 1));
1168 		} else {
1169 			*addr_out = found_minaddr +
1170 			    (arc4random_uniform(arc4_arg) & (align - 1));
1171 		}
1172 	}
1173 	/* Address was found in this entry. */
1174 	*entry_out = found;
1175 
1176 	/*
1177 	 * Set up new pivot and return selected address.
1178 	 *
1179 	 * Depending on the direction of the pivot, the pivot must be placed
1180 	 * at the bottom or the top of the allocation:
1181 	 * - if the pivot moves upwards, place the pivot at the top of the
1182 	 *   allocation,
1183 	 * - if the pivot moves downwards, place the pivot at the bottom
1184 	 *   of the allocation.
1185 	 */
1186 	pivot->entry = found;
1187 	pivot->dir = (arc4random() & 0x1 ? 1 : -1);
1188 	if (pivot->dir > 0)
1189 		pivot->addr = *addr_out + sz;
1190 	else
1191 		pivot->addr = *addr_out;
1192 	pivot->expire = PIVOT_EXPIRE - 1; /* First use is right now. */
1193 	return 0;
1194 }
1195 
1196 /*
1197  * Pivot selector.
1198  *
1199  * Each time the selector is invoked, it will select a random pivot, which
1200  * it will use to select memory with. The memory will be placed at the pivot,
1201  * with a randomly sized gap between the allocation and the pivot.
1202  * The pivot will then move so it will never revisit this address.
1203  *
1204  * On each allocation, the pivot expiry timer ticks. Once the pivot becomes
1205  * expired, it will be replaced with a newly created pivot. Pivots also
1206  * automatically expire if they fail to provide memory for an allocation.
1207  *
1208  * Expired pivots are replaced using the uaddr_pivot_newpivot() function,
1209  * which will ensure the pivot points at memory in such a way that the
1210  * allocation will succeed.
1211  * As an added bonus, the uaddr_pivot_newpivot() function will perform the
1212  * allocation immediately and move the pivot as appropriate.
1213  *
1214  * If uaddr_pivot_newpivot() fails to find a new pivot that will allow the
1215  * allocation to succeed, it will not create a new pivot and the allocation
1216  * will fail.
1217  *
1218  * A pivot running into used memory will automatically expire (because it will
1219  * fail to allocate).
1220  *
1221  * Characteristics of the allocator:
1222  * - best case, an allocation is O(log N)
1223  *   (it would be O(1), if it weren't for the need to check if the memory is
1224  *   free; although that can be avoided...)
1225  * - worst case, an allocation is O(log N)
1226  *   (the uaddr_pivot_newpivot() function has that complexity)
1227  * - failed allocations always take O(log N)
1228  *   (the uaddr_pivot_newpivot() function will walk that deep into the tree).
1229  */
1230 int
1231 uaddr_pivot_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1232     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1233     vsize_t sz, vaddr_t align, vaddr_t offset,
1234     vm_prot_t prot, vaddr_t hint)
1235 {
1236 	struct uaddr_pivot_state	*uaddr;
1237 	struct vm_map_entry		*entry;
1238 	struct uaddr_pivot		*pivot;
1239 	vaddr_t				 min, max;
1240 	vsize_t				 before_gap, after_gap;
1241 	int				 err;
1242 
1243 	/* Hint must be handled by dedicated hint allocator. */
1244 	if (hint != 0)
1245 		return EINVAL;
1246 
1247 	/*
1248 	 * Select a random pivot and random gap sizes around the allocation.
1249 	 */
1250 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1251 	pivot = &uaddr->up_pivots[
1252 	    arc4random_uniform(nitems(uaddr->up_pivots))];
1253 	before_gap = uaddr_pivot_random();
1254 	after_gap = uaddr_pivot_random();
1255 	if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0)
1256 		goto expired;	/* Pivot is invalid (null or expired). */
1257 
1258 	/* Attempt to use the pivot to map the entry. */
1259 	entry = pivot->entry;
1260 	if (pivot->dir > 0) {
1261 		if (uvm_addr_fitspace(&min, &max,
1262 		    MAX(VMMAP_FREE_START(entry), pivot->addr),
1263 		    VMMAP_FREE_END(entry), sz, align, offset,
1264 		    before_gap, after_gap) == 0) {
1265 			*addr_out = min;
1266 			*entry_out = entry;
1267 			pivot->addr = min + sz;
1268 			pivot->expire--;
1269 			return 0;
1270 		}
1271 	} else {
1272 		if (uvm_addr_fitspace(&min, &max,
1273 		    VMMAP_FREE_START(entry),
1274 		    MIN(VMMAP_FREE_END(entry), pivot->addr),
1275 		    sz, align, offset, before_gap, after_gap) == 0) {
1276 			*addr_out = max;
1277 			*entry_out = entry;
1278 			pivot->addr = max;
1279 			pivot->expire--;
1280 			return 0;
1281 		}
1282 	}
1283 
1284 expired:
1285 	/*
1286 	 * Pivot expired or allocation failed.
1287 	 * Use pivot selector to do the allocation and find a new pivot.
1288 	 */
1289 	err = uaddr_pivot_newpivot(map, uaddr, pivot, entry_out, addr_out,
1290 	    sz, align, offset, before_gap, after_gap);
1291 	return err;
1292 }
1293 
1294 /*
1295  * Free the pivot.
1296  */
1297 void
1298 uaddr_pivot_destroy(struct uvm_addr_state *uaddr)
1299 {
1300 	pool_put(&uaddr_pivot_pool, uaddr);
1301 }
1302 
1303 /*
1304  * Insert an entry with free space in the space tree.
1305  */
1306 void
1307 uaddr_pivot_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1308     struct vm_map_entry *entry)
1309 {
1310 	struct uaddr_pivot_state	*uaddr;
1311 	struct vm_map_entry		*rb_rv;
1312 	struct uaddr_pivot		*p;
1313 	vaddr_t				 check_addr;
1314 	vaddr_t				 start, end;
1315 
1316 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1317 	if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->up_free, entry)) !=
1318 	    NULL) {
1319 		panic("%s: duplicate insertion: state %p "
1320 		    "inserting entry %p which collides with %p", __func__,
1321 		    uaddr, entry, rb_rv);
1322 	}
1323 
1324 	start = VMMAP_FREE_START(entry);
1325 	end = VMMAP_FREE_END(entry);
1326 
1327 	/*
1328 	 * Update all pivots that are contained in this entry.
1329 	 */
1330 	for (p = &uaddr->up_pivots[0];
1331 	    p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
1332 		check_addr = p->addr;
1333 		if (check_addr == 0)
1334 			continue;
1335 		if (p->dir < 0)
1336 			check_addr--;
1337 
1338 		if (start <= check_addr &&
1339 		    check_addr < end) {
1340 			KASSERT(p->entry == NULL);
1341 			p->entry = entry;
1342 		}
1343 	}
1344 }
1345 
1346 /*
1347  * Remove an entry with free space from the space tree.
1348  */
1349 void
1350 uaddr_pivot_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1351     struct vm_map_entry *entry)
1352 {
1353 	struct uaddr_pivot_state	*uaddr;
1354 	struct uaddr_pivot		*p;
1355 
1356 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1357 	if (RB_REMOVE(uaddr_free_rbtree, &uaddr->up_free, entry) != entry)
1358 		panic("%s: entry was not in tree", __func__);
1359 
1360 	/*
1361 	 * Inform any pivot with this entry that the entry is gone.
1362 	 * Note that this does not automatically invalidate the pivot.
1363 	 */
1364 	for (p = &uaddr->up_pivots[0];
1365 	    p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
1366 		if (p->entry == entry)
1367 			p->entry = NULL;
1368 	}
1369 }
1370 
1371 /*
1372  * Create a new pivot selector.
1373  *
1374  * Initially, all pivots are in the expired state.
1375  * Two reasons for this:
1376  * - it means creating this allocator will not take a huge amount of time
1377  * - pivots select better on demand, because the pivot selection will be
1378  *   affected by preceding allocations:
1379  *   the next pivots will likely end up in different segments of free memory,
1380  *   that was segmented by an earlier allocation; better spread.
1381  */
1382 struct uvm_addr_state *
1383 uaddr_pivot_create(vaddr_t minaddr, vaddr_t maxaddr)
1384 {
1385 	struct uaddr_pivot_state *uaddr;
1386 
1387 	uaddr = pool_get(&uaddr_pivot_pool, PR_WAITOK);
1388 	uaddr->up_uaddr.uaddr_minaddr = minaddr;
1389 	uaddr->up_uaddr.uaddr_maxaddr = maxaddr;
1390 	uaddr->up_uaddr.uaddr_functions = &uaddr_pivot_functions;
1391 	RB_INIT(&uaddr->up_free);
1392 	memset(uaddr->up_pivots, 0, sizeof(uaddr->up_pivots));
1393 
1394 	return &uaddr->up_uaddr;
1395 }
1396 
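/*
 * Illustrative sketch (not compiled): creating a pivot selector for a
 * hypothetical userland range and destroying it again.  The bounds are
 * placeholders; real maps obtain their ranges from the architecture.
 */
#if 0
	struct uvm_addr_state *ua_pivot;

	ua_pivot = uaddr_pivot_create(PAGE_SIZE, 0x70000000UL);
	/* ... install ua_pivot as one of the map's selectors ... */
	uvm_addr_destroy(ua_pivot);
#endif
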
1397 #if defined(DEBUG) || defined(DDB)
1398 /*
1399  * Print the uaddr_pivot_state.
1400  *
1401  * If full, a listing of all entries in the state will be provided.
1402  */
1403 void
1404 uaddr_pivot_print(struct uvm_addr_state *uaddr_p, boolean_t full,
1405     int (*pr)(const char *, ...))
1406 {
1407 	struct uaddr_pivot_state	*uaddr;
1408 	struct uaddr_pivot		*pivot;
1409 	struct vm_map_entry		*entry;
1410 	int				 i;
1411 	vaddr_t				 check_addr;
1412 
1413 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1414 
1415 	for (i = 0; i < NUM_PIVOTS; i++) {
1416 		pivot = &uaddr->up_pivots[i];
1417 
1418 		(*pr)("\tpivot 0x%lx, expires in %d, direction %d\n",
1419 		    pivot->addr, pivot->expire, pivot->dir);
1420 	}
1421 	if (!full)
1422 		return;
1423 
1424 	if (RB_EMPTY(&uaddr->up_free))
1425 		(*pr)("\tempty\n");
1426 	/* Print list of free space. */
1427 	RB_FOREACH(entry, uaddr_free_rbtree, &uaddr->up_free) {
1428 		(*pr)("\t0x%lx - 0x%lx free (0x%lx bytes)\n",
1429 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
1430 		    VMMAP_FREE_END(entry) - VMMAP_FREE_START(entry));
1431 
1432 		for (i = 0; i < NUM_PIVOTS; i++) {
1433 			pivot = &uaddr->up_pivots[i];
1434 			check_addr = pivot->addr;
1435 			if (check_addr == 0)
1436 				continue;
1437 			if (pivot->dir < 0)
1438 				check_addr--;
1439 
1440 			if (VMMAP_FREE_START(entry) <= check_addr &&
1441 			    check_addr < VMMAP_FREE_END(entry)) {
1442 				(*pr)("\t\tcontains pivot %d (0x%lx)\n",
1443 				    i, pivot->addr);
1444 			}
1445 		}
1446 	}
1447 }
1448 #endif /* DEBUG || DDB */
1449 #endif /* !SMALL_KERNEL */
1450 
1451 #ifndef SMALL_KERNEL
1452 /*
1453  * Strategy for uaddr_stack_brk_select.
1454  */
1455 struct uaddr_bs_strat {
1456 	vaddr_t			start;		/* Start of area. */
1457 	vaddr_t			end;		/* End of area. */
1458 	int			dir;		/* Search direction. */
1459 };
1460 
1461 /*
1462  * Stack/break allocator.
1463  *
1464  * The stack area is filled in the direction opposite to stack growth;
1465  * the brk area is filled downward (because sbrk() grows upward).
1466  *
1467  * Both areas are grown into proportionally: a weighted chance is used to
1468  * select which one (stack or brk area) to try. If the allocation fails,
1469  * the other one is tested.
1470  */
1471 const struct uvm_addr_functions uaddr_stack_brk_functions = {
1472 	.uaddr_select = &uaddr_stack_brk_select,
1473 	.uaddr_destroy = &uaddr_destroy,
1474 	.uaddr_name = "uaddr_stckbrk"
1475 };
1476 
1477 /*
1478  * Stack/brk address selector.
1479  */
1480 int
1481 uaddr_stack_brk_select(struct vm_map *map, struct uvm_addr_state *uaddr,
1482     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1483     vsize_t sz, vaddr_t align, vaddr_t offset,
1484     vm_prot_t prot, vaddr_t hint)
1485 {
1486 	vsize_t			before_gap, after_gap;
1487 	int			stack_idx, brk_idx;
1488 	struct uaddr_bs_strat	strat[2], *s;
1489 	vsize_t			sb_size;
1490 
1491 	/*
1492 	 * Choose gap size and whether the stack is searched before or after the
1493 	 * brk area.
1494 	 */
1495 	before_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
1496 	after_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
1497 
1498 	sb_size = (map->s_end - map->s_start) + (map->b_end - map->b_start);
1499 	sb_size >>= PAGE_SHIFT;
1500 	if (arc4random_uniform(MAX(sb_size, 0xffffffff)) >
1501 	    map->b_end - map->b_start) {
1502 		brk_idx = 1;
1503 		stack_idx = 0;
1504 	} else {
1505 		brk_idx = 0;
1506 		stack_idx = 1;
1507 	}
1508 
1509 	/* Set up stack search strategy. */
1510 	s = &strat[stack_idx];
1511 	s->start = MAX(map->s_start, uaddr->uaddr_minaddr);
1512 	s->end = MIN(map->s_end, uaddr->uaddr_maxaddr);
1513 #ifdef MACHINE_STACK_GROWS_UP
1514 	s->dir = -1;
1515 #else
1516 	s->dir =  1;
1517 #endif
1518 
1519 	/* Set up brk search strategy. */
1520 	s = &strat[brk_idx];
1521 	s->start = MAX(map->b_start, uaddr->uaddr_minaddr);
1522 	s->end = MIN(map->b_end, uaddr->uaddr_maxaddr);
1523 	s->dir = -1;	/* Opposite of brk() growth. */
1524 
1525 	/* Linear search for space.  */
1526 	for (s = &strat[0]; s < &strat[nitems(strat)]; s++) {
1527 		if (s->end - s->start < sz)
1528 			continue;
1529 		if (uvm_addr_linsearch(map, uaddr, entry_out, addr_out,
1530 		    0, sz, align, offset, s->dir, s->start, s->end - sz,
1531 		    before_gap, after_gap) == 0)
1532 			return 0;
1533 	}
1534 
1535 	return ENOMEM;
1536 }
1537 
1538 struct uvm_addr_state *
1539 uaddr_stack_brk_create(vaddr_t minaddr, vaddr_t maxaddr)
1540 {
1541 	struct uvm_addr_state* uaddr;
1542 
1543 	uaddr = pool_get(&uaddr_pool, PR_WAITOK);
1544 	uaddr->uaddr_minaddr = minaddr;
1545 	uaddr->uaddr_maxaddr = maxaddr;
1546 	uaddr->uaddr_functions = &uaddr_stack_brk_functions;
1547 	return uaddr;
1548 }
1549 #endif /* !SMALL_KERNEL */
1550 
1551 
1552 #ifndef SMALL_KERNEL
1553 RB_GENERATE(uaddr_free_rbtree, vm_map_entry, dfree.rbtree,
1554     uvm_mapent_fspace_cmp);
1555 #endif /* !SMALL_KERNEL */
1556