1 /*	$OpenBSD: uvm_addr.c,v 1.8 2014/06/10 20:08:48 kettenis Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 /* #define DEBUG */
20 
21 #include <sys/param.h>
22 #include <uvm/uvm.h>
23 #include <uvm/uvm_addr.h>
24 #include <sys/pool.h>
25 #include <dev/rndvar.h>
26 
27 /* Max gap between hint allocations. */
28 #define UADDR_HINT_MAXGAP	(4 * PAGE_SIZE)
29 /* Number of pivots in pivot allocator. */
30 #define NUM_PIVOTS		16
31 /*
32  * Max number (inclusive) of pages the pivot allocator
33  * will place between allocations.
34  *
35  * The uaddr_pivot_random() function attempts to bias towards
36  * small space between allocations, so putting a large number here is fine.
37  */
38 #define PIVOT_RND		8
39 /*
40  * Number of allocations that a pivot can supply before expiring.
41  * When a pivot expires, a new pivot has to be found.
42  *
43  * Must be at least 1.
44  */
45 #define PIVOT_EXPIRE		1024
46 
47 
48 /* Pool with uvm_addr_state structures. */
49 struct pool uaddr_pool;
50 struct pool uaddr_hint_pool;
51 struct pool uaddr_bestfit_pool;
52 struct pool uaddr_pivot_pool;
53 struct pool uaddr_rnd_pool;
54 
55 /* uvm_addr state for hint based selector. */
56 struct uaddr_hint_state {
57 	struct uvm_addr_state		 uaddr;
58 	vsize_t				 max_dist;
59 };
60 
61 /* uvm_addr state for bestfit selector. */
62 struct uaddr_bestfit_state {
63 	struct uvm_addr_state		 ubf_uaddr;
64 	struct uaddr_free_rbtree	 ubf_free;
65 };
66 
67 /* uvm_addr state for rnd selector. */
68 struct uaddr_rnd_state {
69 	struct uvm_addr_state		 ur_uaddr;
70 	TAILQ_HEAD(, vm_map_entry)	 ur_free;
71 };
72 
73 /* Definition of a pivot in pivot selector. */
74 struct uaddr_pivot {
75 	vaddr_t				 addr;	/* End of prev. allocation. */
76 	int				 expire;/* Best before date. */
77 	int				 dir;	/* Direction. */
78 	struct vm_map_entry		*entry; /* Will contain next alloc. */
79 };
80 /* uvm_addr state for pivot selector. */
81 struct uaddr_pivot_state {
82 	struct uvm_addr_state		 up_uaddr;
83 
84 	/* Free space tree, for fast pivot selection. */
85 	struct uaddr_free_rbtree	 up_free;
86 
88 	/* List of pivots. A pivot's address points just past its last allocation. */
88 	struct uaddr_pivot		 up_pivots[NUM_PIVOTS];
89 };
90 
91 /*
92  * Free space comparison.
93  * Compares smaller free-space before larger free-space.
94  */
95 static __inline int
96 uvm_mapent_fspace_cmp(struct vm_map_entry *e1, struct vm_map_entry *e2)
97 {
98 	if (e1->fspace != e2->fspace)
99 		return (e1->fspace < e2->fspace ? -1 : 1);
100 	return (e1->start < e2->start ? -1 : e1->start > e2->start);
101 }
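
/*
 * Illustrative ordering (example values, not part of the build): entries
 * A (fspace 0x2000, start 0x1000), B (fspace 0x5000, start 0x0000) and
 * C (fspace 0x2000, start 0x8000) sort as A < C < B: smaller free space
 * first, ties broken by lower start address.
 */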
102 
103 /* Forward declaration (see below). */
104 extern const struct uvm_addr_functions uaddr_kernel_functions;
105 struct uvm_addr_state uaddr_kbootstrap;
106 
107 /* Support functions. */
108 #ifndef SMALL_KERNEL
109 struct vm_map_entry	*uvm_addr_entrybyspace(struct uaddr_free_rbtree*,
110 			    vsize_t);
111 #endif /* !SMALL_KERNEL */
112 void			 uaddr_kinsert(struct vm_map *,
113 			    struct uvm_addr_state *, struct vm_map_entry *);
114 void			 uaddr_kremove(struct vm_map *,
115 			    struct uvm_addr_state *, struct vm_map_entry *);
116 void			 uaddr_kbootstrapdestroy(struct uvm_addr_state *);
117 
118 void			 uaddr_destroy(struct uvm_addr_state *);
119 void			 uaddr_hint_destroy(struct uvm_addr_state *);
120 void			 uaddr_kbootstrap_destroy(struct uvm_addr_state *);
121 void			 uaddr_rnd_destroy(struct uvm_addr_state *);
122 void			 uaddr_bestfit_destroy(struct uvm_addr_state *);
123 void			 uaddr_pivot_destroy(struct uvm_addr_state *);
124 
125 int			 uaddr_lin_select(struct vm_map *,
126 			    struct uvm_addr_state *, struct vm_map_entry **,
127 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
128 			    vaddr_t);
129 int			 uaddr_kbootstrap_select(struct vm_map *,
130 			    struct uvm_addr_state *, struct vm_map_entry **,
131 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
132 			    vaddr_t);
133 int			 uaddr_rnd_select(struct vm_map *,
134 			    struct uvm_addr_state *, struct vm_map_entry **,
135 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
136 			    vaddr_t);
137 int			 uaddr_hint_select(struct vm_map *,
138 			    struct uvm_addr_state*, struct vm_map_entry **,
139 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
140 			    vaddr_t);
141 int			 uaddr_bestfit_select(struct vm_map *,
142 			    struct uvm_addr_state*, struct vm_map_entry **,
143 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
144 			    vaddr_t);
145 #ifndef SMALL_KERNEL
146 int			 uaddr_pivot_select(struct vm_map *,
147 			    struct uvm_addr_state *, struct vm_map_entry **,
148 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
149 			    vaddr_t);
150 int			 uaddr_stack_brk_select(struct vm_map *,
151 			    struct uvm_addr_state *, struct vm_map_entry **,
152 			    vaddr_t *, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
153 			    vaddr_t);
154 #endif /* !SMALL_KERNEL */
155 
156 void			 uaddr_rnd_insert(struct vm_map *,
157 			    struct uvm_addr_state *, struct vm_map_entry *);
158 void			 uaddr_rnd_remove(struct vm_map *,
159 			    struct uvm_addr_state *, struct vm_map_entry *);
160 void			 uaddr_bestfit_insert(struct vm_map *,
161 			    struct uvm_addr_state *, struct vm_map_entry *);
162 void			 uaddr_bestfit_remove(struct vm_map *,
163 			    struct uvm_addr_state *, struct vm_map_entry *);
164 void			 uaddr_pivot_insert(struct vm_map *,
165 			    struct uvm_addr_state *, struct vm_map_entry *);
166 void			 uaddr_pivot_remove(struct vm_map *,
167 			    struct uvm_addr_state *, struct vm_map_entry *);
168 
169 #ifndef SMALL_KERNEL
170 vsize_t			 uaddr_pivot_random(void);
171 int			 uaddr_pivot_newpivot(struct vm_map *,
172 			    struct uaddr_pivot_state *, struct uaddr_pivot *,
173 			    struct vm_map_entry **, vaddr_t *,
174 			    vsize_t, vaddr_t, vaddr_t, vsize_t, vsize_t);
175 #endif /* !SMALL_KERNEL */
176 
177 #if defined(DEBUG) || defined(DDB)
178 void			 uaddr_pivot_print(struct uvm_addr_state *, boolean_t,
179 			    int (*)(const char *, ...));
180 void			 uaddr_rnd_print(struct uvm_addr_state *, boolean_t,
181 			    int (*)(const char *, ...));
182 #endif /* DEBUG || DDB */
183 
184 
185 #ifndef SMALL_KERNEL
186 /*
187  * Find smallest entry in tree that will fit sz bytes.
188  */
189 struct vm_map_entry *
190 uvm_addr_entrybyspace(struct uaddr_free_rbtree *free, vsize_t sz)
191 {
192 	struct vm_map_entry	*tmp, *res;
193 
194 	tmp = RB_ROOT(free);
195 	res = NULL;
196 	while (tmp) {
197 		if (tmp->fspace >= sz) {
198 			res = tmp;
199 			tmp = RB_LEFT(tmp, dfree.rbtree);
200 		} else if (tmp->fspace < sz)
201 			tmp = RB_RIGHT(tmp, dfree.rbtree);
202 	}
203 	return res;
204 }
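
/*
 * Illustrative lookup (example values): with free entries of 0x2000,
 * 0x5000 and 0x8000 bytes in the tree, uvm_addr_entrybyspace(free, 0x3000)
 * returns the 0x5000 entry: the smallest free range that still fits the
 * request.
 */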
205 #endif /* !SMALL_KERNEL */
206 
207 static __inline vaddr_t
208 uvm_addr_align_forward(vaddr_t addr, vaddr_t align, vaddr_t offset)
209 {
210 	vaddr_t adjusted;
211 
212 	KASSERT(offset < align || (align == 0 && offset == 0));
213 	KASSERT((align & (align - 1)) == 0);
214 	KASSERT((offset & PAGE_MASK) == 0);
215 
216 	align = MAX(align, PAGE_SIZE);
217 	adjusted = addr & ~(align - 1);
218 	adjusted += offset;
219 	return (adjusted < addr ? adjusted + align : adjusted);
220 }
221 
222 static __inline vaddr_t
223 uvm_addr_align_backward(vaddr_t addr, vaddr_t align, vaddr_t offset)
224 {
225 	vaddr_t adjusted;
226 
227 	KASSERT(offset < align || (align == 0 && offset == 0));
228 	KASSERT((align & (align - 1)) == 0);
229 	KASSERT((offset & PAGE_MASK) == 0);
230 
231 	align = MAX(align, PAGE_SIZE);
232 	adjusted = addr & ~(align - 1);
233 	adjusted += offset;
234 	return (adjusted > addr ? adjusted - align : adjusted);
235 }
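
/*
 * Worked example for the two helpers above, assuming a 4 KB PAGE_SIZE:
 * with addr = 0x12345000, align = 0x10000 and offset = 0x2000,
 *
 *	uvm_addr_align_forward()  masks addr to 0x12340000, adds the offset
 *	(0x12342000) and, since that lies below addr, adds align once more,
 *	yielding 0x12352000;
 *	uvm_addr_align_backward() also computes 0x12342000, which already
 *	lies below addr, so it is returned as-is.
 *
 * Both results satisfy (result & (align - 1)) == offset.
 */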
236 
237 /*
238  * Try to fit the requested space into the entry.
239  */
240 int
241 uvm_addr_fitspace(vaddr_t *min_result, vaddr_t *max_result,
242     vaddr_t low_addr, vaddr_t high_addr, vsize_t sz,
243     vaddr_t align, vaddr_t offset,
244     vsize_t before_gap, vsize_t after_gap)
245 {
246 	vaddr_t	tmp;
247 	vsize_t	fspace;
248 
249 	if (low_addr > high_addr)
250 		return ENOMEM;
251 	fspace = high_addr - low_addr;
252 	if (fspace < sz + before_gap + after_gap)
253 		return ENOMEM;
254 
255 	/* Calculate lowest address. */
256 	low_addr += before_gap;
257 	low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset);
258 	if (low_addr < tmp)	/* Overflow during alignment. */
259 		return ENOMEM;
260 	if (high_addr - after_gap - sz < low_addr)
261 		return ENOMEM;
262 
263 	/* Calculate highest address. */
264 	high_addr -= after_gap + sz;
265 	high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset);
266 	if (high_addr > tmp)	/* Overflow during alignment. */
267 		return ENOMEM;
268 	if (low_addr > high_addr)
269 		return ENOMEM;
270 
271 	*min_result = low_addr;
272 	*max_result = high_addr;
273 	return 0;
274 }
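
/*
 * Worked example (illustrative numbers, 4 KB pages): for a free range
 * 0x1000000-0x1010000 with sz = 0x4000, align = 0x2000, offset = 0 and
 * before_gap = after_gap = 0x1000, uvm_addr_fitspace() computes
 *
 *	low:  0x1000000 + 0x1000, aligned forward to 0x1002000
 *	high: 0x1010000 - 0x1000 - 0x4000 = 0x100b000, aligned back to 0x100a000
 *
 * and returns 0 with *min_result = 0x1002000, *max_result = 0x100a000:
 * the allocation may start at any align-satisfying address between those
 * bounds.
 */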
275 
276 
277 /*
278  * Initialize uvm_addr.
279  */
280 void
281 uvm_addr_init(void)
282 {
283 	pool_init(&uaddr_pool, sizeof(struct uvm_addr_state),
284 	    0, 0, 0, "uaddr", &pool_allocator_nointr);
285 	pool_init(&uaddr_hint_pool, sizeof(struct uaddr_hint_state),
286 	    0, 0, 0, "uaddrhint", &pool_allocator_nointr);
287 	pool_init(&uaddr_bestfit_pool, sizeof(struct uaddr_bestfit_state),
288 	    0, 0, 0, "uaddrbest", &pool_allocator_nointr);
289 	pool_init(&uaddr_pivot_pool, sizeof(struct uaddr_pivot_state),
290 	    0, 0, 0, "uaddrpivot", &pool_allocator_nointr);
291 	pool_init(&uaddr_rnd_pool, sizeof(struct uaddr_rnd_state),
292 	    0, 0, 0, "uaddrrnd", &pool_allocator_nointr);
293 
294 	uaddr_kbootstrap.uaddr_minaddr = PAGE_SIZE;
295 	uaddr_kbootstrap.uaddr_maxaddr = -(vaddr_t)PAGE_SIZE;
296 	uaddr_kbootstrap.uaddr_functions = &uaddr_kernel_functions;
297 }
298 
299 /*
300  * Invoke destructor function of uaddr.
301  */
302 void
303 uvm_addr_destroy(struct uvm_addr_state *uaddr)
304 {
305 	if (uaddr)
306 		(*uaddr->uaddr_functions->uaddr_destroy)(uaddr);
307 }
308 
309 /*
310  * Move address forward to satisfy align, offset.
311  */
312 vaddr_t
313 uvm_addr_align(vaddr_t addr, vaddr_t align, vaddr_t offset)
314 {
315 	vaddr_t result = (addr & ~(align - 1)) + offset;
316 	if (result < addr)
317 		result += align;
318 	return result;
319 }
320 
321 /*
322  * Move address backwards to satisfy align, offset.
323  */
324 vaddr_t
325 uvm_addr_align_back(vaddr_t addr, vaddr_t align, vaddr_t offset)
326 {
327 	vaddr_t result = (addr & ~(align - 1)) + offset;
328 	if (result > addr)
329 		result -= align;
330 	return result;
331 }
332 
333 /*
334  * Directional first fit.
335  *
336  * Do a linear search for free space, starting at addr in entry.
337  * direction ==  1: search forward
338  * direction == -1: search backward
339  *
340  * Output: low <= addr <= high and entry will contain addr.
341  * Returns 0 on success, ENOMEM if no space is available.
342  *
343  * before_gap/after_gap: free space required before/after the allocation.
344  */
345 int
346 uvm_addr_linsearch(struct vm_map *map, struct uvm_addr_state *uaddr,
347     struct vm_map_entry **entry_out, vaddr_t *addr_out,
348     vaddr_t hint, vsize_t sz, vaddr_t align, vaddr_t offset,
349     int direction, vaddr_t low, vaddr_t high,
350     vsize_t before_gap, vsize_t after_gap)
351 {
352 	struct vm_map_entry	*entry;
353 	vaddr_t			 low_addr, high_addr;
354 
355 	KASSERT(entry_out != NULL && addr_out != NULL);
356 	KASSERT(direction == -1 || direction == 1);
357 	KASSERT((hint & PAGE_MASK) == 0 && (high & PAGE_MASK) == 0 &&
358 	    (low & PAGE_MASK) == 0 &&
359 	    (before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0);
360 	KASSERT(high + sz > high); /* Check for overflow. */
361 
362 	/* Hint magic. */
363 	if (hint == 0)
364 		hint = (direction == 1 ? low : high);
365 	else if (hint > high) {
366 		if (direction != -1)
367 			return ENOMEM;
368 		hint = high;
369 	} else if (hint < low) {
370 		if (direction != 1)
371 			return ENOMEM;
372 		hint = low;
373 	}
374 
375 	for (entry = uvm_map_entrybyaddr(&map->addr,
376 	    hint - (direction == -1 ? 1 : 0)); entry != NULL;
377 	    entry = (direction == 1 ?
378 	    RB_NEXT(uvm_map_addr, &map->addr, entry) :
379 	    RB_PREV(uvm_map_addr, &map->addr, entry))) {
380 		if (VMMAP_FREE_START(entry) > high ||
381 		    VMMAP_FREE_END(entry) < low) {
382 			break;
383 		}
384 
385 		if (uvm_addr_fitspace(&low_addr, &high_addr,
386 		    MAX(low, VMMAP_FREE_START(entry)),
387 		    MIN(high, VMMAP_FREE_END(entry)),
388 		    sz, align, offset, before_gap, after_gap) == 0) {
389 			*entry_out = entry;
390 			if (hint >= low_addr && hint <= high_addr) {
391 				*addr_out = hint;
392 			} else {
393 				*addr_out = (direction == 1 ?
394 				    low_addr : high_addr);
395 			}
396 			return 0;
397 		}
398 	}
399 
400 	return ENOMEM;
401 }
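
/*
 * Illustrative call (a sketch with placeholder arguments, mirroring
 * uaddr_lin_select() later in this file): a forward first-fit search over
 * [minaddr, maxaddr - sz] with a one-page guard after the allocation would
 * look like
 *
 *	error = uvm_addr_linsearch(map, uaddr, &entry, &addr,
 *	    0, sz, align, offset, 1, minaddr, maxaddr - sz,
 *	    0, PAGE_SIZE);
 *
 * On success (error == 0), entry contains addr and
 * minaddr <= addr <= maxaddr - sz.
 */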
402 
403 /*
404  * Invoke address selector of uaddr.
405  * uaddr may be NULL, in which case the algorithm will fail with ENOMEM.
406  *
407  * Will invoke uvm_map_isavail to fill in last_out.
408  */
409 int
410 uvm_addr_invoke(struct vm_map *map, struct uvm_addr_state *uaddr,
411     struct vm_map_entry **entry_out, struct vm_map_entry **last_out,
412     vaddr_t *addr_out,
413     vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
414 {
415 	int error;
416 
417 	if (uaddr == NULL)
418 		return ENOMEM;
419 
420 	hint &= ~((vaddr_t)PAGE_MASK);
421 	if (hint != 0 &&
422 	    !(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr))
423 		return ENOMEM;
424 
425 	error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr,
426 	    entry_out, addr_out, sz, align, offset, prot, hint);
427 
428 	if (error == 0) {
429 		KASSERT(*entry_out != NULL);
430 		*last_out = NULL;
431 		if (!uvm_map_isavail(map, uaddr, entry_out, last_out,
432 		    *addr_out, sz)) {
433 			panic("uvm_addr_invoke: address selector %p "
434 			    "(%s 0x%lx-0x%lx) "
435 			    "returned unavailable address 0x%lx",
436 			    uaddr, uaddr->uaddr_functions->uaddr_name,
437 			    uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr,
438 			    *addr_out);
439 		}
440 	}
441 
442 	return error;
443 }
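
/*
 * Illustrative use (a sketch with placeholder variables; the map code in
 * uvm_map.c is the usual caller): the caller asks the selector for sz
 * bytes and, on success, gets back the address plus the entries that
 * bracket the free span:
 *
 *	struct vm_map_entry *first, *last;
 *	vaddr_t addr;
 *
 *	if (uvm_addr_invoke(map, uaddr, &first, &last, &addr,
 *	    sz, align, offset, prot, 0) == 0) {
 *		... [addr, addr + sz) is free within the selected entry ...
 *	}
 */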
444 
445 #if defined(DEBUG) || defined(DDB)
446 void
447 uvm_addr_print(struct uvm_addr_state *uaddr, const char *slot, boolean_t full,
448     int (*pr)(const char *, ...))
449 {
450 	if (uaddr == NULL) {
451 		(*pr)("- uvm_addr %s: NULL\n", slot);
452 		return;
453 	}
454 
455 	(*pr)("- uvm_addr %s: %p (%s 0x%lx-0x%lx)\n", slot, uaddr,
456 	    uaddr->uaddr_functions->uaddr_name,
457 	    uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr);
458 	if (uaddr->uaddr_functions->uaddr_print == NULL)
459 		return;
460 
461 	(*uaddr->uaddr_functions->uaddr_print)(uaddr, full, pr);
462 }
463 #endif /* DEBUG || DDB */
464 
465 /*
466  * Destroy a uvm_addr_state structure.
467  * The uaddr must have been previously allocated from uaddr_state_pool.
468  */
469 void
470 uaddr_destroy(struct uvm_addr_state *uaddr)
471 {
472 	pool_put(&uaddr_pool, uaddr);
473 }
474 
475 
476 /*
477  * Linear allocator.
478  * This allocator uses a first-fit algorithm.
479  *
480  * If hint is set, search will start at the hint position.
481  * Only searches forward.
482  */
483 const struct uvm_addr_functions uaddr_lin_functions = {
484 	.uaddr_select = &uaddr_lin_select,
485 	.uaddr_destroy = &uaddr_destroy,
486 	.uaddr_name = "uaddr_lin"
487 };
488 
489 struct uvm_addr_state *
490 uaddr_lin_create(vaddr_t minaddr, vaddr_t maxaddr)
491 {
492 	struct uvm_addr_state *uaddr;
493 
494 	uaddr = pool_get(&uaddr_pool, PR_WAITOK);
495 	uaddr->uaddr_minaddr = minaddr;
496 	uaddr->uaddr_maxaddr = maxaddr;
497 	uaddr->uaddr_functions = &uaddr_lin_functions;
498 	return uaddr;
499 }
500 
501 int
502 uaddr_lin_select(struct vm_map *map, struct uvm_addr_state *uaddr,
503     struct vm_map_entry **entry_out, vaddr_t *addr_out,
504     vsize_t sz, vaddr_t align, vaddr_t offset,
505     vm_prot_t prot, vaddr_t hint)
506 {
507 	vaddr_t guard_sz;
508 
509 	/* Deal with guardpages: search for space with one extra page. */
510 	guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
511 
512 	if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr < sz + guard_sz)
513 		return ENOMEM;
514 	return uvm_addr_linsearch(map, uaddr, entry_out, addr_out, 0, sz,
515 	    align, offset, 1, uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr - sz,
516 	    0, guard_sz);
517 }
518 
519 
520 /*
521  * Randomized allocator.
522  * This allocator uses uvm_map_hint to acquire a random address and searches
523  * from there.
524  */
525 
526 const struct uvm_addr_functions uaddr_rnd_functions = {
527 	.uaddr_select = &uaddr_rnd_select,
528 	.uaddr_free_insert = &uaddr_rnd_insert,
529 	.uaddr_free_remove = &uaddr_rnd_remove,
530 	.uaddr_destroy = &uaddr_rnd_destroy,
531 #if defined(DEBUG) || defined(DDB)
532 	.uaddr_print = &uaddr_rnd_print,
533 #endif /* DEBUG || DDB */
534 	.uaddr_name = "uaddr_rnd"
535 };
536 
537 struct uvm_addr_state *
538 uaddr_rnd_create(vaddr_t minaddr, vaddr_t maxaddr)
539 {
540 	struct uaddr_rnd_state *uaddr;
541 
542 	uaddr = pool_get(&uaddr_rnd_pool, PR_WAITOK);
543 	uaddr->ur_uaddr.uaddr_minaddr = minaddr;
544 	uaddr->ur_uaddr.uaddr_maxaddr = maxaddr;
545 	uaddr->ur_uaddr.uaddr_functions = &uaddr_rnd_functions;
546 	TAILQ_INIT(&uaddr->ur_free);
547 	return &uaddr->ur_uaddr;
548 }
549 
550 int
551 uaddr_rnd_select(struct vm_map *map, struct uvm_addr_state *uaddr,
552     struct vm_map_entry **entry_out, vaddr_t *addr_out,
553     vsize_t sz, vaddr_t align, vaddr_t offset,
554     vm_prot_t prot, vaddr_t hint)
555 {
556 	struct vmspace		*vm;
557 	vaddr_t			 guard_sz;
558 	vaddr_t			 low_addr, high_addr;
559 	struct vm_map_entry	*entry, *next;
560 	vsize_t			 before_gap, after_gap;
561 	vaddr_t			 tmp;
562 
563 	KASSERT((map->flags & VM_MAP_ISVMSPACE) != 0);
564 	vm = (struct vmspace *)map;
565 
566 	/* Deal with guardpages: search for space with one extra page. */
567 	guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE);
568 
569 	/* Quick fail if the allocation won't fit. */
570 	if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr < sz + guard_sz)
571 		return ENOMEM;
572 
573 	/* Select a hint. */
574 	if (hint == 0)
575 		hint = uvm_map_hint(vm, prot);
576 	/* Clamp hint to uaddr range. */
577 	hint = MIN(MAX(hint, uaddr->uaddr_minaddr),
578 	    uaddr->uaddr_maxaddr - sz - guard_sz);
579 
580 	/* Align hint to align,offset parameters. */
581 	tmp = hint;
582 	hint = uvm_addr_align_forward(tmp, align, offset);
583 	/* Check for overflow during alignment. */
584 	if (hint < tmp || hint > uaddr->uaddr_maxaddr - sz - guard_sz)
585 		return ENOMEM; /* Compatibility mode: never look backwards. */
586 
587 	before_gap = 0;
588 	after_gap = guard_sz;
589 	hint -= MIN(hint, before_gap);
590 
591 	/*
592 	 * Use the augmented address tree to look up the first entry
593 	 * at or after hint with sufficient space.
594 	 *
595 	 * This is the original optimized code; it can fail when the subtree
596 	 * it looks at has sufficient space but cannot satisfy the align
597 	 * constraint.
598 	 *
599 	 * Guard: subtree is not exhausted and max(fspace) >= required.
600 	 */
601 	entry = uvm_map_entrybyaddr(&map->addr, hint);
602 
603 	/* Walk up the tree, until there is at least sufficient space. */
604 	while (entry != NULL &&
605 	    entry->fspace_augment < before_gap + after_gap + sz)
606 		entry = RB_PARENT(entry, daddrs.addr_entry);
607 
608 	while (entry != NULL) {
609 		/* Test if this fits. */
610 		if (VMMAP_FREE_END(entry) > hint &&
611 		    uvm_map_uaddr_e(map, entry) == uaddr &&
612 		    uvm_addr_fitspace(&low_addr, &high_addr,
613 		    MAX(uaddr->uaddr_minaddr, VMMAP_FREE_START(entry)),
614 		    MIN(uaddr->uaddr_maxaddr, VMMAP_FREE_END(entry)),
615 		    sz, align, offset, before_gap, after_gap) == 0) {
616 			*entry_out = entry;
617 			if (hint >= low_addr && hint <= high_addr)
618 				*addr_out = hint;
619 			else
620 				*addr_out = low_addr;
621 			return 0;
622 		}
623 
624 		/* RB_NEXT, but skip subtrees that cannot possibly fit. */
625 		next = RB_RIGHT(entry, daddrs.addr_entry);
626 		if (next != NULL &&
627 		    next->fspace_augment >= before_gap + after_gap + sz) {
628 			entry = next;
629 			while ((next = RB_LEFT(entry, daddrs.addr_entry)) !=
630 			    NULL)
631 				entry = next;
632 		} else {
633 do_parent:
634 			next = RB_PARENT(entry, daddrs.addr_entry);
635 			if (next == NULL)
636 				entry = NULL;
637 			else if (RB_LEFT(next, daddrs.addr_entry) == entry)
638 				entry = next;
639 			else {
640 				entry = next;
641 				goto do_parent;
642 			}
643 		}
644 	}
645 
646 	/* Lookup failed. */
647 	return ENOMEM;
648 }
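
/*
 * Illustrative walk (example values): if sz + gaps is 0x6000 and the entry
 * found at hint has fspace_augment 0x3000, the loop above first climbs to
 * an ancestor whose fspace_augment is at least 0x6000, then continues an
 * in-order traversal from there, descending into a right subtree only if
 * that subtree's fspace_augment is also at least 0x6000.
 */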
649 
650 /*
651  * Destroy a uaddr_rnd_state structure.
652  */
653 void
654 uaddr_rnd_destroy(struct uvm_addr_state *uaddr)
655 {
656 	pool_put(&uaddr_rnd_pool, uaddr);
657 }
658 
659 /*
660  * Add-entry hook; the free tailq is not maintained, so this is a no-op.
661  */
662 void
663 uaddr_rnd_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
664     struct vm_map_entry *entry)
665 {
666 	return;
667 }
668 
669 /*
670  * Remove-entry hook; the free tailq is not maintained, so this is a no-op.
671  */
672 void
673 uaddr_rnd_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
674     struct vm_map_entry *entry)
675 {
676 	return;
677 }
678 
679 #if defined(DEBUG) || defined(DDB)
680 void
681 uaddr_rnd_print(struct uvm_addr_state *uaddr_p, boolean_t full,
682     int (*pr)(const char*, ...))
683 {
684 	struct vm_map_entry	*entry;
685 	struct uaddr_rnd_state	*uaddr;
686 	vaddr_t			 addr;
687 	size_t			 count;
688 	vsize_t			 space;
689 
690 	uaddr = (struct uaddr_rnd_state *)uaddr_p;
691 	addr = 0;
692 	count = 0;
693 	space = 0;
694 	TAILQ_FOREACH(entry, &uaddr->ur_free, dfree.tailq) {
695 		count++;
696 		space += entry->fspace;
697 
698 		if (full) {
699 			(*pr)("\tentry %p: 0x%lx-0x%lx G=0x%lx F=0x%lx\n",
700 			    entry, entry->start, entry->end,
701 			    entry->guard, entry->fspace);
702 			(*pr)("\t\tfree: 0x%lx-0x%lx\n",
703 			    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
704 		}
705 		if (entry->start < addr) {
706 			if (!full)
707 				(*pr)("\tentry %p: 0x%lx-0x%lx "
708 				    "G=0x%lx F=0x%lx\n",
709 				    entry, entry->start, entry->end,
710 				    entry->guard, entry->fspace);
711 			(*pr)("\t\tstart=0x%lx, expected at least 0x%lx\n",
712 			    entry->start, addr);
713 		}
714 
715 		addr = VMMAP_FREE_END(entry);
716 	}
717 	(*pr)("\t%lu entries, 0x%lx free bytes\n", count, space);
718 }
719 #endif /* DEBUG || DDB */
720 
721 
722 /*
723  * An allocator that selects an address within distance of the hint.
724  *
725  * If no hint is given, the allocator refuses to allocate.
726  */
727 const struct uvm_addr_functions uaddr_hint_functions = {
728 	.uaddr_select = &uaddr_hint_select,
729 	.uaddr_destroy = &uaddr_hint_destroy,
730 	.uaddr_name = "uaddr_hint"
731 };
732 
733 /*
734  * Create uaddr_hint state.
735  */
736 struct uvm_addr_state *
737 uaddr_hint_create(vaddr_t minaddr, vaddr_t maxaddr, vsize_t max_dist)
738 {
739 	struct uaddr_hint_state *ua_hint;
740 
741 	KASSERT(uaddr_hint_pool.pr_size == sizeof(*ua_hint));
742 
743 	ua_hint = pool_get(&uaddr_hint_pool, PR_WAITOK);
744 	ua_hint->uaddr.uaddr_minaddr = minaddr;
745 	ua_hint->uaddr.uaddr_maxaddr = maxaddr;
746 	ua_hint->uaddr.uaddr_functions = &uaddr_hint_functions;
747 	ua_hint->max_dist = max_dist;
748 	return &ua_hint->uaddr;
749 }
750 
751 /*
752  * Destroy uaddr_hint state.
753  */
754 void
755 uaddr_hint_destroy(struct uvm_addr_state *uaddr)
756 {
757 	pool_put(&uaddr_hint_pool, uaddr);
758 }
759 
760 /*
761  * Hint selector.
762  *
763  * Attempts to find an address that is within max_dist of the hint.
764  */
765 int
766 uaddr_hint_select(struct vm_map *map, struct uvm_addr_state *uaddr_param,
767     struct vm_map_entry **entry_out, vaddr_t *addr_out,
768     vsize_t sz, vaddr_t align, vaddr_t offset,
769     vm_prot_t prot, vaddr_t hint)
770 {
771 	struct uaddr_hint_state	*uaddr =
772 	    (struct uaddr_hint_state *)uaddr_param;
773 	vsize_t			 before_gap, after_gap;
774 	vaddr_t			 low, high;
775 	int			 dir;
776 
777 	if (hint == 0)
778 		return ENOMEM;
779 
780 	/* Calculate upper and lower bound for selected address. */
781 	high = hint + uaddr->max_dist;
782 	if (high < hint)	/* overflow */
783 		high = map->max_offset;
784 	high = MIN(high, uaddr->uaddr.uaddr_maxaddr);
785 	if (high < sz)
786 		return ENOMEM;	/* Protect against underflow. */
787 	high -= sz;
788 
789 	/* Calculate lower bound for selected address. */
790 	low = hint - uaddr->max_dist;
791 	if (low > hint)		/* underflow */
792 		low = map->min_offset;
793 	low = MAX(low, uaddr->uaddr.uaddr_minaddr);
794 
795 	/* Search strategy setup. */
796 	before_gap = PAGE_SIZE +
797 	    (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK);
798 	after_gap = PAGE_SIZE +
799 	    (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK);
800 	dir = (arc4random() & 0x01) ? 1 : -1;
801 
802 	/*
803 	 * Try to search:
804 	 * - forward,  with gap
805 	 * - backward, with gap
806 	 * - forward,  without gap
807 	 * - backward, without gap
808 	 * (Where forward is in the direction specified by dir and
809 	 * backward is in the direction specified by -dir).
810 	 */
811 	if (uvm_addr_linsearch(map, uaddr_param,
812 	    entry_out, addr_out, hint, sz, align, offset,
813 	    dir, low, high, before_gap, after_gap) == 0)
814 		return 0;
815 	if (uvm_addr_linsearch(map, uaddr_param,
816 	    entry_out, addr_out, hint, sz, align, offset,
817 	    -dir, low, high, before_gap, after_gap) == 0)
818 		return 0;
819 
820 	if (uvm_addr_linsearch(map, uaddr_param,
821 	    entry_out, addr_out, hint, sz, align, offset,
822 	    dir, low, high, 0, 0) == 0)
823 		return 0;
824 	if (uvm_addr_linsearch(map, uaddr_param,
825 	    entry_out, addr_out, hint, sz, align, offset,
826 	    -dir, low, high, 0, 0) == 0)
827 		return 0;
828 
829 	return ENOMEM;
830 }
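
/*
 * Worked example of the gap randomization above (assuming 4 KB pages):
 * UADDR_HINT_MAXGAP is 4 * PAGE_SIZE, so
 * arc4random_uniform(UADDR_HINT_MAXGAP) & ~PAGE_MASK yields 0x0000,
 * 0x1000, 0x2000 or 0x3000 with equal probability; adding PAGE_SIZE makes
 * each of before_gap and after_gap one to four pages.
 */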
831 
832 /*
833  * Kernel allocation bootstrap logic.
834  */
835 const struct uvm_addr_functions uaddr_kernel_functions = {
836 	.uaddr_select = &uaddr_kbootstrap_select,
837 	.uaddr_destroy = &uaddr_kbootstrap_destroy,
838 	.uaddr_name = "uaddr_kbootstrap"
839 };
840 
841 /*
842  * Select an address from the map.
843  *
844  * This function ignores the uaddr spec and instead uses the map directly.
845  * Because of that property, the uaddr algorithm can be shared across all
846  * kernel maps.
847  */
848 int
849 uaddr_kbootstrap_select(struct vm_map *map, struct uvm_addr_state *uaddr,
850     struct vm_map_entry **entry_out, vaddr_t *addr_out,
851     vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint)
852 {
853 	vaddr_t tmp;
854 
855 	RB_FOREACH(*entry_out, uvm_map_addr, &map->addr) {
856 		if (VMMAP_FREE_END(*entry_out) <= uvm_maxkaddr &&
857 		    uvm_addr_fitspace(addr_out, &tmp,
858 		    VMMAP_FREE_START(*entry_out), VMMAP_FREE_END(*entry_out),
859 		    sz, align, offset, 0, 0) == 0)
860 			return 0;
861 	}
862 
863 	return ENOMEM;
864 }
865 
866 /*
867  * Don't destroy the kernel bootstrap allocator.
868  */
869 void
870 uaddr_kbootstrap_destroy(struct uvm_addr_state *uaddr)
871 {
872 	KASSERT(uaddr == (struct uvm_addr_state *)&uaddr_kbootstrap);
873 }
874 
875 #ifndef SMALL_KERNEL
876 /*
877  * Best fit algorithm.
878  */
879 
880 const struct uvm_addr_functions uaddr_bestfit_functions = {
881 	.uaddr_select = &uaddr_bestfit_select,
882 	.uaddr_free_insert = &uaddr_bestfit_insert,
883 	.uaddr_free_remove = &uaddr_bestfit_remove,
884 	.uaddr_destroy = &uaddr_bestfit_destroy,
885 	.uaddr_name = "uaddr_bestfit"
886 };
887 
888 struct uvm_addr_state *
889 uaddr_bestfit_create(vaddr_t minaddr, vaddr_t maxaddr)
890 {
891 	struct uaddr_bestfit_state *uaddr;
892 
893 	uaddr = pool_get(&uaddr_bestfit_pool, PR_WAITOK);
894 	uaddr->ubf_uaddr.uaddr_minaddr = minaddr;
895 	uaddr->ubf_uaddr.uaddr_maxaddr = maxaddr;
896 	uaddr->ubf_uaddr.uaddr_functions = &uaddr_bestfit_functions;
897 	RB_INIT(&uaddr->ubf_free);
898 	return &uaddr->ubf_uaddr;
899 }
900 
901 void
902 uaddr_bestfit_destroy(struct uvm_addr_state *uaddr)
903 {
904 	pool_put(&uaddr_bestfit_pool, uaddr);
905 }
906 
907 void
908 uaddr_bestfit_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
909     struct vm_map_entry *entry)
910 {
911 	struct uaddr_bestfit_state	*uaddr;
912 	struct vm_map_entry		*rb_rv;
913 
914 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
915 	if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->ubf_free, entry)) !=
916 	    NULL) {
917 		panic("%s: duplicate insertion: state %p "
918 		    "interting %p, colliding with %p", __func__,
919 		    uaddr, entry, rb_rv);
920 	}
921 }
922 
923 void
924 uaddr_bestfit_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
925     struct vm_map_entry *entry)
926 {
927 	struct uaddr_bestfit_state	*uaddr;
928 
929 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
930 	if (RB_REMOVE(uaddr_free_rbtree, &uaddr->ubf_free, entry) != entry)
931 		panic("%s: entry was not in tree", __func__);
932 }
933 
934 int
935 uaddr_bestfit_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
936     struct vm_map_entry **entry_out, vaddr_t *addr_out,
937     vsize_t sz, vaddr_t align, vaddr_t offset,
938     vm_prot_t prot, vaddr_t hint)
939 {
940 	vaddr_t				 min, max;
941 	struct uaddr_bestfit_state	*uaddr;
942 	struct vm_map_entry		*entry;
943 	vsize_t				 guardsz;
944 
945 	uaddr = (struct uaddr_bestfit_state *)uaddr_p;
946 	guardsz = ((map->flags & VM_MAP_GUARDPAGES) ? PAGE_SIZE : 0);
947 
948 	/*
949 	 * Find the smallest free entry capable of holding the allocation.
950 	 * Deal with guardpages: search for space with one extra page.
951 	 */
952 	entry = uvm_addr_entrybyspace(&uaddr->ubf_free, sz + guardsz);
953 	if (entry == NULL)
954 		return ENOMEM;
955 
956 	/* Walk the tree until we find an entry that fits.  */
957 	while (uvm_addr_fitspace(&min, &max,
958 	    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
959 	    sz, align, offset, 0, guardsz) != 0) {
960 		entry = RB_NEXT(uaddr_free_rbtree, &uaddr->ubf_free, entry);
961 		if (entry == NULL)
962 			return ENOMEM;
963 	}
964 
965 	/* Return the address that generates the least fragmentation. */
966 	*entry_out = entry;
967 	*addr_out = (min - VMMAP_FREE_START(entry) <=
968 	    VMMAP_FREE_END(entry) - guardsz - sz - max ?
969 	    min : max);
970 	return 0;
971 }
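
/*
 * Worked example of the tie-break above (illustrative numbers): for a
 * free range 0x1000-0xa000, sz = 0x2000, align = 0x4000 and no guard
 * page, uvm_addr_fitspace() yields min = 0x4000 and max = 0x8000.
 * Allocating at min would leave holes of 0x3000 below and 0x4000 above;
 * allocating at max leaves a single 0x7000 hole below.  Since
 * min - FREE_START (0x3000) exceeds FREE_END - sz - max (0), the code
 * picks max, keeping the remaining free space contiguous.
 */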
972 #endif /* !SMALL_KERNEL */
973 
974 
975 #ifndef SMALL_KERNEL
976 /*
977  * A userspace allocator based on pivots.
978  */
979 
980 const struct uvm_addr_functions uaddr_pivot_functions = {
981 	.uaddr_select = &uaddr_pivot_select,
982 	.uaddr_free_insert = &uaddr_pivot_insert,
983 	.uaddr_free_remove = &uaddr_pivot_remove,
984 	.uaddr_destroy = &uaddr_pivot_destroy,
985 #if defined(DEBUG) || defined(DDB)
986 	.uaddr_print = &uaddr_pivot_print,
987 #endif /* DEBUG || DDB */
988 	.uaddr_name = "uaddr_pivot"
989 };
990 
991 /*
992  * A special random function for pivots.
993  *
994  * This function will return:
995  * - a random number
996  * - a multiple of PAGE_SIZE
997  * - at least PAGE_SIZE
998  *
999  * The random function has a slightly higher chance of returning a small number.
1000  */
1001 vsize_t
1002 uaddr_pivot_random(void)
1003 {
1004 	int			r;
1005 
1006 	/*
1007 	 * The sum of two dice has a triangular, roughly bell-shaped distribution.
1008 	 * We map the highest probable number to 1, by folding the curve
1009 	 * (think of a graph on a piece of paper, that you fold).
1010 	 *
1011 	 * Because the fold happens at PIVOT_RND - 1, small values of r are
1012 	 * by far the most likely outcomes.
1013 	 */
1014 	r = arc4random_uniform(PIVOT_RND) + arc4random_uniform(PIVOT_RND) -
1015 	    (PIVOT_RND - 1);
1016 	if (r < 0)
1017 		r = -r;
1018 
1019 	/*
1020 	 * Make the returned value at least PAGE_SIZE and a multiple of
1021 	 * PAGE_SIZE.
1022 	 */
1023 	return (vaddr_t)(1 + r) << PAGE_SHIFT;
1024 }
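
/*
 * Resulting distribution (worked out for PIVOT_RND == 8): the gap is
 * 1..8 pages with weights 8, 14, 12, 10, 8, 6, 4, 2 out of 64, so a gap
 * of one or two pages comes back roughly a third of the time and large
 * gaps are rare.
 */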
1025 
1026 /*
1027  * Select a new pivot.
1028  *
1029  * A pivot must:
1030  * - be chosen randomly
1031  * - have a randomly chosen gap before it, where the uaddr_state starts
1032  * - have a randomly chosen gap after it, before the uaddr_state ends
1033  *
1034  * Furthermore, the pivot must provide sufficient space for the allocation.
1035  * The addr will be set to the selected address.
1036  *
1037  * Returns ENOMEM on failure.
1038  */
1039 int
1040 uaddr_pivot_newpivot(struct vm_map *map, struct uaddr_pivot_state *uaddr,
1041     struct uaddr_pivot *pivot,
1042     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1043     vsize_t sz, vaddr_t align, vaddr_t offset,
1044     vsize_t before_gap, vsize_t after_gap)
1045 {
1046 	struct vm_map_entry		*entry, *found;
1047 	vaddr_t				 minaddr, maxaddr;
1048 	vsize_t				 dist;
1049 	vaddr_t				 found_minaddr, found_maxaddr;
1050 	vaddr_t				 min, max;
1051 	vsize_t				 arc4_arg;
1052 	int				 fit_error;
1053 	u_int32_t			 path;
1054 
1055 	minaddr = uaddr->up_uaddr.uaddr_minaddr;
1056 	maxaddr = uaddr->up_uaddr.uaddr_maxaddr;
1057 	KASSERT(minaddr < maxaddr);
1058 #ifdef DIAGNOSTIC
1059 	if (minaddr + 2 * PAGE_SIZE > maxaddr) {
1060 		panic("uaddr_pivot_newpivot: cannot grant random pivot "
1061 		    "in area less than 2 pages (size = 0x%lx)",
1062 		    maxaddr - minaddr);
1063 	}
1064 #endif /* DIAGNOSTIC */
1065 
1066 	/*
1067 	 * Gap calculation: 1/32 of the size of the managed area.
1068 	 *
1069 	 * At most: sufficient to not get truncated at arc4random.
1070 	 * At least: 2 PAGE_SIZE
1071 	 *
1072 	 * minaddr and maxaddr will be changed according to arc4random.
1073 	 */
1074 	dist = MAX((maxaddr - minaddr) / 32, 2 * (vaddr_t)PAGE_SIZE);
1075 	if (dist >> PAGE_SHIFT > 0xffffffff) {
1076 		minaddr += (vsize_t)arc4random() << PAGE_SHIFT;
1077 		maxaddr -= (vsize_t)arc4random() << PAGE_SHIFT;
1078 	} else {
1079 		minaddr += (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
1080 		    PAGE_SHIFT;
1081 		maxaddr -= (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) <<
1082 		    PAGE_SHIFT;
1083 	}
1084 
1085 	/*
1086 	 * A very fast way to find an entry that will be large enough
1087 	 * to hold the allocation, but still is found more or less
1088 	 * randomly: the tree path selector has a 50% chance to go for
1089 	 * a bigger or smaller entry.
1090 	 *
1091 	 * Note that the memory may actually be available,
1092 	 * but the fragmentation may be so bad and the gaps chosen
1093 	 * so unfortunately, that the allocation will not succeed.
1094 	 * Or the alignment can only be satisfied by an entry that
1095 	 * is not visited in the randomly selected path.
1096 	 *
1097 	 * This code finds an entry with sufficient space in O(log n) time.
1098 	 */
1099 	path = arc4random();
1100 	found = NULL;
1101 	entry = RB_ROOT(&uaddr->up_free);
1102 	while (entry != NULL) {
1103 		fit_error = uvm_addr_fitspace(&min, &max,
1104 		    MAX(VMMAP_FREE_START(entry), minaddr),
1105 		    MIN(VMMAP_FREE_END(entry), maxaddr),
1106 		    sz, align, offset, before_gap, after_gap);
1107 
1108 		/* It fits, save this entry. */
1109 		if (fit_error == 0) {
1110 			found = entry;
1111 			found_minaddr = min;
1112 			found_maxaddr = max;
1113 		}
1114 
1115 		/* Next. */
1116 		if (fit_error != 0)
1117 			entry = RB_RIGHT(entry, dfree.rbtree);
1118 		else if	((path & 0x1) == 0) {
1119 			path >>= 1;
1120 			entry = RB_RIGHT(entry, dfree.rbtree);
1121 		} else {
1122 			path >>= 1;
1123 			entry = RB_LEFT(entry, dfree.rbtree);
1124 		}
1125 	}
1126 	if (found == NULL)
1127 		return ENOMEM;	/* No sufficiently large region found. */
1128 
1129 	/*
1130 	 * Calculate a random address within found.
1131 	 *
1132 	 * found_minaddr and found_maxaddr are already aligned, so be sure
1133 	 * to select a multiple of align as the offset in the entry.
1134 	 * Preferably, arc4random_uniform is used to provide no bias within
1135 	 * the entry.
1136 	 * However, if the size of the entry exceeds arc4random_uniform's
1137 	 * argument limit, we simply use arc4random (thus limiting ourselves
1138 	 * to an offset of at most 4 GB).
1139 	 */
1140 	if (found_maxaddr == found_minaddr)
1141 		*addr_out = found_minaddr;
1142 	else {
1143 		KASSERT(align >= PAGE_SIZE && (align & (align - 1)) == 0);
1144 		arc4_arg = found_maxaddr - found_minaddr;
1145 		if (arc4_arg > 0xffffffff) {
1146 			*addr_out = found_minaddr +
1147 			    (arc4random() & (align - 1));
1148 		} else {
1149 			*addr_out = found_minaddr +
1150 			    (arc4random_uniform(arc4_arg) & ~(align - 1));
1151 		}
1152 	}
1153 	/* Address was found in this entry. */
1154 	*entry_out = found;
1155 
1156 	/*
1157 	 * Set up new pivot and return selected address.
1158 	 *
1159 	 * Depending on the direction of the pivot, the pivot must be placed
1160 	 * at the bottom or the top of the allocation:
1161 	 * - if the pivot moves upwards, place the pivot at the top of the
1162 	 *   allocation,
1163 	 * - if the pivot moves downwards, place the pivot at the bottom
1164 	 *   of the allocation.
1165 	 */
1166 	pivot->entry = found;
1167 	pivot->dir = (arc4random() & 0x1 ? 1 : -1);
1168 	if (pivot->dir > 0)
1169 		pivot->addr = *addr_out + sz;
1170 	else
1171 		pivot->addr = *addr_out;
1172 	pivot->expire = PIVOT_EXPIRE - 1; /* First use is right now. */
1173 	return 0;
1174 }
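
/*
 * Worked example of the clamping above (illustrative numbers, 4 KB
 * pages): for a 512 MB managed range, dist = 512 MB / 32 = 16 MB, so
 * dist >> PAGE_SHIFT = 4096 and each of minaddr and maxaddr is moved
 * inward by arc4random_uniform(4096) pages, i.e. anywhere from 0 to just
 * under 16 MB.  The 32-bit path value then supplies up to 32 random
 * left/right decisions for the descent through the free tree.
 */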
1175 
1176 /*
1177  * Pivot selector.
1178  *
1179  * Each time the selector is invoked, it will select a random pivot, which
1180  * it will use to select memory with. The memory will be placed at the pivot,
1181  * with a randomly sized gap between the allocation and the pivot.
1182  * The pivot will then move so it will never revisit this address.
1183  *
1184  * Each allocation, the pivot expiry timer ticks. Once the pivot becomes
1185  * expired, it will be replaced with a newly created pivot. Pivots also
1186  * automatically expire if they fail to provide memory for an allocation.
1187  *
1188  * Expired pivots are replaced using the uaddr_pivot_newpivot() function,
1189  * which will ensure the pivot points at memory in such a way that the
1190  * allocation will succeed.
1191  * As an added bonus, the uaddr_pivot_newpivot() function will perform the
1192  * allocation immediately and move the pivot as appropriate.
1193  *
1194  * If uaddr_pivot_newpivot() fails to find a new pivot that will allow the
1195  * allocation to succeed, it will not create a new pivot and the allocation
1196  * will fail.
1197  *
1198  * A pivot running into used memory will automatically expire (because it will
1199  * fail to allocate).
1200  *
1201  * Characteristics of the allocator:
1202  * - best case, an allocation is O(log N)
1203  *   (it would be O(1), if it weren't for the need to check if the memory is
1204  *   free; although that can be avoided...)
1205  * - worst case, an allocation is O(log N)
1206  *   (the uaddr_pivot_newpivot() function has that complexity)
1207  * - failed allocations always take O(log N)
1208  *   (the uaddr_pivot_newpivot() function will walk that deep into the tree).
1209  */
1210 int
1211 uaddr_pivot_select(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1212     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1213     vsize_t sz, vaddr_t align, vaddr_t offset,
1214     vm_prot_t prot, vaddr_t hint)
1215 {
1216 	struct uaddr_pivot_state	*uaddr;
1217 	struct vm_map_entry		*entry;
1218 	struct uaddr_pivot		*pivot;
1219 	vaddr_t				 min, max;
1220 	vsize_t				 before_gap, after_gap;
1221 	int				 err;
1222 
1223 	/* Hint must be handled by dedicated hint allocator. */
1224 	if (hint != 0)
1225 		return EINVAL;
1226 
1227 	/*
1228 	 * Select a random pivot and a random gap sizes around the allocation.
1229 	 */
1230 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1231 	pivot = &uaddr->up_pivots[
1232 	    arc4random_uniform(nitems(uaddr->up_pivots))];
1233 	before_gap = uaddr_pivot_random();
1234 	after_gap = uaddr_pivot_random();
1235 	if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0)
1236 		goto expired;	/* Pivot is invalid (null or expired). */
1237 
1238 	/* Attempt to use the pivot to map the entry. */
1239 	entry = pivot->entry;
1240 	if (pivot->dir > 0) {
1241 		if (uvm_addr_fitspace(&min, &max,
1242 		    MAX(VMMAP_FREE_START(entry), pivot->addr),
1243 		    VMMAP_FREE_END(entry), sz, align, offset,
1244 		    before_gap, after_gap) == 0) {
1245 			*addr_out = min;
1246 			*entry_out = entry;
1247 			pivot->addr = min + sz;
1248 			pivot->expire--;
1249 			return 0;
1250 		}
1251 	} else {
1252 		if (uvm_addr_fitspace(&min, &max,
1253 		    VMMAP_FREE_START(entry),
1254 		    MIN(VMMAP_FREE_END(entry), pivot->addr),
1255 		    sz, align, offset, before_gap, after_gap) == 0) {
1256 			*addr_out = max;
1257 			*entry_out = entry;
1258 			pivot->addr = max;
1259 			pivot->expire--;
1260 			return 0;
1261 		}
1262 	}
1263 
1264 expired:
1265 	/*
1266 	 * Pivot expired or allocation failed.
1267 	 * Use pivot selector to do the allocation and find a new pivot.
1268 	 */
1269 	err = uaddr_pivot_newpivot(map, uaddr, pivot, entry_out, addr_out,
1270 	    sz, align, offset, before_gap, after_gap);
1271 	return err;
1272 }
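
/*
 * Illustrative pivot reuse (example values: 4 KB pages, page alignment,
 * zero offset): a pivot with addr = 0x1000000, dir = 1 and ample free
 * space above it, asked for sz = 0x3000 with before_gap = 0x2000, places
 * the allocation at 0x1002000-0x1005000, advances pivot->addr to
 * 0x1005000 and decrements pivot->expire.  Once expire reaches zero, or
 * the pivot runs into used memory, the next request falls through to
 * uaddr_pivot_newpivot().
 */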
1273 
1274 /*
1275  * Free the pivot.
1276  */
1277 void
1278 uaddr_pivot_destroy(struct uvm_addr_state *uaddr)
1279 {
1280 	pool_put(&uaddr_pivot_pool, uaddr);
1281 }
1282 
1283 /*
1284  * Insert an entry with free space in the space tree.
1285  */
1286 void
1287 uaddr_pivot_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1288     struct vm_map_entry *entry)
1289 {
1290 	struct uaddr_pivot_state	*uaddr;
1291 	struct vm_map_entry		*rb_rv;
1292 	struct uaddr_pivot		*p;
1293 	vaddr_t				 check_addr;
1294 	vaddr_t				 start, end;
1295 
1296 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1297 	if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->up_free, entry)) !=
1298 	    NULL) {
1299 		panic("%s: duplicate insertion: state %p "
1300 		    "inserting entry %p which collides with %p", __func__,
1301 		    uaddr, entry, rb_rv);
1302 	}
1303 
1304 	start = VMMAP_FREE_START(entry);
1305 	end = VMMAP_FREE_END(entry);
1306 
1307 	/*
1308 	 * Update all pivots that are contained in this entry.
1309 	 */
1310 	for (p = &uaddr->up_pivots[0];
1311 	    p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
1312 		check_addr = p->addr;
1313 		if (check_addr == 0)
1314 			continue;
1315 		if (p->dir < 0)
1316 			check_addr--;
1317 
1318 		if (start <= check_addr &&
1319 		    check_addr < end) {
1320 			KASSERT(p->entry == NULL);
1321 			p->entry = entry;
1322 		}
1323 	}
1324 }
1325 
1326 /*
1327  * Remove an entry with free space from the space tree.
1328  */
1329 void
1330 uaddr_pivot_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p,
1331     struct vm_map_entry *entry)
1332 {
1333 	struct uaddr_pivot_state	*uaddr;
1334 	struct uaddr_pivot		*p;
1335 
1336 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1337 	if (RB_REMOVE(uaddr_free_rbtree, &uaddr->up_free, entry) != entry)
1338 		panic("%s: entry was not in tree", __func__);
1339 
1340 	/*
1341 	 * Inform any pivot with this entry that the entry is gone.
1342 	 * Note that this does not automatically invalidate the pivot.
1343 	 */
1344 	for (p = &uaddr->up_pivots[0];
1345 	    p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) {
1346 		if (p->entry == entry)
1347 			p->entry = NULL;
1348 	}
1349 }
1350 
1351 /*
1352  * Create a new pivot selector.
1353  *
1354  * Initially, all pivots are in the expired state.
1355  * Two reasons for this:
1356  * - it means this allocator will not take a huge amount of time
1357  * - pivots select better on demand, because the pivot selection will be
1358  *   affected by preceding allocations:
1359  *   the next pivots will likely end up in different segments of free memory,
1360  *   that was segmented by an earlier allocation; better spread.
1361  */
1362 struct uvm_addr_state *
1363 uaddr_pivot_create(vaddr_t minaddr, vaddr_t maxaddr)
1364 {
1365 	struct uaddr_pivot_state *uaddr;
1366 
1367 	uaddr = pool_get(&uaddr_pivot_pool, PR_WAITOK);
1368 	uaddr->up_uaddr.uaddr_minaddr = minaddr;
1369 	uaddr->up_uaddr.uaddr_maxaddr = maxaddr;
1370 	uaddr->up_uaddr.uaddr_functions = &uaddr_pivot_functions;
1371 	RB_INIT(&uaddr->up_free);
1372 	bzero(uaddr->up_pivots, sizeof(uaddr->up_pivots));
1373 
1374 	return &uaddr->up_uaddr;
1375 }
1376 
1377 #if defined(DEBUG) || defined(DDB)
1378 /*
1379  * Print the uaddr_pivot_state.
1380  *
1381  * If full, a listing of all entries in the state will be provided.
1382  */
1383 void
1384 uaddr_pivot_print(struct uvm_addr_state *uaddr_p, boolean_t full,
1385     int (*pr)(const char *, ...))
1386 {
1387 	struct uaddr_pivot_state	*uaddr;
1388 	struct uaddr_pivot		*pivot;
1389 	struct vm_map_entry		*entry;
1390 	int				 i;
1391 	vaddr_t				 check_addr;
1392 
1393 	uaddr = (struct uaddr_pivot_state *)uaddr_p;
1394 
1395 	for (i = 0; i < NUM_PIVOTS; i++) {
1396 		pivot = &uaddr->up_pivots[i];
1397 
1398 		(*pr)("\tpivot 0x%lx, expires in %d, direction %d\n",
1399 		    pivot->addr, pivot->expire, pivot->dir);
1400 	}
1401 	if (!full)
1402 		return;
1403 
1404 	if (RB_EMPTY(&uaddr->up_free))
1405 		(*pr)("\tempty\n");
1406 	/* Print list of free space. */
1407 	RB_FOREACH(entry, uaddr_free_rbtree, &uaddr->up_free) {
1408 		(*pr)("\t0x%lx - 0x%lx free (0x%lx bytes)\n",
1409 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry),
1410 		    VMMAP_FREE_END(entry) - VMMAP_FREE_START(entry));
1411 
1412 		for (i = 0; i < NUM_PIVOTS; i++) {
1413 			pivot = &uaddr->up_pivots[i];
1414 			check_addr = pivot->addr;
1415 			if (check_addr == 0)
1416 				continue;
1417 			if (pivot->dir < 0)
1418 				check_addr--;
1419 
1420 			if (VMMAP_FREE_START(entry) <= check_addr &&
1421 			    check_addr < VMMAP_FREE_END(entry)) {
1422 				(*pr)("\t\tcontains pivot %d (0x%lx)\n",
1423 				    i, pivot->addr);
1424 			}
1425 		}
1426 	}
1427 }
1428 #endif /* DEBUG || DDB */
1429 #endif /* !SMALL_KERNEL */
1430 
1431 #ifndef SMALL_KERNEL
1432 /*
1433  * Strategy for uaddr_stack_brk_select.
1434  */
1435 struct uaddr_bs_strat {
1436 	vaddr_t			start;		/* Start of area. */
1437 	vaddr_t			end;		/* End of area. */
1438 	int			dir;		/* Search direction. */
1439 };
1440 
1441 /*
1442  * Stack/break allocator.
1443  *
1444  * The stack area is grown into in the direction opposite to stack growth;
1445  * the brk area is grown downward (because sbrk() grows upward).
1446  *
1447  * Both areas are grown into proportionally: a weighted chance is used to
1448  * select which one (stack or brk area) to try first. If the allocation
1449  * fails there, the other one is tested.
1450  */
1451 const struct uvm_addr_functions uaddr_stack_brk_functions = {
1452 	.uaddr_select = &uaddr_stack_brk_select,
1453 	.uaddr_destroy = &uaddr_destroy,
1454 	.uaddr_name = "uaddr_stckbrk"
1455 };
1456 
1457 /*
1458  * Stack/brk address selector.
1459  */
1460 int
1461 uaddr_stack_brk_select(struct vm_map *map, struct uvm_addr_state *uaddr,
1462     struct vm_map_entry **entry_out, vaddr_t *addr_out,
1463     vsize_t sz, vaddr_t align, vaddr_t offset,
1464     vm_prot_t prot, vaddr_t hint)
1465 {
1466 	vsize_t			before_gap, after_gap;
1467 	int			stack_idx, brk_idx;
1468 	struct uaddr_bs_strat	strat[2], *s;
1469 	vsize_t			sb_size;
1470 
1471 	/*
1472 	 * Choose gap size and if the stack is searched before or after the
1473 	 * brk area.
1474 	 */
1475 	before_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
1476 	after_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT;
1477 
1478 	sb_size = (map->s_end - map->s_start) + (map->b_end - map->b_start);
1479 	sb_size >>= PAGE_SHIFT;
1480 	if (arc4random_uniform(MAX(sb_size, 0xffffffff)) >
1481 	    map->b_end - map->b_start) {
1482 		brk_idx = 1;
1483 		stack_idx = 0;
1484 	} else {
1485 		brk_idx = 0;
1486 		stack_idx = 1;
1487 	}
1488 
1489 	/* Set up stack search strategy. */
1490 	s = &strat[stack_idx];
1491 	s->start = MAX(map->s_start, uaddr->uaddr_minaddr);
1492 	s->end = MIN(map->s_end, uaddr->uaddr_maxaddr);
1493 #ifdef MACHINE_STACK_GROWS_UP
1494 	s->dir = -1;
1495 #else
1496 	s->dir =  1;
1497 #endif
1498 
1499 	/* Set up brk search strategy. */
1500 	s = &strat[brk_idx];
1501 	s->start = MAX(map->b_start, uaddr->uaddr_minaddr);
1502 	s->end = MIN(map->b_end, uaddr->uaddr_maxaddr);
1503 	s->dir = -1;	/* Opposite of brk() growth. */
1504 
1505 	/* Linear search for space.  */
1506 	for (s = &strat[0]; s < &strat[nitems(strat)]; s++) {
1507 		if (s->end - s->start < sz)
1508 			continue;
1509 		if (uvm_addr_linsearch(map, uaddr, entry_out, addr_out,
1510 		    0, sz, align, offset, s->dir, s->start, s->end - sz,
1511 		    before_gap, after_gap) == 0)
1512 			return 0;
1513 	}
1514 
1515 	return ENOMEM;
1516 }
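
/*
 * Illustrative behaviour (sketch): the random gaps above are each one to
 * four pages (((arc4random() & 0x3) + 1) << PAGE_SHIFT), and whichever of
 * the stack and brk strategies ends up at strat[0] is searched first; only
 * if that linear search fails is the other area tried.
 */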
1517 
1518 struct uvm_addr_state *
1519 uaddr_stack_brk_create(vaddr_t minaddr, vaddr_t maxaddr)
1520 {
1521 	struct uvm_addr_state* uaddr;
1522 
1523 	uaddr = pool_get(&uaddr_pool, PR_WAITOK);
1524 	uaddr->uaddr_minaddr = minaddr;
1525 	uaddr->uaddr_maxaddr = maxaddr;
1526 	uaddr->uaddr_functions = &uaddr_stack_brk_functions;
1527 	return uaddr;
1528 }
1529 #endif /* !SMALL_KERNEL */
1530 
1531 
1532 #ifndef SMALL_KERNEL
1533 RB_GENERATE(uaddr_free_rbtree, vm_map_entry, dfree.rbtree,
1534     uvm_mapent_fspace_cmp);
1535 #endif /* !SMALL_KERNEL */
1536