xref: /openbsd-src/sys/uvm/uvm_map.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 /*	$OpenBSD: uvm_map.c,v 1.225 2016/09/16 02:35:42 dlg Exp $	*/
2 /*	$NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  *
19  *
20  * Copyright (c) 1997 Charles D. Cranor and Washington University.
21  * Copyright (c) 1991, 1993, The Regents of the University of California.
22  *
23  * All rights reserved.
24  *
25  * This code is derived from software contributed to Berkeley by
26  * The Mach Operating System project at Carnegie-Mellon University.
27  *
28  * Redistribution and use in source and binary forms, with or without
29  * modification, are permitted provided that the following conditions
30  * are met:
31  * 1. Redistributions of source code must retain the above copyright
32  *    notice, this list of conditions and the following disclaimer.
33  * 2. Redistributions in binary form must reproduce the above copyright
34  *    notice, this list of conditions and the following disclaimer in the
35  *    documentation and/or other materials provided with the distribution.
36  * 3. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	@(#)vm_map.c    8.3 (Berkeley) 1/12/94
53  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
54  *
55  *
56  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
57  * All rights reserved.
58  *
59  * Permission to use, copy, modify and distribute this software and
60  * its documentation is hereby granted, provided that both the copyright
61  * notice and this permission notice appear in all copies of the
62  * software, derivative works or modified versions, and any portions
63  * thereof, and that both notices appear in supporting documentation.
64  *
65  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
66  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
67  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
68  *
69  * Carnegie Mellon requests users of this software to return to
70  *
71  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
72  *  School of Computer Science
73  *  Carnegie Mellon University
74  *  Pittsburgh PA 15213-3890
75  *
76  * any improvements or extensions that they make and grant Carnegie the
77  * rights to redistribute these changes.
78  */
79 
80 /*
81  * uvm_map.c: uvm map operations
82  */
83 
84 /* #define DEBUG */
85 /* #define VMMAP_DEBUG */
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/mman.h>
90 #include <sys/proc.h>
91 #include <sys/malloc.h>
92 #include <sys/pool.h>
93 #include <sys/sysctl.h>
94 
95 #ifdef SYSVSHM
96 #include <sys/shm.h>
97 #endif
98 
99 #include <uvm/uvm.h>
100 
101 #ifdef DDB
102 #include <uvm/uvm_ddb.h>
103 #endif
104 
105 #include <uvm/uvm_addr.h>
106 
107 
108 vsize_t			 uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
109 int			 uvm_mapent_isjoinable(struct vm_map*,
110 			    struct vm_map_entry*, struct vm_map_entry*);
111 struct vm_map_entry	*uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
112 			    struct vm_map_entry*, struct uvm_map_deadq*);
113 struct vm_map_entry	*uvm_mapent_tryjoin(struct vm_map*,
114 			    struct vm_map_entry*, struct uvm_map_deadq*);
115 struct vm_map_entry	*uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
116 			    struct vm_map_entry*, vaddr_t, vsize_t, int,
117 			    struct uvm_map_deadq*, struct vm_map_entry*);
118 struct vm_map_entry	*uvm_mapent_alloc(struct vm_map*, int);
119 void			 uvm_mapent_free(struct vm_map_entry*);
120 void			 uvm_unmap_kill_entry(struct vm_map*,
121 			    struct vm_map_entry*);
122 void			 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
123 void			 uvm_mapent_mkfree(struct vm_map*,
124 			    struct vm_map_entry*, struct vm_map_entry**,
125 			    struct uvm_map_deadq*, boolean_t);
126 void			 uvm_map_pageable_pgon(struct vm_map*,
127 			    struct vm_map_entry*, struct vm_map_entry*,
128 			    vaddr_t, vaddr_t);
129 int			 uvm_map_pageable_wire(struct vm_map*,
130 			    struct vm_map_entry*, struct vm_map_entry*,
131 			    vaddr_t, vaddr_t, int);
132 void			 uvm_map_setup_entries(struct vm_map*);
133 void			 uvm_map_setup_md(struct vm_map*);
134 void			 uvm_map_teardown(struct vm_map*);
135 void			 uvm_map_vmspace_update(struct vm_map*,
136 			    struct uvm_map_deadq*, int);
137 void			 uvm_map_kmem_grow(struct vm_map*,
138 			    struct uvm_map_deadq*, vsize_t, int);
139 void			 uvm_map_freelist_update_clear(struct vm_map*,
140 			    struct uvm_map_deadq*);
141 void			 uvm_map_freelist_update_refill(struct vm_map *, int);
142 void			 uvm_map_freelist_update(struct vm_map*,
143 			    struct uvm_map_deadq*, vaddr_t, vaddr_t,
144 			    vaddr_t, vaddr_t, int);
145 struct vm_map_entry	*uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
146 			    vaddr_t, vaddr_t, int);
147 int			 uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int,
148 			    struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t,
149 			    int);
150 int			 uvm_map_findspace(struct vm_map*,
151 			    struct vm_map_entry**, struct vm_map_entry**,
152 			    vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
153 			    vaddr_t);
154 vsize_t			 uvm_map_addr_augment_get(struct vm_map_entry*);
155 void			 uvm_map_addr_augment(struct vm_map_entry*);
156 
157 /*
158  * Tree management functions.
159  */
160 
161 static __inline void	 uvm_mapent_copy(struct vm_map_entry*,
162 			    struct vm_map_entry*);
163 static inline int	 uvm_mapentry_addrcmp(const struct vm_map_entry*,
164 			    const struct vm_map_entry*);
165 void			 uvm_mapent_free_insert(struct vm_map*,
166 			    struct uvm_addr_state*, struct vm_map_entry*);
167 void			 uvm_mapent_free_remove(struct vm_map*,
168 			    struct uvm_addr_state*, struct vm_map_entry*);
169 void			 uvm_mapent_addr_insert(struct vm_map*,
170 			    struct vm_map_entry*);
171 void			 uvm_mapent_addr_remove(struct vm_map*,
172 			    struct vm_map_entry*);
173 void			 uvm_map_splitentry(struct vm_map*,
174 			    struct vm_map_entry*, struct vm_map_entry*,
175 			    vaddr_t);
176 vsize_t			 uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
177 int			 uvm_mapent_bias(struct vm_map*, struct vm_map_entry*);
178 
179 /*
180  * uvm_vmspace_fork helper functions.
181  */
182 struct vm_map_entry	*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
183 			    vsize_t, vm_prot_t, vm_prot_t,
184 			    struct vm_map_entry*, struct uvm_map_deadq*, int,
185 			    int);
186 struct vm_map_entry	*uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
187 			    vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
188 			    struct vm_map_entry*, struct uvm_map_deadq*);
189 struct vm_map_entry	*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
190 			    struct vm_map*, struct vm_map_entry*,
191 			    struct uvm_map_deadq*);
192 struct vm_map_entry	*uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
193 			    struct vm_map*, struct vm_map_entry*,
194 			    struct uvm_map_deadq*);
195 struct vm_map_entry	*uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
196 			    struct vm_map*, struct vm_map_entry*,
197 			    struct uvm_map_deadq*);
198 
199 /*
200  * Tree validation.
201  */
202 #ifdef VMMAP_DEBUG
203 void			 uvm_tree_assert(struct vm_map*, int, char*,
204 			    char*, int);
205 #define UVM_ASSERT(map, cond, file, line)				\
206 	uvm_tree_assert((map), (cond), #cond, (file), (line))
207 void			 uvm_tree_sanity(struct vm_map*, char*, int);
208 void			 uvm_tree_size_chk(struct vm_map*, char*, int);
209 void			 vmspace_validate(struct vm_map*);
210 #else
211 #define uvm_tree_sanity(_map, _file, _line)		do {} while (0)
212 #define uvm_tree_size_chk(_map, _file, _line)		do {} while (0)
213 #define vmspace_validate(_map)				do {} while (0)
214 #endif
215 
216 /*
217  * Provide fallbacks so all architectures can be treated as having pmap_prefer.
218  */
219 #ifndef PMAP_PREFER
220 #define PMAP_PREFER_ALIGN()	(vaddr_t)PAGE_SIZE
221 #define PMAP_PREFER_OFFSET(off)	0
222 #define PMAP_PREFER(addr, off)	(addr)
223 #endif
224 
225 
226 /*
227  * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
228  * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
229  *
230  * We attempt to grow by VM_MAP_KSIZE_ALLOCMUL times the allocation size
231  * each time.
232  */
233 #define VM_MAP_KSIZE_INIT	(512 * (vaddr_t)PAGE_SIZE)
234 #define VM_MAP_KSIZE_DELTA	(256 * (vaddr_t)PAGE_SIZE)
235 #define VM_MAP_KSIZE_ALLOCMUL	4
236 /*
237  * When selecting a random free-space block, look at most FSPACE_DELTA blocks
238  * ahead.
239  */
240 #define FSPACE_DELTA		8
241 /*
242  * Put allocations adjacent to previous allocations when the free-space tree
243  * is larger than FSPACE_COMPACT entries.
244  *
245  * Alignment and PMAP_PREFER may still cause the entry to not be fully
246  * adjacent. Note that this strategy reduces memory fragmentation (by leaving
247  * a large space before or after the allocation).
248  */
249 #define FSPACE_COMPACT		128
250 /*
251  * Make the address selection skip at most this many bytes from the start of
252  * the free space in which the allocation takes place.
253  *
254  * The main idea behind a randomized address space is that an attacker cannot
255  * know where to target his attack. Therefore, the location of objects must be
256  * as random as possible. However, the goal is not to create the most sparse
257  * map that is possible.
258  * FSPACE_MAXOFF pushes the considered range in bytes down to less insane
259  * sizes, thereby reducing the sparseness. The biggest randomization comes
260  * from fragmentation, i.e. FSPACE_COMPACT.
261  */
262 #define FSPACE_MAXOFF		((vaddr_t)32 * 1024 * 1024)
263 /*
264  * Allow for small gaps in the overflow areas.
265  * Gap size is in bytes and does not have to be a multiple of page-size.
266  */
267 #define FSPACE_BIASGAP		((vaddr_t)32 * 1024)
268 
269 /* auto-allocate address lower bound */
270 #define VMMAP_MIN_ADDR		PAGE_SIZE
271 
272 
273 #ifdef DEADBEEF0
274 #define UVMMAP_DEADBEEF		((unsigned long)DEADBEEF0)
275 #else
276 #define UVMMAP_DEADBEEF		((unsigned long)0xdeadd0d0)
277 #endif
278 
279 #ifdef DEBUG
280 int uvm_map_printlocks = 0;
281 
282 #define LPRINTF(_args)							\
283 	do {								\
284 		if (uvm_map_printlocks)					\
285 			printf _args;					\
286 	} while (0)
287 #else
288 #define LPRINTF(_args)	do {} while (0)
289 #endif
290 
291 static struct mutex uvm_kmapent_mtx;
292 static struct timeval uvm_kmapent_last_warn_time;
293 static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
294 
295 const char vmmapbsy[] = "vmmapbsy";
296 
297 /*
298  * pool for vmspace structures.
299  */
300 struct pool uvm_vmspace_pool;
301 
302 /*
303  * pool for dynamically-allocated map entries.
304  */
305 struct pool uvm_map_entry_pool;
306 struct pool uvm_map_entry_kmem_pool;
307 
308 /*
309  * This global represents the end of the kernel virtual address
310  * space. If we want to exceed this, we must grow the kernel
311  * virtual address space dynamically.
312  *
313  * Note, this variable is locked by kernel_map's lock.
314  */
315 vaddr_t uvm_maxkaddr;
316 
317 /*
318  * Locking predicate.
319  */
320 #define UVM_MAP_REQ_WRITE(_map)						\
321 	do {								\
322 		if ((_map)->ref_count > 0) {				\
323 			if (((_map)->flags & VM_MAP_INTRSAFE) == 0)	\
324 				rw_assert_wrlock(&(_map)->lock);	\
325 			else						\
326 				MUTEX_ASSERT_LOCKED(&(_map)->mtx);	\
327 		}							\
328 	} while (0)
329 
330 /*
331  * Tree describing entries by address.
332  *
333  * Addresses are unique.
334  * Entries with start == end may only exist if they are the first entry
335  * (sorted by address) within a free-memory tree.
336  */
337 
338 static inline int
339 uvm_mapentry_addrcmp(const struct vm_map_entry *e1,
340     const struct vm_map_entry *e2)
341 {
342 	return e1->start < e2->start ? -1 : e1->start > e2->start;
343 }
344 
345 /*
346  * Copy mapentry.
347  */
348 static __inline void
349 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
350 {
351 	caddr_t csrc, cdst;
352 	size_t sz;
353 
354 	csrc = (caddr_t)src;
355 	cdst = (caddr_t)dst;
356 	csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
357 	cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
358 
359 	sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
360 	    offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
361 	memcpy(cdst, csrc, sz);
362 }
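
/*
 * The copy above only covers the byte range between the
 * uvm_map_entry_start_copy and uvm_map_entry_stop_copy markers; the
 * linkage fields outside that window are left untouched.  A minimal
 * sketch of the same offsetof-window technique (editor's illustration,
 * all names hypothetical):
 *
 *	struct example {
 *		struct example	*link;		(outside the window, never copied)
 *		int		 start_copy;	(marker: copy starts here)
 *		int		 payload;
 *		int		 stop_copy;	(marker: copy stops here)
 *	};
 *
 *	size_t off = offsetof(struct example, start_copy);
 *	size_t len = offsetof(struct example, stop_copy) - off;
 *	memcpy((char *)dst + off, (const char *)src + off, len);
 */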
363 
364 /*
365  * Handle free-list insertion.
366  */
367 void
368 uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
369     struct vm_map_entry *entry)
370 {
371 	const struct uvm_addr_functions *fun;
372 #ifdef VMMAP_DEBUG
373 	vaddr_t min, max, bound;
374 #endif
375 
376 #ifdef VMMAP_DEBUG
377 	/*
378 	 * Boundary check.
379 	 * Boundaries are folded if they go on the same free list.
380 	 */
381 	min = VMMAP_FREE_START(entry);
382 	max = VMMAP_FREE_END(entry);
383 
384 	while (min < max) {
385 		bound = uvm_map_boundary(map, min, max);
386 		KASSERT(uvm_map_uaddr(map, min) == uaddr);
387 		min = bound;
388 	}
389 #endif
390 	KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
391 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
392 
393 	UVM_MAP_REQ_WRITE(map);
394 
395 	/* Actual insert: forward to uaddr pointer. */
396 	if (uaddr != NULL) {
397 		fun = uaddr->uaddr_functions;
398 		KDASSERT(fun != NULL);
399 		if (fun->uaddr_free_insert != NULL)
400 			(*fun->uaddr_free_insert)(map, uaddr, entry);
401 		entry->etype |= UVM_ET_FREEMAPPED;
402 	}
403 
404 	/* Update fspace augmentation. */
405 	uvm_map_addr_augment(entry);
406 }
407 
408 /*
409  * Handle free-list removal.
410  */
411 void
412 uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
413     struct vm_map_entry *entry)
414 {
415 	const struct uvm_addr_functions *fun;
416 
417 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
418 	KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
419 	UVM_MAP_REQ_WRITE(map);
420 
421 	if (uaddr != NULL) {
422 		fun = uaddr->uaddr_functions;
423 		if (fun->uaddr_free_remove != NULL)
424 			(*fun->uaddr_free_remove)(map, uaddr, entry);
425 		entry->etype &= ~UVM_ET_FREEMAPPED;
426 	}
427 }
428 
429 /*
430  * Handle address tree insertion.
431  */
432 void
433 uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
434 {
435 	struct vm_map_entry *res;
436 
437 	if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF))
438 		panic("uvm_mapent_addr_insert: entry still in addr list");
439 	KDASSERT(entry->start <= entry->end);
440 	KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
441 	    (entry->end & (vaddr_t)PAGE_MASK) == 0);
442 
443 	UVM_MAP_REQ_WRITE(map);
444 	res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
445 	if (res != NULL) {
446 		panic("uvm_mapent_addr_insert: map %p entry %p "
447 		    "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
448 		    "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
449 		    map, entry,
450 		    entry->start, entry->end, entry->guard, entry->fspace,
451 		    res, res->start, res->end, res->guard, res->fspace);
452 	}
453 }
454 
455 /*
456  * Handle address tree removal.
457  */
458 void
459 uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
460 {
461 	struct vm_map_entry *res;
462 
463 	UVM_MAP_REQ_WRITE(map);
464 	res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
465 	if (res != entry)
466 		panic("uvm_mapent_addr_remove");
467 	RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF);
468 }
469 
470 /*
471  * uvm_map_reference: add reference to a map
472  *
473  * XXX check map reference counter lock
474  */
475 #define uvm_map_reference(_map)						\
476 	do {								\
477 		(_map)->ref_count++;				\
478 	} while (0)
479 
480 /*
481  * Calculate the dused delta.
482  */
483 vsize_t
484 uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
485 {
486 	struct vmspace *vm;
487 	vsize_t sz;
488 	vaddr_t lmax;
489 	vaddr_t stack_begin, stack_end; /* Position of stack. */
490 
491 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
492 	vm = (struct vmspace *)map;
493 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
494 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
495 
496 	sz = 0;
497 	while (min != max) {
498 		lmax = max;
499 		if (min < stack_begin && lmax > stack_begin)
500 			lmax = stack_begin;
501 		else if (min < stack_end && lmax > stack_end)
502 			lmax = stack_end;
503 
504 		if (min >= stack_begin && min < stack_end) {
505 			/* nothing */
506 		} else
507 			sz += lmax - min;
508 		min = lmax;
509 	}
510 
511 	return sz >> PAGE_SHIFT;
512 }
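
/*
 * Worked example for uvmspace_dused() above (editor's sketch, assuming a
 * 4 KB page size and hypothetical addresses): with the stack occupying
 * [0x7000, 0x9000) and a request for [0x5000, 0xb000), the loop counts
 * [0x5000, 0x7000) and [0x9000, 0xb000) but skips the stack range, so
 * sz = 0x4000 bytes and the function returns 4 pages.
 */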
513 
514 /*
515  * Find the entry describing the given address.
516  */
517 struct vm_map_entry*
518 uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
519 {
520 	struct vm_map_entry *iter;
521 
522 	iter = RBT_ROOT(uvm_map_addr, atree);
523 	while (iter != NULL) {
524 		if (iter->start > addr)
525 			iter = RBT_LEFT(uvm_map_addr, iter);
526 		else if (VMMAP_FREE_END(iter) <= addr)
527 			iter = RBT_RIGHT(uvm_map_addr, iter);
528 		else
529 			return iter;
530 	}
531 	return NULL;
532 }
533 
534 /*
535  * DEAD_ENTRY_PUSH(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
536  *
537  * Push dead entries into a linked list.
538  * Since the linked list abuses the address tree for storage, the entry
539  * may not be linked in a map.
540  *
541  * The deadq must be initialized with TAILQ_INIT() before the first call
542  * to this macro.  uvm_unmap_detach(deadq, 0) will free the dead entries.
543  */
544 static __inline void
545 dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
546 {
547 	TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
548 }
549 #define DEAD_ENTRY_PUSH(_headptr, _entry)				\
550 	dead_entry_push((_headptr), (_entry))
551 
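
/*
 * Typical use of the dead-entry queue (a sketch of the pattern used by
 * uvm_unmap() further down; entries may only be unlinked while the map
 * is locked, and are freed after the lock is dropped):
 *
 *	struct uvm_map_deadq dead;
 *
 *	TAILQ_INIT(&dead);
 *	vm_map_lock(map);
 *	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
 *	vm_map_unlock(map);
 *	uvm_unmap_detach(&dead, 0);
 */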
552 /*
553  * Helper function for uvm_map_findspace_tree.
554  *
555  * Given allocation constraints and pmap constraints, finds the
556  * lowest and highest address in a range that can be used for the
557  * allocation.
558  *
559  * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs.
560  *
561  *
562  * Big chunk of math with a seasoning of dragons.
563  */
564 int
565 uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg,
566     struct vm_map_entry *sel, vaddr_t align,
567     vaddr_t pmap_align, vaddr_t pmap_off, int bias)
568 {
569 	vaddr_t sel_min, sel_max;
570 #ifdef PMAP_PREFER
571 	vaddr_t pmap_min, pmap_max;
572 #endif /* PMAP_PREFER */
573 #ifdef DIAGNOSTIC
574 	int bad;
575 #endif /* DIAGNOSTIC */
576 
577 	sel_min = VMMAP_FREE_START(sel);
578 	sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0);
579 
580 #ifdef PMAP_PREFER
581 
582 	/*
583 	 * There are two special cases in which we can satisfy both the align
584 	 * requirement and the pmap_prefer requirement:
585 	 * - when pmap_off == 0, we always select the larger of the two alignments
586 	 * - when pmap_off % align == 0 and pmap_align > align, we simply
587 	 *   satisfy the pmap_align requirement and automatically
588 	 *   satisfy the align requirement.
589 	 */
590 	if (align > PAGE_SIZE &&
591 	    !(pmap_align > align && (pmap_off & (align - 1)) == 0)) {
592 		/*
593 		 * Simple case: only use align.
594 		 */
595 		sel_min = roundup(sel_min, align);
596 		sel_max &= ~(align - 1);
597 
598 		if (sel_min > sel_max)
599 			return ENOMEM;
600 
601 		/* Correct for bias. */
602 		if (sel_max - sel_min > FSPACE_BIASGAP) {
603 			if (bias > 0) {
604 				sel_min = sel_max - FSPACE_BIASGAP;
605 				sel_min = roundup(sel_min, align);
606 			} else if (bias < 0) {
607 				sel_max = sel_min + FSPACE_BIASGAP;
608 				sel_max &= ~(align - 1);
609 			}
610 		}
611 	} else if (pmap_align != 0) {
612 		/*
613 		 * Special case: satisfy both pmap_prefer and
614 		 * align argument.
615 		 */
616 		pmap_max = sel_max & ~(pmap_align - 1);
617 		pmap_min = sel_min;
618 		if (pmap_max < sel_min)
619 			return ENOMEM;
620 
621 		/* Adjust pmap_min for BIASGAP for top-addr bias. */
622 		if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP)
623 			pmap_min = pmap_max - FSPACE_BIASGAP;
624 		/* Align pmap_min. */
625 		pmap_min &= ~(pmap_align - 1);
626 		if (pmap_min < sel_min)
627 			pmap_min += pmap_align;
628 		if (pmap_min > pmap_max)
629 			return ENOMEM;
630 
631 		/* Adjust pmap_max for BIASGAP for bottom-addr bias. */
632 		if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) {
633 			pmap_max = (pmap_min + FSPACE_BIASGAP) &
634 			    ~(pmap_align - 1);
635 		}
636 		if (pmap_min > pmap_max)
637 			return ENOMEM;
638 
639 		/* Apply pmap prefer offset. */
640 		pmap_max |= pmap_off;
641 		if (pmap_max > sel_max)
642 			pmap_max -= pmap_align;
643 		pmap_min |= pmap_off;
644 		if (pmap_min < sel_min)
645 			pmap_min += pmap_align;
646 
647 		/*
648 		 * Fixup: it's possible that pmap_min and pmap_max
649 	 * cross each other. In this case, try to find one
650 		 * address that is allowed.
651 		 * (This usually happens in biased case.)
652 		 */
653 		if (pmap_min > pmap_max) {
654 			if (pmap_min < sel_max)
655 				pmap_max = pmap_min;
656 			else if (pmap_max > sel_min)
657 				pmap_min = pmap_max;
658 			else
659 				return ENOMEM;
660 		}
661 
662 		/* Internal validation. */
663 		KDASSERT(pmap_min <= pmap_max);
664 
665 		sel_min = pmap_min;
666 		sel_max = pmap_max;
667 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
668 		sel_min = sel_max - FSPACE_BIASGAP;
669 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
670 		sel_max = sel_min + FSPACE_BIASGAP;
671 
672 #else
673 
674 	if (align > PAGE_SIZE) {
675 		sel_min = roundup(sel_min, align);
676 		sel_max &= ~(align - 1);
677 		if (sel_min > sel_max)
678 			return ENOMEM;
679 
680 		if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) {
681 			if (bias > 0) {
682 				sel_min = roundup(sel_max - FSPACE_BIASGAP,
683 				    align);
684 			} else {
685 				sel_max = (sel_min + FSPACE_BIASGAP) &
686 				    ~(align - 1);
687 			}
688 		}
689 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
690 		sel_min = sel_max - FSPACE_BIASGAP;
691 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
692 		sel_max = sel_min + FSPACE_BIASGAP;
693 
694 #endif
695 
696 	if (sel_min > sel_max)
697 		return ENOMEM;
698 
699 #ifdef DIAGNOSTIC
700 	bad = 0;
701 	/* Lower boundary check. */
702 	if (sel_min < VMMAP_FREE_START(sel)) {
703 		printf("sel_min: 0x%lx, but should be at least 0x%lx\n",
704 		    sel_min, VMMAP_FREE_START(sel));
705 		bad++;
706 	}
707 	/* Upper boundary check. */
708 	if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) {
709 		printf("sel_max: 0x%lx, but should be at most 0x%lx\n",
710 		    sel_max,
711 		    VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0));
712 		bad++;
713 	}
714 	/* Lower boundary alignment. */
715 	if (align != 0 && (sel_min & (align - 1)) != 0) {
716 		printf("sel_min: 0x%lx, not aligned to 0x%lx\n",
717 		    sel_min, align);
718 		bad++;
719 	}
720 	/* Upper boundary alignment. */
721 	if (align != 0 && (sel_max & (align - 1)) != 0) {
722 		printf("sel_max: 0x%lx, not aligned to 0x%lx\n",
723 		    sel_max, align);
724 		bad++;
725 	}
726 	/* Lower boundary PMAP_PREFER check. */
727 	if (pmap_align != 0 && align == 0 &&
728 	    (sel_min & (pmap_align - 1)) != pmap_off) {
729 		printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
730 		    sel_min, sel_min & (pmap_align - 1), pmap_off);
731 		bad++;
732 	}
733 	/* Upper boundary PMAP_PREFER check. */
734 	if (pmap_align != 0 && align == 0 &&
735 	    (sel_max & (pmap_align - 1)) != pmap_off) {
736 		printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
737 		    sel_max, sel_max & (pmap_align - 1), pmap_off);
738 		bad++;
739 	}
740 
741 	if (bad) {
742 		panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, "
743 		    "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, "
744 		    "bias = %d, "
745 		    "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)",
746 		    sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off,
747 		    bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel));
748 	}
749 #endif /* DIAGNOSTIC */
750 
751 	*min = sel_min;
752 	*max = sel_max;
753 	return 0;
754 }
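
/*
 * Worked example for the align-only path above (editor's sketch with
 * hypothetical numbers): for a free range [0x1000, 0x9000), sz = 0x2000,
 * guardpg = 0 and align = 0x4000, the initial window is sel_min = 0x1000,
 * sel_max = 0x7000.  Rounding sel_min up and sel_max down to the alignment
 * yields sel_min = sel_max = 0x4000, so the only allowed start address is
 * 0x4000 and the allocation covers [0x4000, 0x6000).
 */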
755 
756 /*
757  * Test if memory starting at addr with sz bytes is free.
758  *
759  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
760  * the space.
761  * If *start_ptr and *end_ptr are prefilled by the caller, they must be correct.
762  */
763 int
764 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
765     struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
766     vaddr_t addr, vsize_t sz)
767 {
768 	struct uvm_addr_state *free;
769 	struct uvm_map_addr *atree;
770 	struct vm_map_entry *i, *i_end;
771 
772 	if (addr + sz < addr)
773 		return 0;
774 
775 	/*
776 	 * Kernel memory above uvm_maxkaddr is considered unavailable.
777 	 */
778 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
779 		if (addr + sz > uvm_maxkaddr)
780 			return 0;
781 	}
782 
783 	atree = &map->addr;
784 
785 	/*
786 	 * Fill in first, last, so they point at the entries containing the
787 	 * first and last address of the range.
788 	 * Note that if they are not NULL, we don't perform the lookup.
789 	 */
790 	KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
791 	if (*start_ptr == NULL) {
792 		*start_ptr = uvm_map_entrybyaddr(atree, addr);
793 		if (*start_ptr == NULL)
794 			return 0;
795 	} else
796 		KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
797 	if (*end_ptr == NULL) {
798 		if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
799 			*end_ptr = *start_ptr;
800 		else {
801 			*end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
802 			if (*end_ptr == NULL)
803 				return 0;
804 		}
805 	} else
806 		KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
807 
808 	/* Validation. */
809 	KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
810 	KDASSERT((*start_ptr)->start <= addr &&
811 	    VMMAP_FREE_END(*start_ptr) > addr &&
812 	    (*end_ptr)->start < addr + sz &&
813 	    VMMAP_FREE_END(*end_ptr) >= addr + sz);
814 
815 	/*
816 	 * Check that none of the entries intersects with <addr, addr+sz>.
817 	 * Also, if an entry belongs to uaddr_exe or uaddr_brk_stack, it is
818 	 * considered unavailable unless called by those allocators.
819 	 */
820 	i = *start_ptr;
821 	i_end = RBT_NEXT(uvm_map_addr, *end_ptr);
822 	for (; i != i_end;
823 	    i = RBT_NEXT(uvm_map_addr, i)) {
824 		if (i->start != i->end && i->end > addr)
825 			return 0;
826 
827 		/*
828 		 * uaddr_exe and uaddr_brk_stack may only be used
829 		 * by these allocators and the NULL uaddr (i.e. no
830 		 * uaddr).
831 		 * Reject if this requirement is not met.
832 		 */
833 		if (uaddr != NULL) {
834 			free = uvm_map_uaddr_e(map, i);
835 
836 			if (uaddr != free && free != NULL &&
837 			    (free == map->uaddr_exe ||
838 			     free == map->uaddr_brk_stack))
839 				return 0;
840 		}
841 	}
842 
843 	return -1;
844 }
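
/*
 * Example of the calling convention (a sketch of how the callers in this
 * file use it): pass NULL start/end pointers to have the entries looked
 * up, or prefill them to skip the lookup.
 *
 *	struct vm_map_entry *first = NULL, *last = NULL;
 *
 *	if (uvm_map_isavail(map, NULL, &first, &last, addr, sz)) {
 *		(<addr, addr+sz> is free; first/last bracket the range)
 *	}
 */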
845 
846 /*
847  * Invoke each address selector until an address is found.
848  * Will not invoke uaddr_exe.
849  */
850 int
851 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
852     struct vm_map_entry**last, vaddr_t *addr, vsize_t sz,
853     vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
854 {
855 	struct uvm_addr_state *uaddr;
856 	int i;
857 
858 	/*
859 	 * Allocation for sz bytes at any address,
860 	 * using the addr selectors in order.
861 	 */
862 	for (i = 0; i < nitems(map->uaddr_any); i++) {
863 		uaddr = map->uaddr_any[i];
864 
865 		if (uvm_addr_invoke(map, uaddr, first, last,
866 		    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
867 			return 0;
868 	}
869 
870 	/* Fall back to brk() and stack() address selectors. */
871 	uaddr = map->uaddr_brk_stack;
872 	if (uvm_addr_invoke(map, uaddr, first, last,
873 	    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
874 		return 0;
875 
876 	return ENOMEM;
877 }
878 
879 /* Calculate entry augmentation value. */
880 vsize_t
881 uvm_map_addr_augment_get(struct vm_map_entry *entry)
882 {
883 	vsize_t			 augment;
884 	struct vm_map_entry	*left, *right;
885 
886 	augment = entry->fspace;
887 	if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL)
888 		augment = MAX(augment, left->fspace_augment);
889 	if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
890 		augment = MAX(augment, right->fspace_augment);
891 	return augment;
892 }
893 
894 /*
895  * Update augmentation data in entry.
896  */
897 void
898 uvm_map_addr_augment(struct vm_map_entry *entry)
899 {
900 	vsize_t			 augment;
901 
902 	while (entry != NULL) {
903 		/* Calculate value for augmentation. */
904 		augment = uvm_map_addr_augment_get(entry);
905 
906 		/*
907 		 * Walk up towards the root, updating parents as needed.
908 		 * Once we find an entry that already has the correct value,
909 		 * we can stop, since all its parents will have the correct
910 		 * value too.
911 		 */
912 		if (entry->fspace_augment == augment)
913 			return;
914 		entry->fspace_augment = augment;
915 		entry = RBT_PARENT(uvm_map_addr, entry);
916 	}
917 }
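
/*
 * The invariant maintained above is that, for every entry, fspace_augment
 * equals the largest fspace found in the subtree rooted at that entry.
 * An address selector can therefore reject an entire subtree with one
 * comparison, roughly (editor's sketch):
 *
 *	if (node->fspace_augment < sz)
 *		continue;	(no free chunk of sz bytes below node)
 */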
918 
919 /*
920  * uvm_mapanon: establish a valid mapping in map for an anon
921  *
922  * => *addr and sz must be a multiple of PAGE_SIZE.
923  * => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
924  * => map must be unlocked.
925  *
926  * => align: align vaddr, must be a power-of-2.
927  *    Align is only a hint and will be ignored if the alignment fails.
928  */
929 int
930 uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
931     vsize_t align, unsigned int flags)
932 {
933 	struct vm_map_entry	*first, *last, *entry, *new;
934 	struct uvm_map_deadq	 dead;
935 	vm_prot_t		 prot;
936 	vm_prot_t		 maxprot;
937 	vm_inherit_t		 inherit;
938 	int			 advice;
939 	int			 error;
940 	vaddr_t			 pmap_align, pmap_offset;
941 	vaddr_t			 hint;
942 
943 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
944 	KASSERT(map != kernel_map);
945 	KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
946 
947 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
948 	splassert(IPL_NONE);
949 
950 	/*
951 	 * We use pmap_align and pmap_offset as alignment and offset variables.
952 	 *
953 	 * Because the align parameter takes precedence over pmap prefer,
954 	 * pmap_align is set to align (and pmap_offset to 0) whenever
955 	 * pmap_prefer cannot satisfy the requested alignment.
956 	 */
957 	pmap_align = MAX(align, PAGE_SIZE);
958 	pmap_offset = 0;
959 
960 	/* Decode parameters. */
961 	prot = UVM_PROTECTION(flags);
962 	maxprot = UVM_MAXPROTECTION(flags);
963 	advice = UVM_ADVICE(flags);
964 	inherit = UVM_INHERIT(flags);
965 	error = 0;
966 	hint = trunc_page(*addr);
967 	TAILQ_INIT(&dead);
968 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
969 	KASSERT((align & (align - 1)) == 0);
970 
971 	/* Check protection. */
972 	if ((prot & maxprot) != prot)
973 		return EACCES;
974 
975 	/*
976 	 * Before grabbing the lock, allocate a map entry for later
977 	 * use to ensure we don't wait for memory while holding the
978 	 * vm_map_lock.
979 	 */
980 	new = uvm_mapent_alloc(map, flags);
981 	if (new == NULL)
982 		return(ENOMEM);
983 
984 	if (flags & UVM_FLAG_TRYLOCK) {
985 		if (vm_map_lock_try(map) == FALSE) {
986 			error = EFAULT;
987 			goto out;
988 		}
989 	} else
990 		vm_map_lock(map);
991 
992 	first = last = NULL;
993 	if (flags & UVM_FLAG_FIXED) {
994 		/*
995 		 * Fixed location.
996 		 *
997 		 * Note: we ignore align, pmap_prefer.
998 		 * Fill in first, last and *addr.
999 		 */
1000 		KASSERT((*addr & PAGE_MASK) == 0);
1001 
1002 		/* Check that the space is available. */
1003 		if (flags & UVM_FLAG_UNMAP)
1004 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1005 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1006 			error = ENOMEM;
1007 			goto unlock;
1008 		}
1009 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1010 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1011 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1012 		/*
1013 		 * Address used as hint.
1014 		 *
1015 		 * Note: we enforce the alignment restriction,
1016 		 * but ignore pmap_prefer.
1017 		 */
1018 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1019 		/* Run selection algorithm for executables. */
1020 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1021 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1022 
1023 		if (error != 0)
1024 			goto unlock;
1025 	} else {
1026 		/* Update freelists from vmspace. */
1027 		uvm_map_vmspace_update(map, &dead, flags);
1028 
1029 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1030 		    pmap_align, pmap_offset, prot, hint);
1031 
1032 		if (error != 0)
1033 			goto unlock;
1034 	}
1035 
1036 	/* Double-check if selected address doesn't cause overflow. */
1037 	if (*addr + sz < *addr) {
1038 		error = ENOMEM;
1039 		goto unlock;
1040 	}
1041 
1042 	/* If we only want a query, return now. */
1043 	if (flags & UVM_FLAG_QUERY) {
1044 		error = 0;
1045 		goto unlock;
1046 	}
1047 
1048 	/*
1049 	 * Create new entry.
1050 	 * first and last may be invalidated after this call.
1051 	 */
1052 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1053 	    new);
1054 	if (entry == NULL) {
1055 		error = ENOMEM;
1056 		goto unlock;
1057 	}
1058 	new = NULL;
1059 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1060 	entry->object.uvm_obj = NULL;
1061 	entry->offset = 0;
1062 	entry->protection = prot;
1063 	entry->max_protection = maxprot;
1064 	entry->inheritance = inherit;
1065 	entry->wired_count = 0;
1066 	entry->advice = advice;
1067 	if (flags & UVM_FLAG_NOFAULT)
1068 		entry->etype |= UVM_ET_NOFAULT;
1069 	if (flags & UVM_FLAG_COPYONW) {
1070 		entry->etype |= UVM_ET_COPYONWRITE;
1071 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1072 			entry->etype |= UVM_ET_NEEDSCOPY;
1073 	}
1074 	if (flags & UVM_FLAG_OVERLAY) {
1075 		KERNEL_LOCK();
1076 		entry->aref.ar_pageoff = 0;
1077 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1078 		KERNEL_UNLOCK();
1079 	}
1080 
1081 	/* Update map and process statistics. */
1082 	map->size += sz;
1083 	((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz);
1084 
1085 unlock:
1086 	vm_map_unlock(map);
1087 
1088 	/*
1089 	 * Remove dead entries.
1090 	 *
1091 	 * Dead entries may be the result of merging.
1092 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1093 	 * destroy free-space entries.
1094 	 */
1095 	uvm_unmap_detach(&dead, 0);
1096 out:
1097 	if (new)
1098 		uvm_mapent_free(new);
1099 	return error;
1100 }
1101 
1102 /*
1103  * uvm_map: establish a valid mapping in map
1104  *
1105  * => *addr and sz must be a multiple of PAGE_SIZE.
1106  * => map must be unlocked.
1107  * => <uobj,uoffset> value meanings (4 cases):
1108  *	[1] <NULL,uoffset>		== uoffset is a hint for PMAP_PREFER
1109  *	[2] <NULL,UVM_UNKNOWN_OFFSET>	== don't PMAP_PREFER
1110  *	[3] <uobj,uoffset>		== normal mapping
1111  *	[4] <uobj,UVM_UNKNOWN_OFFSET>	== uvm_map finds offset based on VA
1112  *
1113  *   case [4] is for kernel mappings where we don't know the offset until
1114  *   we've found a virtual address.   note that kernel object offsets are
1115  *   always relative to vm_map_min(kernel_map).
1116  *
1117  * => align: align vaddr, must be a power-of-2.
1118  *    Align is only a hint and will be ignored if the alignment fails.
1119  */
1120 int
1121 uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
1122     struct uvm_object *uobj, voff_t uoffset,
1123     vsize_t align, unsigned int flags)
1124 {
1125 	struct vm_map_entry	*first, *last, *entry, *new;
1126 	struct uvm_map_deadq	 dead;
1127 	vm_prot_t		 prot;
1128 	vm_prot_t		 maxprot;
1129 	vm_inherit_t		 inherit;
1130 	int			 advice;
1131 	int			 error;
1132 	vaddr_t			 pmap_align, pmap_offset;
1133 	vaddr_t			 hint;
1134 
1135 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1136 		splassert(IPL_NONE);
1137 	else
1138 		splassert(IPL_VM);
1139 
1140 	/*
1141 	 * We use pmap_align and pmap_offset as alignment and offset variables.
1142 	 *
1143 	 * Because the align parameter takes precedence over pmap prefer,
1144 	 * pmap_align is set to align (and pmap_offset to 0) whenever
1145 	 * pmap_prefer cannot satisfy the requested alignment.
1146 	 */
1147 	if (uoffset == UVM_UNKNOWN_OFFSET) {
1148 		pmap_align = MAX(align, PAGE_SIZE);
1149 		pmap_offset = 0;
1150 	} else {
1151 		pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
1152 		pmap_offset = PMAP_PREFER_OFFSET(uoffset);
1153 
1154 		if (align == 0 ||
1155 		    (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
1156 			/* pmap_offset satisfies align, no change. */
1157 		} else {
1158 			/* Align takes precedence over pmap prefer. */
1159 			pmap_align = align;
1160 			pmap_offset = 0;
1161 		}
1162 	}
1163 
1164 	/* Decode parameters. */
1165 	prot = UVM_PROTECTION(flags);
1166 	maxprot = UVM_MAXPROTECTION(flags);
1167 	advice = UVM_ADVICE(flags);
1168 	inherit = UVM_INHERIT(flags);
1169 	error = 0;
1170 	hint = trunc_page(*addr);
1171 	TAILQ_INIT(&dead);
1172 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
1173 	KASSERT((align & (align - 1)) == 0);
1174 
1175 	/* Holes are incompatible with other types of mappings. */
1176 	if (flags & UVM_FLAG_HOLE) {
1177 		KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
1178 		    (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
1179 	}
1180 
1181 	/* Unset hint for kernel_map non-fixed allocations. */
1182 	if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
1183 		hint = 0;
1184 
1185 	/* Check protection. */
1186 	if ((prot & maxprot) != prot)
1187 		return EACCES;
1188 
1189 	if (map == kernel_map &&
1190 	    (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
1191 		panic("uvm_map: kernel map W^X violation requested");
1192 
1193 	/*
1194 	 * Before grabbing the lock, allocate a map entry for later
1195 	 * use to ensure we don't wait for memory while holding the
1196 	 * vm_map_lock.
1197 	 */
1198 	new = uvm_mapent_alloc(map, flags);
1199 	if (new == NULL)
1200 		return(ENOMEM);
1201 
1202 	if (flags & UVM_FLAG_TRYLOCK) {
1203 		if (vm_map_lock_try(map) == FALSE) {
1204 			error = EFAULT;
1205 			goto out;
1206 		}
1207 	} else {
1208 		vm_map_lock(map);
1209 	}
1210 
1211 	first = last = NULL;
1212 	if (flags & UVM_FLAG_FIXED) {
1213 		/*
1214 		 * Fixed location.
1215 		 *
1216 		 * Note: we ignore align, pmap_prefer.
1217 		 * Fill in first, last and *addr.
1218 		 */
1219 		KASSERT((*addr & PAGE_MASK) == 0);
1220 
1221 		/*
1222 		 * Grow pmap to include allocated address.
1223 		 * If the growth fails, the allocation will fail too.
1224 		 */
1225 		if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
1226 		    uvm_maxkaddr < (*addr + sz)) {
1227 			uvm_map_kmem_grow(map, &dead,
1228 			    *addr + sz - uvm_maxkaddr, flags);
1229 		}
1230 
1231 		/* Check that the space is available. */
1232 		if (flags & UVM_FLAG_UNMAP)
1233 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1234 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1235 			error = ENOMEM;
1236 			goto unlock;
1237 		}
1238 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1239 	    (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE &&
1240 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1241 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1242 		/*
1243 		 * Address used as hint.
1244 		 *
1245 		 * Note: we enforce the alignment restriction,
1246 		 * but ignore pmap_prefer.
1247 		 */
1248 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1249 		/* Run selection algorithm for executables. */
1250 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1251 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1252 
1253 		/* Grow kernel memory and try again. */
1254 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1255 			uvm_map_kmem_grow(map, &dead, sz, flags);
1256 
1257 			error = uvm_addr_invoke(map, map->uaddr_exe,
1258 			    &first, &last, addr, sz,
1259 			    pmap_align, pmap_offset, prot, hint);
1260 		}
1261 
1262 		if (error != 0)
1263 			goto unlock;
1264 	} else {
1265 		/* Update freelists from vmspace. */
1266 		if (map->flags & VM_MAP_ISVMSPACE)
1267 			uvm_map_vmspace_update(map, &dead, flags);
1268 
1269 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1270 		    pmap_align, pmap_offset, prot, hint);
1271 
1272 		/* Grow kernel memory and try again. */
1273 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1274 			uvm_map_kmem_grow(map, &dead, sz, flags);
1275 
1276 			error = uvm_map_findspace(map, &first, &last, addr, sz,
1277 			    pmap_align, pmap_offset, prot, hint);
1278 		}
1279 
1280 		if (error != 0)
1281 			goto unlock;
1282 	}
1283 
1284 	/* Double-check if selected address doesn't cause overflow. */
1285 	if (*addr + sz < *addr) {
1286 		error = ENOMEM;
1287 		goto unlock;
1288 	}
1289 
1290 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
1291 	    uvm_maxkaddr >= *addr + sz);
1292 
1293 	/* If we only want a query, return now. */
1294 	if (flags & UVM_FLAG_QUERY) {
1295 		error = 0;
1296 		goto unlock;
1297 	}
1298 
1299 	if (uobj == NULL)
1300 		uoffset = 0;
1301 	else if (uoffset == UVM_UNKNOWN_OFFSET) {
1302 		KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1303 		uoffset = *addr - vm_map_min(kernel_map);
1304 	}
1305 
1306 	/*
1307 	 * Create new entry.
1308 	 * first and last may be invalidated after this call.
1309 	 */
1310 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1311 	    new);
1312 	if (entry == NULL) {
1313 		error = ENOMEM;
1314 		goto unlock;
1315 	}
1316 	new = NULL;
1317 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1318 	entry->object.uvm_obj = uobj;
1319 	entry->offset = uoffset;
1320 	entry->protection = prot;
1321 	entry->max_protection = maxprot;
1322 	entry->inheritance = inherit;
1323 	entry->wired_count = 0;
1324 	entry->advice = advice;
1325 	if (uobj)
1326 		entry->etype |= UVM_ET_OBJ;
1327 	else if (flags & UVM_FLAG_HOLE)
1328 		entry->etype |= UVM_ET_HOLE;
1329 	if (flags & UVM_FLAG_NOFAULT)
1330 		entry->etype |= UVM_ET_NOFAULT;
1331 	if (flags & UVM_FLAG_COPYONW) {
1332 		entry->etype |= UVM_ET_COPYONWRITE;
1333 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1334 			entry->etype |= UVM_ET_NEEDSCOPY;
1335 	}
1336 	if (flags & UVM_FLAG_OVERLAY) {
1337 		entry->aref.ar_pageoff = 0;
1338 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1339 	}
1340 
1341 	/* Update map and process statistics. */
1342 	if (!(flags & UVM_FLAG_HOLE)) {
1343 		map->size += sz;
1344 		if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL) {
1345 			((struct vmspace *)map)->vm_dused +=
1346 			    uvmspace_dused(map, *addr, *addr + sz);
1347 		}
1348 	}
1349 
1350 	/*
1351 	 * Try to merge entry.
1352 	 *
1353 	 * Userland allocations are kept separated most of the time.
1354 	 * Forego the effort of merging what most of the time can't be merged
1355 	 * and only try the merge if it concerns a kernel entry.
1356 	 */
1357 	if ((flags & UVM_FLAG_NOMERGE) == 0 &&
1358 	    (map->flags & VM_MAP_ISVMSPACE) == 0)
1359 		uvm_mapent_tryjoin(map, entry, &dead);
1360 
1361 unlock:
1362 	vm_map_unlock(map);
1363 
1364 	/*
1365 	 * Remove dead entries.
1366 	 *
1367 	 * Dead entries may be the result of merging.
1368 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1369 	 * destroy free-space entries.
1370 	 */
1371 	if (map->flags & VM_MAP_INTRSAFE)
1372 		uvm_unmap_detach_intrsafe(&dead);
1373 	else
1374 		uvm_unmap_detach(&dead, 0);
1375 out:
1376 	if (new)
1377 		uvm_mapent_free(new);
1378 	return error;
1379 }
1380 
1381 /*
1382  * True iff e1 and e2 can be joined together.
1383  */
1384 int
1385 uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
1386     struct vm_map_entry *e2)
1387 {
1388 	KDASSERT(e1 != NULL && e2 != NULL);
1389 
1390 	/* Must be the same entry type and not have free memory between. */
1391 	if (e1->etype != e2->etype || e1->end != e2->start)
1392 		return 0;
1393 
1394 	/* Submaps are never joined. */
1395 	if (UVM_ET_ISSUBMAP(e1))
1396 		return 0;
1397 
1398 	/* Never merge wired memory. */
1399 	if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
1400 		return 0;
1401 
1402 	/* Protection, inheritance and advice must be equal. */
1403 	if (e1->protection != e2->protection ||
1404 	    e1->max_protection != e2->max_protection ||
1405 	    e1->inheritance != e2->inheritance ||
1406 	    e1->advice != e2->advice)
1407 		return 0;
1408 
1409 	/* If uvm_object: object itself and offsets within object must match. */
1410 	if (UVM_ET_ISOBJ(e1)) {
1411 		if (e1->object.uvm_obj != e2->object.uvm_obj)
1412 			return 0;
1413 		if (e1->offset + (e1->end - e1->start) != e2->offset)
1414 			return 0;
1415 	}
1416 
1417 	/*
1418 	 * Cannot join shared amaps.
1419 	 * Note: no need to lock amap to look at refs, since we don't care
1420 	 * about its exact value.
1421 	 * If it is 1 (i.e. we have the only reference) it will stay there.
1422 	 */
1423 	if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
1424 		return 0;
1425 	if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
1426 		return 0;
1427 
1428 	/* Apparently, e1 and e2 match. */
1429 	return 1;
1430 }
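
/*
 * Example of the object/offset check above (editor's sketch): two entries
 * mapping the same uvm_object are only joinable when the second entry
 * continues exactly where the first one ends in the object, e.g.
 * e1 = [0x1000, 0x3000) at offset 0x0 and e2 = [0x3000, 0x5000) at
 * offset 0x2000, since e1->offset + (e1->end - e1->start) == e2->offset.
 */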
1431 
1432 /*
1433  * Join support function.
1434  *
1435  * Returns the merged entry on success.
1436  * Returns NULL if the merge failed.
1437  */
1438 struct vm_map_entry*
1439 uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
1440     struct vm_map_entry *e2, struct uvm_map_deadq *dead)
1441 {
1442 	struct uvm_addr_state *free;
1443 
1444 	/*
1445 	 * Merging is not supported for map entries that
1446 	 * contain an amap in e1. This should never happen
1447 	 * anyway, because only kernel entries are merged.
1448 	 * These do not contain amaps.
1449 	 * e2 contains no real information in its amap,
1450 	 * so it can be erased immediately.
1451 	 */
1452 	KASSERT(e1->aref.ar_amap == NULL);
1453 
1454 	/*
1455 	 * Don't drop obj reference:
1456 	 * uvm_unmap_detach will do this for us.
1457 	 */
1458 	free = uvm_map_uaddr_e(map, e1);
1459 	uvm_mapent_free_remove(map, free, e1);
1460 
1461 	free = uvm_map_uaddr_e(map, e2);
1462 	uvm_mapent_free_remove(map, free, e2);
1463 	uvm_mapent_addr_remove(map, e2);
1464 	e1->end = e2->end;
1465 	e1->guard = e2->guard;
1466 	e1->fspace = e2->fspace;
1467 	uvm_mapent_free_insert(map, free, e1);
1468 
1469 	DEAD_ENTRY_PUSH(dead, e2);
1470 	return e1;
1471 }
1472 
1473 /*
1474  * Attempt forward and backward joining of entry.
1475  *
1476  * Returns entry after joins.
1477  * We are guaranteed that the amap of entry is either non-existent or
1478  * has never been used.
1479  */
1480 struct vm_map_entry*
1481 uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
1482     struct uvm_map_deadq *dead)
1483 {
1484 	struct vm_map_entry *other;
1485 	struct vm_map_entry *merged;
1486 
1487 	/* Merge with previous entry. */
1488 	other = RBT_PREV(uvm_map_addr, entry);
1489 	if (other && uvm_mapent_isjoinable(map, other, entry)) {
1490 		merged = uvm_mapent_merge(map, other, entry, dead);
1491 		if (merged)
1492 			entry = merged;
1493 	}
1494 
1495 	/*
1496 	 * Merge with next entry.
1497 	 *
1498 	 * Because amap can only extend forward and the next entry
1499 	 * probably contains sensible info, only perform forward merging
1500 	 * in the absence of an amap.
1501 	 */
1502 	other = RBT_NEXT(uvm_map_addr, entry);
1503 	if (other && entry->aref.ar_amap == NULL &&
1504 	    other->aref.ar_amap == NULL &&
1505 	    uvm_mapent_isjoinable(map, entry, other)) {
1506 		merged = uvm_mapent_merge(map, entry, other, dead);
1507 		if (merged)
1508 			entry = merged;
1509 	}
1510 
1511 	return entry;
1512 }
1513 
1514 /*
1515  * Kill entries that are no longer in a map.
1516  */
1517 void
1518 uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
1519 {
1520 	struct vm_map_entry *entry;
1521 	int waitok = flags & UVM_PLA_WAITOK;
1522 
1523 	if (TAILQ_EMPTY(deadq))
1524 		return;
1525 
1526 	KERNEL_LOCK();
1527 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1528 		if (waitok)
1529 			uvm_pause();
1530 		/* Drop reference to amap, if we've got one. */
1531 		if (entry->aref.ar_amap)
1532 			amap_unref(entry->aref.ar_amap,
1533 			    entry->aref.ar_pageoff,
1534 			    atop(entry->end - entry->start),
1535 			    flags & AMAP_REFALL);
1536 
1537 		/* Drop reference to our backing object, if we've got one. */
1538 		if (UVM_ET_ISSUBMAP(entry)) {
1539 			/* ... unlikely to happen, but play it safe */
1540 			uvm_map_deallocate(entry->object.sub_map);
1541 		} else if (UVM_ET_ISOBJ(entry) &&
1542 		    entry->object.uvm_obj->pgops->pgo_detach) {
1543 			entry->object.uvm_obj->pgops->pgo_detach(
1544 			    entry->object.uvm_obj);
1545 		}
1546 
1547 		/* Step to next. */
1548 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1549 		uvm_mapent_free(entry);
1550 	}
1551 	KERNEL_UNLOCK();
1552 }
1553 
1554 void
1555 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
1556 {
1557 	struct vm_map_entry *entry;
1558 
1559 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1560 		KASSERT(entry->aref.ar_amap == NULL);
1561 		KASSERT(!UVM_ET_ISSUBMAP(entry));
1562 		KASSERT(!UVM_ET_ISOBJ(entry));
1563 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1564 		uvm_mapent_free(entry);
1565 	}
1566 }
1567 
1568 /*
1569  * Create and insert new entry.
1570  *
1571  * Returned entry contains new addresses and is inserted properly in the tree.
1572  * first and last are (probably) no longer valid.
1573  */
1574 struct vm_map_entry*
1575 uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
1576     struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
1577     struct uvm_map_deadq *dead, struct vm_map_entry *new)
1578 {
1579 	struct vm_map_entry *entry, *prev;
1580 	struct uvm_addr_state *free;
1581 	vaddr_t min, max;	/* free space boundaries for new entry */
1582 
1583 	KDASSERT(map != NULL);
1584 	KDASSERT(first != NULL);
1585 	KDASSERT(last != NULL);
1586 	KDASSERT(dead != NULL);
1587 	KDASSERT(sz > 0);
1588 	KDASSERT(addr + sz > addr);
1589 	KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
1590 	KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
1591 	KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
1592 	uvm_tree_sanity(map, __FILE__, __LINE__);
1593 
1594 	min = addr + sz;
1595 	max = VMMAP_FREE_END(last);
1596 
1597 	/* Initialize new entry. */
1598 	if (new == NULL)
1599 		entry = uvm_mapent_alloc(map, flags);
1600 	else
1601 		entry = new;
1602 	if (entry == NULL)
1603 		return NULL;
1604 	entry->offset = 0;
1605 	entry->etype = 0;
1606 	entry->wired_count = 0;
1607 	entry->aref.ar_pageoff = 0;
1608 	entry->aref.ar_amap = NULL;
1609 
1610 	entry->start = addr;
1611 	entry->end = min;
1612 	entry->guard = 0;
1613 	entry->fspace = 0;
1614 
1615 	/* Reset free space in first. */
1616 	free = uvm_map_uaddr_e(map, first);
1617 	uvm_mapent_free_remove(map, free, first);
1618 	first->guard = 0;
1619 	first->fspace = 0;
1620 
1621 	/*
1622 	 * Remove all entries that are fully replaced.
1623 	 * We are iterating using last in reverse order.
1624 	 */
1625 	for (; first != last; last = prev) {
1626 		prev = RBT_PREV(uvm_map_addr, last);
1627 
1628 		KDASSERT(last->start == last->end);
1629 		free = uvm_map_uaddr_e(map, last);
1630 		uvm_mapent_free_remove(map, free, last);
1631 		uvm_mapent_addr_remove(map, last);
1632 		DEAD_ENTRY_PUSH(dead, last);
1633 	}
1634 	/* Remove first if it is entirely inside <addr, addr+sz>.  */
1635 	if (first->start == addr) {
1636 		uvm_mapent_addr_remove(map, first);
1637 		DEAD_ENTRY_PUSH(dead, first);
1638 	} else {
1639 		uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
1640 		    addr, flags);
1641 	}
1642 
1643 	/* Finally, link in entry. */
1644 	uvm_mapent_addr_insert(map, entry);
1645 	uvm_map_fix_space(map, entry, min, max, flags);
1646 
1647 	uvm_tree_sanity(map, __FILE__, __LINE__);
1648 	return entry;
1649 }
1650 
1651 
1652 /*
1653  * uvm_mapent_alloc: allocate a map entry
1654  */
1655 struct vm_map_entry *
1656 uvm_mapent_alloc(struct vm_map *map, int flags)
1657 {
1658 	struct vm_map_entry *me, *ne;
1659 	int pool_flags;
1660 	int i;
1661 
1662 	pool_flags = PR_WAITOK;
1663 	if (flags & UVM_FLAG_TRYLOCK)
1664 		pool_flags = PR_NOWAIT;
1665 
1666 	if (map->flags & VM_MAP_INTRSAFE || cold) {
1667 		mtx_enter(&uvm_kmapent_mtx);
1668 		if (SLIST_EMPTY(&uvm.kentry_free)) {
1669 			ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
1670 			    &kd_nowait);
1671 			if (ne == NULL)
1672 				panic("uvm_mapent_alloc: cannot allocate map "
1673 				    "entry");
1674 			for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) {
1675 				SLIST_INSERT_HEAD(&uvm.kentry_free,
1676 				    &ne[i], daddrs.addr_kentry);
1677 			}
1678 			if (ratecheck(&uvm_kmapent_last_warn_time,
1679 			    &uvm_kmapent_warn_rate))
1680 				printf("uvm_mapent_alloc: out of static "
1681 				    "map entries\n");
1682 		}
1683 		me = SLIST_FIRST(&uvm.kentry_free);
1684 		SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry);
1685 		uvmexp.kmapent++;
1686 		mtx_leave(&uvm_kmapent_mtx);
1687 		me->flags = UVM_MAP_STATIC;
1688 	} else if (map == kernel_map) {
1689 		splassert(IPL_NONE);
1690 		me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
1691 		if (me == NULL)
1692 			goto out;
1693 		me->flags = UVM_MAP_KMEM;
1694 	} else {
1695 		splassert(IPL_NONE);
1696 		me = pool_get(&uvm_map_entry_pool, pool_flags);
1697 		if (me == NULL)
1698 			goto out;
1699 		me->flags = 0;
1700 	}
1701 
1702 	if (me != NULL) {
1703 		RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF);
1704 	}
1705 
1706 out:
1707 	return(me);
1708 }
1709 
1710 /*
1711  * uvm_mapent_free: free map entry
1712  *
1713  * => XXX: static pool for kernel map?
1714  */
1715 void
1716 uvm_mapent_free(struct vm_map_entry *me)
1717 {
1718 	if (me->flags & UVM_MAP_STATIC) {
1719 		mtx_enter(&uvm_kmapent_mtx);
1720 		SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry);
1721 		uvmexp.kmapent--;
1722 		mtx_leave(&uvm_kmapent_mtx);
1723 	} else if (me->flags & UVM_MAP_KMEM) {
1724 		splassert(IPL_NONE);
1725 		pool_put(&uvm_map_entry_kmem_pool, me);
1726 	} else {
1727 		splassert(IPL_NONE);
1728 		pool_put(&uvm_map_entry_pool, me);
1729 	}
1730 }
1731 
1732 /*
1733  * uvm_map_lookup_entry: find map entry at or before an address.
1734  *
1735  * => map must at least be read-locked by caller
1736  * => entry is returned in "entry"
1737  * => return value is true if address is in the returned entry
1738  * ET_HOLE entries are considered to not contain a mapping, ergo FALSE is
1739  * returned for those mappings.
1740  */
1741 boolean_t
1742 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1743     struct vm_map_entry **entry)
1744 {
1745 	*entry = uvm_map_entrybyaddr(&map->addr, address);
1746 	return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
1747 	    (*entry)->start <= address && (*entry)->end > address;
1748 }
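
/*
 * Example (editor's sketch): the caller must hold at least a read lock on
 * the map.
 *
 *	struct vm_map_entry *entry;
 *
 *	vm_map_lock_read(map);
 *	if (uvm_map_lookup_entry(map, va, &entry)) {
 *		(va is mapped; entry->start <= va < entry->end)
 *	}
 *	vm_map_unlock_read(map);
 */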
1749 
1750 /*
1751  * uvm_map_pie: return a random load address for a PIE executable
1752  * properly aligned.
1753  */
1754 #ifndef VM_PIE_MAX_ADDR
1755 #define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
1756 #endif
1757 
1758 #ifndef VM_PIE_MIN_ADDR
1759 #define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
1760 #endif
1761 
1762 #ifndef VM_PIE_MIN_ALIGN
1763 #define VM_PIE_MIN_ALIGN PAGE_SIZE
1764 #endif
1765 
1766 vaddr_t
1767 uvm_map_pie(vaddr_t align)
1768 {
1769 	vaddr_t addr, space, min;
1770 
1771 	align = MAX(align, VM_PIE_MIN_ALIGN);
1772 
1773 	/* round up to next alignment */
1774 	min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
1775 
1776 	if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
1777 		return (align);
1778 
1779 	space = (VM_PIE_MAX_ADDR - min) / align;
1780 	space = MIN(space, (u_int32_t)-1);
1781 
1782 	addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
1783 	addr += min;
1784 
1785 	return (addr);
1786 }
1787 
1788 void
1789 uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
1790 {
1791 	struct uvm_map_deadq dead;
1792 
1793 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
1794 	    (end & (vaddr_t)PAGE_MASK) == 0);
1795 	TAILQ_INIT(&dead);
1796 	vm_map_lock(map);
1797 	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
1798 	vm_map_unlock(map);
1799 
1800 	if (map->flags & VM_MAP_INTRSAFE)
1801 		uvm_unmap_detach_intrsafe(&dead);
1802 	else
1803 		uvm_unmap_detach(&dead, 0);
1804 }
1805 
1806 /*
1807  * Mark entry as free.
1808  *
1809  * entry will be put on the dead list.
1810  * The free space will be merged into the previous or a new entry,
1811  * unless markfree is false.
1812  */
1813 void
1814 uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
1815     struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
1816     boolean_t markfree)
1817 {
1818 	struct uvm_addr_state	*free;
1819 	struct vm_map_entry	*prev;
1820 	vaddr_t			 addr;	/* Start of freed range. */
1821 	vaddr_t			 end;	/* End of freed range. */
1822 
1823 	prev = *prev_ptr;
1824 	if (prev == entry)
1825 		*prev_ptr = prev = NULL;
1826 
1827 	if (prev == NULL ||
1828 	    VMMAP_FREE_END(prev) != entry->start)
1829 		prev = RBT_PREV(uvm_map_addr, entry);
1830 
1831 	/* Entry is describing only free memory and has nothing to drain into. */
1832 	if (prev == NULL && entry->start == entry->end && markfree) {
1833 		*prev_ptr = entry;
1834 		return;
1835 	}
1836 
1837 	addr = entry->start;
1838 	end = VMMAP_FREE_END(entry);
1839 	free = uvm_map_uaddr_e(map, entry);
1840 	uvm_mapent_free_remove(map, free, entry);
1841 	uvm_mapent_addr_remove(map, entry);
1842 	DEAD_ENTRY_PUSH(dead, entry);
1843 
1844 	if (markfree) {
1845 		if (prev) {
1846 			free = uvm_map_uaddr_e(map, prev);
1847 			uvm_mapent_free_remove(map, free, prev);
1848 		}
1849 		*prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
1850 	}
1851 }
1852 
1853 /*
1854  * Unwire and release referenced amap and object from map entry.
1855  */
1856 void
1857 uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
1858 {
1859 	/* Unwire removed map entry. */
1860 	if (VM_MAPENT_ISWIRED(entry)) {
1861 		KERNEL_LOCK();
1862 		entry->wired_count = 0;
1863 		uvm_fault_unwire_locked(map, entry->start, entry->end);
1864 		KERNEL_UNLOCK();
1865 	}
1866 
1867 	/* Entry-type specific code. */
1868 	if (UVM_ET_ISHOLE(entry)) {
1869 		/* Nothing to be done for holes. */
1870 	} else if (map->flags & VM_MAP_INTRSAFE) {
1871 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1872 		uvm_km_pgremove_intrsafe(entry->start, entry->end);
1873 		pmap_kremove(entry->start, entry->end - entry->start);
1874 	} else if (UVM_ET_ISOBJ(entry) &&
1875 	    UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
1876 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1877 		/*
1878 		 * Note: kernel object mappings are currently used in
1879 		 * two ways:
1880 		 *  [1] "normal" mappings of pages in the kernel object
1881 		 *  [2] uvm_km_valloc'd allocations in which we
1882 		 *      pmap_enter in some non-kernel-object page
1883 		 *      (e.g. vmapbuf).
1884 		 *
1885 		 * for case [1], we need to remove the mapping from
1886 		 * the pmap and then remove the page from the kernel
1887 		 * object (because, once pages in a kernel object are
1888 		 * unmapped they are no longer needed, unlike, say,
1889 		 * a vnode where you might want the data to persist
1890 		 * until flushed out of a queue).
1891 		 *
1892 		 * for case [2], we need to remove the mapping from
1893 		 * the pmap.  there shouldn't be any pages at the
1894 		 * specified offset in the kernel object [but it
1895 		 * doesn't hurt to call uvm_km_pgremove just to be
1896 		 * safe?]
1897 		 *
1898 		 * uvm_km_pgremove currently does the following:
1899 		 *   for pages in the kernel object range:
1900 		 *     - drops the swap slot
1901 		 *     - uvm_pagefree the page
1902 		 *
		 * note there is a version of uvm_km_pgremove() that
1904 		 * is used for "intrsafe" objects.
1905 		 */
1906 		/*
1907 		 * remove mappings from pmap and drop the pages
1908 		 * from the object.  offsets are always relative
1909 		 * to vm_map_min(kernel_map).
1910 		 */
1911 		pmap_remove(pmap_kernel(), entry->start, entry->end);
1912 		uvm_km_pgremove(entry->object.uvm_obj,
1913 		    entry->start - vm_map_min(kernel_map),
1914 		    entry->end - vm_map_min(kernel_map));
1915 
1916 		/*
1917 		 * null out kernel_object reference, we've just
1918 		 * dropped it
1919 		 */
1920 		entry->etype &= ~UVM_ET_OBJ;
1921 		entry->object.uvm_obj = NULL;  /* to be safe */
1922 	} else {
1923 		/* remove mappings the standard way. */
1924 		pmap_remove(map->pmap, entry->start, entry->end);
1925 	}
1926 }
1927 
1928 /*
1929  * Remove all entries from start to end.
1930  *
1931  * If remove_holes, then remove ET_HOLE entries as well.
 * If markfree, the entries will be properly marked free; otherwise, no
 * replacement entries will be put in the tree (corrupting the tree).
1934  */
1935 void
1936 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
1937     struct uvm_map_deadq *dead, boolean_t remove_holes,
1938     boolean_t markfree)
1939 {
1940 	struct vm_map_entry *prev_hint, *next, *entry;
1941 
1942 	start = MAX(start, map->min_offset);
1943 	end = MIN(end, map->max_offset);
1944 	if (start >= end)
1945 		return;
1946 
1947 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1948 		splassert(IPL_NONE);
1949 	else
1950 		splassert(IPL_VM);
1951 
1952 	/* Find first affected entry. */
1953 	entry = uvm_map_entrybyaddr(&map->addr, start);
1954 	KDASSERT(entry != NULL && entry->start <= start);
1955 	if (entry->end <= start && markfree)
1956 		entry = RBT_NEXT(uvm_map_addr, entry);
1957 	else
1958 		UVM_MAP_CLIP_START(map, entry, start);
1959 
1960 	/*
1961 	 * Iterate entries until we reach end address.
1962 	 * prev_hint hints where the freed space can be appended to.
1963 	 */
1964 	prev_hint = NULL;
1965 	for (; entry != NULL && entry->start < end; entry = next) {
1966 		KDASSERT(entry->start >= start);
1967 		if (entry->end > end || !markfree)
1968 			UVM_MAP_CLIP_END(map, entry, end);
1969 		KDASSERT(entry->start >= start && entry->end <= end);
1970 		next = RBT_NEXT(uvm_map_addr, entry);
1971 
1972 		/* Don't remove holes unless asked to do so. */
1973 		if (UVM_ET_ISHOLE(entry)) {
1974 			if (!remove_holes) {
1975 				prev_hint = entry;
1976 				continue;
1977 			}
1978 		}
1979 
1980 		/* Kill entry. */
1981 		uvm_unmap_kill_entry(map, entry);
1982 
1983 		/* Update space usage. */
1984 		if ((map->flags & VM_MAP_ISVMSPACE) &&
1985 		    entry->object.uvm_obj == NULL &&
1986 		    !UVM_ET_ISHOLE(entry)) {
1987 			((struct vmspace *)map)->vm_dused -=
1988 			    uvmspace_dused(map, entry->start, entry->end);
1989 		}
1990 		if (!UVM_ET_ISHOLE(entry))
1991 			map->size -= entry->end - entry->start;
1992 
1993 		/* Actual removal of entry. */
1994 		uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
1995 	}
1996 
1997 	pmap_update(vm_map_pmap(map));
1998 
1999 #ifdef VMMAP_DEBUG
2000 	if (markfree) {
2001 		for (entry = uvm_map_entrybyaddr(&map->addr, start);
2002 		    entry != NULL && entry->start < end;
2003 		    entry = RBT_NEXT(uvm_map_addr, entry)) {
2004 			KDASSERT(entry->end <= start ||
2005 			    entry->start == entry->end ||
2006 			    UVM_ET_ISHOLE(entry));
2007 		}
2008 	} else {
2009 		vaddr_t a;
2010 		for (a = start; a < end; a += PAGE_SIZE)
2011 			KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
2012 	}
2013 #endif
2014 }
2015 
2016 /*
2017  * Mark all entries from first until end (exclusive) as pageable.
2018  *
2019  * Lock must be exclusive on entry and will not be touched.
2020  */
2021 void
2022 uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
2023     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
2024 {
2025 	struct vm_map_entry *iter;
2026 
2027 	for (iter = first; iter != end;
2028 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2029 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2030 		if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2031 			continue;
2032 
2033 		iter->wired_count = 0;
2034 		uvm_fault_unwire_locked(map, iter->start, iter->end);
2035 	}
2036 }
2037 
2038 /*
2039  * Mark all entries from first until end (exclusive) as wired.
2040  *
2041  * Lockflags determines the lock state on return from this function.
2042  * Lock must be exclusive on entry.
2043  */
2044 int
2045 uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
2046     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
2047     int lockflags)
2048 {
2049 	struct vm_map_entry *iter;
2050 #ifdef DIAGNOSTIC
2051 	unsigned int timestamp_save;
2052 #endif
2053 	int error;
2054 
2055 	/*
2056 	 * Wire pages in two passes:
2057 	 *
2058 	 * 1: holding the write lock, we create any anonymous maps that need
2059 	 *    to be created.  then we clip each map entry to the region to
2060 	 *    be wired and increment its wiring count.
2061 	 *
2062 	 * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
2063 	 *    in the pages for any newly wired area (wired_count == 1).
2064 	 *
2065 	 *    downgrading to a read lock for uvm_fault_wire avoids a possible
2066 	 *    deadlock with another thread that may have faulted on one of
2067 	 *    the pages to be wired (it would mark the page busy, blocking
2068 	 *    us, then in turn block on the map lock that we hold).
2069 	 *    because we keep the read lock on the map, the copy-on-write
2070 	 *    status of the entries we modify here cannot change.
2071 	 */
2072 	for (iter = first; iter != end;
2073 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2074 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2075 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2076 		    iter->protection == PROT_NONE)
2077 			continue;
2078 
2079 		/*
2080 		 * Perform actions of vm_map_lookup that need the write lock.
2081 		 * - create an anonymous map for copy-on-write
2082 		 * - anonymous map for zero-fill
2083 		 * Skip submaps.
2084 		 */
2085 		if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
2086 		    UVM_ET_ISNEEDSCOPY(iter) &&
2087 		    ((iter->protection & PROT_WRITE) ||
2088 		    iter->object.uvm_obj == NULL)) {
2089 			amap_copy(map, iter, M_WAITOK, TRUE,
2090 			    iter->start, iter->end);
2091 		}
2092 		iter->wired_count++;
2093 	}
2094 
2095 	/*
2096 	 * Pass 2.
2097 	 */
2098 #ifdef DIAGNOSTIC
2099 	timestamp_save = map->timestamp;
2100 #endif
2101 	vm_map_busy(map);
2102 	vm_map_downgrade(map);
2103 
2104 	error = 0;
2105 	for (iter = first; error == 0 && iter != end;
2106 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2107 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2108 		    iter->protection == PROT_NONE)
2109 			continue;
2110 
2111 		error = uvm_fault_wire(map, iter->start, iter->end,
2112 		    iter->protection);
2113 	}
2114 
2115 	if (error) {
2116 		/*
2117 		 * uvm_fault_wire failure
2118 		 *
2119 		 * Reacquire lock and undo our work.
2120 		 */
2121 		vm_map_upgrade(map);
2122 		vm_map_unbusy(map);
2123 #ifdef DIAGNOSTIC
2124 		if (timestamp_save != map->timestamp)
2125 			panic("uvm_map_pageable_wire: stale map");
2126 #endif
2127 
2128 		/*
2129 		 * first is no longer needed to restart loops.
2130 		 * Use it as iterator to unmap successful mappings.
2131 		 */
2132 		for (; first != iter;
2133 		    first = RBT_NEXT(uvm_map_addr, first)) {
2134 			if (UVM_ET_ISHOLE(first) ||
2135 			    first->start == first->end ||
2136 			    first->protection == PROT_NONE)
2137 				continue;
2138 
2139 			first->wired_count--;
2140 			if (!VM_MAPENT_ISWIRED(first)) {
2141 				uvm_fault_unwire_locked(map,
				    first->start, first->end);
2143 			}
2144 		}
2145 
2146 		/* decrease counter in the rest of the entries */
2147 		for (; iter != end;
2148 		    iter = RBT_NEXT(uvm_map_addr, iter)) {
2149 			if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2150 			    iter->protection == PROT_NONE)
2151 				continue;
2152 
2153 			iter->wired_count--;
2154 		}
2155 
2156 		if ((lockflags & UVM_LK_EXIT) == 0)
2157 			vm_map_unlock(map);
2158 		return error;
2159 	}
2160 
2161 	/* We are currently holding a read lock. */
2162 	if ((lockflags & UVM_LK_EXIT) == 0) {
2163 		vm_map_unbusy(map);
2164 		vm_map_unlock_read(map);
2165 	} else {
2166 		vm_map_upgrade(map);
2167 		vm_map_unbusy(map);
2168 #ifdef DIAGNOSTIC
2169 		if (timestamp_save != map->timestamp)
2170 			panic("uvm_map_pageable_wire: stale map");
2171 #endif
2172 	}
2173 	return 0;
2174 }
2175 
2176 /*
2177  * uvm_map_pageable: set pageability of a range in a map.
2178  *
2179  * Flags:
2180  * UVM_LK_ENTER: map is already locked by caller
2181  * UVM_LK_EXIT:  don't unlock map on exit
2182  *
2183  * The full range must be in use (entries may not have fspace != 0).
2184  * UVM_ET_HOLE counts as unmapped.
2185  */
2186 int
2187 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
2188     boolean_t new_pageable, int lockflags)
2189 {
2190 	struct vm_map_entry *first, *last, *tmp;
2191 	int error;
2192 
2193 	start = trunc_page(start);
2194 	end = round_page(end);
2195 
2196 	if (start > end)
2197 		return EINVAL;
2198 	if (start == end)
2199 		return 0;	/* nothing to do */
2200 	if (start < map->min_offset)
2201 		return EFAULT; /* why? see first XXX below */
2202 	if (end > map->max_offset)
2203 		return EINVAL; /* why? see second XXX below */
2204 
2205 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2206 	if ((lockflags & UVM_LK_ENTER) == 0)
2207 		vm_map_lock(map);
2208 
2209 	/*
2210 	 * Find first entry.
2211 	 *
2212 	 * Initial test on start is different, because of the different
2213 	 * error returned. Rest is tested further down.
2214 	 */
2215 	first = uvm_map_entrybyaddr(&map->addr, start);
2216 	if (first->end <= start || UVM_ET_ISHOLE(first)) {
2217 		/*
2218 		 * XXX if the first address is not mapped, it is EFAULT?
2219 		 */
2220 		error = EFAULT;
2221 		goto out;
2222 	}
2223 
2224 	/* Check that the range has no holes. */
2225 	for (last = first; last != NULL && last->start < end;
2226 	    last = RBT_NEXT(uvm_map_addr, last)) {
2227 		if (UVM_ET_ISHOLE(last) ||
2228 		    (last->end < end && VMMAP_FREE_END(last) != last->end)) {
2229 			/*
2230 			 * XXX unmapped memory in range, why is it EINVAL
2231 			 * instead of EFAULT?
2232 			 */
2233 			error = EINVAL;
2234 			goto out;
2235 		}
2236 	}
2237 
2238 	/*
2239 	 * Last ended at the first entry after the range.
2240 	 * Move back one step.
2241 	 *
2242 	 * Note that last may be NULL.
2243 	 */
2244 	if (last == NULL) {
2245 		last = RBT_MAX(uvm_map_addr, &map->addr);
2246 		if (last->end < end) {
2247 			error = EINVAL;
2248 			goto out;
2249 		}
2250 	} else {
2251 		KASSERT(last != first);
2252 		last = RBT_PREV(uvm_map_addr, last);
2253 	}
2254 
2255 	/* Wire/unwire pages here. */
2256 	if (new_pageable) {
2257 		/*
2258 		 * Mark pageable.
2259 		 * entries that are not wired are untouched.
2260 		 */
2261 		if (VM_MAPENT_ISWIRED(first))
2262 			UVM_MAP_CLIP_START(map, first, start);
2263 		/*
2264 		 * Split last at end.
2265 		 * Make tmp be the first entry after what is to be touched.
2266 		 * If last is not wired, don't touch it.
2267 		 */
2268 		if (VM_MAPENT_ISWIRED(last)) {
2269 			UVM_MAP_CLIP_END(map, last, end);
2270 			tmp = RBT_NEXT(uvm_map_addr, last);
2271 		} else
2272 			tmp = last;
2273 
2274 		uvm_map_pageable_pgon(map, first, tmp, start, end);
2275 		error = 0;
2276 
2277 out:
2278 		if ((lockflags & UVM_LK_EXIT) == 0)
2279 			vm_map_unlock(map);
2280 		return error;
2281 	} else {
2282 		/*
2283 		 * Mark entries wired.
2284 		 * entries are always touched (because recovery needs this).
2285 		 */
2286 		if (!VM_MAPENT_ISWIRED(first))
2287 			UVM_MAP_CLIP_START(map, first, start);
2288 		/*
2289 		 * Split last at end.
2290 		 * Make tmp be the first entry after what is to be touched.
2291 		 * If last is not wired, don't touch it.
2292 		 */
2293 		if (!VM_MAPENT_ISWIRED(last)) {
2294 			UVM_MAP_CLIP_END(map, last, end);
2295 			tmp = RBT_NEXT(uvm_map_addr, last);
2296 		} else
2297 			tmp = last;
2298 
2299 		return uvm_map_pageable_wire(map, first, tmp, start, end,
2300 		    lockflags);
2301 	}
2302 }
2303 
2304 /*
2305  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
2306  * all mapped regions.
2307  *
2308  * Map must not be locked.
 * If no flags are specified, all regions are unwired.
2310  */
2311 int
2312 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
2313 {
2314 	vsize_t size;
2315 	struct vm_map_entry *iter;
2316 
2317 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2318 	vm_map_lock(map);
2319 
2320 	if (flags == 0) {
2321 		uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr),
2322 		    NULL, map->min_offset, map->max_offset);
2323 
2324 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
2325 		vm_map_unlock(map);
2326 		return 0;
2327 	}
2328 
2329 	if (flags & MCL_FUTURE)
2330 		vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
2331 	if (!(flags & MCL_CURRENT)) {
2332 		vm_map_unlock(map);
2333 		return 0;
2334 	}
2335 
2336 	/*
2337 	 * Count number of pages in all non-wired entries.
2338 	 * If the number exceeds the limit, abort.
2339 	 */
2340 	size = 0;
2341 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2342 		if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2343 			continue;
2344 
2345 		size += iter->end - iter->start;
2346 	}
2347 
2348 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
2349 		vm_map_unlock(map);
2350 		return ENOMEM;
2351 	}
2352 
2353 	/* XXX non-pmap_wired_count case must be handled by caller */
2354 #ifdef pmap_wired_count
2355 	if (limit != 0 &&
2356 	    size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
2357 		vm_map_unlock(map);
2358 		return ENOMEM;
2359 	}
2360 #endif
2361 
2362 	/*
	 * uvm_map_pageable_wire will release the lock.
2364 	 */
2365 	return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr),
2366 	    NULL, map->min_offset, map->max_offset, 0);
2367 }
2368 
2369 /*
2370  * Initialize map.
2371  *
2372  * Allocates sufficient entries to describe the free memory in the map.
2373  */
2374 void
2375 uvm_map_setup(struct vm_map *map, vaddr_t min, vaddr_t max, int flags)
2376 {
2377 	int i;
2378 
2379 	KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
2380 	KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
2381 	    (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
2382 
2383 	/*
2384 	 * Update parameters.
2385 	 *
2386 	 * This code handles (vaddr_t)-1 and other page mask ending addresses
2387 	 * properly.
2388 	 * We lose the top page if the full virtual address space is used.
2389 	 */
2390 	if (max & (vaddr_t)PAGE_MASK) {
2391 		max += 1;
2392 		if (max == 0) /* overflow */
2393 			max -= PAGE_SIZE;
2394 	}
2395 
2396 	RBT_INIT(uvm_map_addr, &map->addr);
2397 	map->uaddr_exe = NULL;
2398 	for (i = 0; i < nitems(map->uaddr_any); ++i)
2399 		map->uaddr_any[i] = NULL;
2400 	map->uaddr_brk_stack = NULL;
2401 
2402 	map->size = 0;
2403 	map->ref_count = 0;
2404 	map->min_offset = min;
2405 	map->max_offset = max;
2406 	map->b_start = map->b_end = 0; /* Empty brk() area by default. */
2407 	map->s_start = map->s_end = 0; /* Empty stack area by default. */
2408 	map->flags = flags;
2409 	map->timestamp = 0;
2410 	rw_init(&map->lock, "vmmaplk");
2411 	mtx_init(&map->mtx, IPL_VM);
2412 	mtx_init(&map->flags_lock, IPL_VM);
2413 
2414 	/* Configure the allocators. */
2415 	if (flags & VM_MAP_ISVMSPACE)
2416 		uvm_map_setup_md(map);
2417 	else
2418 		map->uaddr_any[3] = &uaddr_kbootstrap;
2419 
2420 	/*
2421 	 * Fill map entries.
2422 	 * We do not need to write-lock the map here because only the current
2423 	 * thread sees it right now. Initialize ref_count to 0 above to avoid
2424 	 * bogus triggering of lock-not-held assertions.
2425 	 */
2426 	uvm_map_setup_entries(map);
2427 	uvm_tree_sanity(map, __FILE__, __LINE__);
2428 	map->ref_count = 1;
2429 }
2430 
2431 /*
2432  * Destroy the map.
2433  *
2434  * This is the inverse operation to uvm_map_setup.
2435  */
2436 void
2437 uvm_map_teardown(struct vm_map *map)
2438 {
2439 	struct uvm_map_deadq	 dead_entries;
2440 	struct vm_map_entry	*entry, *tmp;
2441 #ifdef VMMAP_DEBUG
2442 	size_t			 numq, numt;
2443 #endif
2444 	int			 i;
2445 
2446 	KERNEL_ASSERT_LOCKED();
2447 	KERNEL_UNLOCK();
2448 	KERNEL_ASSERT_UNLOCKED();
2449 
2450 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
2451 
2452 	/* Remove address selectors. */
2453 	uvm_addr_destroy(map->uaddr_exe);
2454 	map->uaddr_exe = NULL;
2455 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2456 		uvm_addr_destroy(map->uaddr_any[i]);
2457 		map->uaddr_any[i] = NULL;
2458 	}
2459 	uvm_addr_destroy(map->uaddr_brk_stack);
2460 	map->uaddr_brk_stack = NULL;
2461 
2462 	/*
2463 	 * Remove entries.
2464 	 *
2465 	 * The following is based on graph breadth-first search.
2466 	 *
2467 	 * In color terms:
2468 	 * - the dead_entries set contains all nodes that are reachable
2469 	 *   (i.e. both the black and the grey nodes)
2470 	 * - any entry not in dead_entries is white
	 * - any entry that appears in dead_entries before entry
	 *   is black; the rest are grey.
2473 	 * The set [entry, end] is also referred to as the wavefront.
2474 	 *
2475 	 * Since the tree is always a fully connected graph, the breadth-first
2476 	 * search guarantees that each vmmap_entry is visited exactly once.
2477 	 * The vm_map is broken down in linear time.
2478 	 */
2479 	TAILQ_INIT(&dead_entries);
2480 	if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL)
2481 		DEAD_ENTRY_PUSH(&dead_entries, entry);
2482 	while (entry != NULL) {
2483 		sched_pause();
2484 		uvm_unmap_kill_entry(map, entry);
2485 		if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL)
2486 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2487 		if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
2488 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2489 		/* Update wave-front. */
2490 		entry = TAILQ_NEXT(entry, dfree.deadq);
2491 	}
2492 
2493 #ifdef VMMAP_DEBUG
2494 	numt = numq = 0;
2495 	RBT_FOREACH(entry, uvm_map_addr, &map->addr)
2496 		numt++;
2497 	TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
2498 		numq++;
2499 	KASSERT(numt == numq);
2500 #endif
2501 	uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
2502 
2503 	KERNEL_LOCK();
2504 
2505 	pmap_destroy(map->pmap);
2506 	map->pmap = NULL;
2507 }
2508 
2509 /*
2510  * Populate map with free-memory entries.
2511  *
2512  * Map must be initialized and empty.
2513  */
2514 void
2515 uvm_map_setup_entries(struct vm_map *map)
2516 {
2517 	KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
2518 
2519 	uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
2520 }
2521 
2522 /*
2523  * Split entry at given address.
2524  *
2525  * orig:  entry that is to be split.
2526  * next:  a newly allocated map entry that is not linked.
2527  * split: address at which the split is done.
2528  */
2529 void
2530 uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
2531     struct vm_map_entry *next, vaddr_t split)
2532 {
2533 	struct uvm_addr_state *free, *free_before;
2534 	vsize_t adj;
2535 
2536 	if ((split & PAGE_MASK) != 0) {
2537 		panic("uvm_map_splitentry: split address 0x%lx "
2538 		    "not on page boundary!", split);
2539 	}
2540 	KDASSERT(map != NULL && orig != NULL && next != NULL);
2541 	uvm_tree_sanity(map, __FILE__, __LINE__);
2542 	KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
2543 
2544 #ifdef VMMAP_DEBUG
2545 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig);
2546 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next);
2547 #endif /* VMMAP_DEBUG */
2548 
2549 	/*
2550 	 * Free space will change, unlink from free space tree.
2551 	 */
2552 	free = uvm_map_uaddr_e(map, orig);
2553 	uvm_mapent_free_remove(map, free, orig);
2554 
2555 	adj = split - orig->start;
2556 
2557 	uvm_mapent_copy(orig, next);
2558 	if (split >= orig->end) {
2559 		next->etype = 0;
2560 		next->offset = 0;
2561 		next->wired_count = 0;
2562 		next->start = next->end = split;
2563 		next->guard = 0;
2564 		next->fspace = VMMAP_FREE_END(orig) - split;
2565 		next->aref.ar_amap = NULL;
2566 		next->aref.ar_pageoff = 0;
2567 		orig->guard = MIN(orig->guard, split - orig->end);
2568 		orig->fspace = split - VMMAP_FREE_START(orig);
2569 	} else {
2570 		orig->fspace = 0;
2571 		orig->guard = 0;
2572 		orig->end = next->start = split;
2573 
2574 		if (next->aref.ar_amap) {
2575 			KERNEL_LOCK();
2576 			amap_splitref(&orig->aref, &next->aref, adj);
2577 			KERNEL_UNLOCK();
2578 		}
2579 		if (UVM_ET_ISSUBMAP(orig)) {
2580 			uvm_map_reference(next->object.sub_map);
2581 			next->offset += adj;
2582 		} else if (UVM_ET_ISOBJ(orig)) {
2583 			if (next->object.uvm_obj->pgops &&
2584 			    next->object.uvm_obj->pgops->pgo_reference) {
2585 				KERNEL_LOCK();
2586 				next->object.uvm_obj->pgops->pgo_reference(
2587 				    next->object.uvm_obj);
2588 				KERNEL_UNLOCK();
2589 			}
2590 			next->offset += adj;
2591 		}
2592 	}
2593 
2594 	/*
2595 	 * Link next into address tree.
2596 	 * Link orig and next into free-space tree.
2597 	 *
2598 	 * Don't insert 'next' into the addr tree until orig has been linked,
	 * in case the free-list looks at adjacent entries in the addr tree
2600 	 * for its decisions.
2601 	 */
2602 	if (orig->fspace > 0)
2603 		free_before = free;
2604 	else
2605 		free_before = uvm_map_uaddr_e(map, orig);
2606 	uvm_mapent_free_insert(map, free_before, orig);
2607 	uvm_mapent_addr_insert(map, next);
2608 	uvm_mapent_free_insert(map, free, next);
2609 
2610 	uvm_tree_sanity(map, __FILE__, __LINE__);
2611 }
2612 
2613 
2614 #ifdef VMMAP_DEBUG
2615 
2616 void
2617 uvm_tree_assert(struct vm_map *map, int test, char *test_str,
2618     char *file, int line)
2619 {
2620 	char* map_special;
2621 
2622 	if (test)
2623 		return;
2624 
2625 	if (map == kernel_map)
2626 		map_special = " (kernel_map)";
2627 	else if (map == kmem_map)
2628 		map_special = " (kmem_map)";
2629 	else
2630 		map_special = "";
2631 	panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
2632 	    line, test_str);
2633 }
2634 
2635 /*
2636  * Check that map is sane.
2637  */
2638 void
2639 uvm_tree_sanity(struct vm_map *map, char *file, int line)
2640 {
2641 	struct vm_map_entry	*iter;
2642 	vaddr_t			 addr;
2643 	vaddr_t			 min, max, bound; /* Bounds checker. */
2644 	struct uvm_addr_state	*free;
2645 
2646 	addr = vm_map_min(map);
2647 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2648 		/*
2649 		 * Valid start, end.
2650 		 * Catch overflow for end+fspace.
2651 		 */
2652 		UVM_ASSERT(map, iter->end >= iter->start, file, line);
2653 		UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
2654 
2655 		/* May not be empty. */
2656 		UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
2657 		    file, line);
2658 
2659 		/* Addresses for entry must lie within map boundaries. */
2660 		UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
2661 		    VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
2662 
2663 		/* Tree may not have gaps. */
2664 		UVM_ASSERT(map, iter->start == addr, file, line);
2665 		addr = VMMAP_FREE_END(iter);
2666 
2667 		/*
2668 		 * Free space may not cross boundaries, unless the same
2669 		 * free list is used on both sides of the border.
2670 		 */
2671 		min = VMMAP_FREE_START(iter);
2672 		max = VMMAP_FREE_END(iter);
2673 
2674 		while (min < max &&
2675 		    (bound = uvm_map_boundary(map, min, max)) != max) {
2676 			UVM_ASSERT(map,
2677 			    uvm_map_uaddr(map, bound - 1) ==
2678 			    uvm_map_uaddr(map, bound),
2679 			    file, line);
2680 			min = bound;
2681 		}
2682 
2683 		free = uvm_map_uaddr_e(map, iter);
2684 		if (free) {
2685 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
2686 			    file, line);
2687 		} else {
2688 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
2689 			    file, line);
2690 		}
2691 	}
2692 	UVM_ASSERT(map, addr == vm_map_max(map), file, line);
2693 }
2694 
2695 void
2696 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
2697 {
2698 	struct vm_map_entry *iter;
2699 	vsize_t size;
2700 
2701 	size = 0;
2702 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2703 		if (!UVM_ET_ISHOLE(iter))
2704 			size += iter->end - iter->start;
2705 	}
2706 
2707 	if (map->size != size)
2708 		printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
2709 	UVM_ASSERT(map, map->size == size, file, line);
2710 
2711 	vmspace_validate(map);
2712 }
2713 
2714 /*
2715  * This function validates the statistics on vmspace.
2716  */
2717 void
2718 vmspace_validate(struct vm_map *map)
2719 {
2720 	struct vmspace *vm;
2721 	struct vm_map_entry *iter;
2722 	vaddr_t imin, imax;
2723 	vaddr_t stack_begin, stack_end; /* Position of stack. */
2724 	vsize_t stack, heap; /* Measured sizes. */
2725 
2726 	if (!(map->flags & VM_MAP_ISVMSPACE))
2727 		return;
2728 
2729 	vm = (struct vmspace *)map;
2730 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2731 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2732 
2733 	stack = heap = 0;
2734 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2735 		imin = imax = iter->start;
2736 
2737 		if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL)
2738 			continue;
2739 
2740 		/*
2741 		 * Update stack, heap.
2742 		 * Keep in mind that (theoretically) the entries of
2743 		 * userspace and stack may be joined.
2744 		 */
2745 		while (imin != iter->end) {
2746 			/*
2747 			 * Set imax to the first boundary crossed between
2748 			 * imin and stack addresses.
2749 			 */
2750 			imax = iter->end;
2751 			if (imin < stack_begin && imax > stack_begin)
2752 				imax = stack_begin;
2753 			else if (imin < stack_end && imax > stack_end)
2754 				imax = stack_end;
2755 
2756 			if (imin >= stack_begin && imin < stack_end)
2757 				stack += imax - imin;
2758 			else
2759 				heap += imax - imin;
2760 			imin = imax;
2761 		}
2762 	}
2763 
2764 	heap >>= PAGE_SHIFT;
2765 	if (heap != vm->vm_dused) {
2766 		printf("vmspace stack range: 0x%lx-0x%lx\n",
2767 		    stack_begin, stack_end);
2768 		panic("vmspace_validate: vmspace.vm_dused invalid, "
2769 		    "expected %ld pgs, got %ld pgs in map %p",
2770 		    heap, vm->vm_dused,
2771 		    map);
2772 	}
2773 }
2774 
2775 #endif /* VMMAP_DEBUG */
2776 
2777 /*
2778  * uvm_map_init: init mapping system at boot time.   note that we allocate
2779  * and init the static pool of structs vm_map_entry for the kernel here.
2780  */
2781 void
2782 uvm_map_init(void)
2783 {
2784 	static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
2785 	int lcv;
2786 
2787 	/* now set up static pool of kernel map entries ... */
2788 	mtx_init(&uvm_kmapent_mtx, IPL_VM);
2789 	SLIST_INIT(&uvm.kentry_free);
2790 	for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
2791 		SLIST_INSERT_HEAD(&uvm.kentry_free,
2792 		    &kernel_map_entry[lcv], daddrs.addr_kentry);
2793 	}
2794 
2795 	/* initialize the map-related pools. */
2796 	pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0,
2797 	    IPL_NONE, PR_WAITOK, "vmsppl", NULL);
2798 	pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0,
2799 	    IPL_VM, PR_WAITOK, "vmmpepl", NULL);
2800 	pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0,
2801 	    IPL_NONE, 0, "vmmpekpl", NULL);
2802 	pool_sethiwat(&uvm_map_entry_pool, 8192);
2803 
2804 	uvm_addr_init();
2805 }
2806 
2807 #if defined(DDB)
2808 
2809 /*
2810  * DDB hooks
2811  */
2812 
2813 /*
2814  * uvm_map_printit: actually prints the map
2815  */
2816 void
2817 uvm_map_printit(struct vm_map *map, boolean_t full,
2818     int (*pr)(const char *, ...))
2819 {
2820 	struct vmspace			*vm;
2821 	struct vm_map_entry		*entry;
2822 	struct uvm_addr_state		*free;
2823 	int				 in_free, i;
2824 	char				 buf[8];
2825 
2826 	(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
2827 	(*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
2828 	    map->b_start, map->b_end);
2829 	(*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
2830 	    map->s_start, map->s_end);
2831 	(*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
2832 	    map->size, map->ref_count, map->timestamp,
2833 	    map->flags);
2834 	(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
2835 	    pmap_resident_count(map->pmap));
2836 
2837 	/* struct vmspace handling. */
2838 	if (map->flags & VM_MAP_ISVMSPACE) {
2839 		vm = (struct vmspace *)map;
2840 
2841 		(*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
2842 		    vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
2843 		(*pr)("\tvm_tsize=%u vm_dsize=%u\n",
2844 		    vm->vm_tsize, vm->vm_dsize);
2845 		(*pr)("\tvm_taddr=%p vm_daddr=%p\n",
2846 		    vm->vm_taddr, vm->vm_daddr);
2847 		(*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
2848 		    vm->vm_maxsaddr, vm->vm_minsaddr);
2849 	}
2850 
2851 	if (!full)
2852 		goto print_uaddr;
2853 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
2854 		(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
2855 		    entry, entry->start, entry->end, entry->object.uvm_obj,
2856 		    (long long)entry->offset, entry->aref.ar_amap,
2857 		    entry->aref.ar_pageoff);
2858 		(*pr)("\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
2859 		    "wc=%d, adv=%d\n",
2860 		    (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
2861 		    (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
2862 		    (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
2863 		    entry->protection, entry->max_protection,
2864 		    entry->inheritance, entry->wired_count, entry->advice);
2865 
2866 		free = uvm_map_uaddr_e(map, entry);
2867 		in_free = (free != NULL);
2868 		(*pr)("\thole=%c, free=%c, guard=0x%lx, "
2869 		    "free=0x%lx-0x%lx\n",
2870 		    (entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
2871 		    in_free ? 'T' : 'F',
2872 		    entry->guard,
2873 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
2874 		(*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
2875 		(*pr)("\tfreemapped=%c, uaddr=%p\n",
2876 		    (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
2877 		if (free) {
2878 			(*pr)("\t\t(0x%lx-0x%lx %s)\n",
2879 			    free->uaddr_minaddr, free->uaddr_maxaddr,
2880 			    free->uaddr_functions->uaddr_name);
2881 		}
2882 	}
2883 
2884 print_uaddr:
2885 	uvm_addr_print(map->uaddr_exe, "exe", full, pr);
2886 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2887 		snprintf(&buf[0], sizeof(buf), "any[%d]", i);
2888 		uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
2889 	}
2890 	uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
2891 }
2892 
2893 /*
2894  * uvm_object_printit: actually prints the object
2895  */
2896 void
uvm_object_printit(struct uvm_object *uobj, boolean_t full,
    int (*pr)(const char *, ...))
2901 {
2902 	struct vm_page *pg;
2903 	int cnt = 0;
2904 
2905 	(*pr)("OBJECT %p: pgops=%p, npages=%d, ",
2906 	    uobj, uobj->pgops, uobj->uo_npages);
2907 	if (UVM_OBJ_IS_KERN_OBJECT(uobj))
2908 		(*pr)("refs=<SYSTEM>\n");
2909 	else
2910 		(*pr)("refs=%d\n", uobj->uo_refs);
2911 
2912 	if (!full) {
2913 		return;
2914 	}
2915 	(*pr)("  PAGES <pg,offset>:\n  ");
2916 	RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
2917 		(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
2918 		if ((cnt % 3) == 2) {
2919 			(*pr)("\n  ");
2920 		}
2921 		cnt++;
2922 	}
2923 	if ((cnt % 3) != 2) {
2924 		(*pr)("\n");
2925 	}
2926 }
2927 
2928 /*
2929  * uvm_page_printit: actually print the page
2930  */
2931 static const char page_flagbits[] =
2932 	"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
2933 	"\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
2934 	"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
2935 
2936 void
uvm_page_printit(struct vm_page *pg, boolean_t full,
    int (*pr)(const char *, ...))
2941 {
2942 	struct vm_page *tpg;
2943 	struct uvm_object *uobj;
2944 	struct pglist *pgl;
2945 
2946 	(*pr)("PAGE %p:\n", pg);
2947 	(*pr)("  flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
2948 	    pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
2949 	    (long long)pg->phys_addr);
2950 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
2951 	    pg->uobject, pg->uanon, (long long)pg->offset);
2952 #if defined(UVM_PAGE_TRKOWN)
2953 	if (pg->pg_flags & PG_BUSY)
2954 		(*pr)("  owning process = %d, tag=%s",
2955 		    pg->owner, pg->owner_tag);
2956 	else
2957 		(*pr)("  page not busy, no owner");
2958 #else
2959 	(*pr)("  [page ownership tracking disabled]");
2960 #endif
2961 	(*pr)("\tvm_page_md %p\n", &pg->mdpage);
2962 
2963 	if (!full)
2964 		return;
2965 
2966 	/* cross-verify object/anon */
2967 	if ((pg->pg_flags & PQ_FREE) == 0) {
2968 		if (pg->pg_flags & PQ_ANON) {
2969 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
2970 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2971 				(pg->uanon) ? pg->uanon->an_page : NULL);
2972 			else
2973 				(*pr)("  anon backpointer is OK\n");
2974 		} else {
2975 			uobj = pg->uobject;
2976 			if (uobj) {
2977 				(*pr)("  checking object list\n");
2978 				RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) {
2979 					if (tpg == pg) {
2980 						break;
2981 					}
2982 				}
2983 				if (tpg)
2984 					(*pr)("  page found on object list\n");
2985 				else
2986 					(*pr)("  >>> PAGE NOT FOUND "
2987 					    "ON OBJECT LIST! <<<\n");
2988 			}
2989 		}
2990 	}
2991 
2992 	/* cross-verify page queue */
2993 	if (pg->pg_flags & PQ_FREE) {
2994 		if (uvm_pmr_isfree(pg))
2995 			(*pr)("  page found in uvm_pmemrange\n");
2996 		else
2997 			(*pr)("  >>> page not found in uvm_pmemrange <<<\n");
2998 		pgl = NULL;
2999 	} else if (pg->pg_flags & PQ_INACTIVE) {
3000 		pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
3001 		    &uvm.page_inactive_swp : &uvm.page_inactive_obj;
3002 	} else if (pg->pg_flags & PQ_ACTIVE) {
3003 		pgl = &uvm.page_active;
	} else {
3005 		pgl = NULL;
3006 	}
3007 
3008 	if (pgl) {
3009 		(*pr)("  checking pageq list\n");
3010 		TAILQ_FOREACH(tpg, pgl, pageq) {
3011 			if (tpg == pg) {
3012 				break;
3013 			}
3014 		}
3015 		if (tpg)
3016 			(*pr)("  page found on pageq list\n");
3017 		else
3018 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
3019 	}
3020 }
3021 #endif
3022 
3023 /*
3024  * uvm_map_protect: change map protection
3025  *
3026  * => set_max means set max_protection.
3027  * => map must be unlocked.
3028  */
3029 int
3030 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3031     vm_prot_t new_prot, boolean_t set_max)
3032 {
3033 	struct vm_map_entry *first, *iter;
3034 	vm_prot_t old_prot;
3035 	vm_prot_t mask;
3036 	int error;
3037 
3038 	if (start > end)
3039 		return EINVAL;
3040 	start = MAX(start, map->min_offset);
3041 	end = MIN(end, map->max_offset);
3042 	if (start >= end)
3043 		return 0;
3044 
3045 	error = 0;
3046 	vm_map_lock(map);
3047 
3048 	/*
3049 	 * Set up first and last.
3050 	 * - first will contain first entry at or after start.
3051 	 */
3052 	first = uvm_map_entrybyaddr(&map->addr, start);
3053 	KDASSERT(first != NULL);
	if (first->end <= start)
3055 		first = RBT_NEXT(uvm_map_addr, first);
3056 
3057 	/* First, check for protection violations. */
3058 	for (iter = first; iter != NULL && iter->start < end;
3059 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3060 		/* Treat memory holes as free space. */
3061 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3062 			continue;
3063 
3064 		if (UVM_ET_ISSUBMAP(iter)) {
3065 			error = EINVAL;
3066 			goto out;
3067 		}
3068 		if ((new_prot & iter->max_protection) != new_prot) {
3069 			error = EACCES;
3070 			goto out;
3071 		}
3072 		if (map == kernel_map &&
3073 		    (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
3074 			panic("uvm_map_protect: kernel map W^X violation requested");
3075 	}
3076 
3077 	/* Fix protections.  */
3078 	for (iter = first; iter != NULL && iter->start < end;
3079 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3080 		/* Treat memory holes as free space. */
3081 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3082 			continue;
3083 
3084 		old_prot = iter->protection;
3085 
3086 		/*
3087 		 * Skip adapting protection iff old and new protection
3088 		 * are equal.
3089 		 */
3090 		if (set_max) {
3091 			if (old_prot == (new_prot & old_prot) &&
3092 			    iter->max_protection == new_prot)
3093 				continue;
3094 		} else {
3095 			if (old_prot == new_prot)
3096 				continue;
3097 		}
3098 
3099 		UVM_MAP_CLIP_START(map, iter, start);
3100 		UVM_MAP_CLIP_END(map, iter, end);
3101 
3102 		if (set_max) {
3103 			iter->max_protection = new_prot;
3104 			iter->protection &= new_prot;
3105 		} else
3106 			iter->protection = new_prot;
3107 
3108 		/*
3109 		 * update physical map if necessary.  worry about copy-on-write
3110 		 * here -- CHECK THIS XXX
3111 		 */
3112 		if (iter->protection != old_prot) {
3113 			mask = UVM_ET_ISCOPYONWRITE(iter) ?
3114 			    ~PROT_WRITE : PROT_MASK;
3115 
3116 			/* update pmap */
3117 			if ((iter->protection & mask) == PROT_NONE &&
3118 			    VM_MAPENT_ISWIRED(iter)) {
3119 				/*
3120 				 * TODO(ariane) this is stupid. wired_count
3121 				 * is 0 if not wired, otherwise anything
3122 				 * larger than 0 (incremented once each time
3123 				 * wire is called).
3124 				 * Mostly to be able to undo the damage on
3125 				 * failure. Not the actually be a wired
3126 				 * refcounter...
3127 				 * Originally: iter->wired_count--;
3128 				 * (don't we have to unwire this in the pmap
3129 				 * as well?)
3130 				 */
3131 				iter->wired_count = 0;
3132 			}
3133 			pmap_protect(map->pmap, iter->start, iter->end,
3134 			    iter->protection & mask);
3135 		}
3136 
3137 		/*
3138 		 * If the map is configured to lock any future mappings,
3139 		 * wire this entry now if the old protection was PROT_NONE
3140 		 * and the new protection is not PROT_NONE.
3141 		 */
3142 		if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3143 		    VM_MAPENT_ISWIRED(iter) == 0 &&
3144 		    old_prot == PROT_NONE &&
3145 		    new_prot != PROT_NONE) {
3146 			if (uvm_map_pageable(map, iter->start, iter->end,
3147 			    FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
3148 				/*
3149 				 * If locking the entry fails, remember the
3150 				 * error if it's the first one.  Note we
3151 				 * still continue setting the protection in
3152 				 * the map, but it will return the resource
				 * shortage condition regardless.
3154 				 *
3155 				 * XXX Ignore what the actual error is,
3156 				 * XXX just call it a resource shortage
3157 				 * XXX so that it doesn't get confused
3158 				 * XXX what uvm_map_protect() itself would
3159 				 * XXX normally return.
3160 				 */
3161 				error = ENOMEM;
3162 			}
3163 		}
3164 	}
3165 	pmap_update(map->pmap);
3166 
3167 out:
3168 	vm_map_unlock(map);
3169 	return error;
3170 }
3171 
3172 /*
3173  * uvmspace_alloc: allocate a vmspace structure.
3174  *
3175  * - structure includes vm_map and pmap
3176  * - XXX: no locking on this structure
3177  * - refcnt set to 1, rest must be init'd by caller
3178  */
3179 struct vmspace *
3180 uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
3181     boolean_t remove_holes)
3182 {
3183 	struct vmspace *vm;
3184 
3185 	vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
3186 	uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
3187 	return (vm);
3188 }
3189 
3190 /*
3191  * uvmspace_init: initialize a vmspace structure.
3192  *
3193  * - XXX: no locking on this structure
3194  * - refcnt set to 1, rest must be init'd by caller
3195  */
3196 void
3197 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
3198     boolean_t pageable, boolean_t remove_holes)
3199 {
3200 	KASSERT(pmap == NULL || pmap == pmap_kernel());
3201 
3202 	if (pmap)
3203 		pmap_reference(pmap);
3204 	else
3205 		pmap = pmap_create();
3206 	vm->vm_map.pmap = pmap;
3207 
3208 	uvm_map_setup(&vm->vm_map, min, max,
3209 	    (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
3210 
3211 	vm->vm_refcnt = 1;
3212 
3213 	if (remove_holes)
3214 		pmap_remove_holes(vm);
3215 }
3216 
3217 /*
3218  * uvmspace_share: share a vmspace between two processes
3219  *
3220  * - XXX: no locking on vmspace
3221  * - used for vfork
3222  */
3223 
3224 struct vmspace *
3225 uvmspace_share(struct process *pr)
3226 {
3227 	struct vmspace *vm = pr->ps_vmspace;
3228 
3229 	vm->vm_refcnt++;
3230 	return vm;
3231 }
3232 
3233 /*
3234  * uvmspace_exec: the process wants to exec a new program
3235  *
3236  * - XXX: no locking on vmspace
3237  */
3238 
3239 void
3240 uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
3241 {
3242 	struct process *pr = p->p_p;
3243 	struct vmspace *nvm, *ovm = pr->ps_vmspace;
3244 	struct vm_map *map = &ovm->vm_map;
3245 	struct uvm_map_deadq dead_entries;
3246 
3247 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
3248 	KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
3249 	    (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
3250 
3251 	pmap_unuse_final(p);   /* before stack addresses go away */
3252 	TAILQ_INIT(&dead_entries);
3253 
3254 	/* see if more than one process is using this vmspace...  */
3255 	if (ovm->vm_refcnt == 1) {
3256 		/*
3257 		 * If pr is the only process using its vmspace then
3258 		 * we can safely recycle that vmspace for the program
3259 		 * that is being exec'd.
3260 		 */
3261 
3262 #ifdef SYSVSHM
3263 		/*
3264 		 * SYSV SHM semantics require us to kill all segments on an exec
3265 		 */
3266 		if (ovm->vm_shm)
3267 			shmexit(ovm);
3268 #endif
3269 
3270 		/*
3271 		 * POSIX 1003.1b -- "lock future mappings" is revoked
3272 		 * when a process execs another program image.
3273 		 */
3274 		vm_map_lock(map);
3275 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
3276 
3277 		/*
3278 		 * now unmap the old program
3279 		 *
3280 		 * Instead of attempting to keep the map valid, we simply
3281 		 * nuke all entries and ask uvm_map_setup to reinitialize
3282 		 * the map to the new boundaries.
3283 		 *
3284 		 * uvm_unmap_remove will actually nuke all entries for us
3285 		 * (as in, not replace them with free-memory entries).
3286 		 */
3287 		uvm_unmap_remove(map, map->min_offset, map->max_offset,
3288 		    &dead_entries, TRUE, FALSE);
3289 
3290 		KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
3291 
3292 		/* Nuke statistics and boundaries. */
3293 		memset(&ovm->vm_startcopy, 0,
3294 		    (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
3295 
3296 
3297 		if (end & (vaddr_t)PAGE_MASK) {
3298 			end += 1;
3299 			if (end == 0) /* overflow */
3300 				end -= PAGE_SIZE;
3301 		}
3302 
3303 		/* Setup new boundaries and populate map with entries. */
3304 		map->min_offset = start;
3305 		map->max_offset = end;
3306 		uvm_map_setup_entries(map);
3307 		vm_map_unlock(map);
3308 
3309 		/* but keep MMU holes unavailable */
3310 		pmap_remove_holes(ovm);
3311 	} else {
3312 		/*
3313 		 * pr's vmspace is being shared, so we can't reuse
3314 		 * it for pr since it is still being used for others.
3315 		 * allocate a new vmspace for pr
3316 		 */
3317 		nvm = uvmspace_alloc(start, end,
3318 		    (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
3319 
3320 		/* install new vmspace and drop our ref to the old one. */
3321 		pmap_deactivate(p);
3322 		p->p_vmspace = pr->ps_vmspace = nvm;
3323 		pmap_activate(p);
3324 
3325 		uvmspace_free(ovm);
3326 	}
3327 
3328 	/* Release dead entries */
3329 	uvm_unmap_detach(&dead_entries, 0);
3330 }
3331 
3332 /*
3333  * uvmspace_free: free a vmspace data structure
3334  *
3335  * - XXX: no locking on vmspace
3336  */
3337 void
3338 uvmspace_free(struct vmspace *vm)
3339 {
3340 	if (--vm->vm_refcnt == 0) {
3341 		/*
3342 		 * lock the map, to wait out all other references to it.  delete
3343 		 * all of the mappings and pages they hold, then call the pmap
3344 		 * module to reclaim anything left.
3345 		 */
3346 #ifdef SYSVSHM
3347 		/* Get rid of any SYSV shared memory segments. */
3348 		if (vm->vm_shm != NULL)
3349 			shmexit(vm);
3350 #endif
3351 
3352 		uvm_map_teardown(&vm->vm_map);
3353 		pool_put(&uvm_vmspace_pool, vm);
3354 	}
3355 }
3356 
3357 /*
3358  * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
3359  * srcmap to the address range [dstaddr, dstaddr + sz) in
3360  * dstmap.
3361  *
3362  * The whole address range in srcmap must be backed by an object
3363  * (no holes).
3364  *
3365  * If successful, the address ranges share memory and the destination
3366  * address range uses the protection flags in prot.
3367  *
3368  * This routine assumes that sz is a multiple of PAGE_SIZE and
3369  * that dstaddr and srcaddr are page-aligned.
3370  */
3371 int
3372 uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
3373     struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
3374 {
3375 	int ret = 0;
3376 	vaddr_t unmap_end;
3377 	vaddr_t dstva;
3378 	vsize_t off, len, n = sz;
3379 	struct vm_map_entry *first = NULL, *last = NULL;
3380 	struct vm_map_entry *src_entry, *psrc_entry = NULL;
3381 	struct uvm_map_deadq dead;
3382 
3383 	if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
3384 		return EINVAL;
3385 
3386 	TAILQ_INIT(&dead);
3387 	vm_map_lock(dstmap);
3388 	vm_map_lock_read(srcmap);
3389 
3390 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
3391 		ret = ENOMEM;
3392 		goto exit_unlock;
3393 	}
3394 	if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
3395 		ret = EINVAL;
3396 		goto exit_unlock;
3397 	}
3398 
3399 	unmap_end = dstaddr;
3400 	for (; src_entry != NULL;
3401 	    psrc_entry = src_entry,
3402 	    src_entry = RBT_NEXT(uvm_map_addr, src_entry)) {
3403 		/* hole in address space, bail out */
3404 		if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
3405 			break;
3406 		if (src_entry->start >= srcaddr + sz)
3407 			break;
3408 
3409 		if (UVM_ET_ISSUBMAP(src_entry))
3410 			panic("uvm_share: encountered a submap (illegal)");
3411 		if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
3412 		    UVM_ET_ISNEEDSCOPY(src_entry))
3413 			panic("uvm_share: non-copy_on_write map entries "
3414 			    "marked needs_copy (illegal)");
3415 
3416 		dstva = dstaddr;
3417 		if (src_entry->start > srcaddr) {
3418 			dstva += src_entry->start - srcaddr;
3419 			off = 0;
3420 		} else
3421 			off = srcaddr - src_entry->start;
3422 
3423 		if (n < src_entry->end - src_entry->start)
3424 			len = n;
3425 		else
3426 			len = src_entry->end - src_entry->start;
3427 		n -= len;
3428 
3429 		if (uvm_mapent_share(dstmap, dstva, len, off, prot, prot,
3430 		    srcmap, src_entry, &dead) == NULL)
3431 			break;
3432 
3433 		unmap_end = dstva + len;
3434 		if (n == 0)
3435 			goto exit_unlock;
3436 	}
3437 
3438 	ret = EINVAL;
3439 	uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
3440 
3441 exit_unlock:
3442 	vm_map_unlock_read(srcmap);
3443 	vm_map_unlock(dstmap);
3444 	uvm_unmap_detach(&dead, 0);
3445 
3446 	return ret;
3447 }
3448 
3449 /*
3450  * Clone map entry into other map.
3451  *
3452  * Mapping will be placed at dstaddr, for the same length.
3453  * Space must be available.
3454  * Reference counters are incremented.
3455  */
3456 struct vm_map_entry *
3457 uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3458     vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
3459     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
3460     int mapent_flags, int amap_share_flags)
3461 {
3462 	struct vm_map_entry *new_entry, *first, *last;
3463 
3464 	KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
3465 
3466 	/* Create new entry (linked in on creation). Fill in first, last. */
3467 	first = last = NULL;
3468 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
3469 		panic("uvmspace_fork: no space in map for "
3470 		    "entry in empty map");
3471 	}
3472 	new_entry = uvm_map_mkentry(dstmap, first, last,
3473 	    dstaddr, dstlen, mapent_flags, dead, NULL);
3474 	if (new_entry == NULL)
3475 		return NULL;
3476 	/* old_entry -> new_entry */
3477 	new_entry->object = old_entry->object;
3478 	new_entry->offset = old_entry->offset;
3479 	new_entry->aref = old_entry->aref;
3480 	new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
3481 	new_entry->protection = prot;
3482 	new_entry->max_protection = maxprot;
3483 	new_entry->inheritance = old_entry->inheritance;
3484 	new_entry->advice = old_entry->advice;
3485 
3486 	/* gain reference to object backing the map (can't be a submap). */
3487 	if (new_entry->aref.ar_amap) {
3488 		new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
3489 		amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3490 		    (new_entry->end - new_entry->start) >> PAGE_SHIFT,
3491 		    amap_share_flags);
3492 	}
3493 
3494 	if (UVM_ET_ISOBJ(new_entry) &&
3495 	    new_entry->object.uvm_obj->pgops->pgo_reference) {
3496 		new_entry->offset += off;
3497 		new_entry->object.uvm_obj->pgops->pgo_reference
3498 		    (new_entry->object.uvm_obj);
3499 	}
3500 
3501 	return new_entry;
3502 }
3503 
3504 struct vm_map_entry *
3505 uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3506     vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
3507     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3508 {
3509 	/*
3510 	 * If old_entry refers to a copy-on-write region that has not yet been
3511 	 * written to (needs_copy flag is set), then we need to allocate a new
3512 	 * amap for old_entry.
3513 	 *
	 * If we do not do this, and the process owning old_entry does a copy-on-
3515 	 * write later, old_entry and new_entry will refer to different memory
3516 	 * regions, and the memory between the processes is no longer shared.
3517 	 *
3518 	 * [in other words, we need to clear needs_copy]
3519 	 */
3520 
3521 	if (UVM_ET_ISNEEDSCOPY(old_entry)) {
3522 		/* get our own amap, clears needs_copy */
3523 		amap_copy(old_map, old_entry, M_WAITOK, FALSE,
3524 		    0, 0);
3525 		/* XXXCDC: WAITOK??? */
3526 	}
3527 
3528 	return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
3529 	    prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
3530 }
3531 
3532 /*
3533  * share the mapping: this means we want the old and
3534  * new entries to share amaps and backing objects.
3535  */
3536 struct vm_map_entry *
3537 uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
3538     struct vm_map *old_map,
3539     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3540 {
3541 	struct vm_map_entry *new_entry;
3542 
3543 	new_entry = uvm_mapent_share(new_map, old_entry->start,
3544 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3545 	    old_entry->max_protection, old_map, old_entry, dead);
3546 
3547 	/*
3548 	 * pmap_copy the mappings: this routine is optional
3549 	 * but if it is there it will reduce the number of
3550 	 * page faults in the new proc.
3551 	 */
3552 	if (!UVM_ET_ISHOLE(new_entry))
3553 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
3554 		    (new_entry->end - new_entry->start), new_entry->start);
3555 
3556 	return (new_entry);
3557 }
3558 
3559 /*
3560  * copy-on-write the mapping (using mmap's
3561  * MAP_PRIVATE semantics)
3562  *
3563  * allocate new_entry, adjust reference counts.
3564  * (note that new references are read-only).
3565  */
3566 struct vm_map_entry *
3567 uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
3568     struct vm_map *old_map,
3569     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3570 {
3571 	struct vm_map_entry	*new_entry;
3572 	boolean_t		 protect_child;
3573 
3574 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3575 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3576 	    old_entry->max_protection, old_entry, dead, 0, 0);
3577 
3578 	new_entry->etype |=
3579 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3580 
3581 	/*
3582 	 * the new entry will need an amap.  it will either
3583 	 * need to be copied from the old entry or created
3584 	 * from scratch (if the old entry does not have an
3585 	 * amap).  can we defer this process until later
3586 	 * (by setting "needs_copy") or do we need to copy
3587 	 * the amap now?
3588 	 *
3589 	 * we must copy the amap now if any of the following
3590 	 * conditions hold:
3591 	 * 1. the old entry has an amap and that amap is
3592 	 *    being shared.  this means that the old (parent)
3593 	 *    process is sharing the amap with another
3594 	 *    process.  if we do not clear needs_copy here
3595 	 *    we will end up in a situation where both the
3596 	 *    parent and child process are referring to the
3597 	 *    same amap with "needs_copy" set.  if the
3598 	 *    parent write-faults, the fault routine will
3599 	 *    clear "needs_copy" in the parent by allocating
3600 	 *    a new amap.   this is wrong because the
3601 	 *    parent is supposed to be sharing the old amap
3602 	 *    and the new amap will break that.
3603 	 *
3604 	 * 2. if the old entry has an amap and a non-zero
3605 	 *    wire count then we are going to have to call
3606 	 *    amap_cow_now to avoid page faults in the
3607 	 *    parent process.   since amap_cow_now requires
3608 	 *    "needs_copy" to be clear we might as well
3609 	 *    clear it here as well.
3610 	 *
3611 	 */
3612 	if (old_entry->aref.ar_amap != NULL &&
3613 	    ((amap_flags(old_entry->aref.ar_amap) &
3614 	    AMAP_SHARED) != 0 ||
3615 	    VM_MAPENT_ISWIRED(old_entry))) {
3616 		amap_copy(new_map, new_entry, M_WAITOK, FALSE,
3617 		    0, 0);
3618 		/* XXXCDC: M_WAITOK ... ok? */
3619 	}
3620 
3621 	/*
3622 	 * if the parent's entry is wired down, then the
3623 	 * parent process does not want page faults on
3624 	 * access to that memory.  this means that we
3625 	 * cannot do copy-on-write because we can't write
3626 	 * protect the old entry.   in this case we
3627 	 * resolve all copy-on-write faults now, using
3628 	 * amap_cow_now.   note that we have already
3629 	 * allocated any needed amap (above).
3630 	 */
3631 	if (VM_MAPENT_ISWIRED(old_entry)) {
3632 		/*
3633 		 * resolve all copy-on-write faults now
3634 		 * (note that there is nothing to do if
3635 		 * the old mapping does not have an amap).
3636 		 * XXX: is it worthwhile to bother with
3637 		 * pmap_copy in this case?
3638 		 */
3639 		if (old_entry->aref.ar_amap)
3640 			amap_cow_now(new_map, new_entry);
3641 	} else {
3642 		if (old_entry->aref.ar_amap) {
3643 			/*
3644 			 * setup mappings to trigger copy-on-write faults
3645 			 * we must write-protect the parent if it has
3646 			 * an amap and it is not already "needs_copy"...
3647 			 * if it is already "needs_copy" then the parent
3648 			 * has already been write-protected by a previous
3649 			 * fork operation.
3650 			 *
3651 			 * if we do not write-protect the parent, then
3652 			 * we must be sure to write-protect the child
3653 			 * after the pmap_copy() operation.
3654 			 *
3655 			 * XXX: pmap_copy should have some way of telling
3656 			 * us that it didn't do anything so we can avoid
3657 			 * calling pmap_protect needlessly.
3658 			 */
3659 			if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
3660 				if (old_entry->max_protection & PROT_WRITE) {
3661 					pmap_protect(old_map->pmap,
3662 					    old_entry->start,
3663 					    old_entry->end,
3664 					    old_entry->protection &
3665 					    ~PROT_WRITE);
3666 					pmap_update(old_map->pmap);
3667 				}
3668 				old_entry->etype |= UVM_ET_NEEDSCOPY;
3669 			}
3670 
3671 			/* parent must now be write-protected */
3672 			protect_child = FALSE;
3673 		} else {
3674 			/*
3675 			 * we only need to protect the child if the
3676 			 * parent has write access.
3677 			 */
3678 			if (old_entry->max_protection & PROT_WRITE)
3679 				protect_child = TRUE;
3680 			else
3681 				protect_child = FALSE;
3682 		}
3683 		/*
3684 		 * copy the mappings
3685 		 * XXX: need a way to tell if this does anything
3686 		 */
3687 		if (!UVM_ET_ISHOLE(new_entry))
3688 			pmap_copy(new_map->pmap, old_map->pmap,
3689 			    new_entry->start,
3690 			    (old_entry->end - old_entry->start),
3691 			    old_entry->start);
3692 
3693 		/* protect the child's mappings if necessary */
3694 		if (protect_child) {
3695 			pmap_protect(new_map->pmap, new_entry->start,
3696 			    new_entry->end,
3697 			    new_entry->protection &
3698 			    ~PROT_WRITE);
3699 		}
3700 	}
3701 
3702 	return (new_entry);
3703 }
3704 
3705 /*
3706  * zero the mapping: the new entry will be zero initialized
3707  */
3708 struct vm_map_entry *
3709 uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
3710     struct vm_map *old_map,
3711     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3712 {
3713 	struct vm_map_entry *new_entry;
3714 
3715 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3716 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3717 	    old_entry->max_protection, old_entry, dead, 0, 0);
3718 
3719 	new_entry->etype |=
3720 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3721 
3722 	if (new_entry->aref.ar_amap) {
3723 		amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3724 		    atop(new_entry->end - new_entry->start), 0);
3725 		new_entry->aref.ar_amap = NULL;
3726 		new_entry->aref.ar_pageoff = 0;
3727 	}
3728 
3729 	if (UVM_ET_ISOBJ(new_entry)) {
3730 		if (new_entry->object.uvm_obj->pgops->pgo_detach)
3731 			new_entry->object.uvm_obj->pgops->pgo_detach(
3732 			    new_entry->object.uvm_obj);
3733 		new_entry->object.uvm_obj = NULL;
3734 		new_entry->etype &= ~UVM_ET_OBJ;
3735 	}
3736 
3737 	return (new_entry);
3738 }
3739 
3740 /*
3741  * uvmspace_fork: fork a process' main map
3742  *
3743  * => create a new vmspace for child process from parent.
3744  * => parent's map must not be locked.
3745  */
3746 struct vmspace *
3747 uvmspace_fork(struct process *pr)
3748 {
3749 	struct vmspace *vm1 = pr->ps_vmspace;
3750 	struct vmspace *vm2;
3751 	struct vm_map *old_map = &vm1->vm_map;
3752 	struct vm_map *new_map;
3753 	struct vm_map_entry *old_entry, *new_entry;
3754 	struct uvm_map_deadq dead;
3755 
3756 	vm_map_lock(old_map);
3757 
3758 	vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
3759 	    (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
3760 	memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
3761 	    (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
3762 	vm2->vm_dused = 0; /* Statistic managed by us. */
3763 	new_map = &vm2->vm_map;
3764 	vm_map_lock(new_map);
3765 
3766 	/* go entry-by-entry */
3767 	TAILQ_INIT(&dead);
3768 	RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
3769 		if (old_entry->start == old_entry->end)
3770 			continue;
3771 
3772 		/* first, some sanity checks on the old entry */
3773 		if (UVM_ET_ISSUBMAP(old_entry)) {
3774 			panic("fork: encountered a submap during fork "
3775 			    "(illegal)");
3776 		}
3777 
3778 		if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
3779 		    UVM_ET_ISNEEDSCOPY(old_entry)) {
3780 			panic("fork: non-copy_on_write map entry marked "
3781 			    "needs_copy (illegal)");
3782 		}
3783 
3784 		/* Apply inheritance. */
3785 		switch (old_entry->inheritance) {
3786 		case MAP_INHERIT_SHARE:
3787 			new_entry = uvm_mapent_forkshared(vm2, new_map,
3788 			    old_map, old_entry, &dead);
3789 			break;
3790 		case MAP_INHERIT_COPY:
3791 			new_entry = uvm_mapent_forkcopy(vm2, new_map,
3792 			    old_map, old_entry, &dead);
3793 			break;
3794 		case MAP_INHERIT_ZERO:
3795 			new_entry = uvm_mapent_forkzero(vm2, new_map,
3796 			    old_map, old_entry, &dead);
3797 			break;
3798 		default:
3799 			continue;
3800 		}
3801 
3802 	 	/* Update process statistics. */
3803 		if (!UVM_ET_ISHOLE(new_entry))
3804 			new_map->size += new_entry->end - new_entry->start;
3805 		if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) {
3806 			vm2->vm_dused += uvmspace_dused(
3807 			    new_map, new_entry->start, new_entry->end);
3808 		}
3809 	}
3810 
3811 	vm_map_unlock(old_map);
3812 	vm_map_unlock(new_map);
3813 
3814 	/*
3815 	 * This can actually happen if multiple entries described a
3816 	 * space in which an entry was inherited.
3817 	 */
3818 	uvm_unmap_detach(&dead, 0);
3819 
3820 #ifdef SYSVSHM
3821 	if (vm1->vm_shm)
3822 		shmfork(vm1, vm2);
3823 #endif
3824 
3825 	return vm2;
3826 }
3827 
3828 /*
3829  * uvm_map_hint: return the beginning of the best area suitable for
3830  * creating a new mapping with "prot" protection.
3831  */
3832 vaddr_t
3833 uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
3834     vaddr_t maxaddr)
3835 {
3836 	vaddr_t addr;
3837 	vaddr_t spacing;
3838 
3839 #ifdef __i386__
3840 	/*
3841 	 * If executable skip first two pages, otherwise start
3842 	 * If executable, skip the first two pages; otherwise start
3843 	 * after the data + heap region.
3844 	if ((prot & PROT_EXEC) != 0 &&
3845 	    (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
3846 		addr = (PAGE_SIZE*2) +
3847 		    (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
3848 		return (round_page(addr));
3849 	}
3850 #endif
3851 
3852 #if defined (__LP64__)
3853 	spacing = (MIN((4UL * 1024 * 1024 * 1024), BRKSIZ) - 1);
3854 #else
3855 	spacing = (MIN((256 * 1024 * 1024), BRKSIZ) - 1);
3856 #endif
3857 
3858 	addr = (vaddr_t)vm->vm_daddr;
3859 	/*
3860 	 * Start malloc/mmap after the brk.
3861 	 * If the random spacing area has been used up,
3862 	 * the brk area becomes fair game for mmap as well.
3863 	 */
3864 	if (vm->vm_dused < spacing >> PAGE_SHIFT)
3865 		addr += BRKSIZ;
3866 	if (addr < maxaddr) {
3867 		while (spacing > maxaddr - addr)
3868 			spacing >>= 1;
3869 	}
3870 	addr += arc4random() & spacing;
3871 	return (round_page(addr));
3872 }
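
/*
 * Worked example (illustrative, hypothetical values): on an LP64
 * machine with BRKSIZ >= 4GB, spacing above ends up as 4GB - 1.  While
 * the brk area is still mostly unused, the returned hint is
 *
 *	round_page(vm_daddr + BRKSIZ + (arc4random() & spacing))
 *
 * i.e. a page-aligned address picked from a window of up to 4GB that
 * starts right after the reserved brk region (the window is halved as
 * needed to stay below maxaddr).
 */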
3873 
3874 /*
3875  * uvm_map_submap: punch down part of a map into a submap
3876  *
3877  * => only the kernel_map is allowed to be submapped
3878  * => the purpose of submapping is to break up the locking granularity
3879  *	of a larger map
3880  * => the range specified must have been mapped previously with a uvm_map()
3881  *	call [with uobj==NULL] to create a blank map entry in the main map.
3882  *	[And it had better still be blank!]
3883  * => maps which contain submaps should never be copied or forked.
3884  * => to remove a submap, use uvm_unmap() on the main map
3885  *	and then uvm_map_deallocate() the submap.
3886  * => main map must be unlocked.
3887  * => submap must have been init'd and have a zero reference count.
3888  *	[need not be locked as we don't actually reference it]
3889  */
3890 int
3891 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
3892     struct vm_map *submap)
3893 {
3894 	struct vm_map_entry *entry;
3895 	int result;
3896 
3897 	if (start > map->max_offset || end > map->max_offset ||
3898 	    start < map->min_offset || end < map->min_offset)
3899 		return EINVAL;
3900 
3901 	vm_map_lock(map);
3902 
3903 	if (uvm_map_lookup_entry(map, start, &entry)) {
3904 		UVM_MAP_CLIP_START(map, entry, start);
3905 		UVM_MAP_CLIP_END(map, entry, end);
3906 	} else
3907 		entry = NULL;
3908 
3909 	if (entry != NULL &&
3910 	    entry->start == start && entry->end == end &&
3911 	    entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
3912 	    !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
3913 		entry->etype |= UVM_ET_SUBMAP;
3914 		entry->object.sub_map = submap;
3915 		entry->offset = 0;
3916 		uvm_map_reference(submap);
3917 		result = 0;
3918 	} else
3919 		result = EINVAL;
3920 
3921 	vm_map_unlock(map);
3922 	return (result);
3923 }
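
/*
 * Illustrative sketch, not part of the original source and not compiled:
 * the usual way a submap is set up.  A blank (object-less) range is first
 * reserved in the kernel map, then a fresh map is created and punched in
 * with uvm_map_submap().  The helper name and the exact flag combination
 * are assumptions made for illustration only.
 */
#if 0
static struct vm_map *
example_submap_create(vsize_t size)	/* hypothetical helper */
{
	struct vm_map *submap;
	vaddr_t va;

	/* reserve a blank, unmergeable range in the kernel map */
	if (uvm_map(kernel_map, &va, size, NULL, UVM_UNKNOWN_OFFSET, 0,
	    UVM_MAPFLAG(PROT_NONE, PROT_NONE, MAP_INHERIT_NONE,
	    MADV_NORMAL, UVM_FLAG_NOMERGE)) != 0)
		return NULL;

	/* create the submap and install it over the reserved range */
	submap = uvm_map_create(pmap_kernel(), va, va + size, VM_MAP_PAGEABLE);
	if (uvm_map_submap(kernel_map, va, va + size, submap) != 0)
		panic("example_submap_create: uvm_map_submap failed");
	return submap;
}
#endif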
3924 
3925 /*
3926  * uvm_map_checkprot: check protection in map
3927  *
3928 	 * => must allow the specified protection in a fully allocated region.
3929 	 * => map must be read or write locked by caller.
3930  */
3931 boolean_t
3932 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
3933     vm_prot_t protection)
3934 {
3935 	struct vm_map_entry *entry;
3936 
3937 	if (start < map->min_offset || end > map->max_offset || start > end)
3938 		return FALSE;
3939 	if (start == end)
3940 		return TRUE;
3941 
3942 	/*
3943 	 * Iterate entries.
3944 	 */
3945 	for (entry = uvm_map_entrybyaddr(&map->addr, start);
3946 	    entry != NULL && entry->start < end;
3947 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
3948 		/* Fail if a hole is found. */
3949 		if (UVM_ET_ISHOLE(entry) ||
3950 		    (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
3951 			return FALSE;
3952 
3953 		/* Check protection. */
3954 		if ((entry->protection & protection) != protection)
3955 			return FALSE;
3956 	}
3957 	return TRUE;
3958 }
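
/*
 * Illustrative sketch, not from the original source and not compiled:
 * a caller holding (at least) a read lock verifying that a range is
 * readable before operating on it.  The helper name is hypothetical.
 */
#if 0
static int
example_range_readable(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	boolean_t ok;

	vm_map_lock_read(map);
	ok = uvm_map_checkprot(map, start, end, PROT_READ);
	vm_map_unlock_read(map);
	return ok ? 0 : EFAULT;
}
#endif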
3959 
3960 /*
3961  * uvm_map_create: create map
3962  */
3963 vm_map_t
3964 uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
3965 {
3966 	vm_map_t map;
3967 
3968 	map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
3969 	map->pmap = pmap;
3970 	uvm_map_setup(map, min, max, flags);
3971 	return (map);
3972 }
3973 
3974 /*
3975  * uvm_map_deallocate: drop reference to a map
3976  *
3977  * => caller must not lock map
3978  * => we will zap map if ref count goes to zero
3979  */
3980 void
3981 uvm_map_deallocate(vm_map_t map)
3982 {
3983 	int c;
3984 	struct uvm_map_deadq dead;
3985 
3986 	c = --map->ref_count;
3987 	if (c > 0) {
3988 		return;
3989 	}
3990 
3991 	/*
3992 	 * all references gone.   unmap and free.
3993 	 *
3994 	 * No lock required: we are only one to access this map.
3995 	 */
3996 	TAILQ_INIT(&dead);
3997 	uvm_tree_sanity(map, __FILE__, __LINE__);
3998 	uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
3999 	    TRUE, FALSE);
4000 	pmap_destroy(map->pmap);
4001 	KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
4002 	free(map, M_VMMAP, sizeof *map);
4003 
4004 	uvm_unmap_detach(&dead, 0);
4005 }
4006 
4007 /*
4008  * uvm_map_inherit: set inheritance code for range of addrs in map.
4009  *
4010  * => map must be unlocked
4011  * => note that the inherit code is used during a "fork".  see fork
4012  *	code for details.
4013  */
4014 int
4015 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
4016     vm_inherit_t new_inheritance)
4017 {
4018 	struct vm_map_entry *entry;
4019 
4020 	switch (new_inheritance) {
4021 	case MAP_INHERIT_NONE:
4022 	case MAP_INHERIT_COPY:
4023 	case MAP_INHERIT_SHARE:
4024 	case MAP_INHERIT_ZERO:
4025 		break;
4026 	default:
4027 		return (EINVAL);
4028 	}
4029 
4030 	if (start > end)
4031 		return EINVAL;
4032 	start = MAX(start, map->min_offset);
4033 	end = MIN(end, map->max_offset);
4034 	if (start >= end)
4035 		return 0;
4036 
4037 	vm_map_lock(map);
4038 
4039 	entry = uvm_map_entrybyaddr(&map->addr, start);
4040 	if (entry->end > start)
4041 		UVM_MAP_CLIP_START(map, entry, start);
4042 	else
4043 		entry = RBT_NEXT(uvm_map_addr, entry);
4044 
4045 	while (entry != NULL && entry->start < end) {
4046 		UVM_MAP_CLIP_END(map, entry, end);
4047 		entry->inheritance = new_inheritance;
4048 		entry = RBT_NEXT(uvm_map_addr, entry);
4049 	}
4050 
4051 	vm_map_unlock(map);
4052 	return (0);
4053 }
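
/*
 * Illustrative sketch, not from the original source and not compiled:
 * roughly how an minherit(2)-style request maps onto uvm_map_inherit().
 * The helper name and the page rounding of the arguments are assumptions.
 */
#if 0
static int
example_minherit_share(struct proc *p, vaddr_t addr, vsize_t len)
{
	/* mark [addr, addr + len) as shared with future children */
	return uvm_map_inherit(&p->p_vmspace->vm_map,
	    trunc_page(addr), round_page(addr + len), MAP_INHERIT_SHARE);
}
#endif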
4054 
4055 /*
4056  * uvm_map_advice: set advice code for range of addrs in map.
4057  *
4058  * => map must be unlocked
4059  */
4060 int
4061 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
4062 {
4063 	struct vm_map_entry *entry;
4064 
4065 	switch (new_advice) {
4066 	case MADV_NORMAL:
4067 	case MADV_RANDOM:
4068 	case MADV_SEQUENTIAL:
4069 		break;
4070 	default:
4071 		return (EINVAL);
4072 	}
4073 
4074 	if (start > end)
4075 		return EINVAL;
4076 	start = MAX(start, map->min_offset);
4077 	end = MIN(end, map->max_offset);
4078 	if (start >= end)
4079 		return 0;
4080 
4081 	vm_map_lock(map);
4082 
4083 	entry = uvm_map_entrybyaddr(&map->addr, start);
4084 	if (entry != NULL && entry->end > start)
4085 		UVM_MAP_CLIP_START(map, entry, start);
4086 	else if (entry != NULL)
4087 		entry = RBT_NEXT(uvm_map_addr, entry);
4088 
4089 	/*
4090 	 * XXXJRT: disallow holes?
4091 	 */
4092 	while (entry != NULL && entry->start < end) {
4093 		UVM_MAP_CLIP_END(map, entry, end);
4094 		entry->advice = new_advice;
4095 		entry = RBT_NEXT(uvm_map_addr, entry);
4096 	}
4097 
4098 	vm_map_unlock(map);
4099 	return (0);
4100 }
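
/*
 * Illustrative sketch, not from the original source and not compiled:
 * an madvise(2)-style caller announcing sequential access for a range.
 * The helper name and the page rounding are assumptions.
 */
#if 0
static int
example_madvise_sequential(struct proc *p, vaddr_t addr, vsize_t len)
{
	return uvm_map_advice(&p->p_vmspace->vm_map,
	    trunc_page(addr), round_page(addr + len), MADV_SEQUENTIAL);
}
#endif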
4101 
4102 /*
4103  * uvm_map_extract: extract a mapping from a map and put it somewhere
4104  * in the kernel_map, setting protection to max_prot.
4105  *
4106  * => map should be unlocked (we will write lock it and kernel_map)
4107  * => returns 0 on success, error code otherwise
4108  * => start must be page aligned
4109  * => len must be page sized
4110  * => flags:
4111  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
4112  * Mappings are QREF's.
4113  */
4114 int
4115 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
4116     vaddr_t *dstaddrp, int flags)
4117 {
4118 	struct uvm_map_deadq dead;
4119 	struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
4120 	vaddr_t dstaddr;
4121 	vaddr_t end;
4122 	vaddr_t cp_start;
4123 	vsize_t cp_len, cp_off;
4124 	int error;
4125 
4126 	TAILQ_INIT(&dead);
4127 	end = start + len;
4128 
4129 	/*
4130 	 * Sanity check on the parameters.
4131 	 * Also, since the mapping must not contain gaps, error out if the
4132 	 * mapped area is not inside the source map.
4133 	 */
4134 	if ((start & (vaddr_t)PAGE_MASK) != 0 ||
4135 	    (end & (vaddr_t)PAGE_MASK) != 0 || end < start)
4136 		return EINVAL;
4137 	if (start < srcmap->min_offset || end > srcmap->max_offset)
4138 		return EINVAL;
4139 
4140 	/* Handle the len == 0 case. */
4141 	if (len == 0)
4142 		return 0;
4143 
4144 	/* Acquire lock on srcmap. */
4145 	vm_map_lock(srcmap);
4146 
4147 	/* Look up the first entry in <start,len>. */
4148 	first = uvm_map_entrybyaddr(&srcmap->addr, start);
4149 
4150 	/* Check that the range is contiguous. */
4151 	for (entry = first; entry != NULL && entry->end < end;
4152 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4153 		if (VMMAP_FREE_END(entry) != entry->end ||
4154 		    UVM_ET_ISHOLE(entry)) {
4155 			error = EINVAL;
4156 			goto fail;
4157 		}
4158 	}
4159 	if (entry == NULL || UVM_ET_ISHOLE(entry)) {
4160 		error = EINVAL;
4161 		goto fail;
4162 	}
4163 
4164 	/*
4165 	 * Handle need-copy flag.
4166 	 * This may invalidate last, hence the re-initialization during the
4167 	 * loop.
4168 	 *
4169 	 * Also, perform clipping of last if not UVM_EXTRACT_QREF.
4170 	 */
4171 	for (entry = first; entry != NULL && entry->start < end;
4172 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4173 		if (UVM_ET_ISNEEDSCOPY(entry))
4174 			amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end);
4175 		if (UVM_ET_ISNEEDSCOPY(entry)) {
4176 			/*
4177 			 * amap_copy failure
4178 			 */
4179 			error = ENOMEM;
4180 			goto fail;
4181 		}
4182 	}
4183 
4184 	/* Lock destination map (kernel_map). */
4185 	vm_map_lock(kernel_map);
4186 
4187 	if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
4188 	    MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
4189 	    PROT_NONE, 0) != 0) {
4190 		error = ENOMEM;
4191 		goto fail2;
4192 	}
4193 	*dstaddrp = dstaddr;
4194 
4195 	/*
4196 	 * We now have srcmap and kernel_map locked.
4197 	 * dstaddr contains the destination offset in dstmap.
4198 	 */
4199 	/* step 1: start looping through map entries, performing extraction. */
4200 	for (entry = first; entry != NULL && entry->start < end;
4201 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4202 		KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
4203 		if (UVM_ET_ISHOLE(entry))
4204 			continue;
4205 
4206 		/* Calculate uvm_mapent_clone parameters. */
4207 		cp_start = entry->start;
4208 		if (cp_start < start) {
4209 			cp_off = start - cp_start;
4210 			cp_start = start;
4211 		} else
4212 			cp_off = 0;
4213 		cp_len = MIN(entry->end, end) - cp_start;
4214 
4215 		newentry = uvm_mapent_clone(kernel_map,
4216 		    cp_start - start + dstaddr, cp_len, cp_off,
4217 		    entry->protection, entry->max_protection,
4218 		    entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
4219 		if (newentry == NULL) {
4220 			error = ENOMEM;
4221 			goto fail2_unmap;
4222 		}
4223 		kernel_map->size += cp_len;
4224 		if (flags & UVM_EXTRACT_FIXPROT)
4225 			newentry->protection = newentry->max_protection;
4226 
4227 		/*
4228 		 * Step 2: perform pmap copy.
4229 		 * (Doing this in the loop saves one RB traversal.)
4230 		 */
4231 		pmap_copy(kernel_map->pmap, srcmap->pmap,
4232 		    cp_start - start + dstaddr, cp_len, cp_start);
4233 	}
4234 	pmap_update(kernel_map->pmap);
4235 
4236 	error = 0;
4237 
4238 	/* Unmap copied entries on failure. */
4239 fail2_unmap:
4240 	if (error) {
4241 		uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
4242 		    FALSE, TRUE);
4243 	}
4244 
4245 	/* Release maps, release dead entries. */
4246 fail2:
4247 	vm_map_unlock(kernel_map);
4248 
4249 fail:
4250 	vm_map_unlock(srcmap);
4251 
4252 	uvm_unmap_detach(&dead, 0);
4253 
4254 	return error;
4255 }
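
/*
 * Illustrative sketch, not from the original source and not compiled:
 * extracting a page-aligned range of a process map into kernel_map and
 * tearing the kernel mapping down again when done.  The helper name and
 * the callback are hypothetical; error handling is abbreviated.
 */
#if 0
static int
example_with_extracted(struct vm_map *srcmap, vaddr_t start, vsize_t len,
    void (*use)(void *, vsize_t))
{
	vaddr_t kva;
	int error;

	error = uvm_map_extract(srcmap, start, len, &kva,
	    UVM_EXTRACT_FIXPROT);
	if (error)
		return error;

	(*use)((void *)kva, len);		/* access through kernel VA */

	uvm_unmap(kernel_map, kva, kva + len);
	return 0;
}
#endif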
4256 
4257 /*
4258  * uvm_map_clean: clean out a map range
4259  *
4260  * => valid flags:
4261  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
4262  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
4263  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
4264  *   if (flags & PGO_FREE): any cached pages are freed after clean
4265  * => returns an error if any part of the specified range isn't mapped
4266  * => never a need to flush amap layer since the anonymous memory has
4267  *	no permanent home, but may deactivate pages there
4268  * => called from sys_msync() and sys_madvise()
4269  * => caller must not write-lock map (read OK).
4270  * => we may sleep while cleaning if SYNCIO [with map read-locked]
4271  */
4272 
4273 int
4274 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4275 {
4276 	struct vm_map_entry *first, *entry;
4277 	struct vm_amap *amap;
4278 	struct vm_anon *anon;
4279 	struct vm_page *pg;
4280 	struct uvm_object *uobj;
4281 	vaddr_t cp_start, cp_end;
4282 	int refs;
4283 	int error;
4284 	boolean_t rv;
4285 
4286 	KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
4287 	    (PGO_FREE|PGO_DEACTIVATE));
4288 
4289 	if (start > end || start < map->min_offset || end > map->max_offset)
4290 		return EINVAL;
4291 
4292 	vm_map_lock_read(map);
4293 	first = uvm_map_entrybyaddr(&map->addr, start);
4294 
4295 	/* Make a first pass to check for holes. */
4296 	for (entry = first; entry != NULL && entry->start < end;
4297 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4298 		if (UVM_ET_ISSUBMAP(entry)) {
4299 			vm_map_unlock_read(map);
4300 			return EINVAL;
4301 		}
4302 		if (UVM_ET_ISSUBMAP(entry) ||
4303 		    UVM_ET_ISHOLE(entry) ||
4304 		    (entry->end < end &&
4305 		    VMMAP_FREE_END(entry) != entry->end)) {
4306 			vm_map_unlock_read(map);
4307 			return EFAULT;
4308 		}
4309 	}
4310 
4311 	error = 0;
4312 	for (entry = first; entry != NULL && entry->start < end;
4313 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4314 		amap = entry->aref.ar_amap;	/* top layer */
4315 		if (UVM_ET_ISOBJ(entry))
4316 			uobj = entry->object.uvm_obj;
4317 		else
4318 			uobj = NULL;
4319 
4320 		/*
4321 		 * No amap cleaning necessary if:
4322 		 *  - there's no amap
4323 		 *  - we're not deactivating or freeing pages.
4324 		 */
4325 		if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4326 			goto flush_object;
4327 
4328 		cp_start = MAX(entry->start, start);
4329 		cp_end = MIN(entry->end, end);
4330 
4331 		for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
4332 			anon = amap_lookup(&entry->aref,
4333 			    cp_start - entry->start);
4334 			if (anon == NULL)
4335 				continue;
4336 
4337 			pg = anon->an_page;
4338 			if (pg == NULL) {
4339 				continue;
4340 			}
4341 			KASSERT(pg->pg_flags & PQ_ANON);
4342 
4343 			switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4344 			/*
4345 			 * XXX In these first 3 cases, we always just
4346 			 * XXX deactivate the page.  We may want to
4347 			 * XXX handle the different cases more
4348 			 * XXX specifically, in the future.
4349 			 */
4350 			case PGO_CLEANIT|PGO_FREE:
4351 			case PGO_CLEANIT|PGO_DEACTIVATE:
4352 			case PGO_DEACTIVATE:
4353 deactivate_it:
4354 				/* skip the page if it's wired */
4355 				if (pg->wire_count != 0)
4356 					break;
4357 
4358 				uvm_lock_pageq();
4359 
4360 				KASSERT(pg->uanon == anon);
4361 
4362 				/* zap all mappings for the page. */
4363 				pmap_page_protect(pg, PROT_NONE);
4364 
4365 				/* ...and deactivate the page. */
4366 				uvm_pagedeactivate(pg);
4367 
4368 				uvm_unlock_pageq();
4369 				break;
4370 			case PGO_FREE:
4371 				/*
4372 				 * If there are multiple references to
4373 				 * the amap, just deactivate the page.
4374 				 */
4375 				if (amap_refs(amap) > 1)
4376 					goto deactivate_it;
4377 
4378 				/* XXX skip the page if it's wired */
4379 				if (pg->wire_count != 0) {
4380 					break;
4381 				}
4382 				amap_unadd(&entry->aref,
4383 				    cp_start - entry->start);
4384 				refs = --anon->an_ref;
4385 				if (refs == 0)
4386 					uvm_anfree(anon);
4387 				break;
4388 			default:
4389 				panic("uvm_map_clean: weird flags");
4390 			}
4391 		}
4392 
4393 flush_object:
4394 		cp_start = MAX(entry->start, start);
4395 		cp_end = MIN(entry->end, end);
4396 
4397 		/*
4398 		 * flush pages if we've got a valid backing object.
4399 		 *
4400 		 * Don't PGO_FREE if we don't have write permission
4401 		 * and don't flush if this is a copy-on-write object
4402 		 * since we can't know our permissions on it.
4403 		 */
4404 		if (uobj != NULL &&
4405 		    ((flags & PGO_FREE) == 0 ||
4406 		     ((entry->max_protection & PROT_WRITE) != 0 &&
4407 		      (entry->etype & UVM_ET_COPYONWRITE) == 0))) {
4408 			rv = uobj->pgops->pgo_flush(uobj,
4409 			    cp_start - entry->start + entry->offset,
4410 			    cp_end - entry->start + entry->offset, flags);
4411 
4412 			if (rv == FALSE)
4413 				error = EFAULT;
4414 		}
4415 	}
4416 
4417 	vm_map_unlock_read(map);
4418 	return error;
4419 }
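
/*
 * Illustrative sketch, not from the original source and not compiled:
 * the two typical flavours of caller.  An msync(2)-style request with
 * MS_SYNC roughly corresponds to the first call, an madvise(2)-style
 * MADV_DONTNEED to the second.  The helper name is hypothetical.
 */
#if 0
static int
example_sync_and_deactivate(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	int error;

	/* write dirty pages back synchronously */
	error = uvm_map_clean(map, start, end, PGO_CLEANIT | PGO_SYNCIO);
	if (error)
		return error;

	/* push the (now clean) pages towards reuse without freeing them */
	return uvm_map_clean(map, start, end, PGO_DEACTIVATE);
}
#endif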
4420 
4421 /*
4422  * UVM_MAP_CLIP_END implementation
4423  */
4424 void
4425 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4426 {
4427 	struct vm_map_entry *tmp;
4428 
4429 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4430 	tmp = uvm_mapent_alloc(map, 0);
4431 
4432 	/* Invoke splitentry. */
4433 	uvm_map_splitentry(map, entry, tmp, addr);
4434 }
4435 
4436 /*
4437  * UVM_MAP_CLIP_START implementation
4438  *
4439  * Clippers are required to not change the pointers to the entry they are
4440  * clipping on.
4441  * Since uvm_map_splitentry turns the original entry into the lowest
4442  * entry (address wise) we do a swap between the new entry and the original
4443  * entry, prior to calling uvm_map_splitentry.
4444  */
4445 void
4446 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4447 {
4448 	struct vm_map_entry *tmp;
4449 	struct uvm_addr_state *free;
4450 
4451 	/* Unlink original. */
4452 	free = uvm_map_uaddr_e(map, entry);
4453 	uvm_mapent_free_remove(map, free, entry);
4454 	uvm_mapent_addr_remove(map, entry);
4455 
4456 	/* Copy entry. */
4457 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4458 	tmp = uvm_mapent_alloc(map, 0);
4459 	uvm_mapent_copy(entry, tmp);
4460 
4461 	/* Put new entry in place of original entry. */
4462 	uvm_mapent_addr_insert(map, tmp);
4463 	uvm_mapent_free_insert(map, free, tmp);
4464 
4465 	/* Invoke splitentry. */
4466 	uvm_map_splitentry(map, tmp, entry, addr);
4467 }
4468 
4469 /*
4470  * Boundary fixer.
4471  */
4472 static __inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
4473 static __inline vaddr_t
4474 uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
4475 {
4476 	return (min < bound && max > bound) ? bound : max;
4477 }
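
/*
 * Example behaviour (illustrative values): with min = 0x1000 and
 * max = 0x9000, a bound of 0x4000 lies strictly between them and is
 * returned; a bound of 0xf000 does not, so max (0x9000) is returned.
 */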
4478 
4479 /*
4480  * Choose free list based on address at start of free space.
4481  *
4482  * The uvm_addr_state returned contains addr and is the first of:
4483  * - uaddr_exe
4484  * - uaddr_brk_stack
4485  * - uaddr_any
4486  */
4487 struct uvm_addr_state*
4488 uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
4489 {
4490 	struct uvm_addr_state *uaddr;
4491 	int i;
4492 
4493 	/* Special case the first page, to prevent mmap from returning 0. */
4494 	if (addr < VMMAP_MIN_ADDR)
4495 		return NULL;
4496 
4497 	/* Upper bound for kernel maps at uvm_maxkaddr. */
4498 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
4499 		if (addr >= uvm_maxkaddr)
4500 			return NULL;
4501 	}
4502 
4503 	/* Is the address inside the exe-only map? */
4504 	if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
4505 	    addr < map->uaddr_exe->uaddr_maxaddr)
4506 		return map->uaddr_exe;
4507 
4508 	/* Check if the space falls inside brk/stack area. */
4509 	if ((addr >= map->b_start && addr < map->b_end) ||
4510 	    (addr >= map->s_start && addr < map->s_end)) {
4511 		if (map->uaddr_brk_stack != NULL &&
4512 		    addr >= map->uaddr_brk_stack->uaddr_minaddr &&
4513 		    addr < map->uaddr_brk_stack->uaddr_maxaddr) {
4514 			return map->uaddr_brk_stack;
4515 		} else
4516 			return NULL;
4517 	}
4518 
4519 	/*
4520 	 * Check the other selectors.
4521 	 *
4522 	 * These selectors are only marked as the owner if they have insert
4523 	 * functions.
4524 	 */
4525 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4526 		uaddr = map->uaddr_any[i];
4527 		if (uaddr == NULL)
4528 			continue;
4529 		if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
4530 			continue;
4531 
4532 		if (addr >= uaddr->uaddr_minaddr &&
4533 		    addr < uaddr->uaddr_maxaddr)
4534 			return uaddr;
4535 	}
4536 
4537 	return NULL;
4538 }
4539 
4540 /*
4541  * Choose free list based on address at start of free space.
4542  *
4543  * The uvm_addr_state returned contains addr and is the first of:
4544  * - uaddr_exe
4545  * - uaddr_brk_stack
4546  * - uaddr_any
4547  */
4548 struct uvm_addr_state*
4549 uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
4550 {
4551 	return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
4552 }
4553 
4554 /*
4555  * Returns the first free-memory boundary that is crossed by [min-max].
4556  */
4557 vsize_t
4558 uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
4559 {
4560 	struct uvm_addr_state	*uaddr;
4561 	int			 i;
4562 
4563 	/* Never return first page. */
4564 	max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
4565 
4566 	/* Treat the maxkaddr special, if the map is a kernel_map. */
4567 	if ((map->flags & VM_MAP_ISVMSPACE) == 0)
4568 		max = uvm_map_boundfix(min, max, uvm_maxkaddr);
4569 
4570 	/* Check for exe-only boundaries. */
4571 	if (map->uaddr_exe != NULL) {
4572 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
4573 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
4574 	}
4575 
4576 	/* Check for brk/stack boundaries. */
4577 	if (map->uaddr_brk_stack != NULL) {
4578 		max = uvm_map_boundfix(min, max,
4579 		    map->uaddr_brk_stack->uaddr_minaddr);
4580 		max = uvm_map_boundfix(min, max,
4581 		    map->uaddr_brk_stack->uaddr_maxaddr);
4582 	}
4583 
4584 	/* Check other boundaries. */
4585 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4586 		uaddr = map->uaddr_any[i];
4587 		if (uaddr != NULL) {
4588 			max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
4589 			max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
4590 		}
4591 	}
4592 
4593 	/* Boundaries at stack and brk() area. */
4594 	max = uvm_map_boundfix(min, max, map->s_start);
4595 	max = uvm_map_boundfix(min, max, map->s_end);
4596 	max = uvm_map_boundfix(min, max, map->b_start);
4597 	max = uvm_map_boundfix(min, max, map->b_end);
4598 
4599 	return max;
4600 }
4601 
4602 /*
4603  * Update map allocation start and end addresses from proc vmspace.
4604  */
4605 void
4606 uvm_map_vmspace_update(struct vm_map *map,
4607     struct uvm_map_deadq *dead, int flags)
4608 {
4609 	struct vmspace *vm;
4610 	vaddr_t b_start, b_end, s_start, s_end;
4611 
4612 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
4613 	KASSERT(offsetof(struct vmspace, vm_map) == 0);
4614 
4615 	/*
4616 	 * Derive actual allocation boundaries from vmspace.
4617 	 */
4618 	vm = (struct vmspace *)map;
4619 	b_start = (vaddr_t)vm->vm_daddr;
4620 	b_end   = b_start + BRKSIZ;
4621 	s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4622 	s_end   = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4623 #ifdef DIAGNOSTIC
4624 	if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
4625 	    (b_end & (vaddr_t)PAGE_MASK) != 0 ||
4626 	    (s_start & (vaddr_t)PAGE_MASK) != 0 ||
4627 	    (s_end & (vaddr_t)PAGE_MASK) != 0) {
4628 		panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
4629 		    "b=0x%lx-0x%lx s=0x%lx-0x%lx",
4630 		    vm, b_start, b_end, s_start, s_end);
4631 	}
4632 #endif
4633 
4634 	if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
4635 	    map->s_start == s_start && map->s_end == s_end))
4636 		return;
4637 
4638 	uvm_map_freelist_update(map, dead, b_start, b_end,
4639 	    s_start, s_end, flags);
4640 }
4641 
4642 /*
4643  * Grow kernel memory.
4644  *
4645  * This function is only called for kernel maps when an allocation fails.
4646  *
4647  * If the map has a gap that is large enough to accommodate alloc_sz, this
4648  * function will make sure map->free will include it.
4649  */
4650 void
4651 uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
4652     vsize_t alloc_sz, int flags)
4653 {
4654 	vsize_t sz;
4655 	vaddr_t end;
4656 	struct vm_map_entry *entry;
4657 
4658 	/* Kernel memory only. */
4659 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
4660 	/* Destroy free list. */
4661 	uvm_map_freelist_update_clear(map, dead);
4662 
4663 	/* Include the guard page in the hard minimum requirement of alloc_sz. */
4664 	if (map->flags & VM_MAP_GUARDPAGES)
4665 		alloc_sz += PAGE_SIZE;
4666 
4667 	/*
4668 	 * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
4669 	 *
4670 	 * Don't handle the case where the multiplication overflows:
4671 	 * if that happens, the allocation is probably too big anyway.
4672 	 */
4673 	sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
4674 
4675 	/*
4676 	 * Walk forward until a gap large enough for alloc_sz shows up.
4677 	 *
4678 	 * We assume the kernel map has no boundaries.
4679 	 * uvm_maxkaddr may be zero.
4680 	 */
4681 	end = MAX(uvm_maxkaddr, map->min_offset);
4682 	entry = uvm_map_entrybyaddr(&map->addr, end);
4683 	while (entry && entry->fspace < alloc_sz)
4684 		entry = RBT_NEXT(uvm_map_addr, entry);
4685 	if (entry) {
4686 		end = MAX(VMMAP_FREE_START(entry), end);
4687 		end += MIN(sz, map->max_offset - end);
4688 	} else
4689 		end = map->max_offset;
4690 
4691 	/* Reserve pmap entries. */
4692 #ifdef PMAP_GROWKERNEL
4693 	uvm_maxkaddr = pmap_growkernel(end);
4694 #else
4695 	uvm_maxkaddr = MAX(uvm_maxkaddr, end);
4696 #endif
4697 
4698 	/* Rebuild free list. */
4699 	uvm_map_freelist_update_refill(map, flags);
4700 }
4701 
4702 /*
4703  * Freelist update subfunction: unlink all entries from freelists.
4704  */
4705 void
4706 uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
4707 {
4708 	struct uvm_addr_state *free;
4709 	struct vm_map_entry *entry, *prev, *next;
4710 
4711 	prev = NULL;
4712 	for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL;
4713 	    entry = next) {
4714 		next = RBT_NEXT(uvm_map_addr, entry);
4715 
4716 		free = uvm_map_uaddr_e(map, entry);
4717 		uvm_mapent_free_remove(map, free, entry);
4718 
4719 		if (prev != NULL && entry->start == entry->end) {
4720 			prev->fspace += VMMAP_FREE_END(entry) - entry->end;
4721 			uvm_mapent_addr_remove(map, entry);
4722 			DEAD_ENTRY_PUSH(dead, entry);
4723 		} else
4724 			prev = entry;
4725 	}
4726 }
4727 
4728 /*
4729  * Freelist update subfunction: refill the freelists with entries.
4730  */
4731 void
4732 uvm_map_freelist_update_refill(struct vm_map *map, int flags)
4733 {
4734 	struct vm_map_entry *entry;
4735 	vaddr_t min, max;
4736 
4737 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
4738 		min = VMMAP_FREE_START(entry);
4739 		max = VMMAP_FREE_END(entry);
4740 		entry->fspace = 0;
4741 
4742 		entry = uvm_map_fix_space(map, entry, min, max, flags);
4743 	}
4744 
4745 	uvm_tree_sanity(map, __FILE__, __LINE__);
4746 }
4747 
4748 /*
4749  * Change {a,b}_{start,end} allocation ranges and associated free lists.
4750  */
4751 void
4752 uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
4753     vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
4754 {
4755 	KDASSERT(b_end >= b_start && s_end >= s_start);
4756 
4757 	/* Clear all free lists. */
4758 	uvm_map_freelist_update_clear(map, dead);
4759 
4760 	/* Apply new bounds. */
4761 	map->b_start = b_start;
4762 	map->b_end   = b_end;
4763 	map->s_start = s_start;
4764 	map->s_end   = s_end;
4765 
4766 	/* Refill free lists. */
4767 	uvm_map_freelist_update_refill(map, flags);
4768 }
4769 
4770 /*
4771  * Assign a uvm_addr_state to the specified pointer in vm_map.
4772  *
4773  * May sleep.
4774  */
4775 void
4776 uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
4777     struct uvm_addr_state *newval)
4778 {
4779 	struct uvm_map_deadq dead;
4780 
4781 	/* Pointer which must be in this map. */
4782 	KASSERT(which != NULL);
4783 	KASSERT((void*)map <= (void*)(which) &&
4784 	    (void*)(which) < (void*)(map + 1));
4785 
4786 	vm_map_lock(map);
4787 	TAILQ_INIT(&dead);
4788 	uvm_map_freelist_update_clear(map, &dead);
4789 
4790 	uvm_addr_destroy(*which);
4791 	*which = newval;
4792 
4793 	uvm_map_freelist_update_refill(map, 0);
4794 	vm_map_unlock(map);
4795 	uvm_unmap_detach(&dead, 0);
4796 }
4797 
4798 /*
4799  * Correct space insert.
4800  *
4801  * Entry must not be on any freelist.
4802  */
4803 struct vm_map_entry*
4804 uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
4805     vaddr_t min, vaddr_t max, int flags)
4806 {
4807 	struct uvm_addr_state	*free, *entfree;
4808 	vaddr_t			 lmax;
4809 
4810 	KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
4811 	KDASSERT(min <= max);
4812 	KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
4813 	    min == map->min_offset);
4814 
4815 	/*
4816 	 * During the function, entfree will always point at the uaddr state
4817 	 * for entry.
4818 	 */
4819 	entfree = (entry == NULL ? NULL :
4820 	    uvm_map_uaddr_e(map, entry));
4821 
4822 	while (min != max) {
4823 		/* Claim guard page for entry. */
4824 		if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
4825 		    VMMAP_FREE_END(entry) == entry->end &&
4826 		    entry->start != entry->end) {
4827 			if (max - min == 2 * PAGE_SIZE) {
4828 				/*
4829 				 * If the free-space gap is exactly 2 pages,
4830 				 * we make the guard 2 pages instead of 1.
4831 				 * Because in a guarded map, an area needs
4832 				 * at least 2 pages to allocate from:
4833 				 * one page for the allocation and one for
4834 				 * the guard.
4835 				 */
4836 				entry->guard = 2 * PAGE_SIZE;
4837 				min = max;
4838 			} else {
4839 				entry->guard = PAGE_SIZE;
4840 				min += PAGE_SIZE;
4841 			}
4842 			continue;
4843 		}
4844 
4845 		/*
4846 		 * Handle the case where entry has a 2-page guard, but the
4847 		 * space after entry is freed.
4848 		 */
4849 		if (entry != NULL && entry->fspace == 0 &&
4850 		    entry->guard > PAGE_SIZE) {
4851 			entry->guard = PAGE_SIZE;
4852 			min = VMMAP_FREE_START(entry);
4853 		}
4854 
4855 		lmax = uvm_map_boundary(map, min, max);
4856 		free = uvm_map_uaddr(map, min);
4857 
4858 		/*
4859 		 * Entries are merged if they point at the same uvm_free().
4860 		 * Exception to that rule: if min == uvm_maxkaddr, a new
4861 		 * entry is started regardless (otherwise the allocators
4862 		 * will get confused).
4863 		 */
4864 		if (entry != NULL && free == entfree &&
4865 		    !((map->flags & VM_MAP_ISVMSPACE) == 0 &&
4866 		    min == uvm_maxkaddr)) {
4867 			KDASSERT(VMMAP_FREE_END(entry) == min);
4868 			entry->fspace += lmax - min;
4869 		} else {
4870 			/*
4871 			 * Commit entry to the free list: no more free space
4872 			 * will be added to it.
4873 			 * We'll start a new entry and add to that entry
4874 			 * instead.
4875 			 */
4876 			if (entry != NULL)
4877 				uvm_mapent_free_insert(map, entfree, entry);
4878 
4879 			/* New entry for new uaddr. */
4880 			entry = uvm_mapent_alloc(map, flags);
4881 			KDASSERT(entry != NULL);
4882 			entry->end = entry->start = min;
4883 			entry->guard = 0;
4884 			entry->fspace = lmax - min;
4885 			entry->object.uvm_obj = NULL;
4886 			entry->offset = 0;
4887 			entry->etype = 0;
4888 			entry->protection = entry->max_protection = 0;
4889 			entry->inheritance = 0;
4890 			entry->wired_count = 0;
4891 			entry->advice = 0;
4892 			entry->aref.ar_pageoff = 0;
4893 			entry->aref.ar_amap = NULL;
4894 			uvm_mapent_addr_insert(map, entry);
4895 
4896 			entfree = free;
4897 		}
4898 
4899 		min = lmax;
4900 	}
4901 	/* Finally put entry on the uaddr state. */
4902 	if (entry != NULL)
4903 		uvm_mapent_free_insert(map, entfree, entry);
4904 
4905 	return entry;
4906 }
4907 
4908 /*
4909  * MQuery style of allocation.
4910  *
4911  * This allocator searches forward until sufficient space is found to map
4912  * the given size.
4913  *
4914  * XXX: factor in offset (via pmap_prefer) and protection?
4915  */
4916 int
4917 uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
4918     int flags)
4919 {
4920 	struct vm_map_entry *entry, *last;
4921 	vaddr_t addr;
4922 	vaddr_t tmp, pmap_align, pmap_offset;
4923 	int error;
4924 
4925 	addr = *addr_p;
4926 	vm_map_lock_read(map);
4927 
4928 	/* Configure pmap prefer. */
4929 	if (offset != UVM_UNKNOWN_OFFSET) {
4930 		pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
4931 		pmap_offset = PMAP_PREFER_OFFSET(offset);
4932 	} else {
4933 		pmap_align = PAGE_SIZE;
4934 		pmap_offset = 0;
4935 	}
4936 
4937 	/* Align address to pmap_prefer unless FLAG_FIXED is set. */
4938 	if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
4939 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4940 		if (tmp < addr)
4941 			tmp += pmap_align;
4942 		addr = tmp;
4943 	}
4944 
4945 	/* First, check if the requested range is fully available. */
4946 	entry = uvm_map_entrybyaddr(&map->addr, addr);
4947 	last = NULL;
4948 	if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
4949 		error = 0;
4950 		goto out;
4951 	}
4952 	if (flags & UVM_FLAG_FIXED) {
4953 		error = EINVAL;
4954 		goto out;
4955 	}
4956 
4957 	error = ENOMEM; /* Default error from here. */
4958 
4959 	/*
4960 	 * At this point, the memory at <addr, sz> is not available.
4961 	 * The reasons are:
4962 	 * [1] it's outside the map,
4963 	 * [2] it starts in used memory (and therefore needs to move
4964 	 *     toward the first free page in entry),
4965 	 * [3] it starts in free memory but bumps into used memory.
4966 	 *
4967 	 * Note that for case [2], the forward moving is handled by the
4968 	 * for loop below.
4969 	 */
4970 	if (entry == NULL) {
4971 		/* [1] Outside the map. */
4972 		if (addr >= map->max_offset)
4973 			goto out;
4974 		else
4975 			entry = RBT_MIN(uvm_map_addr, &map->addr);
4976 	} else if (VMMAP_FREE_START(entry) <= addr) {
4977 		/* [3] Bumped into used memory. */
4978 		entry = RBT_NEXT(uvm_map_addr, entry);
4979 	}
4980 
4981 	/* Test if the next entry is sufficient for the allocation. */
4982 	for (; entry != NULL;
4983 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4984 		if (entry->fspace == 0)
4985 			continue;
4986 		addr = VMMAP_FREE_START(entry);
4987 
4988 restart:	/* Restart address checks on address change. */
4989 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4990 		if (tmp < addr)
4991 			tmp += pmap_align;
4992 		addr = tmp;
4993 		if (addr >= VMMAP_FREE_END(entry))
4994 			continue;
4995 
4996 		/* Skip brk() allocation addresses. */
4997 		if (addr + sz > map->b_start && addr < map->b_end) {
4998 			if (VMMAP_FREE_END(entry) > map->b_end) {
4999 				addr = map->b_end;
5000 				goto restart;
5001 			} else
5002 				continue;
5003 		}
5004 		/* Skip stack allocation addresses. */
5005 		if (addr + sz > map->s_start && addr < map->s_end) {
5006 			if (VMMAP_FREE_END(entry) > map->s_end) {
5007 				addr = map->s_end;
5008 				goto restart;
5009 			} else
5010 				continue;
5011 		}
5012 
5013 		last = NULL;
5014 		if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5015 			error = 0;
5016 			goto out;
5017 		}
5018 	}
5019 
5020 out:
5021 	vm_map_unlock_read(map);
5022 	if (error == 0)
5023 		*addr_p = addr;
5024 	return error;
5025 }
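
/*
 * Illustrative sketch, not from the original source and not compiled:
 * probing for free space at or above a hint, mquery(2)-style.  Passing
 * UVM_FLAG_FIXED instead makes the call fail unless the hint itself is
 * available.  The helper name is hypothetical.
 */
#if 0
static int
example_probe_free(struct vm_map *map, vaddr_t hint, vsize_t sz,
    vaddr_t *found)
{
	vaddr_t addr = hint;
	int error;

	error = uvm_map_mquery(map, &addr, sz, UVM_UNKNOWN_OFFSET, 0);
	if (error == 0)
		*found = addr;	/* page-aligned address with sz bytes free */
	return error;
}
#endif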
5026 
5027 /*
5028  * Determine allocation bias.
5029  *
5030  * Returns 1 if we should bias to high addresses, -1 for a bias towards low
5031  * addresses, or 0 for no bias.
5032  * The bias mechanism is intended to avoid clashing with brk() and stack
5033  * areas.
5034  */
5035 int
5036 uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry)
5037 {
5038 	vaddr_t start, end;
5039 
5040 	start = VMMAP_FREE_START(entry);
5041 	end = VMMAP_FREE_END(entry);
5042 
5043 	/* Stay at the top of brk() area. */
5044 	if (end >= map->b_start && start < map->b_end)
5045 		return 1;
5046 	/* Stay at the far end of the stack area. */
5047 	if (end >= map->s_start && start < map->s_end) {
5048 #ifdef MACHINE_STACK_GROWS_UP
5049 		return 1;
5050 #else
5051 		return -1;
5052 #endif
5053 	}
5054 
5055 	/* No bias, this area is meant for us. */
5056 	return 0;
5057 }
5058 
5059 
5060 boolean_t
5061 vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
5062 {
5063 	boolean_t rv;
5064 
5065 	if (map->flags & VM_MAP_INTRSAFE) {
5066 		rv = mtx_enter_try(&map->mtx);
5067 	} else {
5068 		mtx_enter(&map->flags_lock);
5069 		if (map->flags & VM_MAP_BUSY) {
5070 			mtx_leave(&map->flags_lock);
5071 			return (FALSE);
5072 		}
5073 		mtx_leave(&map->flags_lock);
5074 		rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0);
5075 		/* check if the lock is busy and back out if we won the race */
5076 		if (rv) {
5077 			mtx_enter(&map->flags_lock);
5078 			if (map->flags & VM_MAP_BUSY) {
5079 				rw_exit(&map->lock);
5080 				rv = FALSE;
5081 			}
5082 			mtx_leave(&map->flags_lock);
5083 		}
5084 	}
5085 
5086 	if (rv) {
5087 		map->timestamp++;
5088 		LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5089 		uvm_tree_sanity(map, file, line);
5090 		uvm_tree_size_chk(map, file, line);
5091 	}
5092 
5093 	return (rv);
5094 }
5095 
5096 void
5097 vm_map_lock_ln(struct vm_map *map, char *file, int line)
5098 {
5099 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5100 		do {
5101 			mtx_enter(&map->flags_lock);
5102 tryagain:
5103 			while (map->flags & VM_MAP_BUSY) {
5104 				map->flags |= VM_MAP_WANTLOCK;
5105 				msleep(&map->flags, &map->flags_lock,
5106 				    PVM, vmmapbsy, 0);
5107 			}
5108 			mtx_leave(&map->flags_lock);
5109 		} while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0);
5110 		/* check if the lock is busy and back out if we won the race */
5111 		mtx_enter(&map->flags_lock);
5112 		if (map->flags & VM_MAP_BUSY) {
5113 			rw_exit(&map->lock);
5114 			goto tryagain;
5115 		}
5116 		mtx_leave(&map->flags_lock);
5117 	} else {
5118 		mtx_enter(&map->mtx);
5119 	}
5120 
5121 	map->timestamp++;
5122 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5123 	uvm_tree_sanity(map, file, line);
5124 	uvm_tree_size_chk(map, file, line);
5125 }
5126 
5127 void
5128 vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
5129 {
5130 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5131 		rw_enter_read(&map->lock);
5132 	else
5133 		mtx_enter(&map->mtx);
5134 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5135 	uvm_tree_sanity(map, file, line);
5136 	uvm_tree_size_chk(map, file, line);
5137 }
5138 
5139 void
5140 vm_map_unlock_ln(struct vm_map *map, char *file, int line)
5141 {
5142 	uvm_tree_sanity(map, file, line);
5143 	uvm_tree_size_chk(map, file, line);
5144 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5145 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5146 		rw_exit(&map->lock);
5147 	else
5148 		mtx_leave(&map->mtx);
5149 }
5150 
5151 void
5152 vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
5153 {
5154 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5155 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5156 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5157 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5158 		rw_exit_read(&map->lock);
5159 	else
5160 		mtx_leave(&map->mtx);
5161 }
5162 
5163 void
5164 vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
5165 {
5166 	uvm_tree_sanity(map, file, line);
5167 	uvm_tree_size_chk(map, file, line);
5168 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5169 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5170 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5171 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5172 		rw_enter(&map->lock, RW_DOWNGRADE);
5173 }
5174 
5175 void
5176 vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
5177 {
5178 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5179 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5180 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5181 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5182 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5183 		rw_exit_read(&map->lock);
5184 		rw_enter_write(&map->lock);
5185 	}
5186 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5187 	uvm_tree_sanity(map, file, line);
5188 }
5189 
5190 void
5191 vm_map_busy_ln(struct vm_map *map, char *file, int line)
5192 {
5193 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5194 	mtx_enter(&map->flags_lock);
5195 	map->flags |= VM_MAP_BUSY;
5196 	mtx_leave(&map->flags_lock);
5197 }
5198 
5199 void
5200 vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
5201 {
5202 	int oflags;
5203 
5204 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5205 	mtx_enter(&map->flags_lock);
5206 	oflags = map->flags;
5207 	map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
5208 	mtx_leave(&map->flags_lock);
5209 	if (oflags & VM_MAP_WANTLOCK)
5210 		wakeup(&map->flags);
5211 }
5212 
5213 #ifndef SMALL_KERNEL
5214 int
5215 uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
5216     size_t *lenp)
5217 {
5218 	struct vm_map_entry *entry;
5219 	vaddr_t start;
5220 	int cnt, maxcnt, error = 0;
5221 
5222 	KASSERT(*lenp > 0);
5223 	KASSERT((*lenp % sizeof(*kve)) == 0);
5224 	cnt = 0;
5225 	maxcnt = *lenp / sizeof(*kve);
5226 	KASSERT(maxcnt > 0);
5227 
5228 	/*
5229 	 * Return only entries whose address is above the given base
5230 	 * address.  This allows userland to iterate without knowing the
5231 	 * number of entries beforehand.
5232 	 */
5233 	start = (vaddr_t)kve[0].kve_start;
5234 
5235 	vm_map_lock(map);
5236 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
5237 		if (cnt == maxcnt) {
5238 			error = ENOMEM;
5239 			break;
5240 		}
5241 		if (start != 0 && entry->start < start)
5242 			continue;
5243 		kve->kve_start = entry->start;
5244 		kve->kve_end = entry->end;
5245 		kve->kve_guard = entry->guard;
5246 		kve->kve_fspace = entry->fspace;
5247 		kve->kve_fspace_augment = entry->fspace_augment;
5248 		kve->kve_offset = entry->offset;
5249 		kve->kve_wired_count = entry->wired_count;
5250 		kve->kve_etype = entry->etype;
5251 		kve->kve_protection = entry->protection;
5252 		kve->kve_max_protection = entry->max_protection;
5253 		kve->kve_advice = entry->advice;
5254 		kve->kve_inheritance = entry->inheritance;
5255 		kve->kve_flags = entry->flags;
5256 		kve++;
5257 		cnt++;
5258 	}
5259 	vm_map_unlock(map);
5260 
5261 	KASSERT(cnt <= maxcnt);
5262 
5263 	*lenp = sizeof(*kve) * cnt;
5264 	return error;
5265 }
5266 #endif
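
/*
 * Illustrative sketch, not from the original source and not compiled:
 * walking a map in fixed-size chunks with uvm_map_fill_vmmap().  The
 * caller primes kve[0].kve_start with the last end address seen (0 on
 * the first call) and repeats while the buffer keeps filling up.  The
 * buffer size and helper name are hypothetical.
 */
#if 0
static void
example_walk_vmmap(struct vm_map *map)
{
	struct kinfo_vmentry kve[32];
	size_t len;
	int error;

	kve[0].kve_start = 0;			/* start from the beginning */
	do {
		len = sizeof(kve);
		error = uvm_map_fill_vmmap(map, kve, &len);
		if (len == 0)
			break;
		/* ... consume len / sizeof(kve[0]) entries here ... */
		kve[0].kve_start = kve[len / sizeof(kve[0]) - 1].kve_end;
	} while (error == ENOMEM);	/* ENOMEM: more entries remain */
}
#endif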
5267 
5268 
5269 RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
5270     uvm_mapentry_addrcmp, uvm_map_addr_augment);
5271 
5272 
5273 /*
5274  * MD code: vmspace allocator setup.
5275  */
5276 
5277 #ifdef __i386__
5278 void
5279 uvm_map_setup_md(struct vm_map *map)
5280 {
5281 	vaddr_t		min, max;
5282 
5283 	min = map->min_offset;
5284 	max = map->max_offset;
5285 
5286 	/*
5287 	 * Ensure the selectors will not try to manage page 0;
5288 	 * it's too special.
5289 	 */
5290 	if (min < VMMAP_MIN_ADDR)
5291 		min = VMMAP_MIN_ADDR;
5292 
5293 #if 0	/* Cool stuff, not yet */
5294 	/* Hinted allocations. */
5295 	map->uaddr_any[1] = uaddr_hint_create(min, max, 1024 * 1024 * 1024);
5296 
5297 	/* Executable code is special. */
5298 	map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
5299 	/* Place normal allocations beyond executable mappings. */
5300 	map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
5301 #else	/* Crappy stuff, for now */
5302 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5303 #endif
5304 
5305 #ifndef SMALL_KERNEL
5306 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5307 #endif /* !SMALL_KERNEL */
5308 }
5309 #elif __LP64__
5310 void
5311 uvm_map_setup_md(struct vm_map *map)
5312 {
5313 	vaddr_t		min, max;
5314 
5315 	min = map->min_offset;
5316 	max = map->max_offset;
5317 
5318 	/*
5319 	 * Ensure the selectors will not try to manage page 0;
5320 	 * it's too special.
5321 	 */
5322 	if (min < VMMAP_MIN_ADDR)
5323 		min = VMMAP_MIN_ADDR;
5324 
5325 #if 0	/* Cool stuff, not yet */
5326 	/* Hinted allocations above 4GB */
5327 	map->uaddr_any[0] =
5328 	    uaddr_hint_create(0x100000000ULL, max, 1024 * 1024 * 1024);
5329 	/* Hinted allocations below 4GB */
5330 	map->uaddr_any[1] = uaddr_hint_create(min, 0x100000000ULL,
5331 	    1024 * 1024 * 1024);
5332 	/* Normal allocations, always above 4GB */
5333 	map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
5334 #else	/* Crappy stuff, for now */
5335 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5336 #endif
5337 
5338 #ifndef SMALL_KERNEL
5339 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5340 #endif /* !SMALL_KERNEL */
5341 }
5342 #else	/* non-i386, 32 bit */
5343 void
5344 uvm_map_setup_md(struct vm_map *map)
5345 {
5346 	vaddr_t		min, max;
5347 
5348 	min = map->min_offset;
5349 	max = map->max_offset;
5350 
5351 	/*
5352 	 * Ensure the selectors will not try to manage page 0;
5353 	 * it's too special.
5354 	 */
5355 	if (min < VMMAP_MIN_ADDR)
5356 		min = VMMAP_MIN_ADDR;
5357 
5358 #if 0	/* Cool stuff, not yet */
5359 	/* Hinted allocations. */
5360 	map->uaddr_any[1] = uaddr_hint_create(min, max, 1024 * 1024 * 1024);
5361 	/* Normal allocations. */
5362 	map->uaddr_any[3] = uaddr_pivot_create(min, max);
5363 #else	/* Crappy stuff, for now */
5364 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5365 #endif
5366 
5367 #ifndef SMALL_KERNEL
5368 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5369 #endif /* !SMALL_KERNEL */
5370 }
5371 #endif
5372