xref: /openbsd-src/sys/uvm/uvm_map.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /*	$OpenBSD: uvm_map.c,v 1.217 2016/06/17 10:48:25 dlg Exp $	*/
2 /*	$NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  *
19  *
20  * Copyright (c) 1997 Charles D. Cranor and Washington University.
21  * Copyright (c) 1991, 1993, The Regents of the University of California.
22  *
23  * All rights reserved.
24  *
25  * This code is derived from software contributed to Berkeley by
26  * The Mach Operating System project at Carnegie-Mellon University.
27  *
28  * Redistribution and use in source and binary forms, with or without
29  * modification, are permitted provided that the following conditions
30  * are met:
31  * 1. Redistributions of source code must retain the above copyright
32  *    notice, this list of conditions and the following disclaimer.
33  * 2. Redistributions in binary form must reproduce the above copyright
34  *    notice, this list of conditions and the following disclaimer in the
35  *    documentation and/or other materials provided with the distribution.
36  * 3. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	@(#)vm_map.c    8.3 (Berkeley) 1/12/94
53  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
54  *
55  *
56  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
57  * All rights reserved.
58  *
59  * Permission to use, copy, modify and distribute this software and
60  * its documentation is hereby granted, provided that both the copyright
61  * notice and this permission notice appear in all copies of the
62  * software, derivative works or modified versions, and any portions
63  * thereof, and that both notices appear in supporting documentation.
64  *
65  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
66  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
67  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
68  *
69  * Carnegie Mellon requests users of this software to return to
70  *
71  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
72  *  School of Computer Science
73  *  Carnegie Mellon University
74  *  Pittsburgh PA 15213-3890
75  *
76  * any improvements or extensions that they make and grant Carnegie the
77  * rights to redistribute these changes.
78  */
79 
80 /*
81  * uvm_map.c: uvm map operations
82  */
83 
84 /* #define DEBUG */
85 /* #define VMMAP_DEBUG */
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/mman.h>
90 #include <sys/proc.h>
91 #include <sys/malloc.h>
92 #include <sys/pool.h>
93 #include <sys/sysctl.h>
94 
95 #ifdef SYSVSHM
96 #include <sys/shm.h>
97 #endif
98 
99 #include <uvm/uvm.h>
100 
101 #ifdef DDB
102 #include <uvm/uvm_ddb.h>
103 #endif
104 
105 #include <uvm/uvm_addr.h>
106 
107 
108 vsize_t			 uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
109 int			 uvm_mapent_isjoinable(struct vm_map*,
110 			    struct vm_map_entry*, struct vm_map_entry*);
111 struct vm_map_entry	*uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
112 			    struct vm_map_entry*, struct uvm_map_deadq*);
113 struct vm_map_entry	*uvm_mapent_tryjoin(struct vm_map*,
114 			    struct vm_map_entry*, struct uvm_map_deadq*);
115 struct vm_map_entry	*uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
116 			    struct vm_map_entry*, vaddr_t, vsize_t, int,
117 			    struct uvm_map_deadq*, struct vm_map_entry*);
118 struct vm_map_entry	*uvm_mapent_alloc(struct vm_map*, int);
119 void			 uvm_mapent_free(struct vm_map_entry*);
120 void			 uvm_unmap_kill_entry(struct vm_map*,
121 			    struct vm_map_entry*);
122 void			 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
123 void			 uvm_mapent_mkfree(struct vm_map*,
124 			    struct vm_map_entry*, struct vm_map_entry**,
125 			    struct uvm_map_deadq*, boolean_t);
126 void			 uvm_map_pageable_pgon(struct vm_map*,
127 			    struct vm_map_entry*, struct vm_map_entry*,
128 			    vaddr_t, vaddr_t);
129 int			 uvm_map_pageable_wire(struct vm_map*,
130 			    struct vm_map_entry*, struct vm_map_entry*,
131 			    vaddr_t, vaddr_t, int);
132 void			 uvm_map_setup_entries(struct vm_map*);
133 void			 uvm_map_setup_md(struct vm_map*);
134 void			 uvm_map_teardown(struct vm_map*);
135 void			 uvm_map_vmspace_update(struct vm_map*,
136 			    struct uvm_map_deadq*, int);
137 void			 uvm_map_kmem_grow(struct vm_map*,
138 			    struct uvm_map_deadq*, vsize_t, int);
139 void			 uvm_map_freelist_update_clear(struct vm_map*,
140 			    struct uvm_map_deadq*);
141 void			 uvm_map_freelist_update_refill(struct vm_map *, int);
142 void			 uvm_map_freelist_update(struct vm_map*,
143 			    struct uvm_map_deadq*, vaddr_t, vaddr_t,
144 			    vaddr_t, vaddr_t, int);
145 struct vm_map_entry	*uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
146 			    vaddr_t, vaddr_t, int);
147 int			 uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int,
148 			    struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t,
149 			    int);
150 int			 uvm_map_findspace(struct vm_map*,
151 			    struct vm_map_entry**, struct vm_map_entry**,
152 			    vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
153 			    vaddr_t);
154 vsize_t			 uvm_map_addr_augment_get(struct vm_map_entry*);
155 void			 uvm_map_addr_augment(struct vm_map_entry*);
156 
157 /*
158  * Tree management functions.
159  */
160 
161 static __inline void	 uvm_mapent_copy(struct vm_map_entry*,
162 			    struct vm_map_entry*);
163 static int		 uvm_mapentry_addrcmp(struct vm_map_entry*,
164 			    struct vm_map_entry*);
165 void			 uvm_mapent_free_insert(struct vm_map*,
166 			    struct uvm_addr_state*, struct vm_map_entry*);
167 void			 uvm_mapent_free_remove(struct vm_map*,
168 			    struct uvm_addr_state*, struct vm_map_entry*);
169 void			 uvm_mapent_addr_insert(struct vm_map*,
170 			    struct vm_map_entry*);
171 void			 uvm_mapent_addr_remove(struct vm_map*,
172 			    struct vm_map_entry*);
173 void			 uvm_map_splitentry(struct vm_map*,
174 			    struct vm_map_entry*, struct vm_map_entry*,
175 			    vaddr_t);
176 vsize_t			 uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
177 int			 uvm_mapent_bias(struct vm_map*, struct vm_map_entry*);
178 
179 /*
180  * uvm_vmspace_fork helper functions.
181  */
182 struct vm_map_entry	*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
183 			    vsize_t, vm_prot_t, vm_prot_t,
184 			    struct vm_map_entry*, struct uvm_map_deadq*, int,
185 			    int);
186 struct vm_map_entry	*uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
187 			    vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
188 			    struct vm_map_entry*, struct uvm_map_deadq*);
189 struct vm_map_entry	*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
190 			    struct vm_map*, struct vm_map_entry*,
191 			    struct uvm_map_deadq*);
192 struct vm_map_entry	*uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
193 			    struct vm_map*, struct vm_map_entry*,
194 			    struct uvm_map_deadq*);
195 struct vm_map_entry	*uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
196 			    struct vm_map*, struct vm_map_entry*,
197 			    struct uvm_map_deadq*);
198 
199 /*
200  * Tree validation.
201  */
202 #ifdef VMMAP_DEBUG
203 void			 uvm_tree_assert(struct vm_map*, int, char*,
204 			    char*, int);
205 #define UVM_ASSERT(map, cond, file, line)				\
206 	uvm_tree_assert((map), (cond), #cond, (file), (line))
207 void			 uvm_tree_sanity(struct vm_map*, char*, int);
208 void			 uvm_tree_size_chk(struct vm_map*, char*, int);
209 void			 vmspace_validate(struct vm_map*);
210 #else
211 #define uvm_tree_sanity(_map, _file, _line)		do {} while (0)
212 #define uvm_tree_size_chk(_map, _file, _line)		do {} while (0)
213 #define vmspace_validate(_map)				do {} while (0)
214 #endif
215 
216 /*
217  * All architectures will have pmap_prefer (no-op defaults are provided below).
218  */
219 #ifndef PMAP_PREFER
220 #define PMAP_PREFER_ALIGN()	(vaddr_t)PAGE_SIZE
221 #define PMAP_PREFER_OFFSET(off)	0
222 #define PMAP_PREFER(addr, off)	(addr)
223 #endif
224 
225 
226 /*
227  * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
228  * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
229  *
230  * We attempt to grow by VM_MAP_KSIZE_ALLOCMUL times the allocation size
231  * each time.
232  */
233 #define VM_MAP_KSIZE_INIT	(512 * (vaddr_t)PAGE_SIZE)
234 #define VM_MAP_KSIZE_DELTA	(256 * (vaddr_t)PAGE_SIZE)
235 #define VM_MAP_KSIZE_ALLOCMUL	4
236 /*
237  * When selecting a random free-space block, look at most FSPACE_DELTA blocks
238  * ahead.
239  */
240 #define FSPACE_DELTA		8
241 /*
242  * Put allocations adjacent to previous allocations when the free-space tree
243  * is larger than FSPACE_COMPACT entries.
244  *
245  * Alignment and PMAP_PREFER may still cause the entry to not be fully
246  * adjacent. Note that this strategy reduces memory fragmentation (by leaving
247  * a large space before or after the allocation).
248  */
249 #define FSPACE_COMPACT		128
250 /*
251  * Make the address selection skip at most this many bytes from the start of
252  * the free space in which the allocation takes place.
253  *
254  * The main idea behind a randomized address space is that an attacker cannot
255  * know where to target his attack. Therefore, the location of objects must be
256  * as random as possible. However, the goal is not to create the most sparse
257  * map that is possible.
258  * FSPACE_MAXOFF pushes the considered range in bytes down to less insane
259  * sizes, thereby reducing the sparseness. The biggest randomization comes
260  * from fragmentation, i.e. FSPACE_COMPACT.
261  */
262 #define FSPACE_MAXOFF		((vaddr_t)32 * 1024 * 1024)
263 /*
264  * Allow for small gaps in the overflow areas.
265  * Gap size is in bytes and does not have to be a multiple of page-size.
266  */
267 #define FSPACE_BIASGAP		((vaddr_t)32 * 1024)
268 
269 /* auto-allocate address lower bound */
270 #define VMMAP_MIN_ADDR		PAGE_SIZE
271 
272 
273 #ifdef DEADBEEF0
274 #define UVMMAP_DEADBEEF		((void*)DEADBEEF0)
275 #else
276 #define UVMMAP_DEADBEEF		((void*)0xdeadd0d0)
277 #endif
278 
279 #ifdef DEBUG
280 int uvm_map_printlocks = 0;
281 
282 #define LPRINTF(_args)							\
283 	do {								\
284 		if (uvm_map_printlocks)					\
285 			printf _args;					\
286 	} while (0)
287 #else
288 #define LPRINTF(_args)	do {} while (0)
289 #endif
290 
291 static struct mutex uvm_kmapent_mtx;
292 static struct timeval uvm_kmapent_last_warn_time;
293 static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
294 
295 const char vmmapbsy[] = "vmmapbsy";
296 
297 /*
298  * pool for vmspace structures.
299  */
300 struct pool uvm_vmspace_pool;
301 
302 /*
303  * pool for dynamically-allocated map entries.
304  */
305 struct pool uvm_map_entry_pool;
306 struct pool uvm_map_entry_kmem_pool;
307 
308 /*
309  * This global represents the end of the kernel virtual address
310  * space. If we want to exceed this, we must grow the kernel
311  * virtual address space dynamically.
312  *
313  * Note, this variable is locked by kernel_map's lock.
314  */
315 vaddr_t uvm_maxkaddr;
316 
317 /*
318  * Locking predicate.
319  */
320 #define UVM_MAP_REQ_WRITE(_map)						\
321 	do {								\
322 		if ((_map)->ref_count > 0) {				\
323 			if (((_map)->flags & VM_MAP_INTRSAFE) == 0)	\
324 				rw_assert_wrlock(&(_map)->lock);	\
325 			else						\
326 				MUTEX_ASSERT_LOCKED(&(_map)->mtx);	\
327 		}							\
328 	} while (0)
329 
330 /*
331  * Tree describing entries by address.
332  *
333  * Addresses are unique.
334  * Entries with start == end may only exist if they are the first entry
335  * (sorted by address) within a free-memory tree.
336  */
337 
338 static __inline int
339 uvm_mapentry_addrcmp(struct vm_map_entry *e1, struct vm_map_entry *e2)
340 {
341 	return e1->start < e2->start ? -1 : e1->start > e2->start;
342 }
343 
344 /*
345  * Copy mapentry (only the fields between the start_copy and stop_copy markers).
346  */
347 static __inline void
348 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
349 {
350 	caddr_t csrc, cdst;
351 	size_t sz;
352 
353 	csrc = (caddr_t)src;
354 	cdst = (caddr_t)dst;
355 	csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
356 	cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
357 
358 	sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
359 	    offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
360 	memcpy(cdst, csrc, sz);
361 }
362 
363 /*
364  * Handle free-list insertion.
365  */
366 void
367 uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
368     struct vm_map_entry *entry)
369 {
370 	const struct uvm_addr_functions *fun;
371 #ifdef VMMAP_DEBUG
372 	vaddr_t min, max, bound;
373 #endif
374 
375 #ifdef VMMAP_DEBUG
376 	/*
377 	 * Boundary check.
378 	 * Boundaries are folded if they go on the same free list.
379 	 */
380 	min = VMMAP_FREE_START(entry);
381 	max = VMMAP_FREE_END(entry);
382 
383 	while (min < max) {
384 		bound = uvm_map_boundary(map, min, max);
385 		KASSERT(uvm_map_uaddr(map, min) == uaddr);
386 		min = bound;
387 	}
388 #endif
389 	KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
390 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
391 
392 	UVM_MAP_REQ_WRITE(map);
393 
394 	/* Actual insert: forward to uaddr pointer. */
395 	if (uaddr != NULL) {
396 		fun = uaddr->uaddr_functions;
397 		KDASSERT(fun != NULL);
398 		if (fun->uaddr_free_insert != NULL)
399 			(*fun->uaddr_free_insert)(map, uaddr, entry);
400 		entry->etype |= UVM_ET_FREEMAPPED;
401 	}
402 
403 	/* Update fspace augmentation. */
404 	uvm_map_addr_augment(entry);
405 }
406 
407 /*
408  * Handle free-list removal.
409  */
410 void
411 uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
412     struct vm_map_entry *entry)
413 {
414 	const struct uvm_addr_functions *fun;
415 
416 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
417 	KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
418 	UVM_MAP_REQ_WRITE(map);
419 
420 	if (uaddr != NULL) {
421 		fun = uaddr->uaddr_functions;
422 		if (fun->uaddr_free_remove != NULL)
423 			(*fun->uaddr_free_remove)(map, uaddr, entry);
424 		entry->etype &= ~UVM_ET_FREEMAPPED;
425 	}
426 }
427 
428 /*
429  * Handle address tree insertion.
430  */
431 void
432 uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
433 {
434 	struct vm_map_entry *res;
435 
436 	if (RB_LEFT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF ||
437 	    RB_RIGHT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF ||
438 	    RB_PARENT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF)
439 		panic("uvm_mapent_addr_insert: entry still in addr list");
440 	KDASSERT(entry->start <= entry->end);
441 	KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
442 	    (entry->end & (vaddr_t)PAGE_MASK) == 0);
443 
444 	UVM_MAP_REQ_WRITE(map);
445 	res = RB_INSERT(uvm_map_addr, &map->addr, entry);
446 	if (res != NULL) {
447 		panic("uvm_mapent_addr_insert: map %p entry %p "
448 		    "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
449 		    "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
450 		    map, entry,
451 		    entry->start, entry->end, entry->guard, entry->fspace,
452 		    res, res->start, res->end, res->guard, res->fspace);
453 	}
454 }
455 
456 /*
457  * Handle address tree removal.
458  */
459 void
460 uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
461 {
462 	struct vm_map_entry *res;
463 
464 	UVM_MAP_REQ_WRITE(map);
465 	res = RB_REMOVE(uvm_map_addr, &map->addr, entry);
466 	if (res != entry)
467 		panic("uvm_mapent_addr_remove");
468 	RB_LEFT(entry, daddrs.addr_entry) = RB_RIGHT(entry, daddrs.addr_entry) =
469 	    RB_PARENT(entry, daddrs.addr_entry) = UVMMAP_DEADBEEF;
470 }
471 
472 /*
473  * uvm_map_reference: add reference to a map
474  *
475  * XXX check map reference counter lock
476  */
477 #define uvm_map_reference(_map)						\
478 	do {								\
479 		(_map)->ref_count++;				\
480 	} while (0)
481 
482 /*
483  * Calculate the vm_dused delta: pages in [min, max) outside the stack range.
484  */
485 vsize_t
486 uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
487 {
488 	struct vmspace *vm;
489 	vsize_t sz;
490 	vaddr_t lmax;
491 	vaddr_t stack_begin, stack_end; /* Position of stack. */
492 
493 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
494 	vm = (struct vmspace *)map;
495 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
496 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
497 
498 	sz = 0;
499 	while (min != max) {
500 		lmax = max;
501 		if (min < stack_begin && lmax > stack_begin)
502 			lmax = stack_begin;
503 		else if (min < stack_end && lmax > stack_end)
504 			lmax = stack_end;
505 
506 		if (min >= stack_begin && min < stack_end) {
507 			/* nothing */
508 		} else
509 			sz += lmax - min;
510 		min = lmax;
511 	}
512 
513 	return sz >> PAGE_SHIFT;
514 }
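
/*
 * Worked example (illustrative, assuming 4KB pages): with min = 0x1000,
 * max = 0x9000 and the stack occupying [0x4000, 0x6000), the ranges
 * [0x1000, 0x4000) and [0x6000, 0x9000) are charged, i.e. 0x6000 bytes
 * or 6 pages.
 */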
515 
516 /*
517  * Find the entry describing the given address.
518  */
519 struct vm_map_entry*
520 uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
521 {
522 	struct vm_map_entry *iter;
523 
524 	iter = RB_ROOT(atree);
525 	while (iter != NULL) {
526 		if (iter->start > addr)
527 			iter = RB_LEFT(iter, daddrs.addr_entry);
528 		else if (VMMAP_FREE_END(iter) <= addr)
529 			iter = RB_RIGHT(iter, daddrs.addr_entry);
530 		else
531 			return iter;
532 	}
533 	return NULL;
534 }
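
/*
 * Example (illustrative): the tree walk above treats an entry as covering
 * [start, VMMAP_FREE_END(entry)), so an entry with start = 0x1000,
 * end = 0x3000 and VMMAP_FREE_END() = 0x6000 is returned for any addr in
 * [0x1000, 0x6000).  Callers such as uvm_map_lookup_entry() still check
 * start <= addr < end to decide whether addr is actually mapped.
 */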
535 
536 /*
537  * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry)
538  *
539  * Push dead entries into a linked list.
540  * Since the linked list abuses the address tree for storage, the entry
541  * may not be linked in a map.
542  *
543  * The deadq must be initialized with TAILQ_INIT() before the first call
544  * to this macro.  uvm_unmap_detach(deadq, 0) will release the dead entries.
545  */
546 static __inline void
547 dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
548 {
549 	TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
550 }
551 #define DEAD_ENTRY_PUSH(_headptr, _entry)				\
552 	dead_entry_push((_headptr), (_entry))
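
/*
 * Typical use of the dead-entry queue (sketch of the pattern used by the
 * callers in this file):
 *
 *	struct uvm_map_deadq dead;
 *
 *	TAILQ_INIT(&dead);
 *	vm_map_lock(map);
 *	...	unlink entries, DEAD_ENTRY_PUSH(&dead, entry), ...
 *	vm_map_unlock(map);
 *	uvm_unmap_detach(&dead, 0);
 */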
553 
554 /*
555  * Helper function for uvm_map_findspace_tree.
556  *
557  * Given allocation constraints and pmap constraints, finds the
558  * lowest and highest address in a range that can be used for the
559  * allocation.
560  *
561  * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs.
562  *
563  *
564  * Big chunk of math with a seasoning of dragons.
565  */
566 int
567 uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg,
568     struct vm_map_entry *sel, vaddr_t align,
569     vaddr_t pmap_align, vaddr_t pmap_off, int bias)
570 {
571 	vaddr_t sel_min, sel_max;
572 #ifdef PMAP_PREFER
573 	vaddr_t pmap_min, pmap_max;
574 #endif /* PMAP_PREFER */
575 #ifdef DIAGNOSTIC
576 	int bad;
577 #endif /* DIAGNOSTIC */
578 
579 	sel_min = VMMAP_FREE_START(sel);
580 	sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0);
581 
582 #ifdef PMAP_PREFER
583 
584 	/*
585 	 * There are two special cases in which we can satisfy both the align
586 	 * requirement and the pmap_prefer requirement.
587 	 * - when pmap_off == 0, we always select the larger of the two
588 	 * - when pmap_off % align == 0 and pmap_align > align, we simply
589 	 *   satisfy the pmap_align requirement and automatically
590 	 *   satisfy the align requirement.
591 	 */
592 	if (align > PAGE_SIZE &&
593 	    !(pmap_align > align && (pmap_off & (align - 1)) == 0)) {
594 		/*
595 		 * Simple case: only use align.
596 		 */
597 		sel_min = roundup(sel_min, align);
598 		sel_max &= ~(align - 1);
599 
600 		if (sel_min > sel_max)
601 			return ENOMEM;
602 
603 		/* Correct for bias. */
604 		if (sel_max - sel_min > FSPACE_BIASGAP) {
605 			if (bias > 0) {
606 				sel_min = sel_max - FSPACE_BIASGAP;
607 				sel_min = roundup(sel_min, align);
608 			} else if (bias < 0) {
609 				sel_max = sel_min + FSPACE_BIASGAP;
610 				sel_max &= ~(align - 1);
611 			}
612 		}
613 	} else if (pmap_align != 0) {
614 		/*
615 		 * Special case: satisfy both pmap_prefer and
616 		 * align argument.
617 		 */
618 		pmap_max = sel_max & ~(pmap_align - 1);
619 		pmap_min = sel_min;
620 		if (pmap_max < sel_min)
621 			return ENOMEM;
622 
623 		/* Adjust pmap_min for BIASGAP for top-addr bias. */
624 		if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP)
625 			pmap_min = pmap_max - FSPACE_BIASGAP;
626 		/* Align pmap_min. */
627 		pmap_min &= ~(pmap_align - 1);
628 		if (pmap_min < sel_min)
629 			pmap_min += pmap_align;
630 		if (pmap_min > pmap_max)
631 			return ENOMEM;
632 
633 		/* Adjust pmap_max for BIASGAP for bottom-addr bias. */
634 		if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) {
635 			pmap_max = (pmap_min + FSPACE_BIASGAP) &
636 			    ~(pmap_align - 1);
637 		}
638 		if (pmap_min > pmap_max)
639 			return ENOMEM;
640 
641 		/* Apply pmap prefer offset. */
642 		pmap_max |= pmap_off;
643 		if (pmap_max > sel_max)
644 			pmap_max -= pmap_align;
645 		pmap_min |= pmap_off;
646 		if (pmap_min < sel_min)
647 			pmap_min += pmap_align;
648 
649 		/*
650 		 * Fixup: it's possible that pmap_min and pmap_max
651 		 * cross each other. In this case, try to find one
652 		 * address that is allowed.
653 		 * (This usually happens in biased case.)
654 		 */
655 		if (pmap_min > pmap_max) {
656 			if (pmap_min < sel_max)
657 				pmap_max = pmap_min;
658 			else if (pmap_max > sel_min)
659 				pmap_min = pmap_max;
660 			else
661 				return ENOMEM;
662 		}
663 
664 		/* Internal validation. */
665 		KDASSERT(pmap_min <= pmap_max);
666 
667 		sel_min = pmap_min;
668 		sel_max = pmap_max;
669 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
670 		sel_min = sel_max - FSPACE_BIASGAP;
671 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
672 		sel_max = sel_min + FSPACE_BIASGAP;
673 
674 #else
675 
676 	if (align > PAGE_SIZE) {
677 		sel_min = roundup(sel_min, align);
678 		sel_max &= ~(align - 1);
679 		if (sel_min > sel_max)
680 			return ENOMEM;
681 
682 		if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) {
683 			if (bias > 0) {
684 				sel_min = roundup(sel_max - FSPACE_BIASGAP,
685 				    align);
686 			} else {
687 				sel_max = (sel_min + FSPACE_BIASGAP) &
688 				    ~(align - 1);
689 			}
690 		}
691 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
692 		sel_min = sel_max - FSPACE_BIASGAP;
693 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
694 		sel_max = sel_min + FSPACE_BIASGAP;
695 
696 #endif
697 
698 	if (sel_min > sel_max)
699 		return ENOMEM;
700 
701 #ifdef DIAGNOSTIC
702 	bad = 0;
703 	/* Lower boundary check. */
704 	if (sel_min < VMMAP_FREE_START(sel)) {
705 		printf("sel_min: 0x%lx, but should be at least 0x%lx\n",
706 		    sel_min, VMMAP_FREE_START(sel));
707 		bad++;
708 	}
709 	/* Upper boundary check. */
710 	if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) {
711 		printf("sel_max: 0x%lx, but should be at most 0x%lx\n",
712 		    sel_max,
713 		    VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0));
714 		bad++;
715 	}
716 	/* Lower boundary alignment. */
717 	if (align != 0 && (sel_min & (align - 1)) != 0) {
718 		printf("sel_min: 0x%lx, not aligned to 0x%lx\n",
719 		    sel_min, align);
720 		bad++;
721 	}
722 	/* Upper boundary alignment. */
723 	if (align != 0 && (sel_max & (align - 1)) != 0) {
724 		printf("sel_max: 0x%lx, not aligned to 0x%lx\n",
725 		    sel_max, align);
726 		bad++;
727 	}
728 	/* Lower boundary PMAP_PREFER check. */
729 	if (pmap_align != 0 && align == 0 &&
730 	    (sel_min & (pmap_align - 1)) != pmap_off) {
731 		printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
732 		    sel_min, sel_min & (pmap_align - 1), pmap_off);
733 		bad++;
734 	}
735 	/* Upper boundary PMAP_PREFER check. */
736 	if (pmap_align != 0 && align == 0 &&
737 	    (sel_max & (pmap_align - 1)) != pmap_off) {
738 		printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
739 		    sel_max, sel_max & (pmap_align - 1), pmap_off);
740 		bad++;
741 	}
742 
743 	if (bad) {
744 		panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, "
745 		    "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, "
746 		    "bias = %d, "
747 		    "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)",
748 		    sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off,
749 		    bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel));
750 	}
751 #endif /* DIAGNOSTIC */
752 
753 	*min = sel_min;
754 	*max = sel_max;
755 	return 0;
756 }
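
/*
 * Worked example (illustrative; align-only path, bias == 0, no guard page):
 * for a free range [0x1000, 0x20000), sz = 0x2000 and align = 0x4000,
 *
 *	sel_min = roundup(0x1000, 0x4000)            = 0x4000
 *	sel_max = (0x20000 - 0x2000) & ~(0x4000 - 1) = 0x1c000
 *
 * so the caller may pick any align-aligned address in [0x4000, 0x1c000].
 */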
757 
758 /*
759  * Test if memory starting at addr with sz bytes is free (returns nonzero
760  * if so, 0 otherwise).
761  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
762  * the space.
763  * If called with prefilled *start_ptr and *end_ptr, they must be correct.
764  */
765 int
766 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
767     struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
768     vaddr_t addr, vsize_t sz)
769 {
770 	struct uvm_addr_state *free;
771 	struct uvm_map_addr *atree;
772 	struct vm_map_entry *i, *i_end;
773 
774 	/*
775 	 * Kernel memory above uvm_maxkaddr is considered unavailable.
776 	 */
777 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
778 		if (addr + sz > uvm_maxkaddr)
779 			return 0;
780 	}
781 
782 	atree = &map->addr;
783 
784 	/*
785 	 * Fill in *start_ptr and *end_ptr, so they point at the entries
786 	 * containing the first and last address of the range.
787 	 * Note that if they are not NULL, we don't perform the lookup.
788 	 */
789 	KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
790 	if (*start_ptr == NULL) {
791 		*start_ptr = uvm_map_entrybyaddr(atree, addr);
792 		if (*start_ptr == NULL)
793 			return 0;
794 	} else
795 		KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
796 	if (*end_ptr == NULL) {
797 		if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
798 			*end_ptr = *start_ptr;
799 		else {
800 			*end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
801 			if (*end_ptr == NULL)
802 				return 0;
803 		}
804 	} else
805 		KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
806 
807 	/* Validation. */
808 	KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
809 	KDASSERT((*start_ptr)->start <= addr &&
810 	    VMMAP_FREE_END(*start_ptr) > addr &&
811 	    (*end_ptr)->start < addr + sz &&
812 	    VMMAP_FREE_END(*end_ptr) >= addr + sz);
813 
814 	/*
815 	 * Check that none of the entries intersects with <addr, addr+sz>.
816 	 * Also, if the entry belongs to uaddr_exe or uaddr_brk_stack, it is
817 	 * considered unavailable unless called by those allocators.
818 	 */
819 	i = *start_ptr;
820 	i_end = RB_NEXT(uvm_map_addr, atree, *end_ptr);
821 	for (; i != i_end;
822 	    i = RB_NEXT(uvm_map_addr, atree, i)) {
823 		if (i->start != i->end && i->end > addr)
824 			return 0;
825 
826 		/*
827 		 * uaddr_exe and uaddr_brk_stack may only be used
828 		 * by these allocators and the NULL uaddr (i.e. no
829 		 * uaddr).
830 		 * Reject if this requirement is not met.
831 		 */
832 		if (uaddr != NULL) {
833 			free = uvm_map_uaddr_e(map, i);
834 
835 			if (uaddr != free && free != NULL &&
836 			    (free == map->uaddr_exe ||
837 			     free == map->uaddr_brk_stack))
838 				return 0;
839 		}
840 	}
841 
842 	return -1;
843 }
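
/*
 * Typical use (sketch, mirroring the callers below): start with both
 * entry pointers NULL so uvm_map_isavail() performs the lookups itself.
 *
 *	struct vm_map_entry *first, *last;
 *
 *	first = last = NULL;
 *	if (!uvm_map_isavail(map, NULL, &first, &last, addr, sz))
 *		return ENOMEM;
 */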
844 
845 /*
846  * Invoke each address selector until an address is found.
847  * Will not invoke uaddr_exe.
848  */
849 int
850 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
851     struct vm_map_entry**last, vaddr_t *addr, vsize_t sz,
852     vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
853 {
854 	struct uvm_addr_state *uaddr;
855 	int i;
856 
857 	/*
858 	 * Allocation for sz bytes at any address,
859 	 * using the addr selectors in order.
860 	 */
861 	for (i = 0; i < nitems(map->uaddr_any); i++) {
862 		uaddr = map->uaddr_any[i];
863 
864 		if (uvm_addr_invoke(map, uaddr, first, last,
865 		    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
866 			return 0;
867 	}
868 
869 	/* Fall back to brk() and stack() address selectors. */
870 	uaddr = map->uaddr_brk_stack;
871 	if (uvm_addr_invoke(map, uaddr, first, last,
872 	    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
873 		return 0;
874 
875 	return ENOMEM;
876 }
877 
878 /* Calculate entry augmentation value. */
879 vsize_t
880 uvm_map_addr_augment_get(struct vm_map_entry *entry)
881 {
882 	vsize_t			 augment;
883 	struct vm_map_entry	*left, *right;
884 
885 	augment = entry->fspace;
886 	if ((left = RB_LEFT(entry, daddrs.addr_entry)) != NULL)
887 		augment = MAX(augment, left->fspace_augment);
888 	if ((right = RB_RIGHT(entry, daddrs.addr_entry)) != NULL)
889 		augment = MAX(augment, right->fspace_augment);
890 	return augment;
891 }
892 
893 /*
894  * Update augmentation data (max fspace in subtree) in entry and its parents.
895  */
896 void
897 uvm_map_addr_augment(struct vm_map_entry *entry)
898 {
899 	vsize_t			 augment;
900 
901 	while (entry != NULL) {
902 		/* Calculate value for augmentation. */
903 		augment = uvm_map_addr_augment_get(entry);
904 
905 		/*
906 		 * Walk up the tree, updating each entry's augmentation.
907 		 * Once we find an entry that already has the correct value,
908 		 * stop, since it means all its parents will use the correct
909 		 * value too.
910 		 */
911 		if (entry->fspace_augment == augment)
912 			return;
913 		entry->fspace_augment = augment;
914 		entry = RB_PARENT(entry, daddrs.addr_entry);
915 	}
916 }
917 
918 /*
919  * uvm_mapanon: establish a valid anonymous mapping in map
920  *
921  * => *addr and sz must be a multiple of PAGE_SIZE.
922  * => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
923  * => map must be unlocked.
924  *
925  * => align: align vaddr, must be a power-of-2.
926  *    Align is only a hint and will be ignored if the alignment fails.
927  */
928 int
929 uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
930     vsize_t align, unsigned int flags)
931 {
932 	struct vm_map_entry	*first, *last, *entry, *new;
933 	struct uvm_map_deadq	 dead;
934 	vm_prot_t		 prot;
935 	vm_prot_t		 maxprot;
936 	vm_inherit_t		 inherit;
937 	int			 advice;
938 	int			 error;
939 	vaddr_t			 pmap_align, pmap_offset;
940 	vaddr_t			 hint;
941 
942 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
943 	KASSERT(map != kernel_map);
944 	KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
945 
946 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
947 	splassert(IPL_NONE);
948 
949 	/*
950 	 * We use pmap_align and pmap_offset as alignment and offset variables.
951 	 *
952 	 * Because the align parameter takes precedence over pmap prefer,
953 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
954 	 * if pmap_prefer cannot satisfy the requested alignment.
955 	 */
956 	pmap_align = MAX(align, PAGE_SIZE);
957 	pmap_offset = 0;
958 
959 	/* Decode parameters. */
960 	prot = UVM_PROTECTION(flags);
961 	maxprot = UVM_MAXPROTECTION(flags);
962 	advice = UVM_ADVICE(flags);
963 	inherit = UVM_INHERIT(flags);
964 	error = 0;
965 	hint = trunc_page(*addr);
966 	TAILQ_INIT(&dead);
967 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
968 	KASSERT((align & (align - 1)) == 0);
969 
970 	/* Check protection. */
971 	if ((prot & maxprot) != prot)
972 		return EACCES;
973 
974 	/*
975 	 * Before grabbing the lock, allocate a map entry for later
976 	 * use to ensure we don't wait for memory while holding the
977 	 * vm_map_lock.
978 	 */
979 	new = uvm_mapent_alloc(map, flags);
980 	if (new == NULL)
981 		return(ENOMEM);
982 
983 	if (flags & UVM_FLAG_TRYLOCK) {
984 		if (vm_map_lock_try(map) == FALSE) {
985 			error = EFAULT;
986 			goto out;
987 		}
988 	} else
989 		vm_map_lock(map);
990 
991 	first = last = NULL;
992 	if (flags & UVM_FLAG_FIXED) {
993 		/*
994 		 * Fixed location.
995 		 *
996 		 * Note: we ignore align, pmap_prefer.
997 		 * Fill in first, last and *addr.
998 		 */
999 		KASSERT((*addr & PAGE_MASK) == 0);
1000 
1001 		/* Check that the space is available. */
1002 		if (flags & UVM_FLAG_UNMAP)
1003 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1004 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1005 			error = ENOMEM;
1006 			goto unlock;
1007 		}
1008 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1009 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1010 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1011 		/*
1012 		 * Address used as hint.
1013 		 *
1014 		 * Note: we enforce the alignment restriction,
1015 		 * but ignore pmap_prefer.
1016 		 */
1017 	} else if ((maxprot & PROT_EXEC) != 0 &&
1018 	    map->uaddr_exe != NULL) {
1019 		/* Run selection algorithm for executables. */
1020 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1021 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1022 
1023 		if (error != 0)
1024 			goto unlock;
1025 	} else {
1026 		/* Update freelists from vmspace. */
1027 		uvm_map_vmspace_update(map, &dead, flags);
1028 
1029 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1030 		    pmap_align, pmap_offset, prot, hint);
1031 
1032 		if (error != 0)
1033 			goto unlock;
1034 	}
1035 
1036 	/* If we only want a query, return now. */
1037 	if (flags & UVM_FLAG_QUERY) {
1038 		error = 0;
1039 		goto unlock;
1040 	}
1041 
1042 	/*
1043 	 * Create new entry.
1044 	 * first and last may be invalidated after this call.
1045 	 */
1046 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1047 	    new);
1048 	if (entry == NULL) {
1049 		error = ENOMEM;
1050 		goto unlock;
1051 	}
1052 	new = NULL;
1053 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1054 	entry->object.uvm_obj = NULL;
1055 	entry->offset = 0;
1056 	entry->protection = prot;
1057 	entry->max_protection = maxprot;
1058 	entry->inheritance = inherit;
1059 	entry->wired_count = 0;
1060 	entry->advice = advice;
1061 	if (flags & UVM_FLAG_NOFAULT)
1062 		entry->etype |= UVM_ET_NOFAULT;
1063 	if (flags & UVM_FLAG_COPYONW) {
1064 		entry->etype |= UVM_ET_COPYONWRITE;
1065 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1066 			entry->etype |= UVM_ET_NEEDSCOPY;
1067 	}
1068 	if (flags & UVM_FLAG_OVERLAY) {
1069 		KERNEL_LOCK();
1070 		entry->aref.ar_pageoff = 0;
1071 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1072 		KERNEL_UNLOCK();
1073 	}
1074 
1075 	/* Update map and process statistics. */
1076 	map->size += sz;
1077 	((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz);
1078 
1079 unlock:
1080 	vm_map_unlock(map);
1081 
1082 	/*
1083 	 * Remove dead entries.
1084 	 *
1085 	 * Dead entries may be the result of merging.
1086 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1087 	 * destroy free-space entries.
1088 	 */
1089 	uvm_unmap_detach(&dead, 0);
1090 out:
1091 	if (new)
1092 		uvm_mapent_free(new);
1093 	return error;
1094 }
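
/*
 * Example (illustrative sketch; p is assumed to be a struct proc pointer and
 * error an int): an anonymous, copy-on-write userland mapping of sz bytes at
 * a kernel-chosen address, roughly as mmap(2) would request it:
 *
 *	vaddr_t addr = 0;
 *
 *	error = uvm_mapanon(&p->p_vmspace->vm_map, &addr, sz, 0,
 *	    UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
 *	    MAP_INHERIT_COPY, MADV_NORMAL, UVM_FLAG_COPYONW));
 */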
1095 
1096 /*
1097  * uvm_map: establish a valid mapping in map
1098  *
1099  * => *addr and sz must be a multiple of PAGE_SIZE.
1100  * => map must be unlocked.
1101  * => <uobj,uoffset> value meanings (4 cases):
1102  *	[1] <NULL,uoffset>		== uoffset is a hint for PMAP_PREFER
1103  *	[2] <NULL,UVM_UNKNOWN_OFFSET>	== don't PMAP_PREFER
1104  *	[3] <uobj,uoffset>		== normal mapping
1105  *	[4] <uobj,UVM_UNKNOWN_OFFSET>	== uvm_map finds offset based on VA
1106  *
1107  *   case [4] is for kernel mappings where we don't know the offset until
1108  *   we've found a virtual address.   note that kernel object offsets are
1109  *   always relative to vm_map_min(kernel_map).
1110  *
1111  * => align: align vaddr, must be a power-of-2.
1112  *    Align is only a hint and will be ignored if the alignment fails.
1113  */
1114 int
1115 uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
1116     struct uvm_object *uobj, voff_t uoffset,
1117     vsize_t align, unsigned int flags)
1118 {
1119 	struct vm_map_entry	*first, *last, *entry, *new;
1120 	struct uvm_map_deadq	 dead;
1121 	vm_prot_t		 prot;
1122 	vm_prot_t		 maxprot;
1123 	vm_inherit_t		 inherit;
1124 	int			 advice;
1125 	int			 error;
1126 	vaddr_t			 pmap_align, pmap_offset;
1127 	vaddr_t			 hint;
1128 
1129 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1130 		splassert(IPL_NONE);
1131 	else
1132 		splassert(IPL_VM);
1133 
1134 	/*
1135 	 * We use pmap_align and pmap_offset as alignment and offset variables.
1136 	 *
1137 	 * Because the align parameter takes precedence over pmap prefer,
1138 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
1139 	 * if pmap_prefer cannot satisfy the requested alignment.
1140 	 */
1141 	if (uoffset == UVM_UNKNOWN_OFFSET) {
1142 		pmap_align = MAX(align, PAGE_SIZE);
1143 		pmap_offset = 0;
1144 	} else {
1145 		pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
1146 		pmap_offset = PMAP_PREFER_OFFSET(uoffset);
1147 
1148 		if (align == 0 ||
1149 		    (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
1150 			/* pmap_offset satisfies align, no change. */
1151 		} else {
1152 			/* Align takes precedence over pmap prefer. */
1153 			pmap_align = align;
1154 			pmap_offset = 0;
1155 		}
1156 	}
1157 
1158 	/* Decode parameters. */
1159 	prot = UVM_PROTECTION(flags);
1160 	maxprot = UVM_MAXPROTECTION(flags);
1161 	advice = UVM_ADVICE(flags);
1162 	inherit = UVM_INHERIT(flags);
1163 	error = 0;
1164 	hint = trunc_page(*addr);
1165 	TAILQ_INIT(&dead);
1166 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
1167 	KASSERT((align & (align - 1)) == 0);
1168 
1169 	/* Holes are incompatible with other types of mappings. */
1170 	if (flags & UVM_FLAG_HOLE) {
1171 		KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
1172 		    (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
1173 	}
1174 
1175 	/* Unset hint for kernel_map non-fixed allocations. */
1176 	if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
1177 		hint = 0;
1178 
1179 	/* Check protection. */
1180 	if ((prot & maxprot) != prot)
1181 		return EACCES;
1182 
1183 	if (map == kernel_map &&
1184 	    (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
1185 		panic("uvm_map: kernel map W^X violation requested");
1186 
1187 	/*
1188 	 * Before grabbing the lock, allocate a map entry for later
1189 	 * use to ensure we don't wait for memory while holding the
1190 	 * vm_map_lock.
1191 	 */
1192 	new = uvm_mapent_alloc(map, flags);
1193 	if (new == NULL)
1194 		return(ENOMEM);
1195 
1196 	if (flags & UVM_FLAG_TRYLOCK) {
1197 		if (vm_map_lock_try(map) == FALSE) {
1198 			error = EFAULT;
1199 			goto out;
1200 		}
1201 	} else {
1202 		vm_map_lock(map);
1203 	}
1204 
1205 	first = last = NULL;
1206 	if (flags & UVM_FLAG_FIXED) {
1207 		/*
1208 		 * Fixed location.
1209 		 *
1210 		 * Note: we ignore align, pmap_prefer.
1211 		 * Fill in first, last and *addr.
1212 		 */
1213 		KASSERT((*addr & PAGE_MASK) == 0);
1214 
1215 		/*
1216 		 * Grow pmap to include allocated address.
1217 		 * If the growth fails, the allocation will fail too.
1218 		 */
1219 		if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
1220 		    uvm_maxkaddr < (*addr + sz)) {
1221 			uvm_map_kmem_grow(map, &dead,
1222 			    *addr + sz - uvm_maxkaddr, flags);
1223 		}
1224 
1225 		/* Check that the space is available. */
1226 		if (flags & UVM_FLAG_UNMAP)
1227 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1228 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1229 			error = ENOMEM;
1230 			goto unlock;
1231 		}
1232 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1233 	    (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE &&
1234 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1235 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1236 		/*
1237 		 * Address used as hint.
1238 		 *
1239 		 * Note: we enforce the alignment restriction,
1240 		 * but ignore pmap_prefer.
1241 		 */
1242 	} else if ((maxprot & PROT_EXEC) != 0 &&
1243 	    map->uaddr_exe != NULL) {
1244 		/* Run selection algorithm for executables. */
1245 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1246 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1247 
1248 		/* Grow kernel memory and try again. */
1249 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1250 			uvm_map_kmem_grow(map, &dead, sz, flags);
1251 
1252 			error = uvm_addr_invoke(map, map->uaddr_exe,
1253 			    &first, &last, addr, sz,
1254 			    pmap_align, pmap_offset, prot, hint);
1255 		}
1256 
1257 		if (error != 0)
1258 			goto unlock;
1259 	} else {
1260 		/* Update freelists from vmspace. */
1261 		if (map->flags & VM_MAP_ISVMSPACE)
1262 			uvm_map_vmspace_update(map, &dead, flags);
1263 
1264 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1265 		    pmap_align, pmap_offset, prot, hint);
1266 
1267 		/* Grow kernel memory and try again. */
1268 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1269 			uvm_map_kmem_grow(map, &dead, sz, flags);
1270 
1271 			error = uvm_map_findspace(map, &first, &last, addr, sz,
1272 			    pmap_align, pmap_offset, prot, hint);
1273 		}
1274 
1275 		if (error != 0)
1276 			goto unlock;
1277 	}
1278 
1279 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
1280 	    uvm_maxkaddr >= *addr + sz);
1281 
1282 	/* If we only want a query, return now. */
1283 	if (flags & UVM_FLAG_QUERY) {
1284 		error = 0;
1285 		goto unlock;
1286 	}
1287 
1288 	if (uobj == NULL)
1289 		uoffset = 0;
1290 	else if (uoffset == UVM_UNKNOWN_OFFSET) {
1291 		KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1292 		uoffset = *addr - vm_map_min(kernel_map);
1293 	}
1294 
1295 	/*
1296 	 * Create new entry.
1297 	 * first and last may be invalidated after this call.
1298 	 */
1299 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1300 	    new);
1301 	if (entry == NULL) {
1302 		error = ENOMEM;
1303 		goto unlock;
1304 	}
1305 	new = NULL;
1306 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1307 	entry->object.uvm_obj = uobj;
1308 	entry->offset = uoffset;
1309 	entry->protection = prot;
1310 	entry->max_protection = maxprot;
1311 	entry->inheritance = inherit;
1312 	entry->wired_count = 0;
1313 	entry->advice = advice;
1314 	if (uobj)
1315 		entry->etype |= UVM_ET_OBJ;
1316 	else if (flags & UVM_FLAG_HOLE)
1317 		entry->etype |= UVM_ET_HOLE;
1318 	if (flags & UVM_FLAG_NOFAULT)
1319 		entry->etype |= UVM_ET_NOFAULT;
1320 	if (flags & UVM_FLAG_COPYONW) {
1321 		entry->etype |= UVM_ET_COPYONWRITE;
1322 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1323 			entry->etype |= UVM_ET_NEEDSCOPY;
1324 	}
1325 	if (flags & UVM_FLAG_OVERLAY) {
1326 		entry->aref.ar_pageoff = 0;
1327 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1328 	}
1329 
1330 	/* Update map and process statistics. */
1331 	if (!(flags & UVM_FLAG_HOLE)) {
1332 		map->size += sz;
1333 		if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL) {
1334 			((struct vmspace *)map)->vm_dused +=
1335 			    uvmspace_dused(map, *addr, *addr + sz);
1336 		}
1337 	}
1338 
1339 	/*
1340 	 * Try to merge entry.
1341 	 *
1342 	 * Userland allocations are kept separated most of the time.
1343 	 * Forgo the effort of merging what most of the time can't be merged
1344 	 * and only try the merge if it concerns a kernel entry.
1345 	 */
1346 	if ((flags & UVM_FLAG_NOMERGE) == 0 &&
1347 	    (map->flags & VM_MAP_ISVMSPACE) == 0)
1348 		uvm_mapent_tryjoin(map, entry, &dead);
1349 
1350 unlock:
1351 	vm_map_unlock(map);
1352 
1353 	/*
1354 	 * Remove dead entries.
1355 	 *
1356 	 * Dead entries may be the result of merging.
1357 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1358 	 * destroy free-space entries.
1359 	 */
1360 	if (map->flags & VM_MAP_INTRSAFE)
1361 		uvm_unmap_detach_intrsafe(&dead);
1362 	else
1363 		uvm_unmap_detach(&dead, 0);
1364 out:
1365 	if (new)
1366 		uvm_mapent_free(new);
1367 	return error;
1368 }
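
/*
 * Example (illustrative sketch; error is an int): an anonymous kernel
 * mapping of sz bytes, case [2] above (no uobj, no PMAP_PREFER), writable
 * but not executable:
 *
 *	vaddr_t va = 0;
 *
 *	error = uvm_map(kernel_map, &va, sz, NULL, UVM_UNKNOWN_OFFSET, 0,
 *	    UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
 *	    MAP_INHERIT_NONE, MADV_RANDOM, 0));
 */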
1369 
1370 /*
1371  * True iff e1 and e2 can be joined together.
1372  */
1373 int
1374 uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
1375     struct vm_map_entry *e2)
1376 {
1377 	KDASSERT(e1 != NULL && e2 != NULL);
1378 
1379 	/* Must be the same entry type and not have free memory between. */
1380 	if (e1->etype != e2->etype || e1->end != e2->start)
1381 		return 0;
1382 
1383 	/* Submaps are never joined. */
1384 	if (UVM_ET_ISSUBMAP(e1))
1385 		return 0;
1386 
1387 	/* Never merge wired memory. */
1388 	if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
1389 		return 0;
1390 
1391 	/* Protection, inheritance and advice must be equal. */
1392 	if (e1->protection != e2->protection ||
1393 	    e1->max_protection != e2->max_protection ||
1394 	    e1->inheritance != e2->inheritance ||
1395 	    e1->advice != e2->advice)
1396 		return 0;
1397 
1398 	/* If uvm_object: object itself and offsets within object must match. */
1399 	if (UVM_ET_ISOBJ(e1)) {
1400 		if (e1->object.uvm_obj != e2->object.uvm_obj)
1401 			return 0;
1402 		if (e1->offset + (e1->end - e1->start) != e2->offset)
1403 			return 0;
1404 	}
1405 
1406 	/*
1407 	 * Cannot join shared amaps.
1408 	 * Note: no need to lock amap to look at refs, since we don't care
1409 	 * about its exact value.
1410 	 * If it is 1 (i.e. we have the only reference) it will stay there.
1411 	 */
1412 	if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
1413 		return 0;
1414 	if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
1415 		return 0;
1416 
1417 	/* Apparently, e1 and e2 match. */
1418 	return 1;
1419 }
1420 
1421 /*
1422  * Join support function.
1423  *
1424  * Returns the merged entry on success.
1425  * Returns NULL if the merge failed.
1426  */
1427 struct vm_map_entry*
1428 uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
1429     struct vm_map_entry *e2, struct uvm_map_deadq *dead)
1430 {
1431 	struct uvm_addr_state *free;
1432 
1433 	/*
1434 	 * Merging is not supported for map entries that have an
1435 	 * amap attached to e1. This should never happen anyway,
1436 	 * because only kernel entries are merged and these do not
1437 	 * contain amaps.
1438 	 * The amap of e2 contains no real information, so it can
1439 	 * be erased immediately.
1440 	 */
1441 	KASSERT(e1->aref.ar_amap == NULL);
1442 
1443 	/*
1444 	 * Don't drop obj reference:
1445 	 * uvm_unmap_detach will do this for us.
1446 	 */
1447 	free = uvm_map_uaddr_e(map, e1);
1448 	uvm_mapent_free_remove(map, free, e1);
1449 
1450 	free = uvm_map_uaddr_e(map, e2);
1451 	uvm_mapent_free_remove(map, free, e2);
1452 	uvm_mapent_addr_remove(map, e2);
1453 	e1->end = e2->end;
1454 	e1->guard = e2->guard;
1455 	e1->fspace = e2->fspace;
1456 	uvm_mapent_free_insert(map, free, e1);
1457 
1458 	DEAD_ENTRY_PUSH(dead, e2);
1459 	return e1;
1460 }
1461 
1462 /*
1463  * Attempt forward and backward joining of entry.
1464  *
1465  * Returns entry after joins.
1466  * We are guaranteed that the amap of entry is either non-existent or
1467  * has never been used.
1468  */
1469 struct vm_map_entry*
1470 uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
1471     struct uvm_map_deadq *dead)
1472 {
1473 	struct vm_map_entry *other;
1474 	struct vm_map_entry *merged;
1475 
1476 	/* Merge with previous entry. */
1477 	other = RB_PREV(uvm_map_addr, &map->addr, entry);
1478 	if (other && uvm_mapent_isjoinable(map, other, entry)) {
1479 		merged = uvm_mapent_merge(map, other, entry, dead);
1480 		if (merged)
1481 			entry = merged;
1482 	}
1483 
1484 	/*
1485 	 * Merge with next entry.
1486 	 *
1487 	 * Because amap can only extend forward and the next entry
1488 	 * probably contains sensible info, only perform forward merging
1489 	 * in the absence of an amap.
1490 	 */
1491 	other = RB_NEXT(uvm_map_addr, &map->addr, entry);
1492 	if (other && entry->aref.ar_amap == NULL &&
1493 	    other->aref.ar_amap == NULL &&
1494 	    uvm_mapent_isjoinable(map, entry, other)) {
1495 		merged = uvm_mapent_merge(map, entry, other, dead);
1496 		if (merged)
1497 			entry = merged;
1498 	}
1499 
1500 	return entry;
1501 }
1502 
1503 /*
1504  * Kill entries that are no longer in a map.
1505  */
1506 void
1507 uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
1508 {
1509 	struct vm_map_entry *entry;
1510 	int waitok = flags & UVM_PLA_WAITOK;
1511 
1512 	if (TAILQ_EMPTY(deadq))
1513 		return;
1514 
1515 	KERNEL_LOCK();
1516 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1517 		if (waitok)
1518 			uvm_pause();
1519 		/* Drop reference to amap, if we've got one. */
1520 		if (entry->aref.ar_amap)
1521 			amap_unref(entry->aref.ar_amap,
1522 			    entry->aref.ar_pageoff,
1523 			    atop(entry->end - entry->start),
1524 			    flags & AMAP_REFALL);
1525 
1526 		/* Drop reference to our backing object, if we've got one. */
1527 		if (UVM_ET_ISSUBMAP(entry)) {
1528 			/* ... unlikely to happen, but play it safe */
1529 			uvm_map_deallocate(entry->object.sub_map);
1530 		} else if (UVM_ET_ISOBJ(entry) &&
1531 		    entry->object.uvm_obj->pgops->pgo_detach) {
1532 			entry->object.uvm_obj->pgops->pgo_detach(
1533 			    entry->object.uvm_obj);
1534 		}
1535 
1536 		/* Step to next. */
1537 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1538 		uvm_mapent_free(entry);
1539 	}
1540 	KERNEL_UNLOCK();
1541 }
1542 
1543 void
1544 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
1545 {
1546 	struct vm_map_entry *entry;
1547 
1548 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1549 		KASSERT(entry->aref.ar_amap == NULL);
1550 		KASSERT(!UVM_ET_ISSUBMAP(entry));
1551 		KASSERT(!UVM_ET_ISOBJ(entry));
1552 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1553 		uvm_mapent_free(entry);
1554 	}
1555 }
1556 
1557 /*
1558  * Create and insert new entry.
1559  *
1560  * Returned entry contains new addresses and is inserted properly in the tree.
1561  * first and last are (probably) no longer valid.
1562  */
1563 struct vm_map_entry*
1564 uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
1565     struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
1566     struct uvm_map_deadq *dead, struct vm_map_entry *new)
1567 {
1568 	struct vm_map_entry *entry, *prev;
1569 	struct uvm_addr_state *free;
1570 	vaddr_t min, max;	/* free space boundaries for new entry */
1571 
1572 	KDASSERT(map != NULL);
1573 	KDASSERT(first != NULL);
1574 	KDASSERT(last != NULL);
1575 	KDASSERT(dead != NULL);
1576 	KDASSERT(sz > 0);
1577 	KDASSERT(addr + sz > addr);
1578 	KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
1579 	KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
1580 	KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
1581 	uvm_tree_sanity(map, __FILE__, __LINE__);
1582 
1583 	min = addr + sz;
1584 	max = VMMAP_FREE_END(last);
1585 
1586 	/* Initialize new entry. */
1587 	if (new == NULL)
1588 		entry = uvm_mapent_alloc(map, flags);
1589 	else
1590 		entry = new;
1591 	if (entry == NULL)
1592 		return NULL;
1593 	entry->offset = 0;
1594 	entry->etype = 0;
1595 	entry->wired_count = 0;
1596 	entry->aref.ar_pageoff = 0;
1597 	entry->aref.ar_amap = NULL;
1598 
1599 	entry->start = addr;
1600 	entry->end = min;
1601 	entry->guard = 0;
1602 	entry->fspace = 0;
1603 
1604 	/* Reset free space in first. */
1605 	free = uvm_map_uaddr_e(map, first);
1606 	uvm_mapent_free_remove(map, free, first);
1607 	first->guard = 0;
1608 	first->fspace = 0;
1609 
1610 	/*
1611 	 * Remove all entries that are fully replaced.
1612 	 * We are iterating using last in reverse order.
1613 	 */
1614 	for (; first != last; last = prev) {
1615 		prev = RB_PREV(uvm_map_addr, &map->addr, last);
1616 
1617 		KDASSERT(last->start == last->end);
1618 		free = uvm_map_uaddr_e(map, last);
1619 		uvm_mapent_free_remove(map, free, last);
1620 		uvm_mapent_addr_remove(map, last);
1621 		DEAD_ENTRY_PUSH(dead, last);
1622 	}
1623 	/* Remove first if it is entirely inside <addr, addr+sz>.  */
1624 	if (first->start == addr) {
1625 		uvm_mapent_addr_remove(map, first);
1626 		DEAD_ENTRY_PUSH(dead, first);
1627 	} else {
1628 		uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
1629 		    addr, flags);
1630 	}
1631 
1632 	/* Finally, link in entry. */
1633 	uvm_mapent_addr_insert(map, entry);
1634 	uvm_map_fix_space(map, entry, min, max, flags);
1635 
1636 	uvm_tree_sanity(map, __FILE__, __LINE__);
1637 	return entry;
1638 }
1639 
1640 
1641 /*
1642  * uvm_mapent_alloc: allocate a map entry
1643  */
1644 struct vm_map_entry *
1645 uvm_mapent_alloc(struct vm_map *map, int flags)
1646 {
1647 	struct vm_map_entry *me, *ne;
1648 	int pool_flags;
1649 	int i;
1650 
1651 	pool_flags = PR_WAITOK;
1652 	if (flags & UVM_FLAG_TRYLOCK)
1653 		pool_flags = PR_NOWAIT;
1654 
1655 	if (map->flags & VM_MAP_INTRSAFE || cold) {
1656 		mtx_enter(&uvm_kmapent_mtx);
1657 		me = uvm.kentry_free;
1658 		if (me == NULL) {
1659 			ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
1660 			    &kd_nowait);
1661 			if (ne == NULL)
1662 				panic("uvm_mapent_alloc: cannot allocate map "
1663 				    "entry");
1664 			for (i = 0;
1665 			    i < PAGE_SIZE / sizeof(struct vm_map_entry) - 1;
1666 			    i++)
1667 				RB_LEFT(&ne[i], daddrs.addr_entry) = &ne[i + 1];
1668 			RB_LEFT(&ne[i], daddrs.addr_entry) = NULL;
1669 			me = ne;
1670 			if (ratecheck(&uvm_kmapent_last_warn_time,
1671 			    &uvm_kmapent_warn_rate))
1672 				printf("uvm_mapent_alloc: out of static "
1673 				    "map entries\n");
1674 		}
1675 		uvm.kentry_free = RB_LEFT(me, daddrs.addr_entry);
1676 		uvmexp.kmapent++;
1677 		mtx_leave(&uvm_kmapent_mtx);
1678 		me->flags = UVM_MAP_STATIC;
1679 	} else if (map == kernel_map) {
1680 		splassert(IPL_NONE);
1681 		me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
1682 		if (me == NULL)
1683 			goto out;
1684 		me->flags = UVM_MAP_KMEM;
1685 	} else {
1686 		splassert(IPL_NONE);
1687 		me = pool_get(&uvm_map_entry_pool, pool_flags);
1688 		if (me == NULL)
1689 			goto out;
1690 		me->flags = 0;
1691 	}
1692 
1693 	if (me != NULL) {
1694 		RB_LEFT(me, daddrs.addr_entry) =
1695 		    RB_RIGHT(me, daddrs.addr_entry) =
1696 		    RB_PARENT(me, daddrs.addr_entry) = UVMMAP_DEADBEEF;
1697 	}
1698 
1699 out:
1700 	return(me);
1701 }
1702 
1703 /*
1704  * uvm_mapent_free: free map entry
1705  *
1706  * => XXX: static pool for kernel map?
1707  */
1708 void
1709 uvm_mapent_free(struct vm_map_entry *me)
1710 {
1711 	if (me->flags & UVM_MAP_STATIC) {
1712 		mtx_enter(&uvm_kmapent_mtx);
1713 		RB_LEFT(me, daddrs.addr_entry) = uvm.kentry_free;
1714 		uvm.kentry_free = me;
1715 		uvmexp.kmapent--;
1716 		mtx_leave(&uvm_kmapent_mtx);
1717 	} else if (me->flags & UVM_MAP_KMEM) {
1718 		splassert(IPL_NONE);
1719 		pool_put(&uvm_map_entry_kmem_pool, me);
1720 	} else {
1721 		splassert(IPL_NONE);
1722 		pool_put(&uvm_map_entry_pool, me);
1723 	}
1724 }
1725 
1726 /*
1727  * uvm_map_lookup_entry: find map entry at or before an address.
1728  *
1729  * => map must at least be read-locked by caller
1730  * => entry is returned in "entry"
1731  * => return value is true if address is in the returned entry
1732  * => UVM_ET_HOLE entries are considered to not contain a mapping and
1733  *    FALSE is returned for them.
1734  */
1735 boolean_t
1736 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1737     struct vm_map_entry **entry)
1738 {
1739 	*entry = uvm_map_entrybyaddr(&map->addr, address);
1740 	return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
1741 	    (*entry)->start <= address && (*entry)->end > address;
1742 }
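
/*
 * Example (sketch, not part of the build): typical caller pattern,
 * with the map at least read-locked as required above.
 *
 *	struct vm_map_entry *entry;
 *
 *	vm_map_lock_read(map);
 *	if (uvm_map_lookup_entry(map, va, &entry)) {
 *		(va lies within [entry->start, entry->end))
 *	}
 *	vm_map_unlock_read(map);
 */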
1743 
1744 /*
1745  * uvm_map_pie: return a properly aligned random load address for a
1746  * PIE executable.
1747  */
1748 #ifndef VM_PIE_MAX_ADDR
1749 #define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
1750 #endif
1751 
1752 #ifndef VM_PIE_MIN_ADDR
1753 #define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
1754 #endif
1755 
1756 #ifndef VM_PIE_MIN_ALIGN
1757 #define VM_PIE_MIN_ALIGN PAGE_SIZE
1758 #endif
1759 
1760 vaddr_t
1761 uvm_map_pie(vaddr_t align)
1762 {
1763 	vaddr_t addr, space, min;
1764 
1765 	align = MAX(align, VM_PIE_MIN_ALIGN);
1766 
1767 	/* round up to next alignment */
1768 	min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
1769 
1770 	if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
1771 		return (align);
1772 
1773 	space = (VM_PIE_MAX_ADDR - min) / align;
1774 	space = MIN(space, (u_int32_t)-1);
1775 
1776 	addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
1777 	addr += min;
1778 
1779 	return (addr);
1780 }
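
/*
 * Worked example (hypothetical values): with VM_PIE_MIN_ADDR 0x1000,
 * VM_PIE_MAX_ADDR 0x10000000 and align 0x10000, min rounds up to
 * 0x10000 and space becomes 0xfff, so the returned address is min plus
 * a uniformly chosen multiple of align, i.e. somewhere in
 * [0x10000, 0xfff0000], always below VM_PIE_MAX_ADDR.
 */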
1781 
1782 void
1783 uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
1784 {
1785 	struct uvm_map_deadq dead;
1786 
1787 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
1788 	    (end & (vaddr_t)PAGE_MASK) == 0);
1789 	TAILQ_INIT(&dead);
1790 	vm_map_lock(map);
1791 	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
1792 	vm_map_unlock(map);
1793 
1794 	if (map->flags & VM_MAP_INTRSAFE)
1795 		uvm_unmap_detach_intrsafe(&dead);
1796 	else
1797 		uvm_unmap_detach(&dead, 0);
1798 }
1799 
1800 /*
1801  * Mark entry as free.
1802  *
1803  * entry will be put on the dead list.
1804  * The free space will be merged into the previous or a new entry,
1805  * unless markfree is false.
1806  */
1807 void
1808 uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
1809     struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
1810     boolean_t markfree)
1811 {
1812 	struct uvm_addr_state	*free;
1813 	struct vm_map_entry	*prev;
1814 	vaddr_t			 addr;	/* Start of freed range. */
1815 	vaddr_t			 end;	/* End of freed range. */
1816 
1817 	prev = *prev_ptr;
1818 	if (prev == entry)
1819 		*prev_ptr = prev = NULL;
1820 
1821 	if (prev == NULL ||
1822 	    VMMAP_FREE_END(prev) != entry->start)
1823 		prev = RB_PREV(uvm_map_addr, &map->addr, entry);
1824 
1825 	/* Entry is describing only free memory and has nothing to drain into. */
1826 	if (prev == NULL && entry->start == entry->end && markfree) {
1827 		*prev_ptr = entry;
1828 		return;
1829 	}
1830 
1831 	addr = entry->start;
1832 	end = VMMAP_FREE_END(entry);
1833 	free = uvm_map_uaddr_e(map, entry);
1834 	uvm_mapent_free_remove(map, free, entry);
1835 	uvm_mapent_addr_remove(map, entry);
1836 	DEAD_ENTRY_PUSH(dead, entry);
1837 
1838 	if (markfree) {
1839 		if (prev) {
1840 			free = uvm_map_uaddr_e(map, prev);
1841 			uvm_mapent_free_remove(map, free, prev);
1842 		}
1843 		*prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
1844 	}
1845 }
1846 
1847 /*
1848  * Unwire and release referenced amap and object from map entry.
1849  */
1850 void
1851 uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
1852 {
1853 	/* Unwire removed map entry. */
1854 	if (VM_MAPENT_ISWIRED(entry)) {
1855 		KERNEL_LOCK();
1856 		entry->wired_count = 0;
1857 		uvm_fault_unwire_locked(map, entry->start, entry->end);
1858 		KERNEL_UNLOCK();
1859 	}
1860 
1861 	/* Entry-type specific code. */
1862 	if (UVM_ET_ISHOLE(entry)) {
1863 		/* Nothing to be done for holes. */
1864 	} else if (map->flags & VM_MAP_INTRSAFE) {
1865 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1866 		uvm_km_pgremove_intrsafe(entry->start, entry->end);
1867 		pmap_kremove(entry->start, entry->end - entry->start);
1868 	} else if (UVM_ET_ISOBJ(entry) &&
1869 	    UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
1870 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1871 		/*
1872 		 * Note: kernel object mappings are currently used in
1873 		 * two ways:
1874 		 *  [1] "normal" mappings of pages in the kernel object
1875 		 *  [2] uvm_km_valloc'd allocations in which we
1876 		 *      pmap_enter in some non-kernel-object page
1877 		 *      (e.g. vmapbuf).
1878 		 *
1879 		 * for case [1], we need to remove the mapping from
1880 		 * the pmap and then remove the page from the kernel
1881 		 * object (because, once pages in a kernel object are
1882 		 * unmapped they are no longer needed, unlike, say,
1883 		 * a vnode where you might want the data to persist
1884 		 * until flushed out of a queue).
1885 		 *
1886 		 * for case [2], we need to remove the mapping from
1887 		 * the pmap.  there shouldn't be any pages at the
1888 		 * specified offset in the kernel object [but it
1889 		 * doesn't hurt to call uvm_km_pgremove just to be
1890 		 * safe?]
1891 		 *
1892 		 * uvm_km_pgremove currently does the following:
1893 		 *   for pages in the kernel object range:
1894 		 *     - drops the swap slot
1895 		 *     - uvm_pagefree the page
1896 		 *
1897 		 * note there is a version of uvm_km_pgremove() that
1898 		 * is used for "intrsafe" objects.
1899 		 */
1900 		/*
1901 		 * remove mappings from pmap and drop the pages
1902 		 * from the object.  offsets are always relative
1903 		 * to vm_map_min(kernel_map).
1904 		 */
1905 		pmap_remove(pmap_kernel(), entry->start, entry->end);
1906 		uvm_km_pgremove(entry->object.uvm_obj,
1907 		    entry->start - vm_map_min(kernel_map),
1908 		    entry->end - vm_map_min(kernel_map));
1909 
1910 		/*
1911 		 * null out kernel_object reference, we've just
1912 		 * dropped it
1913 		 */
1914 		entry->etype &= ~UVM_ET_OBJ;
1915 		entry->object.uvm_obj = NULL;  /* to be safe */
1916 	} else {
1917 		/* remove mappings the standard way. */
1918 		pmap_remove(map->pmap, entry->start, entry->end);
1919 	}
1920 }
1921 
1922 /*
1923  * Remove all entries from start to end.
1924  *
1925  * If remove_holes, then remove ET_HOLE entries as well.
1926  * If markfree, entry will be properly marked free, otherwise, no replacement
1927  * entry will be put in the tree (corrupting the tree).
1928  */
1929 void
1930 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
1931     struct uvm_map_deadq *dead, boolean_t remove_holes,
1932     boolean_t markfree)
1933 {
1934 	struct vm_map_entry *prev_hint, *next, *entry;
1935 
1936 	start = MAX(start, map->min_offset);
1937 	end = MIN(end, map->max_offset);
1938 	if (start >= end)
1939 		return;
1940 
1941 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1942 		splassert(IPL_NONE);
1943 	else
1944 		splassert(IPL_VM);
1945 
1946 	/* Find first affected entry. */
1947 	entry = uvm_map_entrybyaddr(&map->addr, start);
1948 	KDASSERT(entry != NULL && entry->start <= start);
1949 	if (entry->end <= start && markfree)
1950 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
1951 	else
1952 		UVM_MAP_CLIP_START(map, entry, start);
1953 
1954 	/*
1955 	 * Iterate entries until we reach end address.
1956 	 * prev_hint hints where the freed space can be appended to.
1957 	 */
1958 	prev_hint = NULL;
1959 	for (; entry != NULL && entry->start < end; entry = next) {
1960 		KDASSERT(entry->start >= start);
1961 		if (entry->end > end || !markfree)
1962 			UVM_MAP_CLIP_END(map, entry, end);
1963 		KDASSERT(entry->start >= start && entry->end <= end);
1964 		next = RB_NEXT(uvm_map_addr, &map->addr, entry);
1965 
1966 		/* Don't remove holes unless asked to do so. */
1967 		if (UVM_ET_ISHOLE(entry)) {
1968 			if (!remove_holes) {
1969 				prev_hint = entry;
1970 				continue;
1971 			}
1972 		}
1973 
1974 		/* Kill entry. */
1975 		uvm_unmap_kill_entry(map, entry);
1976 
1977 		/* Update space usage. */
1978 		if ((map->flags & VM_MAP_ISVMSPACE) &&
1979 		    entry->object.uvm_obj == NULL &&
1980 		    !UVM_ET_ISHOLE(entry)) {
1981 			((struct vmspace *)map)->vm_dused -=
1982 			    uvmspace_dused(map, entry->start, entry->end);
1983 		}
1984 		if (!UVM_ET_ISHOLE(entry))
1985 			map->size -= entry->end - entry->start;
1986 
1987 		/* Actual removal of entry. */
1988 		uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
1989 	}
1990 
1991 	pmap_update(vm_map_pmap(map));
1992 
1993 #ifdef VMMAP_DEBUG
1994 	if (markfree) {
1995 		for (entry = uvm_map_entrybyaddr(&map->addr, start);
1996 		    entry != NULL && entry->start < end;
1997 		    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
1998 			KDASSERT(entry->end <= start ||
1999 			    entry->start == entry->end ||
2000 			    UVM_ET_ISHOLE(entry));
2001 		}
2002 	} else {
2003 		vaddr_t a;
2004 		for (a = start; a < end; a += PAGE_SIZE)
2005 			KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
2006 	}
2007 #endif
2008 }
2009 
2010 /*
2011  * Mark all entries from first until end (exclusive) as pageable.
2012  *
2013  * Lock must be exclusive on entry and will not be touched.
2014  */
2015 void
2016 uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
2017     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
2018 {
2019 	struct vm_map_entry *iter;
2020 
2021 	for (iter = first; iter != end;
2022 	    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
2023 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2024 		if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2025 			continue;
2026 
2027 		iter->wired_count = 0;
2028 		uvm_fault_unwire_locked(map, iter->start, iter->end);
2029 	}
2030 }
2031 
2032 /*
2033  * Mark all entries from first until end (exclusive) as wired.
2034  *
2035  * Lockflags determines the lock state on return from this function.
2036  * Lock must be exclusive on entry.
2037  */
2038 int
2039 uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
2040     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
2041     int lockflags)
2042 {
2043 	struct vm_map_entry *iter;
2044 #ifdef DIAGNOSTIC
2045 	unsigned int timestamp_save;
2046 #endif
2047 	int error;
2048 
2049 	/*
2050 	 * Wire pages in two passes:
2051 	 *
2052 	 * 1: holding the write lock, we create any anonymous maps that need
2053 	 *    to be created.  then we clip each map entry to the region to
2054 	 *    be wired and increment its wiring count.
2055 	 *
2056 	 * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
2057 	 *    in the pages for any newly wired area (wired_count == 1).
2058 	 *
2059 	 *    downgrading to a read lock for uvm_fault_wire avoids a possible
2060 	 *    deadlock with another thread that may have faulted on one of
2061 	 *    the pages to be wired (it would mark the page busy, blocking
2062 	 *    us, then in turn block on the map lock that we hold).
2063 	 *    because we keep the read lock on the map, the copy-on-write
2064 	 *    status of the entries we modify here cannot change.
2065 	 */
2066 	for (iter = first; iter != end;
2067 	    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
2068 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2069 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2070 		    iter->protection == PROT_NONE)
2071 			continue;
2072 
2073 		/*
2074 		 * Perform actions of vm_map_lookup that need the write lock.
2075 		 * - create an anonymous map for copy-on-write
2076 		 * - anonymous map for zero-fill
2077 		 * Skip submaps.
2078 		 */
2079 		if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
2080 		    UVM_ET_ISNEEDSCOPY(iter) &&
2081 		    ((iter->protection & PROT_WRITE) ||
2082 		    iter->object.uvm_obj == NULL)) {
2083 			amap_copy(map, iter, M_WAITOK, TRUE,
2084 			    iter->start, iter->end);
2085 		}
2086 		iter->wired_count++;
2087 	}
2088 
2089 	/*
2090 	 * Pass 2.
2091 	 */
2092 #ifdef DIAGNOSTIC
2093 	timestamp_save = map->timestamp;
2094 #endif
2095 	vm_map_busy(map);
2096 	vm_map_downgrade(map);
2097 
2098 	error = 0;
2099 	for (iter = first; error == 0 && iter != end;
2100 	    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
2101 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2102 		    iter->protection == PROT_NONE)
2103 			continue;
2104 
2105 		error = uvm_fault_wire(map, iter->start, iter->end,
2106 		    iter->protection);
2107 	}
2108 
2109 	if (error) {
2110 		/*
2111 		 * uvm_fault_wire failure
2112 		 *
2113 		 * Reacquire lock and undo our work.
2114 		 */
2115 		vm_map_upgrade(map);
2116 		vm_map_unbusy(map);
2117 #ifdef DIAGNOSTIC
2118 		if (timestamp_save != map->timestamp)
2119 			panic("uvm_map_pageable_wire: stale map");
2120 #endif
2121 
2122 		/*
2123 		 * first is no longer needed to restart loops.
2124 		 * Use it as iterator to unwire the entries wired so far.
2125 		 */
2126 		for (; first != iter;
2127 		    first = RB_NEXT(uvm_map_addr, &map->addr, first)) {
2128 			if (UVM_ET_ISHOLE(first) ||
2129 			    first->start == first->end ||
2130 			    first->protection == PROT_NONE)
2131 				continue;
2132 
2133 			first->wired_count--;
2134 			if (!VM_MAPENT_ISWIRED(first)) {
2135 				uvm_fault_unwire_locked(map,
2136 				    first->start, first->end);
2137 			}
2138 		}
2139 
2140 		/* decrease counter in the rest of the entries */
2141 		for (; iter != end;
2142 		    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
2143 			if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2144 			    iter->protection == PROT_NONE)
2145 				continue;
2146 
2147 			iter->wired_count--;
2148 		}
2149 
2150 		if ((lockflags & UVM_LK_EXIT) == 0)
2151 			vm_map_unlock(map);
2152 		return error;
2153 	}
2154 
2155 	/* We are currently holding a read lock. */
2156 	if ((lockflags & UVM_LK_EXIT) == 0) {
2157 		vm_map_unbusy(map);
2158 		vm_map_unlock_read(map);
2159 	} else {
2160 		vm_map_upgrade(map);
2161 		vm_map_unbusy(map);
2162 #ifdef DIAGNOSTIC
2163 		if (timestamp_save != map->timestamp)
2164 			panic("uvm_map_pageable_wire: stale map");
2165 #endif
2166 	}
2167 	return 0;
2168 }
2169 
2170 /*
2171  * uvm_map_pageable: set pageability of a range in a map.
2172  *
2173  * Flags:
2174  * UVM_LK_ENTER: map is already locked by caller
2175  * UVM_LK_EXIT:  don't unlock map on exit
2176  *
2177  * The full range must be in use (entries may not have fspace != 0).
2178  * UVM_ET_HOLE counts as unmapped.
2179  */
2180 int
2181 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
2182     boolean_t new_pageable, int lockflags)
2183 {
2184 	struct vm_map_entry *first, *last, *tmp;
2185 	int error;
2186 
2187 	start = trunc_page(start);
2188 	end = round_page(end);
2189 
2190 	if (start > end)
2191 		return EINVAL;
2192 	if (start == end)
2193 		return 0;	/* nothing to do */
2194 	if (start < map->min_offset)
2195 		return EFAULT; /* why? see first XXX below */
2196 	if (end > map->max_offset)
2197 		return EINVAL; /* why? see second XXX below */
2198 
2199 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2200 	if ((lockflags & UVM_LK_ENTER) == 0)
2201 		vm_map_lock(map);
2202 
2203 	/*
2204 	 * Find first entry.
2205 	 *
2206 	 * Initial test on start is different, because of the different
2207 	 * error returned. Rest is tested further down.
2208 	 */
2209 	first = uvm_map_entrybyaddr(&map->addr, start);
2210 	if (first->end <= start || UVM_ET_ISHOLE(first)) {
2211 		/*
2212 		 * XXX if the first address is not mapped, it is EFAULT?
2213 		 */
2214 		error = EFAULT;
2215 		goto out;
2216 	}
2217 
2218 	/* Check that the range has no holes. */
2219 	for (last = first; last != NULL && last->start < end;
2220 	    last = RB_NEXT(uvm_map_addr, &map->addr, last)) {
2221 		if (UVM_ET_ISHOLE(last) ||
2222 		    (last->end < end && VMMAP_FREE_END(last) != last->end)) {
2223 			/*
2224 			 * XXX unmapped memory in range, why is it EINVAL
2225 			 * instead of EFAULT?
2226 			 */
2227 			error = EINVAL;
2228 			goto out;
2229 		}
2230 	}
2231 
2232 	/*
2233 	 * Last ended at the first entry after the range.
2234 	 * Move back one step.
2235 	 *
2236 	 * Note that last may be NULL.
2237 	 */
2238 	if (last == NULL) {
2239 		last = RB_MAX(uvm_map_addr, &map->addr);
2240 		if (last->end < end) {
2241 			error = EINVAL;
2242 			goto out;
2243 		}
2244 	} else {
2245 		KASSERT(last != first);
2246 		last = RB_PREV(uvm_map_addr, &map->addr, last);
2247 	}
2248 
2249 	/* Wire/unwire pages here. */
2250 	if (new_pageable) {
2251 		/*
2252 		 * Mark pageable.
2253 		 * entries that are not wired are untouched.
2254 		 */
2255 		if (VM_MAPENT_ISWIRED(first))
2256 			UVM_MAP_CLIP_START(map, first, start);
2257 		/*
2258 		 * Split last at end.
2259 		 * Make tmp be the first entry after what is to be touched.
2260 		 * If last is not wired, don't touch it.
2261 		 */
2262 		if (VM_MAPENT_ISWIRED(last)) {
2263 			UVM_MAP_CLIP_END(map, last, end);
2264 			tmp = RB_NEXT(uvm_map_addr, &map->addr, last);
2265 		} else
2266 			tmp = last;
2267 
2268 		uvm_map_pageable_pgon(map, first, tmp, start, end);
2269 		error = 0;
2270 
2271 out:
2272 		if ((lockflags & UVM_LK_EXIT) == 0)
2273 			vm_map_unlock(map);
2274 		return error;
2275 	} else {
2276 		/*
2277 		 * Mark entries wired.
2278 		 * entries are always touched (because recovery needs this).
2279 		 */
2280 		if (!VM_MAPENT_ISWIRED(first))
2281 			UVM_MAP_CLIP_START(map, first, start);
2282 		/*
2283 		 * Split last at end.
2284 		 * Make tmp be the first entry after what is to be touched.
2285 		 * If last is not wired, don't touch it.
2286 		 */
2287 		if (!VM_MAPENT_ISWIRED(last)) {
2288 			UVM_MAP_CLIP_END(map, last, end);
2289 			tmp = RB_NEXT(uvm_map_addr, &map->addr, last);
2290 		} else
2291 			tmp = last;
2292 
2293 		return uvm_map_pageable_wire(map, first, tmp, start, end,
2294 		    lockflags);
2295 	}
2296 }
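
/*
 * Example (sketch, not part of the build): wiring and later unwiring a
 * page-aligned range, in the style of mlock(2)/munlock(2) callers.
 *
 *	error = uvm_map_pageable(map, start, end, FALSE, 0);
 *	...
 *	error = uvm_map_pageable(map, start, end, TRUE, 0);
 *
 * new_pageable == FALSE wires the range; TRUE makes it pageable again.
 */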
2297 
2298 /*
2299  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
2300  * all mapped regions.
2301  *
2302  * Map must not be locked.
2303  * If no flags are specified, all regions are unwired.
2304  */
2305 int
2306 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
2307 {
2308 	vsize_t size;
2309 	struct vm_map_entry *iter;
2310 
2311 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2312 	vm_map_lock(map);
2313 
2314 	if (flags == 0) {
2315 		uvm_map_pageable_pgon(map, RB_MIN(uvm_map_addr, &map->addr),
2316 		    NULL, map->min_offset, map->max_offset);
2317 
2318 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
2319 		vm_map_unlock(map);
2320 		return 0;
2321 	}
2322 
2323 	if (flags & MCL_FUTURE)
2324 		vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
2325 	if (!(flags & MCL_CURRENT)) {
2326 		vm_map_unlock(map);
2327 		return 0;
2328 	}
2329 
2330 	/*
2331 	 * Count number of pages in all non-wired entries.
2332 	 * If the number exceeds the limit, abort.
2333 	 */
2334 	size = 0;
2335 	RB_FOREACH(iter, uvm_map_addr, &map->addr) {
2336 		if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2337 			continue;
2338 
2339 		size += iter->end - iter->start;
2340 	}
2341 
2342 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
2343 		vm_map_unlock(map);
2344 		return ENOMEM;
2345 	}
2346 
2347 	/* XXX non-pmap_wired_count case must be handled by caller */
2348 #ifdef pmap_wired_count
2349 	if (limit != 0 &&
2350 	    size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
2351 		vm_map_unlock(map);
2352 		return ENOMEM;
2353 	}
2354 #endif
2355 
2356 	/*
2357 	 * uvm_map_pageable_wire will release the lock.
2358 	 */
2359 	return uvm_map_pageable_wire(map, RB_MIN(uvm_map_addr, &map->addr),
2360 	    NULL, map->min_offset, map->max_offset, 0);
2361 }
2362 
2363 /*
2364  * Initialize map.
2365  *
2366  * Allocates sufficient entries to describe the free memory in the map.
2367  */
2368 void
2369 uvm_map_setup(struct vm_map *map, vaddr_t min, vaddr_t max, int flags)
2370 {
2371 	int i;
2372 
2373 	KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
2374 	KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
2375 	    (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
2376 
2377 	/*
2378 	 * Update parameters.
2379 	 *
2380 	 * This code handles (vaddr_t)-1 and other page mask ending addresses
2381 	 * properly.
2382 	 * We lose the top page if the full virtual address space is used.
2383 	 */
2384 	if (max & (vaddr_t)PAGE_MASK) {
2385 		max += 1;
2386 		if (max == 0) /* overflow */
2387 			max -= PAGE_SIZE;
2388 	}
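	/*
	 * For example, max == (vaddr_t)-1 on a 64-bit platform with 4K
	 * pages first overflows to 0 and is then pulled back to
	 * 0xfffffffffffff000, sacrificing the top page so that max stays
	 * a valid, page-aligned exclusive upper bound.
	 */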
2389 
2390 	RB_INIT(&map->addr);
2391 	map->uaddr_exe = NULL;
2392 	for (i = 0; i < nitems(map->uaddr_any); ++i)
2393 		map->uaddr_any[i] = NULL;
2394 	map->uaddr_brk_stack = NULL;
2395 
2396 	map->size = 0;
2397 	map->ref_count = 0;
2398 	map->min_offset = min;
2399 	map->max_offset = max;
2400 	map->b_start = map->b_end = 0; /* Empty brk() area by default. */
2401 	map->s_start = map->s_end = 0; /* Empty stack area by default. */
2402 	map->flags = flags;
2403 	map->timestamp = 0;
2404 	rw_init(&map->lock, "vmmaplk");
2405 	mtx_init(&map->mtx, IPL_VM);
2406 	mtx_init(&map->flags_lock, IPL_VM);
2407 
2408 	/* Configure the allocators. */
2409 	if (flags & VM_MAP_ISVMSPACE)
2410 		uvm_map_setup_md(map);
2411 	else
2412 		map->uaddr_any[3] = &uaddr_kbootstrap;
2413 
2414 	/*
2415 	 * Fill map entries.
2416 	 * We do not need to write-lock the map here because only the current
2417 	 * thread sees it right now. Initialize ref_count to 0 above to avoid
2418 	 * bogus triggering of lock-not-held assertions.
2419 	 */
2420 	uvm_map_setup_entries(map);
2421 	uvm_tree_sanity(map, __FILE__, __LINE__);
2422 	map->ref_count = 1;
2423 }
2424 
2425 /*
2426  * Destroy the map.
2427  *
2428  * This is the inverse operation to uvm_map_setup.
2429  */
2430 void
2431 uvm_map_teardown(struct vm_map *map)
2432 {
2433 	struct uvm_map_deadq	 dead_entries;
2434 	struct vm_map_entry	*entry, *tmp;
2435 #ifdef VMMAP_DEBUG
2436 	size_t			 numq, numt;
2437 #endif
2438 	int			 i;
2439 
2440 	KERNEL_ASSERT_LOCKED();
2441 	KERNEL_UNLOCK();
2442 	KERNEL_ASSERT_UNLOCKED();
2443 
2444 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
2445 
2446 	/* Remove address selectors. */
2447 	uvm_addr_destroy(map->uaddr_exe);
2448 	map->uaddr_exe = NULL;
2449 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2450 		uvm_addr_destroy(map->uaddr_any[i]);
2451 		map->uaddr_any[i] = NULL;
2452 	}
2453 	uvm_addr_destroy(map->uaddr_brk_stack);
2454 	map->uaddr_brk_stack = NULL;
2455 
2456 	/*
2457 	 * Remove entries.
2458 	 *
2459 	 * The following is based on graph breadth-first search.
2460 	 *
2461 	 * In color terms:
2462 	 * - the dead_entries set contains all nodes that are reachable
2463 	 *   (i.e. both the black and the grey nodes)
2464 	 * - any entry not in dead_entries is white
2465 	 * - any entry that appears in dead_entries before entry
2466 	 *   is black, the rest is grey.
2467 	 * The set [entry, end] is also referred to as the wavefront.
2468 	 *
2469 	 * Since the tree is always a fully connected graph, the breadth-first
2470 	 * search guarantees that each vmmap_entry is visited exactly once.
2471 	 * The vm_map is broken down in linear time.
2472 	 */
2473 	TAILQ_INIT(&dead_entries);
2474 	if ((entry = RB_ROOT(&map->addr)) != NULL)
2475 		DEAD_ENTRY_PUSH(&dead_entries, entry);
2476 	while (entry != NULL) {
2477 		sched_pause();
2478 		uvm_unmap_kill_entry(map, entry);
2479 		if ((tmp = RB_LEFT(entry, daddrs.addr_entry)) != NULL)
2480 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2481 		if ((tmp = RB_RIGHT(entry, daddrs.addr_entry)) != NULL)
2482 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2483 		/* Update wave-front. */
2484 		entry = TAILQ_NEXT(entry, dfree.deadq);
2485 	}
2486 
2487 #ifdef VMMAP_DEBUG
2488 	numt = numq = 0;
2489 	RB_FOREACH(entry, uvm_map_addr, &map->addr)
2490 		numt++;
2491 	TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
2492 		numq++;
2493 	KASSERT(numt == numq);
2494 #endif
2495 	uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
2496 
2497 	KERNEL_LOCK();
2498 
2499 	pmap_destroy(map->pmap);
2500 	map->pmap = NULL;
2501 }
2502 
2503 /*
2504  * Populate map with free-memory entries.
2505  *
2506  * Map must be initialized and empty.
2507  */
2508 void
2509 uvm_map_setup_entries(struct vm_map *map)
2510 {
2511 	KDASSERT(RB_EMPTY(&map->addr));
2512 
2513 	uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
2514 }
2515 
2516 /*
2517  * Split entry at given address.
2518  *
2519  * orig:  entry that is to be split.
2520  * next:  a newly allocated map entry that is not linked.
2521  * split: address at which the split is done.
2522  */
2523 void
2524 uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
2525     struct vm_map_entry *next, vaddr_t split)
2526 {
2527 	struct uvm_addr_state *free, *free_before;
2528 	vsize_t adj;
2529 
2530 	if ((split & PAGE_MASK) != 0) {
2531 		panic("uvm_map_splitentry: split address 0x%lx "
2532 		    "not on page boundary!", split);
2533 	}
2534 	KDASSERT(map != NULL && orig != NULL && next != NULL);
2535 	uvm_tree_sanity(map, __FILE__, __LINE__);
2536 	KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
2537 
2538 #ifdef VMMAP_DEBUG
2539 	KDASSERT(RB_FIND(uvm_map_addr, &map->addr, orig) == orig);
2540 	KDASSERT(RB_FIND(uvm_map_addr, &map->addr, next) != next);
2541 #endif /* VMMAP_DEBUG */
2542 
2543 	/*
2544 	 * Free space will change, unlink from free space tree.
2545 	 */
2546 	free = uvm_map_uaddr_e(map, orig);
2547 	uvm_mapent_free_remove(map, free, orig);
2548 
2549 	adj = split - orig->start;
2550 
2551 	uvm_mapent_copy(orig, next);
2552 	if (split >= orig->end) {
2553 		next->etype = 0;
2554 		next->offset = 0;
2555 		next->wired_count = 0;
2556 		next->start = next->end = split;
2557 		next->guard = 0;
2558 		next->fspace = VMMAP_FREE_END(orig) - split;
2559 		next->aref.ar_amap = NULL;
2560 		next->aref.ar_pageoff = 0;
2561 		orig->guard = MIN(orig->guard, split - orig->end);
2562 		orig->fspace = split - VMMAP_FREE_START(orig);
2563 	} else {
2564 		orig->fspace = 0;
2565 		orig->guard = 0;
2566 		orig->end = next->start = split;
2567 
2568 		if (next->aref.ar_amap) {
2569 			KERNEL_LOCK();
2570 			amap_splitref(&orig->aref, &next->aref, adj);
2571 			KERNEL_UNLOCK();
2572 		}
2573 		if (UVM_ET_ISSUBMAP(orig)) {
2574 			uvm_map_reference(next->object.sub_map);
2575 			next->offset += adj;
2576 		} else if (UVM_ET_ISOBJ(orig)) {
2577 			if (next->object.uvm_obj->pgops &&
2578 			    next->object.uvm_obj->pgops->pgo_reference) {
2579 				KERNEL_LOCK();
2580 				next->object.uvm_obj->pgops->pgo_reference(
2581 				    next->object.uvm_obj);
2582 				KERNEL_UNLOCK();
2583 			}
2584 			next->offset += adj;
2585 		}
2586 	}
2587 
2588 	/*
2589 	 * Link next into address tree.
2590 	 * Link orig and next into free-space tree.
2591 	 *
2592 	 * Don't insert 'next' into the addr tree until orig has been linked,
2593 	 * in case the free-list looks at adjacent entries in the addr tree
2594 	 * for its decisions.
2595 	 */
2596 	if (orig->fspace > 0)
2597 		free_before = free;
2598 	else
2599 		free_before = uvm_map_uaddr_e(map, orig);
2600 	uvm_mapent_free_insert(map, free_before, orig);
2601 	uvm_mapent_addr_insert(map, next);
2602 	uvm_mapent_free_insert(map, free, next);
2603 
2604 	uvm_tree_sanity(map, __FILE__, __LINE__);
2605 }
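
/*
 * Example (sketch): if orig maps [A, B) and owns free space up to F,
 * splitting at S with A < S < B leaves orig covering [A, S) and next
 * covering [S, B) plus the free space up to F.  If S lies beyond B,
 * inside the free space, orig keeps the whole mapping [A, B) and next
 * becomes an empty entry describing only the free space [S, F).
 */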
2606 
2607 
2608 #ifdef VMMAP_DEBUG
2609 
2610 void
2611 uvm_tree_assert(struct vm_map *map, int test, char *test_str,
2612     char *file, int line)
2613 {
2614 	char* map_special;
2615 
2616 	if (test)
2617 		return;
2618 
2619 	if (map == kernel_map)
2620 		map_special = " (kernel_map)";
2621 	else if (map == kmem_map)
2622 		map_special = " (kmem_map)";
2623 	else
2624 		map_special = "";
2625 	panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
2626 	    line, test_str);
2627 }
2628 
2629 /*
2630  * Check that map is sane.
2631  */
2632 void
2633 uvm_tree_sanity(struct vm_map *map, char *file, int line)
2634 {
2635 	struct vm_map_entry	*iter;
2636 	vaddr_t			 addr;
2637 	vaddr_t			 min, max, bound; /* Bounds checker. */
2638 	struct uvm_addr_state	*free;
2639 
2640 	addr = vm_map_min(map);
2641 	RB_FOREACH(iter, uvm_map_addr, &map->addr) {
2642 		/*
2643 		 * Valid start, end.
2644 		 * Catch overflow for end+fspace.
2645 		 */
2646 		UVM_ASSERT(map, iter->end >= iter->start, file, line);
2647 		UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
2648 
2649 		/* May not be empty. */
2650 		UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
2651 		    file, line);
2652 
2653 		/* Addresses for entry must lie within map boundaries. */
2654 		UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
2655 		    VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
2656 
2657 		/* Tree may not have gaps. */
2658 		UVM_ASSERT(map, iter->start == addr, file, line);
2659 		addr = VMMAP_FREE_END(iter);
2660 
2661 		/*
2662 		 * Free space may not cross boundaries, unless the same
2663 		 * free list is used on both sides of the border.
2664 		 */
2665 		min = VMMAP_FREE_START(iter);
2666 		max = VMMAP_FREE_END(iter);
2667 
2668 		while (min < max &&
2669 		    (bound = uvm_map_boundary(map, min, max)) != max) {
2670 			UVM_ASSERT(map,
2671 			    uvm_map_uaddr(map, bound - 1) ==
2672 			    uvm_map_uaddr(map, bound),
2673 			    file, line);
2674 			min = bound;
2675 		}
2676 
2677 		free = uvm_map_uaddr_e(map, iter);
2678 		if (free) {
2679 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
2680 			    file, line);
2681 		} else {
2682 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
2683 			    file, line);
2684 		}
2685 	}
2686 	UVM_ASSERT(map, addr == vm_map_max(map), file, line);
2687 }
2688 
2689 void
2690 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
2691 {
2692 	struct vm_map_entry *iter;
2693 	vsize_t size;
2694 
2695 	size = 0;
2696 	RB_FOREACH(iter, uvm_map_addr, &map->addr) {
2697 		if (!UVM_ET_ISHOLE(iter))
2698 			size += iter->end - iter->start;
2699 	}
2700 
2701 	if (map->size != size)
2702 		printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
2703 	UVM_ASSERT(map, map->size == size, file, line);
2704 
2705 	vmspace_validate(map);
2706 }
2707 
2708 /*
2709  * This function validates the statistics on vmspace.
2710  */
2711 void
2712 vmspace_validate(struct vm_map *map)
2713 {
2714 	struct vmspace *vm;
2715 	struct vm_map_entry *iter;
2716 	vaddr_t imin, imax;
2717 	vaddr_t stack_begin, stack_end; /* Position of stack. */
2718 	vsize_t stack, heap; /* Measured sizes. */
2719 
2720 	if (!(map->flags & VM_MAP_ISVMSPACE))
2721 		return;
2722 
2723 	vm = (struct vmspace *)map;
2724 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2725 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2726 
2727 	stack = heap = 0;
2728 	RB_FOREACH(iter, uvm_map_addr, &map->addr) {
2729 		imin = imax = iter->start;
2730 
2731 		if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL)
2732 			continue;
2733 
2734 		/*
2735 		 * Update stack, heap.
2736 		 * Keep in mind that (theoretically) the entries of
2737 		 * userspace and stack may be joined.
2738 		 */
2739 		while (imin != iter->end) {
2740 			/*
2741 			 * Set imax to the first boundary crossed between
2742 			 * imin and stack addresses.
2743 			 */
2744 			imax = iter->end;
2745 			if (imin < stack_begin && imax > stack_begin)
2746 				imax = stack_begin;
2747 			else if (imin < stack_end && imax > stack_end)
2748 				imax = stack_end;
2749 
2750 			if (imin >= stack_begin && imin < stack_end)
2751 				stack += imax - imin;
2752 			else
2753 				heap += imax - imin;
2754 			imin = imax;
2755 		}
2756 	}
2757 
2758 	heap >>= PAGE_SHIFT;
2759 	if (heap != vm->vm_dused) {
2760 		printf("vmspace stack range: 0x%lx-0x%lx\n",
2761 		    stack_begin, stack_end);
2762 		panic("vmspace_validate: vmspace.vm_dused invalid, "
2763 		    "expected %ld pgs, got %ld pgs in map %p",
2764 		    heap, vm->vm_dused,
2765 		    map);
2766 	}
2767 }
2768 
2769 #endif /* VMMAP_DEBUG */
2770 
2771 /*
2772  * uvm_map_init: init mapping system at boot time.   note that we allocate
2773  * and init the static pool of structs vm_map_entry for the kernel here.
2774  */
2775 void
2776 uvm_map_init(void)
2777 {
2778 	static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
2779 	int lcv;
2780 
2781 	/* now set up static pool of kernel map entries ... */
2782 	mtx_init(&uvm_kmapent_mtx, IPL_VM);
2783 	uvm.kentry_free = NULL;
2784 	for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
2785 		RB_LEFT(&kernel_map_entry[lcv], daddrs.addr_entry) =
2786 		    uvm.kentry_free;
2787 		uvm.kentry_free = &kernel_map_entry[lcv];
2788 	}
2789 
2790 	/* initialize the map-related pools. */
2791 	pool_init(&uvm_vmspace_pool, sizeof(struct vmspace),
2792 	    0, 0, PR_WAITOK, "vmsppl", NULL);
2793 	pool_setipl(&uvm_vmspace_pool, IPL_NONE);
2794 	pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry),
2795 	    0, 0, PR_WAITOK, "vmmpepl", NULL);
2796 	pool_setipl(&uvm_map_entry_pool, IPL_VM);
2797 	pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry),
2798 	    0, 0, 0, "vmmpekpl", NULL);
2799 	pool_setipl(&uvm_map_entry_kmem_pool, IPL_NONE);
2800 	pool_sethiwat(&uvm_map_entry_pool, 8192);
2801 
2802 	uvm_addr_init();
2803 }
2804 
2805 #if defined(DDB)
2806 
2807 /*
2808  * DDB hooks
2809  */
2810 
2811 /*
2812  * uvm_map_printit: actually prints the map
2813  */
2814 void
2815 uvm_map_printit(struct vm_map *map, boolean_t full,
2816     int (*pr)(const char *, ...))
2817 {
2818 	struct vmspace			*vm;
2819 	struct vm_map_entry		*entry;
2820 	struct uvm_addr_state		*free;
2821 	int				 in_free, i;
2822 	char				 buf[8];
2823 
2824 	(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
2825 	(*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
2826 	    map->b_start, map->b_end);
2827 	(*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
2828 	    map->s_start, map->s_end);
2829 	(*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
2830 	    map->size, map->ref_count, map->timestamp,
2831 	    map->flags);
2832 	(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
2833 	    pmap_resident_count(map->pmap));
2834 
2835 	/* struct vmspace handling. */
2836 	if (map->flags & VM_MAP_ISVMSPACE) {
2837 		vm = (struct vmspace *)map;
2838 
2839 		(*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
2840 		    vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
2841 		(*pr)("\tvm_tsize=%u vm_dsize=%u\n",
2842 		    vm->vm_tsize, vm->vm_dsize);
2843 		(*pr)("\tvm_taddr=%p vm_daddr=%p\n",
2844 		    vm->vm_taddr, vm->vm_daddr);
2845 		(*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
2846 		    vm->vm_maxsaddr, vm->vm_minsaddr);
2847 	}
2848 
2849 	if (!full)
2850 		goto print_uaddr;
2851 	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
2852 		(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
2853 		    entry, entry->start, entry->end, entry->object.uvm_obj,
2854 		    (long long)entry->offset, entry->aref.ar_amap,
2855 		    entry->aref.ar_pageoff);
2856 		(*pr)("\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
2857 		    "wc=%d, adv=%d\n",
2858 		    (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
2859 		    (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
2860 		    (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
2861 		    entry->protection, entry->max_protection,
2862 		    entry->inheritance, entry->wired_count, entry->advice);
2863 
2864 		free = uvm_map_uaddr_e(map, entry);
2865 		in_free = (free != NULL);
2866 		(*pr)("\thole=%c, free=%c, guard=0x%lx, "
2867 		    "free=0x%lx-0x%lx\n",
2868 		    (entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
2869 		    in_free ? 'T' : 'F',
2870 		    entry->guard,
2871 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
2872 		(*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
2873 		(*pr)("\tfreemapped=%c, uaddr=%p\n",
2874 		    (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
2875 		if (free) {
2876 			(*pr)("\t\t(0x%lx-0x%lx %s)\n",
2877 			    free->uaddr_minaddr, free->uaddr_maxaddr,
2878 			    free->uaddr_functions->uaddr_name);
2879 		}
2880 	}
2881 
2882 print_uaddr:
2883 	uvm_addr_print(map->uaddr_exe, "exe", full, pr);
2884 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2885 		snprintf(&buf[0], sizeof(buf), "any[%d]", i);
2886 		uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
2887 	}
2888 	uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
2889 }
2890 
2891 /*
2892  * uvm_object_printit: actually prints the object
2893  */
2894 void
2895 uvm_object_printit(uobj, full, pr)
2896 	struct uvm_object *uobj;
2897 	boolean_t full;
2898 	int (*pr)(const char *, ...);
2899 {
2900 	struct vm_page *pg;
2901 	int cnt = 0;
2902 
2903 	(*pr)("OBJECT %p: pgops=%p, npages=%d, ",
2904 	    uobj, uobj->pgops, uobj->uo_npages);
2905 	if (UVM_OBJ_IS_KERN_OBJECT(uobj))
2906 		(*pr)("refs=<SYSTEM>\n");
2907 	else
2908 		(*pr)("refs=%d\n", uobj->uo_refs);
2909 
2910 	if (!full) {
2911 		return;
2912 	}
2913 	(*pr)("  PAGES <pg,offset>:\n  ");
2914 	RB_FOREACH(pg, uvm_objtree, &uobj->memt) {
2915 		(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
2916 		if ((cnt % 3) == 2) {
2917 			(*pr)("\n  ");
2918 		}
2919 		cnt++;
2920 	}
2921 	if ((cnt % 3) != 2) {
2922 		(*pr)("\n");
2923 	}
2924 }
2925 
2926 /*
2927  * uvm_page_printit: actually print the page
2928  */
2929 static const char page_flagbits[] =
2930 	"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
2931 	"\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
2932 	"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
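
/*
 * The leading \20 above selects base 16 for the kernel "%b" format;
 * the pairs that follow are bit positions and names.  A page with
 * pg_flags 0x5, for instance, would print roughly as "5<BUSY,TABLED>".
 */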
2933 
2934 void
2935 uvm_page_printit(pg, full, pr)
2936 	struct vm_page *pg;
2937 	boolean_t full;
2938 	int (*pr)(const char *, ...);
2939 {
2940 	struct vm_page *tpg;
2941 	struct uvm_object *uobj;
2942 	struct pglist *pgl;
2943 
2944 	(*pr)("PAGE %p:\n", pg);
2945 	(*pr)("  flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
2946 	    pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
2947 	    (long long)pg->phys_addr);
2948 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
2949 	    pg->uobject, pg->uanon, (long long)pg->offset);
2950 #if defined(UVM_PAGE_TRKOWN)
2951 	if (pg->pg_flags & PG_BUSY)
2952 		(*pr)("  owning process = %d, tag=%s",
2953 		    pg->owner, pg->owner_tag);
2954 	else
2955 		(*pr)("  page not busy, no owner");
2956 #else
2957 	(*pr)("  [page ownership tracking disabled]");
2958 #endif
2959 	(*pr)("\tvm_page_md %p\n", &pg->mdpage);
2960 
2961 	if (!full)
2962 		return;
2963 
2964 	/* cross-verify object/anon */
2965 	if ((pg->pg_flags & PQ_FREE) == 0) {
2966 		if (pg->pg_flags & PQ_ANON) {
2967 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
2968 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2969 				(pg->uanon) ? pg->uanon->an_page : NULL);
2970 			else
2971 				(*pr)("  anon backpointer is OK\n");
2972 		} else {
2973 			uobj = pg->uobject;
2974 			if (uobj) {
2975 				(*pr)("  checking object list\n");
2976 				RB_FOREACH(tpg, uvm_objtree, &uobj->memt) {
2977 					if (tpg == pg) {
2978 						break;
2979 					}
2980 				}
2981 				if (tpg)
2982 					(*pr)("  page found on object list\n");
2983 				else
2984 					(*pr)("  >>> PAGE NOT FOUND "
2985 					    "ON OBJECT LIST! <<<\n");
2986 			}
2987 		}
2988 	}
2989 
2990 	/* cross-verify page queue */
2991 	if (pg->pg_flags & PQ_FREE) {
2992 		if (uvm_pmr_isfree(pg))
2993 			(*pr)("  page found in uvm_pmemrange\n");
2994 		else
2995 			(*pr)("  >>> page not found in uvm_pmemrange <<<\n");
2996 		pgl = NULL;
2997 	} else if (pg->pg_flags & PQ_INACTIVE) {
2998 		pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
2999 		    &uvm.page_inactive_swp : &uvm.page_inactive_obj;
3000 	} else if (pg->pg_flags & PQ_ACTIVE) {
3001 		pgl = &uvm.page_active;
3002 	} else {
3003 		pgl = NULL;
3004 	}
3005 
3006 	if (pgl) {
3007 		(*pr)("  checking pageq list\n");
3008 		TAILQ_FOREACH(tpg, pgl, pageq) {
3009 			if (tpg == pg) {
3010 				break;
3011 			}
3012 		}
3013 		if (tpg)
3014 			(*pr)("  page found on pageq list\n");
3015 		else
3016 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
3017 	}
3018 }
3019 #endif
3020 
3021 /*
3022  * uvm_map_protect: change map protection
3023  *
3024  * => set_max means set max_protection.
3025  * => map must be unlocked.
3026  */
3027 int
3028 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3029     vm_prot_t new_prot, boolean_t set_max)
3030 {
3031 	struct vm_map_entry *first, *iter;
3032 	vm_prot_t old_prot;
3033 	vm_prot_t mask;
3034 	int error;
3035 
3036 	if (start > end)
3037 		return EINVAL;
3038 	start = MAX(start, map->min_offset);
3039 	end = MIN(end, map->max_offset);
3040 	if (start >= end)
3041 		return 0;
3042 
3043 	error = 0;
3044 	vm_map_lock(map);
3045 
3046 	/*
3047 	 * Set up first and last.
3048 	 * - first will contain first entry at or after start.
3049 	 */
3050 	first = uvm_map_entrybyaddr(&map->addr, start);
3051 	KDASSERT(first != NULL);
3052 	if (first->end < start)
3053 		first = RB_NEXT(uvm_map_addr, &map->addr, first);
3054 
3055 	/* First, check for protection violations. */
3056 	for (iter = first; iter != NULL && iter->start < end;
3057 	    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
3058 		/* Treat memory holes as free space. */
3059 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3060 			continue;
3061 
3062 		if (UVM_ET_ISSUBMAP(iter)) {
3063 			error = EINVAL;
3064 			goto out;
3065 		}
3066 		if ((new_prot & iter->max_protection) != new_prot) {
3067 			error = EACCES;
3068 			goto out;
3069 		}
3070 		if (map == kernel_map &&
3071 		    (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
3072 			panic("uvm_map_protect: kernel map W^X violation requested");
3073 	}
3074 
3075 	/* Fix protections.  */
3076 	for (iter = first; iter != NULL && iter->start < end;
3077 	    iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) {
3078 		/* Treat memory holes as free space. */
3079 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3080 			continue;
3081 
3082 		old_prot = iter->protection;
3083 
3084 		/*
3085 		 * Skip adapting protection iff old and new protection
3086 		 * are equal.
3087 		 */
3088 		if (set_max) {
3089 			if (old_prot == (new_prot & old_prot) &&
3090 			    iter->max_protection == new_prot)
3091 				continue;
3092 		} else {
3093 			if (old_prot == new_prot)
3094 				continue;
3095 		}
3096 
3097 		UVM_MAP_CLIP_START(map, iter, start);
3098 		UVM_MAP_CLIP_END(map, iter, end);
3099 
3100 		if (set_max) {
3101 			iter->max_protection = new_prot;
3102 			iter->protection &= new_prot;
3103 		} else
3104 			iter->protection = new_prot;
3105 
3106 		/*
3107 		 * update physical map if necessary.  worry about copy-on-write
3108 		 * here -- CHECK THIS XXX
3109 		 */
3110 		if (iter->protection != old_prot) {
3111 			mask = UVM_ET_ISCOPYONWRITE(iter) ?
3112 			    ~PROT_WRITE : PROT_MASK;
3113 
3114 			/* update pmap */
3115 			if ((iter->protection & mask) == PROT_NONE &&
3116 			    VM_MAPENT_ISWIRED(iter)) {
3117 				/*
3118 				 * TODO(ariane) this is stupid. wired_count
3119 				 * is 0 if not wired, otherwise anything
3120 				 * larger than 0 (incremented once each time
3121 				 * wire is called).
3122 				 * It exists mostly to be able to undo the
3123 				 * damage on failure, not to actually be a
3124 				 * wired refcounter...
3125 				 * Originally: iter->wired_count--;
3126 				 * (don't we have to unwire this in the pmap
3127 				 * as well?)
3128 				 */
3129 				iter->wired_count = 0;
3130 			}
3131 			pmap_protect(map->pmap, iter->start, iter->end,
3132 			    iter->protection & mask);
3133 		}
3134 
3135 		/*
3136 		 * If the map is configured to lock any future mappings,
3137 		 * wire this entry now if the old protection was PROT_NONE
3138 		 * and the new protection is not PROT_NONE.
3139 		 */
3140 		if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3141 		    VM_MAPENT_ISWIRED(iter) == 0 &&
3142 		    old_prot == PROT_NONE &&
3143 		    new_prot != PROT_NONE) {
3144 			if (uvm_map_pageable(map, iter->start, iter->end,
3145 			    FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
3146 				/*
3147 				 * If locking the entry fails, remember the
3148 				 * error if it's the first one.  Note we
3149 				 * still continue setting the protection in
3150 				 * the map, but it will return the resource
3151 				 * shortage condition regardless.
3152 				 *
3153 				 * XXX Ignore what the actual error is,
3154 				 * XXX just call it a resource shortage
3155 				 * XXX so that it doesn't get confused
3156 				 * XXX what uvm_map_protect() itself would
3157 				 * XXX normally return.
3158 				 */
3159 				error = ENOMEM;
3160 			}
3161 		}
3162 	}
3163 	pmap_update(map->pmap);
3164 
3165 out:
3166 	vm_map_unlock(map);
3167 	return error;
3168 }
3169 
3170 /*
3171  * uvmspace_alloc: allocate a vmspace structure.
3172  *
3173  * - structure includes vm_map and pmap
3174  * - XXX: no locking on this structure
3175  * - refcnt set to 1, rest must be init'd by caller
3176  */
3177 struct vmspace *
3178 uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
3179     boolean_t remove_holes)
3180 {
3181 	struct vmspace *vm;
3182 
3183 	vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
3184 	uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
3185 	return (vm);
3186 }
3187 
3188 /*
3189  * uvmspace_init: initialize a vmspace structure.
3190  *
3191  * - XXX: no locking on this structure
3192  * - refcnt set to 1, rest must be init'd by caller
3193  */
3194 void
3195 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
3196     boolean_t pageable, boolean_t remove_holes)
3197 {
3198 	KASSERT(pmap == NULL || pmap == pmap_kernel());
3199 
3200 	if (pmap)
3201 		pmap_reference(pmap);
3202 	else
3203 		pmap = pmap_create();
3204 	vm->vm_map.pmap = pmap;
3205 
3206 	uvm_map_setup(&vm->vm_map, min, max,
3207 	    (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
3208 
3209 	vm->vm_refcnt = 1;
3210 
3211 	if (remove_holes)
3212 		pmap_remove_holes(vm);
3213 }
3214 
3215 /*
3216  * uvmspace_share: share a vmspace between two processes
3217  *
3218  * - XXX: no locking on vmspace
3219  * - used for vfork
3220  */
3221 
3222 struct vmspace *
3223 uvmspace_share(struct process *pr)
3224 {
3225 	struct vmspace *vm = pr->ps_vmspace;
3226 
3227 	vm->vm_refcnt++;
3228 	return vm;
3229 }
3230 
3231 /*
3232  * uvmspace_exec: the process wants to exec a new program
3233  *
3234  * - XXX: no locking on vmspace
3235  */
3236 
3237 void
3238 uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
3239 {
3240 	struct process *pr = p->p_p;
3241 	struct vmspace *nvm, *ovm = pr->ps_vmspace;
3242 	struct vm_map *map = &ovm->vm_map;
3243 	struct uvm_map_deadq dead_entries;
3244 
3245 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
3246 	KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
3247 	    (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
3248 
3249 	pmap_unuse_final(p);   /* before stack addresses go away */
3250 	TAILQ_INIT(&dead_entries);
3251 
3252 	/* see if more than one process is using this vmspace...  */
3253 	if (ovm->vm_refcnt == 1) {
3254 		/*
3255 		 * If pr is the only process using its vmspace then
3256 		 * we can safely recycle that vmspace for the program
3257 		 * that is being exec'd.
3258 		 */
3259 
3260 #ifdef SYSVSHM
3261 		/*
3262 		 * SYSV SHM semantics require us to kill all segments on an exec
3263 		 */
3264 		if (ovm->vm_shm)
3265 			shmexit(ovm);
3266 #endif
3267 
3268 		/*
3269 		 * POSIX 1003.1b -- "lock future mappings" is revoked
3270 		 * when a process execs another program image.
3271 		 */
3272 		vm_map_lock(map);
3273 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
3274 
3275 		/*
3276 		 * now unmap the old program
3277 		 *
3278 		 * Instead of attempting to keep the map valid, we simply
3279 		 * nuke all entries and ask uvm_map_setup to reinitialize
3280 		 * the map to the new boundaries.
3281 		 *
3282 		 * uvm_unmap_remove will actually nuke all entries for us
3283 		 * (as in, not replace them with free-memory entries).
3284 		 */
3285 		uvm_unmap_remove(map, map->min_offset, map->max_offset,
3286 		    &dead_entries, TRUE, FALSE);
3287 
3288 		KDASSERT(RB_EMPTY(&map->addr));
3289 
3290 		/* Nuke statistics and boundaries. */
3291 		memset(&ovm->vm_startcopy, 0,
3292 		    (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
3293 
3294 
3295 		if (end & (vaddr_t)PAGE_MASK) {
3296 			end += 1;
3297 			if (end == 0) /* overflow */
3298 				end -= PAGE_SIZE;
3299 		}
3300 
3301 		/* Setup new boundaries and populate map with entries. */
3302 		map->min_offset = start;
3303 		map->max_offset = end;
3304 		uvm_map_setup_entries(map);
3305 		vm_map_unlock(map);
3306 
3307 		/* but keep MMU holes unavailable */
3308 		pmap_remove_holes(ovm);
3309 	} else {
3310 		/*
3311 		 * pr's vmspace is being shared, so we can't reuse
3312 		 * it for pr since it is still being used for others.
3313 		 * allocate a new vmspace for pr
3314 		 */
3315 		nvm = uvmspace_alloc(start, end,
3316 		    (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
3317 
3318 		/* install new vmspace and drop our ref to the old one. */
3319 		pmap_deactivate(p);
3320 		p->p_vmspace = pr->ps_vmspace = nvm;
3321 		pmap_activate(p);
3322 
3323 		uvmspace_free(ovm);
3324 	}
3325 
3326 	/* Release dead entries */
3327 	uvm_unmap_detach(&dead_entries, 0);
3328 }
3329 
3330 /*
3331  * uvmspace_free: free a vmspace data structure
3332  *
3333  * - XXX: no locking on vmspace
3334  */
3335 void
3336 uvmspace_free(struct vmspace *vm)
3337 {
3338 	if (--vm->vm_refcnt == 0) {
3339 		/*
3340 		 * lock the map, to wait out all other references to it.  delete
3341 		 * all of the mappings and pages they hold, then call the pmap
3342 		 * module to reclaim anything left.
3343 		 */
3344 #ifdef SYSVSHM
3345 		/* Get rid of any SYSV shared memory segments. */
3346 		if (vm->vm_shm != NULL)
3347 			shmexit(vm);
3348 #endif
3349 
3350 		uvm_map_teardown(&vm->vm_map);
3351 		pool_put(&uvm_vmspace_pool, vm);
3352 	}
3353 }
3354 
3355 /*
3356  * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
3357  * srcmap to the address range [dstaddr, dstaddr + sz) in
3358  * dstmap.
3359  *
3360  * The whole address range in srcmap must be backed by an object
3361  * (no holes).
3362  *
3363  * If successful, the address ranges share memory and the destination
3364  * address range uses the protection flags in prot.
3365  *
3366  * This routine assumes that sz is a multiple of PAGE_SIZE and
3367  * that dstaddr and srcaddr are page-aligned.
3368  */
3369 int
3370 uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
3371     struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
3372 {
3373 	int ret = 0;
3374 	vaddr_t unmap_end;
3375 	vaddr_t dstva;
3376 	vsize_t off, len, n = sz;
3377 	struct vm_map_entry *first = NULL, *last = NULL;
3378 	struct vm_map_entry *src_entry, *psrc_entry = NULL;
3379 	struct uvm_map_deadq dead;
3380 
3381 	if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
3382 		return EINVAL;
3383 
3384 	TAILQ_INIT(&dead);
3385 	vm_map_lock(dstmap);
3386 	vm_map_lock_read(srcmap);
3387 
3388 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
3389 		ret = ENOMEM;
3390 		goto exit_unlock;
3391 	}
3392 	if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
3393 		ret = EINVAL;
3394 		goto exit_unlock;
3395 	}
3396 
3397 	unmap_end = dstaddr;
3398 	for (; src_entry != NULL;
3399 	    psrc_entry = src_entry,
3400 	    src_entry = RB_NEXT(uvm_map_addr, &srcmap->addr, src_entry)) {
3401 		/* hole in address space, bail out */
3402 		if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
3403 			break;
3404 		if (src_entry->start >= srcaddr + sz)
3405 			break;
3406 
3407 		if (UVM_ET_ISSUBMAP(src_entry))
3408 			panic("uvm_share: encountered a submap (illegal)");
3409 		if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
3410 		    UVM_ET_ISNEEDSCOPY(src_entry))
3411 			panic("uvm_share: non-copy_on_write map entries "
3412 			    "marked needs_copy (illegal)");
3413 
3414 		dstva = dstaddr;
3415 		if (src_entry->start > srcaddr) {
3416 			dstva += src_entry->start - srcaddr;
3417 			off = 0;
3418 		} else
3419 			off = srcaddr - src_entry->start;
3420 
3421 		if (n < src_entry->end - src_entry->start)
3422 			len = n;
3423 		else
3424 			len = src_entry->end - src_entry->start;
3425 		n -= len;
3426 
3427 		if (uvm_mapent_share(dstmap, dstva, len, off, prot, prot,
3428 		    srcmap, src_entry, &dead) == NULL)
3429 			break;
3430 
3431 		unmap_end = dstva + len;
3432 		if (n == 0)
3433 			goto exit_unlock;
3434 	}
3435 
3436 	ret = EINVAL;
3437 	uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
3438 
3439 exit_unlock:
3440 	vm_map_unlock_read(srcmap);
3441 	vm_map_unlock(dstmap);
3442 	uvm_unmap_detach(&dead, 0);
3443 
3444 	return ret;
3445 }
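
/*
 * Example (sketch, not part of the build): sharing a page-aligned,
 * fully mapped region of srcmap read/write into dstmap at dstaddr.
 *
 *	error = uvm_share(dstmap, dstaddr, PROT_READ | PROT_WRITE,
 *	    srcmap, srcaddr, round_page(sz));
 */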
3446 
3447 /*
3448  * Clone map entry into other map.
3449  *
3450  * Mapping will be placed at dstaddr, for the same length.
3451  * Space must be available.
3452  * Reference counters are incremented.
3453  */
3454 struct vm_map_entry *
3455 uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3456     vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
3457     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
3458     int mapent_flags, int amap_share_flags)
3459 {
3460 	struct vm_map_entry *new_entry, *first, *last;
3461 
3462 	KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
3463 
3464 	/* Create new entry (linked in on creation). Fill in first, last. */
3465 	first = last = NULL;
3466 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
3467 		panic("uvmspace_fork: no space in map for "
3468 		    "entry in empty map");
3469 	}
3470 	new_entry = uvm_map_mkentry(dstmap, first, last,
3471 	    dstaddr, dstlen, mapent_flags, dead, NULL);
3472 	if (new_entry == NULL)
3473 		return NULL;
3474 	/* old_entry -> new_entry */
3475 	new_entry->object = old_entry->object;
3476 	new_entry->offset = old_entry->offset;
3477 	new_entry->aref = old_entry->aref;
3478 	new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
3479 	new_entry->protection = prot;
3480 	new_entry->max_protection = maxprot;
3481 	new_entry->inheritance = old_entry->inheritance;
3482 	new_entry->advice = old_entry->advice;
3483 
3484 	/* gain reference to object backing the map (can't be a submap). */
3485 	if (new_entry->aref.ar_amap) {
3486 		new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
3487 		amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3488 		    (new_entry->end - new_entry->start) >> PAGE_SHIFT,
3489 		    amap_share_flags);
3490 	}
3491 
3492 	if (UVM_ET_ISOBJ(new_entry) &&
3493 	    new_entry->object.uvm_obj->pgops->pgo_reference) {
3494 		new_entry->offset += off;
3495 		new_entry->object.uvm_obj->pgops->pgo_reference
3496 		    (new_entry->object.uvm_obj);
3497 	}
3498 
3499 	return new_entry;
3500 }
3501 
3502 struct vm_map_entry *
3503 uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3504     vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
3505     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3506 {
3507 	/*
3508 	 * If old_entry refers to a copy-on-write region that has not yet been
3509 	 * written to (needs_copy flag is set), then we need to allocate a new
3510 	 * amap for old_entry.
3511 	 *
3512 	 * If we do not do this, and the process owning old_entry does a copy-on
3513 	 * write later, old_entry and new_entry will refer to different memory
3514 	 * regions, and the memory between the processes is no longer shared.
3515 	 *
3516 	 * [in other words, we need to clear needs_copy]
3517 	 */
3518 
3519 	if (UVM_ET_ISNEEDSCOPY(old_entry)) {
3520 		/* get our own amap, clears needs_copy */
3521 		amap_copy(old_map, old_entry, M_WAITOK, FALSE,
3522 		    0, 0);
3523 		/* XXXCDC: WAITOK??? */
3524 	}
3525 
3526 	return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
3527 	    prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
3528 }
3529 
3530 /*
3531  * share the mapping: this means we want the old and
3532  * new entries to share amaps and backing objects.
3533  */
3534 struct vm_map_entry *
3535 uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
3536     struct vm_map *old_map,
3537     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3538 {
3539 	struct vm_map_entry *new_entry;
3540 
3541 	new_entry = uvm_mapent_share(new_map, old_entry->start,
3542 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3543 	    old_entry->max_protection, old_map, old_entry, dead);
3544 
3545 	/*
3546 	 * pmap_copy the mappings: this routine is optional
3547 	 * but if it is there it will reduce the number of
3548 	 * page faults in the new proc.
3549 	 */
3550 	if (!UVM_ET_ISHOLE(new_entry))
3551 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
3552 		    (new_entry->end - new_entry->start), new_entry->start);
3553 
3554 	return (new_entry);
3555 }
3556 
3557 /*
3558  * copy-on-write the mapping (using mmap's
3559  * MAP_PRIVATE semantics)
3560  *
3561  * allocate new_entry, adjust reference counts.
3562  * (note that new references are read-only).
3563  */
3564 struct vm_map_entry *
3565 uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
3566     struct vm_map *old_map,
3567     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3568 {
3569 	struct vm_map_entry	*new_entry;
3570 	boolean_t		 protect_child;
3571 
3572 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3573 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3574 	    old_entry->max_protection, old_entry, dead, 0, 0);
3575 
3576 	new_entry->etype |=
3577 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3578 
3579 	/*
3580 	 * the new entry will need an amap.  it will either
3581 	 * need to be copied from the old entry or created
3582 	 * from scratch (if the old entry does not have an
3583 	 * amap).  can we defer this process until later
3584 	 * (by setting "needs_copy") or do we need to copy
3585 	 * the amap now?
3586 	 *
3587 	 * we must copy the amap now if any of the following
3588 	 * conditions hold:
3589 	 * 1. the old entry has an amap and that amap is
3590 	 *    being shared.  this means that the old (parent)
3591 	 *    process is sharing the amap with another
3592 	 *    process.  if we do not clear needs_copy here
3593 	 *    we will end up in a situation where both the
3594 	 *    parent and child process are referring to the
3595 	 *    same amap with "needs_copy" set.  if the
3596 	 *    parent write-faults, the fault routine will
3597 	 *    clear "needs_copy" in the parent by allocating
3598 	 *    a new amap.   this is wrong because the
3599 	 *    parent is supposed to be sharing the old amap
3600 	 *    and the new amap will break that.
3601 	 *
3602 	 * 2. if the old entry has an amap and a non-zero
3603 	 *    wire count then we are going to have to call
3604 	 *    amap_cow_now to avoid page faults in the
3605 	 *    parent process.   since amap_cow_now requires
3606 	 *    "needs_copy" to be clear we might as well
3607 	 *    clear it here as well.
3608 	 *
3609 	 */
3610 	if (old_entry->aref.ar_amap != NULL &&
3611 	    ((amap_flags(old_entry->aref.ar_amap) &
3612 	    AMAP_SHARED) != 0 ||
3613 	    VM_MAPENT_ISWIRED(old_entry))) {
3614 		amap_copy(new_map, new_entry, M_WAITOK, FALSE,
3615 		    0, 0);
3616 		/* XXXCDC: M_WAITOK ... ok? */
3617 	}
3618 
3619 	/*
3620 	 * if the parent's entry is wired down, then the
3621 	 * parent process does not want page faults on
3622 	 * access to that memory.  this means that we
3623 	 * cannot do copy-on-write because we can't write
3624 	 * protect the old entry.   in this case we
3625 	 * resolve all copy-on-write faults now, using
3626 	 * amap_cow_now.   note that we have already
3627 	 * allocated any needed amap (above).
3628 	 */
3629 	if (VM_MAPENT_ISWIRED(old_entry)) {
3630 		/*
3631 		 * resolve all copy-on-write faults now
3632 		 * (note that there is nothing to do if
3633 		 * the old mapping does not have an amap).
3634 		 * XXX: is it worthwhile to bother with
3635 		 * pmap_copy in this case?
3636 		 */
3637 		if (old_entry->aref.ar_amap)
3638 			amap_cow_now(new_map, new_entry);
3639 	} else {
3640 		if (old_entry->aref.ar_amap) {
3641 			/*
3642 			 * setup mappings to trigger copy-on-write faults
3643 			 * we must write-protect the parent if it has
3644 			 * an amap and it is not already "needs_copy"...
3645 			 * if it is already "needs_copy" then the parent
3646 			 * has already been write-protected by a previous
3647 			 * fork operation.
3648 			 *
3649 			 * if we do not write-protect the parent, then
3650 			 * we must be sure to write-protect the child
3651 			 * after the pmap_copy() operation.
3652 			 *
3653 			 * XXX: pmap_copy should have some way of telling
3654 			 * us that it didn't do anything so we can avoid
3655 			 * calling pmap_protect needlessly.
3656 			 */
3657 			if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
3658 				if (old_entry->max_protection & PROT_WRITE) {
3659 					pmap_protect(old_map->pmap,
3660 					    old_entry->start,
3661 					    old_entry->end,
3662 					    old_entry->protection &
3663 					    ~PROT_WRITE);
3664 					pmap_update(old_map->pmap);
3665 				}
3666 				old_entry->etype |= UVM_ET_NEEDSCOPY;
3667 			}
3668 
3669 			/* parent must now be write-protected */
3670 			protect_child = FALSE;
3671 		} else {
3672 			/*
3673 			 * we only need to protect the child if the
3674 			 * parent has write access.
3675 			 */
3676 			if (old_entry->max_protection & PROT_WRITE)
3677 				protect_child = TRUE;
3678 			else
3679 				protect_child = FALSE;
3680 		}
3681 		/*
3682 		 * copy the mappings
3683 		 * XXX: need a way to tell if this does anything
3684 		 */
3685 		if (!UVM_ET_ISHOLE(new_entry))
3686 			pmap_copy(new_map->pmap, old_map->pmap,
3687 			    new_entry->start,
3688 			    (old_entry->end - old_entry->start),
3689 			    old_entry->start);
3690 
3691 		/* protect the child's mappings if necessary */
3692 		if (protect_child) {
3693 			pmap_protect(new_map->pmap, new_entry->start,
3694 			    new_entry->end,
3695 			    new_entry->protection &
3696 			    ~PROT_WRITE);
3697 		}
3698 	}
3699 
3700 	return (new_entry);
3701 }
3702 
3703 /*
3704  * zero the mapping: the new entry will be zero initialized
3705  */
3706 struct vm_map_entry *
3707 uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
3708     struct vm_map *old_map,
3709     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3710 {
3711 	struct vm_map_entry *new_entry;
3712 
3713 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3714 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3715 	    old_entry->max_protection, old_entry, dead, 0, 0);
3716 
3717 	new_entry->etype |=
3718 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3719 
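	/*
	 * Drop the amap and backing object below, so the child's range is
	 * backed by nothing; later faults will fill it with fresh zero pages.
	 */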
3720 	if (new_entry->aref.ar_amap) {
3721 		amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3722 		    atop(new_entry->end - new_entry->start), 0);
3723 		new_entry->aref.ar_amap = NULL;
3724 		new_entry->aref.ar_pageoff = 0;
3725 	}
3726 
3727 	if (UVM_ET_ISOBJ(new_entry)) {
3728 		if (new_entry->object.uvm_obj->pgops->pgo_detach)
3729 			new_entry->object.uvm_obj->pgops->pgo_detach(
3730 			    new_entry->object.uvm_obj);
3731 		new_entry->object.uvm_obj = NULL;
3732 		new_entry->etype &= ~UVM_ET_OBJ;
3733 	}
3734 
3735 	return (new_entry);
3736 }
3737 
3738 /*
3739  * uvmspace_fork: fork a process' main map
3740  *
3741  * => create a new vmspace for child process from parent.
3742  * => parent's map must not be locked.
3743  */
3744 struct vmspace *
3745 uvmspace_fork(struct process *pr)
3746 {
3747 	struct vmspace *vm1 = pr->ps_vmspace;
3748 	struct vmspace *vm2;
3749 	struct vm_map *old_map = &vm1->vm_map;
3750 	struct vm_map *new_map;
3751 	struct vm_map_entry *old_entry, *new_entry;
3752 	struct uvm_map_deadq dead;
3753 
3754 	vm_map_lock(old_map);
3755 
3756 	vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
3757 	    (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
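	/*
	 * Copy the inheritable vmspace statistics (the fields from
	 * vm_startcopy up to the end of the structure) from parent to
	 * child in one go; vm_dused is recomputed below.
	 */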
3758 	memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
3759 	    (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
3760 	vm2->vm_dused = 0; /* Statistic managed by us. */
3761 	new_map = &vm2->vm_map;
3762 	vm_map_lock(new_map);
3763 
3764 	/* go entry-by-entry */
3765 	TAILQ_INIT(&dead);
3766 	RB_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
3767 		if (old_entry->start == old_entry->end)
3768 			continue;
3769 
3770 		/* first, some sanity checks on the old entry */
3771 		if (UVM_ET_ISSUBMAP(old_entry)) {
3772 			panic("fork: encountered a submap during fork "
3773 			    "(illegal)");
3774 		}
3775 
3776 		if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
3777 		    UVM_ET_ISNEEDSCOPY(old_entry)) {
3778 			panic("fork: non-copy_on_write map entry marked "
3779 			    "needs_copy (illegal)");
3780 		}
3781 
3782 		/* Apply inheritance. */
3783 		switch (old_entry->inheritance) {
3784 		case MAP_INHERIT_SHARE:
3785 			new_entry = uvm_mapent_forkshared(vm2, new_map,
3786 			    old_map, old_entry, &dead);
3787 			break;
3788 		case MAP_INHERIT_COPY:
3789 			new_entry = uvm_mapent_forkcopy(vm2, new_map,
3790 			    old_map, old_entry, &dead);
3791 			break;
3792 		case MAP_INHERIT_ZERO:
3793 			new_entry = uvm_mapent_forkzero(vm2, new_map,
3794 			    old_map, old_entry, &dead);
3795 			break;
3796 		default:
3797 			continue;
3798 		}
3799 
3800 		/* Update process statistics. */
3801 		if (!UVM_ET_ISHOLE(new_entry))
3802 			new_map->size += new_entry->end - new_entry->start;
3803 		if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) {
3804 			vm2->vm_dused += uvmspace_dused(
3805 			    new_map, new_entry->start, new_entry->end);
3806 		}
3807 	}
3808 
3809 	vm_map_unlock(old_map);
3810 	vm_map_unlock(new_map);
3811 
3812 	/*
3813 	 * This can actually happen if multiple entries described a
3814 	 * space in which an entry was inherited.
3815 	 */
3816 	uvm_unmap_detach(&dead, 0);
3817 
3818 #ifdef SYSVSHM
3819 	if (vm1->vm_shm)
3820 		shmfork(vm1, vm2);
3821 #endif
3822 
3823 	return vm2;
3824 }
3825 
3826 /*
3827  * uvm_map_hint: return the beginning of the best area suitable for
3828  * creating a new mapping with "prot" protection.
3829  */
3830 vaddr_t
3831 uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
3832     vaddr_t maxaddr)
3833 {
3834 	vaddr_t addr;
3835 	vaddr_t spacing;
3836 
3837 #ifdef __i386__
3838 	/*
3839 	 * If executable, skip the first two pages; otherwise start
3840 	 * after the data + heap region.
3841 	 */
3842 	if ((prot & PROT_EXEC) != 0 &&
3843 	    (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
3844 		addr = (PAGE_SIZE*2) +
3845 		    (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
3846 		return (round_page(addr));
3847 	}
3848 #endif
3849 
3850 #if defined (__LP64__)
3851 	spacing = (MIN((4UL * 1024 * 1024 * 1024), BRKSIZ) - 1);
3852 #else
3853 	spacing = (MIN((256 * 1024 * 1024), BRKSIZ) - 1);
3854 #endif
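	/*
	 * spacing is used as a mask for arc4random() below, so it is one
	 * less than a power of two (assuming BRKSIZ is a power of two).
	 */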
3855 
3856 	addr = (vaddr_t)vm->vm_daddr;
3857 	/*
3858 	 * Start malloc/mmap after the brk.
3859 	 * If the random spacing area has been used up,
3860 	 * the brk area becomes fair game for mmap as well.
3861 	 */
3862 	if (vm->vm_dused < spacing >> PAGE_SHIFT)
3863 		addr += BRKSIZ;
3864 	if (addr < maxaddr) {
3865 		while (spacing > maxaddr - addr)
3866 			spacing >>= 1;
3867 	}
3868 	addr += arc4random() & spacing;
3869 	return (round_page(addr));
3870 }
3871 
3872 /*
3873  * uvm_map_submap: punch down part of a map into a submap
3874  *
3875  * => only the kernel_map is allowed to be submapped
3876  * => the purpose of submapping is to break up the locking granularity
3877  *	of a larger map
3878  * => the range specified must have been mapped previously with a uvm_map()
3879  *	call [with uobj==NULL] to create a blank map entry in the main map.
3880  *	[And it had better still be blank!]
3881  * => maps which contain submaps should never be copied or forked.
3882  * => to remove a submap, use uvm_unmap() on the main map
3883  *	and then uvm_map_deallocate() the submap.
3884  * => main map must be unlocked.
3885  * => submap must have been init'd and have a zero reference count.
3886  *	[need not be locked as we don't actually reference it]
3887  */
3888 int
3889 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
3890     struct vm_map *submap)
3891 {
3892 	struct vm_map_entry *entry;
3893 	int result;
3894 
3895 	if (start > map->max_offset || end > map->max_offset ||
3896 	    start < map->min_offset || end < map->min_offset)
3897 		return EINVAL;
3898 
3899 	vm_map_lock(map);
3900 
3901 	if (uvm_map_lookup_entry(map, start, &entry)) {
3902 		UVM_MAP_CLIP_START(map, entry, start);
3903 		UVM_MAP_CLIP_END(map, entry, end);
3904 	} else
3905 		entry = NULL;
3906 
3907 	if (entry != NULL &&
3908 	    entry->start == start && entry->end == end &&
3909 	    entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
3910 	    !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
3911 		entry->etype |= UVM_ET_SUBMAP;
3912 		entry->object.sub_map = submap;
3913 		entry->offset = 0;
3914 		uvm_map_reference(submap);
3915 		result = 0;
3916 	} else
3917 		result = EINVAL;
3918 
3919 	vm_map_unlock(map);
3920 	return (result);
3921 }
3922 
3923 /*
3924  * uvm_map_checkprot: check protection in map
3925  *
3926  * => must allow specific protection in a fully allocated region.
3927  * => map must be read or write locked by caller.
3928  */
3929 boolean_t
3930 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
3931     vm_prot_t protection)
3932 {
3933 	struct vm_map_entry *entry;
3934 
3935 	if (start < map->min_offset || end > map->max_offset || start > end)
3936 		return FALSE;
3937 	if (start == end)
3938 		return TRUE;
3939 
3940 	/*
3941 	 * Iterate entries.
3942 	 */
3943 	for (entry = uvm_map_entrybyaddr(&map->addr, start);
3944 	    entry != NULL && entry->start < end;
3945 	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
3946 		/* Fail if a hole is found. */
3947 		if (UVM_ET_ISHOLE(entry) ||
3948 		    (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
3949 			return FALSE;
3950 
3951 		/* Check protection. */
3952 		if ((entry->protection & protection) != protection)
3953 			return FALSE;
3954 	}
3955 	return TRUE;
3956 }
3957 
3958 /*
3959  * uvm_map_create: create map
3960  */
3961 vm_map_t
3962 uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
3963 {
3964 	vm_map_t map;
3965 
3966 	map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
3967 	map->pmap = pmap;
3968 	uvm_map_setup(map, min, max, flags);
3969 	return (map);
3970 }
3971 
3972 /*
3973  * uvm_map_deallocate: drop reference to a map
3974  *
3975  * => caller must not lock map
3976  * => we will zap map if ref count goes to zero
3977  */
3978 void
3979 uvm_map_deallocate(vm_map_t map)
3980 {
3981 	int c;
3982 	struct uvm_map_deadq dead;
3983 
3984 	c = --map->ref_count;
3985 	if (c > 0) {
3986 		return;
3987 	}
3988 
3989 	/*
3990 	 * all references gone.   unmap and free.
3991 	 *
3992 	 * No lock required: we are the only one to access this map.
3993 	 */
3994 	TAILQ_INIT(&dead);
3995 	uvm_tree_sanity(map, __FILE__, __LINE__);
3996 	uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
3997 	    TRUE, FALSE);
3998 	pmap_destroy(map->pmap);
3999 	KASSERT(RB_EMPTY(&map->addr));
4000 	free(map, M_VMMAP, sizeof *map);
4001 
4002 	uvm_unmap_detach(&dead, 0);
4003 }
4004 
4005 /*
4006  * uvm_map_inherit: set inheritance code for range of addrs in map.
4007  *
4008  * => map must be unlocked
4009  * => note that the inherit code is used during a "fork".  see fork
4010  *	code for details.
4011  */
4012 int
4013 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
4014     vm_inherit_t new_inheritance)
4015 {
4016 	struct vm_map_entry *entry;
4017 
4018 	switch (new_inheritance) {
4019 	case MAP_INHERIT_NONE:
4020 	case MAP_INHERIT_COPY:
4021 	case MAP_INHERIT_SHARE:
4022 	case MAP_INHERIT_ZERO:
4023 		break;
4024 	default:
4025 		return (EINVAL);
4026 	}
4027 
4028 	if (start > end)
4029 		return EINVAL;
4030 	start = MAX(start, map->min_offset);
4031 	end = MIN(end, map->max_offset);
4032 	if (start >= end)
4033 		return 0;
4034 
4035 	vm_map_lock(map);
4036 
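	/*
	 * Find the first affected entry: clip it at start if its mapped
	 * part overlaps start, otherwise skip ahead to the next entry.
	 */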
4037 	entry = uvm_map_entrybyaddr(&map->addr, start);
4038 	if (entry->end > start)
4039 		UVM_MAP_CLIP_START(map, entry, start);
4040 	else
4041 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4042 
4043 	while (entry != NULL && entry->start < end) {
4044 		UVM_MAP_CLIP_END(map, entry, end);
4045 		entry->inheritance = new_inheritance;
4046 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4047 	}
4048 
4049 	vm_map_unlock(map);
4050 	return (0);
4051 }
4052 
4053 /*
4054  * uvm_map_advice: set advice code for range of addrs in map.
4055  *
4056  * => map must be unlocked
4057  */
4058 int
4059 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
4060 {
4061 	struct vm_map_entry *entry;
4062 
4063 	switch (new_advice) {
4064 	case MADV_NORMAL:
4065 	case MADV_RANDOM:
4066 	case MADV_SEQUENTIAL:
4067 		break;
4068 	default:
4069 		return (EINVAL);
4070 	}
4071 
4072 	if (start > end)
4073 		return EINVAL;
4074 	start = MAX(start, map->min_offset);
4075 	end = MIN(end, map->max_offset);
4076 	if (start >= end)
4077 		return 0;
4078 
4079 	vm_map_lock(map);
4080 
4081 	entry = uvm_map_entrybyaddr(&map->addr, start);
4082 	if (entry != NULL && entry->end > start)
4083 		UVM_MAP_CLIP_START(map, entry, start);
4084 	else if (entry != NULL)
4085 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4086 
4087 	/*
4088 	 * XXXJRT: disallow holes?
4089 	 */
4090 	while (entry != NULL && entry->start < end) {
4091 		UVM_MAP_CLIP_END(map, entry, end);
4092 		entry->advice = new_advice;
4093 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4094 	}
4095 
4096 	vm_map_unlock(map);
4097 	return (0);
4098 }
4099 
4100 /*
4101  * uvm_map_extract: extract a mapping from a map and put it somewhere
4102  * in the kernel_map, setting protection to max_prot.
4103  *
4104  * => map should be unlocked (we will write lock it and kernel_map)
4105  * => returns 0 on success, error code otherwise
4106  * => start must be page aligned
4107  * => len must be page sized
4108  * => flags:
4109  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
4110  * Mappings are QREF's.
4111  */
4112 int
4113 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
4114     vaddr_t *dstaddrp, int flags)
4115 {
4116 	struct uvm_map_deadq dead;
4117 	struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
4118 	vaddr_t dstaddr;
4119 	vaddr_t end;
4120 	vaddr_t cp_start;
4121 	vsize_t cp_len, cp_off;
4122 	int error;
4123 
4124 	TAILQ_INIT(&dead);
4125 	end = start + len;
4126 
4127 	/*
4128 	 * Sanity check on the parameters.
4129 	 * Also, since the extracted range may not contain gaps, error out
4130 	 * if the area is not fully inside the source map.
4131 	 */
4132 	if ((start & (vaddr_t)PAGE_MASK) != 0 ||
4133 	    (end & (vaddr_t)PAGE_MASK) != 0 || end < start)
4134 		return EINVAL;
4135 	if (start < srcmap->min_offset || end > srcmap->max_offset)
4136 		return EINVAL;
4137 
4138 	/* Handle the trivial len == 0 case: nothing to extract. */
4139 	if (len == 0)
4140 		return 0;
4141 
4142 	/* Acquire lock on srcmap. */
4143 	vm_map_lock(srcmap);
4144 
4145 	/* Look up the first entry in <start,len>. */
4146 	first = uvm_map_entrybyaddr(&srcmap->addr, start);
4147 
4148 	/* Check that the range is contiguous. */
4149 	for (entry = first; entry != NULL && entry->end < end;
4150 	    entry = RB_NEXT(uvm_map_addr, &srcmap->addr, entry)) {
4151 		if (VMMAP_FREE_END(entry) != entry->end ||
4152 		    UVM_ET_ISHOLE(entry)) {
4153 			error = EINVAL;
4154 			goto fail;
4155 		}
4156 	}
4157 	if (entry == NULL || UVM_ET_ISHOLE(entry)) {
4158 		error = EINVAL;
4159 		goto fail;
4160 	}
4161 
4162 	/*
4163 	 * Handle the needs-copy flag: resolve it now for every entry in
4164 	 * the range, so the clones created below never start out with
4165 	 * needs_copy set.
4166 	 *
4167 	 * amap_copy is called with M_NOWAIT; failure is reported as ENOMEM.
4168 	 */
4169 	for (entry = first; entry != NULL && entry->start < end;
4170 	    entry = RB_NEXT(uvm_map_addr, &srcmap->addr, entry)) {
4171 		if (UVM_ET_ISNEEDSCOPY(entry))
4172 			amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end);
4173 		if (UVM_ET_ISNEEDSCOPY(entry)) {
4174 			/*
4175 			 * amap_copy failure
4176 			 */
4177 			error = ENOMEM;
4178 			goto fail;
4179 		}
4180 	}
4181 
4182 	/* Lock destination map (kernel_map). */
4183 	vm_map_lock(kernel_map);
4184 
4185 	if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
4186 	    MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
4187 	    PROT_NONE, 0) != 0) {
4188 		error = ENOMEM;
4189 		goto fail2;
4190 	}
4191 	*dstaddrp = dstaddr;
4192 
4193 	/*
4194 	 * We now have srcmap and kernel_map locked.
4195 	 * dstaddr contains the destination address in kernel_map.
4196 	 */
4197 	/* step 1: start looping through map entries, performing extraction. */
4198 	for (entry = first; entry != NULL && entry->start < end;
4199 	    entry = RB_NEXT(uvm_map_addr, &srcmap->addr, entry)) {
4200 		KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
4201 		if (UVM_ET_ISHOLE(entry))
4202 			continue;
4203 
4204 		/* Calculate uvm_mapent_clone parameters. */
4205 		cp_start = entry->start;
4206 		if (cp_start < start) {
4207 			cp_off = start - cp_start;
4208 			cp_start = start;
4209 		} else
4210 			cp_off = 0;
4211 		cp_len = MIN(entry->end, end) - cp_start;
4212 
4213 		newentry = uvm_mapent_clone(kernel_map,
4214 		    cp_start - start + dstaddr, cp_len, cp_off,
4215 		    entry->protection, entry->max_protection,
4216 		    entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
4217 		if (newentry == NULL) {
4218 			error = ENOMEM;
4219 			goto fail2_unmap;
4220 		}
4221 		kernel_map->size += cp_len;
4222 		if (flags & UVM_EXTRACT_FIXPROT)
4223 			newentry->protection = newentry->max_protection;
4224 
4225 		/*
4226 		 * Step 2: perform pmap copy.
4227 		 * (Doing this in the loop saves one RB traversal.)
4228 		 */
4229 		pmap_copy(kernel_map->pmap, srcmap->pmap,
4230 		    cp_start - start + dstaddr, cp_len, cp_start);
4231 	}
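	/* Commit pmap operations deferred by the pmap_copy() calls above. */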
4232 	pmap_update(kernel_map->pmap);
4233 
4234 	error = 0;
4235 
4236 	/* Unmap copied entries on failure. */
4237 fail2_unmap:
4238 	if (error) {
4239 		uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
4240 		    FALSE, TRUE);
4241 	}
4242 
4243 	/* Release maps, release dead entries. */
4244 fail2:
4245 	vm_map_unlock(kernel_map);
4246 
4247 fail:
4248 	vm_map_unlock(srcmap);
4249 
4250 	uvm_unmap_detach(&dead, 0);
4251 
4252 	return error;
4253 }
4254 
4255 /*
4256  * uvm_map_clean: clean out a map range
4257  *
4258  * => valid flags:
4259  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
4260  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
4261  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
4262  *   if (flags & PGO_FREE): any cached pages are freed after clean
4263  * => returns an error if any part of the specified range isn't mapped
4264  * => never a need to flush amap layer since the anonymous memory has
4265  *	no permanent home, but may deactivate pages there
4266  * => called from sys_msync() and sys_madvise()
4267  * => caller must not write-lock map (read OK).
4268  * => we may sleep while cleaning if SYNCIO [with map read-locked]
4269  */
4270 
4271 int
4272 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4273 {
4274 	struct vm_map_entry *first, *entry;
4275 	struct vm_amap *amap;
4276 	struct vm_anon *anon;
4277 	struct vm_page *pg;
4278 	struct uvm_object *uobj;
4279 	vaddr_t cp_start, cp_end;
4280 	int refs;
4281 	int error;
4282 	boolean_t rv;
4283 
4284 	KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
4285 	    (PGO_FREE|PGO_DEACTIVATE));
4286 
4287 	if (start > end || start < map->min_offset || end > map->max_offset)
4288 		return EINVAL;
4289 
4290 	vm_map_lock_read(map);
4291 	first = uvm_map_entrybyaddr(&map->addr, start);
4292 
4293 	/* Make a first pass to check for holes. */
4294 	for (entry = first; entry != NULL && entry->start < end;
4295 	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
4296 		if (UVM_ET_ISSUBMAP(entry)) {
4297 			vm_map_unlock_read(map);
4298 			return EINVAL;
4299 		}
4300 		if (UVM_ET_ISSUBMAP(entry) ||
4301 		    UVM_ET_ISHOLE(entry) ||
4302 		    (entry->end < end &&
4303 		    VMMAP_FREE_END(entry) != entry->end)) {
4304 			vm_map_unlock_read(map);
4305 			return EFAULT;
4306 		}
4307 	}
4308 
4309 	error = 0;
4310 	for (entry = first; entry != NULL && entry->start < end;
4311 	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
4312 		amap = entry->aref.ar_amap;	/* top layer */
4313 		if (UVM_ET_ISOBJ(entry))
4314 			uobj = entry->object.uvm_obj;
4315 		else
4316 			uobj = NULL;
4317 
4318 		/*
4319 		 * No amap cleaning necessary if:
4320 		 *  - there's no amap
4321 		 *  - we're not deactivating or freeing pages.
4322 		 */
4323 		if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4324 			goto flush_object;
4325 
4326 		cp_start = MAX(entry->start, start);
4327 		cp_end = MIN(entry->end, end);
4328 
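		/*
		 * Walk the amap one page at a time over the clipped range,
		 * deactivating or freeing resident anon pages as requested.
		 */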
4329 		for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
4330 			anon = amap_lookup(&entry->aref,
4331 			    cp_start - entry->start);
4332 			if (anon == NULL)
4333 				continue;
4334 
4335 			pg = anon->an_page;
4336 			if (pg == NULL) {
4337 				continue;
4338 			}
4339 			KASSERT(pg->pg_flags & PQ_ANON);
4340 
4341 			switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4342 			/*
4343 			 * XXX In these first 3 cases, we always just
4344 			 * XXX deactivate the page.  We may want to
4345 			 * XXX handle the different cases more
4346 			 * XXX specifically, in the future.
4347 			 */
4348 			case PGO_CLEANIT|PGO_FREE:
4349 			case PGO_CLEANIT|PGO_DEACTIVATE:
4350 			case PGO_DEACTIVATE:
4351 deactivate_it:
4352 				/* skip the page if it's wired */
4353 				if (pg->wire_count != 0)
4354 					break;
4355 
4356 				uvm_lock_pageq();
4357 
4358 				KASSERT(pg->uanon == anon);
4359 
4360 				/* zap all mappings for the page. */
4361 				pmap_page_protect(pg, PROT_NONE);
4362 
4363 				/* ...and deactivate the page. */
4364 				uvm_pagedeactivate(pg);
4365 
4366 				uvm_unlock_pageq();
4367 				break;
4368 			case PGO_FREE:
4369 				/*
4370 				 * If there are multiple references to
4371 				 * the amap, just deactivate the page.
4372 				 */
4373 				if (amap_refs(amap) > 1)
4374 					goto deactivate_it;
4375 
4376 				/* XXX skip the page if it's wired */
4377 				if (pg->wire_count != 0) {
4378 					break;
4379 				}
4380 				amap_unadd(&entry->aref,
4381 				    cp_start - entry->start);
4382 				refs = --anon->an_ref;
4383 				if (refs == 0)
4384 					uvm_anfree(anon);
4385 				break;
4386 			default:
4387 				panic("uvm_map_clean: weird flags");
4388 			}
4389 		}
4390 
4391 flush_object:
4392 		cp_start = MAX(entry->start, start);
4393 		cp_end = MIN(entry->end, end);
4394 
4395 		/*
4396 		 * flush pages if we've got a valid backing object.
4397 		 *
4398 		 * Don't PGO_FREE if we don't have write permission
4399 		 * and don't flush if this is a copy-on-write object
4400 		 * since we can't know our permissions on it.
4401 		 */
4402 		if (uobj != NULL &&
4403 		    ((flags & PGO_FREE) == 0 ||
4404 		     ((entry->max_protection & PROT_WRITE) != 0 &&
4405 		      (entry->etype & UVM_ET_COPYONWRITE) == 0))) {
4406 			rv = uobj->pgops->pgo_flush(uobj,
4407 			    cp_start - entry->start + entry->offset,
4408 			    cp_end - entry->start + entry->offset, flags);
4409 
4410 			if (rv == FALSE)
4411 				error = EFAULT;
4412 		}
4413 	}
4414 
4415 	vm_map_unlock_read(map);
4416 	return error;
4417 }
4418 
4419 /*
4420  * UVM_MAP_CLIP_END implementation
4421  */
4422 void
4423 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4424 {
4425 	struct vm_map_entry *tmp;
4426 
4427 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4428 	tmp = uvm_mapent_alloc(map, 0);
4429 
4430 	/* Invoke splitentry. */
4431 	uvm_map_splitentry(map, entry, tmp, addr);
4432 }
4433 
4434 /*
4435  * UVM_MAP_CLIP_START implementation
4436  *
4437  * Clippers are required to not change the pointers to the entry they are
4438  * clipping on.
4439  * Since uvm_map_splitentry turns the original entry into the lowest
4440  * entry (address-wise), we swap the new entry and the original entry
4441  * before calling uvm_map_splitentry.
4442  */
4443 void
4444 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4445 {
4446 	struct vm_map_entry *tmp;
4447 	struct uvm_addr_state *free;
4448 
4449 	/* Unlink original. */
4450 	free = uvm_map_uaddr_e(map, entry);
4451 	uvm_mapent_free_remove(map, free, entry);
4452 	uvm_mapent_addr_remove(map, entry);
4453 
4454 	/* Copy entry. */
4455 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4456 	tmp = uvm_mapent_alloc(map, 0);
4457 	uvm_mapent_copy(entry, tmp);
4458 
4459 	/* Put new entry in place of original entry. */
4460 	uvm_mapent_addr_insert(map, tmp);
4461 	uvm_mapent_free_insert(map, free, tmp);
4462 
4463 	/* Invoke splitentry. */
4464 	uvm_map_splitentry(map, tmp, entry, addr);
4465 }
4466 
4467 /*
4468  * Boundary fixer: clamp max down to bound when (min, max) crosses it.
4469  */
4470 static __inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
4471 static __inline vaddr_t
4472 uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
4473 {
4474 	return (min < bound && max > bound) ? bound : max;
4475 }
4476 
4477 /*
4478  * Choose free list based on address at start of free space.
4479  *
4480  * The uvm_addr_state returned contains addr and is the first of:
4481  * - uaddr_exe
4482  * - uaddr_brk_stack
4483  * - uaddr_any
4484  */
4485 struct uvm_addr_state*
4486 uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
4487 {
4488 	struct uvm_addr_state *uaddr;
4489 	int i;
4490 
4491 	/* Special case the first page, to prevent mmap from returning 0. */
4492 	if (addr < VMMAP_MIN_ADDR)
4493 		return NULL;
4494 
4495 	/* Upper bound for kernel maps at uvm_maxkaddr. */
4496 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
4497 		if (addr >= uvm_maxkaddr)
4498 			return NULL;
4499 	}
4500 
4501 	/* Is the address inside the exe-only map? */
4502 	if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
4503 	    addr < map->uaddr_exe->uaddr_maxaddr)
4504 		return map->uaddr_exe;
4505 
4506 	/* Check if the space falls inside brk/stack area. */
4507 	if ((addr >= map->b_start && addr < map->b_end) ||
4508 	    (addr >= map->s_start && addr < map->s_end)) {
4509 		if (map->uaddr_brk_stack != NULL &&
4510 		    addr >= map->uaddr_brk_stack->uaddr_minaddr &&
4511 		    addr < map->uaddr_brk_stack->uaddr_maxaddr) {
4512 			return map->uaddr_brk_stack;
4513 		} else
4514 			return NULL;
4515 	}
4516 
4517 	/*
4518 	 * Check the other selectors.
4519 	 *
4520 	 * These selectors are only marked as the owner if they have
4521 	 * insert functions.
4522 	 */
4523 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4524 		uaddr = map->uaddr_any[i];
4525 		if (uaddr == NULL)
4526 			continue;
4527 		if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
4528 			continue;
4529 
4530 		if (addr >= uaddr->uaddr_minaddr &&
4531 		    addr < uaddr->uaddr_maxaddr)
4532 			return uaddr;
4533 	}
4534 
4535 	return NULL;
4536 }
4537 
4538 /*
4539  * Choose free list based on address at start of free space.
4540  *
4541  * The uvm_addr_state returned contains addr and is the first of:
4542  * - uaddr_exe
4543  * - uaddr_brk_stack
4544  * - uaddr_any
4545  */
4546 struct uvm_addr_state*
4547 uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
4548 {
4549 	return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
4550 }
4551 
4552 /*
4553  * Returns the first free-memory boundary that is crossed by [min-max].
4554  */
4555 vsize_t
4556 uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
4557 {
4558 	struct uvm_addr_state	*uaddr;
4559 	int			 i;
4560 
4561 	/* Never return first page. */
4562 	max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
4563 
4564 	/* Treat the maxkaddr special, if the map is a kernel_map. */
4565 	if ((map->flags & VM_MAP_ISVMSPACE) == 0)
4566 		max = uvm_map_boundfix(min, max, uvm_maxkaddr);
4567 
4568 	/* Check for exe-only boundaries. */
4569 	if (map->uaddr_exe != NULL) {
4570 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
4571 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
4572 	}
4573 
4574 	/* Check for brk/stack boundaries. */
4575 	if (map->uaddr_brk_stack != NULL) {
4576 		max = uvm_map_boundfix(min, max,
4577 		    map->uaddr_brk_stack->uaddr_minaddr);
4578 		max = uvm_map_boundfix(min, max,
4579 		    map->uaddr_brk_stack->uaddr_maxaddr);
4580 	}
4581 
4582 	/* Check other boundaries. */
4583 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4584 		uaddr = map->uaddr_any[i];
4585 		if (uaddr != NULL) {
4586 			max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
4587 			max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
4588 		}
4589 	}
4590 
4591 	/* Boundaries at stack and brk() area. */
4592 	max = uvm_map_boundfix(min, max, map->s_start);
4593 	max = uvm_map_boundfix(min, max, map->s_end);
4594 	max = uvm_map_boundfix(min, max, map->b_start);
4595 	max = uvm_map_boundfix(min, max, map->b_end);
4596 
4597 	return max;
4598 }
4599 
4600 /*
4601  * Update map allocation start and end addresses from proc vmspace.
4602  */
4603 void
4604 uvm_map_vmspace_update(struct vm_map *map,
4605     struct uvm_map_deadq *dead, int flags)
4606 {
4607 	struct vmspace *vm;
4608 	vaddr_t b_start, b_end, s_start, s_end;
4609 
4610 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
4611 	KASSERT(offsetof(struct vmspace, vm_map) == 0);
4612 
4613 	/*
4614 	 * Derive actual allocation boundaries from vmspace.
4615 	 */
4616 	vm = (struct vmspace *)map;
4617 	b_start = (vaddr_t)vm->vm_daddr;
4618 	b_end   = b_start + BRKSIZ;
4619 	s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4620 	s_end   = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4621 #ifdef DIAGNOSTIC
4622 	if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
4623 	    (b_end & (vaddr_t)PAGE_MASK) != 0 ||
4624 	    (s_start & (vaddr_t)PAGE_MASK) != 0 ||
4625 	    (s_end & (vaddr_t)PAGE_MASK) != 0) {
4626 		panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
4627 		    "b=0x%lx-0x%lx s=0x%lx-0x%lx",
4628 		    vm, b_start, b_end, s_start, s_end);
4629 	}
4630 #endif
4631 
4632 	if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
4633 	    map->s_start == s_start && map->s_end == s_end))
4634 		return;
4635 
4636 	uvm_map_freelist_update(map, dead, b_start, b_end,
4637 	    s_start, s_end, flags);
4638 }
4639 
4640 /*
4641  * Grow kernel memory.
4642  *
4643  * This function is only called for kernel maps when an allocation fails.
4644  *
4645  * If the map has a gap that is large enough to accommodate alloc_sz, this
4646  * function will make sure map->free will include it.
4647  */
4648 void
4649 uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
4650     vsize_t alloc_sz, int flags)
4651 {
4652 	vsize_t sz;
4653 	vaddr_t end;
4654 	struct vm_map_entry *entry;
4655 
4656 	/* Kernel memory only. */
4657 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
4658 	/* Destroy free list. */
4659 	uvm_map_freelist_update_clear(map, dead);
4660 
4661 	/* Include the guard page in the hard minimum requirement of alloc_sz. */
4662 	if (map->flags & VM_MAP_GUARDPAGES)
4663 		alloc_sz += PAGE_SIZE;
4664 
4665 	/*
4666 	 * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
4667 	 *
4668 	 * Don't handle the case where the multiplication overflows:
4669 	 * if that happens, the allocation is probably too big anyway.
4670 	 */
4671 	sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
4672 
4673 	/*
4674 	 * Walk forward until a gap large enough for alloc_sz shows up.
4675 	 *
4676 	 * We assume the kernel map has no boundaries.
4677 	 * uvm_maxkaddr may be zero.
4678 	 */
4679 	end = MAX(uvm_maxkaddr, map->min_offset);
4680 	entry = uvm_map_entrybyaddr(&map->addr, end);
4681 	while (entry && entry->fspace < alloc_sz)
4682 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4683 	if (entry) {
4684 		end = MAX(VMMAP_FREE_START(entry), end);
4685 		end += MIN(sz, map->max_offset - end);
4686 	} else
4687 		end = map->max_offset;
4688 
4689 	/* Reserve pmap entries. */
4690 #ifdef PMAP_GROWKERNEL
4691 	uvm_maxkaddr = pmap_growkernel(end);
4692 #else
4693 	uvm_maxkaddr = MAX(uvm_maxkaddr, end);
4694 #endif
4695 
4696 	/* Rebuild free list. */
4697 	uvm_map_freelist_update_refill(map, flags);
4698 }
4699 
4700 /*
4701  * Freelist update subfunction: unlink all entries from freelists.
4702  */
4703 void
4704 uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
4705 {
4706 	struct uvm_addr_state *free;
4707 	struct vm_map_entry *entry, *prev, *next;
4708 
4709 	prev = NULL;
4710 	for (entry = RB_MIN(uvm_map_addr, &map->addr); entry != NULL;
4711 	    entry = next) {
4712 		next = RB_NEXT(uvm_map_addr, &map->addr, entry);
4713 
4714 		free = uvm_map_uaddr_e(map, entry);
4715 		uvm_mapent_free_remove(map, free, entry);
4716 
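		/*
		 * Entries with start == end carry no mapping and only exist
		 * to describe free space; fold their free range into the
		 * previous entry and recycle them.
		 */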
4717 		if (prev != NULL && entry->start == entry->end) {
4718 			prev->fspace += VMMAP_FREE_END(entry) - entry->end;
4719 			uvm_mapent_addr_remove(map, entry);
4720 			DEAD_ENTRY_PUSH(dead, entry);
4721 		} else
4722 			prev = entry;
4723 	}
4724 }
4725 
4726 /*
4727  * Freelist update subfunction: refill the freelists with entries.
4728  */
4729 void
4730 uvm_map_freelist_update_refill(struct vm_map *map, int flags)
4731 {
4732 	struct vm_map_entry *entry;
4733 	vaddr_t min, max;
4734 
4735 	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
4736 		min = VMMAP_FREE_START(entry);
4737 		max = VMMAP_FREE_END(entry);
4738 		entry->fspace = 0;
4739 
4740 		entry = uvm_map_fix_space(map, entry, min, max, flags);
4741 	}
4742 
4743 	uvm_tree_sanity(map, __FILE__, __LINE__);
4744 }
4745 
4746 /*
4747  * Change {a,b}_{start,end} allocation ranges and associated free lists.
4748  */
4749 void
4750 uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
4751     vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
4752 {
4753 	KDASSERT(b_end >= b_start && s_end >= s_start);
4754 
4755 	/* Clear all free lists. */
4756 	uvm_map_freelist_update_clear(map, dead);
4757 
4758 	/* Apply new bounds. */
4759 	map->b_start = b_start;
4760 	map->b_end   = b_end;
4761 	map->s_start = s_start;
4762 	map->s_end   = s_end;
4763 
4764 	/* Refill free lists. */
4765 	uvm_map_freelist_update_refill(map, flags);
4766 }
4767 
4768 /*
4769  * Assign a uvm_addr_state to the specified pointer in vm_map.
4770  *
4771  * May sleep.
4772  */
4773 void
4774 uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
4775     struct uvm_addr_state *newval)
4776 {
4777 	struct uvm_map_deadq dead;
4778 
4779 	/* Pointer which must be in this map. */
4780 	KASSERT(which != NULL);
4781 	KASSERT((void*)map <= (void*)(which) &&
4782 	    (void*)(which) < (void*)(map + 1));
4783 
4784 	vm_map_lock(map);
4785 	TAILQ_INIT(&dead);
4786 	uvm_map_freelist_update_clear(map, &dead);
4787 
4788 	uvm_addr_destroy(*which);
4789 	*which = newval;
4790 
4791 	uvm_map_freelist_update_refill(map, 0);
4792 	vm_map_unlock(map);
4793 	uvm_unmap_detach(&dead, 0);
4794 }
4795 
4796 /*
4797  * Correct space insert.
4798  *
4799  * Entry must not be on any freelist.
4800  */
4801 struct vm_map_entry*
4802 uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
4803     vaddr_t min, vaddr_t max, int flags)
4804 {
4805 	struct uvm_addr_state	*free, *entfree;
4806 	vaddr_t			 lmax;
4807 
4808 	KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
4809 	KDASSERT(min <= max);
4810 	KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
4811 	    min == map->min_offset);
4812 
4813 	/*
4814 	 * During the function, entfree will always point at the uaddr state
4815 	 * for entry.
4816 	 */
4817 	entfree = (entry == NULL ? NULL :
4818 	    uvm_map_uaddr_e(map, entry));
4819 
4820 	while (min != max) {
4821 		/* Claim guard page for entry. */
4822 		if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
4823 		    VMMAP_FREE_END(entry) == entry->end &&
4824 		    entry->start != entry->end) {
4825 			if (max - min == 2 * PAGE_SIZE) {
4826 				/*
4827 				 * If the free-space gap is exactly 2 pages,
4828 				 * we make the guard 2 pages instead of 1.
4829 				 * Because in a guarded map, an area needs
4830 				 * at least 2 pages to allocate from:
4831 				 * one page for the allocation and one for
4832 				 * the guard.
4833 				 */
4834 				entry->guard = 2 * PAGE_SIZE;
4835 				min = max;
4836 			} else {
4837 				entry->guard = PAGE_SIZE;
4838 				min += PAGE_SIZE;
4839 			}
4840 			continue;
4841 		}
4842 
4843 		/*
4844 		 * Handle the case where entry has a 2-page guard, but the
4845 		 * space after entry is freed.
4846 		 */
4847 		if (entry != NULL && entry->fspace == 0 &&
4848 		    entry->guard > PAGE_SIZE) {
4849 			entry->guard = PAGE_SIZE;
4850 			min = VMMAP_FREE_START(entry);
4851 		}
4852 
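		/*
		 * lmax is the end of the current free-space chunk (the first
		 * boundary crossed, or max); free is the selector owning it.
		 */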
4853 		lmax = uvm_map_boundary(map, min, max);
4854 		free = uvm_map_uaddr(map, min);
4855 
4856 		/*
4857 		 * Entries are merged if they point at the same uvm_addr_state.
4858 		 * Exception to that rule: if min == uvm_maxkaddr, a new
4859 		 * entry is started regardless (otherwise the allocators
4860 		 * will get confused).
4861 		 */
4862 		if (entry != NULL && free == entfree &&
4863 		    !((map->flags & VM_MAP_ISVMSPACE) == 0 &&
4864 		    min == uvm_maxkaddr)) {
4865 			KDASSERT(VMMAP_FREE_END(entry) == min);
4866 			entry->fspace += lmax - min;
4867 		} else {
4868 			/*
4869 			 * Commit entry to the free list: no more free space
4870 			 * will be merged into it.
4871 			 * We'll start a new entry and accumulate the
4872 			 * remaining space in that entry instead.
4873 			 */
4874 			if (entry != NULL)
4875 				uvm_mapent_free_insert(map, entfree, entry);
4876 
4877 			/* New entry for new uaddr. */
4878 			entry = uvm_mapent_alloc(map, flags);
4879 			KDASSERT(entry != NULL);
4880 			entry->end = entry->start = min;
4881 			entry->guard = 0;
4882 			entry->fspace = lmax - min;
4883 			entry->object.uvm_obj = NULL;
4884 			entry->offset = 0;
4885 			entry->etype = 0;
4886 			entry->protection = entry->max_protection = 0;
4887 			entry->inheritance = 0;
4888 			entry->wired_count = 0;
4889 			entry->advice = 0;
4890 			entry->aref.ar_pageoff = 0;
4891 			entry->aref.ar_amap = NULL;
4892 			uvm_mapent_addr_insert(map, entry);
4893 
4894 			entfree = free;
4895 		}
4896 
4897 		min = lmax;
4898 	}
4899 	/* Finally put entry on the uaddr state. */
4900 	if (entry != NULL)
4901 		uvm_mapent_free_insert(map, entfree, entry);
4902 
4903 	return entry;
4904 }
4905 
4906 /*
4907  * MQuery style of allocation.
4908  *
4909  * This allocator searches forward until sufficient space is found to map
4910  * the given size.
4911  *
4912  * XXX: factor in offset (via pmap_prefer) and protection?
4913  */
4914 int
4915 uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
4916     int flags)
4917 {
4918 	struct vm_map_entry *entry, *last;
4919 	vaddr_t addr;
4920 	vaddr_t tmp, pmap_align, pmap_offset;
4921 	int error;
4922 
4923 	addr = *addr_p;
4924 	vm_map_lock_read(map);
4925 
4926 	/* Configure pmap prefer. */
4927 	if (offset != UVM_UNKNOWN_OFFSET) {
4928 		pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
4929 		pmap_offset = PMAP_PREFER_OFFSET(offset);
4930 	} else {
4931 		pmap_align = PAGE_SIZE;
4932 		pmap_offset = 0;
4933 	}
4934 
4935 	/* Align address to pmap_prefer unless FLAG_FIXED is set. */
4936 	if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
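		/*
		 * Round addr up, if needed, to an address congruent to
		 * pmap_offset modulo pmap_align (a power of two).
		 */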
4937 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4938 		if (tmp < addr)
4939 			tmp += pmap_align;
4940 		addr = tmp;
4941 	}
4942 
4943 	/* First, check if the requested range is fully available. */
4944 	entry = uvm_map_entrybyaddr(&map->addr, addr);
4945 	last = NULL;
4946 	if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
4947 		error = 0;
4948 		goto out;
4949 	}
4950 	if (flags & UVM_FLAG_FIXED) {
4951 		error = EINVAL;
4952 		goto out;
4953 	}
4954 
4955 	error = ENOMEM; /* Default error from here. */
4956 
4957 	/*
4958 	 * At this point, the memory at <addr, sz> is not available.
4959 	 * The reasons are:
4960 	 * [1] it's outside the map,
4961 	 * [2] it starts in used memory (and therefore needs to move
4962 	 *     toward the first free page in entry),
4963 	 * [3] it starts in free memory but bumps into used memory.
4964 	 *
4965 	 * Note that for case [2], the forward moving is handled by the
4966 	 * for loop below.
4967 	 */
4968 	if (entry == NULL) {
4969 		/* [1] Outside the map. */
4970 		if (addr >= map->max_offset)
4971 			goto out;
4972 		else
4973 			entry = RB_MIN(uvm_map_addr, &map->addr);
4974 	} else if (VMMAP_FREE_START(entry) <= addr) {
4975 		/* [3] Bumped into used memory. */
4976 		entry = RB_NEXT(uvm_map_addr, &map->addr, entry);
4977 	}
4978 
4979 	/* Test if the next entry is sufficient for the allocation. */
4980 	for (; entry != NULL;
4981 	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
4982 		if (entry->fspace == 0)
4983 			continue;
4984 		addr = VMMAP_FREE_START(entry);
4985 
4986 restart:	/* Restart address checks on address change. */
4987 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4988 		if (tmp < addr)
4989 			tmp += pmap_align;
4990 		addr = tmp;
4991 		if (addr >= VMMAP_FREE_END(entry))
4992 			continue;
4993 
4994 		/* Skip brk() allocation addresses. */
4995 		if (addr + sz > map->b_start && addr < map->b_end) {
4996 			if (VMMAP_FREE_END(entry) > map->b_end) {
4997 				addr = map->b_end;
4998 				goto restart;
4999 			} else
5000 				continue;
5001 		}
5002 		/* Skip stack allocation addresses. */
5003 		if (addr + sz > map->s_start && addr < map->s_end) {
5004 			if (VMMAP_FREE_END(entry) > map->s_end) {
5005 				addr = map->s_end;
5006 				goto restart;
5007 			} else
5008 				continue;
5009 		}
5010 
5011 		last = NULL;
5012 		if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5013 			error = 0;
5014 			goto out;
5015 		}
5016 	}
5017 
5018 out:
5019 	vm_map_unlock_read(map);
5020 	if (error == 0)
5021 		*addr_p = addr;
5022 	return error;
5023 }
5024 
5025 /*
5026  * Determine allocation bias.
5027  *
5028  * Returns 1 if we should bias to high addresses, -1 for a bias towards low
5029  * addresses, or 0 for no bias.
5030  * The bias mechanism is intended to avoid clashing with brk() and stack
5031  * areas.
5032  */
5033 int
5034 uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry)
5035 {
5036 	vaddr_t start, end;
5037 
5038 	start = VMMAP_FREE_START(entry);
5039 	end = VMMAP_FREE_END(entry);
5040 
5041 	/* Stay at the top of brk() area. */
5042 	if (end >= map->b_start && start < map->b_end)
5043 		return 1;
5044 	/* Stay at the far end of the stack area. */
5045 	if (end >= map->s_start && start < map->s_end) {
5046 #ifdef MACHINE_STACK_GROWS_UP
5047 		return 1;
5048 #else
5049 		return -1;
5050 #endif
5051 	}
5052 
5053 	/* No bias, this area is meant for us. */
5054 	return 0;
5055 }
5056 
5057 
5058 boolean_t
5059 vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
5060 {
5061 	boolean_t rv;
5062 
5063 	if (map->flags & VM_MAP_INTRSAFE) {
5064 		rv = mtx_enter_try(&map->mtx);
5065 	} else {
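		/* Refuse quickly if the map is currently marked busy. */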
5066 		mtx_enter(&map->flags_lock);
5067 		if (map->flags & VM_MAP_BUSY) {
5068 			mtx_leave(&map->flags_lock);
5069 			return (FALSE);
5070 		}
5071 		mtx_leave(&map->flags_lock);
5072 		rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0);
5073 		/* check if the lock is busy and back out if we won the race */
5074 		if (rv) {
5075 			mtx_enter(&map->flags_lock);
5076 			if (map->flags & VM_MAP_BUSY) {
5077 				rw_exit(&map->lock);
5078 				rv = FALSE;
5079 			}
5080 			mtx_leave(&map->flags_lock);
5081 		}
5082 	}
5083 
5084 	if (rv) {
5085 		map->timestamp++;
5086 		LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5087 		uvm_tree_sanity(map, file, line);
5088 		uvm_tree_size_chk(map, file, line);
5089 	}
5090 
5091 	return (rv);
5092 }
5093 
5094 void
5095 vm_map_lock_ln(struct vm_map *map, char *file, int line)
5096 {
5097 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5098 		do {
5099 			mtx_enter(&map->flags_lock);
5100 tryagain:
5101 			while (map->flags & VM_MAP_BUSY) {
5102 				map->flags |= VM_MAP_WANTLOCK;
5103 				msleep(&map->flags, &map->flags_lock,
5104 				    PVM, vmmapbsy, 0);
5105 			}
5106 			mtx_leave(&map->flags_lock);
5107 		} while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0);
5108 		/* check if the lock is busy and back out if we won the race */
5109 		mtx_enter(&map->flags_lock);
5110 		if (map->flags & VM_MAP_BUSY) {
5111 			rw_exit(&map->lock);
5112 			goto tryagain;
5113 		}
5114 		mtx_leave(&map->flags_lock);
5115 	} else {
5116 		mtx_enter(&map->mtx);
5117 	}
5118 
5119 	map->timestamp++;
5120 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5121 	uvm_tree_sanity(map, file, line);
5122 	uvm_tree_size_chk(map, file, line);
5123 }
5124 
5125 void
5126 vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
5127 {
5128 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5129 		rw_enter_read(&map->lock);
5130 	else
5131 		mtx_enter(&map->mtx);
5132 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5133 	uvm_tree_sanity(map, file, line);
5134 	uvm_tree_size_chk(map, file, line);
5135 }
5136 
5137 void
5138 vm_map_unlock_ln(struct vm_map *map, char *file, int line)
5139 {
5140 	uvm_tree_sanity(map, file, line);
5141 	uvm_tree_size_chk(map, file, line);
5142 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5143 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5144 		rw_exit(&map->lock);
5145 	else
5146 		mtx_leave(&map->mtx);
5147 }
5148 
5149 void
5150 vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
5151 {
5152 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5153 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5154 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5155 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5156 		rw_exit_read(&map->lock);
5157 	else
5158 		mtx_leave(&map->mtx);
5159 }
5160 
5161 void
5162 vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
5163 {
5164 	uvm_tree_sanity(map, file, line);
5165 	uvm_tree_size_chk(map, file, line);
5166 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5167 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5168 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5169 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5170 		rw_enter(&map->lock, RW_DOWNGRADE);
5171 }
5172 
5173 void
5174 vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
5175 {
5176 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5177 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5178 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5179 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5180 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5181 		rw_exit_read(&map->lock);
5182 		rw_enter_write(&map->lock);
5183 	}
5184 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5185 	uvm_tree_sanity(map, file, line);
5186 }
5187 
5188 void
5189 vm_map_busy_ln(struct vm_map *map, char *file, int line)
5190 {
5191 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5192 	mtx_enter(&map->flags_lock);
5193 	map->flags |= VM_MAP_BUSY;
5194 	mtx_leave(&map->flags_lock);
5195 }
5196 
5197 void
5198 vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
5199 {
5200 	int oflags;
5201 
5202 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5203 	mtx_enter(&map->flags_lock);
5204 	oflags = map->flags;
5205 	map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
5206 	mtx_leave(&map->flags_lock);
5207 	if (oflags & VM_MAP_WANTLOCK)
5208 		wakeup(&map->flags);
5209 }
5210 
5211 #ifndef SMALL_KERNEL
5212 int
5213 uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
5214     size_t *lenp)
5215 {
5216 	struct vm_map_entry *entry;
5217 	vaddr_t start;
5218 	int cnt, maxcnt, error = 0;
5219 
5220 	KASSERT(*lenp > 0);
5221 	KASSERT((*lenp % sizeof(*kve)) == 0);
5222 	cnt = 0;
5223 	maxcnt = *lenp / sizeof(*kve);
5224 	KASSERT(maxcnt > 0);
5225 
5226 	/*
5227 	 * Return only entries whose address is above the given base
5228 	 * address.  This allows userland to iterate without knowing the
5229 	 * number of entries beforehand.
5230 	 */
5231 	start = (vaddr_t)kve[0].kve_start;
5232 
5233 	vm_map_lock(map);
5234 	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
5235 		if (cnt == maxcnt) {
5236 			error = ENOMEM;
5237 			break;
5238 		}
5239 		if (start != 0 && entry->start < start)
5240 			continue;
5241 		kve->kve_start = entry->start;
5242 		kve->kve_end = entry->end;
5243 		kve->kve_guard = entry->guard;
5244 		kve->kve_fspace = entry->fspace;
5245 		kve->kve_fspace_augment = entry->fspace_augment;
5246 		kve->kve_offset = entry->offset;
5247 		kve->kve_wired_count = entry->wired_count;
5248 		kve->kve_etype = entry->etype;
5249 		kve->kve_protection = entry->protection;
5250 		kve->kve_max_protection = entry->max_protection;
5251 		kve->kve_advice = entry->advice;
5252 		kve->kve_inheritance = entry->inheritance;
5253 		kve->kve_flags = entry->flags;
5254 		kve++;
5255 		cnt++;
5256 	}
5257 	vm_map_unlock(map);
5258 
5259 	KASSERT(cnt <= maxcnt);
5260 
5261 	*lenp = sizeof(*kve) * cnt;
5262 	return error;
5263 }
5264 #endif
5265 
5266 
5267 #undef RB_AUGMENT
5268 #define RB_AUGMENT(x)	uvm_map_addr_augment((x))
5269 RB_GENERATE(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
5270     uvm_mapentry_addrcmp);
5271 #undef RB_AUGMENT
5272 
5273 
5274 /*
5275  * MD code: vmspace allocator setup.
5276  */
5277 
5278 #ifdef __i386__
5279 void
5280 uvm_map_setup_md(struct vm_map *map)
5281 {
5282 	vaddr_t		min, max;
5283 
5284 	min = map->min_offset;
5285 	max = map->max_offset;
5286 
5287 	/*
5288 	 * Ensure the selectors will not try to manage page 0;
5289 	 * it's too special.
5290 	 */
5291 	if (min < VMMAP_MIN_ADDR)
5292 		min = VMMAP_MIN_ADDR;
5293 
5294 #if 0	/* Cool stuff, not yet */
5295 	/* Hinted allocations. */
5296 	map->uaddr_any[1] = uaddr_hint_create(MAX(min, VMMAP_MIN_ADDR), max,
5297 	    1024 * 1024 * 1024);
5298 
5299 	/* Executable code is special. */
5300 	map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
5301 	/* Place normal allocations beyond executable mappings. */
5302 	map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
5303 #else	/* Crappy stuff, for now */
5304 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5305 #endif
5306 
5307 #ifndef SMALL_KERNEL
5308 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5309 #endif /* !SMALL_KERNEL */
5310 }
5311 #elif __LP64__
5312 void
5313 uvm_map_setup_md(struct vm_map *map)
5314 {
5315 	vaddr_t		min, max;
5316 
5317 	min = map->min_offset;
5318 	max = map->max_offset;
5319 
5320 	/*
5321 	 * Ensure the selectors will not try to manage page 0;
5322 	 * it's too special.
5323 	 */
5324 	if (min < VMMAP_MIN_ADDR)
5325 		min = VMMAP_MIN_ADDR;
5326 
5327 #if 0	/* Cool stuff, not yet */
5328 	/* Hinted allocations above 4GB */
5329 	map->uaddr_any[0] =
5330 	    uaddr_hint_create(0x100000000ULL, max, 1024 * 1024 * 1024);
5331 	/* Hinted allocations below 4GB */
5332 	map->uaddr_any[1] =
5333 	    uaddr_hint_create(MAX(min, VMMAP_MIN_ADDR), 0x100000000ULL,
5334 	    1024 * 1024 * 1024);
5335 	/* Normal allocations, always above 4GB */
5336 	map->uaddr_any[3] =
5337 	    uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
5338 #else	/* Crappy stuff, for now */
5339 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5340 #endif
5341 
5342 #ifndef SMALL_KERNEL
5343 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5344 #endif /* !SMALL_KERNEL */
5345 }
5346 #else	/* non-i386, 32 bit */
5347 void
5348 uvm_map_setup_md(struct vm_map *map)
5349 {
5350 	vaddr_t		min, max;
5351 
5352 	min = map->min_offset;
5353 	max = map->max_offset;
5354 
5355 	/*
5356 	 * Ensure the selectors will not try to manage page 0;
5357 	 * it's too special.
5358 	 */
5359 	if (min < VMMAP_MIN_ADDR)
5360 		min = VMMAP_MIN_ADDR;
5361 
5362 #if 0	/* Cool stuff, not yet */
5363 	/* Hinted allocations. */
5364 	map->uaddr_any[1] = uaddr_hint_create(MAX(min, VMMAP_MIN_ADDR), max,
5365 	    1024 * 1024 * 1024);
5366 	/* Normal allocations. */
5367 	map->uaddr_any[3] = uaddr_pivot_create(min, max);
5368 #else	/* Crappy stuff, for now */
5369 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5370 #endif
5371 
5372 #ifndef SMALL_KERNEL
5373 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5374 #endif /* !SMALL_KERNEL */
5375 }
5376 #endif
5377