xref: /openbsd-src/sys/uvm/uvm_map.c (revision 1a8dbaac879b9f3335ad7fb25429ce63ac1d6bac)
1 /*	$OpenBSD: uvm_map.c,v 1.268 2020/09/22 14:31:08 mpi Exp $	*/
2 /*	$NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  *
19  *
20  * Copyright (c) 1997 Charles D. Cranor and Washington University.
21  * Copyright (c) 1991, 1993, The Regents of the University of California.
22  *
23  * All rights reserved.
24  *
25  * This code is derived from software contributed to Berkeley by
26  * The Mach Operating System project at Carnegie-Mellon University.
27  *
28  * Redistribution and use in source and binary forms, with or without
29  * modification, are permitted provided that the following conditions
30  * are met:
31  * 1. Redistributions of source code must retain the above copyright
32  *    notice, this list of conditions and the following disclaimer.
33  * 2. Redistributions in binary form must reproduce the above copyright
34  *    notice, this list of conditions and the following disclaimer in the
35  *    documentation and/or other materials provided with the distribution.
36  * 3. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	@(#)vm_map.c    8.3 (Berkeley) 1/12/94
53  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
54  *
55  *
56  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
57  * All rights reserved.
58  *
59  * Permission to use, copy, modify and distribute this software and
60  * its documentation is hereby granted, provided that both the copyright
61  * notice and this permission notice appear in all copies of the
62  * software, derivative works or modified versions, and any portions
63  * thereof, and that both notices appear in supporting documentation.
64  *
65  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
66  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
67  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
68  *
69  * Carnegie Mellon requests users of this software to return to
70  *
71  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
72  *  School of Computer Science
73  *  Carnegie Mellon University
74  *  Pittsburgh PA 15213-3890
75  *
76  * any improvements or extensions that they make and grant Carnegie the
77  * rights to redistribute these changes.
78  */
79 
80 /*
81  * uvm_map.c: uvm map operations
82  */
83 
84 /* #define DEBUG */
85 /* #define VMMAP_DEBUG */
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/acct.h>
90 #include <sys/mman.h>
91 #include <sys/proc.h>
92 #include <sys/malloc.h>
93 #include <sys/pool.h>
94 #include <sys/sysctl.h>
95 #include <sys/signalvar.h>
96 #include <sys/syslog.h>
97 #include <sys/user.h>
98 #include <sys/tracepoint.h>
99 
100 #ifdef SYSVSHM
101 #include <sys/shm.h>
102 #endif
103 
104 #include <uvm/uvm.h>
105 
106 #ifdef DDB
107 #include <uvm/uvm_ddb.h>
108 #endif
109 
110 #include <uvm/uvm_addr.h>
111 
112 
113 vsize_t			 uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
114 int			 uvm_mapent_isjoinable(struct vm_map*,
115 			    struct vm_map_entry*, struct vm_map_entry*);
116 struct vm_map_entry	*uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
117 			    struct vm_map_entry*, struct uvm_map_deadq*);
118 struct vm_map_entry	*uvm_mapent_tryjoin(struct vm_map*,
119 			    struct vm_map_entry*, struct uvm_map_deadq*);
120 struct vm_map_entry	*uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
121 			    struct vm_map_entry*, vaddr_t, vsize_t, int,
122 			    struct uvm_map_deadq*, struct vm_map_entry*);
123 struct vm_map_entry	*uvm_mapent_alloc(struct vm_map*, int);
124 void			 uvm_mapent_free(struct vm_map_entry*);
125 void			 uvm_unmap_kill_entry(struct vm_map*,
126 			    struct vm_map_entry*);
127 void			 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
128 void			 uvm_mapent_mkfree(struct vm_map*,
129 			    struct vm_map_entry*, struct vm_map_entry**,
130 			    struct uvm_map_deadq*, boolean_t);
131 void			 uvm_map_pageable_pgon(struct vm_map*,
132 			    struct vm_map_entry*, struct vm_map_entry*,
133 			    vaddr_t, vaddr_t);
134 int			 uvm_map_pageable_wire(struct vm_map*,
135 			    struct vm_map_entry*, struct vm_map_entry*,
136 			    vaddr_t, vaddr_t, int);
137 void			 uvm_map_setup_entries(struct vm_map*);
138 void			 uvm_map_setup_md(struct vm_map*);
139 void			 uvm_map_teardown(struct vm_map*);
140 void			 uvm_map_vmspace_update(struct vm_map*,
141 			    struct uvm_map_deadq*, int);
142 void			 uvm_map_kmem_grow(struct vm_map*,
143 			    struct uvm_map_deadq*, vsize_t, int);
144 void			 uvm_map_freelist_update_clear(struct vm_map*,
145 			    struct uvm_map_deadq*);
146 void			 uvm_map_freelist_update_refill(struct vm_map *, int);
147 void			 uvm_map_freelist_update(struct vm_map*,
148 			    struct uvm_map_deadq*, vaddr_t, vaddr_t,
149 			    vaddr_t, vaddr_t, int);
150 struct vm_map_entry	*uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
151 			    vaddr_t, vaddr_t, int);
152 int			 uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int,
153 			    struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t,
154 			    int);
155 int			 uvm_map_findspace(struct vm_map*,
156 			    struct vm_map_entry**, struct vm_map_entry**,
157 			    vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
158 			    vaddr_t);
159 vsize_t			 uvm_map_addr_augment_get(struct vm_map_entry*);
160 void			 uvm_map_addr_augment(struct vm_map_entry*);
161 
162 int			 uvm_map_inentry_recheck(u_long, vaddr_t,
163 			     struct p_inentry *);
164 boolean_t		 uvm_map_inentry_fix(struct proc *, struct p_inentry *,
165 			     vaddr_t, int (*)(vm_map_entry_t), u_long);
166 /*
167  * Tree management functions.
168  */
169 
170 static inline void	 uvm_mapent_copy(struct vm_map_entry*,
171 			    struct vm_map_entry*);
172 static inline int	 uvm_mapentry_addrcmp(const struct vm_map_entry*,
173 			    const struct vm_map_entry*);
174 void			 uvm_mapent_free_insert(struct vm_map*,
175 			    struct uvm_addr_state*, struct vm_map_entry*);
176 void			 uvm_mapent_free_remove(struct vm_map*,
177 			    struct uvm_addr_state*, struct vm_map_entry*);
178 void			 uvm_mapent_addr_insert(struct vm_map*,
179 			    struct vm_map_entry*);
180 void			 uvm_mapent_addr_remove(struct vm_map*,
181 			    struct vm_map_entry*);
182 void			 uvm_map_splitentry(struct vm_map*,
183 			    struct vm_map_entry*, struct vm_map_entry*,
184 			    vaddr_t);
185 vsize_t			 uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
186 int			 uvm_mapent_bias(struct vm_map*, struct vm_map_entry*);
187 
188 /*
189  * uvm_vmspace_fork helper functions.
190  */
191 struct vm_map_entry	*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
192 			    vsize_t, vm_prot_t, vm_prot_t,
193 			    struct vm_map_entry*, struct uvm_map_deadq*, int,
194 			    int);
195 struct vm_map_entry	*uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
196 			    vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
197 			    struct vm_map_entry*, struct uvm_map_deadq*);
198 struct vm_map_entry	*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
199 			    struct vm_map*, struct vm_map_entry*,
200 			    struct uvm_map_deadq*);
201 struct vm_map_entry	*uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
202 			    struct vm_map*, struct vm_map_entry*,
203 			    struct uvm_map_deadq*);
204 struct vm_map_entry	*uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
205 			    struct vm_map*, struct vm_map_entry*,
206 			    struct uvm_map_deadq*);
207 
208 /*
209  * Tree validation.
210  */
211 #ifdef VMMAP_DEBUG
212 void			 uvm_tree_assert(struct vm_map*, int, char*,
213 			    char*, int);
214 #define UVM_ASSERT(map, cond, file, line)				\
215 	uvm_tree_assert((map), (cond), #cond, (file), (line))
216 void			 uvm_tree_sanity(struct vm_map*, char*, int);
217 void			 uvm_tree_size_chk(struct vm_map*, char*, int);
218 void			 vmspace_validate(struct vm_map*);
219 #else
220 #define uvm_tree_sanity(_map, _file, _line)		do {} while (0)
221 #define uvm_tree_size_chk(_map, _file, _line)		do {} while (0)
222 #define vmspace_validate(_map)				do {} while (0)
223 #endif
224 
225 /*
226  * Ensure all architectures have pmap_prefer by providing no-op defaults.
227  */
228 #ifndef PMAP_PREFER
229 #define PMAP_PREFER_ALIGN()	(vaddr_t)PAGE_SIZE
230 #define PMAP_PREFER_OFFSET(off)	0
231 #define PMAP_PREFER(addr, off)	(addr)
232 #endif
233 
234 /*
235  * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
236  * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
237  *
238  * We attempt to grow by UVM_MAP_KSIZE_ALLOCMUL times the allocation size
239  * each time.
240  */
241 #define VM_MAP_KSIZE_INIT	(512 * (vaddr_t)PAGE_SIZE)
242 #define VM_MAP_KSIZE_DELTA	(256 * (vaddr_t)PAGE_SIZE)
243 #define VM_MAP_KSIZE_ALLOCMUL	4
244 /*
245  * When selecting a random free-space block, look at most FSPACE_DELTA blocks
246  * ahead.
247  */
248 #define FSPACE_DELTA		8
249 /*
250  * Put allocations adjacent to previous allocations when the free-space tree
251  * is larger than FSPACE_COMPACT entries.
252  *
253  * Alignment and PMAP_PREFER may still cause the entry to not be fully
254  * adjacent. Note that this strategy reduces memory fragmentation (by leaving
255  * a large space before or after the allocation).
256  */
257 #define FSPACE_COMPACT		128
258 /*
259  * Make the address selection skip at most this many bytes from the start of
260  * the free space in which the allocation takes place.
261  *
262  * The main idea behind a randomized address space is that an attacker cannot
263  * know where to target his attack. Therefore, the location of objects must be
264  * as random as possible. However, the goal is not to create the most sparse
265  * map that is possible.
266  * FSPACE_MAXOFF pushes the considered range in bytes down to less insane
267  * sizes, thereby reducing the sparseness. The biggest randomization comes
268  * from fragmentation, i.e. FSPACE_COMPACT.
269  */
270 #define FSPACE_MAXOFF		((vaddr_t)32 * 1024 * 1024)
271 /*
272  * Allow for small gaps in the overflow areas.
273  * Gap size is in bytes and does not have to be a multiple of page-size.
274  */
275 #define FSPACE_BIASGAP		((vaddr_t)32 * 1024)
276 
277 /* auto-allocate address lower bound */
278 #define VMMAP_MIN_ADDR		PAGE_SIZE
279 
280 
281 #ifdef DEADBEEF0
282 #define UVMMAP_DEADBEEF		((unsigned long)DEADBEEF0)
283 #else
284 #define UVMMAP_DEADBEEF		((unsigned long)0xdeadd0d0)
285 #endif
286 
287 #ifdef DEBUG
288 int uvm_map_printlocks = 0;
289 
290 #define LPRINTF(_args)							\
291 	do {								\
292 		if (uvm_map_printlocks)					\
293 			printf _args;					\
294 	} while (0)
295 #else
296 #define LPRINTF(_args)	do {} while (0)
297 #endif
298 
299 static struct mutex uvm_kmapent_mtx;
300 static struct timeval uvm_kmapent_last_warn_time;
301 static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
302 
303 const char vmmapbsy[] = "vmmapbsy";
304 
305 /*
306  * pool for vmspace structures.
307  */
308 struct pool uvm_vmspace_pool;
309 
310 /*
311  * pool for dynamically-allocated map entries.
312  */
313 struct pool uvm_map_entry_pool;
314 struct pool uvm_map_entry_kmem_pool;
315 
316 /*
317  * This global represents the end of the kernel virtual address
318  * space. If we want to exceed this, we must grow the kernel
319  * virtual address space dynamically.
320  *
321  * Note, this variable is locked by kernel_map's lock.
322  */
323 vaddr_t uvm_maxkaddr;
324 
325 /*
326  * Locking predicate.
327  */
328 #define UVM_MAP_REQ_WRITE(_map)						\
329 	do {								\
330 		if ((_map)->ref_count > 0) {				\
331 			if (((_map)->flags & VM_MAP_INTRSAFE) == 0)	\
332 				rw_assert_wrlock(&(_map)->lock);	\
333 			else						\
334 				MUTEX_ASSERT_LOCKED(&(_map)->mtx);	\
335 		}							\
336 	} while (0)
337 
338 #define	vm_map_modflags(map, set, clear)				\
339 	do {								\
340 		mtx_enter(&(map)->flags_lock);				\
341 		(map)->flags = ((map)->flags | (set)) & ~(clear);	\
342 		mtx_leave(&(map)->flags_lock);				\
343 	} while (0)
344 
345 
346 /*
347  * Tree describing entries by address.
348  *
349  * Addresses are unique.
350  * Entries with start == end may only exist if they are the first entry
351  * (sorted by address) within a free-memory tree.
352  */
353 
354 static inline int
355 uvm_mapentry_addrcmp(const struct vm_map_entry *e1,
356     const struct vm_map_entry *e2)
357 {
358 	return e1->start < e2->start ? -1 : e1->start > e2->start;
359 }
360 
361 /*
362  * Copy mapentry.
363  */
364 static inline void
365 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
366 {
367 	caddr_t csrc, cdst;
368 	size_t sz;
369 
370 	csrc = (caddr_t)src;
371 	cdst = (caddr_t)dst;
372 	csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
373 	cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
374 
375 	sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
376 	    offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
377 	memcpy(cdst, csrc, sz);
378 }
379 
380 /*
381  * Handle free-list insertion.
382  */
383 void
384 uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
385     struct vm_map_entry *entry)
386 {
387 	const struct uvm_addr_functions *fun;
388 #ifdef VMMAP_DEBUG
389 	vaddr_t min, max, bound;
390 #endif
391 
392 #ifdef VMMAP_DEBUG
393 	/*
394 	 * Boundary check.
395 	 * Boundaries are folded if they go on the same free list.
396 	 */
397 	min = VMMAP_FREE_START(entry);
398 	max = VMMAP_FREE_END(entry);
399 
400 	while (min < max) {
401 		bound = uvm_map_boundary(map, min, max);
402 		KASSERT(uvm_map_uaddr(map, min) == uaddr);
403 		min = bound;
404 	}
405 #endif
406 	KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
407 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
408 
409 	UVM_MAP_REQ_WRITE(map);
410 
411 	/* Actual insert: forward to uaddr pointer. */
412 	if (uaddr != NULL) {
413 		fun = uaddr->uaddr_functions;
414 		KDASSERT(fun != NULL);
415 		if (fun->uaddr_free_insert != NULL)
416 			(*fun->uaddr_free_insert)(map, uaddr, entry);
417 		entry->etype |= UVM_ET_FREEMAPPED;
418 	}
419 
420 	/* Update fspace augmentation. */
421 	uvm_map_addr_augment(entry);
422 }
423 
424 /*
425  * Handle free-list removal.
426  */
427 void
428 uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
429     struct vm_map_entry *entry)
430 {
431 	const struct uvm_addr_functions *fun;
432 
433 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
434 	KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
435 	UVM_MAP_REQ_WRITE(map);
436 
437 	if (uaddr != NULL) {
438 		fun = uaddr->uaddr_functions;
439 		if (fun->uaddr_free_remove != NULL)
440 			(*fun->uaddr_free_remove)(map, uaddr, entry);
441 		entry->etype &= ~UVM_ET_FREEMAPPED;
442 	}
443 }
444 
445 /*
446  * Handle address tree insertion.
447  */
448 void
449 uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
450 {
451 	struct vm_map_entry *res;
452 
453 	if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF))
454 		panic("uvm_mapent_addr_insert: entry still in addr list");
455 	KDASSERT(entry->start <= entry->end);
456 	KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
457 	    (entry->end & (vaddr_t)PAGE_MASK) == 0);
458 
459 	TRACEPOINT(uvm, map_insert,
460 	    entry->start, entry->end, entry->protection, NULL);
461 
462 	UVM_MAP_REQ_WRITE(map);
463 	res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
464 	if (res != NULL) {
465 		panic("uvm_mapent_addr_insert: map %p entry %p "
466 		    "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
467 		    "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
468 		    map, entry,
469 		    entry->start, entry->end, entry->guard, entry->fspace,
470 		    res, res->start, res->end, res->guard, res->fspace);
471 	}
472 }
473 
474 /*
475  * Handle address tree removal.
476  */
477 void
478 uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
479 {
480 	struct vm_map_entry *res;
481 
482 	TRACEPOINT(uvm, map_remove,
483 	    entry->start, entry->end, entry->protection, NULL);
484 
485 	UVM_MAP_REQ_WRITE(map);
486 	res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
487 	if (res != entry)
488 		panic("uvm_mapent_addr_remove");
489 	RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF);
490 }
491 
492 /*
493  * uvm_map_reference: add reference to a map
494  *
495  * XXX check map reference counter lock
496  */
497 #define uvm_map_reference(_map)						\
498 	do {								\
499 		map->ref_count++;					\
500 	} while (0)
501 
502 /*
503  * Calculate the dused delta.
504  */
505 vsize_t
506 uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
507 {
508 	struct vmspace *vm;
509 	vsize_t sz;
510 	vaddr_t lmax;
511 	vaddr_t stack_begin, stack_end; /* Position of stack. */
512 
513 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
514 	vm = (struct vmspace *)map;
515 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
516 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
517 
518 	sz = 0;
519 	while (min != max) {
520 		lmax = max;
521 		if (min < stack_begin && lmax > stack_begin)
522 			lmax = stack_begin;
523 		else if (min < stack_end && lmax > stack_end)
524 			lmax = stack_end;
525 
526 		if (min >= stack_begin && min < stack_end) {
527 			/* nothing */
528 		} else
529 			sz += lmax - min;
530 		min = lmax;
531 	}
532 
533 	return sz >> PAGE_SHIFT;
534 }
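
/*
 * Worked example (editorial sketch, not part of the original source):
 * assuming PAGE_SHIFT == 12 and a hypothetical stack region spanning
 * 0x4000-0x6000, the loop above walks a request for 0x1000-0x9000 as
 *   0x1000-0x4000  counted  (below the stack)
 *   0x4000-0x6000  skipped  (inside the stack region)
 *   0x6000-0x9000  counted  (above the stack)
 * so the function returns (0x3000 + 0x3000) >> PAGE_SHIFT == 6 pages.
 */
#if 0
	vsize_t dpages = uvmspace_dused(map, 0x1000, 0x9000);	/* == 6 */
#endif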
535 
536 /*
537  * Find the entry describing the given address.
538  */
539 struct vm_map_entry*
540 uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
541 {
542 	struct vm_map_entry *iter;
543 
544 	iter = RBT_ROOT(uvm_map_addr, atree);
545 	while (iter != NULL) {
546 		if (iter->start > addr)
547 			iter = RBT_LEFT(uvm_map_addr, iter);
548 		else if (VMMAP_FREE_END(iter) <= addr)
549 			iter = RBT_RIGHT(uvm_map_addr, iter);
550 		else
551 			return iter;
552 	}
553 	return NULL;
554 }
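
/*
 * Note (editorial, not part of the original source): the returned entry
 * covers addr anywhere in [iter->start, VMMAP_FREE_END(iter)), so addr
 * may lie in the free space trailing the entry rather than in the
 * mapping itself.  uvm_map_lookup_entry() below adds the extra
 * start/end check for callers that need an actual mapping.
 */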
555 
556 /*
557  * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry)
558  *
559  * Push dead entries onto a dead-entry queue (a TAILQ).
560  * Since the queue links abuse the address tree storage in the entry,
561  * the entry may not be linked in a map while it sits on the queue.
562  *
563  * The deadq must be initialized with TAILQ_INIT() before the first call;
564  * uvm_unmap_detach(deadq, 0) then frees the entries.  See the sketch below.
565  */
566 static inline void
567 dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
568 {
569 	TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
570 }
571 #define DEAD_ENTRY_PUSH(_headptr, _entry)				\
572 	dead_entry_push((_headptr), (_entry))
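
/*
 * Usage sketch (editorial, not part of the original source), following
 * the pattern used by uvm_mapanon() and uvm_map() later in this file:
 * collect dead entries while the map is locked, then detach them after
 * the map lock has been dropped.
 */
#if 0
	struct uvm_map_deadq dead;

	TAILQ_INIT(&dead);
	vm_map_lock(map);
	/* ... tree surgery that produces dead entries ... */
	DEAD_ENTRY_PUSH(&dead, entry);
	vm_map_unlock(map);
	uvm_unmap_detach(&dead, 0);	/* free them outside the map lock */
#endif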
573 
574 /*
575  * Helper function for uvm_map_findspace_tree.
576  *
577  * Given allocation constraints and pmap constraints, finds the
578  * lowest and highest address in a range that can be used for the
579  * allocation.
580  *
581  * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs.
582  *
583  *
584  * Big chunk of math with a seasoning of dragons.
585  */
586 int
587 uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg,
588     struct vm_map_entry *sel, vaddr_t align,
589     vaddr_t pmap_align, vaddr_t pmap_off, int bias)
590 {
591 	vaddr_t sel_min, sel_max;
592 #ifdef PMAP_PREFER
593 	vaddr_t pmap_min, pmap_max;
594 #endif /* PMAP_PREFER */
595 #ifdef DIAGNOSTIC
596 	int bad;
597 #endif /* DIAGNOSTIC */
598 
599 	sel_min = VMMAP_FREE_START(sel);
600 	sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0);
601 
602 #ifdef PMAP_PREFER
603 
604 	/*
605 	 * There are two special cases, in which we can satisfy the align
606 	 * requirement and the pmap_prefer requirement.
607 	 * - when pmap_off == 0, we always select the largest of the two
608 	 * - when pmap_off % align == 0 and pmap_align > align, we simply
609 	 *   satisfy the pmap_align requirement and automatically
610 	 *   satisfy the align requirement.
611 	 */
612 	if (align > PAGE_SIZE &&
613 	    !(pmap_align > align && (pmap_off & (align - 1)) == 0)) {
614 		/*
615 		 * Simple case: only use align.
616 		 */
617 		sel_min = roundup(sel_min, align);
618 		sel_max &= ~(align - 1);
619 
620 		if (sel_min > sel_max)
621 			return ENOMEM;
622 
623 		/* Correct for bias. */
624 		if (sel_max - sel_min > FSPACE_BIASGAP) {
625 			if (bias > 0) {
626 				sel_min = sel_max - FSPACE_BIASGAP;
627 				sel_min = roundup(sel_min, align);
628 			} else if (bias < 0) {
629 				sel_max = sel_min + FSPACE_BIASGAP;
630 				sel_max &= ~(align - 1);
631 			}
632 		}
633 	} else if (pmap_align != 0) {
634 		/*
635 		 * Special case: satisfy both pmap_prefer and
636 		 * align argument.
637 		 */
638 		pmap_max = sel_max & ~(pmap_align - 1);
639 		pmap_min = sel_min;
640 		if (pmap_max < sel_min)
641 			return ENOMEM;
642 
643 		/* Adjust pmap_min for BIASGAP for top-addr bias. */
644 		if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP)
645 			pmap_min = pmap_max - FSPACE_BIASGAP;
646 		/* Align pmap_min. */
647 		pmap_min &= ~(pmap_align - 1);
648 		if (pmap_min < sel_min)
649 			pmap_min += pmap_align;
650 		if (pmap_min > pmap_max)
651 			return ENOMEM;
652 
653 		/* Adjust pmap_max for BIASGAP for bottom-addr bias. */
654 		if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) {
655 			pmap_max = (pmap_min + FSPACE_BIASGAP) &
656 			    ~(pmap_align - 1);
657 		}
658 		if (pmap_min > pmap_max)
659 			return ENOMEM;
660 
661 		/* Apply pmap prefer offset. */
662 		pmap_max |= pmap_off;
663 		if (pmap_max > sel_max)
664 			pmap_max -= pmap_align;
665 		pmap_min |= pmap_off;
666 		if (pmap_min < sel_min)
667 			pmap_min += pmap_align;
668 
669 		/*
670 		 * Fixup: it's possible that pmap_min and pmap_max
671 		 * cross each other. In this case, try to find one
672 		 * address that is allowed.
673 		 * (This usually happens in the biased case.)
674 		 */
675 		if (pmap_min > pmap_max) {
676 			if (pmap_min < sel_max)
677 				pmap_max = pmap_min;
678 			else if (pmap_max > sel_min)
679 				pmap_min = pmap_max;
680 			else
681 				return ENOMEM;
682 		}
683 
684 		/* Internal validation. */
685 		KDASSERT(pmap_min <= pmap_max);
686 
687 		sel_min = pmap_min;
688 		sel_max = pmap_max;
689 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
690 		sel_min = sel_max - FSPACE_BIASGAP;
691 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
692 		sel_max = sel_min + FSPACE_BIASGAP;
693 
694 #else
695 
696 	if (align > PAGE_SIZE) {
697 		sel_min = roundup(sel_min, align);
698 		sel_max &= ~(align - 1);
699 		if (sel_min > sel_max)
700 			return ENOMEM;
701 
702 		if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) {
703 			if (bias > 0) {
704 				sel_min = roundup(sel_max - FSPACE_BIASGAP,
705 				    align);
706 			} else {
707 				sel_max = (sel_min + FSPACE_BIASGAP) &
708 				    ~(align - 1);
709 			}
710 		}
711 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
712 		sel_min = sel_max - FSPACE_BIASGAP;
713 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
714 		sel_max = sel_min + FSPACE_BIASGAP;
715 
716 #endif
717 
718 	if (sel_min > sel_max)
719 		return ENOMEM;
720 
721 #ifdef DIAGNOSTIC
722 	bad = 0;
723 	/* Lower boundary check. */
724 	if (sel_min < VMMAP_FREE_START(sel)) {
725 		printf("sel_min: 0x%lx, but should be at least 0x%lx\n",
726 		    sel_min, VMMAP_FREE_START(sel));
727 		bad++;
728 	}
729 	/* Upper boundary check. */
730 	if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) {
731 		printf("sel_max: 0x%lx, but should be at most 0x%lx\n",
732 		    sel_max,
733 		    VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0));
734 		bad++;
735 	}
736 	/* Lower boundary alignment. */
737 	if (align != 0 && (sel_min & (align - 1)) != 0) {
738 		printf("sel_min: 0x%lx, not aligned to 0x%lx\n",
739 		    sel_min, align);
740 		bad++;
741 	}
742 	/* Upper boundary alignment. */
743 	if (align != 0 && (sel_max & (align - 1)) != 0) {
744 		printf("sel_max: 0x%lx, not aligned to 0x%lx\n",
745 		    sel_max, align);
746 		bad++;
747 	}
748 	/* Lower boundary PMAP_PREFER check. */
749 	if (pmap_align != 0 && align == 0 &&
750 	    (sel_min & (pmap_align - 1)) != pmap_off) {
751 		printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
752 		    sel_min, sel_min & (pmap_align - 1), pmap_off);
753 		bad++;
754 	}
755 	/* Upper boundary PMAP_PREFER check. */
756 	if (pmap_align != 0 && align == 0 &&
757 	    (sel_max & (pmap_align - 1)) != pmap_off) {
758 		printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
759 		    sel_max, sel_max & (pmap_align - 1), pmap_off);
760 		bad++;
761 	}
762 
763 	if (bad) {
764 		panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, "
765 		    "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, "
766 		    "bias = %d, "
767 		    "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)",
768 		    sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off,
769 		    bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel));
770 	}
771 #endif /* DIAGNOSTIC */
772 
773 	*min = sel_min;
774 	*max = sel_max;
775 	return 0;
776 }
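
/*
 * Worked example (editorial, not part of the original source) for the
 * plain alignment path above, with no PMAP_PREFER constraint, no bias
 * and no guard page: for a free range 0x1000-0x9000, sz = 0x2000 and
 * align = 0x4000, the selection window becomes
 *   sel_min = roundup(0x1000, 0x4000)      = 0x4000
 *   sel_max = (0x9000 - 0x2000) & ~0x3fff  = 0x4000
 * leaving 0x4000 as the only aligned start address that fits.
 */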
777 
778 /*
779  * Test if memory starting at addr with sz bytes is free.
780  *
781  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
782  * the space.
783  * If *start_ptr and *end_ptr are prefilled by the caller, they must be correct.
784  */
785 int
786 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
787     struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
788     vaddr_t addr, vsize_t sz)
789 {
790 	struct uvm_addr_state *free;
791 	struct uvm_map_addr *atree;
792 	struct vm_map_entry *i, *i_end;
793 
794 	if (addr + sz < addr)
795 		return 0;
796 
797 	/*
798 	 * Kernel memory above uvm_maxkaddr is considered unavailable.
799 	 */
800 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
801 		if (addr + sz > uvm_maxkaddr)
802 			return 0;
803 	}
804 
805 	atree = &map->addr;
806 
807 	/*
808 	 * Fill in first, last, so they point at the entries containing the
809 	 * first and last address of the range.
810 	 * Note that if they are not NULL, we don't perform the lookup.
811 	 */
812 	KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
813 	if (*start_ptr == NULL) {
814 		*start_ptr = uvm_map_entrybyaddr(atree, addr);
815 		if (*start_ptr == NULL)
816 			return 0;
817 	} else
818 		KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
819 	if (*end_ptr == NULL) {
820 		if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
821 			*end_ptr = *start_ptr;
822 		else {
823 			*end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
824 			if (*end_ptr == NULL)
825 				return 0;
826 		}
827 	} else
828 		KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
829 
830 	/* Validation. */
831 	KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
832 	KDASSERT((*start_ptr)->start <= addr &&
833 	    VMMAP_FREE_END(*start_ptr) > addr &&
834 	    (*end_ptr)->start < addr + sz &&
835 	    VMMAP_FREE_END(*end_ptr) >= addr + sz);
836 
837 	/*
838 	 * Check that none of the entries intersects with <addr, addr+sz>.
839 	 * Also, if the entry belongs to uaddr_exe or uaddr_brk_stack, it is
840 	 * considered unavailable unless called by those allocators.
841 	 */
842 	i = *start_ptr;
843 	i_end = RBT_NEXT(uvm_map_addr, *end_ptr);
844 	for (; i != i_end;
845 	    i = RBT_NEXT(uvm_map_addr, i)) {
846 		if (i->start != i->end && i->end > addr)
847 			return 0;
848 
849 		/*
850 		 * uaddr_exe and uaddr_brk_stack may only be used
851 		 * by these allocators and the NULL uaddr (i.e. no
852 		 * uaddr).
853 		 * Reject if this requirement is not met.
854 		 */
855 		if (uaddr != NULL) {
856 			free = uvm_map_uaddr_e(map, i);
857 
858 			if (uaddr != free && free != NULL &&
859 			    (free == map->uaddr_exe ||
860 			     free == map->uaddr_brk_stack))
861 				return 0;
862 		}
863 	}
864 
865 	return -1;
866 }
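
/*
 * Usage sketch (editorial, not part of the original source): callers
 * that have not looked up the range yet pass NULL entry pointers and
 * let uvm_map_isavail() fill them in, as the UVM_FLAG_FIXED path of
 * uvm_mapanon() does below.  A zero return means the range overlaps an
 * existing mapping (or lies above uvm_maxkaddr for kernel maps).
 */
#if 0
	struct vm_map_entry *first = NULL, *last = NULL;

	if (!uvm_map_isavail(map, NULL, &first, &last, addr, sz))
		return ENOMEM;
	/* first/last now bracket the free space holding addr..addr+sz */
#endif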
867 
868 /*
869  * Invoke each address selector until an address is found.
870  * Will not invoke uaddr_exe.
871  */
872 int
873 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
874     struct vm_map_entry**last, vaddr_t *addr, vsize_t sz,
875     vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
876 {
877 	struct uvm_addr_state *uaddr;
878 	int i;
879 
880 	/*
881 	 * Allocation for sz bytes at any address,
882 	 * using the addr selectors in order.
883 	 */
884 	for (i = 0; i < nitems(map->uaddr_any); i++) {
885 		uaddr = map->uaddr_any[i];
886 
887 		if (uvm_addr_invoke(map, uaddr, first, last,
888 		    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
889 			return 0;
890 	}
891 
892 	/* Fall back to brk() and stack() address selectors. */
893 	uaddr = map->uaddr_brk_stack;
894 	if (uvm_addr_invoke(map, uaddr, first, last,
895 	    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
896 		return 0;
897 
898 	return ENOMEM;
899 }
900 
901 /* Calculate entry augmentation value. */
902 vsize_t
903 uvm_map_addr_augment_get(struct vm_map_entry *entry)
904 {
905 	vsize_t			 augment;
906 	struct vm_map_entry	*left, *right;
907 
908 	augment = entry->fspace;
909 	if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL)
910 		augment = MAX(augment, left->fspace_augment);
911 	if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
912 		augment = MAX(augment, right->fspace_augment);
913 	return augment;
914 }
915 
916 /*
917  * Update augmentation data in entry.
918  */
919 void
920 uvm_map_addr_augment(struct vm_map_entry *entry)
921 {
922 	vsize_t			 augment;
923 
924 	while (entry != NULL) {
925 		/* Calculate value for augmentation. */
926 		augment = uvm_map_addr_augment_get(entry);
927 
928 		/*
929 		 * Descend update.
930 		 * Once we find an entry that already has the correct value,
931 		 * stop, since it means all its parents will use the correct
932 		 * value too.
933 		 */
934 		if (entry->fspace_augment == augment)
935 			return;
936 		entry->fspace_augment = augment;
937 		entry = RBT_PARENT(uvm_map_addr, entry);
938 	}
939 }
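
/*
 * Note (editorial, not part of the original source): the invariant
 * maintained here is that fspace_augment holds the largest fspace of
 * any entry in the subtree rooted at the entry, which lets free-space
 * searches rule out entire subtrees that cannot satisfy an allocation
 * of the requested size.
 */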
940 
941 /*
942  * uvm_mapanon: establish a valid mapping in map for an anon
943  *
944  * => *addr and sz must be a multiple of PAGE_SIZE.
945  * => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
946  * => map must be unlocked.
947  *
948  * => align: align vaddr, must be a power-of-2.
949  *    Align is only a hint and will be ignored if the alignment fails.
950  */
951 int
952 uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
953     vsize_t align, unsigned int flags)
954 {
955 	struct vm_map_entry	*first, *last, *entry, *new;
956 	struct uvm_map_deadq	 dead;
957 	vm_prot_t		 prot;
958 	vm_prot_t		 maxprot;
959 	vm_inherit_t		 inherit;
960 	int			 advice;
961 	int			 error;
962 	vaddr_t			 pmap_align, pmap_offset;
963 	vaddr_t			 hint;
964 
965 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
966 	KASSERT(map != kernel_map);
967 	KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
968 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
969 	splassert(IPL_NONE);
970 	KASSERT((flags & UVM_FLAG_TRYLOCK) == 0);
971 
972 	/*
973 	 * We use pmap_align and pmap_offset as alignment and offset variables.
974 	 *
975 	 * Because the align parameter takes precedence over pmap prefer,
976 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
977 	 * if pmap_prefer will not align.
978 	 */
979 	pmap_align = MAX(align, PAGE_SIZE);
980 	pmap_offset = 0;
981 
982 	/* Decode parameters. */
983 	prot = UVM_PROTECTION(flags);
984 	maxprot = UVM_MAXPROTECTION(flags);
985 	advice = UVM_ADVICE(flags);
986 	inherit = UVM_INHERIT(flags);
987 	error = 0;
988 	hint = trunc_page(*addr);
989 	TAILQ_INIT(&dead);
990 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
991 	KASSERT((align & (align - 1)) == 0);
992 
993 	/* Check protection. */
994 	if ((prot & maxprot) != prot)
995 		return EACCES;
996 
997 	/*
998 	 * Before grabbing the lock, allocate a map entry for later
999 	 * use to ensure we don't wait for memory while holding the
1000 	 * vm_map_lock.
1001 	 */
1002 	new = uvm_mapent_alloc(map, flags);
1003 	if (new == NULL)
1004 		return(ENOMEM);
1005 
1006 	vm_map_lock(map);
1007 	first = last = NULL;
1008 	if (flags & UVM_FLAG_FIXED) {
1009 		/*
1010 		 * Fixed location.
1011 		 *
1012 		 * Note: we ignore align, pmap_prefer.
1013 		 * Fill in first, last and *addr.
1014 		 */
1015 		KASSERT((*addr & PAGE_MASK) == 0);
1016 
1017 		/* Check that the space is available. */
1018 		if (flags & UVM_FLAG_UNMAP) {
1019 			if ((flags & UVM_FLAG_STACK) &&
1020 			    !uvm_map_is_stack_remappable(map, *addr, sz)) {
1021 				error = EINVAL;
1022 				goto unlock;
1023 			}
1024 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1025 		}
1026 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1027 			error = ENOMEM;
1028 			goto unlock;
1029 		}
1030 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1031 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1032 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1033 		/*
1034 		 * Address used as hint.
1035 		 *
1036 		 * Note: we enforce the alignment restriction,
1037 		 * but ignore pmap_prefer.
1038 		 */
1039 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1040 		/* Run selection algorithm for executables. */
1041 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1042 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1043 
1044 		if (error != 0)
1045 			goto unlock;
1046 	} else {
1047 		/* Update freelists from vmspace. */
1048 		uvm_map_vmspace_update(map, &dead, flags);
1049 
1050 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1051 		    pmap_align, pmap_offset, prot, hint);
1052 
1053 		if (error != 0)
1054 			goto unlock;
1055 	}
1056 
1057 	/* Double-check if selected address doesn't cause overflow. */
1058 	if (*addr + sz < *addr) {
1059 		error = ENOMEM;
1060 		goto unlock;
1061 	}
1062 
1063 	/* If we only want a query, return now. */
1064 	if (flags & UVM_FLAG_QUERY) {
1065 		error = 0;
1066 		goto unlock;
1067 	}
1068 
1069 	/*
1070 	 * Create new entry.
1071 	 * first and last may be invalidated after this call.
1072 	 */
1073 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1074 	    new);
1075 	if (entry == NULL) {
1076 		error = ENOMEM;
1077 		goto unlock;
1078 	}
1079 	new = NULL;
1080 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1081 	entry->object.uvm_obj = NULL;
1082 	entry->offset = 0;
1083 	entry->protection = prot;
1084 	entry->max_protection = maxprot;
1085 	entry->inheritance = inherit;
1086 	entry->wired_count = 0;
1087 	entry->advice = advice;
1088 	if (prot & PROT_WRITE)
1089 		map->wserial++;
1090 	if (flags & UVM_FLAG_SYSCALL) {
1091 		entry->etype |= UVM_ET_SYSCALL;
1092 		map->wserial++;
1093 	}
1094 	if (flags & UVM_FLAG_STACK) {
1095 		entry->etype |= UVM_ET_STACK;
1096 		if (flags & (UVM_FLAG_FIXED | UVM_FLAG_UNMAP))
1097 			map->sserial++;
1098 	}
1099 	if (flags & UVM_FLAG_COPYONW) {
1100 		entry->etype |= UVM_ET_COPYONWRITE;
1101 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1102 			entry->etype |= UVM_ET_NEEDSCOPY;
1103 	}
1104 	if (flags & UVM_FLAG_CONCEAL)
1105 		entry->etype |= UVM_ET_CONCEAL;
1106 	if (flags & UVM_FLAG_OVERLAY) {
1107 		KERNEL_LOCK();
1108 		entry->aref.ar_pageoff = 0;
1109 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1110 		KERNEL_UNLOCK();
1111 	}
1112 
1113 	/* Update map and process statistics. */
1114 	map->size += sz;
1115 	if (prot != PROT_NONE) {
1116 		((struct vmspace *)map)->vm_dused +=
1117 		    uvmspace_dused(map, *addr, *addr + sz);
1118 	}
1119 
1120 unlock:
1121 	vm_map_unlock(map);
1122 
1123 	/*
1124 	 * Remove dead entries.
1125 	 *
1126 	 * Dead entries may be the result of merging.
1127 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1128 	 * destroy free-space entries.
1129 	 */
1130 	uvm_unmap_detach(&dead, 0);
1131 
1132 	if (new)
1133 		uvm_mapent_free(new);
1134 	return error;
1135 }
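
/*
 * Usage sketch (editorial, not part of the original source): a typical
 * anonymous mapping request as issued on behalf of a process, assuming
 * the UVM_MAPFLAG() encoding from uvm_extern.h and hypothetical `p`
 * (struct proc *) and `size` variables.  *addr is only a hint here
 * because UVM_FLAG_FIXED is not passed.
 */
#if 0
	vaddr_t addr = 0;	/* let the address selectors choose */
	int error;

	error = uvm_mapanon(&p->p_vmspace->vm_map, &addr, round_page(size),
	    0, UVM_MAPFLAG(PROT_READ | PROT_WRITE,
	    PROT_READ | PROT_WRITE | PROT_EXEC,
	    MAP_INHERIT_COPY, MADV_NORMAL, UVM_FLAG_COPYONW));
#endif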
1136 
1137 /*
1138  * uvm_map: establish a valid mapping in map
1139  *
1140  * => *addr and sz must be a multiple of PAGE_SIZE.
1141  * => map must be unlocked.
1142  * => <uobj,uoffset> value meanings (4 cases):
1143  *	[1] <NULL,uoffset>		== uoffset is a hint for PMAP_PREFER
1144  *	[2] <NULL,UVM_UNKNOWN_OFFSET>	== don't PMAP_PREFER
1145  *	[3] <uobj,uoffset>		== normal mapping
1146  *	[4] <uobj,UVM_UNKNOWN_OFFSET>	== uvm_map finds offset based on VA
1147  *
1148  *   case [4] is for kernel mappings where we don't know the offset until
1149  *   we've found a virtual address.   note that kernel object offsets are
1150  *   always relative to vm_map_min(kernel_map).
1151  *
1152  * => align: align vaddr, must be a power-of-2.
1153  *    Align is only a hint and will be ignored if the alignment fails.
1154  */
1155 int
1156 uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
1157     struct uvm_object *uobj, voff_t uoffset,
1158     vsize_t align, unsigned int flags)
1159 {
1160 	struct vm_map_entry	*first, *last, *entry, *new;
1161 	struct uvm_map_deadq	 dead;
1162 	vm_prot_t		 prot;
1163 	vm_prot_t		 maxprot;
1164 	vm_inherit_t		 inherit;
1165 	int			 advice;
1166 	int			 error;
1167 	vaddr_t			 pmap_align, pmap_offset;
1168 	vaddr_t			 hint;
1169 
1170 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1171 		splassert(IPL_NONE);
1172 	else
1173 		splassert(IPL_VM);
1174 
1175 	/*
1176 	 * We use pmap_align and pmap_offset as alignment and offset variables.
1177 	 *
1178 	 * Because the align parameter takes precedence over pmap prefer,
1179 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
1180 	 * if pmap_prefer will not align.
1181 	 */
1182 	if (uoffset == UVM_UNKNOWN_OFFSET) {
1183 		pmap_align = MAX(align, PAGE_SIZE);
1184 		pmap_offset = 0;
1185 	} else {
1186 		pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
1187 		pmap_offset = PMAP_PREFER_OFFSET(uoffset);
1188 
1189 		if (align == 0 ||
1190 		    (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
1191 			/* pmap_offset satisfies align, no change. */
1192 		} else {
1193 			/* Align takes precedence over pmap prefer. */
1194 			pmap_align = align;
1195 			pmap_offset = 0;
1196 		}
1197 	}
1198 
1199 	/* Decode parameters. */
1200 	prot = UVM_PROTECTION(flags);
1201 	maxprot = UVM_MAXPROTECTION(flags);
1202 	advice = UVM_ADVICE(flags);
1203 	inherit = UVM_INHERIT(flags);
1204 	error = 0;
1205 	hint = trunc_page(*addr);
1206 	TAILQ_INIT(&dead);
1207 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
1208 	KASSERT((align & (align - 1)) == 0);
1209 
1210 	/* Holes are incompatible with other types of mappings. */
1211 	if (flags & UVM_FLAG_HOLE) {
1212 		KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
1213 		    (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
1214 	}
1215 
1216 	/* Unset hint for kernel_map non-fixed allocations. */
1217 	if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
1218 		hint = 0;
1219 
1220 	/* Check protection. */
1221 	if ((prot & maxprot) != prot)
1222 		return EACCES;
1223 
1224 	if (map == kernel_map &&
1225 	    (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
1226 		panic("uvm_map: kernel map W^X violation requested");
1227 
1228 	/*
1229 	 * Before grabbing the lock, allocate a map entry for later
1230 	 * use to ensure we don't wait for memory while holding the
1231 	 * vm_map_lock.
1232 	 */
1233 	new = uvm_mapent_alloc(map, flags);
1234 	if (new == NULL)
1235 		return(ENOMEM);
1236 
1237 	if (flags & UVM_FLAG_TRYLOCK) {
1238 		if (vm_map_lock_try(map) == FALSE) {
1239 			error = EFAULT;
1240 			goto out;
1241 		}
1242 	} else {
1243 		vm_map_lock(map);
1244 	}
1245 
1246 	first = last = NULL;
1247 	if (flags & UVM_FLAG_FIXED) {
1248 		/*
1249 		 * Fixed location.
1250 		 *
1251 		 * Note: we ignore align, pmap_prefer.
1252 		 * Fill in first, last and *addr.
1253 		 */
1254 		KASSERT((*addr & PAGE_MASK) == 0);
1255 
1256 		/*
1257 		 * Grow pmap to include allocated address.
1258 		 * If the growth fails, the allocation will fail too.
1259 		 */
1260 		if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
1261 		    uvm_maxkaddr < (*addr + sz)) {
1262 			uvm_map_kmem_grow(map, &dead,
1263 			    *addr + sz - uvm_maxkaddr, flags);
1264 		}
1265 
1266 		/* Check that the space is available. */
1267 		if (flags & UVM_FLAG_UNMAP)
1268 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1269 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1270 			error = ENOMEM;
1271 			goto unlock;
1272 		}
1273 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1274 	    (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE &&
1275 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1276 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1277 		/*
1278 		 * Address used as hint.
1279 		 *
1280 		 * Note: we enforce the alignment restriction,
1281 		 * but ignore pmap_prefer.
1282 		 */
1283 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1284 		/* Run selection algorithm for executables. */
1285 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1286 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1287 
1288 		/* Grow kernel memory and try again. */
1289 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1290 			uvm_map_kmem_grow(map, &dead, sz, flags);
1291 
1292 			error = uvm_addr_invoke(map, map->uaddr_exe,
1293 			    &first, &last, addr, sz,
1294 			    pmap_align, pmap_offset, prot, hint);
1295 		}
1296 
1297 		if (error != 0)
1298 			goto unlock;
1299 	} else {
1300 		/* Update freelists from vmspace. */
1301 		if (map->flags & VM_MAP_ISVMSPACE)
1302 			uvm_map_vmspace_update(map, &dead, flags);
1303 
1304 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1305 		    pmap_align, pmap_offset, prot, hint);
1306 
1307 		/* Grow kernel memory and try again. */
1308 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1309 			uvm_map_kmem_grow(map, &dead, sz, flags);
1310 
1311 			error = uvm_map_findspace(map, &first, &last, addr, sz,
1312 			    pmap_align, pmap_offset, prot, hint);
1313 		}
1314 
1315 		if (error != 0)
1316 			goto unlock;
1317 	}
1318 
1319 	/* Double-check if selected address doesn't cause overflow. */
1320 	if (*addr + sz < *addr) {
1321 		error = ENOMEM;
1322 		goto unlock;
1323 	}
1324 
1325 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
1326 	    uvm_maxkaddr >= *addr + sz);
1327 
1328 	/* If we only want a query, return now. */
1329 	if (flags & UVM_FLAG_QUERY) {
1330 		error = 0;
1331 		goto unlock;
1332 	}
1333 
1334 	if (uobj == NULL)
1335 		uoffset = 0;
1336 	else if (uoffset == UVM_UNKNOWN_OFFSET) {
1337 		KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1338 		uoffset = *addr - vm_map_min(kernel_map);
1339 	}
1340 
1341 	/*
1342 	 * Create new entry.
1343 	 * first and last may be invalidated after this call.
1344 	 */
1345 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1346 	    new);
1347 	if (entry == NULL) {
1348 		error = ENOMEM;
1349 		goto unlock;
1350 	}
1351 	new = NULL;
1352 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1353 	entry->object.uvm_obj = uobj;
1354 	entry->offset = uoffset;
1355 	entry->protection = prot;
1356 	entry->max_protection = maxprot;
1357 	entry->inheritance = inherit;
1358 	entry->wired_count = 0;
1359 	entry->advice = advice;
1360 	if (prot & PROT_WRITE)
1361 		map->wserial++;
1362 	if (flags & UVM_FLAG_SYSCALL) {
1363 		entry->etype |= UVM_ET_SYSCALL;
1364 		map->wserial++;
1365 	}
1366 	if (flags & UVM_FLAG_STACK) {
1367 		entry->etype |= UVM_ET_STACK;
1368 		if (flags & UVM_FLAG_UNMAP)
1369 			map->sserial++;
1370 	}
1371 	if (uobj)
1372 		entry->etype |= UVM_ET_OBJ;
1373 	else if (flags & UVM_FLAG_HOLE)
1374 		entry->etype |= UVM_ET_HOLE;
1375 	if (flags & UVM_FLAG_NOFAULT)
1376 		entry->etype |= UVM_ET_NOFAULT;
1377 	if (flags & UVM_FLAG_WC)
1378 		entry->etype |= UVM_ET_WC;
1379 	if (flags & UVM_FLAG_COPYONW) {
1380 		entry->etype |= UVM_ET_COPYONWRITE;
1381 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1382 			entry->etype |= UVM_ET_NEEDSCOPY;
1383 	}
1384 	if (flags & UVM_FLAG_CONCEAL)
1385 		entry->etype |= UVM_ET_CONCEAL;
1386 	if (flags & UVM_FLAG_OVERLAY) {
1387 		entry->aref.ar_pageoff = 0;
1388 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1389 	}
1390 
1391 	/* Update map and process statistics. */
1392 	if (!(flags & UVM_FLAG_HOLE)) {
1393 		map->size += sz;
1394 		if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL &&
1395 		    prot != PROT_NONE) {
1396 			((struct vmspace *)map)->vm_dused +=
1397 			    uvmspace_dused(map, *addr, *addr + sz);
1398 		}
1399 	}
1400 
1401 	/*
1402 	 * Try to merge entry.
1403 	 *
1404 	 * Userland allocations are kept separated most of the time.
1405 	 * Forego the effort of merging what most of the time can't be merged
1406 	 * and only try the merge if it concerns a kernel entry.
1407 	 */
1408 	if ((flags & UVM_FLAG_NOMERGE) == 0 &&
1409 	    (map->flags & VM_MAP_ISVMSPACE) == 0)
1410 		uvm_mapent_tryjoin(map, entry, &dead);
1411 
1412 unlock:
1413 	vm_map_unlock(map);
1414 
1415 	/*
1416 	 * Remove dead entries.
1417 	 *
1418 	 * Dead entries may be the result of merging.
1419 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1420 	 * destroy free-space entries.
1421 	 */
1422 	if (map->flags & VM_MAP_INTRSAFE)
1423 		uvm_unmap_detach_intrsafe(&dead);
1424 	else
1425 		uvm_unmap_detach(&dead, 0);
1426 out:
1427 	if (new)
1428 		uvm_mapent_free(new);
1429 	return error;
1430 }
1431 
1432 /*
1433  * True iff e1 and e2 can be joined together.
1434  */
1435 int
1436 uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
1437     struct vm_map_entry *e2)
1438 {
1439 	KDASSERT(e1 != NULL && e2 != NULL);
1440 
1441 	/* Must be the same entry type and not have free memory between. */
1442 	if (e1->etype != e2->etype || e1->end != e2->start)
1443 		return 0;
1444 
1445 	/* Submaps are never joined. */
1446 	if (UVM_ET_ISSUBMAP(e1))
1447 		return 0;
1448 
1449 	/* Never merge wired memory. */
1450 	if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
1451 		return 0;
1452 
1453 	/* Protection, inheritance and advice must be equal. */
1454 	if (e1->protection != e2->protection ||
1455 	    e1->max_protection != e2->max_protection ||
1456 	    e1->inheritance != e2->inheritance ||
1457 	    e1->advice != e2->advice)
1458 		return 0;
1459 
1460 	/* If uvm_object: object itself and offsets within object must match. */
1461 	if (UVM_ET_ISOBJ(e1)) {
1462 		if (e1->object.uvm_obj != e2->object.uvm_obj)
1463 			return 0;
1464 		if (e1->offset + (e1->end - e1->start) != e2->offset)
1465 			return 0;
1466 	}
1467 
1468 	/*
1469 	 * Cannot join shared amaps.
1470 	 * Note: no need to lock amap to look at refs, since we don't care
1471 	 * about its exact value.
1472 	 * If it is 1 (i.e. we have the only reference) it will stay there.
1473 	 */
1474 	if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
1475 		return 0;
1476 	if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
1477 		return 0;
1478 
1479 	/* Apparently, e1 and e2 match. */
1480 	return 1;
1481 }
1482 
1483 /*
1484  * Join support function.
1485  *
1486  * Returns the merged entry on success.
1487  * Returns NULL if the merge failed.
1488  */
1489 struct vm_map_entry*
1490 uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
1491     struct vm_map_entry *e2, struct uvm_map_deadq *dead)
1492 {
1493 	struct uvm_addr_state *free;
1494 
1495 	/*
1496 	 * Merging is not supported when e1 contains an amap.  This should
1497 	 * never happen anyway, because only kernel entries are merged and
1498 	 * these do not contain amaps.
1499 	 * e2 contains no real information in its amap, so it can be
1500 	 * erased immediately.
1501 	 * (The KASSERT below enforces the e1 requirement.)
1502 	 */
1503 	KASSERT(e1->aref.ar_amap == NULL);
1504 
1505 	/*
1506 	 * Don't drop obj reference:
1507 	 * uvm_unmap_detach will do this for us.
1508 	 */
1509 	free = uvm_map_uaddr_e(map, e1);
1510 	uvm_mapent_free_remove(map, free, e1);
1511 
1512 	free = uvm_map_uaddr_e(map, e2);
1513 	uvm_mapent_free_remove(map, free, e2);
1514 	uvm_mapent_addr_remove(map, e2);
1515 	e1->end = e2->end;
1516 	e1->guard = e2->guard;
1517 	e1->fspace = e2->fspace;
1518 	uvm_mapent_free_insert(map, free, e1);
1519 
1520 	DEAD_ENTRY_PUSH(dead, e2);
1521 	return e1;
1522 }
1523 
1524 /*
1525  * Attempt forward and backward joining of entry.
1526  *
1527  * Returns entry after joins.
1528  * We are guaranteed that the amap of entry is either non-existent or
1529  * has never been used.
1530  */
1531 struct vm_map_entry*
1532 uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
1533     struct uvm_map_deadq *dead)
1534 {
1535 	struct vm_map_entry *other;
1536 	struct vm_map_entry *merged;
1537 
1538 	/* Merge with previous entry. */
1539 	other = RBT_PREV(uvm_map_addr, entry);
1540 	if (other && uvm_mapent_isjoinable(map, other, entry)) {
1541 		merged = uvm_mapent_merge(map, other, entry, dead);
1542 		if (merged)
1543 			entry = merged;
1544 	}
1545 
1546 	/*
1547 	 * Merge with next entry.
1548 	 *
1549 	 * Because amap can only extend forward and the next entry
1550 	 * probably contains sensible info, only perform forward merging
1551 	 * in the absence of an amap.
1552 	 */
1553 	other = RBT_NEXT(uvm_map_addr, entry);
1554 	if (other && entry->aref.ar_amap == NULL &&
1555 	    other->aref.ar_amap == NULL &&
1556 	    uvm_mapent_isjoinable(map, entry, other)) {
1557 		merged = uvm_mapent_merge(map, entry, other, dead);
1558 		if (merged)
1559 			entry = merged;
1560 	}
1561 
1562 	return entry;
1563 }
1564 
1565 /*
1566  * Kill entries that are no longer in a map.
1567  */
1568 void
1569 uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
1570 {
1571 	struct vm_map_entry *entry, *tmp;
1572 	int waitok = flags & UVM_PLA_WAITOK;
1573 
1574 	TAILQ_FOREACH_SAFE(entry, deadq, dfree.deadq, tmp) {
1575 		/* Skip entries for which we have to grab the kernel lock. */
1576 		if (entry->aref.ar_amap || UVM_ET_ISSUBMAP(entry) ||
1577 		    UVM_ET_ISOBJ(entry))
1578 			continue;
1579 
1580 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1581 		uvm_mapent_free(entry);
1582 	}
1583 
1584 	if (TAILQ_EMPTY(deadq))
1585 		return;
1586 
1587 	KERNEL_LOCK();
1588 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1589 		if (waitok)
1590 			uvm_pause();
1591 		/* Drop reference to amap, if we've got one. */
1592 		if (entry->aref.ar_amap)
1593 			amap_unref(entry->aref.ar_amap,
1594 			    entry->aref.ar_pageoff,
1595 			    atop(entry->end - entry->start),
1596 			    flags & AMAP_REFALL);
1597 
1598 		/* Drop reference to our backing object, if we've got one. */
1599 		if (UVM_ET_ISSUBMAP(entry)) {
1600 			/* ... unlikely to happen, but play it safe */
1601 			uvm_map_deallocate(entry->object.sub_map);
1602 		} else if (UVM_ET_ISOBJ(entry) &&
1603 		    entry->object.uvm_obj->pgops->pgo_detach) {
1604 			entry->object.uvm_obj->pgops->pgo_detach(
1605 			    entry->object.uvm_obj);
1606 		}
1607 
1608 		/* Step to next. */
1609 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1610 		uvm_mapent_free(entry);
1611 	}
1612 	KERNEL_UNLOCK();
1613 }
1614 
1615 void
1616 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
1617 {
1618 	struct vm_map_entry *entry;
1619 
1620 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1621 		KASSERT(entry->aref.ar_amap == NULL);
1622 		KASSERT(!UVM_ET_ISSUBMAP(entry));
1623 		KASSERT(!UVM_ET_ISOBJ(entry));
1624 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1625 		uvm_mapent_free(entry);
1626 	}
1627 }
1628 
1629 /*
1630  * Create and insert new entry.
1631  *
1632  * Returned entry contains new addresses and is inserted properly in the tree.
1633  * first and last are (probably) no longer valid.
1634  */
1635 struct vm_map_entry*
1636 uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
1637     struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
1638     struct uvm_map_deadq *dead, struct vm_map_entry *new)
1639 {
1640 	struct vm_map_entry *entry, *prev;
1641 	struct uvm_addr_state *free;
1642 	vaddr_t min, max;	/* free space boundaries for new entry */
1643 
1644 	KDASSERT(map != NULL);
1645 	KDASSERT(first != NULL);
1646 	KDASSERT(last != NULL);
1647 	KDASSERT(dead != NULL);
1648 	KDASSERT(sz > 0);
1649 	KDASSERT(addr + sz > addr);
1650 	KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
1651 	KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
1652 	KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
1653 	uvm_tree_sanity(map, __FILE__, __LINE__);
1654 
1655 	min = addr + sz;
1656 	max = VMMAP_FREE_END(last);
1657 
1658 	/* Initialize new entry. */
1659 	if (new == NULL)
1660 		entry = uvm_mapent_alloc(map, flags);
1661 	else
1662 		entry = new;
1663 	if (entry == NULL)
1664 		return NULL;
1665 	entry->offset = 0;
1666 	entry->etype = 0;
1667 	entry->wired_count = 0;
1668 	entry->aref.ar_pageoff = 0;
1669 	entry->aref.ar_amap = NULL;
1670 
1671 	entry->start = addr;
1672 	entry->end = min;
1673 	entry->guard = 0;
1674 	entry->fspace = 0;
1675 
1676 	/* Reset free space in first. */
1677 	free = uvm_map_uaddr_e(map, first);
1678 	uvm_mapent_free_remove(map, free, first);
1679 	first->guard = 0;
1680 	first->fspace = 0;
1681 
1682 	/*
1683 	 * Remove all entries that are fully replaced.
1684 	 * We are iterating using last in reverse order.
1685 	 */
1686 	for (; first != last; last = prev) {
1687 		prev = RBT_PREV(uvm_map_addr, last);
1688 
1689 		KDASSERT(last->start == last->end);
1690 		free = uvm_map_uaddr_e(map, last);
1691 		uvm_mapent_free_remove(map, free, last);
1692 		uvm_mapent_addr_remove(map, last);
1693 		DEAD_ENTRY_PUSH(dead, last);
1694 	}
1695 	/* Remove first if it is entirely inside <addr, addr+sz>.  */
1696 	if (first->start == addr) {
1697 		uvm_mapent_addr_remove(map, first);
1698 		DEAD_ENTRY_PUSH(dead, first);
1699 	} else {
1700 		uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
1701 		    addr, flags);
1702 	}
1703 
1704 	/* Finally, link in entry. */
1705 	uvm_mapent_addr_insert(map, entry);
1706 	uvm_map_fix_space(map, entry, min, max, flags);
1707 
1708 	uvm_tree_sanity(map, __FILE__, __LINE__);
1709 	return entry;
1710 }
1711 
1712 
1713 /*
1714  * uvm_mapent_alloc: allocate a map entry
1715  */
1716 struct vm_map_entry *
1717 uvm_mapent_alloc(struct vm_map *map, int flags)
1718 {
1719 	struct vm_map_entry *me, *ne;
1720 	int pool_flags;
1721 	int i;
1722 
1723 	pool_flags = PR_WAITOK;
1724 	if (flags & UVM_FLAG_TRYLOCK)
1725 		pool_flags = PR_NOWAIT;
1726 
1727 	if (map->flags & VM_MAP_INTRSAFE || cold) {
1728 		mtx_enter(&uvm_kmapent_mtx);
1729 		if (SLIST_EMPTY(&uvm.kentry_free)) {
1730 			ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
1731 			    &kd_nowait);
1732 			if (ne == NULL)
1733 				panic("uvm_mapent_alloc: cannot allocate map "
1734 				    "entry");
1735 			for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) {
1736 				SLIST_INSERT_HEAD(&uvm.kentry_free,
1737 				    &ne[i], daddrs.addr_kentry);
1738 			}
1739 			if (ratecheck(&uvm_kmapent_last_warn_time,
1740 			    &uvm_kmapent_warn_rate))
1741 				printf("uvm_mapent_alloc: out of static "
1742 				    "map entries\n");
1743 		}
1744 		me = SLIST_FIRST(&uvm.kentry_free);
1745 		SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry);
1746 		uvmexp.kmapent++;
1747 		mtx_leave(&uvm_kmapent_mtx);
1748 		me->flags = UVM_MAP_STATIC;
1749 	} else if (map == kernel_map) {
1750 		splassert(IPL_NONE);
1751 		me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
1752 		if (me == NULL)
1753 			goto out;
1754 		me->flags = UVM_MAP_KMEM;
1755 	} else {
1756 		splassert(IPL_NONE);
1757 		me = pool_get(&uvm_map_entry_pool, pool_flags);
1758 		if (me == NULL)
1759 			goto out;
1760 		me->flags = 0;
1761 	}
1762 
1763 	RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF);
1764 out:
1765 	return (me);
1766 }
1767 
1768 /*
1769  * uvm_mapent_free: free map entry
1770  *
1771  * => XXX: static pool for kernel map?
1772  */
1773 void
1774 uvm_mapent_free(struct vm_map_entry *me)
1775 {
1776 	if (me->flags & UVM_MAP_STATIC) {
1777 		mtx_enter(&uvm_kmapent_mtx);
1778 		SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry);
1779 		uvmexp.kmapent--;
1780 		mtx_leave(&uvm_kmapent_mtx);
1781 	} else if (me->flags & UVM_MAP_KMEM) {
1782 		splassert(IPL_NONE);
1783 		pool_put(&uvm_map_entry_kmem_pool, me);
1784 	} else {
1785 		splassert(IPL_NONE);
1786 		pool_put(&uvm_map_entry_pool, me);
1787 	}
1788 }
1789 
1790 /*
1791  * uvm_map_lookup_entry: find map entry at or before an address.
1792  *
1793  * => map must at least be read-locked by caller
1794  * => entry is returned in "entry"
1795  * => return value is true if address is in the returned entry
1796  * => ET_HOLE entries are considered not to contain a mapping; FALSE is
1797  *    returned for them.
1798  */
1799 boolean_t
1800 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1801     struct vm_map_entry **entry)
1802 {
1803 	*entry = uvm_map_entrybyaddr(&map->addr, address);
1804 	return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
1805 	    (*entry)->start <= address && (*entry)->end > address;
1806 }
1807 
1808 /*
1809  * Stack must be in a MAP_STACK entry. PROT_NONE indicates stack not yet
1810  * grown -- then uvm_map_check_region_range() should not cache the entry
1811  * because growth won't be seen.
1812  */
1813 int
1814 uvm_map_inentry_sp(vm_map_entry_t entry)
1815 {
1816 	if ((entry->etype & UVM_ET_STACK) == 0) {
1817 		if (entry->protection == PROT_NONE)
1818 			return (-1);	/* don't update range */
1819 		return (0);
1820 	}
1821 	return (1);
1822 }
1823 
1824 /*
1825  * The system call must not come from a writable entry; that would violate
1826  * W^X.  (It would be nice if we could spot aliasing, which is also bad.)
1827  *
1828  * The system call must come from a syscall-labeled entry (these are the
1829  * text regions of the main program, sigtramp, ld.so, or libc).
1830  */
1831 int
1832 uvm_map_inentry_pc(vm_map_entry_t entry)
1833 {
1834 	if (entry->protection & PROT_WRITE)
1835 		return (0);	/* not permitted */
1836 	if ((entry->etype & UVM_ET_SYSCALL) == 0)
1837 		return (0);	/* not permitted */
1838 	return (1);
1839 }
1840 
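/*
 * Return non-zero when the cached range in "ie" must be re-validated:
 * the serial number is stale, the cache was never filled in, or the
 * address falls outside the remembered range.
 */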
1841 int
1842 uvm_map_inentry_recheck(u_long serial, vaddr_t addr, struct p_inentry *ie)
1843 {
1844 	return (serial != ie->ie_serial || ie->ie_start == 0 ||
1845 	    addr < ie->ie_start || addr >= ie->ie_end);
1846 }
1847 
1848 /*
1849  * Inside a vm_map, find the entry containing the given (register) address
1850  * and verify it via the supplied function.  Remember the low and high
1851  * addresses of the region if valid and return TRUE, else return FALSE.
1852  */
1853 boolean_t
1854 uvm_map_inentry_fix(struct proc *p, struct p_inentry *ie, vaddr_t addr,
1855     int (*fn)(vm_map_entry_t), u_long serial)
1856 {
1857 	vm_map_t map = &p->p_vmspace->vm_map;
1858 	vm_map_entry_t entry;
1859 	int ret;
1860 
1861 	if (addr < map->min_offset || addr >= map->max_offset)
1862 		return (FALSE);
1863 
1864 	/* lock map */
1865 	vm_map_lock_read(map);
1866 
1867 	/* lookup */
1868 	if (!uvm_map_lookup_entry(map, trunc_page(addr), &entry)) {
1869 		vm_map_unlock_read(map);
1870 		return (FALSE);
1871 	}
1872 
1873 	ret = (*fn)(entry);
1874 	if (ret == 0) {
1875 		vm_map_unlock_read(map);
1876 		return (FALSE);
1877 	} else if (ret == 1) {
1878 		ie->ie_start = entry->start;
1879 		ie->ie_end = entry->end;
1880 		ie->ie_serial = serial;
1881 	} else {
1882 		/* do not update, re-check later */
1883 	}
1884 	vm_map_unlock_read(map);
1885 	return (TRUE);
1886 }
1887 
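/*
 * Check that addr (a stack or program-counter value) lies within an
 * acceptable map entry, refreshing the cached range when the recheck
 * says it is stale.  On failure the event is logged using fmt and the
 * process receives SIGSEGV.
 */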
1888 boolean_t
1889 uvm_map_inentry(struct proc *p, struct p_inentry *ie, vaddr_t addr,
1890     const char *fmt, int (*fn)(vm_map_entry_t), u_long serial)
1891 {
1892 	union sigval sv;
1893 	boolean_t ok = TRUE;
1894 
1895 	if (uvm_map_inentry_recheck(serial, addr, ie)) {
1896 		ok = uvm_map_inentry_fix(p, ie, addr, fn, serial);
1897 		if (!ok) {
1898 			KERNEL_LOCK();
1899 			printf(fmt, p->p_p->ps_comm, p->p_p->ps_pid, p->p_tid,
1900 			    addr, ie->ie_start, ie->ie_end);
1901 			p->p_p->ps_acflag |= AMAP;
1902 			sv.sival_ptr = (void *)PROC_PC(p);
1903 			trapsignal(p, SIGSEGV, 0, SEGV_ACCERR, sv);
1904 			KERNEL_UNLOCK();
1905 		}
1906 	}
1907 	return (ok);
1908 }
1909 
1910 /*
1911  * Check whether the given address range can be converted to a MAP_STACK
1912  * mapping.
1913  *
1914  * Must be called with map locked.
1915  */
1916 boolean_t
1917 uvm_map_is_stack_remappable(struct vm_map *map, vaddr_t addr, vaddr_t sz)
1918 {
1919 	vaddr_t end = addr + sz;
1920 	struct vm_map_entry *first, *iter, *prev = NULL;
1921 
1922 	if (!uvm_map_lookup_entry(map, addr, &first)) {
1923 		printf("map stack 0x%lx-0x%lx of map %p failed: no mapping\n",
1924 		    addr, end, map);
1925 		return FALSE;
1926 	}
1927 
1928 	/*
1929 	 * Check that the address range exists and is contiguous.
1930 	 */
1931 	for (iter = first; iter != NULL && iter->start < end;
1932 	    prev = iter, iter = RBT_NEXT(uvm_map_addr, iter)) {
1933 		/*
1934 		 * Make sure that we do not have holes in the range.
1935 		 */
1936 #if 0
1937 		if (prev != NULL) {
1938 			printf("prev->start 0x%lx, prev->end 0x%lx, "
1939 			    "iter->start 0x%lx, iter->end 0x%lx\n",
1940 			    prev->start, prev->end, iter->start, iter->end);
1941 		}
1942 #endif
1943 
1944 		if (prev != NULL && prev->end != iter->start) {
1945 			printf("map stack 0x%lx-0x%lx of map %p failed: "
1946 			    "hole in range\n", addr, end, map);
1947 			return FALSE;
1948 		}
1949 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) {
1950 			printf("map stack 0x%lx-0x%lx of map %p failed: "
1951 			    "hole in range\n", addr, end, map);
1952 			return FALSE;
1953 		}
1954 	}
1955 
1956 	return TRUE;
1957 }
1958 
1959 /*
1960  * Remap the middle-pages of an existing mapping as a stack range.
1961  * If there exists a previous contiguous mapping with the given range
1962  * [addr, addr + sz), with protection PROT_READ|PROT_WRITE, then the
1963  * mapping is dropped, and a new anon mapping is created and marked as
1964  * a stack.
1965  *
1966  * Must be called with map unlocked.
1967  */
1968 int
1969 uvm_map_remap_as_stack(struct proc *p, vaddr_t addr, vaddr_t sz)
1970 {
1971 	vm_map_t map = &p->p_vmspace->vm_map;
1972 	vaddr_t start, end;
1973 	int error;
1974 	int flags = UVM_MAPFLAG(PROT_READ | PROT_WRITE,
1975 	    PROT_READ | PROT_WRITE | PROT_EXEC,
1976 	    MAP_INHERIT_COPY, MADV_NORMAL,
1977 	    UVM_FLAG_STACK | UVM_FLAG_FIXED | UVM_FLAG_UNMAP |
1978 	    UVM_FLAG_COPYONW);
1979 
1980 	start = round_page(addr);
1981 	end = trunc_page(addr + sz);
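	/*
	 * Only the middle pages are remapped (see above): when the range
	 * is already page-aligned at the stack's growth end, leave that
	 * outermost page out of the region that gets remapped.
	 */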
1982 #ifdef MACHINE_STACK_GROWS_UP
1983 	if (end == addr + sz)
1984 		end -= PAGE_SIZE;
1985 #else
1986 	if (start == addr)
1987 		start += PAGE_SIZE;
1988 #endif
1989 
1990 	if (start < map->min_offset || end >= map->max_offset || end < start)
1991 		return EINVAL;
1992 
1993 	error = uvm_mapanon(map, &start, end - start, 0, flags);
1994 	if (error != 0)
1995 		printf("map stack for pid %d failed\n", p->p_p->ps_pid);
1996 
1997 	return error;
1998 }
1999 
2000 /*
2001  * uvm_map_pie: return a random load address for a PIE executable
2002  * properly aligned.
2003  */
2004 #ifndef VM_PIE_MAX_ADDR
2005 #define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
2006 #endif
2007 
2008 #ifndef VM_PIE_MIN_ADDR
2009 #define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
2010 #endif
2011 
2012 #ifndef VM_PIE_MIN_ALIGN
2013 #define VM_PIE_MIN_ALIGN PAGE_SIZE
2014 #endif
2015 
2016 vaddr_t
2017 uvm_map_pie(vaddr_t align)
2018 {
2019 	vaddr_t addr, space, min;
2020 
2021 	align = MAX(align, VM_PIE_MIN_ALIGN);
2022 
2023 	/* round up to next alignment */
2024 	min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
2025 
2026 	if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
2027 		return (align);
2028 
2029 	space = (VM_PIE_MAX_ADDR - min) / align;
2030 	space = MIN(space, (u_int32_t)-1);
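	/*
	 * space is the number of align-sized slots in [min, VM_PIE_MAX_ADDR),
	 * clamped to 32 bits for arc4random_uniform(); e.g. a 256 MB window
	 * with 1 MB alignment yields 256 possible load addresses.
	 */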
2031 
2032 	addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
2033 	addr += min;
2034 
2035 	return (addr);
2036 }
2037 
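/*
 * uvm_unmap: remove the mappings in [start, end) from map.
 *
 * The map is only locked for the removal itself; the dead entries are
 * detached afterwards, outside the map lock.
 */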
2038 void
2039 uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
2040 {
2041 	struct uvm_map_deadq dead;
2042 
2043 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
2044 	    (end & (vaddr_t)PAGE_MASK) == 0);
2045 	TAILQ_INIT(&dead);
2046 	vm_map_lock(map);
2047 	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
2048 	vm_map_unlock(map);
2049 
2050 	if (map->flags & VM_MAP_INTRSAFE)
2051 		uvm_unmap_detach_intrsafe(&dead);
2052 	else
2053 		uvm_unmap_detach(&dead, 0);
2054 }
2055 
2056 /*
2057  * Mark entry as free.
2058  *
2059  * entry will be put on the dead list.
2060  * The free space will be merged into the previous or a new entry,
2061  * unless markfree is false.
2062  */
2063 void
2064 uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
2065     struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
2066     boolean_t markfree)
2067 {
2068 	struct uvm_addr_state	*free;
2069 	struct vm_map_entry	*prev;
2070 	vaddr_t			 addr;	/* Start of freed range. */
2071 	vaddr_t			 end;	/* End of freed range. */
2072 
2073 	prev = *prev_ptr;
2074 	if (prev == entry)
2075 		*prev_ptr = prev = NULL;
2076 
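	/*
	 * If the caller's hint does not immediately precede entry's range,
	 * fall back to the actual predecessor in the address tree.
	 */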
2077 	if (prev == NULL ||
2078 	    VMMAP_FREE_END(prev) != entry->start)
2079 		prev = RBT_PREV(uvm_map_addr, entry);
2080 
2081 	/* Entry is describing only free memory and has nothing to drain into. */
2082 	if (prev == NULL && entry->start == entry->end && markfree) {
2083 		*prev_ptr = entry;
2084 		return;
2085 	}
2086 
2087 	addr = entry->start;
2088 	end = VMMAP_FREE_END(entry);
2089 	free = uvm_map_uaddr_e(map, entry);
2090 	uvm_mapent_free_remove(map, free, entry);
2091 	uvm_mapent_addr_remove(map, entry);
2092 	DEAD_ENTRY_PUSH(dead, entry);
2093 
2094 	if (markfree) {
2095 		if (prev) {
2096 			free = uvm_map_uaddr_e(map, prev);
2097 			uvm_mapent_free_remove(map, free, prev);
2098 		}
2099 		*prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
2100 	}
2101 }
2102 
2103 /*
2104  * Unwire and release referenced amap and object from map entry.
2105  */
2106 void
2107 uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
2108 {
2109 	/* Unwire removed map entry. */
2110 	if (VM_MAPENT_ISWIRED(entry)) {
2111 		KERNEL_LOCK();
2112 		entry->wired_count = 0;
2113 		uvm_fault_unwire_locked(map, entry->start, entry->end);
2114 		KERNEL_UNLOCK();
2115 	}
2116 
2117 	/* Entry-type specific code. */
2118 	if (UVM_ET_ISHOLE(entry)) {
2119 		/* Nothing to be done for holes. */
2120 	} else if (map->flags & VM_MAP_INTRSAFE) {
2121 		KASSERT(vm_map_pmap(map) == pmap_kernel());
2122 		uvm_km_pgremove_intrsafe(entry->start, entry->end);
2123 		pmap_kremove(entry->start, entry->end - entry->start);
2124 	} else if (UVM_ET_ISOBJ(entry) &&
2125 	    UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
2126 		KASSERT(vm_map_pmap(map) == pmap_kernel());
2127 		/*
2128 		 * Note: kernel object mappings are currently used in
2129 		 * two ways:
2130 		 *  [1] "normal" mappings of pages in the kernel object
2131 		 *  [2] uvm_km_valloc'd allocations in which we
2132 		 *      pmap_enter in some non-kernel-object page
2133 		 *      (e.g. vmapbuf).
2134 		 *
2135 		 * for case [1], we need to remove the mapping from
2136 		 * the pmap and then remove the page from the kernel
2137 		 * object (because, once pages in a kernel object are
2138 		 * unmapped they are no longer needed, unlike, say,
2139 		 * a vnode where you might want the data to persist
2140 		 * until flushed out of a queue).
2141 		 *
2142 		 * for case [2], we need to remove the mapping from
2143 		 * the pmap.  there shouldn't be any pages at the
2144 		 * specified offset in the kernel object [but it
2145 		 * doesn't hurt to call uvm_km_pgremove just to be
2146 		 * safe?]
2147 		 *
2148 		 * uvm_km_pgremove currently does the following:
2149 		 *   for pages in the kernel object range:
2150 		 *     - drops the swap slot
2151 		 *     - uvm_pagefree the page
2152 		 *
2153 		 * note there is a version of uvm_km_pgremove() that
2154 		 * is used for "intrsafe" objects.
2155 		 */
2156 		/*
2157 		 * remove mappings from pmap and drop the pages
2158 		 * from the object.  offsets are always relative
2159 		 * to vm_map_min(kernel_map).
2160 		 */
2161 		pmap_remove(pmap_kernel(), entry->start, entry->end);
2162 		uvm_km_pgremove(entry->object.uvm_obj,
2163 		    entry->start - vm_map_min(kernel_map),
2164 		    entry->end - vm_map_min(kernel_map));
2165 
2166 		/*
2167 		 * null out kernel_object reference, we've just
2168 		 * dropped it
2169 		 */
2170 		entry->etype &= ~UVM_ET_OBJ;
2171 		entry->object.uvm_obj = NULL;  /* to be safe */
2172 	} else {
2173 		/* remove mappings the standard way. */
2174 		pmap_remove(map->pmap, entry->start, entry->end);
2175 	}
2176 }
2177 
2178 /*
2179  * Remove all entries from start to end.
2180  *
2181  * If remove_holes, then remove ET_HOLE entries as well.
2182  * If markfree, removed entries will be properly marked free; otherwise, no
2183  * replacement entries are put in the tree (corrupting the tree).
2184  */
2185 void
2186 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
2187     struct uvm_map_deadq *dead, boolean_t remove_holes,
2188     boolean_t markfree)
2189 {
2190 	struct vm_map_entry *prev_hint, *next, *entry;
2191 
2192 	start = MAX(start, map->min_offset);
2193 	end = MIN(end, map->max_offset);
2194 	if (start >= end)
2195 		return;
2196 
2197 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
2198 		splassert(IPL_NONE);
2199 	else
2200 		splassert(IPL_VM);
2201 
2202 	/* Find first affected entry. */
2203 	entry = uvm_map_entrybyaddr(&map->addr, start);
2204 	KDASSERT(entry != NULL && entry->start <= start);
2205 	if (entry->end <= start && markfree)
2206 		entry = RBT_NEXT(uvm_map_addr, entry);
2207 	else
2208 		UVM_MAP_CLIP_START(map, entry, start);
2209 
2210 	/*
2211 	 * Iterate entries until we reach end address.
2212 	 * prev_hint hints where the freed space can be appended to.
2213 	 */
2214 	prev_hint = NULL;
2215 	for (; entry != NULL && entry->start < end; entry = next) {
2216 		KDASSERT(entry->start >= start);
2217 		if (entry->end > end || !markfree)
2218 			UVM_MAP_CLIP_END(map, entry, end);
2219 		KDASSERT(entry->start >= start && entry->end <= end);
2220 		next = RBT_NEXT(uvm_map_addr, entry);
2221 
2222 		/* Don't remove holes unless asked to do so. */
2223 		if (UVM_ET_ISHOLE(entry)) {
2224 			if (!remove_holes) {
2225 				prev_hint = entry;
2226 				continue;
2227 			}
2228 		}
2229 
2230 		/* A stack has been removed. */
2231 		if (UVM_ET_ISSTACK(entry) && (map->flags & VM_MAP_ISVMSPACE))
2232 			map->sserial++;
2233 
2234 		/* Kill entry. */
2235 		uvm_unmap_kill_entry(map, entry);
2236 
2237 		/* Update space usage. */
2238 		if ((map->flags & VM_MAP_ISVMSPACE) &&
2239 		    entry->object.uvm_obj == NULL &&
2240 		    entry->protection != PROT_NONE &&
2241 		    !UVM_ET_ISHOLE(entry)) {
2242 			((struct vmspace *)map)->vm_dused -=
2243 			    uvmspace_dused(map, entry->start, entry->end);
2244 		}
2245 		if (!UVM_ET_ISHOLE(entry))
2246 			map->size -= entry->end - entry->start;
2247 
2248 		/* Actual removal of entry. */
2249 		uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
2250 	}
2251 
2252 	pmap_update(vm_map_pmap(map));
2253 
2254 #ifdef VMMAP_DEBUG
2255 	if (markfree) {
2256 		for (entry = uvm_map_entrybyaddr(&map->addr, start);
2257 		    entry != NULL && entry->start < end;
2258 		    entry = RBT_NEXT(uvm_map_addr, entry)) {
2259 			KDASSERT(entry->end <= start ||
2260 			    entry->start == entry->end ||
2261 			    UVM_ET_ISHOLE(entry));
2262 		}
2263 	} else {
2264 		vaddr_t a;
2265 		for (a = start; a < end; a += PAGE_SIZE)
2266 			KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
2267 	}
2268 #endif
2269 }
2270 
2271 /*
2272  * Mark all entries from first until end (exclusive) as pageable.
2273  *
2274  * Lock must be exclusive on entry and will not be touched.
2275  */
2276 void
2277 uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
2278     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
2279 {
2280 	struct vm_map_entry *iter;
2281 
2282 	for (iter = first; iter != end;
2283 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2284 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2285 		if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2286 			continue;
2287 
2288 		iter->wired_count = 0;
2289 		uvm_fault_unwire_locked(map, iter->start, iter->end);
2290 	}
2291 }
2292 
2293 /*
2294  * Mark all entries from first until end (exclusive) as wired.
2295  *
2296  * Lockflags determines the lock state on return from this function.
2297  * Lock must be exclusive on entry.
2298  */
2299 int
2300 uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
2301     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
2302     int lockflags)
2303 {
2304 	struct vm_map_entry *iter;
2305 #ifdef DIAGNOSTIC
2306 	unsigned int timestamp_save;
2307 #endif
2308 	int error;
2309 
2310 	/*
2311 	 * Wire pages in two passes:
2312 	 *
2313 	 * 1: holding the write lock, we create any anonymous maps that need
2314 	 *    to be created.  then we clip each map entry to the region to
2315 	 *    be wired and increment its wiring count.
2316 	 *
2317 	 * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
2318 	 *    in the pages for any newly wired area (wired_count == 1).
2319 	 *
2320 	 *    downgrading to a read lock for uvm_fault_wire avoids a possible
2321 	 *    deadlock with another thread that may have faulted on one of
2322 	 *    the pages to be wired (it would mark the page busy, blocking
2323 	 *    us, then in turn block on the map lock that we hold).
2324 	 *    because we keep the read lock on the map, the copy-on-write
2325 	 *    status of the entries we modify here cannot change.
2326 	 */
2327 	for (iter = first; iter != end;
2328 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2329 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2330 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2331 		    iter->protection == PROT_NONE)
2332 			continue;
2333 
2334 		/*
2335 		 * Perform actions of vm_map_lookup that need the write lock.
2336 		 * - create an anonymous map for copy-on-write
2337 		 * - anonymous map for zero-fill
2338 		 * Skip submaps.
2339 		 */
2340 		if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
2341 		    UVM_ET_ISNEEDSCOPY(iter) &&
2342 		    ((iter->protection & PROT_WRITE) ||
2343 		    iter->object.uvm_obj == NULL)) {
2344 			amap_copy(map, iter, M_WAITOK,
2345 			    UVM_ET_ISSTACK(iter) ? FALSE : TRUE,
2346 			    iter->start, iter->end);
2347 		}
2348 		iter->wired_count++;
2349 	}
2350 
2351 	/*
2352 	 * Pass 2.
2353 	 */
2354 #ifdef DIAGNOSTIC
2355 	timestamp_save = map->timestamp;
2356 #endif
2357 	vm_map_busy(map);
2358 	vm_map_downgrade(map);
2359 
2360 	error = 0;
2361 	for (iter = first; error == 0 && iter != end;
2362 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2363 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2364 		    iter->protection == PROT_NONE)
2365 			continue;
2366 
2367 		error = uvm_fault_wire(map, iter->start, iter->end,
2368 		    iter->protection);
2369 	}
2370 
2371 	if (error) {
2372 		/*
2373 		 * uvm_fault_wire failure
2374 		 *
2375 		 * Reacquire lock and undo our work.
2376 		 */
2377 		vm_map_upgrade(map);
2378 		vm_map_unbusy(map);
2379 #ifdef DIAGNOSTIC
2380 		if (timestamp_save != map->timestamp)
2381 			panic("uvm_map_pageable_wire: stale map");
2382 #endif
2383 
2384 		/*
2385 		 * first is no longer needed to restart loops.
2386 		 * Use it as iterator to unmap successful mappings.
2387 		 */
2388 		for (; first != iter;
2389 		    first = RBT_NEXT(uvm_map_addr, first)) {
2390 			if (UVM_ET_ISHOLE(first) ||
2391 			    first->start == first->end ||
2392 			    first->protection == PROT_NONE)
2393 				continue;
2394 
2395 			first->wired_count--;
2396 			if (!VM_MAPENT_ISWIRED(first)) {
2397 				uvm_fault_unwire_locked(map,
2398 				    first->start, first->end);
2399 			}
2400 		}
2401 
2402 		/* decrease counter in the rest of the entries */
2403 		for (; iter != end;
2404 		    iter = RBT_NEXT(uvm_map_addr, iter)) {
2405 			if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2406 			    iter->protection == PROT_NONE)
2407 				continue;
2408 
2409 			iter->wired_count--;
2410 		}
2411 
2412 		if ((lockflags & UVM_LK_EXIT) == 0)
2413 			vm_map_unlock(map);
2414 		return error;
2415 	}
2416 
2417 	/* We are currently holding a read lock. */
2418 	if ((lockflags & UVM_LK_EXIT) == 0) {
2419 		vm_map_unbusy(map);
2420 		vm_map_unlock_read(map);
2421 	} else {
2422 		vm_map_upgrade(map);
2423 		vm_map_unbusy(map);
2424 #ifdef DIAGNOSTIC
2425 		if (timestamp_save != map->timestamp)
2426 			panic("uvm_map_pageable_wire: stale map");
2427 #endif
2428 	}
2429 	return 0;
2430 }
2431 
2432 /*
2433  * uvm_map_pageable: set pageability of a range in a map.
2434  *
2435  * Flags:
2436  * UVM_LK_ENTER: map is already locked by caller
2437  * UVM_LK_EXIT:  don't unlock map on exit
2438  *
2439  * The full range must be in use (entries may not have fspace != 0).
2440  * UVM_ET_HOLE counts as unmapped.
2441  */
2442 int
2443 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
2444     boolean_t new_pageable, int lockflags)
2445 {
2446 	struct vm_map_entry *first, *last, *tmp;
2447 	int error;
2448 
2449 	start = trunc_page(start);
2450 	end = round_page(end);
2451 
2452 	if (start > end)
2453 		return EINVAL;
2454 	if (start == end)
2455 		return 0;	/* nothing to do */
2456 	if (start < map->min_offset)
2457 		return EFAULT; /* why? see first XXX below */
2458 	if (end > map->max_offset)
2459 		return EINVAL; /* why? see second XXX below */
2460 
2461 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2462 	if ((lockflags & UVM_LK_ENTER) == 0)
2463 		vm_map_lock(map);
2464 
2465 	/*
2466 	 * Find first entry.
2467 	 *
2468 	 * The initial test on start is different because of the different
2469 	 * error returned. The rest is tested further down.
2470 	 */
2471 	first = uvm_map_entrybyaddr(&map->addr, start);
2472 	if (first->end <= start || UVM_ET_ISHOLE(first)) {
2473 		/*
2474 		 * XXX if the first address is not mapped, it is EFAULT?
2475 		 */
2476 		error = EFAULT;
2477 		goto out;
2478 	}
2479 
2480 	/* Check that the range has no holes. */
2481 	for (last = first; last != NULL && last->start < end;
2482 	    last = RBT_NEXT(uvm_map_addr, last)) {
2483 		if (UVM_ET_ISHOLE(last) ||
2484 		    (last->end < end && VMMAP_FREE_END(last) != last->end)) {
2485 			/*
2486 			 * XXX unmapped memory in range, why is it EINVAL
2487 			 * instead of EFAULT?
2488 			 */
2489 			error = EINVAL;
2490 			goto out;
2491 		}
2492 	}
2493 
2494 	/*
2495 	 * Last ended at the first entry after the range.
2496 	 * Move back one step.
2497 	 *
2498 	 * Note that last may be NULL.
2499 	 */
2500 	if (last == NULL) {
2501 		last = RBT_MAX(uvm_map_addr, &map->addr);
2502 		if (last->end < end) {
2503 			error = EINVAL;
2504 			goto out;
2505 		}
2506 	} else {
2507 		KASSERT(last != first);
2508 		last = RBT_PREV(uvm_map_addr, last);
2509 	}
2510 
2511 	/* Wire/unwire pages here. */
2512 	if (new_pageable) {
2513 		/*
2514 		 * Mark pageable.
2515 		 * entries that are not wired are untouched.
2516 		 */
2517 		if (VM_MAPENT_ISWIRED(first))
2518 			UVM_MAP_CLIP_START(map, first, start);
2519 		/*
2520 		 * Split last at end.
2521 		 * Make tmp be the first entry after what is to be touched.
2522 		 * If last is not wired, don't touch it.
2523 		 */
2524 		if (VM_MAPENT_ISWIRED(last)) {
2525 			UVM_MAP_CLIP_END(map, last, end);
2526 			tmp = RBT_NEXT(uvm_map_addr, last);
2527 		} else
2528 			tmp = last;
2529 
2530 		uvm_map_pageable_pgon(map, first, tmp, start, end);
2531 		error = 0;
2532 
2533 out:
2534 		if ((lockflags & UVM_LK_EXIT) == 0)
2535 			vm_map_unlock(map);
2536 		return error;
2537 	} else {
2538 		/*
2539 		 * Mark entries wired.
2540 		 * entries are always touched (because recovery needs this).
2541 		 */
2542 		if (!VM_MAPENT_ISWIRED(first))
2543 			UVM_MAP_CLIP_START(map, first, start);
2544 		/*
2545 		 * Split last at end.
2546 		 * Make tmp be the first entry after what is to be touched.
2547 		 * If last is not wired, don't touch it.
2548 		 */
2549 		if (!VM_MAPENT_ISWIRED(last)) {
2550 			UVM_MAP_CLIP_END(map, last, end);
2551 			tmp = RBT_NEXT(uvm_map_addr, last);
2552 		} else
2553 			tmp = last;
2554 
2555 		return uvm_map_pageable_wire(map, first, tmp, start, end,
2556 		    lockflags);
2557 	}
2558 }
2559 
2560 /*
2561  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
2562  * all mapped regions.
2563  *
2564  * Map must not be locked.
2565  * If no flags are specified, all regions are unwired.
2566  */
2567 int
2568 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
2569 {
2570 	vsize_t size;
2571 	struct vm_map_entry *iter;
2572 
2573 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2574 	vm_map_lock(map);
2575 
2576 	if (flags == 0) {
2577 		uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr),
2578 		    NULL, map->min_offset, map->max_offset);
2579 
2580 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
2581 		vm_map_unlock(map);
2582 		return 0;
2583 	}
2584 
2585 	if (flags & MCL_FUTURE)
2586 		vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
2587 	if (!(flags & MCL_CURRENT)) {
2588 		vm_map_unlock(map);
2589 		return 0;
2590 	}
2591 
2592 	/*
2593 	 * Count number of pages in all non-wired entries.
2594 	 * If the number exceeds the limit, abort.
2595 	 */
2596 	size = 0;
2597 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2598 		if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2599 			continue;
2600 
2601 		size += iter->end - iter->start;
2602 	}
2603 
2604 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
2605 		vm_map_unlock(map);
2606 		return ENOMEM;
2607 	}
2608 
2609 	/* XXX non-pmap_wired_count case must be handled by caller */
2610 #ifdef pmap_wired_count
2611 	if (limit != 0 &&
2612 	    size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
2613 		vm_map_unlock(map);
2614 		return ENOMEM;
2615 	}
2616 #endif
2617 
2618 	/*
2619 	 * uvm_map_pageable_wire will release lock
2620 	 */
2621 	return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr),
2622 	    NULL, map->min_offset, map->max_offset, 0);
2623 }
2624 
2625 /*
2626  * Initialize map.
2627  *
2628  * Allocates sufficient entries to describe the free memory in the map.
2629  */
2630 void
2631 uvm_map_setup(struct vm_map *map, pmap_t pmap, vaddr_t min, vaddr_t max,
2632     int flags)
2633 {
2634 	int i;
2635 
2636 	KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
2637 	KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
2638 	    (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
2639 
2640 	/*
2641 	 * Update parameters.
2642 	 *
2643 	 * This code handles (vaddr_t)-1 and other page mask ending addresses
2644 	 * properly.
2645 	 * We lose the top page if the full virtual address space is used.
2646 	 */
2647 	if (max & (vaddr_t)PAGE_MASK) {
2648 		max += 1;
2649 		if (max == 0) /* overflow */
2650 			max -= PAGE_SIZE;
2651 	}
2652 
2653 	RBT_INIT(uvm_map_addr, &map->addr);
2654 	map->uaddr_exe = NULL;
2655 	for (i = 0; i < nitems(map->uaddr_any); ++i)
2656 		map->uaddr_any[i] = NULL;
2657 	map->uaddr_brk_stack = NULL;
2658 
2659 	map->pmap = pmap;
2660 	map->size = 0;
2661 	map->ref_count = 0;
2662 	map->min_offset = min;
2663 	map->max_offset = max;
2664 	map->b_start = map->b_end = 0; /* Empty brk() area by default. */
2665 	map->s_start = map->s_end = 0; /* Empty stack area by default. */
2666 	map->flags = flags;
2667 	map->timestamp = 0;
2668 	if (flags & VM_MAP_ISVMSPACE)
2669 		rw_init_flags(&map->lock, "vmmaplk", RWL_DUPOK);
2670 	else
2671 		rw_init(&map->lock, "kmmaplk");
2672 	mtx_init(&map->mtx, IPL_VM);
2673 	mtx_init(&map->flags_lock, IPL_VM);
2674 
2675 	/* Configure the allocators. */
2676 	if (flags & VM_MAP_ISVMSPACE)
2677 		uvm_map_setup_md(map);
2678 	else
2679 		map->uaddr_any[3] = &uaddr_kbootstrap;
2680 
2681 	/*
2682 	 * Fill map entries.
2683 	 * We do not need to write-lock the map here because only the current
2684 	 * thread sees it right now. Initialize ref_count to 0 above to avoid
2685 	 * bogus triggering of lock-not-held assertions.
2686 	 */
2687 	uvm_map_setup_entries(map);
2688 	uvm_tree_sanity(map, __FILE__, __LINE__);
2689 	map->ref_count = 1;
2690 }
2691 
2692 /*
2693  * Destroy the map.
2694  *
2695  * This is the inverse operation to uvm_map_setup.
2696  */
2697 void
2698 uvm_map_teardown(struct vm_map *map)
2699 {
2700 	struct uvm_map_deadq	 dead_entries;
2701 	struct vm_map_entry	*entry, *tmp;
2702 #ifdef VMMAP_DEBUG
2703 	size_t			 numq, numt;
2704 #endif
2705 	int			 i;
2706 
2707 	KERNEL_ASSERT_LOCKED();
2708 	KERNEL_UNLOCK();
2709 	KERNEL_ASSERT_UNLOCKED();
2710 
2711 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
2712 
2713 	/* Remove address selectors. */
2714 	uvm_addr_destroy(map->uaddr_exe);
2715 	map->uaddr_exe = NULL;
2716 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2717 		uvm_addr_destroy(map->uaddr_any[i]);
2718 		map->uaddr_any[i] = NULL;
2719 	}
2720 	uvm_addr_destroy(map->uaddr_brk_stack);
2721 	map->uaddr_brk_stack = NULL;
2722 
2723 	/*
2724 	 * Remove entries.
2725 	 *
2726 	 * The following is based on graph breadth-first search.
2727 	 *
2728 	 * In color terms:
2729 	 * - the dead_entries set contains all nodes that are reachable
2730 	 *   (i.e. both the black and the grey nodes)
2731 	 * - any entry not in dead_entries is white
2732 	 * - any entry that appears in dead_entries before entry
2733 	 *   is black; the rest are grey.
2734 	 * The set [entry, end] is also referred to as the wavefront.
2735 	 *
2736 	 * Since the tree is always a fully connected graph, the breadth-first
2737 	 * search guarantees that each vmmap_entry is visited exactly once.
2738 	 * The vm_map is broken down in linear time.
2739 	 */
2740 	TAILQ_INIT(&dead_entries);
2741 	if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL)
2742 		DEAD_ENTRY_PUSH(&dead_entries, entry);
2743 	while (entry != NULL) {
2744 		sched_pause(yield);
2745 		uvm_unmap_kill_entry(map, entry);
2746 		if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL)
2747 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2748 		if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
2749 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2750 		/* Update wave-front. */
2751 		entry = TAILQ_NEXT(entry, dfree.deadq);
2752 	}
2753 
2754 #ifdef VMMAP_DEBUG
2755 	numt = numq = 0;
2756 	RBT_FOREACH(entry, uvm_map_addr, &map->addr)
2757 		numt++;
2758 	TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
2759 		numq++;
2760 	KASSERT(numt == numq);
2761 #endif
2762 	uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
2763 
2764 	KERNEL_LOCK();
2765 
2766 	pmap_destroy(map->pmap);
2767 	map->pmap = NULL;
2768 }
2769 
2770 /*
2771  * Populate map with free-memory entries.
2772  *
2773  * Map must be initialized and empty.
2774  */
2775 void
2776 uvm_map_setup_entries(struct vm_map *map)
2777 {
2778 	KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
2779 
2780 	uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
2781 }
2782 
2783 /*
2784  * Split entry at given address.
2785  *
2786  * orig:  entry that is to be split.
2787  * next:  a newly allocated map entry that is not linked.
2788  * split: address at which the split is done.
2789  */
2790 void
2791 uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
2792     struct vm_map_entry *next, vaddr_t split)
2793 {
2794 	struct uvm_addr_state *free, *free_before;
2795 	vsize_t adj;
2796 
2797 	if ((split & PAGE_MASK) != 0) {
2798 		panic("uvm_map_splitentry: split address 0x%lx "
2799 		    "not on page boundary!", split);
2800 	}
2801 	KDASSERT(map != NULL && orig != NULL && next != NULL);
2802 	uvm_tree_sanity(map, __FILE__, __LINE__);
2803 	KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
2804 
2805 #ifdef VMMAP_DEBUG
2806 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig);
2807 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next);
2808 #endif /* VMMAP_DEBUG */
2809 
2810 	/*
2811 	 * Free space will change, unlink from free space tree.
2812 	 */
2813 	free = uvm_map_uaddr_e(map, orig);
2814 	uvm_mapent_free_remove(map, free, orig);
2815 
2816 	adj = split - orig->start;
2817 
2818 	uvm_mapent_copy(orig, next);
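	/*
	 * Two cases: the split address lies in the free space after the
	 * used range, in which case next becomes an empty entry carrying
	 * only the tail free space; or it lies inside the used range, in
	 * which case the amap and object references are split as well.
	 */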
2819 	if (split >= orig->end) {
2820 		next->etype = 0;
2821 		next->offset = 0;
2822 		next->wired_count = 0;
2823 		next->start = next->end = split;
2824 		next->guard = 0;
2825 		next->fspace = VMMAP_FREE_END(orig) - split;
2826 		next->aref.ar_amap = NULL;
2827 		next->aref.ar_pageoff = 0;
2828 		orig->guard = MIN(orig->guard, split - orig->end);
2829 		orig->fspace = split - VMMAP_FREE_START(orig);
2830 	} else {
2831 		orig->fspace = 0;
2832 		orig->guard = 0;
2833 		orig->end = next->start = split;
2834 
2835 		if (next->aref.ar_amap) {
2836 			KERNEL_LOCK();
2837 			amap_splitref(&orig->aref, &next->aref, adj);
2838 			KERNEL_UNLOCK();
2839 		}
2840 		if (UVM_ET_ISSUBMAP(orig)) {
2841 			uvm_map_reference(next->object.sub_map);
2842 			next->offset += adj;
2843 		} else if (UVM_ET_ISOBJ(orig)) {
2844 			if (next->object.uvm_obj->pgops &&
2845 			    next->object.uvm_obj->pgops->pgo_reference) {
2846 				KERNEL_LOCK();
2847 				next->object.uvm_obj->pgops->pgo_reference(
2848 				    next->object.uvm_obj);
2849 				KERNEL_UNLOCK();
2850 			}
2851 			next->offset += adj;
2852 		}
2853 	}
2854 
2855 	/*
2856 	 * Link next into address tree.
2857 	 * Link orig and next into free-space tree.
2858 	 *
2859 	 * Don't insert 'next' into the addr tree until orig has been linked,
2860 	 * in case the free-list looks at adjacent entries in the addr tree
2861 	 * for its decisions.
2862 	 */
2863 	if (orig->fspace > 0)
2864 		free_before = free;
2865 	else
2866 		free_before = uvm_map_uaddr_e(map, orig);
2867 	uvm_mapent_free_insert(map, free_before, orig);
2868 	uvm_mapent_addr_insert(map, next);
2869 	uvm_mapent_free_insert(map, free, next);
2870 
2871 	uvm_tree_sanity(map, __FILE__, __LINE__);
2872 }
2873 
2874 
2875 #ifdef VMMAP_DEBUG
2876 
2877 void
2878 uvm_tree_assert(struct vm_map *map, int test, char *test_str,
2879     char *file, int line)
2880 {
2881 	char* map_special;
2882 
2883 	if (test)
2884 		return;
2885 
2886 	if (map == kernel_map)
2887 		map_special = " (kernel_map)";
2888 	else if (map == kmem_map)
2889 		map_special = " (kmem_map)";
2890 	else
2891 		map_special = "";
2892 	panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
2893 	    line, test_str);
2894 }
2895 
2896 /*
2897  * Check that map is sane.
2898  */
2899 void
2900 uvm_tree_sanity(struct vm_map *map, char *file, int line)
2901 {
2902 	struct vm_map_entry	*iter;
2903 	vaddr_t			 addr;
2904 	vaddr_t			 min, max, bound; /* Bounds checker. */
2905 	struct uvm_addr_state	*free;
2906 
2907 	addr = vm_map_min(map);
2908 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2909 		/*
2910 		 * Valid start, end.
2911 		 * Catch overflow for end+fspace.
2912 		 */
2913 		UVM_ASSERT(map, iter->end >= iter->start, file, line);
2914 		UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
2915 
2916 		/* May not be empty. */
2917 		UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
2918 		    file, line);
2919 
2920 		/* Addresses for entry must lie within map boundaries. */
2921 		UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
2922 		    VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
2923 
2924 		/* Tree may not have gaps. */
2925 		UVM_ASSERT(map, iter->start == addr, file, line);
2926 		addr = VMMAP_FREE_END(iter);
2927 
2928 		/*
2929 		 * Free space may not cross boundaries, unless the same
2930 		 * free list is used on both sides of the border.
2931 		 */
2932 		min = VMMAP_FREE_START(iter);
2933 		max = VMMAP_FREE_END(iter);
2934 
2935 		while (min < max &&
2936 		    (bound = uvm_map_boundary(map, min, max)) != max) {
2937 			UVM_ASSERT(map,
2938 			    uvm_map_uaddr(map, bound - 1) ==
2939 			    uvm_map_uaddr(map, bound),
2940 			    file, line);
2941 			min = bound;
2942 		}
2943 
2944 		free = uvm_map_uaddr_e(map, iter);
2945 		if (free) {
2946 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
2947 			    file, line);
2948 		} else {
2949 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
2950 			    file, line);
2951 		}
2952 	}
2953 	UVM_ASSERT(map, addr == vm_map_max(map), file, line);
2954 }
2955 
2956 void
2957 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
2958 {
2959 	struct vm_map_entry *iter;
2960 	vsize_t size;
2961 
2962 	size = 0;
2963 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2964 		if (!UVM_ET_ISHOLE(iter))
2965 			size += iter->end - iter->start;
2966 	}
2967 
2968 	if (map->size != size)
2969 		printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
2970 	UVM_ASSERT(map, map->size == size, file, line);
2971 
2972 	vmspace_validate(map);
2973 }
2974 
2975 /*
2976  * This function validates the statistics on vmspace.
2977  */
2978 void
2979 vmspace_validate(struct vm_map *map)
2980 {
2981 	struct vmspace *vm;
2982 	struct vm_map_entry *iter;
2983 	vaddr_t imin, imax;
2984 	vaddr_t stack_begin, stack_end; /* Position of stack. */
2985 	vsize_t stack, heap; /* Measured sizes. */
2986 
2987 	if (!(map->flags & VM_MAP_ISVMSPACE))
2988 		return;
2989 
2990 	vm = (struct vmspace *)map;
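	/*
	 * Normalize the stack bounds; vm_maxsaddr and vm_minsaddr may be
	 * ordered either way depending on the direction the stack grows.
	 */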
2991 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2992 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2993 
2994 	stack = heap = 0;
2995 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2996 		imin = imax = iter->start;
2997 
2998 		if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL ||
2999 		    iter->protection != PROT_NONE)
3000 			continue;
3001 
3002 		/*
3003 		 * Update stack, heap.
3004 		 * Keep in mind that (theoretically) the entries of
3005 		 * userspace and stack may be joined.
3006 		 */
3007 		while (imin != iter->end) {
3008 			/*
3009 			 * Set imax to the first boundary crossed between
3010 			 * imin and stack addresses.
3011 			 */
3012 			imax = iter->end;
3013 			if (imin < stack_begin && imax > stack_begin)
3014 				imax = stack_begin;
3015 			else if (imin < stack_end && imax > stack_end)
3016 				imax = stack_end;
3017 
3018 			if (imin >= stack_begin && imin < stack_end)
3019 				stack += imax - imin;
3020 			else
3021 				heap += imax - imin;
3022 			imin = imax;
3023 		}
3024 	}
3025 
3026 	heap >>= PAGE_SHIFT;
3027 	if (heap != vm->vm_dused) {
3028 		printf("vmspace stack range: 0x%lx-0x%lx\n",
3029 		    stack_begin, stack_end);
3030 		panic("vmspace_validate: vmspace.vm_dused invalid, "
3031 		    "expected %ld pgs, got %ld pgs in map %p",
3032 		    heap, vm->vm_dused,
3033 		    map);
3034 	}
3035 }
3036 
3037 #endif /* VMMAP_DEBUG */
3038 
3039 /*
3040  * uvm_map_init: init mapping system at boot time.   note that we allocate
3041  * and init the static pool of structs vm_map_entry for the kernel here.
3042  */
3043 void
3044 uvm_map_init(void)
3045 {
3046 	static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
3047 	int lcv;
3048 
3049 	/* now set up static pool of kernel map entries ... */
3050 	mtx_init(&uvm_kmapent_mtx, IPL_VM);
3051 	SLIST_INIT(&uvm.kentry_free);
3052 	for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
3053 		SLIST_INSERT_HEAD(&uvm.kentry_free,
3054 		    &kernel_map_entry[lcv], daddrs.addr_kentry);
3055 	}
3056 
3057 	/* initialize the map-related pools. */
3058 	pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0,
3059 	    IPL_NONE, PR_WAITOK, "vmsppl", NULL);
3060 	pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0,
3061 	    IPL_VM, PR_WAITOK, "vmmpepl", NULL);
3062 	pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0,
3063 	    IPL_VM, 0, "vmmpekpl", NULL);
3064 	pool_sethiwat(&uvm_map_entry_pool, 8192);
3065 
3066 	uvm_addr_init();
3067 }
3068 
3069 #if defined(DDB)
3070 
3071 /*
3072  * DDB hooks
3073  */
3074 
3075 /*
3076  * uvm_map_printit: actually prints the map
3077  */
3078 void
3079 uvm_map_printit(struct vm_map *map, boolean_t full,
3080     int (*pr)(const char *, ...))
3081 {
3082 	struct vmspace			*vm;
3083 	struct vm_map_entry		*entry;
3084 	struct uvm_addr_state		*free;
3085 	int				 in_free, i;
3086 	char				 buf[8];
3087 
3088 	(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
3089 	(*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
3090 	    map->b_start, map->b_end);
3091 	(*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
3092 	    map->s_start, map->s_end);
3093 	(*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
3094 	    map->size, map->ref_count, map->timestamp,
3095 	    map->flags);
3096 	(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
3097 	    pmap_resident_count(map->pmap));
3098 
3099 	/* struct vmspace handling. */
3100 	if (map->flags & VM_MAP_ISVMSPACE) {
3101 		vm = (struct vmspace *)map;
3102 
3103 		(*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
3104 		    vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
3105 		(*pr)("\tvm_tsize=%u vm_dsize=%u\n",
3106 		    vm->vm_tsize, vm->vm_dsize);
3107 		(*pr)("\tvm_taddr=%p vm_daddr=%p\n",
3108 		    vm->vm_taddr, vm->vm_daddr);
3109 		(*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
3110 		    vm->vm_maxsaddr, vm->vm_minsaddr);
3111 	}
3112 
3113 	if (!full)
3114 		goto print_uaddr;
3115 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
3116 		(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
3117 		    entry, entry->start, entry->end, entry->object.uvm_obj,
3118 		    (long long)entry->offset, entry->aref.ar_amap,
3119 		    entry->aref.ar_pageoff);
3120 		(*pr)("\tsubmap=%c, cow=%c, nc=%c, stack=%c, "
3121 		    "syscall=%c, prot(max)=%d/%d, inh=%d, "
3122 		    "wc=%d, adv=%d\n",
3123 		    (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
3124 		    (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
3125 		    (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
3126 		    (entry->etype & UVM_ET_STACK) ? 'T' : 'F',
3127 		    (entry->etype & UVM_ET_SYSCALL) ? 'T' : 'F',
3128 		    entry->protection, entry->max_protection,
3129 		    entry->inheritance, entry->wired_count, entry->advice);
3130 
3131 		free = uvm_map_uaddr_e(map, entry);
3132 		in_free = (free != NULL);
3133 		(*pr)("\thole=%c, free=%c, guard=0x%lx, "
3134 		    "free=0x%lx-0x%lx\n",
3135 		    (entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
3136 		    in_free ? 'T' : 'F',
3137 		    entry->guard,
3138 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
3139 		(*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
3140 		(*pr)("\tfreemapped=%c, uaddr=%p\n",
3141 		    (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
3142 		if (free) {
3143 			(*pr)("\t\t(0x%lx-0x%lx %s)\n",
3144 			    free->uaddr_minaddr, free->uaddr_maxaddr,
3145 			    free->uaddr_functions->uaddr_name);
3146 		}
3147 	}
3148 
3149 print_uaddr:
3150 	uvm_addr_print(map->uaddr_exe, "exe", full, pr);
3151 	for (i = 0; i < nitems(map->uaddr_any); i++) {
3152 		snprintf(&buf[0], sizeof(buf), "any[%d]", i);
3153 		uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
3154 	}
3155 	uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
3156 }
3157 
3158 /*
3159  * uvm_object_printit: actually prints the object
3160  */
3161 void
3162 uvm_object_printit(struct uvm_object *uobj, boolean_t full,
3163     int (*pr)(const char *, ...))
3166 {
3167 	struct vm_page *pg;
3168 	int cnt = 0;
3169 
3170 	(*pr)("OBJECT %p: pgops=%p, npages=%d, ",
3171 	    uobj, uobj->pgops, uobj->uo_npages);
3172 	if (UVM_OBJ_IS_KERN_OBJECT(uobj))
3173 		(*pr)("refs=<SYSTEM>\n");
3174 	else
3175 		(*pr)("refs=%d\n", uobj->uo_refs);
3176 
3177 	if (!full) {
3178 		return;
3179 	}
3180 	(*pr)("  PAGES <pg,offset>:\n  ");
3181 	RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
3182 		(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
3183 		if ((cnt % 3) == 2) {
3184 			(*pr)("\n  ");
3185 		}
3186 		cnt++;
3187 	}
3188 	if ((cnt % 3) != 2) {
3189 		(*pr)("\n");
3190 	}
3191 }
3192 
3193 /*
3194  * uvm_page_printit: actually print the page
3195  */
3196 static const char page_flagbits[] =
3197 	"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
3198 	"\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
3199 	"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
3200 
3201 void
3202 uvm_page_printit(struct vm_page *pg, boolean_t full,
3203     int (*pr)(const char *, ...))
3206 {
3207 	struct vm_page *tpg;
3208 	struct uvm_object *uobj;
3209 	struct pglist *pgl;
3210 
3211 	(*pr)("PAGE %p:\n", pg);
3212 	(*pr)("  flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
3213 	    pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
3214 	    (long long)pg->phys_addr);
3215 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
3216 	    pg->uobject, pg->uanon, (long long)pg->offset);
3217 #if defined(UVM_PAGE_TRKOWN)
3218 	if (pg->pg_flags & PG_BUSY)
3219 		(*pr)("  owning thread = %d, tag=%s",
3220 		    pg->owner, pg->owner_tag);
3221 	else
3222 		(*pr)("  page not busy, no owner");
3223 #else
3224 	(*pr)("  [page ownership tracking disabled]");
3225 #endif
3226 	(*pr)("\tvm_page_md %p\n", &pg->mdpage);
3227 
3228 	if (!full)
3229 		return;
3230 
3231 	/* cross-verify object/anon */
3232 	if ((pg->pg_flags & PQ_FREE) == 0) {
3233 		if (pg->pg_flags & PQ_ANON) {
3234 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
3235 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
3236 				(pg->uanon) ? pg->uanon->an_page : NULL);
3237 			else
3238 				(*pr)("  anon backpointer is OK\n");
3239 		} else {
3240 			uobj = pg->uobject;
3241 			if (uobj) {
3242 				(*pr)("  checking object list\n");
3243 				RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) {
3244 					if (tpg == pg) {
3245 						break;
3246 					}
3247 				}
3248 				if (tpg)
3249 					(*pr)("  page found on object list\n");
3250 				else
3251 					(*pr)("  >>> PAGE NOT FOUND "
3252 					    "ON OBJECT LIST! <<<\n");
3253 			}
3254 		}
3255 	}
3256 
3257 	/* cross-verify page queue */
3258 	if (pg->pg_flags & PQ_FREE) {
3259 		if (uvm_pmr_isfree(pg))
3260 			(*pr)("  page found in uvm_pmemrange\n");
3261 		else
3262 			(*pr)("  >>> page not found in uvm_pmemrange <<<\n");
3263 		pgl = NULL;
3264 	} else if (pg->pg_flags & PQ_INACTIVE) {
3265 		pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
3266 		    &uvm.page_inactive_swp : &uvm.page_inactive_obj;
3267 	} else if (pg->pg_flags & PQ_ACTIVE) {
3268 		pgl = &uvm.page_active;
3269  	} else {
3270 		pgl = NULL;
3271 	}
3272 
3273 	if (pgl) {
3274 		(*pr)("  checking pageq list\n");
3275 		TAILQ_FOREACH(tpg, pgl, pageq) {
3276 			if (tpg == pg) {
3277 				break;
3278 			}
3279 		}
3280 		if (tpg)
3281 			(*pr)("  page found on pageq list\n");
3282 		else
3283 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
3284 	}
3285 }
3286 #endif
3287 
3288 /*
3289  * uvm_map_protect: change map protection
3290  *
3291  * => set_max means set max_protection.
3292  * => map must be unlocked.
3293  */
3294 int
3295 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3296     vm_prot_t new_prot, boolean_t set_max)
3297 {
3298 	struct vm_map_entry *first, *iter;
3299 	vm_prot_t old_prot;
3300 	vm_prot_t mask;
3301 	vsize_t dused;
3302 	int error;
3303 
3304 	if (start > end)
3305 		return EINVAL;
3306 	start = MAX(start, map->min_offset);
3307 	end = MIN(end, map->max_offset);
3308 	if (start >= end)
3309 		return 0;
3310 
3311 	dused = 0;
3312 	error = 0;
3313 	vm_map_lock(map);
3314 
3315 	/*
3316 	 * Set up first and last.
3317 	 * - first will contain first entry at or after start.
3318 	 */
3319 	first = uvm_map_entrybyaddr(&map->addr, start);
3320 	KDASSERT(first != NULL);
3321 	if (first->end <= start)
3322 		first = RBT_NEXT(uvm_map_addr, first);
3323 
3324 	/* First, check for protection violations. */
3325 	for (iter = first; iter != NULL && iter->start < end;
3326 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3327 		/* Treat memory holes as free space. */
3328 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3329 			continue;
3330 
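		/*
		 * Entries going from PROT_NONE to an accessible protection
		 * start counting against the data-size limit; tally their
		 * size so the RLIMIT_DATA check below can reject the whole
		 * request up front.
		 */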
3331 		old_prot = iter->protection;
3332 		if (old_prot == PROT_NONE && new_prot != old_prot) {
3333 			dused += uvmspace_dused(
3334 			    map, MAX(start, iter->start), MIN(end, iter->end));
3335 		}
3336 
3337 		if (UVM_ET_ISSUBMAP(iter)) {
3338 			error = EINVAL;
3339 			goto out;
3340 		}
3341 		if ((new_prot & iter->max_protection) != new_prot) {
3342 			error = EACCES;
3343 			goto out;
3344 		}
3345 		if (map == kernel_map &&
3346 		    (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
3347 			panic("uvm_map_protect: kernel map W^X violation requested");
3348 	}
3349 
3350 	/* Check limits. */
3351 	if (dused > 0 && (map->flags & VM_MAP_ISVMSPACE)) {
3352 		vsize_t limit = lim_cur(RLIMIT_DATA);
3353 		dused = ptoa(dused);
3354 		if (limit < dused ||
3355 		    limit - dused < ptoa(((struct vmspace *)map)->vm_dused)) {
3356 			error = ENOMEM;
3357 			goto out;
3358 		}
3359 	}
3360 
3361 	/* Fix protections.  */
3362 	for (iter = first; iter != NULL && iter->start < end;
3363 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3364 		/* Treat memory holes as free space. */
3365 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3366 			continue;
3367 
3368 		old_prot = iter->protection;
3369 
3370 		/*
3371 		 * Skip adapting protection iff old and new protection
3372 		 * are equal.
3373 		 */
3374 		if (set_max) {
3375 			if (old_prot == (new_prot & old_prot) &&
3376 			    iter->max_protection == new_prot)
3377 				continue;
3378 		} else {
3379 			if (old_prot == new_prot)
3380 				continue;
3381 		}
3382 
3383 		UVM_MAP_CLIP_START(map, iter, start);
3384 		UVM_MAP_CLIP_END(map, iter, end);
3385 
3386 		if (set_max) {
3387 			iter->max_protection = new_prot;
3388 			iter->protection &= new_prot;
3389 		} else
3390 			iter->protection = new_prot;
3391 
3392 		/*
3393 		 * update physical map if necessary.  worry about copy-on-write
3394 		 * here -- CHECK THIS XXX
3395 		 */
3396 		if (iter->protection != old_prot) {
3397 			mask = UVM_ET_ISCOPYONWRITE(iter) ?
3398 			    ~PROT_WRITE : PROT_MASK;
3399 
3400 			/* XXX should only wserial++ if no split occurs */
3401 			if (iter->protection & PROT_WRITE)
3402 				map->wserial++;
3403 
3404 			if (map->flags & VM_MAP_ISVMSPACE) {
3405 				if (old_prot == PROT_NONE) {
3406 					((struct vmspace *)map)->vm_dused +=
3407 					    uvmspace_dused(map, iter->start,
3408 					        iter->end);
3409 				}
3410 				if (iter->protection == PROT_NONE) {
3411 					((struct vmspace *)map)->vm_dused -=
3412 					    uvmspace_dused(map, iter->start,
3413 					        iter->end);
3414 				}
3415 			}
3416 
3417 			/* update pmap */
3418 			if ((iter->protection & mask) == PROT_NONE &&
3419 			    VM_MAPENT_ISWIRED(iter)) {
3420 				/*
3421 				 * TODO(ariane) this is stupid. wired_count
3422 				 * is 0 if not wired, otherwise anything
3423 				 * larger than 0 (incremented once each time
3424 				 * wire is called).
3425 				 * Mostly to be able to undo the damage on
3426 				 * failure. Not to actually be a wired
3427 				 * refcounter...
3428 				 * Originally: iter->wired_count--;
3429 				 * (don't we have to unwire this in the pmap
3430 				 * as well?)
3431 				 */
3432 				iter->wired_count = 0;
3433 			}
3434 			pmap_protect(map->pmap, iter->start, iter->end,
3435 			    iter->protection & mask);
3436 		}
3437 
3438 		/*
3439 		 * If the map is configured to lock any future mappings,
3440 		 * wire this entry now if the old protection was PROT_NONE
3441 		 * and the new protection is not PROT_NONE.
3442 		 */
3443 		if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3444 		    VM_MAPENT_ISWIRED(iter) == 0 &&
3445 		    old_prot == PROT_NONE &&
3446 		    new_prot != PROT_NONE) {
3447 			if (uvm_map_pageable(map, iter->start, iter->end,
3448 			    FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
3449 				/*
3450 				 * If locking the entry fails, remember the
3451 				 * error if it's the first one.  Note we
3452 				 * still continue setting the protection in
3453 				 * the map, but it will return the resource
3454 				 * shortage condition regardless.
3455 				 *
3456 				 * XXX Ignore what the actual error is,
3457 				 * XXX just call it a resource shortage
3458 				 * XXX so that it doesn't get confused
3459 				 * XXX what uvm_map_protect() itself would
3460 				 * XXX normally return.
3461 				 */
3462 				error = ENOMEM;
3463 			}
3464 		}
3465 	}
3466 	pmap_update(map->pmap);
3467 
3468 out:
3469 	vm_map_unlock(map);
3470 	return error;
3471 }
3472 
3473 /*
3474  * uvmspace_alloc: allocate a vmspace structure.
3475  *
3476  * - structure includes vm_map and pmap
3477  * - XXX: no locking on this structure
3478  * - refcnt set to 1, rest must be init'd by caller
3479  */
3480 struct vmspace *
3481 uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
3482     boolean_t remove_holes)
3483 {
3484 	struct vmspace *vm;
3485 
3486 	vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
3487 	uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
3488 	return (vm);
3489 }
3490 
3491 /*
3492  * uvmspace_init: initialize a vmspace structure.
3493  *
3494  * - XXX: no locking on this structure
3495  * - refcnt set to 1, rest must be init'd by caller
3496  */
3497 void
3498 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
3499     boolean_t pageable, boolean_t remove_holes)
3500 {
3501 	KASSERT(pmap == NULL || pmap == pmap_kernel());
3502 
3503 	if (pmap)
3504 		pmap_reference(pmap);
3505 	else
3506 		pmap = pmap_create();
3507 
3508 	uvm_map_setup(&vm->vm_map, pmap, min, max,
3509 	    (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
3510 
3511 	vm->vm_refcnt = 1;
3512 
3513 	if (remove_holes)
3514 		pmap_remove_holes(vm);
3515 }
3516 
3517 /*
3518  * uvmspace_share: share a vmspace between two processes
3519  *
3520  * - XXX: no locking on vmspace
3521  * - used for vfork
3522  */
3523 
3524 struct vmspace *
3525 uvmspace_share(struct process *pr)
3526 {
3527 	struct vmspace *vm = pr->ps_vmspace;
3528 
3529 	vm->vm_refcnt++;
3530 	return vm;
3531 }
3532 
3533 /*
3534  * uvmspace_exec: the process wants to exec a new program
3535  *
3536  * - XXX: no locking on vmspace
3537  */
3538 
3539 void
3540 uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
3541 {
3542 	struct process *pr = p->p_p;
3543 	struct vmspace *nvm, *ovm = pr->ps_vmspace;
3544 	struct vm_map *map = &ovm->vm_map;
3545 	struct uvm_map_deadq dead_entries;
3546 
3547 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
3548 	KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
3549 	    (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
3550 
3551 	pmap_unuse_final(p);   /* before stack addresses go away */
3552 	TAILQ_INIT(&dead_entries);
3553 
3554 	/* see if more than one process is using this vmspace...  */
3555 	if (ovm->vm_refcnt == 1) {
3556 		/*
3557 		 * If pr is the only process using its vmspace then
3558 		 * we can safely recycle that vmspace for the program
3559 		 * that is being exec'd.
3560 		 */
3561 
3562 #ifdef SYSVSHM
3563 		/*
3564 		 * SYSV SHM semantics require us to kill all segments on an exec
3565 		 */
3566 		if (ovm->vm_shm)
3567 			shmexit(ovm);
3568 #endif
3569 
3570 		/*
3571 		 * POSIX 1003.1b -- "lock future mappings" is revoked
3572 		 * when a process execs another program image.
3573 		 */
3574 		vm_map_lock(map);
3575 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE|VM_MAP_SYSCALL_ONCE);
3576 
3577 		/*
3578 		 * now unmap the old program
3579 		 *
3580 		 * Instead of attempting to keep the map valid, we simply
3581 		 * nuke all entries and ask uvm_map_setup to reinitialize
3582 		 * the map to the new boundaries.
3583 		 *
3584 		 * uvm_unmap_remove will actually nuke all entries for us
3585 		 * (as in, not replace them with free-memory entries).
3586 		 */
3587 		uvm_unmap_remove(map, map->min_offset, map->max_offset,
3588 		    &dead_entries, TRUE, FALSE);
3589 
3590 		KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
3591 
3592 		/* Nuke statistics and boundaries. */
3593 		memset(&ovm->vm_startcopy, 0,
3594 		    (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
3595 
3596 
3597 		if (end & (vaddr_t)PAGE_MASK) {
3598 			end += 1;
3599 			if (end == 0) /* overflow */
3600 				end -= PAGE_SIZE;
3601 		}
3602 
3603 		/* Setup new boundaries and populate map with entries. */
3604 		map->min_offset = start;
3605 		map->max_offset = end;
3606 		uvm_map_setup_entries(map);
3607 		vm_map_unlock(map);
3608 
3609 		/* but keep MMU holes unavailable */
3610 		pmap_remove_holes(ovm);
3611 	} else {
3612 		/*
3613 		 * pr's vmspace is being shared, so we can't reuse
3614 		 * it for pr since it is still being used for others.
3615 		 * allocate a new vmspace for pr
3616 		 */
3617 		nvm = uvmspace_alloc(start, end,
3618 		    (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
3619 
3620 		/* install new vmspace and drop our ref to the old one. */
3621 		pmap_deactivate(p);
3622 		p->p_vmspace = pr->ps_vmspace = nvm;
3623 		pmap_activate(p);
3624 
3625 		uvmspace_free(ovm);
3626 	}
3627 
3628 	/* Release dead entries */
3629 	uvm_unmap_detach(&dead_entries, 0);
3630 }
3631 
3632 /*
3633  * uvmspace_free: free a vmspace data structure
3634  *
3635  * - XXX: no locking on vmspace
3636  */
3637 void
3638 uvmspace_free(struct vmspace *vm)
3639 {
3640 	if (--vm->vm_refcnt == 0) {
3641 		/*
3642 		 * lock the map, to wait out all other references to it.  delete
3643 		 * all of the mappings and pages they hold, then call the pmap
3644 		 * module to reclaim anything left.
3645 		 */
3646 #ifdef SYSVSHM
3647 		/* Get rid of any SYSV shared memory segments. */
3648 		if (vm->vm_shm != NULL)
3649 			shmexit(vm);
3650 #endif
3651 
3652 		uvm_map_teardown(&vm->vm_map);
3653 		pool_put(&uvm_vmspace_pool, vm);
3654 	}
3655 }
3656 
3657 /*
3658  * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
3659  * srcmap to the address range [dstaddr, dstaddr + sz) in
3660  * dstmap.
3661  *
3662  * The whole address range in srcmap must be backed by an object
3663  * (no holes).
3664  *
3665  * If successful, the address ranges share memory and the destination
3666  * address range uses the protection flags in prot.
3667  *
3668  * This routine assumes that sz is a multiple of PAGE_SIZE and
3669  * that dstaddr and srcaddr are page-aligned.
3670  */
3671 int
3672 uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
3673     struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
3674 {
3675 	int ret = 0;
3676 	vaddr_t unmap_end;
3677 	vaddr_t dstva;
3678 	vsize_t s_off, len, n = sz, remain;
3679 	struct vm_map_entry *first = NULL, *last = NULL;
3680 	struct vm_map_entry *src_entry, *psrc_entry = NULL;
3681 	struct uvm_map_deadq dead;
3682 
3683 	if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
3684 		return EINVAL;
3685 
3686 	TAILQ_INIT(&dead);
3687 	vm_map_lock(dstmap);
3688 	vm_map_lock_read(srcmap);
3689 
3690 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
3691 		ret = ENOMEM;
3692 		goto exit_unlock;
3693 	}
3694 	if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
3695 		ret = EINVAL;
3696 		goto exit_unlock;
3697 	}
3698 
3699 	dstva = dstaddr;
3700 	unmap_end = dstaddr;
3701 	for (; src_entry != NULL;
3702 	    psrc_entry = src_entry,
3703 	    src_entry = RBT_NEXT(uvm_map_addr, src_entry)) {
3704 		/* hole in address space, bail out */
3705 		if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
3706 			break;
3707 		if (src_entry->start >= srcaddr + sz)
3708 			break;
3709 
3710 		if (UVM_ET_ISSUBMAP(src_entry))
3711 			panic("uvm_share: encountered a submap (illegal)");
3712 		if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
3713 		    UVM_ET_ISNEEDSCOPY(src_entry))
3714 			panic("uvm_share: non-copy_on_write map entries "
3715 			    "marked needs_copy (illegal)");
3716 
3717 		/*
3718 		 * srcaddr > map entry start means we are in the middle of a
3719 		 * map entry, so calculate the offset to use in the source entry.
3720 		 */
3721 		if (srcaddr > src_entry->start)
3722 			s_off = srcaddr - src_entry->start;
3723 		else if (srcaddr == src_entry->start)
3724 			s_off = 0;
3725 		else
3726 			panic("uvm_share: map entry start > srcaddr");
3727 
3728 		remain = src_entry->end - src_entry->start - s_off;
3729 
3730 		/* Determine how many bytes to share in this pass */
3731 		if (n < remain)
3732 			len = n;
3733 		else
3734 			len = remain;
3735 
3736 		if (uvm_mapent_share(dstmap, dstva, len, s_off, prot, prot,
3737 		    srcmap, src_entry, &dead) == NULL)
3738 			break;
3739 
3740 		n -= len;
3741 		dstva += len;
3742 		srcaddr += len;
3743 		unmap_end = dstva + len;
3744 		if (n == 0)
3745 			goto exit_unlock;
3746 	}
3747 
3748 	ret = EINVAL;
3749 	uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
3750 
3751 exit_unlock:
3752 	vm_map_unlock_read(srcmap);
3753 	vm_map_unlock(dstmap);
3754 	uvm_unmap_detach(&dead, 0);
3755 
3756 	return ret;
3757 }
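
/*
 * Illustrative sketch (editor's addition): a hedged uvm_share() call.
 * The maps and addresses are hypothetical; the only requirements taken
 * from the comment above are that sz is a multiple of PAGE_SIZE and that
 * both addresses are page aligned.
 */
#if 0	/* example only */
	struct vm_map *srcmap, *dstmap;	/* assumed valid, unlocked maps */
	vaddr_t srcva, dstva;		/* assumed page-aligned */
	vsize_t sz = 4 * PAGE_SIZE;
	int error;

	error = uvm_share(dstmap, dstva, PROT_READ | PROT_WRITE,
	    srcmap, srcva, sz);
	if (error)
		printf("uvm_share: %d\n", error);
#endif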
3758 
3759 /*
3760  * Clone map entry into other map.
3761  *
3762  * Mapping will be placed at dstaddr, for the same length.
3763  * Space must be available.
3764  * Reference counters are incremented.
3765  */
3766 struct vm_map_entry *
3767 uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3768     vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
3769     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
3770     int mapent_flags, int amap_share_flags)
3771 {
3772 	struct vm_map_entry *new_entry, *first, *last;
3773 
3774 	KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
3775 
3776 	/* Create new entry (linked in on creation). Fill in first, last. */
3777 	first = last = NULL;
3778 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
3779 		panic("uvm_mapent_clone: no space in map for "
3780 		    "entry in empty map");
3781 	}
3782 	new_entry = uvm_map_mkentry(dstmap, first, last,
3783 	    dstaddr, dstlen, mapent_flags, dead, NULL);
3784 	if (new_entry == NULL)
3785 		return NULL;
3786 	/* old_entry -> new_entry */
3787 	new_entry->object = old_entry->object;
3788 	new_entry->offset = old_entry->offset;
3789 	new_entry->aref = old_entry->aref;
3790 	new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
3791 	new_entry->protection = prot;
3792 	new_entry->max_protection = maxprot;
3793 	new_entry->inheritance = old_entry->inheritance;
3794 	new_entry->advice = old_entry->advice;
3795 
3796 	/* gain reference to object backing the map (can't be a submap). */
3797 	if (new_entry->aref.ar_amap) {
3798 		new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
3799 		amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3800 		    (new_entry->end - new_entry->start) >> PAGE_SHIFT,
3801 		    amap_share_flags);
3802 	}
3803 
3804 	if (UVM_ET_ISOBJ(new_entry) &&
3805 	    new_entry->object.uvm_obj->pgops->pgo_reference) {
3806 		new_entry->offset += off;
3807 		new_entry->object.uvm_obj->pgops->pgo_reference
3808 		    (new_entry->object.uvm_obj);
3809 	}
3810 
3811 	return new_entry;
3812 }
3813 
3814 struct vm_map_entry *
3815 uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3816     vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
3817     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3818 {
3819 	/*
3820 	 * If old_entry refers to a copy-on-write region that has not yet been
3821 	 * written to (needs_copy flag is set), then we need to allocate a new
3822 	 * amap for old_entry.
3823 	 *
3824 	 * If we do not do this, and the process owning old_entry later does a
3825 	 * copy-on-write, old_entry and new_entry will refer to different memory
3826 	 * regions, and the memory is no longer shared between the processes.
3827 	 *
3828 	 * [in other words, we need to clear needs_copy]
3829 	 */
3830 
3831 	if (UVM_ET_ISNEEDSCOPY(old_entry)) {
3832 		/* get our own amap, clears needs_copy */
3833 		amap_copy(old_map, old_entry, M_WAITOK, FALSE, 0, 0);
3834 		/* XXXCDC: WAITOK??? */
3835 	}
3836 
3837 	return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
3838 	    prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
3839 }
3840 
3841 /*
3842  * share the mapping: this means we want the old and
3843  * new entries to share amaps and backing objects.
3844  */
3845 struct vm_map_entry *
3846 uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
3847     struct vm_map *old_map,
3848     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3849 {
3850 	struct vm_map_entry *new_entry;
3851 
3852 	new_entry = uvm_mapent_share(new_map, old_entry->start,
3853 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3854 	    old_entry->max_protection, old_map, old_entry, dead);
3855 
3856 	/*
3857 	 * pmap_copy the mappings: this routine is optional
3858 	 * but if it is there it will reduce the number of
3859 	 * page faults in the new proc.
3860 	 */
3861 	if (!UVM_ET_ISHOLE(new_entry))
3862 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
3863 		    (new_entry->end - new_entry->start), new_entry->start);
3864 
3865 	return (new_entry);
3866 }
3867 
3868 /*
3869  * copy-on-write the mapping (using mmap's
3870  * MAP_PRIVATE semantics)
3871  *
3872  * allocate new_entry, adjust reference counts.
3873  * (note that new references are read-only).
3874  */
3875 struct vm_map_entry *
3876 uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
3877     struct vm_map *old_map,
3878     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3879 {
3880 	struct vm_map_entry	*new_entry;
3881 	boolean_t		 protect_child;
3882 
3883 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3884 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3885 	    old_entry->max_protection, old_entry, dead, 0, 0);
3886 
3887 	new_entry->etype |=
3888 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3889 
3890 	/*
3891 	 * the new entry will need an amap.  it will either
3892 	 * need to be copied from the old entry or created
3893 	 * from scratch (if the old entry does not have an
3894 	 * amap).  can we defer this process until later
3895 	 * (by setting "needs_copy") or do we need to copy
3896 	 * the amap now?
3897 	 *
3898 	 * we must copy the amap now if any of the following
3899 	 * conditions hold:
3900 	 * 1. the old entry has an amap and that amap is
3901 	 *    being shared.  this means that the old (parent)
3902 	 *    process is sharing the amap with another
3903 	 *    process.  if we do not clear needs_copy here
3904 	 *    we will end up in a situation where both the
3905 	 *    parent and child process are referring to the
3906 	 *    same amap with "needs_copy" set.  if the
3907 	 *    parent write-faults, the fault routine will
3908 	 *    clear "needs_copy" in the parent by allocating
3909 	 *    a new amap.   this is wrong because the
3910 	 *    parent is supposed to be sharing the old amap
3911 	 *    and the new amap will break that.
3912 	 *
3913 	 * 2. if the old entry has an amap and a non-zero
3914 	 *    wire count then we are going to have to call
3915 	 *    amap_cow_now to avoid page faults in the
3916 	 *    parent process.   since amap_cow_now requires
3917 	 *    "needs_copy" to be clear we might as well
3918 	 *    clear it here as well.
3919 	 *
3920 	 */
3921 	if (old_entry->aref.ar_amap != NULL &&
3922 	    ((amap_flags(old_entry->aref.ar_amap) &
3923 	    AMAP_SHARED) != 0 ||
3924 	    VM_MAPENT_ISWIRED(old_entry))) {
3925 		amap_copy(new_map, new_entry, M_WAITOK, FALSE,
3926 		    0, 0);
3927 		/* XXXCDC: M_WAITOK ... ok? */
3928 	}
3929 
3930 	/*
3931 	 * if the parent's entry is wired down, then the
3932 	 * parent process does not want page faults on
3933 	 * access to that memory.  this means that we
3934 	 * cannot do copy-on-write because we can't write
3935 	 * protect the old entry.   in this case we
3936 	 * resolve all copy-on-write faults now, using
3937 	 * amap_cow_now.   note that we have already
3938 	 * allocated any needed amap (above).
3939 	 */
3940 	if (VM_MAPENT_ISWIRED(old_entry)) {
3941 		/*
3942 		 * resolve all copy-on-write faults now
3943 		 * (note that there is nothing to do if
3944 		 * the old mapping does not have an amap).
3945 		 * XXX: is it worthwhile to bother with
3946 		 * pmap_copy in this case?
3947 		 */
3948 		if (old_entry->aref.ar_amap)
3949 			amap_cow_now(new_map, new_entry);
3950 	} else {
3951 		if (old_entry->aref.ar_amap) {
3952 			/*
3953 			 * setup mappings to trigger copy-on-write faults
3954 			 * we must write-protect the parent if it has
3955 			 * an amap and it is not already "needs_copy"...
3956 			 * if it is already "needs_copy" then the parent
3957 			 * has already been write-protected by a previous
3958 			 * fork operation.
3959 			 *
3960 			 * if we do not write-protect the parent, then
3961 			 * we must be sure to write-protect the child
3962 			 * after the pmap_copy() operation.
3963 			 *
3964 			 * XXX: pmap_copy should have some way of telling
3965 			 * us that it didn't do anything so we can avoid
3966 			 * calling pmap_protect needlessly.
3967 			 */
3968 			if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
3969 				if (old_entry->max_protection & PROT_WRITE) {
3970 					pmap_protect(old_map->pmap,
3971 					    old_entry->start,
3972 					    old_entry->end,
3973 					    old_entry->protection &
3974 					    ~PROT_WRITE);
3975 					pmap_update(old_map->pmap);
3976 				}
3977 				old_entry->etype |= UVM_ET_NEEDSCOPY;
3978 			}
3979 
3980 	  		/* parent must now be write-protected */
3981 	  		protect_child = FALSE;
3982 		} else {
3983 			/*
3984 			 * we only need to protect the child if the
3985 			 * parent has write access.
3986 			 */
3987 			if (old_entry->max_protection & PROT_WRITE)
3988 				protect_child = TRUE;
3989 			else
3990 				protect_child = FALSE;
3991 		}
3992 		/*
3993 		 * copy the mappings
3994 		 * XXX: need a way to tell if this does anything
3995 		 */
3996 		if (!UVM_ET_ISHOLE(new_entry))
3997 			pmap_copy(new_map->pmap, old_map->pmap,
3998 			    new_entry->start,
3999 			    (old_entry->end - old_entry->start),
4000 			    old_entry->start);
4001 
4002 		/* protect the child's mappings if necessary */
4003 		if (protect_child) {
4004 			pmap_protect(new_map->pmap, new_entry->start,
4005 			    new_entry->end,
4006 			    new_entry->protection &
4007 			    ~PROT_WRITE);
4008 		}
4009 	}
4010 
4011 	return (new_entry);
4012 }
4013 
4014 /*
4015  * zero the mapping: the new entry will be zero initialized
4016  */
4017 struct vm_map_entry *
4018 uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
4019     struct vm_map *old_map,
4020     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
4021 {
4022 	struct vm_map_entry *new_entry;
4023 
4024 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
4025 	    old_entry->end - old_entry->start, 0, old_entry->protection,
4026 	    old_entry->max_protection, old_entry, dead, 0, 0);
4027 
4028 	new_entry->etype |=
4029 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
4030 
4031 	if (new_entry->aref.ar_amap) {
4032 		amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
4033 		    atop(new_entry->end - new_entry->start), 0);
4034 		new_entry->aref.ar_amap = NULL;
4035 		new_entry->aref.ar_pageoff = 0;
4036 	}
4037 
4038 	if (UVM_ET_ISOBJ(new_entry)) {
4039 		if (new_entry->object.uvm_obj->pgops->pgo_detach)
4040 			new_entry->object.uvm_obj->pgops->pgo_detach(
4041 			    new_entry->object.uvm_obj);
4042 		new_entry->object.uvm_obj = NULL;
4043 		new_entry->etype &= ~UVM_ET_OBJ;
4044 	}
4045 
4046 	return (new_entry);
4047 }
4048 
4049 /*
4050  * uvmspace_fork: fork a process' main map
4051  *
4052  * => create a new vmspace for child process from parent.
4053  * => parent's map must not be locked.
4054  */
4055 struct vmspace *
4056 uvmspace_fork(struct process *pr)
4057 {
4058 	struct vmspace *vm1 = pr->ps_vmspace;
4059 	struct vmspace *vm2;
4060 	struct vm_map *old_map = &vm1->vm_map;
4061 	struct vm_map *new_map;
4062 	struct vm_map_entry *old_entry, *new_entry;
4063 	struct uvm_map_deadq dead;
4064 
4065 	vm_map_lock(old_map);
4066 
4067 	vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
4068 	    (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
4069 	memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
4070 	    (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
4071 	vm2->vm_dused = 0; /* Statistic managed by us. */
4072 	new_map = &vm2->vm_map;
4073 	vm_map_lock(new_map);
4074 
4075 	/* go entry-by-entry */
4076 	TAILQ_INIT(&dead);
4077 	RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
4078 		if (old_entry->start == old_entry->end)
4079 			continue;
4080 
4081 		/* first, some sanity checks on the old entry */
4082 		if (UVM_ET_ISSUBMAP(old_entry)) {
4083 			panic("fork: encountered a submap during fork "
4084 			    "(illegal)");
4085 		}
4086 
4087 		if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
4088 		    UVM_ET_ISNEEDSCOPY(old_entry)) {
4089 			panic("fork: non-copy_on_write map entry marked "
4090 			    "needs_copy (illegal)");
4091 		}
4092 
4093 		/* Apply inheritance. */
4094 		switch (old_entry->inheritance) {
4095 		case MAP_INHERIT_SHARE:
4096 			new_entry = uvm_mapent_forkshared(vm2, new_map,
4097 			    old_map, old_entry, &dead);
4098 			break;
4099 		case MAP_INHERIT_COPY:
4100 			new_entry = uvm_mapent_forkcopy(vm2, new_map,
4101 			    old_map, old_entry, &dead);
4102 			break;
4103 		case MAP_INHERIT_ZERO:
4104 			new_entry = uvm_mapent_forkzero(vm2, new_map,
4105 			    old_map, old_entry, &dead);
4106 			break;
4107 		default:
4108 			continue;
4109 		}
4110 
4111 	 	/* Update process statistics. */
4112 		if (!UVM_ET_ISHOLE(new_entry))
4113 			new_map->size += new_entry->end - new_entry->start;
4114 		if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry) &&
4115 		    new_entry->protection != PROT_NONE) {
4116 			vm2->vm_dused += uvmspace_dused(
4117 			    new_map, new_entry->start, new_entry->end);
4118 		}
4119 	}
4120 
4121 	vm_map_unlock(old_map);
4122 	vm_map_unlock(new_map);
4123 
4124 	/*
4125 	 * This can actually happen, if multiple entries described a
4126 	 * space in which an entry was inherited.
4127 	 */
4128 	uvm_unmap_detach(&dead, 0);
4129 
4130 #ifdef SYSVSHM
4131 	if (vm1->vm_shm)
4132 		shmfork(vm1, vm2);
4133 #endif
4134 
4135 	return vm2;
4136 }
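
/*
 * Illustrative sketch (editor's addition): how a fork(2)-style path might
 * use uvmspace_fork().  "parent" and "child" are hypothetical struct
 * process pointers; error handling is omitted.
 */
#if 0	/* example only */
	struct vmspace *vm2;

	vm2 = uvmspace_fork(parent);	/* copies/shares entries per inheritance */
	child->ps_vmspace = vm2;	/* child starts with its own vmspace */
#endif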
4137 
4138 /*
4139  * uvm_map_hint: return the beginning of the best area suitable for
4140  * creating a new mapping with "prot" protection.
4141  */
4142 vaddr_t
4143 uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
4144     vaddr_t maxaddr)
4145 {
4146 	vaddr_t addr;
4147 	vaddr_t spacing;
4148 
4149 #ifdef __i386__
4150 	/*
4151 	 * If executable, skip the first two pages; otherwise start
4152 	 * after the data + heap region.
4153 	 */
4154 	if ((prot & PROT_EXEC) != 0 &&
4155 	    (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
4156 		addr = (PAGE_SIZE*2) +
4157 		    (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
4158 		return (round_page(addr));
4159 	}
4160 #endif
4161 
4162 #if defined (__LP64__)
4163 	spacing = MIN(4UL * 1024 * 1024 * 1024, MAXDSIZ) - 1;
4164 #else
4165 	spacing = MIN(1 * 1024 * 1024 * 1024, MAXDSIZ) - 1;
4166 #endif
4167 
4168 	/*
4169 	 * Start malloc/mmap after the brk.
4170 	 */
4171 	addr = (vaddr_t)vm->vm_daddr + BRKSIZ;
4172 	addr = MAX(addr, minaddr);
4173 
4174 	if (addr < maxaddr) {
4175 		while (spacing > maxaddr - addr)
4176 			spacing >>= 1;
4177 	}
4178 	addr += arc4random() & spacing;
4179 	return (round_page(addr));
4180 }
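
/*
 * Illustrative sketch (editor's addition): asking for a starting address
 * for a new non-executable mapping.  "p" is a hypothetical struct proc.
 */
#if 0	/* example only */
	vaddr_t hint;

	hint = uvm_map_hint(p->p_vmspace, PROT_READ | PROT_WRITE,
	    VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	/* hint is page aligned and randomized past the brk area. */
#endif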
4181 
4182 /*
4183  * uvm_map_submap: punch down part of a map into a submap
4184  *
4185  * => only the kernel_map is allowed to be submapped
4186  * => the purpose of submapping is to break up the locking granularity
4187  *	of a larger map
4188  * => the range specified must have been mapped previously with a uvm_map()
4189  *	call [with uobj==NULL] to create a blank map entry in the main map.
4190  *	[And it had better still be blank!]
4191  * => maps which contain submaps should never be copied or forked.
4192  * => to remove a submap, use uvm_unmap() on the main map
4193  *	and then uvm_map_deallocate() the submap.
4194  * => main map must be unlocked.
4195  * => submap must have been init'd and have a zero reference count.
4196  *	[need not be locked as we don't actually reference it]
4197  */
4198 int
4199 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
4200     struct vm_map *submap)
4201 {
4202 	struct vm_map_entry *entry;
4203 	int result;
4204 
4205 	if (start > map->max_offset || end > map->max_offset ||
4206 	    start < map->min_offset || end < map->min_offset)
4207 		return EINVAL;
4208 
4209 	vm_map_lock(map);
4210 
4211 	if (uvm_map_lookup_entry(map, start, &entry)) {
4212 		UVM_MAP_CLIP_START(map, entry, start);
4213 		UVM_MAP_CLIP_END(map, entry, end);
4214 	} else
4215 		entry = NULL;
4216 
4217 	if (entry != NULL &&
4218 	    entry->start == start && entry->end == end &&
4219 	    entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
4220 	    !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
4221 		entry->etype |= UVM_ET_SUBMAP;
4222 		entry->object.sub_map = submap;
4223 		entry->offset = 0;
4224 		uvm_map_reference(submap);
4225 		result = 0;
4226 	} else
4227 		result = EINVAL;
4228 
4229 	vm_map_unlock(map);
4230 	return (result);
4231 }
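
/*
 * Illustrative sketch (editor's addition): installing and later removing a
 * submap, following the requirements listed above.  "submap", "start" and
 * "end" are hypothetical; the submap is assumed to have been created and
 * initialized elsewhere and the range to be a blank entry in kernel_map.
 */
#if 0	/* example only */
	int error;

	error = uvm_map_submap(kernel_map, start, end, submap);

	/* Removal, per the comment above: */
	uvm_unmap(kernel_map, start, end);
	uvm_map_deallocate(submap);
#endif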
4232 
4233 /*
4234  * uvm_map_checkprot: check protection in map
4235  *
4236  * => must allow specified protection in a fully allocated region.
4237  * => map must be read or write locked by caller.
4238  */
4239 boolean_t
4240 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
4241     vm_prot_t protection)
4242 {
4243 	struct vm_map_entry *entry;
4244 
4245 	if (start < map->min_offset || end > map->max_offset || start > end)
4246 		return FALSE;
4247 	if (start == end)
4248 		return TRUE;
4249 
4250 	/*
4251 	 * Iterate entries.
4252 	 */
4253 	for (entry = uvm_map_entrybyaddr(&map->addr, start);
4254 	    entry != NULL && entry->start < end;
4255 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4256 		/* Fail if a hole is found. */
4257 		if (UVM_ET_ISHOLE(entry) ||
4258 		    (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
4259 			return FALSE;
4260 
4261 		/* Check protection. */
4262 		if ((entry->protection & protection) != protection)
4263 			return FALSE;
4264 	}
4265 	return TRUE;
4266 }
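
/*
 * Illustrative sketch (editor's addition): a typical checkprot call.  The
 * caller is assumed to already hold the map read- or write-locked, as the
 * comment above requires.
 */
#if 0	/* example only */
	if (!uvm_map_checkprot(map, va, va + len, PROT_READ))
		return EFAULT;	/* hypothetical error handling */
#endif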
4267 
4268 /*
4269  * uvm_map_create: create map
4270  */
4271 vm_map_t
4272 uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
4273 {
4274 	vm_map_t map;
4275 
4276 	map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
4277 	uvm_map_setup(map, pmap, min, max, flags);
4278 	return (map);
4279 }
4280 
4281 /*
4282  * uvm_map_deallocate: drop reference to a map
4283  *
4284  * => caller must not lock map
4285  * => we will zap map if ref count goes to zero
4286  */
4287 void
4288 uvm_map_deallocate(vm_map_t map)
4289 {
4290 	int c;
4291 	struct uvm_map_deadq dead;
4292 
4293 	c = --map->ref_count;
4294 	if (c > 0) {
4295 		return;
4296 	}
4297 
4298 	/*
4299 	 * all references gone.   unmap and free.
4300 	 *
4301 	 * No lock required: we are the only one to access this map.
4302 	 */
4303 	TAILQ_INIT(&dead);
4304 	uvm_tree_sanity(map, __FILE__, __LINE__);
4305 	uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
4306 	    TRUE, FALSE);
4307 	pmap_destroy(map->pmap);
4308 	KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
4309 	free(map, M_VMMAP, sizeof *map);
4310 
4311 	uvm_unmap_detach(&dead, 0);
4312 }
4313 
4314 /*
4315  * uvm_map_inherit: set inheritance code for range of addrs in map.
4316  *
4317  * => map must be unlocked
4318  * => note that the inherit code is used during a "fork".  see fork
4319  *	code for details.
4320  */
4321 int
4322 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
4323     vm_inherit_t new_inheritance)
4324 {
4325 	struct vm_map_entry *entry;
4326 
4327 	switch (new_inheritance) {
4328 	case MAP_INHERIT_NONE:
4329 	case MAP_INHERIT_COPY:
4330 	case MAP_INHERIT_SHARE:
4331 	case MAP_INHERIT_ZERO:
4332 		break;
4333 	default:
4334 		return (EINVAL);
4335 	}
4336 
4337 	if (start > end)
4338 		return EINVAL;
4339 	start = MAX(start, map->min_offset);
4340 	end = MIN(end, map->max_offset);
4341 	if (start >= end)
4342 		return 0;
4343 
4344 	vm_map_lock(map);
4345 
4346 	entry = uvm_map_entrybyaddr(&map->addr, start);
4347 	if (entry->end > start)
4348 		UVM_MAP_CLIP_START(map, entry, start);
4349 	else
4350 		entry = RBT_NEXT(uvm_map_addr, entry);
4351 
4352 	while (entry != NULL && entry->start < end) {
4353 		UVM_MAP_CLIP_END(map, entry, end);
4354 		entry->inheritance = new_inheritance;
4355 		entry = RBT_NEXT(uvm_map_addr, entry);
4356 	}
4357 
4358 	vm_map_unlock(map);
4359 	return (0);
4360 }
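
/*
 * Illustrative sketch (editor's addition): marking a range to be shared
 * with children across fork(), roughly what a minherit(2)-style caller
 * would do.  "p", "va" and "len" are hypothetical.
 */
#if 0	/* example only */
	int error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, va, va + len,
	    MAP_INHERIT_SHARE);
#endif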
4361 
4362 /*
4363  * uvm_map_syscall: permit system calls for range of addrs in map.
4364  *
4365  * => map must be unlocked
4366  */
4367 int
4368 uvm_map_syscall(struct vm_map *map, vaddr_t start, vaddr_t end)
4369 {
4370 	struct vm_map_entry *entry;
4371 
4372 	if (start > end)
4373 		return EINVAL;
4374 	start = MAX(start, map->min_offset);
4375 	end = MIN(end, map->max_offset);
4376 	if (start >= end)
4377 		return 0;
4378 	if (map->flags & VM_MAP_SYSCALL_ONCE)	/* only allowed once */
4379 		return (EPERM);
4380 
4381 	vm_map_lock(map);
4382 
4383 	entry = uvm_map_entrybyaddr(&map->addr, start);
4384 	if (entry->end > start)
4385 		UVM_MAP_CLIP_START(map, entry, start);
4386 	else
4387 		entry = RBT_NEXT(uvm_map_addr, entry);
4388 
4389 	while (entry != NULL && entry->start < end) {
4390 		UVM_MAP_CLIP_END(map, entry, end);
4391 		entry->etype |= UVM_ET_SYSCALL;
4392 		entry = RBT_NEXT(uvm_map_addr, entry);
4393 	}
4394 
4395 	map->wserial++;
4396 	map->flags |= VM_MAP_SYSCALL_ONCE;
4397 	vm_map_unlock(map);
4398 	return (0);
4399 }
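
/*
 * Illustrative sketch (editor's addition): permitting system calls only
 * from a text segment, done once at exec time.  "text_start"/"text_end"
 * are hypothetical page-aligned bounds.
 */
#if 0	/* example only */
	int error;

	error = uvm_map_syscall(&p->p_vmspace->vm_map, text_start, text_end);
	/* A second call returns EPERM because VM_MAP_SYSCALL_ONCE is set. */
#endif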
4400 
4401 /*
4402  * uvm_map_advice: set advice code for range of addrs in map.
4403  *
4404  * => map must be unlocked
4405  */
4406 int
4407 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
4408 {
4409 	struct vm_map_entry *entry;
4410 
4411 	switch (new_advice) {
4412 	case MADV_NORMAL:
4413 	case MADV_RANDOM:
4414 	case MADV_SEQUENTIAL:
4415 		break;
4416 	default:
4417 		return (EINVAL);
4418 	}
4419 
4420 	if (start > end)
4421 		return EINVAL;
4422 	start = MAX(start, map->min_offset);
4423 	end = MIN(end, map->max_offset);
4424 	if (start >= end)
4425 		return 0;
4426 
4427 	vm_map_lock(map);
4428 
4429 	entry = uvm_map_entrybyaddr(&map->addr, start);
4430 	if (entry != NULL && entry->end > start)
4431 		UVM_MAP_CLIP_START(map, entry, start);
4432 	else if (entry != NULL)
4433 		entry = RBT_NEXT(uvm_map_addr, entry);
4434 
4435 	/*
4436 	 * XXXJRT: disallow holes?
4437 	 */
4438 	while (entry != NULL && entry->start < end) {
4439 		UVM_MAP_CLIP_END(map, entry, end);
4440 		entry->advice = new_advice;
4441 		entry = RBT_NEXT(uvm_map_addr, entry);
4442 	}
4443 
4444 	vm_map_unlock(map);
4445 	return (0);
4446 }
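
/*
 * Illustrative sketch (editor's addition): a madvise(2)-style hint that a
 * range will be read sequentially.  "p", "va" and "len" are hypothetical.
 */
#if 0	/* example only */
	int error;

	error = uvm_map_advice(&p->p_vmspace->vm_map, va, va + len,
	    MADV_SEQUENTIAL);
#endif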
4447 
4448 /*
4449  * uvm_map_extract: extract a mapping from a map and put it somewhere
4450  * in the kernel_map, setting protection to max_prot.
4451  *
4452  * => map should be unlocked (we will write lock it and kernel_map)
4453  * => returns 0 on success, error code otherwise
4454  * => start must be page aligned
4455  * => len must be page sized
4456  * => flags:
4457  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
4458  * Mappings are QREF's.
4459  */
4460 int
4461 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
4462     vaddr_t *dstaddrp, int flags)
4463 {
4464 	struct uvm_map_deadq dead;
4465 	struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
4466 	vaddr_t dstaddr;
4467 	vaddr_t end;
4468 	vaddr_t cp_start;
4469 	vsize_t cp_len, cp_off;
4470 	int error;
4471 
4472 	TAILQ_INIT(&dead);
4473 	end = start + len;
4474 
4475 	/*
4476 	 * Sanity check on the parameters.
4477 	 * Also, since the mapping may not contain gaps, error out if the
4478 	 * mapped area is not in the source map.
4479 	 */
4480 	if ((start & (vaddr_t)PAGE_MASK) != 0 ||
4481 	    (end & (vaddr_t)PAGE_MASK) != 0 || end < start)
4482 		return EINVAL;
4483 	if (start < srcmap->min_offset || end > srcmap->max_offset)
4484 		return EINVAL;
4485 
4486 	/* Initialize dead entries. Handle len == 0 case. */
4487 	if (len == 0)
4488 		return 0;
4489 
4490 	/* Acquire lock on srcmap. */
4491 	vm_map_lock(srcmap);
4492 
4493 	/* With srcmap locked, look up the first entry in <start,len>. */
4494 	first = uvm_map_entrybyaddr(&srcmap->addr, start);
4495 
4496 	/* Check that the range is contiguous. */
4497 	for (entry = first; entry != NULL && entry->end < end;
4498 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4499 		if (VMMAP_FREE_END(entry) != entry->end ||
4500 		    UVM_ET_ISHOLE(entry)) {
4501 			error = EINVAL;
4502 			goto fail;
4503 		}
4504 	}
4505 	if (entry == NULL || UVM_ET_ISHOLE(entry)) {
4506 		error = EINVAL;
4507 		goto fail;
4508 	}
4509 
4510 	/*
4511 	 * Handle need-copy flag.
4512 	 */
4513 	for (entry = first; entry != NULL && entry->start < end;
4514 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4515 		if (UVM_ET_ISNEEDSCOPY(entry))
4516 			amap_copy(srcmap, entry, M_NOWAIT,
4517 			    UVM_ET_ISSTACK(entry) ? FALSE : TRUE, start, end);
4518 		if (UVM_ET_ISNEEDSCOPY(entry)) {
4519 			/*
4520 			 * amap_copy failure
4521 			 */
4522 			error = ENOMEM;
4523 			goto fail;
4524 		}
4525 	}
4526 
4527 	/* Lock destination map (kernel_map). */
4528 	vm_map_lock(kernel_map);
4529 
4530 	if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
4531 	    MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
4532 	    PROT_NONE, 0) != 0) {
4533 		error = ENOMEM;
4534 		goto fail2;
4535 	}
4536 	*dstaddrp = dstaddr;
4537 
4538 	/*
4539 	 * We now have srcmap and kernel_map locked.
4540 	 * dstaddr contains the destination offset in dstmap.
4541 	 */
4542 	/* step 1: start looping through map entries, performing extraction. */
4543 	for (entry = first; entry != NULL && entry->start < end;
4544 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4545 		KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
4546 		if (UVM_ET_ISHOLE(entry))
4547 			continue;
4548 
4549 		/* Calculate uvm_mapent_clone parameters. */
4550 		cp_start = entry->start;
4551 		if (cp_start < start) {
4552 			cp_off = start - cp_start;
4553 			cp_start = start;
4554 		} else
4555 			cp_off = 0;
4556 		cp_len = MIN(entry->end, end) - cp_start;
4557 
4558 		newentry = uvm_mapent_clone(kernel_map,
4559 		    cp_start - start + dstaddr, cp_len, cp_off,
4560 		    entry->protection, entry->max_protection,
4561 		    entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
4562 		if (newentry == NULL) {
4563 			error = ENOMEM;
4564 			goto fail2_unmap;
4565 		}
4566 		kernel_map->size += cp_len;
4567 		if (flags & UVM_EXTRACT_FIXPROT)
4568 			newentry->protection = newentry->max_protection;
4569 
4570 		/*
4571 		 * Step 2: perform pmap copy.
4572 		 * (Doing this in the loop saves one RB traversal.)
4573 		 */
4574 		pmap_copy(kernel_map->pmap, srcmap->pmap,
4575 		    cp_start - start + dstaddr, cp_len, cp_start);
4576 	}
4577 	pmap_update(kernel_map->pmap);
4578 
4579 	error = 0;
4580 
4581 	/* Unmap copied entries on failure. */
4582 fail2_unmap:
4583 	if (error) {
4584 		uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
4585 		    FALSE, TRUE);
4586 	}
4587 
4588 	/* Release maps, release dead entries. */
4589 fail2:
4590 	vm_map_unlock(kernel_map);
4591 
4592 fail:
4593 	vm_map_unlock(srcmap);
4594 
4595 	uvm_unmap_detach(&dead, 0);
4596 
4597 	return error;
4598 }
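
/*
 * Illustrative sketch (editor's addition): mirroring part of a user map
 * into kernel_map.  "p", "uva" and "len" are hypothetical; len must be
 * page sized and uva page aligned, as required above.
 */
#if 0	/* example only */
	vaddr_t kva;
	int error;

	error = uvm_map_extract(&p->p_vmspace->vm_map, uva, len,
	    &kva, UVM_EXTRACT_FIXPROT);
	/* On success, [kva, kva + len) aliases the user pages. */
#endif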
4599 
4600 /*
4601  * uvm_map_clean: clean out a map range
4602  *
4603  * => valid flags:
4604  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
4605  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
4606  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
4607  *   if (flags & PGO_FREE): any cached pages are freed after clean
4608  * => returns an error if any part of the specified range isn't mapped
4609  * => never a need to flush amap layer since the anonymous memory has
4610  *	no permanent home, but may deactivate pages there
4611  * => called from sys_msync() and sys_madvise()
4612  * => caller must not write-lock map (read OK).
4613  * => we may sleep while cleaning if SYNCIO [with map read-locked]
4614  */
4615 
4616 int
4617 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4618 {
4619 	struct vm_map_entry *first, *entry;
4620 	struct vm_amap *amap;
4621 	struct vm_anon *anon;
4622 	struct vm_page *pg;
4623 	struct uvm_object *uobj;
4624 	vaddr_t cp_start, cp_end;
4625 	int refs;
4626 	int error;
4627 	boolean_t rv;
4628 
4629 	KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
4630 	    (PGO_FREE|PGO_DEACTIVATE));
4631 
4632 	if (start > end || start < map->min_offset || end > map->max_offset)
4633 		return EINVAL;
4634 
4635 	vm_map_lock_read(map);
4636 	first = uvm_map_entrybyaddr(&map->addr, start);
4637 
4638 	/* Make a first pass to check for holes. */
4639 	for (entry = first; entry != NULL && entry->start < end;
4640 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4641 		if (UVM_ET_ISSUBMAP(entry)) {
4642 			vm_map_unlock_read(map);
4643 			return EINVAL;
4644 		}
4645 		if (UVM_ET_ISSUBMAP(entry) ||
4646 		    UVM_ET_ISHOLE(entry) ||
4647 		    (entry->end < end &&
4648 		    VMMAP_FREE_END(entry) != entry->end)) {
4649 			vm_map_unlock_read(map);
4650 			return EFAULT;
4651 		}
4652 	}
4653 
4654 	error = 0;
4655 	for (entry = first; entry != NULL && entry->start < end;
4656 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4657 		amap = entry->aref.ar_amap;	/* top layer */
4658 		if (UVM_ET_ISOBJ(entry))
4659 			uobj = entry->object.uvm_obj;
4660 		else
4661 			uobj = NULL;
4662 
4663 		/*
4664 		 * No amap cleaning necessary if:
4665 		 *  - there's no amap
4666 		 *  - we're not deactivating or freeing pages.
4667 		 */
4668 		if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4669 			goto flush_object;
4670 
4671 		cp_start = MAX(entry->start, start);
4672 		cp_end = MIN(entry->end, end);
4673 
4674 		for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
4675 			anon = amap_lookup(&entry->aref,
4676 			    cp_start - entry->start);
4677 			if (anon == NULL)
4678 				continue;
4679 
4680 			pg = anon->an_page;
4681 			if (pg == NULL) {
4682 				continue;
4683 			}
4684 			KASSERT(pg->pg_flags & PQ_ANON);
4685 
4686 			switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4687 			/*
4688 			 * XXX In these first 3 cases, we always just
4689 			 * XXX deactivate the page.  We may want to
4690 			 * XXX handle the different cases more
4691 			 * XXX specifically, in the future.
4692 			 */
4693 			case PGO_CLEANIT|PGO_FREE:
4694 			case PGO_CLEANIT|PGO_DEACTIVATE:
4695 			case PGO_DEACTIVATE:
4696 deactivate_it:
4697 				/* skip the page if it's wired */
4698 				if (pg->wire_count != 0)
4699 					break;
4700 
4701 				uvm_lock_pageq();
4702 
4703 				KASSERT(pg->uanon == anon);
4704 
4705 				/* zap all mappings for the page. */
4706 				pmap_page_protect(pg, PROT_NONE);
4707 
4708 				/* ...and deactivate the page. */
4709 				uvm_pagedeactivate(pg);
4710 
4711 				uvm_unlock_pageq();
4712 				break;
4713 			case PGO_FREE:
4714 				/*
4715 				 * If there are multiple references to
4716 				 * the amap, just deactivate the page.
4717 				 */
4718 				if (amap_refs(amap) > 1)
4719 					goto deactivate_it;
4720 
4721 				/* XXX skip the page if it's wired */
4722 				if (pg->wire_count != 0) {
4723 					break;
4724 				}
4725 				amap_unadd(&entry->aref,
4726 				    cp_start - entry->start);
4727 				refs = --anon->an_ref;
4728 				if (refs == 0)
4729 					uvm_anfree(anon);
4730 				break;
4731 			default:
4732 				panic("uvm_map_clean: weird flags");
4733 			}
4734 		}
4735 
4736 flush_object:
4737 		cp_start = MAX(entry->start, start);
4738 		cp_end = MIN(entry->end, end);
4739 
4740 		/*
4741 		 * flush pages if we've got a valid backing object.
4742 		 *
4743 		 * Don't PGO_FREE if we don't have write permission
4744 		 * and don't flush if this is a copy-on-write object
4745 		 * since we can't know our permissions on it.
4746 		 */
4747 		if (uobj != NULL &&
4748 		    ((flags & PGO_FREE) == 0 ||
4749 		     ((entry->max_protection & PROT_WRITE) != 0 &&
4750 		      (entry->etype & UVM_ET_COPYONWRITE) == 0))) {
4751 			rv = uobj->pgops->pgo_flush(uobj,
4752 			    cp_start - entry->start + entry->offset,
4753 			    cp_end - entry->start + entry->offset, flags);
4754 
4755 			if (rv == FALSE)
4756 				error = EFAULT;
4757 		}
4758 	}
4759 
4760 	vm_map_unlock_read(map);
4761 	return error;
4762 }
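
/*
 * Illustrative sketch (editor's addition): an msync(2)-style call that
 * writes dirty pages back synchronously.  "p", "va" and "len" are
 * hypothetical.
 */
#if 0	/* example only */
	int error;

	error = uvm_map_clean(&p->p_vmspace->vm_map, va, va + len,
	    PGO_CLEANIT | PGO_SYNCIO);
#endif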
4763 
4764 /*
4765  * UVM_MAP_CLIP_END implementation
4766  */
4767 void
4768 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4769 {
4770 	struct vm_map_entry *tmp;
4771 
4772 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4773 	tmp = uvm_mapent_alloc(map, 0);
4774 
4775 	/* Invoke splitentry. */
4776 	uvm_map_splitentry(map, entry, tmp, addr);
4777 }
4778 
4779 /*
4780  * UVM_MAP_CLIP_START implementation
4781  *
4782  * Clippers are required to not change the pointers to the entry they are
4783  * clipping on.
4784  * Since uvm_map_splitentry turns the original entry into the lowest
4785  * entry (address wise) we do a swap between the new entry and the original
4786  * entry, prior to calling uvm_map_splitentry.
4787  */
4788 void
4789 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4790 {
4791 	struct vm_map_entry *tmp;
4792 	struct uvm_addr_state *free;
4793 
4794 	/* Unlink original. */
4795 	free = uvm_map_uaddr_e(map, entry);
4796 	uvm_mapent_free_remove(map, free, entry);
4797 	uvm_mapent_addr_remove(map, entry);
4798 
4799 	/* Copy entry. */
4800 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4801 	tmp = uvm_mapent_alloc(map, 0);
4802 	uvm_mapent_copy(entry, tmp);
4803 
4804 	/* Put new entry in place of original entry. */
4805 	uvm_mapent_addr_insert(map, tmp);
4806 	uvm_mapent_free_insert(map, free, tmp);
4807 
4808 	/* Invoke splitentry. */
4809 	uvm_map_splitentry(map, tmp, entry, addr);
4810 }
4811 
4812 /*
4813  * Boundary fixer.
4814  */
4815 static inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
4816 static inline vaddr_t
4817 uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
4818 {
4819 	return (min < bound && max > bound) ? bound : max;
4820 }
4821 
4822 /*
4823  * Choose free list based on address at start of free space.
4824  *
4825  * The uvm_addr_state returned contains addr and is the first of:
4826  * - uaddr_exe
4827  * - uaddr_brk_stack
4828  * - uaddr_any
4829  */
4830 struct uvm_addr_state*
4831 uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
4832 {
4833 	struct uvm_addr_state *uaddr;
4834 	int i;
4835 
4836 	/* Special case the first page, to prevent mmap from returning 0. */
4837 	if (addr < VMMAP_MIN_ADDR)
4838 		return NULL;
4839 
4840 	/* Upper bound for kernel maps at uvm_maxkaddr. */
4841 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
4842 		if (addr >= uvm_maxkaddr)
4843 			return NULL;
4844 	}
4845 
4846 	/* Is the address inside the exe-only map? */
4847 	if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
4848 	    addr < map->uaddr_exe->uaddr_maxaddr)
4849 		return map->uaddr_exe;
4850 
4851 	/* Check if the space falls inside brk/stack area. */
4852 	if ((addr >= map->b_start && addr < map->b_end) ||
4853 	    (addr >= map->s_start && addr < map->s_end)) {
4854 		if (map->uaddr_brk_stack != NULL &&
4855 		    addr >= map->uaddr_brk_stack->uaddr_minaddr &&
4856 		    addr < map->uaddr_brk_stack->uaddr_maxaddr) {
4857 			return map->uaddr_brk_stack;
4858 		} else
4859 			return NULL;
4860 	}
4861 
4862 	/*
4863 	 * Check the other selectors.
4864 	 *
4865 	 * These selectors are only marked as the owner, if they have insert
4866 	 * functions.
4867 	 */
4868 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4869 		uaddr = map->uaddr_any[i];
4870 		if (uaddr == NULL)
4871 			continue;
4872 		if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
4873 			continue;
4874 
4875 		if (addr >= uaddr->uaddr_minaddr &&
4876 		    addr < uaddr->uaddr_maxaddr)
4877 			return uaddr;
4878 	}
4879 
4880 	return NULL;
4881 }
4882 
4883 /*
4884  * Choose free list based on address at start of free space.
4885  *
4886  * The uvm_addr_state returned contains addr and is the first of:
4887  * - uaddr_exe
4888  * - uaddr_brk_stack
4889  * - uaddr_any
4890  */
4891 struct uvm_addr_state*
4892 uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
4893 {
4894 	return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
4895 }
4896 
4897 /*
4898  * Returns the first free-memory boundary that is crossed by [min-max].
4899  */
4900 vsize_t
4901 uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
4902 {
4903 	struct uvm_addr_state	*uaddr;
4904 	int			 i;
4905 
4906 	/* Never return first page. */
4907 	max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
4908 
4909 	/* Treat the maxkaddr special, if the map is a kernel_map. */
4910 	if ((map->flags & VM_MAP_ISVMSPACE) == 0)
4911 		max = uvm_map_boundfix(min, max, uvm_maxkaddr);
4912 
4913 	/* Check for exe-only boundaries. */
4914 	if (map->uaddr_exe != NULL) {
4915 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
4916 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
4917 	}
4918 
4919 	/* Check for brk/stack boundaries. */
4920 	if (map->uaddr_brk_stack != NULL) {
4921 		max = uvm_map_boundfix(min, max,
4922 		    map->uaddr_brk_stack->uaddr_minaddr);
4923 		max = uvm_map_boundfix(min, max,
4924 		    map->uaddr_brk_stack->uaddr_maxaddr);
4925 	}
4926 
4927 	/* Check other boundaries. */
4928 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4929 		uaddr = map->uaddr_any[i];
4930 		if (uaddr != NULL) {
4931 			max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
4932 			max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
4933 		}
4934 	}
4935 
4936 	/* Boundaries at stack and brk() area. */
4937 	max = uvm_map_boundfix(min, max, map->s_start);
4938 	max = uvm_map_boundfix(min, max, map->s_end);
4939 	max = uvm_map_boundfix(min, max, map->b_start);
4940 	max = uvm_map_boundfix(min, max, map->b_end);
4941 
4942 	return max;
4943 }
4944 
4945 /*
4946  * Update map allocation start and end addresses from proc vmspace.
4947  */
4948 void
4949 uvm_map_vmspace_update(struct vm_map *map,
4950     struct uvm_map_deadq *dead, int flags)
4951 {
4952 	struct vmspace *vm;
4953 	vaddr_t b_start, b_end, s_start, s_end;
4954 
4955 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
4956 	KASSERT(offsetof(struct vmspace, vm_map) == 0);
4957 
4958 	/*
4959 	 * Derive actual allocation boundaries from vmspace.
4960 	 */
4961 	vm = (struct vmspace *)map;
4962 	b_start = (vaddr_t)vm->vm_daddr;
4963 	b_end   = b_start + BRKSIZ;
4964 	s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4965 	s_end   = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4966 #ifdef DIAGNOSTIC
4967 	if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
4968 	    (b_end & (vaddr_t)PAGE_MASK) != 0 ||
4969 	    (s_start & (vaddr_t)PAGE_MASK) != 0 ||
4970 	    (s_end & (vaddr_t)PAGE_MASK) != 0) {
4971 		panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
4972 		    "b=0x%lx-0x%lx s=0x%lx-0x%lx",
4973 		    vm, b_start, b_end, s_start, s_end);
4974 	}
4975 #endif
4976 
4977 	if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
4978 	    map->s_start == s_start && map->s_end == s_end))
4979 		return;
4980 
4981 	uvm_map_freelist_update(map, dead, b_start, b_end,
4982 	    s_start, s_end, flags);
4983 }
4984 
4985 /*
4986  * Grow kernel memory.
4987  *
4988  * This function is only called for kernel maps when an allocation fails.
4989  *
4990  * If the map has a gap that is large enough to accommodate alloc_sz, this
4991  * function will make sure map->free will include it.
4992  */
4993 void
4994 uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
4995     vsize_t alloc_sz, int flags)
4996 {
4997 	vsize_t sz;
4998 	vaddr_t end;
4999 	struct vm_map_entry *entry;
5000 
5001 	/* Kernel memory only. */
5002 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
5003 	/* Destroy free list. */
5004 	uvm_map_freelist_update_clear(map, dead);
5005 
5006 	/* Include the guard page in the hard minimum requirement of alloc_sz. */
5007 	if (map->flags & VM_MAP_GUARDPAGES)
5008 		alloc_sz += PAGE_SIZE;
5009 
5010 	/*
5011 	 * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
5012 	 *
5013 	 * Don't handle the case where the multiplication overflows:
5014 	 * if that happens, the allocation is probably too big anyway.
5015 	 */
5016 	sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
5017 
5018 	/*
5019 	 * Walk forward until a gap large enough for alloc_sz shows up.
5020 	 *
5021 	 * We assume the kernel map has no boundaries.
5022 	 * uvm_maxkaddr may be zero.
5023 	 */
5024 	end = MAX(uvm_maxkaddr, map->min_offset);
5025 	entry = uvm_map_entrybyaddr(&map->addr, end);
5026 	while (entry && entry->fspace < alloc_sz)
5027 		entry = RBT_NEXT(uvm_map_addr, entry);
5028 	if (entry) {
5029 		end = MAX(VMMAP_FREE_START(entry), end);
5030 		end += MIN(sz, map->max_offset - end);
5031 	} else
5032 		end = map->max_offset;
5033 
5034 	/* Reserve pmap entries. */
5035 #ifdef PMAP_GROWKERNEL
5036 	uvm_maxkaddr = pmap_growkernel(end);
5037 #else
5038 	uvm_maxkaddr = MAX(uvm_maxkaddr, end);
5039 #endif
5040 
5041 	/* Rebuild free list. */
5042 	uvm_map_freelist_update_refill(map, flags);
5043 }
5044 
5045 /*
5046  * Freelist update subfunction: unlink all entries from freelists.
5047  */
5048 void
5049 uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
5050 {
5051 	struct uvm_addr_state *free;
5052 	struct vm_map_entry *entry, *prev, *next;
5053 
5054 	prev = NULL;
5055 	for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL;
5056 	    entry = next) {
5057 		next = RBT_NEXT(uvm_map_addr, entry);
5058 
5059 		free = uvm_map_uaddr_e(map, entry);
5060 		uvm_mapent_free_remove(map, free, entry);
5061 
5062 		if (prev != NULL && entry->start == entry->end) {
5063 			prev->fspace += VMMAP_FREE_END(entry) - entry->end;
5064 			uvm_mapent_addr_remove(map, entry);
5065 			DEAD_ENTRY_PUSH(dead, entry);
5066 		} else
5067 			prev = entry;
5068 	}
5069 }
5070 
5071 /*
5072  * Freelist update subfunction: refill the freelists with entries.
5073  */
5074 void
5075 uvm_map_freelist_update_refill(struct vm_map *map, int flags)
5076 {
5077 	struct vm_map_entry *entry;
5078 	vaddr_t min, max;
5079 
5080 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
5081 		min = VMMAP_FREE_START(entry);
5082 		max = VMMAP_FREE_END(entry);
5083 		entry->fspace = 0;
5084 
5085 		entry = uvm_map_fix_space(map, entry, min, max, flags);
5086 	}
5087 
5088 	uvm_tree_sanity(map, __FILE__, __LINE__);
5089 }
5090 
5091 /*
5092  * Change {a,b}_{start,end} allocation ranges and associated free lists.
5093  */
5094 void
5095 uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
5096     vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
5097 {
5098 	KDASSERT(b_end >= b_start && s_end >= s_start);
5099 
5100 	/* Clear all free lists. */
5101 	uvm_map_freelist_update_clear(map, dead);
5102 
5103 	/* Apply new bounds. */
5104 	map->b_start = b_start;
5105 	map->b_end   = b_end;
5106 	map->s_start = s_start;
5107 	map->s_end   = s_end;
5108 
5109 	/* Refill free lists. */
5110 	uvm_map_freelist_update_refill(map, flags);
5111 }
5112 
5113 /*
5114  * Assign a uvm_addr_state to the specified pointer in vm_map.
5115  *
5116  * May sleep.
5117  */
5118 void
5119 uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
5120     struct uvm_addr_state *newval)
5121 {
5122 	struct uvm_map_deadq dead;
5123 
5124 	/* Pointer which must be in this map. */
5125 	KASSERT(which != NULL);
5126 	KASSERT((void*)map <= (void*)(which) &&
5127 	    (void*)(which) < (void*)(map + 1));
5128 
5129 	vm_map_lock(map);
5130 	TAILQ_INIT(&dead);
5131 	uvm_map_freelist_update_clear(map, &dead);
5132 
5133 	uvm_addr_destroy(*which);
5134 	*which = newval;
5135 
5136 	uvm_map_freelist_update_refill(map, 0);
5137 	vm_map_unlock(map);
5138 	uvm_unmap_detach(&dead, 0);
5139 }
5140 
5141 /*
5142  * Correct space insert.
5143  *
5144  * Entry must not be on any freelist.
5145  */
5146 struct vm_map_entry*
5147 uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
5148     vaddr_t min, vaddr_t max, int flags)
5149 {
5150 	struct uvm_addr_state	*free, *entfree;
5151 	vaddr_t			 lmax;
5152 
5153 	KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
5154 	KDASSERT(min <= max);
5155 	KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
5156 	    min == map->min_offset);
5157 
5158 	/*
5159 	 * During the function, entfree will always point at the uaddr state
5160 	 * for entry.
5161 	 */
5162 	entfree = (entry == NULL ? NULL :
5163 	    uvm_map_uaddr_e(map, entry));
5164 
5165 	while (min != max) {
5166 		/* Claim guard page for entry. */
5167 		if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
5168 		    VMMAP_FREE_END(entry) == entry->end &&
5169 		    entry->start != entry->end) {
5170 			if (max - min == 2 * PAGE_SIZE) {
5171 				/*
5172 				 * If the free-space gap is exactly 2 pages,
5173 				 * we make the guard 2 pages instead of 1.
5174 				 * Because in a guarded map, an area needs
5175 				 * at least 2 pages to allocate from:
5176 				 * one page for the allocation and one for
5177 				 * the guard.
5178 				 */
5179 				entry->guard = 2 * PAGE_SIZE;
5180 				min = max;
5181 			} else {
5182 				entry->guard = PAGE_SIZE;
5183 				min += PAGE_SIZE;
5184 			}
5185 			continue;
5186 		}
5187 
5188 		/*
5189 		 * Handle the case where entry has a 2-page guard, but the
5190 		 * space after entry is freed.
5191 		 */
5192 		if (entry != NULL && entry->fspace == 0 &&
5193 		    entry->guard > PAGE_SIZE) {
5194 			entry->guard = PAGE_SIZE;
5195 			min = VMMAP_FREE_START(entry);
5196 		}
5197 
5198 		lmax = uvm_map_boundary(map, min, max);
5199 		free = uvm_map_uaddr(map, min);
5200 
5201 		/*
5202 		 * Entries are merged if they point at the same free list.
5203 		 * Exception to that rule: if min == uvm_maxkaddr, a new
5204 		 * entry is started regardless (otherwise the allocators
5205 		 * will get confused).
5206 		 */
5207 		if (entry != NULL && free == entfree &&
5208 		    !((map->flags & VM_MAP_ISVMSPACE) == 0 &&
5209 		    min == uvm_maxkaddr)) {
5210 			KDASSERT(VMMAP_FREE_END(entry) == min);
5211 			entry->fspace += lmax - min;
5212 		} else {
5213 			/*
5214 			 * Commit entry to free list: no more free space
5215 			 * will be added to it.
5216 			 * We'll start a new entry and add to that one
5217 			 * instead.
5218 			 */
5219 			if (entry != NULL)
5220 				uvm_mapent_free_insert(map, entfree, entry);
5221 
5222 			/* New entry for new uaddr. */
5223 			entry = uvm_mapent_alloc(map, flags);
5224 			KDASSERT(entry != NULL);
5225 			entry->end = entry->start = min;
5226 			entry->guard = 0;
5227 			entry->fspace = lmax - min;
5228 			entry->object.uvm_obj = NULL;
5229 			entry->offset = 0;
5230 			entry->etype = 0;
5231 			entry->protection = entry->max_protection = 0;
5232 			entry->inheritance = 0;
5233 			entry->wired_count = 0;
5234 			entry->advice = 0;
5235 			entry->aref.ar_pageoff = 0;
5236 			entry->aref.ar_amap = NULL;
5237 			uvm_mapent_addr_insert(map, entry);
5238 
5239 			entfree = free;
5240 		}
5241 
5242 		min = lmax;
5243 	}
5244 	/* Finally put entry on the uaddr state. */
5245 	if (entry != NULL)
5246 		uvm_mapent_free_insert(map, entfree, entry);
5247 
5248 	return entry;
5249 }
5250 
5251 /*
5252  * MQuery style of allocation.
5253  *
5254  * This allocator searches forward until sufficient space is found to map
5255  * the given size.
5256  *
5257  * XXX: factor in offset (via pmap_prefer) and protection?
5258  */
5259 int
5260 uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
5261     int flags)
5262 {
5263 	struct vm_map_entry *entry, *last;
5264 	vaddr_t addr;
5265 	vaddr_t tmp, pmap_align, pmap_offset;
5266 	int error;
5267 
5268 	addr = *addr_p;
5269 	vm_map_lock_read(map);
5270 
5271 	/* Configure pmap prefer. */
5272 	if (offset != UVM_UNKNOWN_OFFSET) {
5273 		pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
5274 		pmap_offset = PMAP_PREFER_OFFSET(offset);
5275 	} else {
5276 		pmap_align = PAGE_SIZE;
5277 		pmap_offset = 0;
5278 	}
5279 
5280 	/* Align address to pmap_prefer unless FLAG_FIXED is set. */
5281 	if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
5282 	  	tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
5283 		if (tmp < addr)
5284 			tmp += pmap_align;
5285 		addr = tmp;
5286 	}
5287 
5288 	/* First, check if the requested range is fully available. */
5289 	entry = uvm_map_entrybyaddr(&map->addr, addr);
5290 	last = NULL;
5291 	if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5292 		error = 0;
5293 		goto out;
5294 	}
5295 	if (flags & UVM_FLAG_FIXED) {
5296 		error = EINVAL;
5297 		goto out;
5298 	}
5299 
5300 	error = ENOMEM; /* Default error from here. */
5301 
5302 	/*
5303 	 * At this point, the memory at <addr, sz> is not available.
5304 	 * The reasons are:
5305 	 * [1] it's outside the map,
5306 	 * [2] it starts in used memory (and therefore needs to move
5307 	 *     toward the first free page in entry),
5308 	 * [3] it starts in free memory but bumps into used memory.
5309 	 *
5310 	 * Note that for case [2], the forward moving is handled by the
5311 	 * for loop below.
5312 	 */
5313 	if (entry == NULL) {
5314 		/* [1] Outside the map. */
5315 		if (addr >= map->max_offset)
5316 			goto out;
5317 		else
5318 			entry = RBT_MIN(uvm_map_addr, &map->addr);
5319 	} else if (VMMAP_FREE_START(entry) <= addr) {
5320 		/* [3] Bumped into used memory. */
5321 		entry = RBT_NEXT(uvm_map_addr, entry);
5322 	}
5323 
5324 	/* Test if the next entry is sufficient for the allocation. */
5325 	for (; entry != NULL;
5326 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
5327 		if (entry->fspace == 0)
5328 			continue;
5329 		addr = VMMAP_FREE_START(entry);
5330 
5331 restart:	/* Restart address checks on address change. */
5332 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
5333 		if (tmp < addr)
5334 			tmp += pmap_align;
5335 		addr = tmp;
5336 		if (addr >= VMMAP_FREE_END(entry))
5337 			continue;
5338 
5339 		/* Skip brk() allocation addresses. */
5340 		if (addr + sz > map->b_start && addr < map->b_end) {
5341 			if (VMMAP_FREE_END(entry) > map->b_end) {
5342 				addr = map->b_end;
5343 				goto restart;
5344 			} else
5345 				continue;
5346 		}
5347 		/* Skip stack allocation addresses. */
5348 		if (addr + sz > map->s_start && addr < map->s_end) {
5349 			if (VMMAP_FREE_END(entry) > map->s_end) {
5350 				addr = map->s_end;
5351 				goto restart;
5352 			} else
5353 				continue;
5354 		}
5355 
5356 		last = NULL;
5357 		if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5358 			error = 0;
5359 			goto out;
5360 		}
5361 	}
5362 
5363 out:
5364 	vm_map_unlock_read(map);
5365 	if (error == 0)
5366 		*addr_p = addr;
5367 	return error;
5368 }
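
/*
 * Illustrative sketch (editor's addition): an mquery(2)-style probe asking
 * where sz bytes could be placed at or above a hypothetical address.
 */
#if 0	/* example only */
	vaddr_t va = 0x10000000;	/* hypothetical starting point */
	int error;

	error = uvm_map_mquery(&p->p_vmspace->vm_map, &va, sz,
	    UVM_UNKNOWN_OFFSET, 0);
	/* On success, va holds an address where the mapping would fit. */
#endif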
5369 
5370 /*
5371  * Determine allocation bias.
5372  *
5373  * Returns 1 if we should bias to high addresses, -1 for a bias towards low
5374  * addresses, or 0 for no bias.
5375  * The bias mechanism is intended to avoid clashing with brk() and stack
5376  * areas.
5377  */
5378 int
5379 uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry)
5380 {
5381 	vaddr_t start, end;
5382 
5383 	start = VMMAP_FREE_START(entry);
5384 	end = VMMAP_FREE_END(entry);
5385 
5386 	/* Stay at the top of brk() area. */
5387 	if (end >= map->b_start && start < map->b_end)
5388 		return 1;
5389 	/* Stay at the far end of the stack area. */
5390 	if (end >= map->s_start && start < map->s_end) {
5391 #ifdef MACHINE_STACK_GROWS_UP
5392 		return 1;
5393 #else
5394 		return -1;
5395 #endif
5396 	}
5397 
5398 	/* No bias, this area is meant for us. */
5399 	return 0;
5400 }
5401 
5402 
5403 boolean_t
5404 vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
5405 {
5406 	boolean_t rv;
5407 
5408 	if (map->flags & VM_MAP_INTRSAFE) {
5409 		rv = mtx_enter_try(&map->mtx);
5410 	} else {
5411 		mtx_enter(&map->flags_lock);
5412 		if (map->flags & VM_MAP_BUSY) {
5413 			mtx_leave(&map->flags_lock);
5414 			return (FALSE);
5415 		}
5416 		mtx_leave(&map->flags_lock);
5417 		rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0);
5418 		/* check if the lock is busy and back out if we won the race */
5419 		if (rv) {
5420 			mtx_enter(&map->flags_lock);
5421 			if (map->flags & VM_MAP_BUSY) {
5422 				rw_exit(&map->lock);
5423 				rv = FALSE;
5424 			}
5425 			mtx_leave(&map->flags_lock);
5426 		}
5427 	}
5428 
5429 	if (rv) {
5430 		map->timestamp++;
5431 		LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5432 		uvm_tree_sanity(map, file, line);
5433 		uvm_tree_size_chk(map, file, line);
5434 	}
5435 
5436 	return (rv);
5437 }
5438 
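/*
 * vm_map_lock_ln: acquire an exclusive lock on the map, sleeping
 * while the map is flagged busy.
 *
 * After the write lock is obtained the busy flag is re-checked; if the
 * map was marked busy in the meantime the lock is dropped and the wait
 * is retried.
 */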
5439 void
5440 vm_map_lock_ln(struct vm_map *map, char *file, int line)
5441 {
5442 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5443 		do {
5444 			mtx_enter(&map->flags_lock);
5445 tryagain:
5446 			while (map->flags & VM_MAP_BUSY) {
5447 				map->flags |= VM_MAP_WANTLOCK;
5448 				msleep_nsec(&map->flags, &map->flags_lock,
5449 				    PVM, vmmapbsy, INFSLP);
5450 			}
5451 			mtx_leave(&map->flags_lock);
5452 		} while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0);
5453 		/* check if the lock is busy and back out if we won the race */
5454 		mtx_enter(&map->flags_lock);
5455 		if (map->flags & VM_MAP_BUSY) {
5456 			rw_exit(&map->lock);
5457 			goto tryagain;
5458 		}
5459 		mtx_leave(&map->flags_lock);
5460 	} else {
5461 		mtx_enter(&map->mtx);
5462 	}
5463 
5464 	map->timestamp++;
5465 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5466 	uvm_tree_sanity(map, file, line);
5467 	uvm_tree_size_chk(map, file, line);
5468 }
5469 
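/*
 * vm_map_lock_read_ln: acquire a shared (read) lock on the map.
 */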
5470 void
5471 vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
5472 {
5473 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5474 		rw_enter_read(&map->lock);
5475 	else
5476 		mtx_enter(&map->mtx);
5477 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5478 	uvm_tree_sanity(map, file, line);
5479 	uvm_tree_size_chk(map, file, line);
5480 }
5481 
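/*
 * vm_map_unlock_ln: release the exclusive lock on the map.
 */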
5482 void
5483 vm_map_unlock_ln(struct vm_map *map, char *file, int line)
5484 {
5485 	uvm_tree_sanity(map, file, line);
5486 	uvm_tree_size_chk(map, file, line);
5487 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5488 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5489 		rw_exit(&map->lock);
5490 	else
5491 		mtx_leave(&map->mtx);
5492 }
5493 
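/*
 * vm_map_unlock_read_ln: release a shared (read) lock on the map.
 */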
5494 void
5495 vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
5496 {
5497 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5498 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5499 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5500 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5501 		rw_exit_read(&map->lock);
5502 	else
5503 		mtx_leave(&map->mtx);
5504 }
5505 
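/*
 * vm_map_downgrade_ln: downgrade an exclusive map lock to a shared
 * (read) lock.
 */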
5506 void
5507 vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
5508 {
5509 	uvm_tree_sanity(map, file, line);
5510 	uvm_tree_size_chk(map, file, line);
5511 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5512 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5513 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5514 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5515 		rw_enter(&map->lock, RW_DOWNGRADE);
5516 }
5517 
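/*
 * vm_map_upgrade_ln: upgrade a shared (read) map lock to an exclusive
 * lock.
 *
 * The read lock is dropped before the write lock is taken, so the map
 * may change in between; callers must revalidate any state derived
 * from the map.
 */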
5518 void
5519 vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
5520 {
5521 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5522 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5523 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5524 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5525 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5526 		rw_exit_read(&map->lock);
5527 		rw_enter_write(&map->lock);
5528 	}
5529 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5530 	uvm_tree_sanity(map, file, line);
5531 }
5532 
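/*
 * vm_map_busy_ln: mark the map busy so that other exclusive lockers
 * wait (or fail, for the try variant) until the map is unbusied.
 */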
5533 void
5534 vm_map_busy_ln(struct vm_map *map, char *file, int line)
5535 {
5536 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5537 	mtx_enter(&map->flags_lock);
5538 	map->flags |= VM_MAP_BUSY;
5539 	mtx_leave(&map->flags_lock);
5540 }
5541 
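/*
 * vm_map_unbusy_ln: clear the busy flag and wake up any threads
 * waiting for the map to become available.
 */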
5542 void
5543 vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
5544 {
5545 	int oflags;
5546 
5547 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5548 	mtx_enter(&map->flags_lock);
5549 	oflags = map->flags;
5550 	map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
5551 	mtx_leave(&map->flags_lock);
5552 	if (oflags & VM_MAP_WANTLOCK)
5553 		wakeup(&map->flags);
5554 }
5555 
5556 #ifndef SMALL_KERNEL
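/*
 * uvm_map_fill_vmmap: fill the given kinfo_vmentry array with the
 * map's entries, starting at the address passed in kve[0].kve_start
 * (for userland consumers such as the kern.proc.vmmap sysctl).
 *
 * At most *lenp bytes are filled in; *lenp is updated to the number
 * of bytes actually used.  Returns ENOMEM if the array could not hold
 * all remaining entries.
 */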
5557 int
5558 uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
5559     size_t *lenp)
5560 {
5561 	struct vm_map_entry *entry;
5562 	vaddr_t start;
5563 	int cnt, maxcnt, error = 0;
5564 
5565 	KASSERT(*lenp > 0);
5566 	KASSERT((*lenp % sizeof(*kve)) == 0);
5567 	cnt = 0;
5568 	maxcnt = *lenp / sizeof(*kve);
5569 	KASSERT(maxcnt > 0);
5570 
5571 	/*
5572 	 * Return only entries whose start address is at or above the given
5573 	 * base address.  This allows userland to iterate without knowing the
5574 	 * number of entries beforehand.
5575 	 */
5576 	start = (vaddr_t)kve[0].kve_start;
5577 
5578 	vm_map_lock(map);
5579 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
5580 		if (cnt == maxcnt) {
5581 			error = ENOMEM;
5582 			break;
5583 		}
5584 		if (start != 0 && entry->start < start)
5585 			continue;
5586 		kve->kve_start = entry->start;
5587 		kve->kve_end = entry->end;
5588 		kve->kve_guard = entry->guard;
5589 		kve->kve_fspace = entry->fspace;
5590 		kve->kve_fspace_augment = entry->fspace_augment;
5591 		kve->kve_offset = entry->offset;
5592 		kve->kve_wired_count = entry->wired_count;
5593 		kve->kve_etype = entry->etype;
5594 		kve->kve_protection = entry->protection;
5595 		kve->kve_max_protection = entry->max_protection;
5596 		kve->kve_advice = entry->advice;
5597 		kve->kve_inheritance = entry->inheritance;
5598 		kve->kve_flags = entry->flags;
5599 		kve++;
5600 		cnt++;
5601 	}
5602 	vm_map_unlock(map);
5603 
5604 	KASSERT(cnt <= maxcnt);
5605 
5606 	*lenp = sizeof(*kve) * cnt;
5607 	return error;
5608 }
5609 #endif
5610 
5611 
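/*
 * Generate the augmented red-black tree implementation for the
 * per-map address tree; uvm_map_addr_augment keeps the free-space
 * augmentation up to date as the tree changes.
 */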
5612 RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
5613     uvm_mapentry_addrcmp, uvm_map_addr_augment);
5614 
5615 
5616 /*
5617  * MD code: vmspace allocator setup.
5618  */
5619 
5620 #ifdef __i386__
5621 void
5622 uvm_map_setup_md(struct vm_map *map)
5623 {
5624 	vaddr_t		min, max;
5625 
5626 	min = map->min_offset;
5627 	max = map->max_offset;
5628 
5629 	/*
5630 	 * Ensure the selectors will not try to manage page 0;
5631 	 * it's too special.
5632 	 */
5633 	if (min < VMMAP_MIN_ADDR)
5634 		min = VMMAP_MIN_ADDR;
5635 
5636 #if 0	/* Preferred allocator setup, not yet enabled */
5637 	/* Executable code is special. */
5638 	map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
5639 	/* Place normal allocations beyond executable mappings. */
5640 	map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
5641 #else	/* Interim setup, used for now */
5642 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5643 #endif
5644 
5645 #ifndef SMALL_KERNEL
5646 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5647 #endif /* !SMALL_KERNEL */
5648 }
5649 #elif __LP64__
5650 void
5651 uvm_map_setup_md(struct vm_map *map)
5652 {
5653 	vaddr_t		min, max;
5654 
5655 	min = map->min_offset;
5656 	max = map->max_offset;
5657 
5658 	/*
5659 	 * Ensure the selectors will not try to manage page 0;
5660 	 * it's too special.
5661 	 */
5662 	if (min < VMMAP_MIN_ADDR)
5663 		min = VMMAP_MIN_ADDR;
5664 
5665 #if 0	/* Preferred allocator setup, not yet enabled */
5666 	map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
5667 #else	/* Interim setup, used for now */
5668 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5669 #endif
5670 
5671 #ifndef SMALL_KERNEL
5672 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5673 #endif /* !SMALL_KERNEL */
5674 }
5675 #else	/* non-i386, 32 bit */
5676 void
5677 uvm_map_setup_md(struct vm_map *map)
5678 {
5679 	vaddr_t		min, max;
5680 
5681 	min = map->min_offset;
5682 	max = map->max_offset;
5683 
5684 	/*
5685 	 * Ensure the selectors will not try to manage page 0;
5686 	 * it's too special.
5687 	 */
5688 	if (min < VMMAP_MIN_ADDR)
5689 		min = VMMAP_MIN_ADDR;
5690 
5691 #if 0	/* Preferred allocator setup, not yet enabled */
5692 	map->uaddr_any[3] = uaddr_pivot_create(min, max);
5693 #else	/* Interim setup, used for now */
5694 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5695 #endif
5696 
5697 #ifndef SMALL_KERNEL
5698 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5699 #endif /* !SMALL_KERNEL */
5700 }
5701 #endif
5702