xref: /openbsd-src/sys/uvm/uvm_map.c (revision ae3cb403620ab940fbaabb3055fac045a63d56b7)
1 /*	$OpenBSD: uvm_map.c,v 1.233 2017/11/30 00:36:10 guenther Exp $	*/
2 /*	$NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 2011 Ariane van der Steldt <ariane@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  *
19  *
20  * Copyright (c) 1997 Charles D. Cranor and Washington University.
21  * Copyright (c) 1991, 1993, The Regents of the University of California.
22  *
23  * All rights reserved.
24  *
25  * This code is derived from software contributed to Berkeley by
26  * The Mach Operating System project at Carnegie-Mellon University.
27  *
28  * Redistribution and use in source and binary forms, with or without
29  * modification, are permitted provided that the following conditions
30  * are met:
31  * 1. Redistributions of source code must retain the above copyright
32  *    notice, this list of conditions and the following disclaimer.
33  * 2. Redistributions in binary form must reproduce the above copyright
34  *    notice, this list of conditions and the following disclaimer in the
35  *    documentation and/or other materials provided with the distribution.
36  * 3. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	@(#)vm_map.c    8.3 (Berkeley) 1/12/94
53  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
54  *
55  *
56  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
57  * All rights reserved.
58  *
59  * Permission to use, copy, modify and distribute this software and
60  * its documentation is hereby granted, provided that both the copyright
61  * notice and this permission notice appear in all copies of the
62  * software, derivative works or modified versions, and any portions
63  * thereof, and that both notices appear in supporting documentation.
64  *
65  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
66  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
67  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
68  *
69  * Carnegie Mellon requests users of this software to return to
70  *
71  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
72  *  School of Computer Science
73  *  Carnegie Mellon University
74  *  Pittsburgh PA 15213-3890
75  *
76  * any improvements or extensions that they make and grant Carnegie the
77  * rights to redistribute these changes.
78  */
79 
80 /*
81  * uvm_map.c: uvm map operations
82  */
83 
84 /* #define DEBUG */
85 /* #define VMMAP_DEBUG */
86 
87 #include <sys/param.h>
88 #include <sys/systm.h>
89 #include <sys/mman.h>
90 #include <sys/proc.h>
91 #include <sys/malloc.h>
92 #include <sys/pool.h>
93 #include <sys/sysctl.h>
94 
95 #ifdef SYSVSHM
96 #include <sys/shm.h>
97 #endif
98 
99 #include <uvm/uvm.h>
100 
101 #ifdef DDB
102 #include <uvm/uvm_ddb.h>
103 #endif
104 
105 #include <uvm/uvm_addr.h>
106 
107 
108 vsize_t			 uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t);
109 int			 uvm_mapent_isjoinable(struct vm_map*,
110 			    struct vm_map_entry*, struct vm_map_entry*);
111 struct vm_map_entry	*uvm_mapent_merge(struct vm_map*, struct vm_map_entry*,
112 			    struct vm_map_entry*, struct uvm_map_deadq*);
113 struct vm_map_entry	*uvm_mapent_tryjoin(struct vm_map*,
114 			    struct vm_map_entry*, struct uvm_map_deadq*);
115 struct vm_map_entry	*uvm_map_mkentry(struct vm_map*, struct vm_map_entry*,
116 			    struct vm_map_entry*, vaddr_t, vsize_t, int,
117 			    struct uvm_map_deadq*, struct vm_map_entry*);
118 struct vm_map_entry	*uvm_mapent_alloc(struct vm_map*, int);
119 void			 uvm_mapent_free(struct vm_map_entry*);
120 void			 uvm_unmap_kill_entry(struct vm_map*,
121 			    struct vm_map_entry*);
122 void			 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *);
123 void			 uvm_mapent_mkfree(struct vm_map*,
124 			    struct vm_map_entry*, struct vm_map_entry**,
125 			    struct uvm_map_deadq*, boolean_t);
126 void			 uvm_map_pageable_pgon(struct vm_map*,
127 			    struct vm_map_entry*, struct vm_map_entry*,
128 			    vaddr_t, vaddr_t);
129 int			 uvm_map_pageable_wire(struct vm_map*,
130 			    struct vm_map_entry*, struct vm_map_entry*,
131 			    vaddr_t, vaddr_t, int);
132 void			 uvm_map_setup_entries(struct vm_map*);
133 void			 uvm_map_setup_md(struct vm_map*);
134 void			 uvm_map_teardown(struct vm_map*);
135 void			 uvm_map_vmspace_update(struct vm_map*,
136 			    struct uvm_map_deadq*, int);
137 void			 uvm_map_kmem_grow(struct vm_map*,
138 			    struct uvm_map_deadq*, vsize_t, int);
139 void			 uvm_map_freelist_update_clear(struct vm_map*,
140 			    struct uvm_map_deadq*);
141 void			 uvm_map_freelist_update_refill(struct vm_map *, int);
142 void			 uvm_map_freelist_update(struct vm_map*,
143 			    struct uvm_map_deadq*, vaddr_t, vaddr_t,
144 			    vaddr_t, vaddr_t, int);
145 struct vm_map_entry	*uvm_map_fix_space(struct vm_map*, struct vm_map_entry*,
146 			    vaddr_t, vaddr_t, int);
147 int			 uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int,
148 			    struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t,
149 			    int);
150 int			 uvm_map_findspace(struct vm_map*,
151 			    struct vm_map_entry**, struct vm_map_entry**,
152 			    vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t,
153 			    vaddr_t);
154 vsize_t			 uvm_map_addr_augment_get(struct vm_map_entry*);
155 void			 uvm_map_addr_augment(struct vm_map_entry*);
156 
157 /*
158  * Tree management functions.
159  */
160 
161 static __inline void	 uvm_mapent_copy(struct vm_map_entry*,
162 			    struct vm_map_entry*);
163 static inline int	 uvm_mapentry_addrcmp(const struct vm_map_entry*,
164 			    const struct vm_map_entry*);
165 void			 uvm_mapent_free_insert(struct vm_map*,
166 			    struct uvm_addr_state*, struct vm_map_entry*);
167 void			 uvm_mapent_free_remove(struct vm_map*,
168 			    struct uvm_addr_state*, struct vm_map_entry*);
169 void			 uvm_mapent_addr_insert(struct vm_map*,
170 			    struct vm_map_entry*);
171 void			 uvm_mapent_addr_remove(struct vm_map*,
172 			    struct vm_map_entry*);
173 void			 uvm_map_splitentry(struct vm_map*,
174 			    struct vm_map_entry*, struct vm_map_entry*,
175 			    vaddr_t);
176 vsize_t			 uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t);
177 int			 uvm_mapent_bias(struct vm_map*, struct vm_map_entry*);
178 
179 /*
180  * uvm_vmspace_fork helper functions.
181  */
182 struct vm_map_entry	*uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t,
183 			    vsize_t, vm_prot_t, vm_prot_t,
184 			    struct vm_map_entry*, struct uvm_map_deadq*, int,
185 			    int);
186 struct vm_map_entry	*uvm_mapent_share(struct vm_map*, vaddr_t, vsize_t,
187 			    vsize_t, vm_prot_t, vm_prot_t, struct vm_map*,
188 			    struct vm_map_entry*, struct uvm_map_deadq*);
189 struct vm_map_entry	*uvm_mapent_forkshared(struct vmspace*, struct vm_map*,
190 			    struct vm_map*, struct vm_map_entry*,
191 			    struct uvm_map_deadq*);
192 struct vm_map_entry	*uvm_mapent_forkcopy(struct vmspace*, struct vm_map*,
193 			    struct vm_map*, struct vm_map_entry*,
194 			    struct uvm_map_deadq*);
195 struct vm_map_entry	*uvm_mapent_forkzero(struct vmspace*, struct vm_map*,
196 			    struct vm_map*, struct vm_map_entry*,
197 			    struct uvm_map_deadq*);
198 
199 /*
200  * Tree validation.
201  */
202 #ifdef VMMAP_DEBUG
203 void			 uvm_tree_assert(struct vm_map*, int, char*,
204 			    char*, int);
205 #define UVM_ASSERT(map, cond, file, line)				\
206 	uvm_tree_assert((map), (cond), #cond, (file), (line))
207 void			 uvm_tree_sanity(struct vm_map*, char*, int);
208 void			 uvm_tree_size_chk(struct vm_map*, char*, int);
209 void			 vmspace_validate(struct vm_map*);
210 #else
211 #define uvm_tree_sanity(_map, _file, _line)		do {} while (0)
212 #define uvm_tree_size_chk(_map, _file, _line)		do {} while (0)
213 #define vmspace_validate(_map)				do {} while (0)
214 #endif
215 
216 /*
217  * All architectures are treated as having pmap_prefer; provide no-op fallbacks.
218  */
219 #ifndef PMAP_PREFER
220 #define PMAP_PREFER_ALIGN()	(vaddr_t)PAGE_SIZE
221 #define PMAP_PREFER_OFFSET(off)	0
222 #define PMAP_PREFER(addr, off)	(addr)
223 #endif
224 
225 
226 /*
227  * The kernel map will initially be VM_MAP_KSIZE_INIT bytes.
228  * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes.
229  *
230  * We attempt to grow by VM_MAP_KSIZE_ALLOCMUL times the allocation size
231  * each time.
232  */
233 #define VM_MAP_KSIZE_INIT	(512 * (vaddr_t)PAGE_SIZE)
234 #define VM_MAP_KSIZE_DELTA	(256 * (vaddr_t)PAGE_SIZE)
235 #define VM_MAP_KSIZE_ALLOCMUL	4
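/*
 * Worked example (illustrative; 4KB page size assumed): the kernel map
 * starts out at 512 * 4KB = 2MB and each growth step adds at least
 * 256 * 4KB = 1MB.  For a 512KB allocation the attempted growth is
 * VM_MAP_KSIZE_ALLOCMUL * 512KB = 2MB, i.e. presumably the larger of the
 * two values; see uvm_map_kmem_grow() for the actual policy.
 */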
236 /*
237  * When selecting a random free-space block, look at most FSPACE_DELTA blocks
238  * ahead.
239  */
240 #define FSPACE_DELTA		8
241 /*
242  * Put allocations adjacent to previous allocations when the free-space tree
243  * is larger than FSPACE_COMPACT entries.
244  *
245  * Alignment and PMAP_PREFER may still cause the entry to not be fully
246  * adjacent. Note that this strategy reduces memory fragmentation (by leaving
247  * a large space before or after the allocation).
248  */
249 #define FSPACE_COMPACT		128
250 /*
251  * Make the address selection skip at most this many bytes from the start of
252  * the free space in which the allocation takes place.
253  *
254  * The main idea behind a randomized address space is that an attacker cannot
255  * know where to target his attack. Therefore, the location of objects must be
256  * as random as possible. However, the goal is not to create the most sparse
257  * map that is possible.
258  * FSPACE_MAXOFF pushes the considered range in bytes down to less insane
259  * sizes, thereby reducing the sparseness. The biggest randomization comes
260  * from fragmentation, i.e. FSPACE_COMPACT.
261  */
262 #define FSPACE_MAXOFF		((vaddr_t)32 * 1024 * 1024)
263 /*
264  * Allow for small gaps in the overflow areas.
265  * Gap size is in bytes and does not have to be a multiple of page-size.
266  */
267 #define FSPACE_BIASGAP		((vaddr_t)32 * 1024)
268 
269 /* auto-allocate address lower bound */
270 #define VMMAP_MIN_ADDR		PAGE_SIZE
271 
272 
273 #ifdef DEADBEEF0
274 #define UVMMAP_DEADBEEF		((unsigned long)DEADBEEF0)
275 #else
276 #define UVMMAP_DEADBEEF		((unsigned long)0xdeadd0d0)
277 #endif
278 
279 #ifdef DEBUG
280 int uvm_map_printlocks = 0;
281 
282 #define LPRINTF(_args)							\
283 	do {								\
284 		if (uvm_map_printlocks)					\
285 			printf _args;					\
286 	} while (0)
287 #else
288 #define LPRINTF(_args)	do {} while (0)
289 #endif
290 
291 static struct mutex uvm_kmapent_mtx;
292 static struct timeval uvm_kmapent_last_warn_time;
293 static struct timeval uvm_kmapent_warn_rate = { 10, 0 };
294 
295 const char vmmapbsy[] = "vmmapbsy";
296 
297 /*
298  * pool for vmspace structures.
299  */
300 struct pool uvm_vmspace_pool;
301 
302 /*
303  * pool for dynamically-allocated map entries.
304  */
305 struct pool uvm_map_entry_pool;
306 struct pool uvm_map_entry_kmem_pool;
307 
308 /*
309  * This global represents the end of the kernel virtual address
310  * space. If we want to exceed this, we must grow the kernel
311  * virtual address space dynamically.
312  *
313  * Note, this variable is locked by kernel_map's lock.
314  */
315 vaddr_t uvm_maxkaddr;
316 
317 /*
318  * Locking predicate.
319  */
320 #define UVM_MAP_REQ_WRITE(_map)						\
321 	do {								\
322 		if ((_map)->ref_count > 0) {				\
323 			if (((_map)->flags & VM_MAP_INTRSAFE) == 0)	\
324 				rw_assert_wrlock(&(_map)->lock);	\
325 			else						\
326 				MUTEX_ASSERT_LOCKED(&(_map)->mtx);	\
327 		}							\
328 	} while (0)
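/*
 * Illustrative use (sketch): every routine that mutates the address or
 * free-space trees asserts write access first, e.g.
 *
 *	UVM_MAP_REQ_WRITE(map);
 *	res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
 *
 * as done in uvm_mapent_addr_insert() below.
 */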
329 
330 /*
331  * Tree describing entries by address.
332  *
333  * Addresses are unique.
334  * Entries with start == end may only exist if they are the first entry
335  * (sorted by address) within a free-memory tree.
336  */
337 
338 static inline int
339 uvm_mapentry_addrcmp(const struct vm_map_entry *e1,
340     const struct vm_map_entry *e2)
341 {
342 	return e1->start < e2->start ? -1 : e1->start > e2->start;
343 }
344 
345 /*
346  * Copy mapentry.
347  */
348 static __inline void
349 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
350 {
351 	caddr_t csrc, cdst;
352 	size_t sz;
353 
354 	csrc = (caddr_t)src;
355 	cdst = (caddr_t)dst;
356 	csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
357 	cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
358 
359 	sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) -
360 	    offsetof(struct vm_map_entry, uvm_map_entry_start_copy);
361 	memcpy(cdst, csrc, sz);
362 }
363 
364 /*
365  * Handle free-list insertion.
366  */
367 void
368 uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr,
369     struct vm_map_entry *entry)
370 {
371 	const struct uvm_addr_functions *fun;
372 #ifdef VMMAP_DEBUG
373 	vaddr_t min, max, bound;
374 #endif
375 
376 #ifdef VMMAP_DEBUG
377 	/*
378 	 * Boundary check.
379 	 * Boundaries are folded if they go on the same free list.
380 	 */
381 	min = VMMAP_FREE_START(entry);
382 	max = VMMAP_FREE_END(entry);
383 
384 	while (min < max) {
385 		bound = uvm_map_boundary(map, min, max);
386 		KASSERT(uvm_map_uaddr(map, min) == uaddr);
387 		min = bound;
388 	}
389 #endif
390 	KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
391 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
392 
393 	UVM_MAP_REQ_WRITE(map);
394 
395 	/* Actual insert: forward to uaddr pointer. */
396 	if (uaddr != NULL) {
397 		fun = uaddr->uaddr_functions;
398 		KDASSERT(fun != NULL);
399 		if (fun->uaddr_free_insert != NULL)
400 			(*fun->uaddr_free_insert)(map, uaddr, entry);
401 		entry->etype |= UVM_ET_FREEMAPPED;
402 	}
403 
404 	/* Update fspace augmentation. */
405 	uvm_map_addr_augment(entry);
406 }
407 
408 /*
409  * Handle free-list removal.
410  */
411 void
412 uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr,
413     struct vm_map_entry *entry)
414 {
415 	const struct uvm_addr_functions *fun;
416 
417 	KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
418 	KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
419 	UVM_MAP_REQ_WRITE(map);
420 
421 	if (uaddr != NULL) {
422 		fun = uaddr->uaddr_functions;
423 		if (fun->uaddr_free_remove != NULL)
424 			(*fun->uaddr_free_remove)(map, uaddr, entry);
425 		entry->etype &= ~UVM_ET_FREEMAPPED;
426 	}
427 }
428 
429 /*
430  * Handle address tree insertion.
431  */
432 void
433 uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry)
434 {
435 	struct vm_map_entry *res;
436 
437 	if (!RBT_CHECK(uvm_map_addr, entry, UVMMAP_DEADBEEF))
438 		panic("uvm_mapent_addr_insert: entry still in addr list");
439 	KDASSERT(entry->start <= entry->end);
440 	KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 &&
441 	    (entry->end & (vaddr_t)PAGE_MASK) == 0);
442 
443 	UVM_MAP_REQ_WRITE(map);
444 	res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
445 	if (res != NULL) {
446 		panic("uvm_mapent_addr_insert: map %p entry %p "
447 		    "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision "
448 		    "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)",
449 		    map, entry,
450 		    entry->start, entry->end, entry->guard, entry->fspace,
451 		    res, res->start, res->end, res->guard, res->fspace);
452 	}
453 }
454 
455 /*
456  * Handle address tree removal.
457  */
458 void
459 uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry)
460 {
461 	struct vm_map_entry *res;
462 
463 	UVM_MAP_REQ_WRITE(map);
464 	res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
465 	if (res != entry)
466 		panic("uvm_mapent_addr_remove");
467 	RBT_POISON(uvm_map_addr, entry, UVMMAP_DEADBEEF);
468 }
469 
470 /*
471  * uvm_map_reference: add reference to a map
472  *
473  * XXX check map reference counter lock
474  */
475 #define uvm_map_reference(_map)						\
476 	do {								\
477 		(_map)->ref_count++;					\
478 	} while (0)
479 
480 /*
481  * Calculate the dused delta.
482  */
483 vsize_t
484 uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max)
485 {
486 	struct vmspace *vm;
487 	vsize_t sz;
488 	vaddr_t lmax;
489 	vaddr_t stack_begin, stack_end; /* Position of stack. */
490 
491 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
492 	vm = (struct vmspace *)map;
493 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
494 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
495 
496 	sz = 0;
497 	while (min != max) {
498 		lmax = max;
499 		if (min < stack_begin && lmax > stack_begin)
500 			lmax = stack_begin;
501 		else if (min < stack_end && lmax > stack_end)
502 			lmax = stack_end;
503 
504 		if (min >= stack_begin && min < stack_end) {
505 			/* nothing */
506 		} else
507 			sz += lmax - min;
508 		min = lmax;
509 	}
510 
511 	return sz >> PAGE_SHIFT;
512 }
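/*
 * Worked example (illustrative, hypothetical addresses, 4KB pages): with the
 * stack occupying [0x4000, 0x6000) and a request for [0x1000, 0x9000), only
 * [0x1000, 0x4000) and [0x6000, 0x9000) are counted: 0x6000 bytes, i.e. a
 * return value of 6 pages.
 */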
513 
514 /*
515  * Find the entry describing the given address.
516  */
517 struct vm_map_entry*
518 uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr)
519 {
520 	struct vm_map_entry *iter;
521 
522 	iter = RBT_ROOT(uvm_map_addr, atree);
523 	while (iter != NULL) {
524 		if (iter->start > addr)
525 			iter = RBT_LEFT(uvm_map_addr, iter);
526 		else if (VMMAP_FREE_END(iter) <= addr)
527 			iter = RBT_RIGHT(uvm_map_addr, iter);
528 		else
529 			return iter;
530 	}
531 	return NULL;
532 }
533 
534 /*
535  * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry)
536  *
537  * Push dead entries into a linked list.
538  * Since the linked list abuses the address tree for storage, the entry
539  * must not be linked in a map.
540  *
541  * The deadq must be initialized with TAILQ_INIT() before the first call
542  * to this macro.  uvm_unmap_detach(&deadq, 0) will release the dead entries.
543  */
544 static __inline void
545 dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry)
546 {
547 	TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq);
548 }
549 #define DEAD_ENTRY_PUSH(_headptr, _entry)				\
550 	dead_entry_push((_headptr), (_entry))
551 
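/*
 * Usage sketch (non-INTRSAFE case; mirrors uvm_unmap() below).  Entries
 * removed from the map are pushed onto the deadq via DEAD_ENTRY_PUSH and
 * reaped afterwards, without the map lock held:
 *
 *	struct uvm_map_deadq dead;
 *
 *	TAILQ_INIT(&dead);
 *	vm_map_lock(map);
 *	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
 *	vm_map_unlock(map);
 *	uvm_unmap_detach(&dead, 0);
 */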
552 /*
553  * Helper function for uvm_map_findspace_tree.
554  *
555  * Given allocation constraints and pmap constraints, finds the
556  * lowest and highest address in a range that can be used for the
557  * allocation.
558  *
559  * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs.
560  *
561  *
562  * Big chunk of math with a seasoning of dragons.
563  */
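/*
 * Worked example of the plain alignment path (illustrative, hypothetical
 * numbers): for a free range [0x1000, 0x20000), sz = 0x2000, no guard page
 * and align = 0x10000, the selectable start addresses become
 *
 *	sel_min = roundup(0x1000, 0x10000)       = 0x10000
 *	sel_max = (0x20000 - 0x2000) & ~0xffff   = 0x10000
 *
 * leaving exactly one candidate address.
 */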
564 int
565 uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg,
566     struct vm_map_entry *sel, vaddr_t align,
567     vaddr_t pmap_align, vaddr_t pmap_off, int bias)
568 {
569 	vaddr_t sel_min, sel_max;
570 #ifdef PMAP_PREFER
571 	vaddr_t pmap_min, pmap_max;
572 #endif /* PMAP_PREFER */
573 #ifdef DIAGNOSTIC
574 	int bad;
575 #endif /* DIAGNOSTIC */
576 
577 	sel_min = VMMAP_FREE_START(sel);
578 	sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0);
579 
580 #ifdef PMAP_PREFER
581 
582 	/*
583 	 * There are two special cases in which we can satisfy both the align
584 	 * requirement and the pmap_prefer requirement.
585 	 * - when pmap_off == 0, we always select the larger of the two
586 	 * - when pmap_off % align == 0 and pmap_align > align, we simply
587 	 *   satisfy the pmap_align requirement and automatically
588 	 *   satisfy the align requirement.
589 	 */
590 	if (align > PAGE_SIZE &&
591 	    !(pmap_align > align && (pmap_off & (align - 1)) == 0)) {
592 		/*
593 		 * Simple case: only use align.
594 		 */
595 		sel_min = roundup(sel_min, align);
596 		sel_max &= ~(align - 1);
597 
598 		if (sel_min > sel_max)
599 			return ENOMEM;
600 
601 		/* Correct for bias. */
602 		if (sel_max - sel_min > FSPACE_BIASGAP) {
603 			if (bias > 0) {
604 				sel_min = sel_max - FSPACE_BIASGAP;
605 				sel_min = roundup(sel_min, align);
606 			} else if (bias < 0) {
607 				sel_max = sel_min + FSPACE_BIASGAP;
608 				sel_max &= ~(align - 1);
609 			}
610 		}
611 	} else if (pmap_align != 0) {
612 		/*
613 		 * Special case: satisfy both pmap_prefer and
614 		 * align argument.
615 		 */
616 		pmap_max = sel_max & ~(pmap_align - 1);
617 		pmap_min = sel_min;
618 		if (pmap_max < sel_min)
619 			return ENOMEM;
620 
621 		/* Adjust pmap_min for BIASGAP for top-addr bias. */
622 		if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP)
623 			pmap_min = pmap_max - FSPACE_BIASGAP;
624 		/* Align pmap_min. */
625 		pmap_min &= ~(pmap_align - 1);
626 		if (pmap_min < sel_min)
627 			pmap_min += pmap_align;
628 		if (pmap_min > pmap_max)
629 			return ENOMEM;
630 
631 		/* Adjust pmap_max for BIASGAP for bottom-addr bias. */
632 		if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) {
633 			pmap_max = (pmap_min + FSPACE_BIASGAP) &
634 			    ~(pmap_align - 1);
635 		}
636 		if (pmap_min > pmap_max)
637 			return ENOMEM;
638 
639 		/* Apply pmap prefer offset. */
640 		pmap_max |= pmap_off;
641 		if (pmap_max > sel_max)
642 			pmap_max -= pmap_align;
643 		pmap_min |= pmap_off;
644 		if (pmap_min < sel_min)
645 			pmap_min += pmap_align;
646 
647 		/*
648 		 * Fixup: it's possible that pmap_min and pmap_max
649 	 * cross each other. In this case, try to find one
650 		 * address that is allowed.
651 		 * (This usually happens in biased case.)
652 		 */
653 		if (pmap_min > pmap_max) {
654 			if (pmap_min < sel_max)
655 				pmap_max = pmap_min;
656 			else if (pmap_max > sel_min)
657 				pmap_min = pmap_max;
658 			else
659 				return ENOMEM;
660 		}
661 
662 		/* Internal validation. */
663 		KDASSERT(pmap_min <= pmap_max);
664 
665 		sel_min = pmap_min;
666 		sel_max = pmap_max;
667 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
668 		sel_min = sel_max - FSPACE_BIASGAP;
669 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
670 		sel_max = sel_min + FSPACE_BIASGAP;
671 
672 #else
673 
674 	if (align > PAGE_SIZE) {
675 		sel_min = roundup(sel_min, align);
676 		sel_max &= ~(align - 1);
677 		if (sel_min > sel_max)
678 			return ENOMEM;
679 
680 		if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) {
681 			if (bias > 0) {
682 				sel_min = roundup(sel_max - FSPACE_BIASGAP,
683 				    align);
684 			} else {
685 				sel_max = (sel_min + FSPACE_BIASGAP) &
686 				    ~(align - 1);
687 			}
688 		}
689 	} else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP)
690 		sel_min = sel_max - FSPACE_BIASGAP;
691 	else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP)
692 		sel_max = sel_min + FSPACE_BIASGAP;
693 
694 #endif
695 
696 	if (sel_min > sel_max)
697 		return ENOMEM;
698 
699 #ifdef DIAGNOSTIC
700 	bad = 0;
701 	/* Lower boundary check. */
702 	if (sel_min < VMMAP_FREE_START(sel)) {
703 		printf("sel_min: 0x%lx, but should be at least 0x%lx\n",
704 		    sel_min, VMMAP_FREE_START(sel));
705 		bad++;
706 	}
707 	/* Upper boundary check. */
708 	if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) {
709 		printf("sel_max: 0x%lx, but should be at most 0x%lx\n",
710 		    sel_max,
711 		    VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0));
712 		bad++;
713 	}
714 	/* Lower boundary alignment. */
715 	if (align != 0 && (sel_min & (align - 1)) != 0) {
716 		printf("sel_min: 0x%lx, not aligned to 0x%lx\n",
717 		    sel_min, align);
718 		bad++;
719 	}
720 	/* Upper boundary alignment. */
721 	if (align != 0 && (sel_max & (align - 1)) != 0) {
722 		printf("sel_max: 0x%lx, not aligned to 0x%lx\n",
723 		    sel_max, align);
724 		bad++;
725 	}
726 	/* Lower boundary PMAP_PREFER check. */
727 	if (pmap_align != 0 && align == 0 &&
728 	    (sel_min & (pmap_align - 1)) != pmap_off) {
729 		printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
730 		    sel_min, sel_min & (pmap_align - 1), pmap_off);
731 		bad++;
732 	}
733 	/* Upper boundary PMAP_PREFER check. */
734 	if (pmap_align != 0 && align == 0 &&
735 	    (sel_max & (pmap_align - 1)) != pmap_off) {
736 		printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n",
737 		    sel_max, sel_max & (pmap_align - 1), pmap_off);
738 		bad++;
739 	}
740 
741 	if (bad) {
742 		panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, "
743 		    "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, "
744 		    "bias = %d, "
745 		    "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)",
746 		    sz, (guardpg ? 'T' : 'F'), align, pmap_align, pmap_off,
747 		    bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel));
748 	}
749 #endif /* DIAGNOSTIC */
750 
751 	*min = sel_min;
752 	*max = sel_max;
753 	return 0;
754 }
755 
756 /*
757  * Test if memory starting at addr with sz bytes is free.
758  *
759  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
760  * the space.
761  * If called with prefilled *start_ptr and *end_ptr, they must be correct.
762  */
763 int
764 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
765     struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr,
766     vaddr_t addr, vsize_t sz)
767 {
768 	struct uvm_addr_state *free;
769 	struct uvm_map_addr *atree;
770 	struct vm_map_entry *i, *i_end;
771 
772 	if (addr + sz < addr)
773 		return 0;
774 
775 	/*
776 	 * Kernel memory above uvm_maxkaddr is considered unavailable.
777 	 */
778 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
779 		if (addr + sz > uvm_maxkaddr)
780 			return 0;
781 	}
782 
783 	atree = &map->addr;
784 
785 	/*
786 	 * Fill in first, last, so they point at the entries containing the
787 	 * first and last address of the range.
788 	 * Note that if they are not NULL, we don't perform the lookup.
789 	 */
790 	KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL);
791 	if (*start_ptr == NULL) {
792 		*start_ptr = uvm_map_entrybyaddr(atree, addr);
793 		if (*start_ptr == NULL)
794 			return 0;
795 	} else
796 		KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr));
797 	if (*end_ptr == NULL) {
798 		if (VMMAP_FREE_END(*start_ptr) >= addr + sz)
799 			*end_ptr = *start_ptr;
800 		else {
801 			*end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1);
802 			if (*end_ptr == NULL)
803 				return 0;
804 		}
805 	} else
806 		KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1));
807 
808 	/* Validation. */
809 	KDASSERT(*start_ptr != NULL && *end_ptr != NULL);
810 	KDASSERT((*start_ptr)->start <= addr &&
811 	    VMMAP_FREE_END(*start_ptr) > addr &&
812 	    (*end_ptr)->start < addr + sz &&
813 	    VMMAP_FREE_END(*end_ptr) >= addr + sz);
814 
815 	/*
816 	 * Check that none of the entries intersects with <addr, addr+sz>.
817 	 * Also, if the entry belongs to uaddr_exe or uaddr_brk_stack, it is
818 	 * considered unavailable unless called by those allocators.
819 	 */
820 	i = *start_ptr;
821 	i_end = RBT_NEXT(uvm_map_addr, *end_ptr);
822 	for (; i != i_end;
823 	    i = RBT_NEXT(uvm_map_addr, i)) {
824 		if (i->start != i->end && i->end > addr)
825 			return 0;
826 
827 		/*
828 		 * uaddr_exe and uaddr_brk_stack may only be used
829 		 * by these allocators and the NULL uaddr (i.e. no
830 		 * uaddr).
831 		 * Reject if this requirement is not met.
832 		 */
833 		if (uaddr != NULL) {
834 			free = uvm_map_uaddr_e(map, i);
835 
836 			if (uaddr != free && free != NULL &&
837 			    (free == map->uaddr_exe ||
838 			     free == map->uaddr_brk_stack))
839 				return 0;
840 		}
841 	}
842 
843 	return -1;
844 }
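/*
 * Typical use (sketch; mirrors the fixed-address path in uvm_mapanon()):
 *
 *	first = last = NULL;
 *	if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
 *		error = ENOMEM;
 *		goto unlock;
 *	}
 */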
845 
846 /*
847  * Invoke each address selector until an address is found.
848  * Will not invoke uaddr_exe.
849  */
850 int
851 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
852     struct vm_map_entry**last, vaddr_t *addr, vsize_t sz,
853     vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint)
854 {
855 	struct uvm_addr_state *uaddr;
856 	int i;
857 
858 	/*
859 	 * Allocation for sz bytes at any address,
860 	 * using the addr selectors in order.
861 	 */
862 	for (i = 0; i < nitems(map->uaddr_any); i++) {
863 		uaddr = map->uaddr_any[i];
864 
865 		if (uvm_addr_invoke(map, uaddr, first, last,
866 		    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
867 			return 0;
868 	}
869 
870 	/* Fall back to brk() and stack() address selectors. */
871 	uaddr = map->uaddr_brk_stack;
872 	if (uvm_addr_invoke(map, uaddr, first, last,
873 	    addr, sz, pmap_align, pmap_offset, prot, hint) == 0)
874 		return 0;
875 
876 	return ENOMEM;
877 }
878 
879 /* Calculate entry augmentation value. */
880 vsize_t
881 uvm_map_addr_augment_get(struct vm_map_entry *entry)
882 {
883 	vsize_t			 augment;
884 	struct vm_map_entry	*left, *right;
885 
886 	augment = entry->fspace;
887 	if ((left = RBT_LEFT(uvm_map_addr, entry)) != NULL)
888 		augment = MAX(augment, left->fspace_augment);
889 	if ((right = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
890 		augment = MAX(augment, right->fspace_augment);
891 	return augment;
892 }
893 
894 /*
895  * Update augmentation data in entry.
896  */
897 void
898 uvm_map_addr_augment(struct vm_map_entry *entry)
899 {
900 	vsize_t			 augment;
901 
902 	while (entry != NULL) {
903 		/* Calculate value for augmentation. */
904 		augment = uvm_map_addr_augment_get(entry);
905 
906 		/*
907 		 * Descend update.
908 		 * Once we find an entry that already has the correct value,
909 		 * stop, since it means all its parents will use the correct
910 		 * value too.
911 		 */
912 		if (entry->fspace_augment == augment)
913 			return;
914 		entry->fspace_augment = augment;
915 		entry = RBT_PARENT(uvm_map_addr, entry);
916 	}
917 }
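/*
 * The augmentation keeps fspace_augment of every node equal to the maximum
 * fspace anywhere in its subtree, which presumably lets the address selectors
 * (see uvm_addr.c) skip whole subtrees that cannot hold the requested size.
 */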
918 
919 /*
920  * uvm_mapanon: establish a valid mapping in map for an anon
921  *
922  * => *addr and sz must be a multiple of PAGE_SIZE.
923  * => *addr is ignored, except if flags contains UVM_FLAG_FIXED.
924  * => map must be unlocked.
925  *
926  * => align: align vaddr, must be a power-of-2.
927  *    Align is only a hint and will be ignored if the alignment fails.
928  */
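/*
 * Example call (illustrative sketch; the flag packing via UVM_MAPFLAG and the
 * chosen flags are assumptions, not taken from this file):
 *
 *	error = uvm_mapanon(&p->p_vmspace->vm_map, &addr, round_page(size),
 *	    0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
 *	    MAP_INHERIT_COPY, MADV_NORMAL, UVM_FLAG_COPYONW));
 */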
929 int
930 uvm_mapanon(struct vm_map *map, vaddr_t *addr, vsize_t sz,
931     vsize_t align, unsigned int flags)
932 {
933 	struct vm_map_entry	*first, *last, *entry, *new;
934 	struct uvm_map_deadq	 dead;
935 	vm_prot_t		 prot;
936 	vm_prot_t		 maxprot;
937 	vm_inherit_t		 inherit;
938 	int			 advice;
939 	int			 error;
940 	vaddr_t			 pmap_align, pmap_offset;
941 	vaddr_t			 hint;
942 
943 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
944 	KASSERT(map != kernel_map);
945 	KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
946 
947 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
948 	splassert(IPL_NONE);
949 
950 	/*
951 	 * We use pmap_align and pmap_offset as alignment and offset variables.
952 	 *
953 	 * Because the align parameter takes precedence over pmap prefer,
954 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
955 	 * if pmap_prefer will not align.
956 	 */
957 	pmap_align = MAX(align, PAGE_SIZE);
958 	pmap_offset = 0;
959 
960 	/* Decode parameters. */
961 	prot = UVM_PROTECTION(flags);
962 	maxprot = UVM_MAXPROTECTION(flags);
963 	advice = UVM_ADVICE(flags);
964 	inherit = UVM_INHERIT(flags);
965 	error = 0;
966 	hint = trunc_page(*addr);
967 	TAILQ_INIT(&dead);
968 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
969 	KASSERT((align & (align - 1)) == 0);
970 
971 	/* Check protection. */
972 	if ((prot & maxprot) != prot)
973 		return EACCES;
974 
975 	/*
976 	 * Before grabbing the lock, allocate a map entry for later
977 	 * use to ensure we don't wait for memory while holding the
978 	 * vm_map_lock.
979 	 */
980 	new = uvm_mapent_alloc(map, flags);
981 	if (new == NULL)
982 		return ENOMEM;
983 
984 	if (flags & UVM_FLAG_TRYLOCK) {
985 		if (vm_map_lock_try(map) == FALSE) {
986 			error = EFAULT;
987 			goto out;
988 		}
989 	} else
990 		vm_map_lock(map);
991 
992 	first = last = NULL;
993 	if (flags & UVM_FLAG_FIXED) {
994 		/*
995 		 * Fixed location.
996 		 *
997 		 * Note: we ignore align, pmap_prefer.
998 		 * Fill in first, last and *addr.
999 		 */
1000 		KASSERT((*addr & PAGE_MASK) == 0);
1001 
1002 		/* Check that the space is available. */
1003 		if (flags & UVM_FLAG_UNMAP)
1004 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1005 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1006 			error = ENOMEM;
1007 			goto unlock;
1008 		}
1009 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1010 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1011 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1012 		/*
1013 		 * Address used as hint.
1014 		 *
1015 		 * Note: we enforce the alignment restriction,
1016 		 * but ignore pmap_prefer.
1017 		 */
1018 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1019 		/* Run selection algorithm for executables. */
1020 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1021 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1022 
1023 		if (error != 0)
1024 			goto unlock;
1025 	} else {
1026 		/* Update freelists from vmspace. */
1027 		uvm_map_vmspace_update(map, &dead, flags);
1028 
1029 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1030 		    pmap_align, pmap_offset, prot, hint);
1031 
1032 		if (error != 0)
1033 			goto unlock;
1034 	}
1035 
1036 	/* Double-check if selected address doesn't cause overflow. */
1037 	if (*addr + sz < *addr) {
1038 		error = ENOMEM;
1039 		goto unlock;
1040 	}
1041 
1042 	/* If we only want a query, return now. */
1043 	if (flags & UVM_FLAG_QUERY) {
1044 		error = 0;
1045 		goto unlock;
1046 	}
1047 
1048 	/*
1049 	 * Create new entry.
1050 	 * first and last may be invalidated after this call.
1051 	 */
1052 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1053 	    new);
1054 	if (entry == NULL) {
1055 		error = ENOMEM;
1056 		goto unlock;
1057 	}
1058 	new = NULL;
1059 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1060 	entry->object.uvm_obj = NULL;
1061 	entry->offset = 0;
1062 	entry->protection = prot;
1063 	entry->max_protection = maxprot;
1064 	entry->inheritance = inherit;
1065 	entry->wired_count = 0;
1066 	entry->advice = advice;
1067 	if (flags & UVM_FLAG_COPYONW) {
1068 		entry->etype |= UVM_ET_COPYONWRITE;
1069 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1070 			entry->etype |= UVM_ET_NEEDSCOPY;
1071 	}
1072 	if (flags & UVM_FLAG_OVERLAY) {
1073 		KERNEL_LOCK();
1074 		entry->aref.ar_pageoff = 0;
1075 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1076 		KERNEL_UNLOCK();
1077 	}
1078 
1079 	/* Update map and process statistics. */
1080 	map->size += sz;
1081 	((struct vmspace *)map)->vm_dused += uvmspace_dused(map, *addr, *addr + sz);
1082 
1083 unlock:
1084 	vm_map_unlock(map);
1085 
1086 	/*
1087 	 * Remove dead entries.
1088 	 *
1089 	 * Dead entries may be the result of merging.
1090 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1091 	 * destroy free-space entries.
1092 	 */
1093 	uvm_unmap_detach(&dead, 0);
1094 out:
1095 	if (new)
1096 		uvm_mapent_free(new);
1097 	return error;
1098 }
1099 
1100 /*
1101  * uvm_map: establish a valid mapping in map
1102  *
1103  * => *addr and sz must be a multiple of PAGE_SIZE.
1104  * => map must be unlocked.
1105  * => <uobj,uoffset> value meanings (4 cases):
1106  *	[1] <NULL,uoffset>		== uoffset is a hint for PMAP_PREFER
1107  *	[2] <NULL,UVM_UNKNOWN_OFFSET>	== don't PMAP_PREFER
1108  *	[3] <uobj,uoffset>		== normal mapping
1109  *	[4] <uobj,UVM_UNKNOWN_OFFSET>	== uvm_map finds offset based on VA
1110  *
1111  *   case [4] is for kernel mappings where we don't know the offset until
1112  *   we've found a virtual address.   note that kernel object offsets are
1113  *   always relative to vm_map_min(kernel_map).
1114  *
1115  * => align: align vaddr, must be a power-of-2.
1116  *    Align is only a hint and will be ignored if the alignment fails.
1117  */
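/*
 * Example call for case [2] above (illustrative sketch; the UVM_MAPFLAG
 * packing is an assumption based on uvm_extern.h):
 *
 *	vaddr_t va = 0;
 *	error = uvm_map(kernel_map, &va, PAGE_SIZE, NULL, UVM_UNKNOWN_OFFSET,
 *	    0, UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
 *	    MAP_INHERIT_NONE, MADV_NORMAL, 0));
 */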
1118 int
1119 uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz,
1120     struct uvm_object *uobj, voff_t uoffset,
1121     vsize_t align, unsigned int flags)
1122 {
1123 	struct vm_map_entry	*first, *last, *entry, *new;
1124 	struct uvm_map_deadq	 dead;
1125 	vm_prot_t		 prot;
1126 	vm_prot_t		 maxprot;
1127 	vm_inherit_t		 inherit;
1128 	int			 advice;
1129 	int			 error;
1130 	vaddr_t			 pmap_align, pmap_offset;
1131 	vaddr_t			 hint;
1132 
1133 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1134 		splassert(IPL_NONE);
1135 	else
1136 		splassert(IPL_VM);
1137 
1138 	/*
1139 	 * We use pmap_align and pmap_offset as alignment and offset variables.
1140 	 *
1141 	 * Because the align parameter takes precedence over pmap prefer,
1142 	 * the pmap_align will need to be set to align, with pmap_offset = 0,
1143 	 * if pmap_prefer will not align.
1144 	 */
1145 	if (uoffset == UVM_UNKNOWN_OFFSET) {
1146 		pmap_align = MAX(align, PAGE_SIZE);
1147 		pmap_offset = 0;
1148 	} else {
1149 		pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE);
1150 		pmap_offset = PMAP_PREFER_OFFSET(uoffset);
1151 
1152 		if (align == 0 ||
1153 		    (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) {
1154 			/* pmap_offset satisfies align, no change. */
1155 		} else {
1156 			/* Align takes precedence over pmap prefer. */
1157 			pmap_align = align;
1158 			pmap_offset = 0;
1159 		}
1160 	}
1161 
1162 	/* Decode parameters. */
1163 	prot = UVM_PROTECTION(flags);
1164 	maxprot = UVM_MAXPROTECTION(flags);
1165 	advice = UVM_ADVICE(flags);
1166 	inherit = UVM_INHERIT(flags);
1167 	error = 0;
1168 	hint = trunc_page(*addr);
1169 	TAILQ_INIT(&dead);
1170 	KASSERT((sz & (vaddr_t)PAGE_MASK) == 0);
1171 	KASSERT((align & (align - 1)) == 0);
1172 
1173 	/* Holes are incompatible with other types of mappings. */
1174 	if (flags & UVM_FLAG_HOLE) {
1175 		KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) &&
1176 		    (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0);
1177 	}
1178 
1179 	/* Unset hint for kernel_map non-fixed allocations. */
1180 	if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED))
1181 		hint = 0;
1182 
1183 	/* Check protection. */
1184 	if ((prot & maxprot) != prot)
1185 		return EACCES;
1186 
1187 	if (map == kernel_map &&
1188 	    (prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
1189 		panic("uvm_map: kernel map W^X violation requested");
1190 
1191 	/*
1192 	 * Before grabbing the lock, allocate a map entry for later
1193 	 * use to ensure we don't wait for memory while holding the
1194 	 * vm_map_lock.
1195 	 */
1196 	new = uvm_mapent_alloc(map, flags);
1197 	if (new == NULL)
1198 		return ENOMEM;
1199 
1200 	if (flags & UVM_FLAG_TRYLOCK) {
1201 		if (vm_map_lock_try(map) == FALSE) {
1202 			error = EFAULT;
1203 			goto out;
1204 		}
1205 	} else {
1206 		vm_map_lock(map);
1207 	}
1208 
1209 	first = last = NULL;
1210 	if (flags & UVM_FLAG_FIXED) {
1211 		/*
1212 		 * Fixed location.
1213 		 *
1214 		 * Note: we ignore align, pmap_prefer.
1215 		 * Fill in first, last and *addr.
1216 		 */
1217 		KASSERT((*addr & PAGE_MASK) == 0);
1218 
1219 		/*
1220 		 * Grow pmap to include allocated address.
1221 		 * If the growth fails, the allocation will fail too.
1222 		 */
1223 		if ((map->flags & VM_MAP_ISVMSPACE) == 0 &&
1224 		    uvm_maxkaddr < (*addr + sz)) {
1225 			uvm_map_kmem_grow(map, &dead,
1226 			    *addr + sz - uvm_maxkaddr, flags);
1227 		}
1228 
1229 		/* Check that the space is available. */
1230 		if (flags & UVM_FLAG_UNMAP)
1231 			uvm_unmap_remove(map, *addr, *addr + sz, &dead, FALSE, TRUE);
1232 		if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1233 			error = ENOMEM;
1234 			goto unlock;
1235 		}
1236 	} else if (*addr != 0 && (*addr & PAGE_MASK) == 0 &&
1237 	    (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE &&
1238 	    (align == 0 || (*addr & (align - 1)) == 0) &&
1239 	    uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) {
1240 		/*
1241 		 * Address used as hint.
1242 		 *
1243 		 * Note: we enforce the alignment restriction,
1244 		 * but ignore pmap_prefer.
1245 		 */
1246 	} else if ((prot & PROT_EXEC) != 0 && map->uaddr_exe != NULL) {
1247 		/* Run selection algorithm for executables. */
1248 		error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last,
1249 		    addr, sz, pmap_align, pmap_offset, prot, hint);
1250 
1251 		/* Grow kernel memory and try again. */
1252 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1253 			uvm_map_kmem_grow(map, &dead, sz, flags);
1254 
1255 			error = uvm_addr_invoke(map, map->uaddr_exe,
1256 			    &first, &last, addr, sz,
1257 			    pmap_align, pmap_offset, prot, hint);
1258 		}
1259 
1260 		if (error != 0)
1261 			goto unlock;
1262 	} else {
1263 		/* Update freelists from vmspace. */
1264 		if (map->flags & VM_MAP_ISVMSPACE)
1265 			uvm_map_vmspace_update(map, &dead, flags);
1266 
1267 		error = uvm_map_findspace(map, &first, &last, addr, sz,
1268 		    pmap_align, pmap_offset, prot, hint);
1269 
1270 		/* Grow kernel memory and try again. */
1271 		if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) {
1272 			uvm_map_kmem_grow(map, &dead, sz, flags);
1273 
1274 			error = uvm_map_findspace(map, &first, &last, addr, sz,
1275 			    pmap_align, pmap_offset, prot, hint);
1276 		}
1277 
1278 		if (error != 0)
1279 			goto unlock;
1280 	}
1281 
1282 	/* Double-check if selected address doesn't cause overflow. */
1283 	if (*addr + sz < *addr) {
1284 		error = ENOMEM;
1285 		goto unlock;
1286 	}
1287 
1288 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE ||
1289 	    uvm_maxkaddr >= *addr + sz);
1290 
1291 	/* If we only want a query, return now. */
1292 	if (flags & UVM_FLAG_QUERY) {
1293 		error = 0;
1294 		goto unlock;
1295 	}
1296 
1297 	if (uobj == NULL)
1298 		uoffset = 0;
1299 	else if (uoffset == UVM_UNKNOWN_OFFSET) {
1300 		KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1301 		uoffset = *addr - vm_map_min(kernel_map);
1302 	}
1303 
1304 	/*
1305 	 * Create new entry.
1306 	 * first and last may be invalidated after this call.
1307 	 */
1308 	entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead,
1309 	    new);
1310 	if (entry == NULL) {
1311 		error = ENOMEM;
1312 		goto unlock;
1313 	}
1314 	new = NULL;
1315 	KDASSERT(entry->start == *addr && entry->end == *addr + sz);
1316 	entry->object.uvm_obj = uobj;
1317 	entry->offset = uoffset;
1318 	entry->protection = prot;
1319 	entry->max_protection = maxprot;
1320 	entry->inheritance = inherit;
1321 	entry->wired_count = 0;
1322 	entry->advice = advice;
1323 	if (uobj)
1324 		entry->etype |= UVM_ET_OBJ;
1325 	else if (flags & UVM_FLAG_HOLE)
1326 		entry->etype |= UVM_ET_HOLE;
1327 	if (flags & UVM_FLAG_NOFAULT)
1328 		entry->etype |= UVM_ET_NOFAULT;
1329 	if (flags & UVM_FLAG_COPYONW) {
1330 		entry->etype |= UVM_ET_COPYONWRITE;
1331 		if ((flags & UVM_FLAG_OVERLAY) == 0)
1332 			entry->etype |= UVM_ET_NEEDSCOPY;
1333 	}
1334 	if (flags & UVM_FLAG_OVERLAY) {
1335 		entry->aref.ar_pageoff = 0;
1336 		entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0);
1337 	}
1338 
1339 	/* Update map and process statistics. */
1340 	if (!(flags & UVM_FLAG_HOLE)) {
1341 		map->size += sz;
1342 		if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL) {
1343 			((struct vmspace *)map)->vm_dused +=
1344 			    uvmspace_dused(map, *addr, *addr + sz);
1345 		}
1346 	}
1347 
1348 	/*
1349 	 * Try to merge entry.
1350 	 *
1351 	 * Userland allocations are kept separated most of the time.
1352 	 * Forego the effort of merging what most of the time can't be merged
1353 	 * and only try the merge if it concerns a kernel entry.
1354 	 */
1355 	if ((flags & UVM_FLAG_NOMERGE) == 0 &&
1356 	    (map->flags & VM_MAP_ISVMSPACE) == 0)
1357 		uvm_mapent_tryjoin(map, entry, &dead);
1358 
1359 unlock:
1360 	vm_map_unlock(map);
1361 
1362 	/*
1363 	 * Remove dead entries.
1364 	 *
1365 	 * Dead entries may be the result of merging.
1366 	 * uvm_map_mkentry may also create dead entries, when it attempts to
1367 	 * destroy free-space entries.
1368 	 */
1369 	if (map->flags & VM_MAP_INTRSAFE)
1370 		uvm_unmap_detach_intrsafe(&dead);
1371 	else
1372 		uvm_unmap_detach(&dead, 0);
1373 out:
1374 	if (new)
1375 		uvm_mapent_free(new);
1376 	return error;
1377 }
1378 
1379 /*
1380  * True iff e1 and e2 can be joined together.
1381  */
1382 int
1383 uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1,
1384     struct vm_map_entry *e2)
1385 {
1386 	KDASSERT(e1 != NULL && e2 != NULL);
1387 
1388 	/* Must be the same entry type and not have free memory between. */
1389 	if (e1->etype != e2->etype || e1->end != e2->start)
1390 		return 0;
1391 
1392 	/* Submaps are never joined. */
1393 	if (UVM_ET_ISSUBMAP(e1))
1394 		return 0;
1395 
1396 	/* Never merge wired memory. */
1397 	if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2))
1398 		return 0;
1399 
1400 	/* Protection, inheritance and advice must be equal. */
1401 	if (e1->protection != e2->protection ||
1402 	    e1->max_protection != e2->max_protection ||
1403 	    e1->inheritance != e2->inheritance ||
1404 	    e1->advice != e2->advice)
1405 		return 0;
1406 
1407 	/* If uvm_object: object itself and offsets within object must match. */
1408 	if (UVM_ET_ISOBJ(e1)) {
1409 		if (e1->object.uvm_obj != e2->object.uvm_obj)
1410 			return 0;
1411 		if (e1->offset + (e1->end - e1->start) != e2->offset)
1412 			return 0;
1413 	}
1414 
1415 	/*
1416 	 * Cannot join shared amaps.
1417 	 * Note: no need to lock amap to look at refs, since we don't care
1418 	 * about its exact value.
1419 	 * If it is 1 (i.e. we have the only reference) it will stay there.
1420 	 */
1421 	if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1)
1422 		return 0;
1423 	if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1)
1424 		return 0;
1425 
1426 	/* Apparently, e1 and e2 match. */
1427 	return 1;
1428 }
1429 
1430 /*
1431  * Join support function.
1432  *
1433  * Returns the merged entry on success.
1434  * Returns NULL if the merge failed.
1435  */
1436 struct vm_map_entry*
1437 uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1,
1438     struct vm_map_entry *e2, struct uvm_map_deadq *dead)
1439 {
1440 	struct uvm_addr_state *free;
1441 
1442 	/*
1443 	 * Merging is not supported for map entries that
1444 	 * contain an amap in e1. This should never happen
1445 	 * anyway, because only kernel entries are merged.
1446 	 * These do not contain amaps.
1447 	 * e2 contains no real information in its amap,
1448 	 * so it can be erased immediately.
1449 	 */
1450 	KASSERT(e1->aref.ar_amap == NULL);
1451 
1452 	/*
1453 	 * Don't drop obj reference:
1454 	 * uvm_unmap_detach will do this for us.
1455 	 */
1456 	free = uvm_map_uaddr_e(map, e1);
1457 	uvm_mapent_free_remove(map, free, e1);
1458 
1459 	free = uvm_map_uaddr_e(map, e2);
1460 	uvm_mapent_free_remove(map, free, e2);
1461 	uvm_mapent_addr_remove(map, e2);
1462 	e1->end = e2->end;
1463 	e1->guard = e2->guard;
1464 	e1->fspace = e2->fspace;
1465 	uvm_mapent_free_insert(map, free, e1);
1466 
1467 	DEAD_ENTRY_PUSH(dead, e2);
1468 	return e1;
1469 }
1470 
1471 /*
1472  * Attempt forward and backward joining of entry.
1473  *
1474  * Returns entry after joins.
1475  * We are guaranteed that the amap of entry is either non-existent or
1476  * has never been used.
1477  */
1478 struct vm_map_entry*
1479 uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry,
1480     struct uvm_map_deadq *dead)
1481 {
1482 	struct vm_map_entry *other;
1483 	struct vm_map_entry *merged;
1484 
1485 	/* Merge with previous entry. */
1486 	other = RBT_PREV(uvm_map_addr, entry);
1487 	if (other && uvm_mapent_isjoinable(map, other, entry)) {
1488 		merged = uvm_mapent_merge(map, other, entry, dead);
1489 		if (merged)
1490 			entry = merged;
1491 	}
1492 
1493 	/*
1494 	 * Merge with next entry.
1495 	 *
1496 	 * Because amap can only extend forward and the next entry
1497 	 * probably contains sensible info, only perform forward merging
1498 	 * in the absence of an amap.
1499 	 */
1500 	other = RBT_NEXT(uvm_map_addr, entry);
1501 	if (other && entry->aref.ar_amap == NULL &&
1502 	    other->aref.ar_amap == NULL &&
1503 	    uvm_mapent_isjoinable(map, entry, other)) {
1504 		merged = uvm_mapent_merge(map, entry, other, dead);
1505 		if (merged)
1506 			entry = merged;
1507 	}
1508 
1509 	return entry;
1510 }
1511 
1512 /*
1513  * Kill entries that are no longer in a map.
1514  */
1515 void
1516 uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags)
1517 {
1518 	struct vm_map_entry *entry;
1519 	int waitok = flags & UVM_PLA_WAITOK;
1520 
1521 	if (TAILQ_EMPTY(deadq))
1522 		return;
1523 
1524 	KERNEL_LOCK();
1525 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1526 		if (waitok)
1527 			uvm_pause();
1528 		/* Drop reference to amap, if we've got one. */
1529 		if (entry->aref.ar_amap)
1530 			amap_unref(entry->aref.ar_amap,
1531 			    entry->aref.ar_pageoff,
1532 			    atop(entry->end - entry->start),
1533 			    flags & AMAP_REFALL);
1534 
1535 		/* Drop reference to our backing object, if we've got one. */
1536 		if (UVM_ET_ISSUBMAP(entry)) {
1537 			/* ... unlikely to happen, but play it safe */
1538 			uvm_map_deallocate(entry->object.sub_map);
1539 		} else if (UVM_ET_ISOBJ(entry) &&
1540 		    entry->object.uvm_obj->pgops->pgo_detach) {
1541 			entry->object.uvm_obj->pgops->pgo_detach(
1542 			    entry->object.uvm_obj);
1543 		}
1544 
1545 		/* Step to next. */
1546 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1547 		uvm_mapent_free(entry);
1548 	}
1549 	KERNEL_UNLOCK();
1550 }
1551 
1552 void
1553 uvm_unmap_detach_intrsafe(struct uvm_map_deadq *deadq)
1554 {
1555 	struct vm_map_entry *entry;
1556 
1557 	while ((entry = TAILQ_FIRST(deadq)) != NULL) {
1558 		KASSERT(entry->aref.ar_amap == NULL);
1559 		KASSERT(!UVM_ET_ISSUBMAP(entry));
1560 		KASSERT(!UVM_ET_ISOBJ(entry));
1561 		TAILQ_REMOVE(deadq, entry, dfree.deadq);
1562 		uvm_mapent_free(entry);
1563 	}
1564 }
1565 
1566 /*
1567  * Create and insert new entry.
1568  *
1569  * Returned entry contains new addresses and is inserted properly in the tree.
1570  * first and last are (probably) no longer valid.
1571  */
1572 struct vm_map_entry*
1573 uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first,
1574     struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags,
1575     struct uvm_map_deadq *dead, struct vm_map_entry *new)
1576 {
1577 	struct vm_map_entry *entry, *prev;
1578 	struct uvm_addr_state *free;
1579 	vaddr_t min, max;	/* free space boundaries for new entry */
1580 
1581 	KDASSERT(map != NULL);
1582 	KDASSERT(first != NULL);
1583 	KDASSERT(last != NULL);
1584 	KDASSERT(dead != NULL);
1585 	KDASSERT(sz > 0);
1586 	KDASSERT(addr + sz > addr);
1587 	KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr);
1588 	KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz);
1589 	KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz));
1590 	uvm_tree_sanity(map, __FILE__, __LINE__);
1591 
1592 	min = addr + sz;
1593 	max = VMMAP_FREE_END(last);
1594 
1595 	/* Initialize new entry. */
1596 	if (new == NULL)
1597 		entry = uvm_mapent_alloc(map, flags);
1598 	else
1599 		entry = new;
1600 	if (entry == NULL)
1601 		return NULL;
1602 	entry->offset = 0;
1603 	entry->etype = 0;
1604 	entry->wired_count = 0;
1605 	entry->aref.ar_pageoff = 0;
1606 	entry->aref.ar_amap = NULL;
1607 
1608 	entry->start = addr;
1609 	entry->end = min;
1610 	entry->guard = 0;
1611 	entry->fspace = 0;
1612 
1613 	/* Reset free space in first. */
1614 	free = uvm_map_uaddr_e(map, first);
1615 	uvm_mapent_free_remove(map, free, first);
1616 	first->guard = 0;
1617 	first->fspace = 0;
1618 
1619 	/*
1620 	 * Remove all entries that are fully replaced.
1621 	 * We are iterating using last in reverse order.
1622 	 */
1623 	for (; first != last; last = prev) {
1624 		prev = RBT_PREV(uvm_map_addr, last);
1625 
1626 		KDASSERT(last->start == last->end);
1627 		free = uvm_map_uaddr_e(map, last);
1628 		uvm_mapent_free_remove(map, free, last);
1629 		uvm_mapent_addr_remove(map, last);
1630 		DEAD_ENTRY_PUSH(dead, last);
1631 	}
1632 	/* Remove first if it is entirely inside <addr, addr+sz>.  */
1633 	if (first->start == addr) {
1634 		uvm_mapent_addr_remove(map, first);
1635 		DEAD_ENTRY_PUSH(dead, first);
1636 	} else {
1637 		uvm_map_fix_space(map, first, VMMAP_FREE_START(first),
1638 		    addr, flags);
1639 	}
1640 
1641 	/* Finally, link in entry. */
1642 	uvm_mapent_addr_insert(map, entry);
1643 	uvm_map_fix_space(map, entry, min, max, flags);
1644 
1645 	uvm_tree_sanity(map, __FILE__, __LINE__);
1646 	return entry;
1647 }
1648 
1649 
1650 /*
1651  * uvm_mapent_alloc: allocate a map entry
1652  */
1653 struct vm_map_entry *
1654 uvm_mapent_alloc(struct vm_map *map, int flags)
1655 {
1656 	struct vm_map_entry *me, *ne;
1657 	int pool_flags;
1658 	int i;
1659 
1660 	pool_flags = PR_WAITOK;
1661 	if (flags & UVM_FLAG_TRYLOCK)
1662 		pool_flags = PR_NOWAIT;
1663 
1664 	if (map->flags & VM_MAP_INTRSAFE || cold) {
1665 		mtx_enter(&uvm_kmapent_mtx);
1666 		if (SLIST_EMPTY(&uvm.kentry_free)) {
1667 			ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty,
1668 			    &kd_nowait);
1669 			if (ne == NULL)
1670 				panic("uvm_mapent_alloc: cannot allocate map "
1671 				    "entry");
1672 			for (i = 0; i < PAGE_SIZE / sizeof(*ne); i++) {
1673 				SLIST_INSERT_HEAD(&uvm.kentry_free,
1674 				    &ne[i], daddrs.addr_kentry);
1675 			}
1676 			if (ratecheck(&uvm_kmapent_last_warn_time,
1677 			    &uvm_kmapent_warn_rate))
1678 				printf("uvm_mapent_alloc: out of static "
1679 				    "map entries\n");
1680 		}
1681 		me = SLIST_FIRST(&uvm.kentry_free);
1682 		SLIST_REMOVE_HEAD(&uvm.kentry_free, daddrs.addr_kentry);
1683 		uvmexp.kmapent++;
1684 		mtx_leave(&uvm_kmapent_mtx);
1685 		me->flags = UVM_MAP_STATIC;
1686 	} else if (map == kernel_map) {
1687 		splassert(IPL_NONE);
1688 		me = pool_get(&uvm_map_entry_kmem_pool, pool_flags);
1689 		if (me == NULL)
1690 			goto out;
1691 		me->flags = UVM_MAP_KMEM;
1692 	} else {
1693 		splassert(IPL_NONE);
1694 		me = pool_get(&uvm_map_entry_pool, pool_flags);
1695 		if (me == NULL)
1696 			goto out;
1697 		me->flags = 0;
1698 	}
1699 
1700 	if (me != NULL) {
1701 		RBT_POISON(uvm_map_addr, me, UVMMAP_DEADBEEF);
1702 	}
1703 
1704 out:
1705 	return me;
1706 }
1707 
1708 /*
1709  * uvm_mapent_free: free map entry
1710  *
1711  * => XXX: static pool for kernel map?
1712  */
1713 void
1714 uvm_mapent_free(struct vm_map_entry *me)
1715 {
1716 	if (me->flags & UVM_MAP_STATIC) {
1717 		mtx_enter(&uvm_kmapent_mtx);
1718 		SLIST_INSERT_HEAD(&uvm.kentry_free, me, daddrs.addr_kentry);
1719 		uvmexp.kmapent--;
1720 		mtx_leave(&uvm_kmapent_mtx);
1721 	} else if (me->flags & UVM_MAP_KMEM) {
1722 		splassert(IPL_NONE);
1723 		pool_put(&uvm_map_entry_kmem_pool, me);
1724 	} else {
1725 		splassert(IPL_NONE);
1726 		pool_put(&uvm_map_entry_pool, me);
1727 	}
1728 }
1729 
1730 /*
1731  * uvm_map_lookup_entry: find map entry at or before an address.
1732  *
1733  * => map must at least be read-locked by caller
1734  * => entry is returned in "entry"
1735  * => return value is true if address is in the returned entry
1736  * => ET_HOLE entries are considered to not contain a mapping, ergo FALSE is
1737  *    returned for those mappings.
1738  */
1739 boolean_t
1740 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1741     struct vm_map_entry **entry)
1742 {
1743 	*entry = uvm_map_entrybyaddr(&map->addr, address);
1744 	return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
1745 	    (*entry)->start <= address && (*entry)->end > address;
1746 }
1747 
1748 /*
1749  * uvm_map_pie: return a properly aligned, randomized load address for
1750  * a PIE executable.
1751  */
1752 #ifndef VM_PIE_MAX_ADDR
1753 #define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4)
1754 #endif
1755 
1756 #ifndef VM_PIE_MIN_ADDR
1757 #define VM_PIE_MIN_ADDR VM_MIN_ADDRESS
1758 #endif
1759 
1760 #ifndef VM_PIE_MIN_ALIGN
1761 #define VM_PIE_MIN_ALIGN PAGE_SIZE
1762 #endif
1763 
1764 vaddr_t
1765 uvm_map_pie(vaddr_t align)
1766 {
1767 	vaddr_t addr, space, min;
1768 
1769 	align = MAX(align, VM_PIE_MIN_ALIGN);
1770 
1771 	/* round up to next alignment */
1772 	min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1);
1773 
1774 	if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR)
1775 		return (align);
1776 
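	/*
	 * space is the number of alignment-sized slots between min and
	 * VM_PIE_MAX_ADDR, clamped to 32 bits since arc4random_uniform()
	 * takes a u_int32_t upper bound.
	 */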
1777 	space = (VM_PIE_MAX_ADDR - min) / align;
1778 	space = MIN(space, (u_int32_t)-1);
1779 
1780 	addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align;
1781 	addr += min;
1782 
1783 	return (addr);
1784 }
1785 
1786 void
1787 uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
1788 {
1789 	struct uvm_map_deadq dead;
1790 
1791 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0 &&
1792 	    (end & (vaddr_t)PAGE_MASK) == 0);
1793 	TAILQ_INIT(&dead);
1794 	vm_map_lock(map);
1795 	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
1796 	vm_map_unlock(map);
1797 
1798 	if (map->flags & VM_MAP_INTRSAFE)
1799 		uvm_unmap_detach_intrsafe(&dead);
1800 	else
1801 		uvm_unmap_detach(&dead, 0);
1802 }
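
/*
 * Illustrative use of uvm_unmap() (a sketch, not code from this file):
 * a caller tearing down a page-aligned kernel va range would do
 *
 *	uvm_unmap(kernel_map, va, va + size);
 *
 * with va and size page aligned, as the KASSERT above requires.
 */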
1803 
1804 /*
1805  * Mark entry as free.
1806  *
1807  * entry will be put on the dead list.
1808  * The free space will be merged into the previous or a new entry,
1809  * unless markfree is false.
1810  */
1811 void
1812 uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry,
1813     struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead,
1814     boolean_t markfree)
1815 {
1816 	struct uvm_addr_state	*free;
1817 	struct vm_map_entry	*prev;
1818 	vaddr_t			 addr;	/* Start of freed range. */
1819 	vaddr_t			 end;	/* End of freed range. */
1820 
1821 	prev = *prev_ptr;
1822 	if (prev == entry)
1823 		*prev_ptr = prev = NULL;
1824 
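	/*
	 * If the hint is unusable (NULL or its free space does not run up
	 * to entry->start), fall back to the predecessor in the address
	 * tree.
	 */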
1825 	if (prev == NULL ||
1826 	    VMMAP_FREE_END(prev) != entry->start)
1827 		prev = RBT_PREV(uvm_map_addr, entry);
1828 
1829 	/* Entry describes only free memory and has nothing to drain into. */
1830 	if (prev == NULL && entry->start == entry->end && markfree) {
1831 		*prev_ptr = entry;
1832 		return;
1833 	}
1834 
1835 	addr = entry->start;
1836 	end = VMMAP_FREE_END(entry);
1837 	free = uvm_map_uaddr_e(map, entry);
1838 	uvm_mapent_free_remove(map, free, entry);
1839 	uvm_mapent_addr_remove(map, entry);
1840 	DEAD_ENTRY_PUSH(dead, entry);
1841 
1842 	if (markfree) {
1843 		if (prev) {
1844 			free = uvm_map_uaddr_e(map, prev);
1845 			uvm_mapent_free_remove(map, free, prev);
1846 		}
1847 		*prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0);
1848 	}
1849 }
1850 
1851 /*
1852  * Unwire and release referenced amap and object from map entry.
1853  */
1854 void
1855 uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry)
1856 {
1857 	/* Unwire removed map entry. */
1858 	if (VM_MAPENT_ISWIRED(entry)) {
1859 		KERNEL_LOCK();
1860 		entry->wired_count = 0;
1861 		uvm_fault_unwire_locked(map, entry->start, entry->end);
1862 		KERNEL_UNLOCK();
1863 	}
1864 
1865 	/* Entry-type specific code. */
1866 	if (UVM_ET_ISHOLE(entry)) {
1867 		/* Nothing to be done for holes. */
1868 	} else if (map->flags & VM_MAP_INTRSAFE) {
1869 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1870 		uvm_km_pgremove_intrsafe(entry->start, entry->end);
1871 		pmap_kremove(entry->start, entry->end - entry->start);
1872 	} else if (UVM_ET_ISOBJ(entry) &&
1873 	    UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
1874 		KASSERT(vm_map_pmap(map) == pmap_kernel());
1875 		/*
1876 		 * Note: kernel object mappings are currently used in
1877 		 * two ways:
1878 		 *  [1] "normal" mappings of pages in the kernel object
1879 		 *  [2] uvm_km_valloc'd allocations in which we
1880 		 *      pmap_enter in some non-kernel-object page
1881 		 *      (e.g. vmapbuf).
1882 		 *
1883 		 * for case [1], we need to remove the mapping from
1884 		 * the pmap and then remove the page from the kernel
1885 		 * object (because, once pages in a kernel object are
1886 		 * unmapped they are no longer needed, unlike, say,
1887 		 * a vnode where you might want the data to persist
1888 		 * until flushed out of a queue).
1889 		 *
1890 		 * for case [2], we need to remove the mapping from
1891 		 * the pmap.  there shouldn't be any pages at the
1892 		 * specified offset in the kernel object [but it
1893 		 * doesn't hurt to call uvm_km_pgremove just to be
1894 		 * safe?]
1895 		 *
1896 		 * uvm_km_pgremove currently does the following:
1897 		 *   for pages in the kernel object range:
1898 		 *     - drops the swap slot
1899 		 *     - uvm_pagefree the page
1900 		 *
1901 		 * note there is a version of uvm_km_pgremove() that
1902 		 * is used for "intrsafe" objects.
1903 		 */
1904 		/*
1905 		 * remove mappings from pmap and drop the pages
1906 		 * from the object.  offsets are always relative
1907 		 * to vm_map_min(kernel_map).
1908 		 */
1909 		pmap_remove(pmap_kernel(), entry->start, entry->end);
1910 		uvm_km_pgremove(entry->object.uvm_obj,
1911 		    entry->start - vm_map_min(kernel_map),
1912 		    entry->end - vm_map_min(kernel_map));
1913 
1914 		/*
1915 		 * null out kernel_object reference, we've just
1916 		 * dropped it
1917 		 */
1918 		entry->etype &= ~UVM_ET_OBJ;
1919 		entry->object.uvm_obj = NULL;  /* to be safe */
1920 	} else {
1921 		/* remove mappings the standard way. */
1922 		pmap_remove(map->pmap, entry->start, entry->end);
1923 	}
1924 }
1925 
1926 /*
1927  * Remove all entries from start to end.
1928  *
1929  * If remove_holes, then remove ET_HOLE entries as well.
1930  * If markfree, entry will be properly marked free, otherwise, no replacement
1931  * entry will be put in the tree (corrupting the tree).
1932  */
1933 void
1934 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
1935     struct uvm_map_deadq *dead, boolean_t remove_holes,
1936     boolean_t markfree)
1937 {
1938 	struct vm_map_entry *prev_hint, *next, *entry;
1939 
1940 	start = MAX(start, map->min_offset);
1941 	end = MIN(end, map->max_offset);
1942 	if (start >= end)
1943 		return;
1944 
1945 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
1946 		splassert(IPL_NONE);
1947 	else
1948 		splassert(IPL_VM);
1949 
1950 	/* Find first affected entry. */
1951 	entry = uvm_map_entrybyaddr(&map->addr, start);
1952 	KDASSERT(entry != NULL && entry->start <= start);
1953 	if (entry->end <= start && markfree)
1954 		entry = RBT_NEXT(uvm_map_addr, entry);
1955 	else
1956 		UVM_MAP_CLIP_START(map, entry, start);
1957 
1958 	/*
1959 	 * Iterate entries until we reach end address.
1960 	 * prev_hint hints where the freed space can be appended to.
1961 	 */
1962 	prev_hint = NULL;
1963 	for (; entry != NULL && entry->start < end; entry = next) {
1964 		KDASSERT(entry->start >= start);
1965 		if (entry->end > end || !markfree)
1966 			UVM_MAP_CLIP_END(map, entry, end);
1967 		KDASSERT(entry->start >= start && entry->end <= end);
1968 		next = RBT_NEXT(uvm_map_addr, entry);
1969 
1970 		/* Don't remove holes unless asked to do so. */
1971 		if (UVM_ET_ISHOLE(entry)) {
1972 			if (!remove_holes) {
1973 				prev_hint = entry;
1974 				continue;
1975 			}
1976 		}
1977 
1978 		/* Kill entry. */
1979 		uvm_unmap_kill_entry(map, entry);
1980 
1981 		/* Update space usage. */
1982 		if ((map->flags & VM_MAP_ISVMSPACE) &&
1983 		    entry->object.uvm_obj == NULL &&
1984 		    !UVM_ET_ISHOLE(entry)) {
1985 			((struct vmspace *)map)->vm_dused -=
1986 			    uvmspace_dused(map, entry->start, entry->end);
1987 		}
1988 		if (!UVM_ET_ISHOLE(entry))
1989 			map->size -= entry->end - entry->start;
1990 
1991 		/* Actual removal of entry. */
1992 		uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree);
1993 	}
1994 
1995 	pmap_update(vm_map_pmap(map));
1996 
1997 #ifdef VMMAP_DEBUG
1998 	if (markfree) {
1999 		for (entry = uvm_map_entrybyaddr(&map->addr, start);
2000 		    entry != NULL && entry->start < end;
2001 		    entry = RBT_NEXT(uvm_map_addr, entry)) {
2002 			KDASSERT(entry->end <= start ||
2003 			    entry->start == entry->end ||
2004 			    UVM_ET_ISHOLE(entry));
2005 		}
2006 	} else {
2007 		vaddr_t a;
2008 		for (a = start; a < end; a += PAGE_SIZE)
2009 			KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL);
2010 	}
2011 #endif
2012 }
2013 
2014 /*
2015  * Mark all entries from first until end (exclusive) as pageable.
2016  *
2017  * Lock must be exclusive on entry and will not be touched.
2018  */
2019 void
2020 uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first,
2021     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr)
2022 {
2023 	struct vm_map_entry *iter;
2024 
2025 	for (iter = first; iter != end;
2026 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2027 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2028 		if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2029 			continue;
2030 
2031 		iter->wired_count = 0;
2032 		uvm_fault_unwire_locked(map, iter->start, iter->end);
2033 	}
2034 }
2035 
2036 /*
2037  * Mark all entries from first until end (exclusive) as wired.
2038  *
2039  * Lockflags determines the lock state on return from this function.
2040  * Lock must be exclusive on entry.
2041  */
2042 int
2043 uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first,
2044     struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr,
2045     int lockflags)
2046 {
2047 	struct vm_map_entry *iter;
2048 #ifdef DIAGNOSTIC
2049 	unsigned int timestamp_save;
2050 #endif
2051 	int error;
2052 
2053 	/*
2054 	 * Wire pages in two passes:
2055 	 *
2056 	 * 1: holding the write lock, we create any anonymous maps that need
2057 	 *    to be created.  then we clip each map entry to the region to
2058 	 *    be wired and increment its wiring count.
2059 	 *
2060 	 * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
2061 	 *    in the pages for any newly wired area (wired_count == 1).
2062 	 *
2063 	 *    downgrading to a read lock for uvm_fault_wire avoids a possible
2064 	 *    deadlock with another thread that may have faulted on one of
2065 	 *    the pages to be wired (it would mark the page busy, blocking
2066 	 *    us, then in turn block on the map lock that we hold).
2067 	 *    because we keep the read lock on the map, the copy-on-write
2068 	 *    status of the entries we modify here cannot change.
2069 	 */
2070 	for (iter = first; iter != end;
2071 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2072 		KDASSERT(iter->start >= start_addr && iter->end <= end_addr);
2073 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2074 		    iter->protection == PROT_NONE)
2075 			continue;
2076 
2077 		/*
2078 		 * Perform actions of vm_map_lookup that need the write lock.
2079 		 * - create an anonymous map for copy-on-write
2080 		 * - anonymous map for zero-fill
2081 		 * Skip submaps.
2082 		 */
2083 		if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) &&
2084 		    UVM_ET_ISNEEDSCOPY(iter) &&
2085 		    ((iter->protection & PROT_WRITE) ||
2086 		    iter->object.uvm_obj == NULL)) {
2087 			amap_copy(map, iter, M_WAITOK, TRUE,
2088 			    iter->start, iter->end);
2089 		}
2090 		iter->wired_count++;
2091 	}
2092 
2093 	/*
2094 	 * Pass 2.
2095 	 */
2096 #ifdef DIAGNOSTIC
2097 	timestamp_save = map->timestamp;
2098 #endif
2099 	vm_map_busy(map);
2100 	vm_map_downgrade(map);
2101 
2102 	error = 0;
2103 	for (iter = first; error == 0 && iter != end;
2104 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
2105 		if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2106 		    iter->protection == PROT_NONE)
2107 			continue;
2108 
2109 		error = uvm_fault_wire(map, iter->start, iter->end,
2110 		    iter->protection);
2111 	}
2112 
2113 	if (error) {
2114 		/*
2115 		 * uvm_fault_wire failure
2116 		 *
2117 		 * Reacquire lock and undo our work.
2118 		 */
2119 		vm_map_upgrade(map);
2120 		vm_map_unbusy(map);
2121 #ifdef DIAGNOSTIC
2122 		if (timestamp_save != map->timestamp)
2123 			panic("uvm_map_pageable_wire: stale map");
2124 #endif
2125 
2126 		/*
2127 		 * first is no longer needed to restart loops.
2128 		 * Use it as iterator to unwire the entries wired so far.
2129 		 */
2130 		for (; first != iter;
2131 		    first = RBT_NEXT(uvm_map_addr, first)) {
2132 			if (UVM_ET_ISHOLE(first) ||
2133 			    first->start == first->end ||
2134 			    first->protection == PROT_NONE)
2135 				continue;
2136 
2137 			first->wired_count--;
2138 			if (!VM_MAPENT_ISWIRED(first)) {
2139 				uvm_fault_unwire_locked(map,
2140 				    first->start, first->end);
2141 			}
2142 		}
2143 
2144 		/* decrease counter in the rest of the entries */
2145 		for (; iter != end;
2146 		    iter = RBT_NEXT(uvm_map_addr, iter)) {
2147 			if (UVM_ET_ISHOLE(iter) || iter->start == iter->end ||
2148 			    iter->protection == PROT_NONE)
2149 				continue;
2150 
2151 			iter->wired_count--;
2152 		}
2153 
2154 		if ((lockflags & UVM_LK_EXIT) == 0)
2155 			vm_map_unlock(map);
2156 		return error;
2157 	}
2158 
2159 	/* We are currently holding a read lock. */
2160 	if ((lockflags & UVM_LK_EXIT) == 0) {
2161 		vm_map_unbusy(map);
2162 		vm_map_unlock_read(map);
2163 	} else {
2164 		vm_map_upgrade(map);
2165 		vm_map_unbusy(map);
2166 #ifdef DIAGNOSTIC
2167 		if (timestamp_save != map->timestamp)
2168 			panic("uvm_map_pageable_wire: stale map");
2169 #endif
2170 	}
2171 	return 0;
2172 }
2173 
2174 /*
2175  * uvm_map_pageable: set pageability of a range in a map.
2176  *
2177  * Flags:
2178  * UVM_LK_ENTER: map is already locked by caller
2179  * UVM_LK_EXIT:  don't unlock map on exit
2180  *
2181  * The full range must be in use (entries may not have fspace != 0).
2182  * UVM_ET_HOLE counts as unmapped.
2183  */
2184 int
2185 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
2186     boolean_t new_pageable, int lockflags)
2187 {
2188 	struct vm_map_entry *first, *last, *tmp;
2189 	int error;
2190 
2191 	start = trunc_page(start);
2192 	end = round_page(end);
2193 
2194 	if (start > end)
2195 		return EINVAL;
2196 	if (start == end)
2197 		return 0;	/* nothing to do */
2198 	if (start < map->min_offset)
2199 		return EFAULT; /* why? see first XXX below */
2200 	if (end > map->max_offset)
2201 		return EINVAL; /* why? see second XXX below */
2202 
2203 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2204 	if ((lockflags & UVM_LK_ENTER) == 0)
2205 		vm_map_lock(map);
2206 
2207 	/*
2208 	 * Find first entry.
2209 	 *
2210 	 * Initial test on start is different, because of the different
2211 	 * error returned. Rest is tested further down.
2212 	 */
2213 	first = uvm_map_entrybyaddr(&map->addr, start);
2214 	if (first->end <= start || UVM_ET_ISHOLE(first)) {
2215 		/*
2216 		 * XXX if the first address is not mapped, it is EFAULT?
2217 		 */
2218 		error = EFAULT;
2219 		goto out;
2220 	}
2221 
2222 	/* Check that the range has no holes. */
2223 	for (last = first; last != NULL && last->start < end;
2224 	    last = RBT_NEXT(uvm_map_addr, last)) {
2225 		if (UVM_ET_ISHOLE(last) ||
2226 		    (last->end < end && VMMAP_FREE_END(last) != last->end)) {
2227 			/*
2228 			 * XXX unmapped memory in range, why is it EINVAL
2229 			 * instead of EFAULT?
2230 			 */
2231 			error = EINVAL;
2232 			goto out;
2233 		}
2234 	}
2235 
2236 	/*
2237 	 * Last ended at the first entry after the range.
2238 	 * Move back one step.
2239 	 *
2240 	 * Note that last may be NULL.
2241 	 */
2242 	if (last == NULL) {
2243 		last = RBT_MAX(uvm_map_addr, &map->addr);
2244 		if (last->end < end) {
2245 			error = EINVAL;
2246 			goto out;
2247 		}
2248 	} else {
2249 		KASSERT(last != first);
2250 		last = RBT_PREV(uvm_map_addr, last);
2251 	}
2252 
2253 	/* Wire/unwire pages here. */
2254 	if (new_pageable) {
2255 		/*
2256 		 * Mark pageable.
2257 		 * entries that are not wired are untouched.
2258 		 */
2259 		if (VM_MAPENT_ISWIRED(first))
2260 			UVM_MAP_CLIP_START(map, first, start);
2261 		/*
2262 		 * Split last at end.
2263 		 * Make tmp be the first entry after what is to be touched.
2264 		 * If last is not wired, don't touch it.
2265 		 */
2266 		if (VM_MAPENT_ISWIRED(last)) {
2267 			UVM_MAP_CLIP_END(map, last, end);
2268 			tmp = RBT_NEXT(uvm_map_addr, last);
2269 		} else
2270 			tmp = last;
2271 
2272 		uvm_map_pageable_pgon(map, first, tmp, start, end);
2273 		error = 0;
2274 
2275 out:
2276 		if ((lockflags & UVM_LK_EXIT) == 0)
2277 			vm_map_unlock(map);
2278 		return error;
2279 	} else {
2280 		/*
2281 		 * Mark entries wired.
2282 		 * entries are always touched (because recovery needs this).
2283 		 */
2284 		if (!VM_MAPENT_ISWIRED(first))
2285 			UVM_MAP_CLIP_START(map, first, start);
2286 		/*
2287 		 * Split last at end.
2288 		 * Make tmp be the first entry after what is to be touched.
2289 		 * If last is not wired, don't touch it.
2290 		 */
2291 		if (!VM_MAPENT_ISWIRED(last)) {
2292 			UVM_MAP_CLIP_END(map, last, end);
2293 			tmp = RBT_NEXT(uvm_map_addr, last);
2294 		} else
2295 			tmp = last;
2296 
2297 		return uvm_map_pageable_wire(map, first, tmp, start, end,
2298 		    lockflags);
2299 	}
2300 }
2301 
2302 /*
2303  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
2304  * all mapped regions.
2305  *
2306  * Map must not be locked.
2307  * If no flags are specified, all regions are unwired.
2308  */
2309 int
2310 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
2311 {
2312 	vsize_t size;
2313 	struct vm_map_entry *iter;
2314 
2315 	KASSERT(map->flags & VM_MAP_PAGEABLE);
2316 	vm_map_lock(map);
2317 
2318 	if (flags == 0) {
2319 		uvm_map_pageable_pgon(map, RBT_MIN(uvm_map_addr, &map->addr),
2320 		    NULL, map->min_offset, map->max_offset);
2321 
2322 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
2323 		vm_map_unlock(map);
2324 		return 0;
2325 	}
2326 
2327 	if (flags & MCL_FUTURE)
2328 		vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
2329 	if (!(flags & MCL_CURRENT)) {
2330 		vm_map_unlock(map);
2331 		return 0;
2332 	}
2333 
2334 	/*
2335 	 * Count number of pages in all non-wired entries.
2336 	 * If the number exceeds the limit, abort.
2337 	 */
2338 	size = 0;
2339 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2340 		if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter))
2341 			continue;
2342 
2343 		size += iter->end - iter->start;
2344 	}
2345 
2346 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
2347 		vm_map_unlock(map);
2348 		return ENOMEM;
2349 	}
2350 
2351 	/* XXX non-pmap_wired_count case must be handled by caller */
2352 #ifdef pmap_wired_count
2353 	if (limit != 0 &&
2354 	    size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) {
2355 		vm_map_unlock(map);
2356 		return ENOMEM;
2357 	}
2358 #endif
2359 
2360 	/*
2361 	 * uvm_map_pageable_wire will release the lock.
2362 	 */
2363 	return uvm_map_pageable_wire(map, RBT_MIN(uvm_map_addr, &map->addr),
2364 	    NULL, map->min_offset, map->max_offset, 0);
2365 }
2366 
2367 /*
2368  * Initialize map.
2369  *
2370  * Allocates sufficient entries to describe the free memory in the map.
2371  */
2372 void
2373 uvm_map_setup(struct vm_map *map, vaddr_t min, vaddr_t max, int flags)
2374 {
2375 	int i;
2376 
2377 	KASSERT((min & (vaddr_t)PAGE_MASK) == 0);
2378 	KASSERT((max & (vaddr_t)PAGE_MASK) == 0 ||
2379 	    (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
2380 
2381 	/*
2382 	 * Update parameters.
2383 	 *
2384 	 * This code handles (vaddr_t)-1 and other page mask ending addresses
2385 	 * properly.
2386 	 * We lose the top page if the full virtual address space is used.
2387 	 */
2388 	if (max & (vaddr_t)PAGE_MASK) {
2389 		max += 1;
2390 		if (max == 0) /* overflow */
2391 			max -= PAGE_SIZE;
2392 	}
2393 
2394 	RBT_INIT(uvm_map_addr, &map->addr);
2395 	map->uaddr_exe = NULL;
2396 	for (i = 0; i < nitems(map->uaddr_any); ++i)
2397 		map->uaddr_any[i] = NULL;
2398 	map->uaddr_brk_stack = NULL;
2399 
2400 	map->size = 0;
2401 	map->ref_count = 0;
2402 	map->min_offset = min;
2403 	map->max_offset = max;
2404 	map->b_start = map->b_end = 0; /* Empty brk() area by default. */
2405 	map->s_start = map->s_end = 0; /* Empty stack area by default. */
2406 	map->flags = flags;
2407 	map->timestamp = 0;
2408 	rw_init_flags(&map->lock, "vmmaplk", RWL_DUPOK);
2409 	mtx_init(&map->mtx, IPL_VM);
2410 	mtx_init(&map->flags_lock, IPL_VM);
2411 
2412 	/* Configure the allocators. */
2413 	if (flags & VM_MAP_ISVMSPACE)
2414 		uvm_map_setup_md(map);
2415 	else
2416 		map->uaddr_any[3] = &uaddr_kbootstrap;
2417 
2418 	/*
2419 	 * Fill map entries.
2420 	 * We do not need to write-lock the map here because only the current
2421 	 * thread sees it right now. Initialize ref_count to 0 above to avoid
2422 	 * bogus triggering of lock-not-held assertions.
2423 	 */
2424 	uvm_map_setup_entries(map);
2425 	uvm_tree_sanity(map, __FILE__, __LINE__);
2426 	map->ref_count = 1;
2427 }
2428 
2429 /*
2430  * Destroy the map.
2431  *
2432  * This is the inverse operation to uvm_map_setup.
2433  */
2434 void
2435 uvm_map_teardown(struct vm_map *map)
2436 {
2437 	struct uvm_map_deadq	 dead_entries;
2438 	struct vm_map_entry	*entry, *tmp;
2439 #ifdef VMMAP_DEBUG
2440 	size_t			 numq, numt;
2441 #endif
2442 	int			 i;
2443 
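	/*
	 * Drop the kernel lock for the teardown: the walk below yields
	 * between entries and uvm_unmap_detach() may sleep (UVM_PLA_WAITOK).
	 */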
2444 	KERNEL_ASSERT_LOCKED();
2445 	KERNEL_UNLOCK();
2446 	KERNEL_ASSERT_UNLOCKED();
2447 
2448 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
2449 
2450 	/* Remove address selectors. */
2451 	uvm_addr_destroy(map->uaddr_exe);
2452 	map->uaddr_exe = NULL;
2453 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2454 		uvm_addr_destroy(map->uaddr_any[i]);
2455 		map->uaddr_any[i] = NULL;
2456 	}
2457 	uvm_addr_destroy(map->uaddr_brk_stack);
2458 	map->uaddr_brk_stack = NULL;
2459 
2460 	/*
2461 	 * Remove entries.
2462 	 *
2463 	 * The following is based on graph breadth-first search.
2464 	 *
2465 	 * In color terms:
2466 	 * - the dead_entries set contains all nodes that are reachable
2467 	 *   (i.e. both the black and the grey nodes)
2468 	 * - any entry not in dead_entries is white
2469 	 * - any entry that appears in dead_entries before entry
2470 	 *   is black, the rest are grey.
2471 	 * The set [entry, end] is also referred to as the wavefront.
2472 	 *
2473 	 * Since the tree is always a fully connected graph, the breadth-first
2474 	 * search guarantees that each vmmap_entry is visited exactly once.
2475 	 * The vm_map is broken down in linear time.
2476 	 */
2477 	TAILQ_INIT(&dead_entries);
2478 	if ((entry = RBT_ROOT(uvm_map_addr, &map->addr)) != NULL)
2479 		DEAD_ENTRY_PUSH(&dead_entries, entry);
2480 	while (entry != NULL) {
2481 		sched_pause(yield);
2482 		uvm_unmap_kill_entry(map, entry);
2483 		if ((tmp = RBT_LEFT(uvm_map_addr, entry)) != NULL)
2484 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2485 		if ((tmp = RBT_RIGHT(uvm_map_addr, entry)) != NULL)
2486 			DEAD_ENTRY_PUSH(&dead_entries, tmp);
2487 		/* Update wave-front. */
2488 		entry = TAILQ_NEXT(entry, dfree.deadq);
2489 	}
2490 
2491 #ifdef VMMAP_DEBUG
2492 	numt = numq = 0;
2493 	RBT_FOREACH(entry, uvm_map_addr, &map->addr)
2494 		numt++;
2495 	TAILQ_FOREACH(entry, &dead_entries, dfree.deadq)
2496 		numq++;
2497 	KASSERT(numt == numq);
2498 #endif
2499 	uvm_unmap_detach(&dead_entries, UVM_PLA_WAITOK);
2500 
2501 	KERNEL_LOCK();
2502 
2503 	pmap_destroy(map->pmap);
2504 	map->pmap = NULL;
2505 }
2506 
2507 /*
2508  * Populate map with free-memory entries.
2509  *
2510  * Map must be initialized and empty.
2511  */
2512 void
2513 uvm_map_setup_entries(struct vm_map *map)
2514 {
2515 	KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
2516 
2517 	uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0);
2518 }
2519 
2520 /*
2521  * Split entry at given address.
2522  *
2523  * orig:  entry that is to be split.
2524  * next:  a newly allocated map entry that is not linked.
2525  * split: address at which the split is done.
2526  */
2527 void
2528 uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig,
2529     struct vm_map_entry *next, vaddr_t split)
2530 {
2531 	struct uvm_addr_state *free, *free_before;
2532 	vsize_t adj;
2533 
2534 	if ((split & PAGE_MASK) != 0) {
2535 		panic("uvm_map_splitentry: split address 0x%lx "
2536 		    "not on page boundary!", split);
2537 	}
2538 	KDASSERT(map != NULL && orig != NULL && next != NULL);
2539 	uvm_tree_sanity(map, __FILE__, __LINE__);
2540 	KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split);
2541 
2542 #ifdef VMMAP_DEBUG
2543 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, orig) == orig);
2544 	KDASSERT(RBT_FIND(uvm_map_addr, &map->addr, next) != next);
2545 #endif /* VMMAP_DEBUG */
2546 
2547 	/*
2548 	 * Free space will change, unlink from free space tree.
2549 	 */
2550 	free = uvm_map_uaddr_e(map, orig);
2551 	uvm_mapent_free_remove(map, free, orig);
2552 
2553 	adj = split - orig->start;
2554 
2555 	uvm_mapent_copy(orig, next);
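	/*
	 * If the split point lies at or beyond orig->end it falls in the
	 * free space behind orig: next becomes an empty entry (start ==
	 * end) carrying only the trailing free space.  Otherwise the
	 * mapping itself is split and the amap/object references are
	 * adjusted below.
	 */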
2556 	if (split >= orig->end) {
2557 		next->etype = 0;
2558 		next->offset = 0;
2559 		next->wired_count = 0;
2560 		next->start = next->end = split;
2561 		next->guard = 0;
2562 		next->fspace = VMMAP_FREE_END(orig) - split;
2563 		next->aref.ar_amap = NULL;
2564 		next->aref.ar_pageoff = 0;
2565 		orig->guard = MIN(orig->guard, split - orig->end);
2566 		orig->fspace = split - VMMAP_FREE_START(orig);
2567 	} else {
2568 		orig->fspace = 0;
2569 		orig->guard = 0;
2570 		orig->end = next->start = split;
2571 
2572 		if (next->aref.ar_amap) {
2573 			KERNEL_LOCK();
2574 			amap_splitref(&orig->aref, &next->aref, adj);
2575 			KERNEL_UNLOCK();
2576 		}
2577 		if (UVM_ET_ISSUBMAP(orig)) {
2578 			uvm_map_reference(next->object.sub_map);
2579 			next->offset += adj;
2580 		} else if (UVM_ET_ISOBJ(orig)) {
2581 			if (next->object.uvm_obj->pgops &&
2582 			    next->object.uvm_obj->pgops->pgo_reference) {
2583 				KERNEL_LOCK();
2584 				next->object.uvm_obj->pgops->pgo_reference(
2585 				    next->object.uvm_obj);
2586 				KERNEL_UNLOCK();
2587 			}
2588 			next->offset += adj;
2589 		}
2590 	}
2591 
2592 	/*
2593 	 * Link next into address tree.
2594 	 * Link orig and next into free-space tree.
2595 	 *
2596 	 * Don't insert 'next' into the addr tree until orig has been linked,
2597 	 * in case the free-list looks at adjacent entries in the addr tree
2598 	 * for its decisions.
2599 	 */
2600 	if (orig->fspace > 0)
2601 		free_before = free;
2602 	else
2603 		free_before = uvm_map_uaddr_e(map, orig);
2604 	uvm_mapent_free_insert(map, free_before, orig);
2605 	uvm_mapent_addr_insert(map, next);
2606 	uvm_mapent_free_insert(map, free, next);
2607 
2608 	uvm_tree_sanity(map, __FILE__, __LINE__);
2609 }
2610 
2611 
2612 #ifdef VMMAP_DEBUG
2613 
2614 void
2615 uvm_tree_assert(struct vm_map *map, int test, char *test_str,
2616     char *file, int line)
2617 {
2618 	char* map_special;
2619 
2620 	if (test)
2621 		return;
2622 
2623 	if (map == kernel_map)
2624 		map_special = " (kernel_map)";
2625 	else if (map == kmem_map)
2626 		map_special = " (kmem_map)";
2627 	else
2628 		map_special = "";
2629 	panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file,
2630 	    line, test_str);
2631 }
2632 
2633 /*
2634  * Check that map is sane.
2635  */
2636 void
2637 uvm_tree_sanity(struct vm_map *map, char *file, int line)
2638 {
2639 	struct vm_map_entry	*iter;
2640 	vaddr_t			 addr;
2641 	vaddr_t			 min, max, bound; /* Bounds checker. */
2642 	struct uvm_addr_state	*free;
2643 
2644 	addr = vm_map_min(map);
2645 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2646 		/*
2647 		 * Valid start, end.
2648 		 * Catch overflow for end+fspace.
2649 		 */
2650 		UVM_ASSERT(map, iter->end >= iter->start, file, line);
2651 		UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line);
2652 
2653 		/* May not be empty. */
2654 		UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter),
2655 		    file, line);
2656 
2657 		/* Addresses for entry must lie within map boundaries. */
2658 		UVM_ASSERT(map, iter->start >= vm_map_min(map) &&
2659 		    VMMAP_FREE_END(iter) <= vm_map_max(map), file, line);
2660 
2661 		/* Tree may not have gaps. */
2662 		UVM_ASSERT(map, iter->start == addr, file, line);
2663 		addr = VMMAP_FREE_END(iter);
2664 
2665 		/*
2666 		 * Free space may not cross boundaries, unless the same
2667 		 * free list is used on both sides of the border.
2668 		 */
2669 		min = VMMAP_FREE_START(iter);
2670 		max = VMMAP_FREE_END(iter);
2671 
2672 		while (min < max &&
2673 		    (bound = uvm_map_boundary(map, min, max)) != max) {
2674 			UVM_ASSERT(map,
2675 			    uvm_map_uaddr(map, bound - 1) ==
2676 			    uvm_map_uaddr(map, bound),
2677 			    file, line);
2678 			min = bound;
2679 		}
2680 
2681 		free = uvm_map_uaddr_e(map, iter);
2682 		if (free) {
2683 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0,
2684 			    file, line);
2685 		} else {
2686 			UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0,
2687 			    file, line);
2688 		}
2689 	}
2690 	UVM_ASSERT(map, addr == vm_map_max(map), file, line);
2691 }
2692 
2693 void
2694 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
2695 {
2696 	struct vm_map_entry *iter;
2697 	vsize_t size;
2698 
2699 	size = 0;
2700 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2701 		if (!UVM_ET_ISHOLE(iter))
2702 			size += iter->end - iter->start;
2703 	}
2704 
2705 	if (map->size != size)
2706 		printf("map size = 0x%lx, should be 0x%lx\n", map->size, size);
2707 	UVM_ASSERT(map, map->size == size, file, line);
2708 
2709 	vmspace_validate(map);
2710 }
2711 
2712 /*
2713  * This function validates the statistics on vmspace.
2714  */
2715 void
2716 vmspace_validate(struct vm_map *map)
2717 {
2718 	struct vmspace *vm;
2719 	struct vm_map_entry *iter;
2720 	vaddr_t imin, imax;
2721 	vaddr_t stack_begin, stack_end; /* Position of stack. */
2722 	vsize_t stack, heap; /* Measured sizes. */
2723 
2724 	if (!(map->flags & VM_MAP_ISVMSPACE))
2725 		return;
2726 
2727 	vm = (struct vmspace *)map;
2728 	stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2729 	stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
2730 
2731 	stack = heap = 0;
2732 	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
2733 		imin = imax = iter->start;
2734 
2735 		if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL)
2736 			continue;
2737 
2738 		/*
2739 		 * Update stack, heap.
2740 		 * Keep in mind that (theoretically) the entries of
2741 		 * userspace and stack may be joined.
2742 		 */
2743 		while (imin != iter->end) {
2744 			/*
2745 			 * Set imax to the first boundary crossed between
2746 			 * imin and stack addresses.
2747 			 */
2748 			imax = iter->end;
2749 			if (imin < stack_begin && imax > stack_begin)
2750 				imax = stack_begin;
2751 			else if (imin < stack_end && imax > stack_end)
2752 				imax = stack_end;
2753 
2754 			if (imin >= stack_begin && imin < stack_end)
2755 				stack += imax - imin;
2756 			else
2757 				heap += imax - imin;
2758 			imin = imax;
2759 		}
2760 	}
2761 
2762 	heap >>= PAGE_SHIFT;
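	/* Convert the measured heap size from bytes to pages. */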
2763 	if (heap != vm->vm_dused) {
2764 		printf("vmspace stack range: 0x%lx-0x%lx\n",
2765 		    stack_begin, stack_end);
2766 		panic("vmspace_validate: vmspace.vm_dused invalid, "
2767 		    "expected %ld pgs, got %ld pgs in map %p",
2768 		    heap, vm->vm_dused,
2769 		    map);
2770 	}
2771 }
2772 
2773 #endif /* VMMAP_DEBUG */
2774 
2775 /*
2776  * uvm_map_init: init mapping system at boot time.   note that we allocate
2777  * and init the static pool of structs vm_map_entry for the kernel here.
2778  */
2779 void
2780 uvm_map_init(void)
2781 {
2782 	static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
2783 	int lcv;
2784 
2785 	/* now set up static pool of kernel map entries ... */
2786 	mtx_init(&uvm_kmapent_mtx, IPL_VM);
2787 	SLIST_INIT(&uvm.kentry_free);
2788 	for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
2789 		SLIST_INSERT_HEAD(&uvm.kentry_free,
2790 		    &kernel_map_entry[lcv], daddrs.addr_kentry);
2791 	}
2792 
2793 	/* initialize the map-related pools. */
2794 	pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), 0,
2795 	    IPL_NONE, PR_WAITOK, "vmsppl", NULL);
2796 	pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), 0,
2797 	    IPL_VM, PR_WAITOK, "vmmpepl", NULL);
2798 	pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), 0,
2799 	    IPL_VM, 0, "vmmpekpl", NULL);
2800 	pool_sethiwat(&uvm_map_entry_pool, 8192);
2801 
2802 	uvm_addr_init();
2803 }
2804 
2805 #if defined(DDB)
2806 
2807 /*
2808  * DDB hooks
2809  */
2810 
2811 /*
2812  * uvm_map_printit: actually prints the map
2813  */
2814 void
2815 uvm_map_printit(struct vm_map *map, boolean_t full,
2816     int (*pr)(const char *, ...))
2817 {
2818 	struct vmspace			*vm;
2819 	struct vm_map_entry		*entry;
2820 	struct uvm_addr_state		*free;
2821 	int				 in_free, i;
2822 	char				 buf[8];
2823 
2824 	(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
2825 	(*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n",
2826 	    map->b_start, map->b_end);
2827 	(*pr)("\tstack allocate range: 0x%lx-0x%lx\n",
2828 	    map->s_start, map->s_end);
2829 	(*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n",
2830 	    map->size, map->ref_count, map->timestamp,
2831 	    map->flags);
2832 	(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
2833 	    pmap_resident_count(map->pmap));
2834 
2835 	/* struct vmspace handling. */
2836 	if (map->flags & VM_MAP_ISVMSPACE) {
2837 		vm = (struct vmspace *)map;
2838 
2839 		(*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n",
2840 		    vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss);
2841 		(*pr)("\tvm_tsize=%u vm_dsize=%u\n",
2842 		    vm->vm_tsize, vm->vm_dsize);
2843 		(*pr)("\tvm_taddr=%p vm_daddr=%p\n",
2844 		    vm->vm_taddr, vm->vm_daddr);
2845 		(*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n",
2846 		    vm->vm_maxsaddr, vm->vm_minsaddr);
2847 	}
2848 
2849 	if (!full)
2850 		goto print_uaddr;
2851 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
2852 		(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
2853 		    entry, entry->start, entry->end, entry->object.uvm_obj,
2854 		    (long long)entry->offset, entry->aref.ar_amap,
2855 		    entry->aref.ar_pageoff);
2856 		(*pr)("\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
2857 		    "wc=%d, adv=%d\n",
2858 		    (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
2859 		    (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
2860 		    (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
2861 		    entry->protection, entry->max_protection,
2862 		    entry->inheritance, entry->wired_count, entry->advice);
2863 
2864 		free = uvm_map_uaddr_e(map, entry);
2865 		in_free = (free != NULL);
2866 		(*pr)("\thole=%c, free=%c, guard=0x%lx, "
2867 		    "free=0x%lx-0x%lx\n",
2868 		    (entry->etype & UVM_ET_HOLE) ? 'T' : 'F',
2869 		    in_free ? 'T' : 'F',
2870 		    entry->guard,
2871 		    VMMAP_FREE_START(entry), VMMAP_FREE_END(entry));
2872 		(*pr)("\tfspace_augment=%lu\n", entry->fspace_augment);
2873 		(*pr)("\tfreemapped=%c, uaddr=%p\n",
2874 		    (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free);
2875 		if (free) {
2876 			(*pr)("\t\t(0x%lx-0x%lx %s)\n",
2877 			    free->uaddr_minaddr, free->uaddr_maxaddr,
2878 			    free->uaddr_functions->uaddr_name);
2879 		}
2880 	}
2881 
2882 print_uaddr:
2883 	uvm_addr_print(map->uaddr_exe, "exe", full, pr);
2884 	for (i = 0; i < nitems(map->uaddr_any); i++) {
2885 		snprintf(&buf[0], sizeof(buf), "any[%d]", i);
2886 		uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr);
2887 	}
2888 	uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr);
2889 }
2890 
2891 /*
2892  * uvm_object_printit: actually prints the object
2893  */
2894 void
2895 uvm_object_printit(struct uvm_object *uobj, boolean_t full,
2896     int (*pr)(const char *, ...))
2899 {
2900 	struct vm_page *pg;
2901 	int cnt = 0;
2902 
2903 	(*pr)("OBJECT %p: pgops=%p, npages=%d, ",
2904 	    uobj, uobj->pgops, uobj->uo_npages);
2905 	if (UVM_OBJ_IS_KERN_OBJECT(uobj))
2906 		(*pr)("refs=<SYSTEM>\n");
2907 	else
2908 		(*pr)("refs=%d\n", uobj->uo_refs);
2909 
2910 	if (!full) {
2911 		return;
2912 	}
2913 	(*pr)("  PAGES <pg,offset>:\n  ");
2914 	RBT_FOREACH(pg, uvm_objtree, &uobj->memt) {
2915 		(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
2916 		if ((cnt % 3) == 2) {
2917 			(*pr)("\n  ");
2918 		}
2919 		cnt++;
2920 	}
2921 	if ((cnt % 3) != 2) {
2922 		(*pr)("\n");
2923 	}
2924 }
2925 
2926 /*
2927  * uvm_page_printit: actually print the page
2928  */
2929 static const char page_flagbits[] =
2930 	"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
2931 	"\11ZERO\12DEV\15PAGER1\21FREE\22INACTIVE\23ACTIVE\25ANON\26AOBJ"
2932 	"\27ENCRYPT\31PMAP0\32PMAP1\33PMAP2\34PMAP3\35PMAP4\36PMAP5";
2933 
2934 void
2935 uvm_page_printit(struct vm_page *pg, boolean_t full,
2936     int (*pr)(const char *, ...))
2939 {
2940 	struct vm_page *tpg;
2941 	struct uvm_object *uobj;
2942 	struct pglist *pgl;
2943 
2944 	(*pr)("PAGE %p:\n", pg);
2945 	(*pr)("  flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n",
2946 	    pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count,
2947 	    (long long)pg->phys_addr);
2948 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
2949 	    pg->uobject, pg->uanon, (long long)pg->offset);
2950 #if defined(UVM_PAGE_TRKOWN)
2951 	if (pg->pg_flags & PG_BUSY)
2952 		(*pr)("  owning thread = %d, tag=%s",
2953 		    pg->owner, pg->owner_tag);
2954 	else
2955 		(*pr)("  page not busy, no owner");
2956 #else
2957 	(*pr)("  [page ownership tracking disabled]");
2958 #endif
2959 	(*pr)("\tvm_page_md %p\n", &pg->mdpage);
2960 
2961 	if (!full)
2962 		return;
2963 
2964 	/* cross-verify object/anon */
2965 	if ((pg->pg_flags & PQ_FREE) == 0) {
2966 		if (pg->pg_flags & PQ_ANON) {
2967 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
2968 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2969 				(pg->uanon) ? pg->uanon->an_page : NULL);
2970 			else
2971 				(*pr)("  anon backpointer is OK\n");
2972 		} else {
2973 			uobj = pg->uobject;
2974 			if (uobj) {
2975 				(*pr)("  checking object list\n");
2976 				RBT_FOREACH(tpg, uvm_objtree, &uobj->memt) {
2977 					if (tpg == pg) {
2978 						break;
2979 					}
2980 				}
2981 				if (tpg)
2982 					(*pr)("  page found on object list\n");
2983 				else
2984 					(*pr)("  >>> PAGE NOT FOUND "
2985 					    "ON OBJECT LIST! <<<\n");
2986 			}
2987 		}
2988 	}
2989 
2990 	/* cross-verify page queue */
2991 	if (pg->pg_flags & PQ_FREE) {
2992 		if (uvm_pmr_isfree(pg))
2993 			(*pr)("  page found in uvm_pmemrange\n");
2994 		else
2995 			(*pr)("  >>> page not found in uvm_pmemrange <<<\n");
2996 		pgl = NULL;
2997 	} else if (pg->pg_flags & PQ_INACTIVE) {
2998 		pgl = (pg->pg_flags & PQ_SWAPBACKED) ?
2999 		    &uvm.page_inactive_swp : &uvm.page_inactive_obj;
3000 	} else if (pg->pg_flags & PQ_ACTIVE) {
3001 		pgl = &uvm.page_active;
3002 	} else {
3003 		pgl = NULL;
3004 	}
3005 
3006 	if (pgl) {
3007 		(*pr)("  checking pageq list\n");
3008 		TAILQ_FOREACH(tpg, pgl, pageq) {
3009 			if (tpg == pg) {
3010 				break;
3011 			}
3012 		}
3013 		if (tpg)
3014 			(*pr)("  page found on pageq list\n");
3015 		else
3016 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
3017 	}
3018 }
3019 #endif
3020 
3021 /*
3022  * uvm_map_protect: change map protection
3023  *
3024  * => set_max means set max_protection.
3025  * => map must be unlocked.
3026  */
3027 int
3028 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3029     vm_prot_t new_prot, boolean_t set_max)
3030 {
3031 	struct vm_map_entry *first, *iter;
3032 	vm_prot_t old_prot;
3033 	vm_prot_t mask;
3034 	int error;
3035 
3036 	if (start > end)
3037 		return EINVAL;
3038 	start = MAX(start, map->min_offset);
3039 	end = MIN(end, map->max_offset);
3040 	if (start >= end)
3041 		return 0;
3042 
3043 	error = 0;
3044 	vm_map_lock(map);
3045 
3046 	/*
3047 	 * Set up first and last.
3048 	 * - first will contain first entry at or after start.
3049 	 */
3050 	first = uvm_map_entrybyaddr(&map->addr, start);
3051 	KDASSERT(first != NULL);
3052 	if (first->end < start)
3053 		first = RBT_NEXT(uvm_map_addr, first);
3054 
3055 	/* First, check for protection violations. */
3056 	for (iter = first; iter != NULL && iter->start < end;
3057 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3058 		/* Treat memory holes as free space. */
3059 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3060 			continue;
3061 
3062 		if (UVM_ET_ISSUBMAP(iter)) {
3063 			error = EINVAL;
3064 			goto out;
3065 		}
3066 		if ((new_prot & iter->max_protection) != new_prot) {
3067 			error = EACCES;
3068 			goto out;
3069 		}
3070 		if (map == kernel_map &&
3071 		    (new_prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC))
3072 			panic("uvm_map_protect: kernel map W^X violation requested");
3073 	}
3074 
3075 	/* Fix protections.  */
3076 	for (iter = first; iter != NULL && iter->start < end;
3077 	    iter = RBT_NEXT(uvm_map_addr, iter)) {
3078 		/* Treat memory holes as free space. */
3079 		if (iter->start == iter->end || UVM_ET_ISHOLE(iter))
3080 			continue;
3081 
3082 		old_prot = iter->protection;
3083 
3084 		/*
3085 		 * Skip adapting protection iff old and new protection
3086 		 * are equal.
3087 		 */
3088 		if (set_max) {
3089 			if (old_prot == (new_prot & old_prot) &&
3090 			    iter->max_protection == new_prot)
3091 				continue;
3092 		} else {
3093 			if (old_prot == new_prot)
3094 				continue;
3095 		}
3096 
3097 		UVM_MAP_CLIP_START(map, iter, start);
3098 		UVM_MAP_CLIP_END(map, iter, end);
3099 
3100 		if (set_max) {
3101 			iter->max_protection = new_prot;
3102 			iter->protection &= new_prot;
3103 		} else
3104 			iter->protection = new_prot;
3105 
3106 		/*
3107 		 * update physical map if necessary.  worry about copy-on-write
3108 		 * here -- CHECK THIS XXX
3109 		 */
3110 		if (iter->protection != old_prot) {
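			/*
			 * Copy-on-write entries never get PROT_WRITE in the
			 * pmap; the write fault performs the copy.
			 */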
3111 			mask = UVM_ET_ISCOPYONWRITE(iter) ?
3112 			    ~PROT_WRITE : PROT_MASK;
3113 
3114 			/* update pmap */
3115 			if ((iter->protection & mask) == PROT_NONE &&
3116 			    VM_MAPENT_ISWIRED(iter)) {
3117 				/*
3118 				 * TODO(ariane) this is stupid. wired_count
3119 				 * is 0 if not wired, otherwise anything
3120 				 * larger than 0 (incremented once each time
3121 				 * wire is called).
3122 				 * Mostly to be able to undo the damage on
3123 				 * failure. Not to actually be a wired
3124 				 * refcounter...
3125 				 * Originally: iter->wired_count--;
3126 				 * (don't we have to unwire this in the pmap
3127 				 * as well?)
3128 				 */
3129 				iter->wired_count = 0;
3130 			}
3131 			pmap_protect(map->pmap, iter->start, iter->end,
3132 			    iter->protection & mask);
3133 		}
3134 
3135 		/*
3136 		 * If the map is configured to lock any future mappings,
3137 		 * wire this entry now if the old protection was PROT_NONE
3138 		 * and the new protection is not PROT_NONE.
3139 		 */
3140 		if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3141 		    VM_MAPENT_ISWIRED(iter) == 0 &&
3142 		    old_prot == PROT_NONE &&
3143 		    new_prot != PROT_NONE) {
3144 			if (uvm_map_pageable(map, iter->start, iter->end,
3145 			    FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) {
3146 				/*
3147 				 * If locking the entry fails, remember the
3148 				 * error if it's the first one.  Note we
3149 				 * still continue setting the protection in
3150 				 * the map, but it will return the resource
3151 				 * storage condition regardless.
3152 				 *
3153 				 * XXX Ignore what the actual error is,
3154 				 * XXX just call it a resource shortage
3155 				 * XXX so that it doesn't get confused
3156 				 * XXX what uvm_map_protect() itself would
3157 				 * XXX normally return.
3158 				 */
3159 				error = ENOMEM;
3160 			}
3161 		}
3162 	}
3163 	pmap_update(map->pmap);
3164 
3165 out:
3166 	vm_map_unlock(map);
3167 	return error;
3168 }
3169 
3170 /*
3171  * uvmspace_alloc: allocate a vmspace structure.
3172  *
3173  * - structure includes vm_map and pmap
3174  * - XXX: no locking on this structure
3175  * - refcnt set to 1, rest must be init'd by caller
3176  */
3177 struct vmspace *
3178 uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable,
3179     boolean_t remove_holes)
3180 {
3181 	struct vmspace *vm;
3182 
3183 	vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO);
3184 	uvmspace_init(vm, NULL, min, max, pageable, remove_holes);
3185 	return (vm);
3186 }
3187 
3188 /*
3189  * uvmspace_init: initialize a vmspace structure.
3190  *
3191  * - XXX: no locking on this structure
3192  * - refcnt set to 1, rest must be init'd by caller
3193  */
3194 void
3195 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max,
3196     boolean_t pageable, boolean_t remove_holes)
3197 {
3198 	KASSERT(pmap == NULL || pmap == pmap_kernel());
3199 
3200 	if (pmap)
3201 		pmap_reference(pmap);
3202 	else
3203 		pmap = pmap_create();
3204 	vm->vm_map.pmap = pmap;
3205 
3206 	uvm_map_setup(&vm->vm_map, min, max,
3207 	    (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE);
3208 
3209 	vm->vm_refcnt = 1;
3210 
3211 	if (remove_holes)
3212 		pmap_remove_holes(vm);
3213 }
3214 
3215 /*
3216  * uvmspace_share: share a vmspace between two processes
3217  *
3218  * - XXX: no locking on vmspace
3219  * - used for vfork
3220  */
3221 
3222 struct vmspace *
3223 uvmspace_share(struct process *pr)
3224 {
3225 	struct vmspace *vm = pr->ps_vmspace;
3226 
3227 	vm->vm_refcnt++;
3228 	return vm;
3229 }
3230 
3231 /*
3232  * uvmspace_exec: the process wants to exec a new program
3233  *
3234  * - XXX: no locking on vmspace
3235  */
3236 
3237 void
3238 uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end)
3239 {
3240 	struct process *pr = p->p_p;
3241 	struct vmspace *nvm, *ovm = pr->ps_vmspace;
3242 	struct vm_map *map = &ovm->vm_map;
3243 	struct uvm_map_deadq dead_entries;
3244 
3245 	KASSERT((start & (vaddr_t)PAGE_MASK) == 0);
3246 	KASSERT((end & (vaddr_t)PAGE_MASK) == 0 ||
3247 	    (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK);
3248 
3249 	pmap_unuse_final(p);   /* before stack addresses go away */
3250 	TAILQ_INIT(&dead_entries);
3251 
3252 	/* see if more than one process is using this vmspace...  */
3253 	if (ovm->vm_refcnt == 1) {
3254 		/*
3255 		 * If pr is the only process using its vmspace then
3256 		 * we can safely recycle that vmspace for the program
3257 		 * that is being exec'd.
3258 		 */
3259 
3260 #ifdef SYSVSHM
3261 		/*
3262 		 * SYSV SHM semantics require us to kill all segments on an exec
3263 		 */
3264 		if (ovm->vm_shm)
3265 			shmexit(ovm);
3266 #endif
3267 
3268 		/*
3269 		 * POSIX 1003.1b -- "lock future mappings" is revoked
3270 		 * when a process execs another program image.
3271 		 */
3272 		vm_map_lock(map);
3273 		vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
3274 
3275 		/*
3276 		 * now unmap the old program
3277 		 *
3278 		 * Instead of attempting to keep the map valid, we simply
3279 		 * nuke all entries and ask uvm_map_setup to reinitialize
3280 		 * the map to the new boundaries.
3281 		 *
3282 		 * uvm_unmap_remove will actually nuke all entries for us
3283 		 * (as in, not replace them with free-memory entries).
3284 		 */
3285 		uvm_unmap_remove(map, map->min_offset, map->max_offset,
3286 		    &dead_entries, TRUE, FALSE);
3287 
3288 		KDASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
3289 
3290 		/* Nuke statistics and boundaries. */
3291 		memset(&ovm->vm_startcopy, 0,
3292 		    (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy);
3293 
3294 
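		/*
		 * Round end up when it ends on a page-mask boundary, as
		 * uvm_map_setup() does; on overflow we lose the top page.
		 */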
3295 		if (end & (vaddr_t)PAGE_MASK) {
3296 			end += 1;
3297 			if (end == 0) /* overflow */
3298 				end -= PAGE_SIZE;
3299 		}
3300 
3301 		/* Setup new boundaries and populate map with entries. */
3302 		map->min_offset = start;
3303 		map->max_offset = end;
3304 		uvm_map_setup_entries(map);
3305 		vm_map_unlock(map);
3306 
3307 		/* but keep MMU holes unavailable */
3308 		pmap_remove_holes(ovm);
3309 	} else {
3310 		/*
3311 		 * pr's vmspace is being shared, so we can't reuse
3312 		 * it for pr since it is still being used for others.
3313 		 * allocate a new vmspace for pr
3314 		 */
3315 		nvm = uvmspace_alloc(start, end,
3316 		    (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE);
3317 
3318 		/* install new vmspace and drop our ref to the old one. */
3319 		pmap_deactivate(p);
3320 		p->p_vmspace = pr->ps_vmspace = nvm;
3321 		pmap_activate(p);
3322 
3323 		uvmspace_free(ovm);
3324 	}
3325 
3326 	/* Release dead entries */
3327 	uvm_unmap_detach(&dead_entries, 0);
3328 }
3329 
3330 /*
3331  * uvmspace_free: free a vmspace data structure
3332  *
3333  * - XXX: no locking on vmspace
3334  */
3335 void
3336 uvmspace_free(struct vmspace *vm)
3337 {
3338 	if (--vm->vm_refcnt == 0) {
3339 		/*
3340 		 * lock the map, to wait out all other references to it.  delete
3341 		 * all of the mappings and pages they hold, then call the pmap
3342 		 * module to reclaim anything left.
3343 		 */
3344 #ifdef SYSVSHM
3345 		/* Get rid of any SYSV shared memory segments. */
3346 		if (vm->vm_shm != NULL)
3347 			shmexit(vm);
3348 #endif
3349 
3350 		uvm_map_teardown(&vm->vm_map);
3351 		pool_put(&uvm_vmspace_pool, vm);
3352 	}
3353 }
3354 
3355 /*
3356  * uvm_share: Map the address range [srcaddr, srcaddr + sz) in
3357  * srcmap to the address range [dstaddr, dstaddr + sz) in
3358  * dstmap.
3359  *
3360  * The whole address range in srcmap must be backed by an object
3361  * (no holes).
3362  *
3363  * If successful, the address ranges share memory and the destination
3364  * address range uses the protection flags in prot.
3365  *
3366  * This routine assumes that sz is a multiple of PAGE_SIZE and
3367  * that dstaddr and srcaddr are page-aligned.
3368  */
3369 int
3370 uvm_share(struct vm_map *dstmap, vaddr_t dstaddr, vm_prot_t prot,
3371     struct vm_map *srcmap, vaddr_t srcaddr, vsize_t sz)
3372 {
3373 	int ret = 0;
3374 	vaddr_t unmap_end;
3375 	vaddr_t dstva;
3376 	vsize_t off, len, n = sz;
3377 	struct vm_map_entry *first = NULL, *last = NULL;
3378 	struct vm_map_entry *src_entry, *psrc_entry = NULL;
3379 	struct uvm_map_deadq dead;
3380 
3381 	if (srcaddr >= srcmap->max_offset || sz > srcmap->max_offset - srcaddr)
3382 		return EINVAL;
3383 
3384 	TAILQ_INIT(&dead);
3385 	vm_map_lock(dstmap);
3386 	vm_map_lock_read(srcmap);
3387 
3388 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, sz)) {
3389 		ret = ENOMEM;
3390 		goto exit_unlock;
3391 	}
3392 	if (!uvm_map_lookup_entry(srcmap, srcaddr, &src_entry)) {
3393 		ret = EINVAL;
3394 		goto exit_unlock;
3395 	}
3396 
3397 	unmap_end = dstaddr;
3398 	for (; src_entry != NULL;
3399 	    psrc_entry = src_entry,
3400 	    src_entry = RBT_NEXT(uvm_map_addr, src_entry)) {
3401 		/* hole in address space, bail out */
3402 		if (psrc_entry != NULL && psrc_entry->end != src_entry->start)
3403 			break;
3404 		if (src_entry->start >= srcaddr + sz)
3405 			break;
3406 
3407 		if (UVM_ET_ISSUBMAP(src_entry))
3408 			panic("uvm_share: encountered a submap (illegal)");
3409 		if (!UVM_ET_ISCOPYONWRITE(src_entry) &&
3410 		    UVM_ET_ISNEEDSCOPY(src_entry))
3411 			panic("uvm_share: non-copy_on_write map entries "
3412 			    "marked needs_copy (illegal)");
3413 
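		/*
		 * Compute the destination va and the offset into this source
		 * entry for the part of [srcaddr, srcaddr + sz) that the
		 * entry covers.
		 */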
3414 		dstva = dstaddr;
3415 		if (src_entry->start > srcaddr) {
3416 			dstva += src_entry->start - srcaddr;
3417 			off = 0;
3418 		} else
3419 			off = srcaddr - src_entry->start;
3420 
3421 		if (n < src_entry->end - src_entry->start)
3422 			len = n;
3423 		else
3424 			len = src_entry->end - src_entry->start;
3425 		n -= len;
3426 
3427 		if (uvm_mapent_share(dstmap, dstva, len, off, prot, prot,
3428 		    srcmap, src_entry, &dead) == NULL)
3429 			break;
3430 
3431 		unmap_end = dstva + len;
3432 		if (n == 0)
3433 			goto exit_unlock;
3434 	}
3435 
3436 	ret = EINVAL;
3437 	uvm_unmap_remove(dstmap, dstaddr, unmap_end, &dead, FALSE, TRUE);
3438 
3439 exit_unlock:
3440 	vm_map_unlock_read(srcmap);
3441 	vm_map_unlock(dstmap);
3442 	uvm_unmap_detach(&dead, 0);
3443 
3444 	return ret;
3445 }
3446 
3447 /*
3448  * Clone map entry into other map.
3449  *
3450  * Mapping will be placed at dstaddr, for the same length.
3451  * Space must be available.
3452  * Reference counters are incremented.
3453  */
3454 struct vm_map_entry *
3455 uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3456     vsize_t off, vm_prot_t prot, vm_prot_t maxprot,
3457     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead,
3458     int mapent_flags, int amap_share_flags)
3459 {
3460 	struct vm_map_entry *new_entry, *first, *last;
3461 
3462 	KDASSERT(!UVM_ET_ISSUBMAP(old_entry));
3463 
3464 	/* Create new entry (linked in on creation). Fill in first, last. */
3465 	first = last = NULL;
3466 	if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) {
3467 		panic("uvm_mapent_clone: no space in map for "
3468 		    "entry in empty map");
3469 	}
3470 	new_entry = uvm_map_mkentry(dstmap, first, last,
3471 	    dstaddr, dstlen, mapent_flags, dead, NULL);
3472 	if (new_entry == NULL)
3473 		return NULL;
3474 	/* old_entry -> new_entry */
3475 	new_entry->object = old_entry->object;
3476 	new_entry->offset = old_entry->offset;
3477 	new_entry->aref = old_entry->aref;
3478 	new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED;
3479 	new_entry->protection = prot;
3480 	new_entry->max_protection = maxprot;
3481 	new_entry->inheritance = old_entry->inheritance;
3482 	new_entry->advice = old_entry->advice;
3483 
3484 	/* gain reference to object backing the map (can't be a submap). */
3485 	if (new_entry->aref.ar_amap) {
3486 		new_entry->aref.ar_pageoff += off >> PAGE_SHIFT;
3487 		amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3488 		    (new_entry->end - new_entry->start) >> PAGE_SHIFT,
3489 		    amap_share_flags);
3490 	}
3491 
3492 	if (UVM_ET_ISOBJ(new_entry) &&
3493 	    new_entry->object.uvm_obj->pgops->pgo_reference) {
3494 		new_entry->offset += off;
3495 		new_entry->object.uvm_obj->pgops->pgo_reference
3496 		    (new_entry->object.uvm_obj);
3497 	}
3498 
3499 	return new_entry;
3500 }
3501 
3502 struct vm_map_entry *
3503 uvm_mapent_share(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen,
3504     vsize_t off, vm_prot_t prot, vm_prot_t maxprot, struct vm_map *old_map,
3505     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3506 {
3507 	/*
3508 	 * If old_entry refers to a copy-on-write region that has not yet been
3509 	 * written to (needs_copy flag is set), then we need to allocate a new
3510 	 * amap for old_entry.
3511 	 *
3512 	 * If we do not do this, and the process owning old_entry does a copy-on-
3513 	 * write later, old_entry and new_entry will refer to different memory
3514 	 * regions, and the memory between the processes is no longer shared.
3515 	 *
3516 	 * [in other words, we need to clear needs_copy]
3517 	 */
3518 
3519 	if (UVM_ET_ISNEEDSCOPY(old_entry)) {
3520 		/* get our own amap, clears needs_copy */
3521 		amap_copy(old_map, old_entry, M_WAITOK, FALSE,
3522 		    0, 0);
3523 		/* XXXCDC: WAITOK??? */
3524 	}
3525 
3526 	return uvm_mapent_clone(dstmap, dstaddr, dstlen, off,
3527 	    prot, maxprot, old_entry, dead, 0, AMAP_SHARED);
3528 }
3529 
3530 /*
3531  * share the mapping: this means we want the old and
3532  * new entries to share amaps and backing objects.
3533  */
3534 struct vm_map_entry *
3535 uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map,
3536     struct vm_map *old_map,
3537     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3538 {
3539 	struct vm_map_entry *new_entry;
3540 
3541 	new_entry = uvm_mapent_share(new_map, old_entry->start,
3542 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3543 	    old_entry->max_protection, old_map, old_entry, dead);
3544 
3545 	/*
3546 	 * pmap_copy the mappings: this routine is optional
3547 	 * but if it is there it will reduce the number of
3548 	 * page faults in the new proc.
3549 	 */
3550 	if (!UVM_ET_ISHOLE(new_entry))
3551 		pmap_copy(new_map->pmap, old_map->pmap, new_entry->start,
3552 		    (new_entry->end - new_entry->start), new_entry->start);
3553 
3554 	return (new_entry);
3555 }
3556 
3557 /*
3558  * copy-on-write the mapping (using mmap's
3559  * MAP_PRIVATE semantics)
3560  *
3561  * allocate new_entry, adjust reference counts.
3562  * (note that new references are read-only).
3563  */
3564 struct vm_map_entry *
3565 uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map,
3566     struct vm_map *old_map,
3567     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3568 {
3569 	struct vm_map_entry	*new_entry;
3570 	boolean_t		 protect_child;
3571 
3572 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3573 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3574 	    old_entry->max_protection, old_entry, dead, 0, 0);
3575 
3576 	new_entry->etype |=
3577 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3578 
3579 	/*
3580 	 * the new entry will need an amap.  it will either
3581 	 * need to be copied from the old entry or created
3582 	 * from scratch (if the old entry does not have an
3583 	 * amap).  can we defer this process until later
3584 	 * (by setting "needs_copy") or do we need to copy
3585 	 * the amap now?
3586 	 *
3587 	 * we must copy the amap now if any of the following
3588 	 * conditions hold:
3589 	 * 1. the old entry has an amap and that amap is
3590 	 *    being shared.  this means that the old (parent)
3591 	 *    process is sharing the amap with another
3592 	 *    process.  if we do not clear needs_copy here
3593 	 *    we will end up in a situation where both the
3594 	 *    parent and child process are referring to the
3595 	 *    same amap with "needs_copy" set.  if the
3596 	 *    parent write-faults, the fault routine will
3597 	 *    clear "needs_copy" in the parent by allocating
3598 	 *    a new amap.   this is wrong because the
3599 	 *    parent is supposed to be sharing the old amap
3600 	 *    and the new amap will break that.
3601 	 *
3602 	 * 2. if the old entry has an amap and a non-zero
3603 	 *    wire count then we are going to have to call
3604 	 *    amap_cow_now to avoid page faults in the
3605 	 *    parent process.   since amap_cow_now requires
3606 	 *    "needs_copy" to be clear we might as well
3607 	 *    clear it here as well.
3608 	 *
3609 	 */
3610 	if (old_entry->aref.ar_amap != NULL &&
3611 	    ((amap_flags(old_entry->aref.ar_amap) &
3612 	    AMAP_SHARED) != 0 ||
3613 	    VM_MAPENT_ISWIRED(old_entry))) {
3614 		amap_copy(new_map, new_entry, M_WAITOK, FALSE,
3615 		    0, 0);
3616 		/* XXXCDC: M_WAITOK ... ok? */
3617 	}
3618 
3619 	/*
3620 	 * if the parent's entry is wired down, then the
3621 	 * parent process does not want page faults on
3622 	 * access to that memory.  this means that we
3623 	 * cannot do copy-on-write because we can't write
3624 	 * protect the old entry.   in this case we
3625 	 * resolve all copy-on-write faults now, using
3626 	 * amap_cow_now.   note that we have already
3627 	 * allocated any needed amap (above).
3628 	 */
3629 	if (VM_MAPENT_ISWIRED(old_entry)) {
3630 		/*
3631 		 * resolve all copy-on-write faults now
3632 		 * (note that there is nothing to do if
3633 		 * the old mapping does not have an amap).
3634 		 * XXX: is it worthwhile to bother with
3635 		 * pmap_copy in this case?
3636 		 */
3637 		if (old_entry->aref.ar_amap)
3638 			amap_cow_now(new_map, new_entry);
3639 	} else {
3640 		if (old_entry->aref.ar_amap) {
3641 			/*
3642 			 * setup mappings to trigger copy-on-write faults
3643 			 * we must write-protect the parent if it has
3644 			 * an amap and it is not already "needs_copy"...
3645 			 * if it is already "needs_copy" then the parent
3646 			 * has already been write-protected by a previous
3647 			 * fork operation.
3648 			 *
3649 			 * if we do not write-protect the parent, then
3650 			 * we must be sure to write-protect the child
3651 			 * after the pmap_copy() operation.
3652 			 *
3653 			 * XXX: pmap_copy should have some way of telling
3654 			 * us that it didn't do anything so we can avoid
3655 			 * calling pmap_protect needlessly.
3656 			 */
3657 			if (!UVM_ET_ISNEEDSCOPY(old_entry)) {
3658 				if (old_entry->max_protection & PROT_WRITE) {
3659 					pmap_protect(old_map->pmap,
3660 					    old_entry->start,
3661 					    old_entry->end,
3662 					    old_entry->protection &
3663 					    ~PROT_WRITE);
3664 					pmap_update(old_map->pmap);
3665 				}
3666 				old_entry->etype |= UVM_ET_NEEDSCOPY;
3667 			}
3668 
3669 	  		/* parent must now be write-protected */
3670 	  		protect_child = FALSE;
3671 		} else {
3672 			/*
3673 			 * we only need to protect the child if the
3674 			 * parent has write access.
3675 			 */
3676 			if (old_entry->max_protection & PROT_WRITE)
3677 				protect_child = TRUE;
3678 			else
3679 				protect_child = FALSE;
3680 		}
3681 		/*
3682 		 * copy the mappings
3683 		 * XXX: need a way to tell if this does anything
3684 		 */
3685 		if (!UVM_ET_ISHOLE(new_entry))
3686 			pmap_copy(new_map->pmap, old_map->pmap,
3687 			    new_entry->start,
3688 			    (old_entry->end - old_entry->start),
3689 			    old_entry->start);
3690 
3691 		/* protect the child's mappings if necessary */
3692 		if (protect_child) {
3693 			pmap_protect(new_map->pmap, new_entry->start,
3694 			    new_entry->end,
3695 			    new_entry->protection &
3696 			    ~PROT_WRITE);
3697 		}
3698 	}
3699 
3700 	return (new_entry);
3701 }
3702 
3703 /*
3704  * zero the mapping: the new entry will be zero initialized
3705  */
3706 struct vm_map_entry *
3707 uvm_mapent_forkzero(struct vmspace *new_vm, struct vm_map *new_map,
3708     struct vm_map *old_map,
3709     struct vm_map_entry *old_entry, struct uvm_map_deadq *dead)
3710 {
3711 	struct vm_map_entry *new_entry;
3712 
3713 	new_entry = uvm_mapent_clone(new_map, old_entry->start,
3714 	    old_entry->end - old_entry->start, 0, old_entry->protection,
3715 	    old_entry->max_protection, old_entry, dead, 0, 0);
3716 
3717 	new_entry->etype |=
3718 	    (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
3719 
3720 	if (new_entry->aref.ar_amap) {
3721 		amap_unref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff,
3722 		    atop(new_entry->end - new_entry->start), 0);
3723 		new_entry->aref.ar_amap = NULL;
3724 		new_entry->aref.ar_pageoff = 0;
3725 	}
3726 
3727 	if (UVM_ET_ISOBJ(new_entry)) {
3728 		if (new_entry->object.uvm_obj->pgops->pgo_detach)
3729 			new_entry->object.uvm_obj->pgops->pgo_detach(
3730 			    new_entry->object.uvm_obj);
3731 		new_entry->object.uvm_obj = NULL;
3732 		new_entry->etype &= ~UVM_ET_OBJ;
3733 	}
3734 
3735 	return (new_entry);
3736 }
3737 
3738 /*
3739  * uvmspace_fork: fork a process' main map
3740  *
3741  * => create a new vmspace for child process from parent.
3742  * => parent's map must not be locked.
3743  */
3744 struct vmspace *
3745 uvmspace_fork(struct process *pr)
3746 {
3747 	struct vmspace *vm1 = pr->ps_vmspace;
3748 	struct vmspace *vm2;
3749 	struct vm_map *old_map = &vm1->vm_map;
3750 	struct vm_map *new_map;
3751 	struct vm_map_entry *old_entry, *new_entry;
3752 	struct uvm_map_deadq dead;
3753 
3754 	vm_map_lock(old_map);
3755 
3756 	vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
3757 	    (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE);
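	/*
	 * Copy the region of the vmspace from vm_startcopy to the end of
	 * the structure; the map itself is rebuilt entry by entry below.
	 */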
3758 	memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
3759 	    (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
3760 	vm2->vm_dused = 0; /* Statistic managed by us. */
3761 	new_map = &vm2->vm_map;
3762 	vm_map_lock(new_map);
3763 
3764 	/* go entry-by-entry */
3765 	TAILQ_INIT(&dead);
3766 	RBT_FOREACH(old_entry, uvm_map_addr, &old_map->addr) {
3767 		if (old_entry->start == old_entry->end)
3768 			continue;
3769 
3770 		/* first, some sanity checks on the old entry */
3771 		if (UVM_ET_ISSUBMAP(old_entry)) {
3772 			panic("fork: encountered a submap during fork "
3773 			    "(illegal)");
3774 		}
3775 
3776 		if (!UVM_ET_ISCOPYONWRITE(old_entry) &&
3777 		    UVM_ET_ISNEEDSCOPY(old_entry)) {
3778 			panic("fork: non-copy_on_write map entry marked "
3779 			    "needs_copy (illegal)");
3780 		}
3781 
3782 		/* Apply inheritance. */
3783 		switch (old_entry->inheritance) {
3784 		case MAP_INHERIT_SHARE:
3785 			new_entry = uvm_mapent_forkshared(vm2, new_map,
3786 			    old_map, old_entry, &dead);
3787 			break;
3788 		case MAP_INHERIT_COPY:
3789 			new_entry = uvm_mapent_forkcopy(vm2, new_map,
3790 			    old_map, old_entry, &dead);
3791 			break;
3792 		case MAP_INHERIT_ZERO:
3793 			new_entry = uvm_mapent_forkzero(vm2, new_map,
3794 			    old_map, old_entry, &dead);
3795 			break;
3796 		default:
3797 			continue;
3798 		}
3799 
3800 	 	/* Update process statistics. */
3801 		if (!UVM_ET_ISHOLE(new_entry))
3802 			new_map->size += new_entry->end - new_entry->start;
3803 		if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) {
3804 			vm2->vm_dused += uvmspace_dused(
3805 			    new_map, new_entry->start, new_entry->end);
3806 		}
3807 	}
3808 
3809 	vm_map_unlock(old_map);
3810 	vm_map_unlock(new_map);
3811 
3812 	/*
3813 	 * This can actually happen if multiple entries described a
3814 	 * space in which an entry was inherited.
3815 	 */
3816 	uvm_unmap_detach(&dead, 0);
3817 
3818 #ifdef SYSVSHM
3819 	if (vm1->vm_shm)
3820 		shmfork(vm1, vm2);
3821 #endif
3822 
3823 	return vm2;
3824 }
3825 
3826 /*
3827  * uvm_map_hint: return the beginning of the best area suitable for
3828  * creating a new mapping with "prot" protection.
3829  */
3830 vaddr_t
3831 uvm_map_hint(struct vmspace *vm, vm_prot_t prot, vaddr_t minaddr,
3832     vaddr_t maxaddr)
3833 {
3834 	vaddr_t addr;
3835 	vaddr_t spacing;
3836 
3837 #ifdef __i386__
3838 	/*
3839 	 * If executable, skip the first two pages; otherwise start
3840 	 * after the data + heap region.
3841 	 */
3842 	if ((prot & PROT_EXEC) != 0 &&
3843 	    (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) {
3844 		addr = (PAGE_SIZE*2) +
3845 		    (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1));
3846 		return (round_page(addr));
3847 	}
3848 #endif
3849 
3850 #if defined (__LP64__)
3851 	spacing = (MIN((4UL * 1024 * 1024 * 1024), BRKSIZ) - 1);
3852 #else
3853 	spacing = (MIN((256 * 1024 * 1024), BRKSIZ) - 1);
3854 #endif
3855 
3856 	addr = (vaddr_t)vm->vm_daddr;
3857 	/*
3858 	 * Start malloc/mmap after the brk.
3859 	 * If the random spacing area has been used up,
3860 	 * the brk area becomes fair game for mmap as well.
3861 	 */
3862 	if (vm->vm_dused < spacing >> PAGE_SHIFT)
3863 		addr += BRKSIZ;
3864 	if (addr < maxaddr) {
3865 		while (spacing > maxaddr - addr)
3866 			spacing >>= 1;
3867 	}
3868 	addr += arc4random() & spacing;
3869 	return (round_page(addr));
3870 }
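
/*
 * Worked example of the computation above (illustrative numbers only;
 * the BRKSIZ value is an assumption): on an LP64 machine with BRKSIZ
 * taken as 1GB, spacing = MIN(4GB, BRKSIZ) - 1 = 1GB - 1.  With
 * vm_daddr at 0x1000000000 and little of the spacing used up
 * (vm_dused small), the hint becomes
 *
 *	round_page(0x1000000000 + BRKSIZ + (arc4random() & spacing))
 *
 * i.e. a page-aligned address past the brk area with up to ~1GB of
 * random slack, reduced if that would run past maxaddr.
 */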
3871 
3872 /*
3873  * uvm_map_submap: punch down part of a map into a submap
3874  *
3875  * => only the kernel_map is allowed to be submapped
3876  * => the purpose of submapping is to break up the locking granularity
3877  *	of a larger map
3878  * => the range specified must have been mapped previously with a uvm_map()
3879  *	call [with uobj==NULL] to create a blank map entry in the main map.
3880  *	[And it had better still be blank!]
3881  * => maps which contain submaps should never be copied or forked.
3882  * => to remove a submap, use uvm_unmap() on the main map
3883  *	and then uvm_map_deallocate() the submap.
3884  * => main map must be unlocked.
3885  * => submap must have been init'd and have a zero reference count.
3886  *	[need not be locked as we don't actually reference it]
3887  */
3888 int
3889 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
3890     struct vm_map *submap)
3891 {
3892 	struct vm_map_entry *entry;
3893 	int result;
3894 
3895 	if (start > map->max_offset || end > map->max_offset ||
3896 	    start < map->min_offset || end < map->min_offset)
3897 		return EINVAL;
3898 
3899 	vm_map_lock(map);
3900 
3901 	if (uvm_map_lookup_entry(map, start, &entry)) {
3902 		UVM_MAP_CLIP_START(map, entry, start);
3903 		UVM_MAP_CLIP_END(map, entry, end);
3904 	} else
3905 		entry = NULL;
3906 
3907 	if (entry != NULL &&
3908 	    entry->start == start && entry->end == end &&
3909 	    entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
3910 	    !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
3911 		entry->etype |= UVM_ET_SUBMAP;
3912 		entry->object.sub_map = submap;
3913 		entry->offset = 0;
3914 		uvm_map_reference(submap);
3915 		result = 0;
3916 	} else
3917 		result = EINVAL;
3918 
3919 	vm_map_unlock(map);
3920 	return (result);
3921 }
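
/*
 * Usage sketch for the protocol described above (hypothetical caller;
 * in-tree code normally goes through uvm_km_suballoc() rather than
 * calling this directly):
 *
 *	submap = uvm_map_create(pmap_kernel(), base, base + sz, flags);
 *	(map a blank [uobj == NULL] entry covering base..base+sz in
 *	 kernel_map with uvm_map(), then)
 *	error = uvm_map_submap(kernel_map, base, base + sz, submap);
 */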
3922 
3923 /*
3924  * uvm_map_checkprot: check protection in map
3925  *
3926  * => must allow specific protection in a fully allocated region.
3927  * => map must be read or write locked by caller.
3928  */
3929 boolean_t
3930 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
3931     vm_prot_t protection)
3932 {
3933 	struct vm_map_entry *entry;
3934 
3935 	if (start < map->min_offset || end > map->max_offset || start > end)
3936 		return FALSE;
3937 	if (start == end)
3938 		return TRUE;
3939 
3940 	/*
3941 	 * Iterate entries.
3942 	 */
3943 	for (entry = uvm_map_entrybyaddr(&map->addr, start);
3944 	    entry != NULL && entry->start < end;
3945 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
3946 		/* Fail if a hole is found. */
3947 		if (UVM_ET_ISHOLE(entry) ||
3948 		    (entry->end < end && entry->end != VMMAP_FREE_END(entry)))
3949 			return FALSE;
3950 
3951 		/* Check protection. */
3952 		if ((entry->protection & protection) != protection)
3953 			return FALSE;
3954 	}
3955 	return TRUE;
3956 }
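
/*
 * Example (sketch): a caller that already holds the map lock can verify
 * that a range is fully mapped and readable before touching it:
 *
 *	if (!uvm_map_checkprot(map, start, end, PROT_READ))
 *		return EFAULT;
 */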
3957 
3958 /*
3959  * uvm_map_create: create map
3960  */
3961 vm_map_t
3962 uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags)
3963 {
3964 	vm_map_t map;
3965 
3966 	map = malloc(sizeof *map, M_VMMAP, M_WAITOK);
3967 	map->pmap = pmap;
3968 	uvm_map_setup(map, min, max, flags);
3969 	return (map);
3970 }
3971 
3972 /*
3973  * uvm_map_deallocate: drop reference to a map
3974  *
3975  * => caller must not lock map
3976  * => we will zap map if ref count goes to zero
3977  */
3978 void
3979 uvm_map_deallocate(vm_map_t map)
3980 {
3981 	int c;
3982 	struct uvm_map_deadq dead;
3983 
3984 	c = --map->ref_count;
3985 	if (c > 0) {
3986 		return;
3987 	}
3988 
3989 	/*
3990 	 * all references gone.   unmap and free.
3991 	 *
3992 	 * No lock required: we are only one to access this map.
3993 	 */
3994 	TAILQ_INIT(&dead);
3995 	uvm_tree_sanity(map, __FILE__, __LINE__);
3996 	uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead,
3997 	    TRUE, FALSE);
3998 	pmap_destroy(map->pmap);
3999 	KASSERT(RBT_EMPTY(uvm_map_addr, &map->addr));
4000 	free(map, M_VMMAP, sizeof *map);
4001 
4002 	uvm_unmap_detach(&dead, 0);
4003 }
4004 
4005 /*
4006  * uvm_map_inherit: set inheritance code for range of addrs in map.
4007  *
4008  * => map must be unlocked
4009  * => note that the inherit code is used during a "fork".  see fork
4010  *	code for details.
4011  */
4012 int
4013 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
4014     vm_inherit_t new_inheritance)
4015 {
4016 	struct vm_map_entry *entry;
4017 
4018 	switch (new_inheritance) {
4019 	case MAP_INHERIT_NONE:
4020 	case MAP_INHERIT_COPY:
4021 	case MAP_INHERIT_SHARE:
4022 	case MAP_INHERIT_ZERO:
4023 		break;
4024 	default:
4025 		return (EINVAL);
4026 	}
4027 
4028 	if (start > end)
4029 		return EINVAL;
4030 	start = MAX(start, map->min_offset);
4031 	end = MIN(end, map->max_offset);
4032 	if (start >= end)
4033 		return 0;
4034 
4035 	vm_map_lock(map);
4036 
4037 	entry = uvm_map_entrybyaddr(&map->addr, start);
4038 	if (entry->end > start)
4039 		UVM_MAP_CLIP_START(map, entry, start);
4040 	else
4041 		entry = RBT_NEXT(uvm_map_addr, entry);
4042 
4043 	while (entry != NULL && entry->start < end) {
4044 		UVM_MAP_CLIP_END(map, entry, end);
4045 		entry->inheritance = new_inheritance;
4046 		entry = RBT_NEXT(uvm_map_addr, entry);
4047 	}
4048 
4049 	vm_map_unlock(map);
4050 	return (0);
4051 }
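
/*
 * This is the backend of minherit(2).  A userland sketch (assuming the
 * usual <sys/mman.h> constants) that makes a private scratch region
 * come up zero-filled in future children:
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *	if (minherit(p, len, MAP_INHERIT_ZERO) == -1)
 *		err(1, "minherit");
 *
 * After a later fork(), the child sees zero-filled pages for that range
 * (the MAP_INHERIT_ZERO case handled by uvm_mapent_forkzero() above).
 */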
4052 
4053 /*
4054  * uvm_map_advice: set advice code for range of addrs in map.
4055  *
4056  * => map must be unlocked
4057  */
4058 int
4059 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
4060 {
4061 	struct vm_map_entry *entry;
4062 
4063 	switch (new_advice) {
4064 	case MADV_NORMAL:
4065 	case MADV_RANDOM:
4066 	case MADV_SEQUENTIAL:
4067 		break;
4068 	default:
4069 		return (EINVAL);
4070 	}
4071 
4072 	if (start > end)
4073 		return EINVAL;
4074 	start = MAX(start, map->min_offset);
4075 	end = MIN(end, map->max_offset);
4076 	if (start >= end)
4077 		return 0;
4078 
4079 	vm_map_lock(map);
4080 
4081 	entry = uvm_map_entrybyaddr(&map->addr, start);
4082 	if (entry != NULL && entry->end > start)
4083 		UVM_MAP_CLIP_START(map, entry, start);
4084 	else if (entry != NULL)
4085 		entry = RBT_NEXT(uvm_map_addr, entry);
4086 
4087 	/*
4088 	 * XXXJRT: disallow holes?
4089 	 */
4090 	while (entry != NULL && entry->start < end) {
4091 		UVM_MAP_CLIP_END(map, entry, end);
4092 		entry->advice = new_advice;
4093 		entry = RBT_NEXT(uvm_map_addr, entry);
4094 	}
4095 
4096 	vm_map_unlock(map);
4097 	return (0);
4098 }
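
/*
 * This is reached via madvise(2) for the advice values accepted above;
 * the remaining madvise requests take other paths in the kernel.
 * Userland sketch:
 *
 *	if (madvise(buf, len, MADV_SEQUENTIAL) == -1)
 *		warn("madvise");
 *
 * which simply tags the covered entries so the fault path can take the
 * access pattern into account.
 */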
4099 
4100 /*
4101  * uvm_map_extract: extract a mapping from a map and put it somewhere
4102  * in the kernel_map, setting protection to max_prot.
4103  *
4104  * => map should be unlocked (we will write lock it and kernel_map)
4105  * => returns 0 on success, error code otherwise
4106  * => start must be page aligned
4107  * => len must be page sized
4108  * => flags:
4109  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
4110  * Mappings are QREF's.
4111  */
4112 int
4113 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
4114     vaddr_t *dstaddrp, int flags)
4115 {
4116 	struct uvm_map_deadq dead;
4117 	struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2;
4118 	vaddr_t dstaddr;
4119 	vaddr_t end;
4120 	vaddr_t cp_start;
4121 	vsize_t cp_len, cp_off;
4122 	int error;
4123 
4124 	TAILQ_INIT(&dead);
4125 	end = start + len;
4126 
4127 	/*
4128 	 * Sanity check on the parameters.
4129 	 * Also, since the mapping may not contain gaps, error out if the
4130 	 * mapped area is not in the source map.
4131 	 */
4132 	if ((start & (vaddr_t)PAGE_MASK) != 0 ||
4133 	    (end & (vaddr_t)PAGE_MASK) != 0 || end < start)
4134 		return EINVAL;
4135 	if (start < srcmap->min_offset || end > srcmap->max_offset)
4136 		return EINVAL;
4137 
4138 	/* Handle the len == 0 case (the dead queue was initialized above). */
4139 	if (len == 0)
4140 		return 0;
4141 
4142 	/* Acquire lock on srcmap. */
4143 	vm_map_lock(srcmap);
4144 
4145 	/* srcmap is locked; look up the first entry in <start, len>. */
4146 	first = uvm_map_entrybyaddr(&srcmap->addr, start);
4147 
4148 	/* Check that the range is contiguous. */
4149 	for (entry = first; entry != NULL && entry->end < end;
4150 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4151 		if (VMMAP_FREE_END(entry) != entry->end ||
4152 		    UVM_ET_ISHOLE(entry)) {
4153 			error = EINVAL;
4154 			goto fail;
4155 		}
4156 	}
4157 	if (entry == NULL || UVM_ET_ISHOLE(entry)) {
4158 		error = EINVAL;
4159 		goto fail;
4160 	}
4161 
4162 	/*
4163 	 * Handle need-copy flag.
4164 	 */
4165 	for (entry = first; entry != NULL && entry->start < end;
4166 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4167 		if (UVM_ET_ISNEEDSCOPY(entry))
4168 			amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end);
4169 		if (UVM_ET_ISNEEDSCOPY(entry)) {
4170 			/*
4171 			 * amap_copy failure
4172 			 */
4173 			error = ENOMEM;
4174 			goto fail;
4175 		}
4176 	}
4177 
4178 	/* Lock destination map (kernel_map). */
4179 	vm_map_lock(kernel_map);
4180 
4181 	if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len,
4182 	    MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()), PMAP_PREFER_OFFSET(start),
4183 	    PROT_NONE, 0) != 0) {
4184 		error = ENOMEM;
4185 		goto fail2;
4186 	}
4187 	*dstaddrp = dstaddr;
4188 
4189 	/*
4190 	 * We now have srcmap and kernel_map locked.
4191 	 * dstaddr contains the destination offset in dstmap.
4192 	 */
4193 	/* step 1: start looping through map entries, performing extraction. */
4194 	for (entry = first; entry != NULL && entry->start < end;
4195 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4196 		KDASSERT(!UVM_ET_ISNEEDSCOPY(entry));
4197 		if (UVM_ET_ISHOLE(entry))
4198 			continue;
4199 
4200 		/* Calculate uvm_mapent_clone parameters. */
4201 		cp_start = entry->start;
4202 		if (cp_start < start) {
4203 			cp_off = start - cp_start;
4204 			cp_start = start;
4205 		} else
4206 			cp_off = 0;
4207 		cp_len = MIN(entry->end, end) - cp_start;
4208 
4209 		newentry = uvm_mapent_clone(kernel_map,
4210 		    cp_start - start + dstaddr, cp_len, cp_off,
4211 		    entry->protection, entry->max_protection,
4212 		    entry, &dead, flags, AMAP_SHARED | AMAP_REFALL);
4213 		if (newentry == NULL) {
4214 			error = ENOMEM;
4215 			goto fail2_unmap;
4216 		}
4217 		kernel_map->size += cp_len;
4218 		if (flags & UVM_EXTRACT_FIXPROT)
4219 			newentry->protection = newentry->max_protection;
4220 
4221 		/*
4222 		 * Step 2: perform pmap copy.
4223 		 * (Doing this in the loop saves one RB traversal.)
4224 		 */
4225 		pmap_copy(kernel_map->pmap, srcmap->pmap,
4226 		    cp_start - start + dstaddr, cp_len, cp_start);
4227 	}
4228 	pmap_update(kernel_map->pmap);
4229 
4230 	error = 0;
4231 
4232 	/* Unmap copied entries on failure. */
4233 fail2_unmap:
4234 	if (error) {
4235 		uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead,
4236 		    FALSE, TRUE);
4237 	}
4238 
4239 	/* Release maps, release dead entries. */
4240 fail2:
4241 	vm_map_unlock(kernel_map);
4242 
4243 fail:
4244 	vm_map_unlock(srcmap);
4245 
4246 	uvm_unmap_detach(&dead, 0);
4247 
4248 	return error;
4249 }
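
/*
 * Usage sketch (hypothetical caller, error handling trimmed): mirror a
 * page-aligned user range into kernel_map and drop it again when done.
 *
 *	vaddr_t kva;
 *
 *	error = uvm_map_extract(&p->p_vmspace->vm_map, uva, len, &kva,
 *	    UVM_EXTRACT_FIXPROT);
 *	if (error == 0) {
 *		... access the user memory through kva ...
 *		uvm_unmap(kernel_map, kva, kva + len);
 *	}
 */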
4250 
4251 /*
4252  * uvm_map_clean: clean out a map range
4253  *
4254  * => valid flags:
4255  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
4256  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
4257  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
4258  *   if (flags & PGO_FREE): any cached pages are freed after clean
4259  * => returns an error if any part of the specified range isn't mapped
4260  * => never a need to flush amap layer since the anonymous memory has
4261  *	no permanent home, but may deactivate pages there
4262  * => called from sys_msync() and sys_madvise()
4263  * => caller must not write-lock map (read OK).
4264  * => we may sleep while cleaning if SYNCIO [with map read-locked]
4265  */
4266 
4267 int
4268 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4269 {
4270 	struct vm_map_entry *first, *entry;
4271 	struct vm_amap *amap;
4272 	struct vm_anon *anon;
4273 	struct vm_page *pg;
4274 	struct uvm_object *uobj;
4275 	vaddr_t cp_start, cp_end;
4276 	int refs;
4277 	int error;
4278 	boolean_t rv;
4279 
4280 	KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
4281 	    (PGO_FREE|PGO_DEACTIVATE));
4282 
4283 	if (start > end || start < map->min_offset || end > map->max_offset)
4284 		return EINVAL;
4285 
4286 	vm_map_lock_read(map);
4287 	first = uvm_map_entrybyaddr(&map->addr, start);
4288 
4289 	/* Make a first pass to check for holes. */
4290 	for (entry = first; entry != NULL && entry->start < end;
4291 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4292 		if (UVM_ET_ISSUBMAP(entry)) {
4293 			vm_map_unlock_read(map);
4294 			return EINVAL;
4295 		}
4296 		if (UVM_ET_ISSUBMAP(entry) ||
4297 		    UVM_ET_ISHOLE(entry) ||
4298 		    (entry->end < end &&
4299 		    VMMAP_FREE_END(entry) != entry->end)) {
4300 			vm_map_unlock_read(map);
4301 			return EFAULT;
4302 		}
4303 	}
4304 
4305 	error = 0;
4306 	for (entry = first; entry != NULL && entry->start < end;
4307 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4308 		amap = entry->aref.ar_amap;	/* top layer */
4309 		if (UVM_ET_ISOBJ(entry))
4310 			uobj = entry->object.uvm_obj;
4311 		else
4312 			uobj = NULL;
4313 
4314 		/*
4315 		 * No amap cleaning necessary if:
4316 		 *  - there's no amap
4317 		 *  - we're not deactivating or freeing pages.
4318 		 */
4319 		if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4320 			goto flush_object;
4321 
4322 		cp_start = MAX(entry->start, start);
4323 		cp_end = MIN(entry->end, end);
4324 
4325 		for (; cp_start != cp_end; cp_start += PAGE_SIZE) {
4326 			anon = amap_lookup(&entry->aref,
4327 			    cp_start - entry->start);
4328 			if (anon == NULL)
4329 				continue;
4330 
4331 			pg = anon->an_page;
4332 			if (pg == NULL) {
4333 				continue;
4334 			}
4335 			KASSERT(pg->pg_flags & PQ_ANON);
4336 
4337 			switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4338 			/*
4339 			 * XXX In these first 3 cases, we always just
4340 			 * XXX deactivate the page.  We may want to
4341 			 * XXX handle the different cases more
4342 			 * XXX specifically, in the future.
4343 			 */
4344 			case PGO_CLEANIT|PGO_FREE:
4345 			case PGO_CLEANIT|PGO_DEACTIVATE:
4346 			case PGO_DEACTIVATE:
4347 deactivate_it:
4348 				/* skip the page if it's wired */
4349 				if (pg->wire_count != 0)
4350 					break;
4351 
4352 				uvm_lock_pageq();
4353 
4354 				KASSERT(pg->uanon == anon);
4355 
4356 				/* zap all mappings for the page. */
4357 				pmap_page_protect(pg, PROT_NONE);
4358 
4359 				/* ...and deactivate the page. */
4360 				uvm_pagedeactivate(pg);
4361 
4362 				uvm_unlock_pageq();
4363 				break;
4364 			case PGO_FREE:
4365 				/*
4366 				 * If there are multiple references to
4367 				 * the amap, just deactivate the page.
4368 				 */
4369 				if (amap_refs(amap) > 1)
4370 					goto deactivate_it;
4371 
4372 				/* XXX skip the page if it's wired */
4373 				if (pg->wire_count != 0) {
4374 					break;
4375 				}
4376 				amap_unadd(&entry->aref,
4377 				    cp_start - entry->start);
4378 				refs = --anon->an_ref;
4379 				if (refs == 0)
4380 					uvm_anfree(anon);
4381 				break;
4382 			default:
4383 				panic("uvm_map_clean: weird flags");
4384 			}
4385 		}
4386 
4387 flush_object:
4388 		cp_start = MAX(entry->start, start);
4389 		cp_end = MIN(entry->end, end);
4390 
4391 		/*
4392 		 * flush pages if we've got a valid backing object.
4393 		 *
4394 		 * Don't PGO_FREE if we don't have write permission
4395 		 * and don't flush if this is a copy-on-write object
4396 		 * since we can't know our permissions on it.
4397 		 */
4398 		if (uobj != NULL &&
4399 		    ((flags & PGO_FREE) == 0 ||
4400 		     ((entry->max_protection & PROT_WRITE) != 0 &&
4401 		      (entry->etype & UVM_ET_COPYONWRITE) == 0))) {
4402 			rv = uobj->pgops->pgo_flush(uobj,
4403 			    cp_start - entry->start + entry->offset,
4404 			    cp_end - entry->start + entry->offset, flags);
4405 
4406 			if (rv == FALSE)
4407 				error = EFAULT;
4408 		}
4409 	}
4410 
4411 	vm_map_unlock_read(map);
4412 	return error;
4413 }
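
/*
 * Example (sketch): a synchronous writeback of a range, in the style of
 * the sys_msync() caller mentioned above, boils down to
 *
 *	error = uvm_map_clean(map, start, end, PGO_CLEANIT | PGO_SYNCIO);
 *
 * while adding PGO_FREE or PGO_DEACTIVATE (never both) also frees or
 * deactivates the cached pages after the clean.
 */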
4414 
4415 /*
4416  * UVM_MAP_CLIP_END implementation
4417  */
4418 void
4419 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4420 {
4421 	struct vm_map_entry *tmp;
4422 
4423 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4424 	tmp = uvm_mapent_alloc(map, 0);
4425 
4426 	/* Invoke splitentry. */
4427 	uvm_map_splitentry(map, entry, tmp, addr);
4428 }
4429 
4430 /*
4431  * UVM_MAP_CLIP_START implementation
4432  *
4433  * Clippers are required to not change the pointers to the entry they are
4434  * clipping on.
4435  * Since uvm_map_splitentry turns the original entry into the lowest
4436  * entry (address wise) we do a swap between the new entry and the original
4437  * entry, prior to calling uvm_map_splitentry.
4438  */
4439 void
4440 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
4441 {
4442 	struct vm_map_entry *tmp;
4443 	struct uvm_addr_state *free;
4444 
4445 	/* Unlink original. */
4446 	free = uvm_map_uaddr_e(map, entry);
4447 	uvm_mapent_free_remove(map, free, entry);
4448 	uvm_mapent_addr_remove(map, entry);
4449 
4450 	/* Copy entry. */
4451 	KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr);
4452 	tmp = uvm_mapent_alloc(map, 0);
4453 	uvm_mapent_copy(entry, tmp);
4454 
4455 	/* Put new entry in place of original entry. */
4456 	uvm_mapent_addr_insert(map, tmp);
4457 	uvm_mapent_free_insert(map, free, tmp);
4458 
4459 	/* Invoke splitentry. */
4460 	uvm_map_splitentry(map, tmp, entry, addr);
4461 }
4462 
4463 /*
4464  * Boundary fixer.
4465  */
4466 static __inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t);
4467 static __inline vaddr_t
4468 uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound)
4469 {
4470 	return (min < bound && max > bound) ? bound : max;
4471 }
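
/*
 * E.g. uvm_map_boundfix(0x1000, 0x9000, 0x4000) == 0x4000 because the
 * boundary falls strictly inside the range, while
 * uvm_map_boundfix(0x1000, 0x9000, 0xa000) == 0x9000 because it does not.
 */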
4472 
4473 /*
4474  * Choose free list based on address at start of free space.
4475  *
4476  * The uvm_addr_state returned contains addr and is the first of:
4477  * - uaddr_exe
4478  * - uaddr_brk_stack
4479  * - uaddr_any
4480  */
4481 struct uvm_addr_state*
4482 uvm_map_uaddr(struct vm_map *map, vaddr_t addr)
4483 {
4484 	struct uvm_addr_state *uaddr;
4485 	int i;
4486 
4487 	/* Special case the first page, to prevent mmap from returning 0. */
4488 	if (addr < VMMAP_MIN_ADDR)
4489 		return NULL;
4490 
4491 	/* Upper bound for kernel maps at uvm_maxkaddr. */
4492 	if ((map->flags & VM_MAP_ISVMSPACE) == 0) {
4493 		if (addr >= uvm_maxkaddr)
4494 			return NULL;
4495 	}
4496 
4497 	/* Is the address inside the exe-only map? */
4498 	if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr &&
4499 	    addr < map->uaddr_exe->uaddr_maxaddr)
4500 		return map->uaddr_exe;
4501 
4502 	/* Check if the space falls inside brk/stack area. */
4503 	if ((addr >= map->b_start && addr < map->b_end) ||
4504 	    (addr >= map->s_start && addr < map->s_end)) {
4505 		if (map->uaddr_brk_stack != NULL &&
4506 		    addr >= map->uaddr_brk_stack->uaddr_minaddr &&
4507 		    addr < map->uaddr_brk_stack->uaddr_maxaddr) {
4508 			return map->uaddr_brk_stack;
4509 		} else
4510 			return NULL;
4511 	}
4512 
4513 	/*
4514 	 * Check the other selectors.
4515 	 *
4516 	 * These selectors are only marked as the owner, if they have insert
4517 	 * functions.
4518 	 */
4519 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4520 		uaddr = map->uaddr_any[i];
4521 		if (uaddr == NULL)
4522 			continue;
4523 		if (uaddr->uaddr_functions->uaddr_free_insert == NULL)
4524 			continue;
4525 
4526 		if (addr >= uaddr->uaddr_minaddr &&
4527 		    addr < uaddr->uaddr_maxaddr)
4528 			return uaddr;
4529 	}
4530 
4531 	return NULL;
4532 }
4533 
4534 /*
4535  * Entry-based variant of uvm_map_uaddr: choose the free list based on
4536  * the address at the start of the entry's free space.
4537  * The uvm_addr_state returned contains that address and is the first of:
4538  * - uaddr_exe
4539  * - uaddr_brk_stack
4540  * - uaddr_any
4541  */
4542 struct uvm_addr_state*
4543 uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry)
4544 {
4545 	return uvm_map_uaddr(map, VMMAP_FREE_START(entry));
4546 }
4547 
4548 /*
4549  * Returns the first free-memory boundary that is crossed by [min-max].
4550  */
4551 vsize_t
4552 uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max)
4553 {
4554 	struct uvm_addr_state	*uaddr;
4555 	int			 i;
4556 
4557 	/* Never return first page. */
4558 	max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR);
4559 
4560 	/* Treat the maxkaddr special, if the map is a kernel_map. */
4561 	if ((map->flags & VM_MAP_ISVMSPACE) == 0)
4562 		max = uvm_map_boundfix(min, max, uvm_maxkaddr);
4563 
4564 	/* Check for exe-only boundaries. */
4565 	if (map->uaddr_exe != NULL) {
4566 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr);
4567 		max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr);
4568 	}
4569 
4570 	/* Check for brk/stack boundaries. */
4571 	if (map->uaddr_brk_stack != NULL) {
4572 		max = uvm_map_boundfix(min, max,
4573 		    map->uaddr_brk_stack->uaddr_minaddr);
4574 		max = uvm_map_boundfix(min, max,
4575 		    map->uaddr_brk_stack->uaddr_maxaddr);
4576 	}
4577 
4578 	/* Check other boundaries. */
4579 	for (i = 0; i < nitems(map->uaddr_any); i++) {
4580 		uaddr = map->uaddr_any[i];
4581 		if (uaddr != NULL) {
4582 			max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr);
4583 			max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr);
4584 		}
4585 	}
4586 
4587 	/* Boundaries at stack and brk() area. */
4588 	max = uvm_map_boundfix(min, max, map->s_start);
4589 	max = uvm_map_boundfix(min, max, map->s_end);
4590 	max = uvm_map_boundfix(min, max, map->b_start);
4591 	max = uvm_map_boundfix(min, max, map->b_end);
4592 
4593 	return max;
4594 }
4595 
4596 /*
4597  * Update map allocation start and end addresses from proc vmspace.
4598  */
4599 void
4600 uvm_map_vmspace_update(struct vm_map *map,
4601     struct uvm_map_deadq *dead, int flags)
4602 {
4603 	struct vmspace *vm;
4604 	vaddr_t b_start, b_end, s_start, s_end;
4605 
4606 	KASSERT(map->flags & VM_MAP_ISVMSPACE);
4607 	KASSERT(offsetof(struct vmspace, vm_map) == 0);
4608 
4609 	/*
4610 	 * Derive actual allocation boundaries from vmspace.
4611 	 */
4612 	vm = (struct vmspace *)map;
4613 	b_start = (vaddr_t)vm->vm_daddr;
4614 	b_end   = b_start + BRKSIZ;
4615 	s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4616 	s_end   = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr);
4617 #ifdef DIAGNOSTIC
4618 	if ((b_start & (vaddr_t)PAGE_MASK) != 0 ||
4619 	    (b_end & (vaddr_t)PAGE_MASK) != 0 ||
4620 	    (s_start & (vaddr_t)PAGE_MASK) != 0 ||
4621 	    (s_end & (vaddr_t)PAGE_MASK) != 0) {
4622 		panic("uvm_map_vmspace_update: vmspace %p invalid bounds: "
4623 		    "b=0x%lx-0x%lx s=0x%lx-0x%lx",
4624 		    vm, b_start, b_end, s_start, s_end);
4625 	}
4626 #endif
4627 
4628 	if (__predict_true(map->b_start == b_start && map->b_end == b_end &&
4629 	    map->s_start == s_start && map->s_end == s_end))
4630 		return;
4631 
4632 	uvm_map_freelist_update(map, dead, b_start, b_end,
4633 	    s_start, s_end, flags);
4634 }
4635 
4636 /*
4637  * Grow kernel memory.
4638  *
4639  * This function is only called for kernel maps when an allocation fails.
4640  *
4641  * If the map has a gap that is large enough to accommodate alloc_sz, this
4642  * function will make sure map->free will include it.
4643  */
4644 void
4645 uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead,
4646     vsize_t alloc_sz, int flags)
4647 {
4648 	vsize_t sz;
4649 	vaddr_t end;
4650 	struct vm_map_entry *entry;
4651 
4652 	/* Kernel memory only. */
4653 	KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0);
4654 	/* Destroy free list. */
4655 	uvm_map_freelist_update_clear(map, dead);
4656 
4657 	/* Include the guard page in the hard minimum requirement of alloc_sz. */
4658 	if (map->flags & VM_MAP_GUARDPAGES)
4659 		alloc_sz += PAGE_SIZE;
4660 
4661 	/*
4662 	 * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA.
4663 	 *
4664 	 * Don't handle the case where the multiplication overflows:
4665 	 * if that happens, the allocation is probably too big anyway.
4666 	 */
4667 	sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA);
4668 
4669 	/*
4670 	 * Walk forward until a gap large enough for alloc_sz shows up.
4671 	 *
4672 	 * We assume the kernel map has no boundaries.
4673 	 * uvm_maxkaddr may be zero.
4674 	 */
4675 	end = MAX(uvm_maxkaddr, map->min_offset);
4676 	entry = uvm_map_entrybyaddr(&map->addr, end);
4677 	while (entry && entry->fspace < alloc_sz)
4678 		entry = RBT_NEXT(uvm_map_addr, entry);
4679 	if (entry) {
4680 		end = MAX(VMMAP_FREE_START(entry), end);
4681 		end += MIN(sz, map->max_offset - end);
4682 	} else
4683 		end = map->max_offset;
4684 
4685 	/* Reserve pmap entries. */
4686 #ifdef PMAP_GROWKERNEL
4687 	uvm_maxkaddr = pmap_growkernel(end);
4688 #else
4689 	uvm_maxkaddr = MAX(uvm_maxkaddr, end);
4690 #endif
4691 
4692 	/* Rebuild free list. */
4693 	uvm_map_freelist_update_refill(map, flags);
4694 }
4695 
4696 /*
4697  * Freelist update subfunction: unlink all entries from freelists.
4698  */
4699 void
4700 uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead)
4701 {
4702 	struct uvm_addr_state *free;
4703 	struct vm_map_entry *entry, *prev, *next;
4704 
4705 	prev = NULL;
4706 	for (entry = RBT_MIN(uvm_map_addr, &map->addr); entry != NULL;
4707 	    entry = next) {
4708 		next = RBT_NEXT(uvm_map_addr, entry);
4709 
4710 		free = uvm_map_uaddr_e(map, entry);
4711 		uvm_mapent_free_remove(map, free, entry);
4712 
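		/*
		 * Zero-length entries exist only to describe free space;
		 * fold their free space into the previous entry and
		 * recycle them via the dead queue.
		 */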
4713 		if (prev != NULL && entry->start == entry->end) {
4714 			prev->fspace += VMMAP_FREE_END(entry) - entry->end;
4715 			uvm_mapent_addr_remove(map, entry);
4716 			DEAD_ENTRY_PUSH(dead, entry);
4717 		} else
4718 			prev = entry;
4719 	}
4720 }
4721 
4722 /*
4723  * Freelist update subfunction: refill the freelists with entries.
4724  */
4725 void
4726 uvm_map_freelist_update_refill(struct vm_map *map, int flags)
4727 {
4728 	struct vm_map_entry *entry;
4729 	vaddr_t min, max;
4730 
4731 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
4732 		min = VMMAP_FREE_START(entry);
4733 		max = VMMAP_FREE_END(entry);
4734 		entry->fspace = 0;
4735 
4736 		entry = uvm_map_fix_space(map, entry, min, max, flags);
4737 	}
4738 
4739 	uvm_tree_sanity(map, __FILE__, __LINE__);
4740 }
4741 
4742 /*
4743  * Change {a,b}_{start,end} allocation ranges and associated free lists.
4744  */
4745 void
4746 uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead,
4747     vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags)
4748 {
4749 	KDASSERT(b_end >= b_start && s_end >= s_start);
4750 
4751 	/* Clear all free lists. */
4752 	uvm_map_freelist_update_clear(map, dead);
4753 
4754 	/* Apply new bounds. */
4755 	map->b_start = b_start;
4756 	map->b_end   = b_end;
4757 	map->s_start = s_start;
4758 	map->s_end   = s_end;
4759 
4760 	/* Refill free lists. */
4761 	uvm_map_freelist_update_refill(map, flags);
4762 }
4763 
4764 /*
4765  * Assign a uvm_addr_state to the specified pointer in vm_map.
4766  *
4767  * May sleep.
4768  */
4769 void
4770 uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which,
4771     struct uvm_addr_state *newval)
4772 {
4773 	struct uvm_map_deadq dead;
4774 
4775 	/* Pointer which must be in this map. */
4776 	KASSERT(which != NULL);
4777 	KASSERT((void*)map <= (void*)(which) &&
4778 	    (void*)(which) < (void*)(map + 1));
4779 
4780 	vm_map_lock(map);
4781 	TAILQ_INIT(&dead);
4782 	uvm_map_freelist_update_clear(map, &dead);
4783 
4784 	uvm_addr_destroy(*which);
4785 	*which = newval;
4786 
4787 	uvm_map_freelist_update_refill(map, 0);
4788 	vm_map_unlock(map);
4789 	uvm_unmap_detach(&dead, 0);
4790 }
4791 
4792 /*
4793  * Correct space insert.
4794  *
4795  * Entry must not be on any freelist.
4796  */
4797 struct vm_map_entry*
4798 uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry,
4799     vaddr_t min, vaddr_t max, int flags)
4800 {
4801 	struct uvm_addr_state	*free, *entfree;
4802 	vaddr_t			 lmax;
4803 
4804 	KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0);
4805 	KDASSERT(min <= max);
4806 	KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) ||
4807 	    min == map->min_offset);
4808 
4809 	/*
4810 	 * During the function, entfree will always point at the uaddr state
4811 	 * for entry.
4812 	 */
4813 	entfree = (entry == NULL ? NULL :
4814 	    uvm_map_uaddr_e(map, entry));
4815 
4816 	while (min != max) {
4817 		/* Claim guard page for entry. */
4818 		if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL &&
4819 		    VMMAP_FREE_END(entry) == entry->end &&
4820 		    entry->start != entry->end) {
4821 			if (max - min == 2 * PAGE_SIZE) {
4822 				/*
4823 				 * If the free-space gap is exactly 2 pages,
4824 				 * we make the guard 2 pages instead of 1,
4825 				 * because in a guarded map, an area needs
4826 				 * at least 2 pages to allocate from:
4827 				 * one page for the allocation and one for
4828 				 * the guard.
4829 				 */
4830 				entry->guard = 2 * PAGE_SIZE;
4831 				min = max;
4832 			} else {
4833 				entry->guard = PAGE_SIZE;
4834 				min += PAGE_SIZE;
4835 			}
4836 			continue;
4837 		}
4838 
4839 		/*
4840 		 * Handle the case where entry has a 2-page guard, but the
4841 		 * space after entry is freed.
4842 		 */
4843 		if (entry != NULL && entry->fspace == 0 &&
4844 		    entry->guard > PAGE_SIZE) {
4845 			entry->guard = PAGE_SIZE;
4846 			min = VMMAP_FREE_START(entry);
4847 		}
4848 
4849 		lmax = uvm_map_boundary(map, min, max);
4850 		free = uvm_map_uaddr(map, min);
4851 
4852 		/*
4853 		 * Entries are merged if they point at the same free list.
4854 		 * Exception to that rule: if min == uvm_maxkaddr, a new
4855 		 * entry is started regardless (otherwise the allocators
4856 		 * will get confused).
4857 		 */
4858 		if (entry != NULL && free == entfree &&
4859 		    !((map->flags & VM_MAP_ISVMSPACE) == 0 &&
4860 		    min == uvm_maxkaddr)) {
4861 			KDASSERT(VMMAP_FREE_END(entry) == min);
4862 			entry->fspace += lmax - min;
4863 		} else {
4864 			/*
4865 			 * Commit entry to the free list: no more free
4866 			 * space will be added to it.
4867 			 * We'll start a new entry and accumulate the
4868 			 * remaining free space into that one instead.
4869 			 */
4870 			if (entry != NULL)
4871 				uvm_mapent_free_insert(map, entfree, entry);
4872 
4873 			/* New entry for new uaddr. */
4874 			entry = uvm_mapent_alloc(map, flags);
4875 			KDASSERT(entry != NULL);
4876 			entry->end = entry->start = min;
4877 			entry->guard = 0;
4878 			entry->fspace = lmax - min;
4879 			entry->object.uvm_obj = NULL;
4880 			entry->offset = 0;
4881 			entry->etype = 0;
4882 			entry->protection = entry->max_protection = 0;
4883 			entry->inheritance = 0;
4884 			entry->wired_count = 0;
4885 			entry->advice = 0;
4886 			entry->aref.ar_pageoff = 0;
4887 			entry->aref.ar_amap = NULL;
4888 			uvm_mapent_addr_insert(map, entry);
4889 
4890 			entfree = free;
4891 		}
4892 
4893 		min = lmax;
4894 	}
4895 	/* Finally put entry on the uaddr state. */
4896 	if (entry != NULL)
4897 		uvm_mapent_free_insert(map, entfree, entry);
4898 
4899 	return entry;
4900 }
4901 
4902 /*
4903  * MQuery style of allocation.
4904  *
4905  * This allocator searches forward until sufficient space is found to map
4906  * the given size.
4907  *
4908  * XXX: factor in offset (via pmap_prefer) and protection?
4909  */
4910 int
4911 uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset,
4912     int flags)
4913 {
4914 	struct vm_map_entry *entry, *last;
4915 	vaddr_t addr;
4916 	vaddr_t tmp, pmap_align, pmap_offset;
4917 	int error;
4918 
4919 	addr = *addr_p;
4920 	vm_map_lock_read(map);
4921 
4922 	/* Configure pmap prefer. */
4923 	if (offset != UVM_UNKNOWN_OFFSET) {
4924 		pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN());
4925 		pmap_offset = PMAP_PREFER_OFFSET(offset);
4926 	} else {
4927 		pmap_align = PAGE_SIZE;
4928 		pmap_offset = 0;
4929 	}
4930 
4931 	/* Align address to pmap_prefer unless FLAG_FIXED is set. */
4932 	if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) {
4933 	  	tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4934 		if (tmp < addr)
4935 			tmp += pmap_align;
4936 		addr = tmp;
4937 	}
4938 
4939 	/* First, check if the requested range is fully available. */
4940 	entry = uvm_map_entrybyaddr(&map->addr, addr);
4941 	last = NULL;
4942 	if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
4943 		error = 0;
4944 		goto out;
4945 	}
4946 	if (flags & UVM_FLAG_FIXED) {
4947 		error = EINVAL;
4948 		goto out;
4949 	}
4950 
4951 	error = ENOMEM; /* Default error from here. */
4952 
4953 	/*
4954 	 * At this point, the memory at <addr, sz> is not available.
4955 	 * The reasons are:
4956 	 * [1] it's outside the map,
4957 	 * [2] it starts in used memory (and therefore needs to move
4958 	 *     toward the first free page in entry),
4959 	 * [3] it starts in free memory but bumps into used memory.
4960 	 *
4961 	 * Note that for case [2], moving forward is handled by the
4962 	 * for loop below.
4963 	 */
4964 	if (entry == NULL) {
4965 		/* [1] Outside the map. */
4966 		if (addr >= map->max_offset)
4967 			goto out;
4968 		else
4969 			entry = RBT_MIN(uvm_map_addr, &map->addr);
4970 	} else if (VMMAP_FREE_START(entry) <= addr) {
4971 		/* [3] Bumped into used memory. */
4972 		entry = RBT_NEXT(uvm_map_addr, entry);
4973 	}
4974 
4975 	/* Test if the next entry is sufficient for the allocation. */
4976 	for (; entry != NULL;
4977 	    entry = RBT_NEXT(uvm_map_addr, entry)) {
4978 		if (entry->fspace == 0)
4979 			continue;
4980 		addr = VMMAP_FREE_START(entry);
4981 
4982 restart:	/* Restart address checks on address change. */
4983 		tmp = (addr & ~(pmap_align - 1)) | pmap_offset;
4984 		if (tmp < addr)
4985 			tmp += pmap_align;
4986 		addr = tmp;
4987 		if (addr >= VMMAP_FREE_END(entry))
4988 			continue;
4989 
4990 		/* Skip brk() allocation addresses. */
4991 		if (addr + sz > map->b_start && addr < map->b_end) {
4992 			if (VMMAP_FREE_END(entry) > map->b_end) {
4993 				addr = map->b_end;
4994 				goto restart;
4995 			} else
4996 				continue;
4997 		}
4998 		/* Skip stack allocation addresses. */
4999 		if (addr + sz > map->s_start && addr < map->s_end) {
5000 			if (VMMAP_FREE_END(entry) > map->s_end) {
5001 				addr = map->s_end;
5002 				goto restart;
5003 			} else
5004 				continue;
5005 		}
5006 
5007 		last = NULL;
5008 		if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) {
5009 			error = 0;
5010 			goto out;
5011 		}
5012 	}
5013 
5014 out:
5015 	vm_map_unlock_read(map);
5016 	if (error == 0)
5017 		*addr_p = addr;
5018 	return error;
5019 }
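
/*
 * This is the backend of mquery(2), which ld.so uses to probe where a
 * mapping of a given size could go without actually creating it.
 * Userland sketch (hypothetical values and flags):
 *
 *	void *want = (void *)0x1000000;
 *	void *got = mquery(want, len, PROT_READ, 0, -1, 0);
 *
 * which returns an address at or after the (aligned) hint with room for
 * len bytes, skipping the brk and stack areas as above.
 */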
5020 
5021 /*
5022  * Determine allocation bias.
5023  *
5024  * Returns 1 if we should bias to high addresses, -1 for a bias towards low
5025  * addresses, or 0 for no bias.
5026  * The bias mechanism is intended to avoid clashing with brk() and stack
5027  * areas.
5028  */
5029 int
5030 uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry)
5031 {
5032 	vaddr_t start, end;
5033 
5034 	start = VMMAP_FREE_START(entry);
5035 	end = VMMAP_FREE_END(entry);
5036 
5037 	/* Stay at the top of brk() area. */
5038 	if (end >= map->b_start && start < map->b_end)
5039 		return 1;
5040 	/* Stay at the far end of the stack area. */
5041 	if (end >= map->s_start && start < map->s_end) {
5042 #ifdef MACHINE_STACK_GROWS_UP
5043 		return 1;
5044 #else
5045 		return -1;
5046 #endif
5047 	}
5048 
5049 	/* No bias, this area is meant for us. */
5050 	return 0;
5051 }
5052 
5053 
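/*
 * Map lock helpers.
 *
 * Interrupt-safe maps (VM_MAP_INTRSAFE) are protected by a mutex, all
 * other maps by a read/write lock.  VM_MAP_BUSY is a cooperative "do not
 * take the exclusive lock right now" marker: writers that find it set
 * either fail (try-lock) or sleep on the map flags until
 * vm_map_unbusy() wakes them.
 */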
5054 boolean_t
5055 vm_map_lock_try_ln(struct vm_map *map, char *file, int line)
5056 {
5057 	boolean_t rv;
5058 
5059 	if (map->flags & VM_MAP_INTRSAFE) {
5060 		rv = _mtx_enter_try(&map->mtx LOCK_FL_ARGS);
5061 	} else {
5062 		mtx_enter(&map->flags_lock);
5063 		if (map->flags & VM_MAP_BUSY) {
5064 			mtx_leave(&map->flags_lock);
5065 			return (FALSE);
5066 		}
5067 		mtx_leave(&map->flags_lock);
5068 		rv = (_rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP LOCK_FL_ARGS)
5069 		    == 0);
5070 		/* back out if the map was marked busy while we took the lock */
5071 		if (rv) {
5072 			mtx_enter(&map->flags_lock);
5073 			if (map->flags & VM_MAP_BUSY) {
5074 				_rw_exit(&map->lock LOCK_FL_ARGS);
5075 				rv = FALSE;
5076 			}
5077 			mtx_leave(&map->flags_lock);
5078 		}
5079 	}
5080 
5081 	if (rv) {
5082 		map->timestamp++;
5083 		LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5084 		uvm_tree_sanity(map, file, line);
5085 		uvm_tree_size_chk(map, file, line);
5086 	}
5087 
5088 	return (rv);
5089 }
5090 
5091 void
5092 vm_map_lock_ln(struct vm_map *map, char *file, int line)
5093 {
5094 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5095 		do {
5096 			mtx_enter(&map->flags_lock);
5097 tryagain:
5098 			while (map->flags & VM_MAP_BUSY) {
5099 				map->flags |= VM_MAP_WANTLOCK;
5100 				msleep(&map->flags, &map->flags_lock,
5101 				    PVM, vmmapbsy, 0);
5102 			}
5103 			mtx_leave(&map->flags_lock);
5104 		} while (_rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL
5105 		    LOCK_FL_ARGS) != 0);
5106 		/* back out if the map was marked busy while we slept on the lock */
5107 		mtx_enter(&map->flags_lock);
5108 		if (map->flags & VM_MAP_BUSY) {
5109 			_rw_exit(&map->lock LOCK_FL_ARGS);
5110 			goto tryagain;
5111 		}
5112 		mtx_leave(&map->flags_lock);
5113 	} else {
5114 		_mtx_enter(&map->mtx LOCK_FL_ARGS);
5115 	}
5116 
5117 	map->timestamp++;
5118 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5119 	uvm_tree_sanity(map, file, line);
5120 	uvm_tree_size_chk(map, file, line);
5121 }
5122 
5123 void
5124 vm_map_lock_read_ln(struct vm_map *map, char *file, int line)
5125 {
5126 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5127 		_rw_enter_read(&map->lock LOCK_FL_ARGS);
5128 	else
5129 		_mtx_enter(&map->mtx LOCK_FL_ARGS);
5130 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5131 	uvm_tree_sanity(map, file, line);
5132 	uvm_tree_size_chk(map, file, line);
5133 }
5134 
5135 void
5136 vm_map_unlock_ln(struct vm_map *map, char *file, int line)
5137 {
5138 	uvm_tree_sanity(map, file, line);
5139 	uvm_tree_size_chk(map, file, line);
5140 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5141 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5142 		_rw_exit(&map->lock LOCK_FL_ARGS);
5143 	else
5144 		_mtx_leave(&map->mtx LOCK_FL_ARGS);
5145 }
5146 
5147 void
5148 vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
5149 {
5150 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5151 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5152 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5153 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5154 		_rw_exit_read(&map->lock LOCK_FL_ARGS);
5155 	else
5156 		_mtx_leave(&map->mtx LOCK_FL_ARGS);
5157 }
5158 
5159 void
5160 vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
5161 {
5162 	uvm_tree_sanity(map, file, line);
5163 	uvm_tree_size_chk(map, file, line);
5164 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5165 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5166 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5167 	if ((map->flags & VM_MAP_INTRSAFE) == 0)
5168 		_rw_enter(&map->lock, RW_DOWNGRADE LOCK_FL_ARGS);
5169 }
5170 
5171 void
5172 vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
5173 {
5174 	/* XXX: RO */ uvm_tree_sanity(map, file, line);
5175 	/* XXX: RO */ uvm_tree_size_chk(map, file, line);
5176 	LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line));
5177 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5178 	if ((map->flags & VM_MAP_INTRSAFE) == 0) {
5179 		_rw_exit_read(&map->lock LOCK_FL_ARGS);
5180 		_rw_enter_write(&map->lock LOCK_FL_ARGS);
5181 	}
5182 	LPRINTF(("map   lock: %p (at %s %d)\n", map, file, line));
5183 	uvm_tree_sanity(map, file, line);
5184 }
5185 
5186 void
5187 vm_map_busy_ln(struct vm_map *map, char *file, int line)
5188 {
5189 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5190 	mtx_enter(&map->flags_lock);
5191 	map->flags |= VM_MAP_BUSY;
5192 	mtx_leave(&map->flags_lock);
5193 }
5194 
5195 void
5196 vm_map_unbusy_ln(struct vm_map *map, char *file, int line)
5197 {
5198 	int oflags;
5199 
5200 	KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);
5201 	mtx_enter(&map->flags_lock);
5202 	oflags = map->flags;
5203 	map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);
5204 	mtx_leave(&map->flags_lock);
5205 	if (oflags & VM_MAP_WANTLOCK)
5206 		wakeup(&map->flags);
5207 }
5208 
5209 #ifndef SMALL_KERNEL
5210 int
5211 uvm_map_fill_vmmap(struct vm_map *map, struct kinfo_vmentry *kve,
5212     size_t *lenp)
5213 {
5214 	struct vm_map_entry *entry;
5215 	vaddr_t start;
5216 	int cnt, maxcnt, error = 0;
5217 
5218 	KASSERT(*lenp > 0);
5219 	KASSERT((*lenp % sizeof(*kve)) == 0);
5220 	cnt = 0;
5221 	maxcnt = *lenp / sizeof(*kve);
5222 	KASSERT(maxcnt > 0);
5223 
5224 	/*
5225 	 * Return only entries whose address is above the given base
5226 	 * address.  This allows userland to iterate without knowing the
5227 	 * number of entries beforehand.
5228 	 */
5229 	start = (vaddr_t)kve[0].kve_start;
5230 
5231 	vm_map_lock(map);
5232 	RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
5233 		if (cnt == maxcnt) {
5234 			error = ENOMEM;
5235 			break;
5236 		}
5237 		if (start != 0 && entry->start < start)
5238 			continue;
5239 		kve->kve_start = entry->start;
5240 		kve->kve_end = entry->end;
5241 		kve->kve_guard = entry->guard;
5242 		kve->kve_fspace = entry->fspace;
5243 		kve->kve_fspace_augment = entry->fspace_augment;
5244 		kve->kve_offset = entry->offset;
5245 		kve->kve_wired_count = entry->wired_count;
5246 		kve->kve_etype = entry->etype;
5247 		kve->kve_protection = entry->protection;
5248 		kve->kve_max_protection = entry->max_protection;
5249 		kve->kve_advice = entry->advice;
5250 		kve->kve_inheritance = entry->inheritance;
5251 		kve->kve_flags = entry->flags;
5252 		kve++;
5253 		cnt++;
5254 	}
5255 	vm_map_unlock(map);
5256 
5257 	KASSERT(cnt <= maxcnt);
5258 
5259 	*lenp = sizeof(*kve) * cnt;
5260 	return error;
5261 }
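
/*
 * Consumer sketch (hypothetical; this is the sort of loop a
 * KERN_PROC_VMMAP sysctl consumer runs): userland passes a buffer of
 * struct kinfo_vmentry and puts the resume address in kve[0].kve_start:
 *
 *	kve[0].kve_start = 0;			(first batch)
 *	... fetch, consume the entries returned ...
 *	kve[0].kve_start = last.kve_start + 1;	(next batch)
 *
 * repeating while the buffer comes back full.
 */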
5262 #endif
5263 
5264 
5265 RBT_GENERATE_AUGMENT(uvm_map_addr, vm_map_entry, daddrs.addr_entry,
5266     uvm_mapentry_addrcmp, uvm_map_addr_augment);
5267 
5268 
5269 /*
5270  * MD code: vmspace allocator setup.
5271  */
5272 
5273 #ifdef __i386__
5274 void
5275 uvm_map_setup_md(struct vm_map *map)
5276 {
5277 	vaddr_t		min, max;
5278 
5279 	min = map->min_offset;
5280 	max = map->max_offset;
5281 
5282 	/*
5283 	 * Ensure the selectors will not try to manage page 0;
5284 	 * it's too special.
5285 	 */
5286 	if (min < VMMAP_MIN_ADDR)
5287 		min = VMMAP_MIN_ADDR;
5288 
5289 #if 0	/* Cool stuff, not yet */
5290 	/* Executable code is special. */
5291 	map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR);
5292 	/* Place normal allocations beyond executable mappings. */
5293 	map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, max);
5294 #else	/* Crappy stuff, for now */
5295 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5296 #endif
5297 
5298 #ifndef SMALL_KERNEL
5299 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5300 #endif /* !SMALL_KERNEL */
5301 }
5302 #elif __LP64__
5303 void
5304 uvm_map_setup_md(struct vm_map *map)
5305 {
5306 	vaddr_t		min, max;
5307 
5308 	min = map->min_offset;
5309 	max = map->max_offset;
5310 
5311 	/*
5312 	 * Ensure the selectors will not try to manage page 0;
5313 	 * it's too special.
5314 	 */
5315 	if (min < VMMAP_MIN_ADDR)
5316 		min = VMMAP_MIN_ADDR;
5317 
5318 #if 0	/* Cool stuff, not yet */
5319 	map->uaddr_any[3] = uaddr_pivot_create(MAX(min, 0x100000000ULL), max);
5320 #else	/* Crappy stuff, for now */
5321 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5322 #endif
5323 
5324 #ifndef SMALL_KERNEL
5325 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5326 #endif /* !SMALL_KERNEL */
5327 }
5328 #else	/* non-i386, 32 bit */
5329 void
5330 uvm_map_setup_md(struct vm_map *map)
5331 {
5332 	vaddr_t		min, max;
5333 
5334 	min = map->min_offset;
5335 	max = map->max_offset;
5336 
5337 	/*
5338 	 * Ensure the selectors will not try to manage page 0;
5339 	 * it's too special.
5340 	 */
5341 	if (min < VMMAP_MIN_ADDR)
5342 		min = VMMAP_MIN_ADDR;
5343 
5344 #if 0	/* Cool stuff, not yet */
5345 	map->uaddr_any[3] = uaddr_pivot_create(min, max);
5346 #else	/* Crappy stuff, for now */
5347 	map->uaddr_any[0] = uaddr_rnd_create(min, max);
5348 #endif
5349 
5350 #ifndef SMALL_KERNEL
5351 	map->uaddr_brk_stack = uaddr_stack_brk_create(min, max);
5352 #endif /* !SMALL_KERNEL */
5353 }
5354 #endif
5355