xref: /dflybsd-src/sys/vm/vm_map.c (revision 530e94fc9e8b4693c7e841a45371bdb6e76ee4cd)
1 /*
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
37  *
38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
39  * All rights reserved.
40  *
41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/proc.h>
67 #include <sys/serialize.h>
68 #include <sys/lock.h>
69 #include <sys/vmmeter.h>
70 #include <sys/mman.h>
71 #include <sys/vnode.h>
72 #include <sys/resourcevar.h>
73 #include <sys/shm.h>
74 #include <sys/tree.h>
75 #include <sys/malloc.h>
76 #include <sys/objcache.h>
77 #include <sys/kern_syscall.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_zone.h>
90 
91 #include <sys/random.h>
92 #include <sys/sysctl.h>
93 #include <sys/spinlock.h>
94 
95 #include <sys/thread2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Virtual memory maps provide for the mapping, protection, and sharing
100  * of virtual memory objects.  In addition, this module provides for an
101  * efficient virtual copy of memory from one map to another.
102  *
103  * Synchronization is required prior to most operations.
104  *
105  * Maps consist of an ordered doubly-linked list of simple entries.
106  * A hint and an RB tree are used to speed up lookups.
107  *
108  * Callers looking to modify maps specify start/end addresses which cause
109  * the related map entry to be clipped if necessary, and then later
110  * recombined if the pieces remain compatible.
111  *
112  * Virtual copy operations are performed by copying VM object references
113  * from one map to another, and then marking both regions as copy-on-write.
114  */
115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
116 static void vmspace_dtor(void *obj, void *privdata);
117 static void vmspace_terminate(struct vmspace *vm, int final);
118 
119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
121 static struct objcache *vmspace_cache;
122 
123 /*
124  * per-cpu page table cross mappings are initialized in early boot
125  * and might require a considerable number of vm_map_entry structures.
126  */
127 #define MAPENTRYBSP_CACHE	(MAXCPU+1)
128 #define MAPENTRYAP_CACHE	8
129 
130 /*
131  * Partitioning threaded programs with large anonymous memory areas can
132  * improve concurrent fault performance.
133  */
134 #define MAP_ENTRY_PARTITION_SIZE	((vm_offset_t)(32 * 1024 * 1024))
135 #define MAP_ENTRY_PARTITION_MASK	(MAP_ENTRY_PARTITION_SIZE - 1)
136 
137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)	\
138 	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
139 
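/*
 * Worked example (hypothetical addresses): with the 32MB partition size
 * above, an entry spanning [0x02001000, 0x02003000) has
 * (start ^ end) == 0x2000, which is cleared by ~MAP_ENTRY_PARTITION_MASK,
 * so it lies within a single partition; an entry spanning
 * [0x01fff000, 0x02001000) straddles the 0x02000000 boundary and does not.
 */
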
140 static struct vm_zone mapentzone_store;
141 static vm_zone_t mapentzone;
142 
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
146 
147 static int randomize_mmap;
148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
149     "Randomize mmap offsets");
150 static int vm_map_relock_enable = 1;
151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
152 	   &vm_map_relock_enable, 0, "insert pop pgtable optimization");
153 static int vm_map_partition_enable = 1;
154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
155 	   &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
156 static int vm_map_backing_limit = 5;
157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
158 	   &vm_map_backing_limit, 0, "ba.backing_ba link depth");
159 static int vm_map_backing_shadow_test = 1;
160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
161 	   &vm_map_backing_shadow_test, 0, "ba.object shadow test");
162 
163 static void vmspace_drop_notoken(struct vmspace *vm);
164 static void vm_map_entry_shadow(vm_map_entry_t entry);
165 static vm_map_entry_t vm_map_entry_create(int *);
166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
167 static void vm_map_entry_dispose_ba (vm_map_backing_t ba);
168 static void vm_map_backing_replicated(vm_map_t map,
169 		vm_map_entry_t entry, int flags);
170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
171 		vm_ooffset_t start);
172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
173 		vm_ooffset_t end);
174 static void vm_map_backing_attach (vm_map_backing_t ba);
175 static void vm_map_backing_detach (vm_map_backing_t ba);
176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
181 		vm_map_entry_t);
182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
183 		vm_offset_t start, vm_offset_t end, int *countp, int flags);
184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
185 		vm_offset_t vaddr, int *countp);
186 
187 #define MAP_BACK_CLIPPED	0x0001
188 #define MAP_BACK_BASEOBJREFD	0x0002
189 
190 /*
191  * Initialize the vm_map module.  Must be called before any other vm_map
192  * routines.
193  *
194  * Map and entry structures are allocated from the general purpose
195  * memory pool with some exceptions:
196  *
197  *	- The kernel map is allocated statically.
198  *	- Initial kernel map entries are allocated out of a static pool.
199  *	- We must set ZONE_SPECIAL here or the early boot code can get
200  *	  stuck if there are >63 cores.
201  *
202  *	These restrictions are necessary since malloc() uses the
203  *	maps and requires map entries.
204  *
205  * Called from the low level boot code only.
206  */
207 void
208 vm_map_startup(void)
209 {
210 	mapentzone = &mapentzone_store;
211 	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
212 		  map_entry_init, MAX_MAPENT);
213 	mapentzone_store.zflags |= ZONE_SPECIAL;
214 }
215 
216 /*
217  * Called prior to any vmspace allocations.
218  *
219  * Called from the low level boot code only.
220  */
221 void
222 vm_init2(void)
223 {
224 	vmspace_cache = objcache_create_mbacked(M_VMSPACE,
225 						sizeof(struct vmspace),
226 						0, ncpus * 4,
227 						vmspace_ctor, vmspace_dtor,
228 						NULL);
229 	zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
230 	pmap_init2();
231 	vm_object_init2();
232 }
233 
234 /*
235  * objcache support.  We leave the pmap root cached as long as possible
236  * for performance reasons.
237  */
238 static
239 boolean_t
240 vmspace_ctor(void *obj, void *privdata, int ocflags)
241 {
242 	struct vmspace *vm = obj;
243 
244 	bzero(vm, sizeof(*vm));
245 	vm->vm_refcnt = VM_REF_DELETED;
246 
247 	return 1;
248 }
249 
250 static
251 void
252 vmspace_dtor(void *obj, void *privdata)
253 {
254 	struct vmspace *vm = obj;
255 
256 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
257 	pmap_puninit(vmspace_pmap(vm));
258 }
259 
260 /*
261  * Red black tree functions
262  *
263  * The caller must hold the related map lock.
264  */
265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
267 
268 /* a->ba.start is the address and is the only field that must be initialized */
269 static int
270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
271 {
272 	if (a->ba.start < b->ba.start)
273 		return(-1);
274 	else if (a->ba.start > b->ba.start)
275 		return(1);
276 	return(0);
277 }
278 
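/*
 * Illustrative sketch (not compiled): because the tree is keyed on
 * ba.start, an RB_FOREACH scan visits entries in ascending address
 * order.  A hypothetical helper that totals the mapped bytes of a map
 * could be written as follows; the caller must hold the map lock.
 */
#if 0
static vm_size_t
example_map_mapped_bytes(vm_map_t map)
{
	vm_map_entry_t cur;
	vm_size_t total = 0;

	ASSERT_VM_MAP_LOCKED(map);
	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root)
		total += cur->ba.end - cur->ba.start;
	return (total);
}
#endif
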
279 /*
280  * Initialize vmspace ref/hold counts for vmspace0.  There is a holdcnt for
281  * every refcnt.
282  */
283 void
284 vmspace_initrefs(struct vmspace *vm)
285 {
286 	vm->vm_refcnt = 1;
287 	vm->vm_holdcnt = 1;
288 }
289 
290 /*
291  * Allocate a vmspace structure, including a vm_map and pmap.
292  * Initialize numerous fields.  While the initial allocation is zeroed,
293  * subsequent reuse from the objcache leaves elements of the structure
294  * intact (particularly the pmap), so portions must be zeroed.
295  *
296  * Returns a referenced vmspace.
297  *
298  * No requirements.
299  */
300 struct vmspace *
301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
302 {
303 	struct vmspace *vm;
304 
305 	vm = objcache_get(vmspace_cache, M_WAITOK);
306 
307 	bzero(&vm->vm_startcopy,
308 	      (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
309 	vm_map_init(&vm->vm_map, min, max, NULL);	/* initializes token */
310 
311 	/*
312 	 * NOTE: hold acquires the token for safety.
313 	 *
314 	 * On return vmspace is referenced (refs=1, hold=1).  That is,
315 	 * each refcnt also has a holdcnt.  There can be additional holds
316 	 * (holdcnt) above and beyond the refcnt.  Finalization is handled in
317 	 * two stages, one on refs 1->0, and the second on hold 1->0.
318 	 */
319 	KKASSERT(vm->vm_holdcnt == 0);
320 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
321 	vmspace_initrefs(vm);
322 	vmspace_hold(vm);
323 	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
324 	vm->vm_map.pmap = vmspace_pmap(vm);	/* XXX */
325 	vm->vm_shm = NULL;
326 	vm->vm_flags = 0;
327 	cpu_vmspace_alloc(vm);
328 	vmspace_drop(vm);
329 
330 	return (vm);
331 }
332 
333 /*
334  * NOTE: Can return 0 if the vmspace is exiting.
335  */
336 int
337 vmspace_getrefs(struct vmspace *vm)
338 {
339 	int32_t n;
340 
341 	n = vm->vm_refcnt;
342 	cpu_ccfence();
343 	if (n & VM_REF_DELETED)
344 		n = -1;
345 	return n;
346 }
347 
348 void
349 vmspace_hold(struct vmspace *vm)
350 {
351 	atomic_add_int(&vm->vm_holdcnt, 1);
352 	lwkt_gettoken(&vm->vm_map.token);
353 }
354 
355 /*
356  * Drop with final termination interlock.
357  */
358 void
359 vmspace_drop(struct vmspace *vm)
360 {
361 	lwkt_reltoken(&vm->vm_map.token);
362 	vmspace_drop_notoken(vm);
363 }
364 
365 static void
366 vmspace_drop_notoken(struct vmspace *vm)
367 {
368 	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
369 		if (vm->vm_refcnt & VM_REF_DELETED)
370 			vmspace_terminate(vm, 1);
371 	}
372 }
373 
374 /*
375  * A vmspace object must not be in a terminated state to be able to obtain
376  * additional refs on it.
377  *
378  * These are official references to the vmspace, the count is used to check
379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
380  *
381  * XXX we need to combine hold & ref together into one 64-bit field to allow
382  * holds to prevent stage-1 termination.
383  */
384 void
385 vmspace_ref(struct vmspace *vm)
386 {
387 	uint32_t n;
388 
389 	atomic_add_int(&vm->vm_holdcnt, 1);
390 	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
391 	KKASSERT((n & VM_REF_DELETED) == 0);
392 }
393 
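/*
 * Illustrative sketch (not compiled): a hypothetical foreign accessor
 * that only needs the vmspace to stay intact while it inspects the map
 * should use the hold/drop pair rather than ref/rel.  vmspace_hold()
 * also acquires the map token, so read-only inspection is safe while
 * the hold is active.
 */
#if 0
static void
example_vmspace_inspect(struct vmspace *vm)
{
	vmspace_hold(vm);
	/* ... read-only inspection of vm->vm_map ... */
	vmspace_drop(vm);
}
#endif
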
394 /*
395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
396  * termination of the vmspace.  Then, on the final drop of the hold we
397  * will do stage-2 final termination.
398  */
399 void
400 vmspace_rel(struct vmspace *vm)
401 {
402 	uint32_t n;
403 
404 	/*
405 	 * Drop refs.  Each ref also has a hold which is also dropped.
406 	 *
407 	 * When refs hits 0, compete to set the VM_REF_DELETED flag (the hold
408 	 * prevents finalization) to start termination processing.
409 	 * Finalization occurs when the last hold count drops to 0.
410 	 */
411 	n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
412 	while (n == 0) {
413 		if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
414 			vmspace_terminate(vm, 0);
415 			break;
416 		}
417 		n = vm->vm_refcnt;
418 		cpu_ccfence();
419 	}
420 	vmspace_drop_notoken(vm);
421 }
422 
423 /*
424  * This is called during exit indicating that the vmspace is no
425  * longer in use by an exiting process, but the process has not yet
426  * been reaped.
427  *
428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
429  * to prevent stage-2 until the process is reaped.  Note the order of
430  * operations: we must hold first.
431  *
432  * No requirements.
433  */
434 void
435 vmspace_relexit(struct vmspace *vm)
436 {
437 	atomic_add_int(&vm->vm_holdcnt, 1);
438 	vmspace_rel(vm);
439 }
440 
441 /*
442  * Called during reap to disconnect the remainder of the vmspace from
443  * the process.  On the hold drop the vmspace termination is finalized.
444  *
445  * No requirements.
446  */
447 void
448 vmspace_exitfree(struct proc *p)
449 {
450 	struct vmspace *vm;
451 
452 	vm = p->p_vmspace;
453 	p->p_vmspace = NULL;
454 	vmspace_drop_notoken(vm);
455 }
456 
457 /*
458  * Called in two cases:
459  *
460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
461  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
462  *     and holdcnt will still be non-zero.
463  *
464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
465  *     longer be anyone with access to the vmspace.
466  *
467  * VMSPACE_EXIT1 flags the primary deactivation
468  * VMSPACE_EXIT2 flags the last reap
469  */
470 static void
471 vmspace_terminate(struct vmspace *vm, int final)
472 {
473 	int count;
474 
475 	lwkt_gettoken(&vm->vm_map.token);
476 	if (final == 0) {
477 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
478 		vm->vm_flags |= VMSPACE_EXIT1;
479 
480 		/*
481 		 * Get rid of most of the resources.  Leave the kernel pmap
482 		 * intact.
483 		 *
484 		 * If the pmap does not contain wired pages we can bulk-delete
485 		 * the pmap as a performance optimization before removing the
486 		 * related mappings.
487 		 *
488 		 * If the pmap contains wired pages we cannot do this
489 		 * pre-optimization because currently vm_fault_unwire()
490 		 * expects the pmap pages to exist and will not decrement
491 		 * p->wire_count if they do not.
492 		 */
493 		shmexit(vm);
494 		if (vmspace_pmap(vm)->pm_stats.wired_count) {
495 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
496 				      VM_MAX_USER_ADDRESS);
497 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
498 					  VM_MAX_USER_ADDRESS);
499 		} else {
500 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
501 					  VM_MAX_USER_ADDRESS);
502 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
503 				      VM_MAX_USER_ADDRESS);
504 		}
505 		lwkt_reltoken(&vm->vm_map.token);
506 	} else {
507 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
508 		KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
509 
510 		/*
511 		 * Get rid of remaining basic resources.
512 		 */
513 		vm->vm_flags |= VMSPACE_EXIT2;
514 		shmexit(vm);
515 
516 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
517 		vm_map_lock(&vm->vm_map);
518 		cpu_vmspace_free(vm);
519 
520 		/*
521 		 * Lock the map, to wait out all other references to it.
522 		 * Delete all of the mappings and pages they hold, then call
523 		 * the pmap module to reclaim anything left.
524 		 */
525 		vm_map_delete(&vm->vm_map,
526 			      vm_map_min(&vm->vm_map),
527 			      vm_map_max(&vm->vm_map),
528 			      &count);
529 		vm_map_unlock(&vm->vm_map);
530 		vm_map_entry_release(count);
531 
532 		pmap_release(vmspace_pmap(vm));
533 		lwkt_reltoken(&vm->vm_map.token);
534 		objcache_put(vmspace_cache, vm);
535 	}
536 }
537 
538 /*
539  * Swap usage is determined by taking the proportional swap used by
540  * VM objects backing the VM map.  To make up for fractional losses,
541  * if the VM object has any swap use at all, the associated map entries
542  * count for at least 1 swap page.
543  *
544  * No requirements.
545  */
546 vm_offset_t
547 vmspace_swap_count(struct vmspace *vm)
548 {
549 	vm_map_t map = &vm->vm_map;
550 	vm_map_entry_t cur;
551 	vm_object_t object;
552 	vm_offset_t count = 0;
553 	vm_offset_t n;
554 
555 	vmspace_hold(vm);
556 
557 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
558 		switch(cur->maptype) {
559 		case VM_MAPTYPE_NORMAL:
560 		case VM_MAPTYPE_VPAGETABLE:
561 			if ((object = cur->ba.object) == NULL)
562 				break;
563 			if (object->swblock_count) {
564 				n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
565 				count += object->swblock_count *
566 				    SWAP_META_PAGES * n / object->size + 1;
567 			}
568 			break;
569 		default:
570 			break;
571 		}
572 	}
573 	vmspace_drop(vm);
574 
575 	return(count);
576 }
577 
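/*
 * Worked example (hypothetical numbers) for the accounting done in
 * vmspace_swap_count() above: if an entry maps n = 256 pages of a
 * 1024-page object that has 4 populated swblocks, the entry is charged
 * 4 * SWAP_META_PAGES * 256 / 1024 + 1 swap pages, i.e. one quarter of
 * the object's maximum swap footprint plus the one-page minimum noted
 * in the comment above.
 */
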
578 /*
579  * Calculate the approximate number of anonymous pages in use by
580  * this vmspace.  To make up for fractional losses, we count each
581  * VM object as having at least 1 anonymous page.
582  *
583  * No requirements.
584  */
585 vm_offset_t
586 vmspace_anonymous_count(struct vmspace *vm)
587 {
588 	vm_map_t map = &vm->vm_map;
589 	vm_map_entry_t cur;
590 	vm_object_t object;
591 	vm_offset_t count = 0;
592 
593 	vmspace_hold(vm);
594 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
595 		switch(cur->maptype) {
596 		case VM_MAPTYPE_NORMAL:
597 		case VM_MAPTYPE_VPAGETABLE:
598 			if ((object = cur->ba.object) == NULL)
599 				break;
600 			if (object->type != OBJT_DEFAULT &&
601 			    object->type != OBJT_SWAP) {
602 				break;
603 			}
604 			count += object->resident_page_count;
605 			break;
606 		default:
607 			break;
608 		}
609 	}
610 	vmspace_drop(vm);
611 
612 	return(count);
613 }
614 
615 /*
616  * Initialize an existing vm_map structure such as that in the vmspace
617  * structure.  The pmap is initialized elsewhere.
618  *
619  * No requirements.
620  */
621 void
622 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
623 	    pmap_t pmap)
624 {
625 	RB_INIT(&map->rb_root);
626 	spin_init(&map->ilock_spin, "ilock");
627 	map->ilock_base = NULL;
628 	map->nentries = 0;
629 	map->size = 0;
630 	map->system_map = 0;
631 	vm_map_min(map) = min_addr;
632 	vm_map_max(map) = max_addr;
633 	map->pmap = pmap;
634 	map->timestamp = 0;
635 	map->flags = 0;
636 	bzero(&map->freehint, sizeof(map->freehint));
637 	lwkt_token_init(&map->token, "vm_map");
638 	lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
639 }
640 
641 /*
642  * Find the first possible free address for the specified request length.
643  * Returns 0 if we don't have one cached.
644  */
645 static
646 vm_offset_t
647 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
648 {
649 	vm_map_freehint_t *scan;
650 
651 	scan = &map->freehint[0];
652 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
653 		if (scan->length == length && scan->align == align)
654 			return(scan->start);
655 		++scan;
656 	}
657 	return 0;
658 }
659 
660 /*
661  * Unconditionally set the freehint.  Called by vm_map_findspace() after
662  * it finds an address.  This will help us iterate optimally on the next
663  * similar findspace.
664  */
665 static
666 void
667 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
668 		       vm_size_t length, vm_size_t align)
669 {
670 	vm_map_freehint_t *scan;
671 
672 	scan = &map->freehint[0];
673 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
674 		if (scan->length == length && scan->align == align) {
675 			scan->start = start;
676 			return;
677 		}
678 		++scan;
679 	}
680 	scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
681 	scan->start = start;
682 	scan->align = align;
683 	scan->length = length;
684 	++map->freehint_newindex;
685 }
686 
687 /*
688  * Update any existing freehints (for any alignment), for the hole we just
689  * added.
690  */
691 static
692 void
693 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
694 {
695 	vm_map_freehint_t *scan;
696 
697 	scan = &map->freehint[0];
698 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
699 		if (scan->length <= length && scan->start > start)
700 			scan->start = start;
701 		++scan;
702 	}
703 }
704 
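/*
 * Illustrative note on the interplay of the freehint routines above:
 * after vm_map_findspace() places a (length, align) request at some
 * address, vm_map_freehint_update() records that address so the next
 * identical request starts its scan there instead of at the map
 * minimum.  When a hole is later created at a lower address,
 * vm_map_freehint_hole() pulls every hint whose length would fit in
 * the hole back down to the hole's start so the space is not missed.
 */
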
705 /*
706  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
707  * object in the entry for COW faults.
708  *
709  * The entire chain including entry->ba (prior to inserting the fronting
710  * object) essentially becomes set in stone... elements of it can be paged
711  * in or out, but cannot be further modified.
712  *
713  * NOTE: If we do not optimize the backing chain then a unique copy is not
714  *	 needed.  Note, however, that because portions of the chain are
715  *	 shared across pmaps we cannot make any changes to the vm_map_backing
716  *	 elements themselves.
717  *
718  * If the map segment is governed by a virtual page table then it is
719  * possible to address offsets beyond the mapped area.  Just allocate
720  * a maximally sized object for this case.
721  *
722  * If addref is non-zero an additional reference is added to the returned
723  * entry.  This mechanic exists because the additional reference might have
724  * to be added atomically and not after return to prevent a premature
725  * collapse.  XXX currently there is no collapse code.
726  *
727  * The vm_map must be exclusively locked.
728  * No other requirements.
729  */
730 static
731 void
732 vm_map_entry_shadow(vm_map_entry_t entry)
733 {
734 	vm_map_backing_t ba;
735 	vm_size_t length;
736 	vm_object_t source;
737 	vm_object_t result;
738 
739 	if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
740 		length = 0x7FFFFFFF;
741 	else
742 		length = atop(entry->ba.end - entry->ba.start);
743 	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
744 
745 	/*
746 	 * Don't create the new object if the old object isn't shared.
747 	 *
748 	 * Caller ensures source exists (all backing_ba's must have objects),
749 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
750 	 *
751 	 * WARNING! Checking ref_count == 1 only works because we are testing
752 	 *	    the object embedded in the entry (entry->ba.object).
753 	 *	    This test DOES NOT WORK if checking an object hanging off
754 	 *	    the backing chain (entry->ba.backing_ba list) because the
755 	 *	    vm_map_backing might be shared, or part of a chain that
756 	 *	    is shared.  Checking ba->refs is worthless.
757 	 *
758 	 *	    XXX since we now replicate vm_map_backing's, ref_count==1
759 	 *	    actually works generally for non-vnodes.
760 	 */
761 	source = entry->ba.object;
762 	KKASSERT(source);
763 	vm_object_hold(source);
764 
765 	if (source->type != OBJT_VNODE) {
766 		if (source->ref_count == 1 &&
767 		    source->handle == NULL &&
768 		    (source->type == OBJT_DEFAULT ||
769 		     source->type == OBJT_SWAP)) {
770 			vm_object_drop(source);
771 			kfree(ba, M_MAP_BACKING);
772 			goto done;
773 		}
774 	}
775 
776 	/*
777 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
778 	 * drop the ONEMAPPING flag now.
779 	 */
780 	vm_object_clear_flag(source, OBJ_ONEMAPPING);
781 
782 	/*
783 	 * Allocate a new object with the given length.  The new object
784 	 * is returned referenced but we may have to add another one.
785 	 * If we are adding a second reference we must clear OBJ_ONEMAPPING.
786 	 * (typically because the caller is about to clone a vm_map_entry).
787 	 *
788 	 * The source object currently has an extra reference to prevent
789 	 * collapses into it while we mess with its shadow list, which
790 	 * we will remove later in this routine.
791 	 *
792 	 * The target object may require a second reference if asked for one
793 	 * by the caller.
794 	 */
795 	result = vm_object_allocate_hold(OBJT_DEFAULT, length);
796 	if (result == NULL)
797 		panic("vm_map_entry_shadow: no object for shadowing");
798 
799 	/*
800 	 * The new object shadows the source object.
801 	 *
802 	 * Try to optimize the result object's page color when shadowing
803 	 * in order to maintain page coloring consistency in the combined
804 	 * shadowed object.
805 	 *
806 	 * The source object is moved to ba, retaining its existing ref-count.
807 	 * No additional ref is needed.
808 	 *
809 	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
810 	 */
811 	vm_map_backing_detach(&entry->ba);
812 	*ba = entry->ba;		/* previous ba */
813 	ba->refs = 1;			/* initialize ref count */
814 	entry->ba.object = result;	/* new ba (at head of entry) */
815 	entry->ba.backing_ba = ba;
816 	entry->ba.backing_count = ba->backing_count + 1;
817 	entry->ba.offset = 0;
818 	entry->ba.refs = 0;
819 
820 	/* cpu localization twist */
821 	result->pg_color = vm_quickcolor();
822 
823 	vm_map_backing_attach(&entry->ba);
824 	vm_map_backing_attach(ba);
825 
826 	/*
827 	 * Adjust the return storage.  Drop the ref on source before
828 	 * returning.
829 	 */
830 	vm_object_drop(result);
831 	vm_object_drop(source);
832 done:
833 	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
834 }
835 
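/*
 * Illustrative sketch of what vm_map_entry_shadow() produces (not
 * compiled, hypothetical names): an entry that started out as
 *
 *	entry->ba { object = S, backing_ba = NULL }
 *
 * ends up as
 *
 *	entry->ba { object = R (new anonymous object), backing_ba = ba }
 *	        ba { object = S, backing_ba = S's previous chain }
 *
 * COW faults are then satisfied from R while S continues to supply any
 * pages that have not yet been copied.
 */
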
836 /*
837  * Allocate an object for a vm_map_entry.
838  *
839  * Object allocation for anonymous mappings is deferred as long as possible.
840  * This function is called when we can defer no longer, generally when a map
841  * entry might be split or forked or takes a page fault.
842  *
843  * If the map segment is governed by a virtual page table then it is
844  * possible to address offsets beyond the mapped area.  Just allocate
845  * a maximally sized object for this case.
846  *
847  * The vm_map must be exclusively locked.
848  * No other requirements.
849  */
850 void
851 vm_map_entry_allocate_object(vm_map_entry_t entry)
852 {
853 	vm_object_t obj;
854 
855 	/*
856 	 * ba.offset is NOT cumulatively added in the backing_ba scan like
857 	 * it was in the old object chain, so we can assign whatever offset
858 	 * we like to the new object.
859 	 *
860 	 * For now assign a value of 0 to make debugging object sizes
861 	 * easier.
862 	 */
863 	entry->ba.offset = 0;
864 
865 	if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
866 		/* XXX */
867 		obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF);
868 	} else {
869 		obj = vm_object_allocate(OBJT_DEFAULT,
870 					 atop(entry->ba.end - entry->ba.start) +
871 					 entry->ba.offset);
872 	}
873 	entry->ba.object = obj;
874 	vm_map_backing_attach(&entry->ba);
875 }
876 
877 /*
878  * Set an initial negative count so the first attempt to reserve
879  * space preloads a bunch of vm_map_entry's for this cpu.  Also
880  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
881  * map a new page for vm_map_entry structures.  SMP systems are
882  * particularly sensitive.
883  *
884  * This routine is called in early boot so we cannot just call
885  * vm_map_entry_reserve().
886  *
887  * Called from the low level boot code only (for each cpu)
888  *
889  * WARNING! Take care not to have too-big a static/BSS structure here
890  *	    as MAXCPU can be 256+, otherwise the loader's 64MB heap
891  *	    can get blown out by the kernel plus the initrd image.
892  */
893 void
894 vm_map_entry_reserve_cpu_init(globaldata_t gd)
895 {
896 	vm_map_entry_t entry;
897 	int count;
898 	int i;
899 
900 	atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
901 	if (gd->gd_cpuid == 0) {
902 		entry = &cpu_map_entry_init_bsp[0];
903 		count = MAPENTRYBSP_CACHE;
904 	} else {
905 		entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
906 		count = MAPENTRYAP_CACHE;
907 	}
908 	for (i = 0; i < count; ++i, ++entry) {
909 		MAPENT_FREELIST(entry) = gd->gd_vme_base;
910 		gd->gd_vme_base = entry;
911 	}
912 }
913 
914 /*
915  * Reserves vm_map_entry structures so that code later on can manipulate
916  * map_entry structures within a locked map without blocking while trying
917  * to allocate a new vm_map_entry.
918  *
919  * No requirements.
920  *
921  * WARNING!  We must not decrement gd_vme_avail until after we have
922  *	     ensured that sufficient entries exist, otherwise we can
923  *	     get into an endless call recursion in the zalloc code
924  *	     itself.
925  */
926 int
927 vm_map_entry_reserve(int count)
928 {
929 	struct globaldata *gd = mycpu;
930 	vm_map_entry_t entry;
931 
932 	/*
933 	 * Make sure we have enough structures in gd_vme_base to handle
934 	 * the reservation request.
935 	 *
936 	 * Use a critical section to protect against VM faults.  It might
937 	 * not be needed, but we have to be careful here.
938 	 */
939 	if (gd->gd_vme_avail < count) {
940 		crit_enter();
941 		while (gd->gd_vme_avail < count) {
942 			entry = zalloc(mapentzone);
943 			MAPENT_FREELIST(entry) = gd->gd_vme_base;
944 			gd->gd_vme_base = entry;
945 			atomic_add_int(&gd->gd_vme_avail, 1);
946 		}
947 		crit_exit();
948 	}
949 	atomic_add_int(&gd->gd_vme_avail, -count);
950 
951 	return(count);
952 }
953 
954 /*
955  * Releases previously reserved vm_map_entry structures that were not
956  * used.  If we have too much junk in our per-cpu cache clean some of
957  * it out.
958  *
959  * No requirements.
960  */
961 void
962 vm_map_entry_release(int count)
963 {
964 	struct globaldata *gd = mycpu;
965 	vm_map_entry_t entry;
966 	vm_map_entry_t efree;
967 
968 	count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
969 	if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
970 		efree = NULL;
971 		crit_enter();
972 		while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
973 			entry = gd->gd_vme_base;
974 			KKASSERT(entry != NULL);
975 			gd->gd_vme_base = MAPENT_FREELIST(entry);
976 			atomic_add_int(&gd->gd_vme_avail, -1);
977 			MAPENT_FREELIST(entry) = efree;
978 			efree = entry;
979 		}
980 		crit_exit();
981 		while ((entry = efree) != NULL) {
982 			efree = MAPENT_FREELIST(efree);
983 			zfree(mapentzone, entry);
984 		}
985 	}
986 }
987 
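/*
 * Illustrative sketch (not compiled): the usual calling pattern is to
 * reserve entries before acquiring the map lock, perform the clips,
 * insertions, or deletions, and then return the unused reservations.
 * The same sequence appears in vmspace_terminate() and vm_map_find()
 * in this file.  example_map_modify() is a hypothetical caller.
 */
#if 0
static void
example_map_modify(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	int count;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	/* ... clip, insert, or delete entries covering [start, end) ... */
	vm_map_unlock(map);
	vm_map_entry_release(count);
}
#endif
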
988 /*
989  * Reserve map entry structures for use in kernel_map itself.  These
990  * entries have *ALREADY* been reserved on a per-cpu basis when the map
991  * was inited.  This function is used by zalloc() to avoid a recursion
992  * when zalloc() itself needs to allocate additional kernel memory.
993  *
994  * This function works like the normal reserve but does not load the
995  * vm_map_entry cache (because that would result in an infinite
996  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
997  *
998  * Any caller of this function must be sure to renormalize after
999  * potentially eating entries to ensure that the reserve supply
1000  * remains intact.
1001  *
1002  * No requirements.
1003  */
1004 int
1005 vm_map_entry_kreserve(int count)
1006 {
1007 	struct globaldata *gd = mycpu;
1008 
1009 	atomic_add_int(&gd->gd_vme_avail, -count);
1010 	KASSERT(gd->gd_vme_base != NULL,
1011 		("no reserved entries left, gd_vme_avail = %d",
1012 		gd->gd_vme_avail));
1013 	return(count);
1014 }
1015 
1016 /*
1017  * Release previously reserved map entries for kernel_map.  We do not
1018  * attempt to clean up like the normal release function as this would
1019  * cause an unnecessary (but probably not fatal) deep procedure call.
1020  *
1021  * No requirements.
1022  */
1023 void
1024 vm_map_entry_krelease(int count)
1025 {
1026 	struct globaldata *gd = mycpu;
1027 
1028 	atomic_add_int(&gd->gd_vme_avail, count);
1029 }
1030 
1031 /*
1032  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1033  *
1034  * The entries should have previously been reserved.  The reservation count
1035  * is tracked in (*countp).
1036  *
1037  * No requirements.
1038  */
1039 static vm_map_entry_t
1040 vm_map_entry_create(int *countp)
1041 {
1042 	struct globaldata *gd = mycpu;
1043 	vm_map_entry_t entry;
1044 
1045 	KKASSERT(*countp > 0);
1046 	--*countp;
1047 	crit_enter();
1048 	entry = gd->gd_vme_base;
1049 	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1050 	gd->gd_vme_base = MAPENT_FREELIST(entry);
1051 	crit_exit();
1052 
1053 	return(entry);
1054 }
1055 
1056 /*
1057  * Attach a vm_map_backing to its vm_object's backing_list, under the
1058  * object's spinlock.
1058  */
1059 static void
1060 vm_map_backing_attach(vm_map_backing_t ba)
1061 {
1062 	vm_object_t obj = ba->object;
1063 
1064 	spin_lock(&obj->spin);
1065 	TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1066 	spin_unlock(&obj->spin);
1067 }
1068 
1069 static void
1070 vm_map_backing_detach(vm_map_backing_t ba)
1071 {
1072 	vm_object_t obj = ba->object;
1073 
1074 	spin_lock(&obj->spin);
1075 	TAILQ_REMOVE(&obj->backing_list, ba, entry);
1076 	spin_unlock(&obj->spin);
1077 }
1078 
1079 /*
1080  * Dispose of the dynamically allocated backing_ba chain associated
1081  * with a vm_map_entry.
1082  *
1083  * We decrement the (possibly shared) element and kfree() on the
1084  * 1->0 transition.  We only iterate to the next backing_ba when
1085  * the previous one went through a 1->0 transition.
1086  */
1087 static void
1088 vm_map_entry_dispose_ba(vm_map_backing_t ba)
1089 {
1090 	vm_map_backing_t next;
1091 	long refs;
1092 
1093 	while (ba) {
1094 		refs = atomic_fetchadd_long(&ba->refs, -1);
1095 		if (refs > 1)
1096 			break;
1097 		KKASSERT(refs == 1);	/* transitioned 1->0 */
1098 		if (ba->object) {
1099 			vm_map_backing_detach(ba);
1100 			vm_object_deallocate(ba->object);
1101 		}
1102 		next = ba->backing_ba;
1103 		kfree(ba, M_MAP_BACKING);
1104 		ba = next;
1105 	}
1106 }
1107 
1108 /*
1109  * Dispose of a vm_map_entry that is no longer being referenced.
1110  *
1111  * No requirements.
1112  */
1113 static void
1114 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1115 {
1116 	struct globaldata *gd = mycpu;
1117 
1118 	/*
1119 	 * Dispose of the base object and the backing link.
1120 	 */
1121 	switch(entry->maptype) {
1122 	case VM_MAPTYPE_NORMAL:
1123 	case VM_MAPTYPE_VPAGETABLE:
1124 		if (entry->ba.object) {
1125 			vm_map_backing_detach(&entry->ba);
1126 			vm_object_deallocate(entry->ba.object);
1127 		}
1128 		break;
1129 	case VM_MAPTYPE_SUBMAP:
1130 	case VM_MAPTYPE_UKSMAP:
1131 		/* XXX TODO */
1132 		break;
1133 	default:
1134 		break;
1135 	}
1136 	vm_map_entry_dispose_ba(entry->ba.backing_ba);
1137 
1138 	/*
1139 	 * Cleanup for safety.
1140 	 */
1141 	entry->ba.backing_ba = NULL;
1142 	entry->ba.object = NULL;
1143 	entry->ba.offset = 0;
1144 
1145 	++*countp;
1146 	crit_enter();
1147 	MAPENT_FREELIST(entry) = gd->gd_vme_base;
1148 	gd->gd_vme_base = entry;
1149 	crit_exit();
1150 }
1151 
1152 
1153 /*
1154  * Insert/remove entries from maps.
1155  *
1156  * The related map must be exclusively locked.
1157  * The caller must hold map->token
1158  * No other requirements.
1159  */
1160 static __inline void
1161 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1162 {
1163 	ASSERT_VM_MAP_LOCKED(map);
1164 
1165 	map->nentries++;
1166 	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1167 		panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1168 }
1169 
1170 static __inline void
1171 vm_map_entry_unlink(vm_map_t map,
1172 		    vm_map_entry_t entry)
1173 {
1174 	ASSERT_VM_MAP_LOCKED(map);
1175 
1176 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1177 		panic("vm_map_entry_unlink: attempt to mess with "
1178 		      "locked entry! %p", entry);
1179 	}
1180 	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1181 	map->nentries--;
1182 }
1183 
1184 /*
1185  * Finds the map entry containing (or immediately preceding) the specified
1186  * address in the given map.  The entry is returned in (*entry).
1187  *
1188  * The boolean result indicates whether the address is actually contained
1189  * in the map.
1190  *
1191  * The related map must be locked.
1192  * No other requirements.
1193  */
1194 boolean_t
1195 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1196 {
1197 	vm_map_entry_t tmp;
1198 	vm_map_entry_t last;
1199 
1200 	ASSERT_VM_MAP_LOCKED(map);
1201 
1202 	/*
1203 	 * Locate the record from the top of the tree.  'last' tracks the
1204 	 * closest prior record and is returned if no match is found, which
1205 	 * in binary tree terms means tracking the most recent right-branch
1206 	 * taken.  If there is no prior record, *entry is set to NULL.
1207 	 */
1208 	last = NULL;
1209 	tmp = RB_ROOT(&map->rb_root);
1210 
1211 	while (tmp) {
1212 		if (address >= tmp->ba.start) {
1213 			if (address < tmp->ba.end) {
1214 				*entry = tmp;
1215 				return(TRUE);
1216 			}
1217 			last = tmp;
1218 			tmp = RB_RIGHT(tmp, rb_entry);
1219 		} else {
1220 			tmp = RB_LEFT(tmp, rb_entry);
1221 		}
1222 	}
1223 	*entry = last;
1224 	return (FALSE);
1225 }
1226 
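/*
 * Illustrative sketch (not compiled): vm_map_lookup_entry() returns
 * TRUE with *entry set to the containing entry, or FALSE with *entry
 * set to the closest preceding entry (or NULL).  A hypothetical helper
 * that only wants the containing entry could look like this.
 */
#if 0
static vm_map_entry_t
example_lookup_containing(vm_map_t map, vm_offset_t addr)
{
	vm_map_entry_t entry;

	ASSERT_VM_MAP_LOCKED(map);
	if (vm_map_lookup_entry(map, addr, &entry))
		return (entry);
	return (NULL);
}
#endif
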
1227 /*
1228  * Inserts the given whole VM object into the target map at the specified
1229  * address range.  The object's size should match that of the address range.
1230  *
1231  * The map must be exclusively locked.
1232  * The object must be held.
1233  * The caller must have reserved sufficient vm_map_entry structures.
1234  *
1235  * If object is non-NULL, ref count must be bumped by caller prior to
1236  * making call to account for the new entry.  XXX API is a bit messy.
1237  */
1238 int
1239 vm_map_insert(vm_map_t map, int *countp, void *map_object, void *map_aux,
1240 	      vm_ooffset_t offset, vm_offset_t start, vm_offset_t end,
1241 	      vm_maptype_t maptype, vm_subsys_t id,
1242 	      vm_prot_t prot, vm_prot_t max, int cow)
1243 {
1244 	vm_map_entry_t new_entry;
1245 	vm_map_entry_t prev_entry;
1246 	vm_map_entry_t next;
1247 	vm_map_entry_t temp_entry;
1248 	vm_eflags_t protoeflags;
1249 	vm_object_t object;
1250 	int must_drop = 0;
1251 
1252 	if (maptype == VM_MAPTYPE_UKSMAP)
1253 		object = NULL;
1254 	else
1255 		object = map_object;
1256 
1257 	ASSERT_VM_MAP_LOCKED(map);
1258 	if (object)
1259 		ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1260 
1261 	/*
1262 	 * Check that the start and end points are not bogus.
1263 	 */
1264 	if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1265 	    (start >= end)) {
1266 		return (KERN_INVALID_ADDRESS);
1267 	}
1268 
1269 	/*
1270 	 * Find the entry prior to the proposed starting address; if it's part
1271 	 * of an existing entry, this range is bogus.
1272 	 */
1273 	if (vm_map_lookup_entry(map, start, &temp_entry))
1274 		return (KERN_NO_SPACE);
1275 	prev_entry = temp_entry;
1276 
1277 	/*
1278 	 * Assert that the next entry doesn't overlap the end point.
1279 	 */
1280 	if (prev_entry)
1281 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
1282 	else
1283 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1284 	if (next && next->ba.start < end)
1285 		return (KERN_NO_SPACE);
1286 
1287 	protoeflags = 0;
1288 
1289 	if (cow & MAP_COPY_ON_WRITE)
1290 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1291 
1292 	if (cow & MAP_NOFAULT) {
1293 		protoeflags |= MAP_ENTRY_NOFAULT;
1294 
1295 		KASSERT(object == NULL,
1296 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
1297 	}
1298 	if (cow & MAP_DISABLE_SYNCER)
1299 		protoeflags |= MAP_ENTRY_NOSYNC;
1300 	if (cow & MAP_DISABLE_COREDUMP)
1301 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1302 	if (cow & MAP_IS_STACK)
1303 		protoeflags |= MAP_ENTRY_STACK;
1304 	if (cow & MAP_IS_KSTACK)
1305 		protoeflags |= MAP_ENTRY_KSTACK;
1306 
1307 	lwkt_gettoken(&map->token);
1308 
1309 	if (object) {
1310 		;
1311 	} else if (prev_entry &&
1312 		 (prev_entry->eflags == protoeflags) &&
1313 		 (prev_entry->ba.end == start) &&
1314 		 (prev_entry->wired_count == 0) &&
1315 		 (prev_entry->id == id) &&
1316 		 prev_entry->maptype == maptype &&
1317 		 maptype == VM_MAPTYPE_NORMAL &&
1318 		 prev_entry->ba.backing_ba == NULL &&	/* not backed */
1319 		 ((prev_entry->ba.object == NULL) ||
1320 		  vm_object_coalesce(prev_entry->ba.object,
1321 				     OFF_TO_IDX(prev_entry->ba.offset),
1322 				     (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1323 				     (vm_size_t)(end - prev_entry->ba.end)))) {
1324 		/*
1325 		 * We were able to extend the object.  Determine if we
1326 		 * can extend the previous map entry to include the
1327 		 * new range as well.
1328 		 */
1329 		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1330 		    (prev_entry->protection == prot) &&
1331 		    (prev_entry->max_protection == max)) {
1332 			map->size += (end - prev_entry->ba.end);
1333 			vm_map_backing_adjust_end(prev_entry, end);
1334 			vm_map_simplify_entry(map, prev_entry, countp);
1335 			lwkt_reltoken(&map->token);
1336 			return (KERN_SUCCESS);
1337 		}
1338 
1339 		/*
1340 		 * If we can extend the object but cannot extend the
1341 		 * map entry, we have to create a new map entry.  We
1342 		 * must bump the ref count on the extended object to
1343 		 * account for it.  object may be NULL.
1344 		 */
1345 		object = prev_entry->ba.object;
1346 		offset = prev_entry->ba.offset +
1347 			(prev_entry->ba.end - prev_entry->ba.start);
1348 		if (object) {
1349 			vm_object_hold(object);
1350 			vm_object_lock_swap(); /* map->token order */
1351 			vm_object_reference_locked(object);
1352 			map_object = object;
1353 			must_drop = 1;
1354 		}
1355 	}
1356 
1357 	/*
1358 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
1359 	 * in things like the buffer map where we manage kva but do not manage
1360 	 * backing objects.
1361 	 */
1362 
1363 	/*
1364 	 * Create a new entry
1365 	 */
1366 	new_entry = vm_map_entry_create(countp);
1367 	new_entry->ba.pmap = map->pmap;
1368 	new_entry->ba.start = start;
1369 	new_entry->ba.end = end;
1370 	new_entry->id = id;
1371 
1372 	new_entry->maptype = maptype;
1373 	new_entry->eflags = protoeflags;
1374 	new_entry->aux.master_pde = 0;		/* in case size is different */
1375 	new_entry->aux.map_aux = map_aux;
1376 	new_entry->ba.map_object = map_object;
1377 	new_entry->ba.backing_ba = NULL;
1378 	new_entry->ba.backing_count = 0;
1379 	new_entry->ba.offset = offset;
1380 	new_entry->ba.refs = 0;
1381 	new_entry->ba.flags = 0;
1382 	new_entry->ba.pmap = map->pmap;
1383 
1384 	new_entry->inheritance = VM_INHERIT_DEFAULT;
1385 	new_entry->protection = prot;
1386 	new_entry->max_protection = max;
1387 	new_entry->wired_count = 0;
1388 
1389 	/*
1390 	 * Insert the new entry into the list
1391 	 */
1392 	vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1393 	vm_map_entry_link(map, new_entry);
1394 	map->size += new_entry->ba.end - new_entry->ba.start;
1395 
1396 	/*
1397 	 * Don't worry about updating freehint[] when inserting, allow
1398 	 * addresses to be lower than the actual first free spot.
1399 	 */
1400 #if 0
1401 	/*
1402 	 * Temporarily removed to avoid MAP_STACK panic, due to
1403 	 * MAP_STACK being a huge hack.  Will be added back in
1404 	 * when MAP_STACK (and the user stack mapping) is fixed.
1405 	 */
1406 	/*
1407 	 * It may be possible to simplify the entry
1408 	 */
1409 	vm_map_simplify_entry(map, new_entry, countp);
1410 #endif
1411 
1412 	/*
1413 	 * Try to pre-populate the page table.  Mappings governed by virtual
1414 	 * page tables cannot be prepopulated without a lot of work, so
1415 	 * don't try.
1416 	 */
1417 	if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1418 	    maptype != VM_MAPTYPE_VPAGETABLE &&
1419 	    maptype != VM_MAPTYPE_UKSMAP) {
1420 		int dorelock = 0;
1421 		if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1422 			dorelock = 1;
1423 			vm_object_lock_swap();
1424 			vm_object_drop(object);
1425 		}
1426 		pmap_object_init_pt(map->pmap, new_entry,
1427 				    new_entry->ba.start,
1428 				    new_entry->ba.end - new_entry->ba.start,
1429 				    cow & MAP_PREFAULT_PARTIAL);
1430 		if (dorelock) {
1431 			vm_object_hold(object);
1432 			vm_object_lock_swap();
1433 		}
1434 	}
1435 	lwkt_reltoken(&map->token);
1436 	if (must_drop)
1437 		vm_object_drop(object);
1438 
1439 	return (KERN_SUCCESS);
1440 }
1441 
1442 /*
1443  * Find sufficient space for `length' bytes in the given map, starting at
1444  * `start'.  Returns 0 on success, 1 on no space.
1445  *
1446  * This function will return an arbitrarily aligned pointer.  If no
1447  * particular alignment is required you should pass align as 1.  Note that
1448  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1449  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1450  * argument.
1451  *
1452  * 'align' should be a power of 2 but is not required to be.
1453  *
1454  * The map must be exclusively locked.
1455  * No other requirements.
1456  */
1457 int
1458 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1459 		 vm_size_t align, int flags, vm_offset_t *addr)
1460 {
1461 	vm_map_entry_t entry;
1462 	vm_map_entry_t tmp;
1463 	vm_offset_t hole_start;
1464 	vm_offset_t end;
1465 	vm_offset_t align_mask;
1466 
1467 	if (start < vm_map_min(map))
1468 		start = vm_map_min(map);
1469 	if (start > vm_map_max(map))
1470 		return (1);
1471 
1472 	/*
1473 	 * If the alignment is not a power of 2 we will have to use
1474 	 * a mod/division, set align_mask to a special value.
1475 	 */
1476 	if ((align | (align - 1)) + 1 != (align << 1))
1477 		align_mask = (vm_offset_t)-1;
1478 	else
1479 		align_mask = align - 1;
1480 
1481 	/*
1482 	 * Use freehint to adjust the start point, hopefully reducing
1483 	 * the iteration to O(1).
1484 	 */
1485 	hole_start = vm_map_freehint_find(map, length, align);
1486 	if (start < hole_start)
1487 		start = hole_start;
1488 	if (vm_map_lookup_entry(map, start, &tmp))
1489 		start = tmp->ba.end;
1490 	entry = tmp;	/* may be NULL */
1491 
1492 	/*
1493 	 * Look through the rest of the map, trying to fit a new region in the
1494 	 * gap between existing regions, or after the very last region.
1495 	 */
1496 	for (;;) {
1497 		/*
1498 		 * Adjust the proposed start by the requested alignment,
1499 		 * be sure that we didn't wrap the address.
1500 		 */
1501 		if (align_mask == (vm_offset_t)-1)
1502 			end = roundup(start, align);
1503 		else
1504 			end = (start + align_mask) & ~align_mask;
1505 		if (end < start)
1506 			return (1);
1507 		start = end;
1508 
1509 		/*
1510 		 * Find the end of the proposed new region.  Be sure we didn't
1511 		 * go beyond the end of the map, or wrap around the address.
1512 		 * Then check to see if this is the last entry or if the
1513 		 * proposed end fits in the gap between this and the next
1514 		 * entry.
1515 		 */
1516 		end = start + length;
1517 		if (end > vm_map_max(map) || end < start)
1518 			return (1);
1519 
1520 		/*
1521 		 * Locate the next entry, we can stop if this is the
1522 		 * Locate the next entry; we can stop if this is the
1523 		 * last entry (we know we are in-bounds so that would
1524 		 * be a success).
1525 		if (entry)
1526 			entry = vm_map_rb_tree_RB_NEXT(entry);
1527 		else
1528 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1529 		if (entry == NULL)
1530 			break;
1531 
1532 		/*
1533 		 * Determine if the proposed area would overlap the
1534 		 * next entry.
1535 		 *
1536 		 * When matching against a STACK entry, only allow the
1537 		 * memory map to intrude on the ungrown portion of the
1538 		 * STACK entry when MAP_TRYFIXED is set.
1539 		 */
1540 		if (entry->ba.start >= end) {
1541 			if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1542 				break;
1543 			if (flags & MAP_TRYFIXED)
1544 				break;
1545 			if (entry->ba.start - entry->aux.avail_ssize >= end)
1546 				break;
1547 		}
1548 		start = entry->ba.end;
1549 	}
1550 
1551 	/*
1552 	 * Update the freehint
1553 	 */
1554 	vm_map_freehint_update(map, start, length, align);
1555 
1556 	/*
1557 	 * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1558 	 * if it fails.  The kernel_map is locked and nothing can steal
1559 	 * our address space if pmap_growkernel() blocks.
1560 	 *
1561 	 * NOTE: This may be unconditionally called for kldload areas on
1562 	 *	 x86_64 because these do not bump kernel_vm_end (which would
1563 	 *	 fill 128G worth of page tables!).  Therefore we must not
1564 	 *	 retry.
1565 	 */
1566 	if (map == &kernel_map) {
1567 		vm_offset_t kstop;
1568 
1569 		kstop = round_page(start + length);
1570 		if (kstop > kernel_vm_end)
1571 			pmap_growkernel(start, kstop);
1572 	}
1573 	*addr = start;
1574 	return (0);
1575 }
1576 
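/*
 * Illustrative sketch (not compiled): a hypothetical caller holding the
 * map lock probes for a PAGE_SIZE-aligned hole anywhere in the map;
 * pass align as 1 when no particular alignment is required, as
 * described above.
 */
#if 0
static int
example_find_hole(vm_map_t map, vm_size_t length, vm_offset_t *addrp)
{
	ASSERT_VM_MAP_LOCKED(map);
	return (vm_map_findspace(map, vm_map_min(map), length,
				 PAGE_SIZE, 0, addrp));
}
#endif
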
1577 /*
1578  * vm_map_find finds an unallocated region in the target address map with
1579  * the given length and allocates it.  The search is defined to be first-fit
1580  * from the specified address; the region found is returned in the same
1581  * parameter.
1582  *
1583  * If object is non-NULL, ref count must be bumped by caller
1584  * prior to making call to account for the new entry.
1585  *
1586  * No requirements.  This function will lock the map temporarily.
1587  */
1588 int
1589 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1590 	    vm_ooffset_t offset, vm_offset_t *addr,
1591 	    vm_size_t length, vm_size_t align, boolean_t fitit,
1592 	    vm_maptype_t maptype, vm_subsys_t id,
1593 	    vm_prot_t prot, vm_prot_t max, int cow)
1594 {
1595 	vm_offset_t start;
1596 	vm_object_t object;
1597 	int result;
1598 	int count;
1599 
1600 	if (maptype == VM_MAPTYPE_UKSMAP)
1601 		object = NULL;
1602 	else
1603 		object = map_object;
1604 
1605 	start = *addr;
1606 
1607 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1608 	vm_map_lock(map);
1609 	if (object)
1610 		vm_object_hold_shared(object);
1611 	if (fitit) {
1612 		if (vm_map_findspace(map, start, length, align, 0, addr)) {
1613 			if (object)
1614 				vm_object_drop(object);
1615 			vm_map_unlock(map);
1616 			vm_map_entry_release(count);
1617 			return (KERN_NO_SPACE);
1618 		}
1619 		start = *addr;
1620 	}
1621 	result = vm_map_insert(map, &count, map_object, map_aux,
1622 			       offset, start, start + length,
1623 			       maptype, id, prot, max, cow);
1624 	if (object)
1625 		vm_object_drop(object);
1626 	vm_map_unlock(map);
1627 	vm_map_entry_release(count);
1628 
1629 	return (result);
1630 }
1631 
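/*
 * Illustrative sketch (not compiled): a minimal anonymous mapping via
 * vm_map_find().  The object argument is NULL (the object is allocated
 * lazily, see vm_map_entry_allocate_object()), fitit is TRUE so *addrp
 * is only a search hint, and VM_SUBSYS_UNKNOWN is assumed here purely
 * as a placeholder subsystem id.
 */
#if 0
static int
example_map_anon(vm_map_t map, vm_offset_t *addrp, vm_size_t size)
{
	return (vm_map_find(map, NULL, NULL, 0, addrp, size,
			    PAGE_SIZE, TRUE,
			    VM_MAPTYPE_NORMAL, VM_SUBSYS_UNKNOWN,
			    VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif
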
1632 /*
1633  * Simplify the given map entry by merging with either neighbor.  This
1634  * routine also has the ability to merge with both neighbors.
1635  *
1636  * This routine guarantees that the passed entry remains valid (though
1637  * possibly extended).  When merging, this routine may delete one or
1638  * both neighbors.  No action is taken on entries which have their
1639  * in-transition flag set.
1640  *
1641  * The map must be exclusively locked.
1642  */
1643 void
1644 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1645 {
1646 	vm_map_entry_t next, prev;
1647 	vm_size_t prevsize, esize;
1648 
1649 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1650 		++mycpu->gd_cnt.v_intrans_coll;
1651 		return;
1652 	}
1653 
1654 	if (entry->maptype == VM_MAPTYPE_SUBMAP)
1655 		return;
1656 	if (entry->maptype == VM_MAPTYPE_UKSMAP)
1657 		return;
1658 
1659 	prev = vm_map_rb_tree_RB_PREV(entry);
1660 	if (prev) {
1661 		prevsize = prev->ba.end - prev->ba.start;
1662 		if ( (prev->ba.end == entry->ba.start) &&
1663 		     (prev->maptype == entry->maptype) &&
1664 		     (prev->ba.object == entry->ba.object) &&
1665 		     (prev->ba.backing_ba == entry->ba.backing_ba) &&
1666 		     (!prev->ba.object ||
1667 			(prev->ba.offset + prevsize == entry->ba.offset)) &&
1668 		     (prev->eflags == entry->eflags) &&
1669 		     (prev->protection == entry->protection) &&
1670 		     (prev->max_protection == entry->max_protection) &&
1671 		     (prev->inheritance == entry->inheritance) &&
1672 		     (prev->id == entry->id) &&
1673 		     (prev->wired_count == entry->wired_count)) {
1674 			/*
1675 			 * NOTE: order important.  Unlink before gumming up
1676 			 *	 the RBTREE w/adjust, adjust before disposal
1677 			 *	 of prior entry, to avoid pmap snafus.
1678 			 */
1679 			vm_map_entry_unlink(map, prev);
1680 			vm_map_backing_adjust_start(entry, prev->ba.start);
1681 			if (entry->ba.object == NULL)
1682 				entry->ba.offset = 0;
1683 			vm_map_entry_dispose(map, prev, countp);
1684 		}
1685 	}
1686 
1687 	next = vm_map_rb_tree_RB_NEXT(entry);
1688 	if (next) {
1689 		esize = entry->ba.end - entry->ba.start;
1690 		if ((entry->ba.end == next->ba.start) &&
1691 		    (next->maptype == entry->maptype) &&
1692 		    (next->ba.object == entry->ba.object) &&
1693 		    (next->ba.backing_ba == entry->ba.backing_ba) &&
1694 		    (!entry->ba.object ||
1695 			(entry->ba.offset + esize == next->ba.offset)) &&
1696 		    (next->eflags == entry->eflags) &&
1697 		    (next->protection == entry->protection) &&
1698 		    (next->max_protection == entry->max_protection) &&
1699 		    (next->inheritance == entry->inheritance) &&
1700 		    (next->id == entry->id) &&
1701 		    (next->wired_count == entry->wired_count)) {
1702 			/*
1703 			 * NOTE: order important.  Unlink before gumming up
1704 			 *	 the RBTREE w/adjust, adjust before disposal
1705 			 *	 of prior entry, to avoid pmap snafus.
1706 			 */
1707 			vm_map_entry_unlink(map, next);
1708 			vm_map_backing_adjust_end(entry, next->ba.end);
1709 			vm_map_entry_dispose(map, next, countp);
1710 	        }
1711 	}
1712 }
1713 
1714 /*
1715  * Asserts that the given entry begins at or after the specified address.
1716  * If necessary, it splits the entry into two.
1717  */
1718 #define vm_map_clip_start(map, entry, startaddr, countp)		\
1719 {									\
1720 	if (startaddr > entry->ba.start)				\
1721 		_vm_map_clip_start(map, entry, startaddr, countp);	\
1722 }
1723 
1724 /*
1725  * This routine is called only when it is known that the entry must be split.
1726  *
1727  * The map must be exclusively locked.
1728  */
1729 static void
1730 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1731 		   int *countp)
1732 {
1733 	vm_map_entry_t new_entry;
1734 
1735 	/*
1736 	 * Split off the front portion -- note that we must insert the new
1737 	 * entry BEFORE this one, so that this entry has the specified
1738 	 * starting address.
1739 	 */
1740 
1741 	vm_map_simplify_entry(map, entry, countp);
1742 
1743 	/*
1744 	 * If there is no object backing this entry, we might as well create
1745 	 * one now.  If we defer it, an object can get created after the map
1746 	 * is clipped, and individual objects will be created for the split-up
1747 	 * map.  This is a bit of a hack, but is also about the best place to
1748 	 * put this improvement.
1749 	 */
1750 	if (entry->ba.object == NULL && !map->system_map &&
1751 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1752 		vm_map_entry_allocate_object(entry);
1753 	}
1754 
1755 	/*
1756 	 * NOTE: The replicated function will adjust start, end, and offset
1757 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1758 	 *	 the embedded ba.
1759 	 */
1760 	new_entry = vm_map_entry_create(countp);
1761 	*new_entry = *entry;
1762 	new_entry->ba.end = start;
1763 
1764 	/*
1765 	 * Ordering is important, make sure the new entry is replicated
1766 	 * before we cut the existing entry.
1767 	 */
1768 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1769 	vm_map_backing_adjust_start(entry, start);
1770 	vm_map_entry_link(map, new_entry);
1771 }
1772 
1773 /*
1774  * Asserts that the given entry ends at or before the specified address.
1775  * If necessary, it splits the entry into two.
1776  *
1777  * The map must be exclusively locked.
1778  */
1779 #define vm_map_clip_end(map, entry, endaddr, countp)		\
1780 {								\
1781 	if (endaddr < entry->ba.end)				\
1782 		_vm_map_clip_end(map, entry, endaddr, countp);	\
1783 }
1784 
1785 /*
1786  * This routine is called only when it is known that the entry must be split.
1787  *
1788  * The map must be exclusively locked.
1789  */
1790 static void
1791 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1792 		 int *countp)
1793 {
1794 	vm_map_entry_t new_entry;
1795 
1796 	/*
1797 	 * If there is no object backing this entry, we might as well create
1798 	 * one now.  If we defer it, an object can get created after the map
1799 	 * is clipped, and individual objects will be created for the split-up
1800 	 * map.  This is a bit of a hack, but is also about the best place to
1801 	 * put this improvement.
1802 	 */
1803 
1804 	if (entry->ba.object == NULL && !map->system_map &&
1805 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1806 		vm_map_entry_allocate_object(entry);
1807 	}
1808 
1809 	/*
1810 	 * Create a new entry and insert it AFTER the specified entry
1811 	 *
1812 	 * NOTE: The replicated function will adjust start, end, and offset
1813 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1814 	 *	 the embedded ba.
1815 	 */
1816 	new_entry = vm_map_entry_create(countp);
1817 	*new_entry = *entry;
1818 	new_entry->ba.start = end;
1819 	new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1820 
1821 	/*
1822 	 * Ordering is important, make sure the new entry is replicated
1823 	 * before we cut the existing entry.
1824 	 */
1825 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1826 	vm_map_backing_adjust_end(entry, end);
1827 	vm_map_entry_link(map, new_entry);
1828 }
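
/*
 * Editorial usage sketch (hypothetical locals): callers bracket a range
 * with both clip operations so the first and last covered entries align
 * exactly with [start, end), e.g.
 *
 *	if (vm_map_lookup_entry(map, start, &entry))
 *		vm_map_clip_start(map, entry, start, &count);
 *	...
 *	vm_map_clip_end(map, entry, end, &count);
 *
 * mirroring the pattern used by vm_map_submap() and vm_map_protect()
 * below.
 */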
1829 
1830 /*
1831  * Asserts that the starting and ending region addresses fall within the
1832  * valid range for the map.
1833  */
1834 #define	VM_MAP_RANGE_CHECK(map, start, end)	\
1835 {						\
1836 	if (start < vm_map_min(map))		\
1837 		start = vm_map_min(map);	\
1838 	if (end > vm_map_max(map))		\
1839 		end = vm_map_max(map);		\
1840 	if (start > end)			\
1841 		start = end;			\
1842 }
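
/*
 * Editorial worked example (hypothetical values): with vm_map_min(map)
 * == 0x1000 and vm_map_max(map) == 0x9000, a request of
 * [0x0500, 0xa000) is clamped to [0x1000, 0x9000), and a request lying
 * entirely above the map, such as [0xb000, 0xc000), degenerates to the
 * empty range [0x9000, 0x9000).
 */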
1843 
1844 /*
1845  * Used to block when an in-transition collision occurs.  The map
1846  * is unlocked for the sleep and relocked before the return.
1847  */
1848 void
1849 vm_map_transition_wait(vm_map_t map, int relock)
1850 {
1851 	tsleep_interlock(map, 0);
1852 	vm_map_unlock(map);
1853 	tsleep(map, PINTERLOCKED, "vment", 0);
1854 	if (relock)
1855 		vm_map_lock(map);
1856 }
1857 
1858 /*
1859  * When we do blocking operations with the map lock held it is
1860  * possible that a clip might have occurred on our in-transit entry,
1861  * requiring an adjustment to the entry in our loop.  These macros
1862  * help the pageable and clip_range code deal with the case.  The
1863  * conditional costs virtually nothing if no clipping has occurred.
1864  */
1865 
1866 #define CLIP_CHECK_BACK(entry, save_start)			\
1867     do {							\
1868 	    while (entry->ba.start != save_start) {		\
1869 		    entry = vm_map_rb_tree_RB_PREV(entry);	\
1870 		    KASSERT(entry, ("bad entry clip")); 	\
1871 	    }							\
1872     } while(0)
1873 
1874 #define CLIP_CHECK_FWD(entry, save_end)				\
1875     do {							\
1876 	    while (entry->ba.end != save_end) {			\
1877 		    entry = vm_map_rb_tree_RB_NEXT(entry);	\
1878 		    KASSERT(entry, ("bad entry clip")); 	\
1879 	    }							\
1880     } while(0)
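
/*
 * Editorial usage sketch (hypothetical locals): a caller that blocks
 * while holding an in-transition entry saves the boundaries it cares
 * about and re-synchronizes afterwards, as the loops below do:
 *
 *	save_end = entry->ba.end;
 *	vm_map_transition_wait(map, 1);		(may sleep, clips may occur)
 *	CLIP_CHECK_FWD(entry, save_end);	(walk forward over any clips)
 *	CLIP_CHECK_BACK(start_entry, start);	(re-anchor the base entry)
 */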
1881 
1882 
1883 /*
1884  * Clip the specified range and return the base entry.  The
1885  * range may cover several entries starting at the returned base
1886  * and the first and last entry in the covering sequence will be
1887  * properly clipped to the requested start and end address.
1888  *
1889  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1890  * flag.
1891  *
1892  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1893  * covered by the requested range.
1894  *
1895  * The map must be exclusively locked on entry and will remain locked
1896  * on return. If no range exists or the range contains holes and you
1897  * specified that no holes were allowed, NULL will be returned.  This
1898  * routine may temporarily unlock the map in order to avoid a deadlock when
1899  * sleeping.
1900  */
1901 static
1902 vm_map_entry_t
1903 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1904 		  int *countp, int flags)
1905 {
1906 	vm_map_entry_t start_entry;
1907 	vm_map_entry_t entry;
1908 	vm_map_entry_t next;
1909 
1910 	/*
1911 	 * Locate the entry and effect initial clipping.  The in-transition
1912 	 * case does not occur very often so do not try to optimize it.
1913 	 */
1914 again:
1915 	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1916 		return (NULL);
1917 	entry = start_entry;
1918 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1919 		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1920 		++mycpu->gd_cnt.v_intrans_coll;
1921 		++mycpu->gd_cnt.v_intrans_wait;
1922 		vm_map_transition_wait(map, 1);
1923 		/*
1924 		 * entry and/or start_entry may have been clipped while
1925 		 * we slept, or may have gone away entirely.  We have
1926 		 * to restart from the lookup.
1927 		 */
1928 		goto again;
1929 	}
1930 
1931 	/*
1932 	 * Since we hold an exclusive map lock we do not have to restart
1933 	 * after clipping, even though clipping may block in zalloc.
1934 	 */
1935 	vm_map_clip_start(map, entry, start, countp);
1936 	vm_map_clip_end(map, entry, end, countp);
1937 	entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1938 
1939 	/*
1940 	 * Scan entries covered by the range.  When working on the next
1941 	 * entry a restart need only re-loop on the current entry which
1942 	 * we have already locked, since 'next' may have changed.  Also,
1943 	 * even though entry is safe, it may have been clipped so we
1944 	 * have to iterate forwards through the clip after sleeping.
1945 	 */
1946 	for (;;) {
1947 		next = vm_map_rb_tree_RB_NEXT(entry);
1948 		if (next == NULL || next->ba.start >= end)
1949 			break;
1950 		if (flags & MAP_CLIP_NO_HOLES) {
1951 			if (next->ba.start > entry->ba.end) {
1952 				vm_map_unclip_range(map, start_entry,
1953 					start, entry->ba.end, countp, flags);
1954 				return(NULL);
1955 			}
1956 		}
1957 
1958 		if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1959 			vm_offset_t save_end = entry->ba.end;
1960 			next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1961 			++mycpu->gd_cnt.v_intrans_coll;
1962 			++mycpu->gd_cnt.v_intrans_wait;
1963 			vm_map_transition_wait(map, 1);
1964 
1965 			/*
1966 			 * clips might have occurred while we blocked.
1967 			 */
1968 			CLIP_CHECK_FWD(entry, save_end);
1969 			CLIP_CHECK_BACK(start_entry, start);
1970 			continue;
1971 		}
1972 
1973 		/*
1974 		 * No restart necessary even though clip_end may block, we
1975 		 * are holding the map lock.
1976 		 */
1977 		vm_map_clip_end(map, next, end, countp);
1978 		next->eflags |= MAP_ENTRY_IN_TRANSITION;
1979 		entry = next;
1980 	}
1981 	if (flags & MAP_CLIP_NO_HOLES) {
1982 		if (entry->ba.end != end) {
1983 			vm_map_unclip_range(map, start_entry,
1984 				start, entry->ba.end, countp, flags);
1985 			return(NULL);
1986 		}
1987 	}
1988 	return(start_entry);
1989 }
1990 
1991 /*
1992  * Undo the effect of vm_map_clip_range().  You should pass the same
1993  * flags and the same range that you passed to vm_map_clip_range().
1994  * This code will clear the in-transition flag on the entries and
1995  * wake up anyone waiting.  This code will also simplify the sequence
1996  * and attempt to merge it with entries before and after the sequence.
1997  *
1998  * The map must be locked on entry and will remain locked on return.
1999  *
2000  * Note that you should also pass the start_entry returned by
2001  * vm_map_clip_range().  However, if you block between the two calls
2002  * with the map unlocked please be aware that the start_entry may
2003  * have been clipped and you may need to scan it backwards to find
2004  * the entry corresponding with the original start address.  You are
2005  * responsible for this; vm_map_unclip_range() expects the correct
2006  * start_entry to be passed to it and will KASSERT otherwise.
2007  */
2008 static
2009 void
2010 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2011 		    vm_offset_t start, vm_offset_t end,
2012 		    int *countp, int flags)
2013 {
2014 	vm_map_entry_t entry;
2015 
2016 	entry = start_entry;
2017 
2018 	KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2019 	while (entry && entry->ba.start < end) {
2020 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2021 			("in-transition flag not set during unclip on: %p",
2022 			entry));
2023 		KASSERT(entry->ba.end <= end,
2024 			("unclip_range: tail wasn't clipped"));
2025 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2026 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2027 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2028 			wakeup(map);
2029 		}
2030 		entry = vm_map_rb_tree_RB_NEXT(entry);
2031 	}
2032 
2033 	/*
2034 	 * Simplification does not block so there is no restart case.
2035 	 */
2036 	entry = start_entry;
2037 	while (entry && entry->ba.start < end) {
2038 		vm_map_simplify_entry(map, entry, countp);
2039 		entry = vm_map_rb_tree_RB_NEXT(entry);
2040 	}
2041 }
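
/*
 * Editorial usage sketch: vm_map_clip_range() and vm_map_unclip_range()
 * are used as a bracket around a potentially blocking operation, as in
 * vm_map_unwire() and vm_map_wire() below (names hypothetical):
 *
 *	start_entry = vm_map_clip_range(map, start, end, &count,
 *					MAP_CLIP_NO_HOLES);
 *	if (start_entry == NULL)
 *		return (KERN_INVALID_ADDRESS);
 *	... operate on the covered, in-transition entries ...
 *	vm_map_unclip_range(map, start_entry, start, end, &count,
 *			    MAP_CLIP_NO_HOLES);
 */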
2042 
2043 /*
2044  * Mark the given range as handled by a subordinate map.
2045  *
2046  * This range must have been created with vm_map_find(), and no other
2047  * operations may have been performed on this range prior to calling
2048  * vm_map_submap().
2049  *
2050  * Submappings cannot be removed.
2051  *
2052  * No requirements.
2053  */
2054 int
2055 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2056 {
2057 	vm_map_entry_t entry;
2058 	int result = KERN_INVALID_ARGUMENT;
2059 	int count;
2060 
2061 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2062 	vm_map_lock(map);
2063 
2064 	VM_MAP_RANGE_CHECK(map, start, end);
2065 
2066 	if (vm_map_lookup_entry(map, start, &entry)) {
2067 		vm_map_clip_start(map, entry, start, &count);
2068 	} else if (entry) {
2069 		entry = vm_map_rb_tree_RB_NEXT(entry);
2070 	} else {
2071 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2072 	}
2073 
2074 	vm_map_clip_end(map, entry, end, &count);
2075 
2076 	if ((entry->ba.start == start) && (entry->ba.end == end) &&
2077 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2078 	    (entry->ba.object == NULL)) {
2079 		entry->ba.sub_map = submap;
2080 		entry->maptype = VM_MAPTYPE_SUBMAP;
2081 		result = KERN_SUCCESS;
2082 	}
2083 	vm_map_unlock(map);
2084 	vm_map_entry_release(count);
2085 
2086 	return (result);
2087 }
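
/*
 * Editorial sketch of a hypothetical caller: a range previously set up
 * with vm_map_find() is handed over to a subordinate map.  The names
 * are illustrative only.
 *
 *	if (vm_map_submap(parent_map, base, base + size, child_map) !=
 *	    KERN_SUCCESS) {
 *		panic("cannot install submap");
 *	}
 */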
2088 
2089 /*
2090  * Sets the protection of the specified address region in the target map.
2091  * If "set_max" is specified, the maximum protection is to be set;
2092  * otherwise, only the current protection is affected.
2093  *
2094  * The protection is not applicable to submaps, but is applicable to normal
2095  * maps and maps governed by virtual page tables.  For example, when operating
2096  * on a virtual page table our protection basically controls how COW occurs
2097  * on the backing object, whereas the virtual page table itself is an
2098  * abstraction provided to userland.
2099  *
2100  * No requirements.
2101  */
2102 int
2103 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2104 	       vm_prot_t new_prot, boolean_t set_max)
2105 {
2106 	vm_map_entry_t current;
2107 	vm_map_entry_t entry;
2108 	int count;
2109 
2110 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2111 	vm_map_lock(map);
2112 
2113 	VM_MAP_RANGE_CHECK(map, start, end);
2114 
2115 	if (vm_map_lookup_entry(map, start, &entry)) {
2116 		vm_map_clip_start(map, entry, start, &count);
2117 	} else if (entry) {
2118 		entry = vm_map_rb_tree_RB_NEXT(entry);
2119 	} else {
2120 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2121 	}
2122 
2123 	/*
2124 	 * Make a first pass to check for protection violations.
2125 	 */
2126 	current = entry;
2127 	while (current && current->ba.start < end) {
2128 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2129 			vm_map_unlock(map);
2130 			vm_map_entry_release(count);
2131 			return (KERN_INVALID_ARGUMENT);
2132 		}
2133 		if ((new_prot & current->max_protection) != new_prot) {
2134 			vm_map_unlock(map);
2135 			vm_map_entry_release(count);
2136 			return (KERN_PROTECTION_FAILURE);
2137 		}
2138 
2139 		/*
2140 		 * When making a SHARED+RW file mmap writable, update
2141 		 * v_lastwrite_ts.
2142 		 */
2143 		if (new_prot & PROT_WRITE &&
2144 		    (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2145 		    (current->maptype == VM_MAPTYPE_NORMAL ||
2146 		     current->maptype == VM_MAPTYPE_VPAGETABLE) &&
2147 		    current->ba.object &&
2148 		    current->ba.object->type == OBJT_VNODE) {
2149 			struct vnode *vp;
2150 
2151 			vp = current->ba.object->handle;
2152 			if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2153 				vfs_timestamp(&vp->v_lastwrite_ts);
2154 				vsetflags(vp, VLASTWRITETS);
2155 				vn_unlock(vp);
2156 			}
2157 		}
2158 		current = vm_map_rb_tree_RB_NEXT(current);
2159 	}
2160 
2161 	/*
2162 	 * Go back and fix up protections. [Note that clipping is not
2163 	 * necessary the second time.]
2164 	 */
2165 	current = entry;
2166 
2167 	while (current && current->ba.start < end) {
2168 		vm_prot_t old_prot;
2169 
2170 		vm_map_clip_end(map, current, end, &count);
2171 
2172 		old_prot = current->protection;
2173 		if (set_max) {
2174 			current->max_protection = new_prot;
2175 			current->protection = new_prot & old_prot;
2176 		} else {
2177 			current->protection = new_prot;
2178 		}
2179 
2180 		/*
2181 		 * Update physical map if necessary. Worry about copy-on-write
2182 		 * here -- CHECK THIS XXX
2183 		 */
2184 		if (current->protection != old_prot) {
2185 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2186 							VM_PROT_ALL)
2187 
2188 			pmap_protect(map->pmap, current->ba.start,
2189 			    current->ba.end,
2190 			    current->protection & MASK(current));
2191 #undef	MASK
2192 		}
2193 
2194 		vm_map_simplify_entry(map, current, &count);
2195 
2196 		current = vm_map_rb_tree_RB_NEXT(current);
2197 	}
2198 	vm_map_unlock(map);
2199 	vm_map_entry_release(count);
2200 	return (KERN_SUCCESS);
2201 }
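
/*
 * Editorial usage sketch (hypothetical range): dropping the current
 * protection versus clamping the maximum protection.
 *
 *	vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *		(current protection becomes read-only, max unchanged)
 *
 *	vm_map_protect(map, start, end, VM_PROT_READ | VM_PROT_EXECUTE, TRUE);
 *		(max_protection is clamped and the current protection is
 *		 intersected with the new maximum)
 */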
2202 
2203 /*
2204  * This routine traverses a process's map handling the madvise
2205  * system call.  Advisories are classified as either those affecting
2206  * the vm_map_entry structure, or those affecting the underlying
2207  * objects.
2208  *
2209  * The <value> argument is used for extended madvise calls.
2210  *
2211  * No requirements.
2212  */
2213 int
2214 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2215 	       int behav, off_t value)
2216 {
2217 	vm_map_entry_t current, entry;
2218 	int modify_map = 0;
2219 	int error = 0;
2220 	int count;
2221 
2222 	/*
2223 	 * Some madvise calls directly modify the vm_map_entry, in which case
2224 	 * we need to use an exclusive lock on the map and we need to perform
2225 	 * various clipping operations.  Otherwise we only need a read-lock
2226 	 * on the map.
2227 	 */
2228 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2229 
2230 	switch(behav) {
2231 	case MADV_NORMAL:
2232 	case MADV_SEQUENTIAL:
2233 	case MADV_RANDOM:
2234 	case MADV_NOSYNC:
2235 	case MADV_AUTOSYNC:
2236 	case MADV_NOCORE:
2237 	case MADV_CORE:
2238 	case MADV_SETMAP:
2239 		modify_map = 1;
2240 		vm_map_lock(map);
2241 		break;
2242 	case MADV_INVAL:
2243 	case MADV_WILLNEED:
2244 	case MADV_DONTNEED:
2245 	case MADV_FREE:
2246 		vm_map_lock_read(map);
2247 		break;
2248 	default:
2249 		vm_map_entry_release(count);
2250 		return (EINVAL);
2251 	}
2252 
2253 	/*
2254 	 * Locate starting entry and clip if necessary.
2255 	 */
2256 
2257 	VM_MAP_RANGE_CHECK(map, start, end);
2258 
2259 	if (vm_map_lookup_entry(map, start, &entry)) {
2260 		if (modify_map)
2261 			vm_map_clip_start(map, entry, start, &count);
2262 	} else if (entry) {
2263 		entry = vm_map_rb_tree_RB_NEXT(entry);
2264 	} else {
2265 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2266 	}
2267 
2268 	if (modify_map) {
2269 		/*
2270 		 * madvise behaviors that are implemented in the vm_map_entry.
2271 		 *
2272 		 * We clip the vm_map_entry so that behavioral changes are
2273 		 * limited to the specified address range.
2274 		 */
2275 		for (current = entry;
2276 		     current && current->ba.start < end;
2277 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2278 			/*
2279 			 * Ignore submaps
2280 			 */
2281 			if (current->maptype == VM_MAPTYPE_SUBMAP)
2282 				continue;
2283 
2284 			vm_map_clip_end(map, current, end, &count);
2285 
2286 			switch (behav) {
2287 			case MADV_NORMAL:
2288 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2289 				break;
2290 			case MADV_SEQUENTIAL:
2291 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2292 				break;
2293 			case MADV_RANDOM:
2294 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2295 				break;
2296 			case MADV_NOSYNC:
2297 				current->eflags |= MAP_ENTRY_NOSYNC;
2298 				break;
2299 			case MADV_AUTOSYNC:
2300 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2301 				break;
2302 			case MADV_NOCORE:
2303 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2304 				break;
2305 			case MADV_CORE:
2306 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2307 				break;
2308 			case MADV_SETMAP:
2309 				/*
2310 				 * Set the page directory page for a map
2311 				 * governed by a virtual page table.  Mark
2312 				 * the entry as being governed by a virtual
2313 				 * page table if it is not.
2314 				 *
2315 				 * XXX the page directory page is stored
2316 				 * in the avail_ssize field of the map_entry.
2317 				 *
2318 				 * XXX the map simplification code does not
2319 				 * compare this field so weird things may
2320 				 * happen if you do not apply this function
2321 				 * to the entire mapping governed by the
2322 				 * virtual page table.
2323 				 */
2324 				if (current->maptype != VM_MAPTYPE_VPAGETABLE) {
2325 					error = EINVAL;
2326 					break;
2327 				}
2328 				current->aux.master_pde = value;
2329 				pmap_remove(map->pmap,
2330 					    current->ba.start, current->ba.end);
2331 				break;
2332 			case MADV_INVAL:
2333 				/*
2334 				 * Invalidate the related pmap entries, used
2335 				 * to flush portions of the real kernel's
2336 				 * pmap when the caller has removed or
2337 				 * modified existing mappings in a virtual
2338 				 * page table.
2339 				 *
2340 				 * (exclusive locked map version does not
2341 				 * need the range interlock).
2342 				 */
2343 				pmap_remove(map->pmap,
2344 					    current->ba.start, current->ba.end);
2345 				break;
2346 			default:
2347 				error = EINVAL;
2348 				break;
2349 			}
2350 			vm_map_simplify_entry(map, current, &count);
2351 		}
2352 		vm_map_unlock(map);
2353 	} else {
2354 		vm_pindex_t pindex;
2355 		vm_pindex_t delta;
2356 
2357 		/*
2358 		 * madvise behaviors that are implemented in the underlying
2359 		 * vm_object.
2360 		 *
2361 		 * Since we don't clip the vm_map_entry, we have to clip
2362 		 * the vm_object pindex and count.
2363 		 *
2364 		 * NOTE!  These functions are only supported on normal maps,
2365 		 *	  except MADV_INVAL which is also supported on
2366 		 *	  virtual page tables.
2367 		 *
2368 		 * NOTE!  These functions only apply to the top-most object.
2369 		 *	  They are not applicable to backing objects.
2370 		 */
2371 		for (current = entry;
2372 		     current && current->ba.start < end;
2373 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2374 			vm_offset_t useStart;
2375 
2376 			if (current->maptype != VM_MAPTYPE_NORMAL &&
2377 			    (current->maptype != VM_MAPTYPE_VPAGETABLE ||
2378 			     behav != MADV_INVAL)) {
2379 				continue;
2380 			}
2381 
2382 			pindex = OFF_TO_IDX(current->ba.offset);
2383 			delta = atop(current->ba.end - current->ba.start);
2384 			useStart = current->ba.start;
2385 
2386 			if (current->ba.start < start) {
2387 				pindex += atop(start - current->ba.start);
2388 				delta -= atop(start - current->ba.start);
2389 				useStart = start;
2390 			}
2391 			if (current->ba.end > end)
2392 				delta -= atop(current->ba.end - end);
2393 
2394 			if ((vm_spindex_t)delta <= 0)
2395 				continue;
2396 
2397 			if (behav == MADV_INVAL) {
2398 				/*
2399 				 * Invalidate the related pmap entries, used
2400 				 * to flush portions of the real kernel's
2401 				 * pmap when the caller has removed or
2402 				 * modified existing mappings in a virtual
2403 				 * page table.
2404 				 *
2405 				 * (shared locked map version needs the
2406 				 * interlock, see vm_fault()).
2407 				 */
2408 				struct vm_map_ilock ilock;
2409 
2410 				KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2411 					    useStart + ptoa(delta) <=
2412 					    VM_MAX_USER_ADDRESS,
2413 					 ("Bad range %016jx-%016jx (%016jx)",
2414 					 useStart, useStart + ptoa(delta),
2415 					 delta));
2416 				vm_map_interlock(map, &ilock,
2417 						 useStart,
2418 						 useStart + ptoa(delta));
2419 				pmap_remove(map->pmap,
2420 					    useStart,
2421 					    useStart + ptoa(delta));
2422 				vm_map_deinterlock(map, &ilock);
2423 			} else {
2424 				vm_object_madvise(current->ba.object,
2425 						  pindex, delta, behav);
2426 			}
2427 
2428 			/*
2429 			 * Try to populate the page table.  Mappings governed
2430 			 * by virtual page tables cannot be pre-populated
2431 			 * without a lot of work so don't try.
2432 			 */
2433 			if (behav == MADV_WILLNEED &&
2434 			    current->maptype != VM_MAPTYPE_VPAGETABLE) {
2435 				pmap_object_init_pt(
2436 				    map->pmap, current,
2437 				    useStart,
2438 				    (delta << PAGE_SHIFT),
2439 				    MAP_PREFAULT_MADVISE
2440 				);
2441 			}
2442 		}
2443 		vm_map_unlock_read(map);
2444 	}
2445 	vm_map_entry_release(count);
2446 	return(error);
2447 }
2448 
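/*
 * Editorial usage sketch (hypothetical range): entry-level advisories
 * take the exclusive-lock path above, object-level advisories the
 * shared path.
 *
 *	vm_map_madvise(map, start, end, MADV_NOSYNC, 0);	(entry flag)
 *	vm_map_madvise(map, start, end, MADV_WILLNEED, 0);	(object prefault)
 *	vm_map_madvise(map, start, end, MADV_SETMAP, pde);	(VPAGETABLE only)
 */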
2449 
2450 /*
2451  * Sets the inheritance of the specified address range in the target map.
2452  * Inheritance affects how the map will be shared with child maps at the
2453  * time of vm_map_fork.
2454  */
2455 int
2456 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2457 	       vm_inherit_t new_inheritance)
2458 {
2459 	vm_map_entry_t entry;
2460 	vm_map_entry_t temp_entry;
2461 	int count;
2462 
2463 	switch (new_inheritance) {
2464 	case VM_INHERIT_NONE:
2465 	case VM_INHERIT_COPY:
2466 	case VM_INHERIT_SHARE:
2467 		break;
2468 	default:
2469 		return (KERN_INVALID_ARGUMENT);
2470 	}
2471 
2472 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2473 	vm_map_lock(map);
2474 
2475 	VM_MAP_RANGE_CHECK(map, start, end);
2476 
2477 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2478 		entry = temp_entry;
2479 		vm_map_clip_start(map, entry, start, &count);
2480 	} else if (temp_entry) {
2481 		entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2482 	} else {
2483 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2484 	}
2485 
2486 	while (entry && entry->ba.start < end) {
2487 		vm_map_clip_end(map, entry, end, &count);
2488 
2489 		entry->inheritance = new_inheritance;
2490 
2491 		vm_map_simplify_entry(map, entry, &count);
2492 
2493 		entry = vm_map_rb_tree_RB_NEXT(entry);
2494 	}
2495 	vm_map_unlock(map);
2496 	vm_map_entry_release(count);
2497 	return (KERN_SUCCESS);
2498 }
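
/*
 * Editorial sketch (hypothetical range): marking a region so that it is
 * not propagated to the child address space at vmspace_fork() time.
 *
 *	vm_map_inherit(map, start, end, VM_INHERIT_NONE);
 */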
2499 
2500 /*
2501  * Implement the semantics of mlock
2502  */
2503 int
2504 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2505 	      boolean_t new_pageable)
2506 {
2507 	vm_map_entry_t entry;
2508 	vm_map_entry_t start_entry;
2509 	vm_offset_t end;
2510 	int rv = KERN_SUCCESS;
2511 	int count;
2512 
2513 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2514 	vm_map_lock(map);
2515 	VM_MAP_RANGE_CHECK(map, start, real_end);
2516 	end = real_end;
2517 
2518 	start_entry = vm_map_clip_range(map, start, end, &count,
2519 					MAP_CLIP_NO_HOLES);
2520 	if (start_entry == NULL) {
2521 		vm_map_unlock(map);
2522 		vm_map_entry_release(count);
2523 		return (KERN_INVALID_ADDRESS);
2524 	}
2525 
2526 	if (new_pageable == 0) {
2527 		entry = start_entry;
2528 		while (entry && entry->ba.start < end) {
2529 			vm_offset_t save_start;
2530 			vm_offset_t save_end;
2531 
2532 			/*
2533 			 * Already user wired or hard wired (trivial cases)
2534 			 */
2535 			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2536 				entry = vm_map_rb_tree_RB_NEXT(entry);
2537 				continue;
2538 			}
2539 			if (entry->wired_count != 0) {
2540 				entry->wired_count++;
2541 				entry->eflags |= MAP_ENTRY_USER_WIRED;
2542 				entry = vm_map_rb_tree_RB_NEXT(entry);
2543 				continue;
2544 			}
2545 
2546 			/*
2547 			 * A new wiring requires instantiation of appropriate
2548 			 * management structures and the faulting in of the
2549 			 * page.
2550 			 */
2551 			if (entry->maptype == VM_MAPTYPE_NORMAL ||
2552 			    entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2553 				int copyflag = entry->eflags &
2554 					       MAP_ENTRY_NEEDS_COPY;
2555 				if (copyflag && ((entry->protection &
2556 						  VM_PROT_WRITE) != 0)) {
2557 					vm_map_entry_shadow(entry);
2558 				} else if (entry->ba.object == NULL &&
2559 					   !map->system_map) {
2560 					vm_map_entry_allocate_object(entry);
2561 				}
2562 			}
2563 			entry->wired_count++;
2564 			entry->eflags |= MAP_ENTRY_USER_WIRED;
2565 
2566 			/*
2567 			 * Now fault in the area.  Note that vm_fault_wire()
2568 			 * may release the map lock temporarily, it will be
2569 			 * relocked on return.  The in-transition
2570 			 * flag protects the entries.
2571 			 */
2572 			save_start = entry->ba.start;
2573 			save_end = entry->ba.end;
2574 			rv = vm_fault_wire(map, entry, TRUE, 0);
2575 			if (rv) {
2576 				CLIP_CHECK_BACK(entry, save_start);
2577 				for (;;) {
2578 					KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2579 					entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2580 					entry->wired_count = 0;
2581 					if (entry->ba.end == save_end)
2582 						break;
2583 					entry = vm_map_rb_tree_RB_NEXT(entry);
2584 					KASSERT(entry,
2585 					     ("bad entry clip during backout"));
2586 				}
2587 				end = save_start;	/* unwire the rest */
2588 				break;
2589 			}
2590 			/*
2591 			 * note that even though the entry might have been
2592 			 * clipped, the USER_WIRED flag we set prevents
2593 			 * duplication so we do not have to do a
2594 			 * clip check.
2595 			 */
2596 			entry = vm_map_rb_tree_RB_NEXT(entry);
2597 		}
2598 
2599 		/*
2600 		 * If we failed fall through to the unwiring section to
2601 		 * unwire what we had wired so far.  'end' has already
2602 		 * been adjusted.
2603 		 */
2604 		if (rv)
2605 			new_pageable = 1;
2606 
2607 		/*
2608 		 * start_entry might have been clipped if we unlocked the
2609 		 * map and blocked.  No matter how clipped it has gotten
2610 		 * there should be a fragment that is on our start boundary.
2611 		 */
2612 		CLIP_CHECK_BACK(start_entry, start);
2613 	}
2614 
2615 	/*
2616 	 * Deal with the unwiring case.
2617 	 */
2618 	if (new_pageable) {
2619 		/*
2620 		 * This is the unwiring case.  We must first ensure that the
2621 		 * range to be unwired is really wired down.  We know there
2622 		 * are no holes.
2623 		 */
2624 		entry = start_entry;
2625 		while (entry && entry->ba.start < end) {
2626 			if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2627 				rv = KERN_INVALID_ARGUMENT;
2628 				goto done;
2629 			}
2630 			KASSERT(entry->wired_count != 0,
2631 				("wired count was 0 with USER_WIRED set! %p",
2632 				 entry));
2633 			entry = vm_map_rb_tree_RB_NEXT(entry);
2634 		}
2635 
2636 		/*
2637 		 * Now decrement the wiring count for each region. If a region
2638 		 * becomes completely unwired, unwire its physical pages and
2639 		 * mappings.
2640 		 */
2641 		/*
2642 		 * The map entries are processed in a second loop which
2643 		 * clears the USER_WIRED flag and decrements the wired
2644 		 * count of each entry.  Note that this loop must restart
2645 		 * from start_entry rather than continue with the "entry"
2646 		 * variable left over from the first loop above; otherwise
2647 		 * the loop would never be entered and the pages backing
2648 		 * the entries would never be unwired, which would leak
2649 		 * wired pages.
2650 		 */
2651 		entry = start_entry;
2652 		while (entry && entry->ba.start < end) {
2653 			KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2654 				("expected USER_WIRED on entry %p", entry));
2655 			entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2656 			entry->wired_count--;
2657 			if (entry->wired_count == 0)
2658 				vm_fault_unwire(map, entry);
2659 			entry = vm_map_rb_tree_RB_NEXT(entry);
2660 		}
2661 	}
2662 done:
2663 	vm_map_unclip_range(map, start_entry, start, real_end, &count,
2664 		MAP_CLIP_NO_HOLES);
2665 	vm_map_unlock(map);
2666 	vm_map_entry_release(count);
2667 
2668 	return (rv);
2669 }
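
/*
 * Editorial note with a usage sketch: despite its name this routine
 * implements both directions of the mlock()/munlock() semantics for
 * user wirings (range hypothetical):
 *
 *	vm_map_unwire(map, addr, addr + len, FALSE);	(wire, mlock)
 *	vm_map_unwire(map, addr, addr + len, TRUE);	(unwire, munlock)
 */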
2670 
2671 /*
2672  * Sets the pageability of the specified address range in the target map.
2673  * Regions specified as not pageable require locked-down physical
2674  * memory and physical page maps.
2675  *
2676  * The map must not be locked, but a reference must remain to the map
2677  * throughout the call.
2678  *
2679  * This function may be called via the zalloc path and must properly
2680  * reserve map entries for kernel_map.
2681  *
2682  * No requirements.
2683  */
2684 int
2685 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2686 {
2687 	vm_map_entry_t entry;
2688 	vm_map_entry_t start_entry;
2689 	vm_offset_t end;
2690 	int rv = KERN_SUCCESS;
2691 	int count;
2692 
2693 	if (kmflags & KM_KRESERVE)
2694 		count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2695 	else
2696 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2697 	vm_map_lock(map);
2698 	VM_MAP_RANGE_CHECK(map, start, real_end);
2699 	end = real_end;
2700 
2701 	start_entry = vm_map_clip_range(map, start, end, &count,
2702 					MAP_CLIP_NO_HOLES);
2703 	if (start_entry == NULL) {
2704 		vm_map_unlock(map);
2705 		rv = KERN_INVALID_ADDRESS;
2706 		goto failure;
2707 	}
2708 	if ((kmflags & KM_PAGEABLE) == 0) {
2709 		/*
2710 		 * Wiring.
2711 		 *
2712 		 * 1.  Holding the write lock, we create any shadow or zero-fill
2713 		 * objects that need to be created. Then we clip each map
2714 		 * entry to the region to be wired and increment its wiring
2715 		 * count.  We create objects before clipping the map entries
2716 		 * to avoid object proliferation.
2717 		 *
2718 		 * 2.  We downgrade to a read lock, and call vm_fault_wire to
2719 		 * fault in the pages for any newly wired area (wired_count is
2720 		 * 1).
2721 		 *
2722 		 * Downgrading to a read lock for vm_fault_wire avoids a
2723 		 * possible deadlock with another process that may have faulted
2724 		 * on one of the pages to be wired (it would mark the page busy,
2725 		 * blocking us, then in turn block on the map lock that we
2726 		 * hold).  Because of problems in the recursive lock package,
2727 		 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2728 		 * any actions that require the write lock must be done
2729 		 * beforehand.  Because we keep the read lock on the map, the
2730 		 * copy-on-write status of the entries we modify here cannot
2731 		 * change.
2732 		 */
2733 		entry = start_entry;
2734 		while (entry && entry->ba.start < end) {
2735 			/*
2736 			 * Trivial case if the entry is already wired
2737 			 */
2738 			if (entry->wired_count) {
2739 				entry->wired_count++;
2740 				entry = vm_map_rb_tree_RB_NEXT(entry);
2741 				continue;
2742 			}
2743 
2744 			/*
2745 			 * The entry is being newly wired, we have to setup
2746 			 * appropriate management structures.  A shadow
2747 			 * object is required for a copy-on-write region,
2748 			 * or a normal object for a zero-fill region.  We
2749 			 * do not have to do this for entries that point to sub
2750 			 * maps because we won't hold the lock on the sub map.
2751 			 */
2752 			if (entry->maptype == VM_MAPTYPE_NORMAL ||
2753 			    entry->maptype == VM_MAPTYPE_VPAGETABLE) {
2754 				int copyflag = entry->eflags &
2755 					       MAP_ENTRY_NEEDS_COPY;
2756 				if (copyflag && ((entry->protection &
2757 						  VM_PROT_WRITE) != 0)) {
2758 					vm_map_entry_shadow(entry);
2759 				} else if (entry->ba.object == NULL &&
2760 					   !map->system_map) {
2761 					vm_map_entry_allocate_object(entry);
2762 				}
2763 			}
2764 			entry->wired_count++;
2765 			entry = vm_map_rb_tree_RB_NEXT(entry);
2766 		}
2767 
2768 		/*
2769 		 * Pass 2.
2770 		 */
2771 
2772 		/*
2773 		 * HACK HACK HACK HACK
2774 		 *
2775 		 * vm_fault_wire() temporarily unlocks the map to avoid
2776 		 * deadlocks.  The in-transition flag from vm_map_clip_range
2777 		 * call should protect us from changes while the map is
2778 		 * unlocked.
2779 		 *
2780 		 * NOTE: Previously this comment stated that clipping might
2781 		 *	 still occur while the entry is unlocked, but from
2782 		 *	 what I can tell it actually cannot.
2783 		 *
2784 		 *	 It is unclear whether the CLIP_CHECK_*() calls
2785 		 *	 are still needed but we keep them in anyway.
2786 		 *
2787 		 * HACK HACK HACK HACK
2788 		 */
2789 
2790 		entry = start_entry;
2791 		while (entry && entry->ba.start < end) {
2792 			/*
2793 			 * If vm_fault_wire fails for any page we need to undo
2794 			 * what has been done.  We decrement the wiring count
2795 			 * for those pages which have not yet been wired (now)
2796 			 * and unwire those that have (later).
2797 			 */
2798 			vm_offset_t save_start = entry->ba.start;
2799 			vm_offset_t save_end = entry->ba.end;
2800 
2801 			if (entry->wired_count == 1)
2802 				rv = vm_fault_wire(map, entry, FALSE, kmflags);
2803 			if (rv) {
2804 				CLIP_CHECK_BACK(entry, save_start);
2805 				for (;;) {
2806 					KASSERT(entry->wired_count == 1,
2807 					  ("wired_count changed unexpectedly"));
2808 					entry->wired_count = 0;
2809 					if (entry->ba.end == save_end)
2810 						break;
2811 					entry = vm_map_rb_tree_RB_NEXT(entry);
2812 					KASSERT(entry,
2813 					  ("bad entry clip during backout"));
2814 				}
2815 				end = save_start;
2816 				break;
2817 			}
2818 			CLIP_CHECK_FWD(entry, save_end);
2819 			entry = vm_map_rb_tree_RB_NEXT(entry);
2820 		}
2821 
2822 		/*
2823 		 * If a failure occurred, undo everything by falling through
2824 		 * to the unwiring code.  'end' has already been adjusted
2825 		 * appropriately.
2826 		 */
2827 		if (rv)
2828 			kmflags |= KM_PAGEABLE;
2829 
2830 		/*
2831 		 * start_entry is still IN_TRANSITION but may have been
2832 		 * clipped since vm_fault_wire() unlocks and relocks the
2833 		 * map.  No matter how clipped it has gotten there should
2834 		 * be a fragment that is on our start boundary.
2835 		 */
2836 		CLIP_CHECK_BACK(start_entry, start);
2837 	}
2838 
2839 	if (kmflags & KM_PAGEABLE) {
2840 		/*
2841 		 * This is the unwiring case.  We must first ensure that the
2842 		 * range to be unwired is really wired down.  We know there
2843 		 * are no holes.
2844 		 */
2845 		entry = start_entry;
2846 		while (entry && entry->ba.start < end) {
2847 			if (entry->wired_count == 0) {
2848 				rv = KERN_INVALID_ARGUMENT;
2849 				goto done;
2850 			}
2851 			entry = vm_map_rb_tree_RB_NEXT(entry);
2852 		}
2853 
2854 		/*
2855 		 * Now decrement the wiring count for each region. If a region
2856 		 * becomes completely unwired, unwire its physical pages and
2857 		 * mappings.
2858 		 */
2859 		entry = start_entry;
2860 		while (entry && entry->ba.start < end) {
2861 			entry->wired_count--;
2862 			if (entry->wired_count == 0)
2863 				vm_fault_unwire(map, entry);
2864 			entry = vm_map_rb_tree_RB_NEXT(entry);
2865 		}
2866 	}
2867 done:
2868 	vm_map_unclip_range(map, start_entry, start, real_end,
2869 			    &count, MAP_CLIP_NO_HOLES);
2870 	vm_map_unlock(map);
2871 failure:
2872 	if (kmflags & KM_KRESERVE)
2873 		vm_map_entry_krelease(count);
2874 	else
2875 		vm_map_entry_release(count);
2876 	return (rv);
2877 }
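
/*
 * Editorial usage sketch for this kernel-facing variant (names
 * hypothetical): KM_PAGEABLE selects the unwiring path and KM_KRESERVE
 * selects the kernel map-entry reserve, mirroring the logic above.
 *
 *	rv = vm_map_wire(map, addr, addr + size, KM_KRESERVE);
 *	...
 *	vm_map_wire(map, addr, addr + size, KM_KRESERVE | KM_PAGEABLE);
 */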
2878 
2879 /*
2880  * Mark a newly allocated address range as wired but do not fault in
2881  * the pages.  The caller is expected to load the pages into the object.
2882  *
2883  * The map must be locked on entry and will remain locked on return.
2884  * No other requirements.
2885  */
2886 void
2887 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2888 		       int *countp)
2889 {
2890 	vm_map_entry_t scan;
2891 	vm_map_entry_t entry;
2892 
2893 	entry = vm_map_clip_range(map, addr, addr + size,
2894 				  countp, MAP_CLIP_NO_HOLES);
2895 	scan = entry;
2896 	while (scan && scan->ba.start < addr + size) {
2897 		KKASSERT(scan->wired_count == 0);
2898 		scan->wired_count = 1;
2899 		scan = vm_map_rb_tree_RB_NEXT(scan);
2900 	}
2901 	vm_map_unclip_range(map, entry, addr, addr + size,
2902 			    countp, MAP_CLIP_NO_HOLES);
2903 }
2904 
2905 /*
2906  * Push any dirty cached pages in the address range to their pager.
2907  * If syncio is TRUE, dirty pages are written synchronously.
2908  * If invalidate is TRUE, any cached pages are freed as well.
2909  *
2910  * This routine is called by sys_msync()
2911  *
2912  * Returns an error if any part of the specified range is not mapped.
2913  *
2914  * No requirements.
2915  */
2916 int
2917 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2918 	     boolean_t syncio, boolean_t invalidate)
2919 {
2920 	vm_map_entry_t current;
2921 	vm_map_entry_t next;
2922 	vm_map_entry_t entry;
2923 	vm_map_backing_t ba;
2924 	vm_size_t size;
2925 	vm_object_t object;
2926 	vm_ooffset_t offset;
2927 
2928 	vm_map_lock_read(map);
2929 	VM_MAP_RANGE_CHECK(map, start, end);
2930 	if (!vm_map_lookup_entry(map, start, &entry)) {
2931 		vm_map_unlock_read(map);
2932 		return (KERN_INVALID_ADDRESS);
2933 	}
2934 	lwkt_gettoken(&map->token);
2935 
2936 	/*
2937 	 * Make a first pass to check for holes.
2938 	 */
2939 	current = entry;
2940 	while (current && current->ba.start < end) {
2941 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2942 			lwkt_reltoken(&map->token);
2943 			vm_map_unlock_read(map);
2944 			return (KERN_INVALID_ARGUMENT);
2945 		}
2946 		next = vm_map_rb_tree_RB_NEXT(current);
2947 		if (end > current->ba.end &&
2948 		    (next == NULL ||
2949 		     current->ba.end != next->ba.start)) {
2950 			lwkt_reltoken(&map->token);
2951 			vm_map_unlock_read(map);
2952 			return (KERN_INVALID_ADDRESS);
2953 		}
2954 		current = next;
2955 	}
2956 
2957 	if (invalidate)
2958 		pmap_remove(vm_map_pmap(map), start, end);
2959 
2960 	/*
2961 	 * Make a second pass, cleaning/uncaching pages from the indicated
2962 	 * objects as we go.
2963 	 */
2964 	current = entry;
2965 	while (current && current->ba.start < end) {
2966 		offset = current->ba.offset + (start - current->ba.start);
2967 		size = (end <= current->ba.end ? end : current->ba.end) - start;
2968 
2969 		switch(current->maptype) {
2970 		case VM_MAPTYPE_SUBMAP:
2971 		{
2972 			vm_map_t smap;
2973 			vm_map_entry_t tentry;
2974 			vm_size_t tsize;
2975 
2976 			smap = current->ba.sub_map;
2977 			vm_map_lock_read(smap);
2978 			vm_map_lookup_entry(smap, offset, &tentry);
2979 			if (tentry == NULL) {
2980 				tsize = vm_map_max(smap) - offset;
2981 				ba = NULL;
2982 				offset = 0 + (offset - vm_map_min(smap));
2983 			} else {
2984 				tsize = tentry->ba.end - offset;
2985 				ba = &tentry->ba;
2986 				offset = tentry->ba.offset +
2987 					 (offset - tentry->ba.start);
2988 			}
2989 			vm_map_unlock_read(smap);
2990 			if (tsize < size)
2991 				size = tsize;
2992 			break;
2993 		}
2994 		case VM_MAPTYPE_NORMAL:
2995 		case VM_MAPTYPE_VPAGETABLE:
2996 			ba = &current->ba;
2997 			break;
2998 		default:
2999 			ba = NULL;
3000 			break;
3001 		}
3002 		if (ba) {
3003 			object = ba->object;
3004 			if (object)
3005 				vm_object_hold(object);
3006 		} else {
3007 			object = NULL;
3008 		}
3009 
3010 		/*
3011 		 * Note that there is absolutely no sense in writing out
3012 		 * anonymous objects, so we track down the vnode object
3013 		 * to write out.
3014 		 * We invalidate (remove) all pages from the address space
3015 		 * anyway, for semantic correctness.
3016 		 *
3017 		 * note: certain anonymous maps, such as MAP_NOSYNC maps,
3018 		 * may start out with a NULL object.
3019 		 *
3020 		 * XXX do we really want to stop at the first backing store
3021 		 * here if there are more? XXX
3022 		 */
3023 		if (ba) {
3024 			vm_object_t tobj;
3025 
3026 			tobj = object;
3027 			while (ba->backing_ba != NULL) {
3028 				offset -= ba->offset;
3029 				ba = ba->backing_ba;
3030 				offset += ba->offset;
3031 				tobj = ba->object;
3032 				if (tobj->size < OFF_TO_IDX(offset + size))
3033 					size = IDX_TO_OFF(tobj->size) - offset;
3034 				break; /* XXX this break is not correct */
3035 			}
3036 			if (object != tobj) {
3037 				if (object)
3038 					vm_object_drop(object);
3039 				object = tobj;
3040 				vm_object_hold(object);
3041 			}
3042 		}
3043 
3044 		if (object && (object->type == OBJT_VNODE) &&
3045 		    (current->protection & VM_PROT_WRITE) &&
3046 		    (object->flags & OBJ_NOMSYNC) == 0) {
3047 			/*
3048 			 * Flush pages if writing is allowed, invalidate them
3049 			 * if invalidation requested.  Pages undergoing I/O
3050 			 * will be ignored by vm_object_page_remove().
3051 			 *
3052 			 * We cannot lock the vnode and then wait for paging
3053 			 * to complete without deadlocking against vm_fault.
3054 			 * Instead we simply call vm_object_page_remove() and
3055 			 * allow it to block internally on a page-by-page
3056 			 * basis when it encounters pages undergoing async
3057 			 * I/O.
3058 			 */
3059 			int flags;
3060 
3061 			/* no chain wait needed for vnode objects */
3062 			vm_object_reference_locked(object);
3063 			vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3064 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3065 			flags |= invalidate ? OBJPC_INVAL : 0;
3066 
3067 			/*
3068 			 * When operating on a virtual page table just
3069 			 * flush the whole object.  XXX we probably ought
3070 			 * to
3071 			 */
3072 			switch(current->maptype) {
3073 			case VM_MAPTYPE_NORMAL:
3074 				vm_object_page_clean(object,
3075 				    OFF_TO_IDX(offset),
3076 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3077 				    flags);
3078 				break;
3079 			case VM_MAPTYPE_VPAGETABLE:
3080 				vm_object_page_clean(object, 0, 0, flags);
3081 				break;
3082 			}
3083 			vn_unlock(((struct vnode *)object->handle));
3084 			vm_object_deallocate_locked(object);
3085 		}
3086 		if (object && invalidate &&
3087 		   ((object->type == OBJT_VNODE) ||
3088 		    (object->type == OBJT_DEVICE) ||
3089 		    (object->type == OBJT_MGTDEVICE))) {
3090 			int clean_only =
3091 				((object->type == OBJT_DEVICE) ||
3092 				(object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3093 			/* no chain wait needed for vnode/device objects */
3094 			vm_object_reference_locked(object);
3095 			switch(current->maptype) {
3096 			case VM_MAPTYPE_NORMAL:
3097 				vm_object_page_remove(object,
3098 				    OFF_TO_IDX(offset),
3099 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3100 				    clean_only);
3101 				break;
3102 			case VM_MAPTYPE_VPAGETABLE:
3103 				vm_object_page_remove(object, 0, 0, clean_only);
3104 				break;
3105 			}
3106 			vm_object_deallocate_locked(object);
3107 		}
3108 		start += size;
3109 		if (object)
3110 			vm_object_drop(object);
3111 		current = vm_map_rb_tree_RB_NEXT(current);
3112 	}
3113 
3114 	lwkt_reltoken(&map->token);
3115 	vm_map_unlock_read(map);
3116 
3117 	return (KERN_SUCCESS);
3118 }
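
/*
 * Editorial sketch of the msync()-style invocations this routine
 * services (arguments hypothetical, flag mapping approximate):
 *
 *	vm_map_clean(map, addr, addr + len, TRUE, FALSE);	(sync flush)
 *	vm_map_clean(map, addr, addr + len, FALSE, TRUE);	(invalidate)
 */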
3119 
3120 /*
3121  * Make the region specified by this entry pageable.
3122  *
3123  * The vm_map must be exclusively locked.
3124  */
3125 static void
3126 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3127 {
3128 	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3129 	entry->wired_count = 0;
3130 	vm_fault_unwire(map, entry);
3131 }
3132 
3133 /*
3134  * Deallocate the given entry from the target map.
3135  *
3136  * The vm_map must be exclusively locked.
3137  */
3138 static void
3139 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3140 {
3141 	vm_map_entry_unlink(map, entry);
3142 	map->size -= entry->ba.end - entry->ba.start;
3143 	vm_map_entry_dispose(map, entry, countp);
3144 }
3145 
3146 /*
3147  * Deallocates the given address range from the target map.
3148  *
3149  * The vm_map must be exclusively locked.
3150  */
3151 int
3152 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3153 {
3154 	vm_object_t object;
3155 	vm_map_entry_t entry;
3156 	vm_map_entry_t first_entry;
3157 	vm_offset_t hole_start;
3158 
3159 	ASSERT_VM_MAP_LOCKED(map);
3160 	lwkt_gettoken(&map->token);
3161 again:
3162 	/*
3163 	 * Find the start of the region, and clip it.  Set entry to point
3164 	 * at the first record containing the requested address or, if no
3165 	 * such record exists, the next record with a greater address.  The
3166 	 * loop will run from this point until a record beyond the termination
3167 	 * address is encountered.
3168 	 *
3169 	 * Adjust freehint[] for either the clip case or the extension case.
3170 	 *
3171 	 * GGG see other GGG comment.
3172 	 */
3173 	if (vm_map_lookup_entry(map, start, &first_entry)) {
3174 		entry = first_entry;
3175 		vm_map_clip_start(map, entry, start, countp);
3176 		hole_start = start;
3177 	} else {
3178 		if (first_entry) {
3179 			entry = vm_map_rb_tree_RB_NEXT(first_entry);
3180 			if (entry == NULL)
3181 				hole_start = first_entry->ba.start;
3182 			else
3183 				hole_start = first_entry->ba.end;
3184 		} else {
3185 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3186 			if (entry == NULL)
3187 				hole_start = vm_map_min(map);
3188 			else
3189 				hole_start = vm_map_max(map);
3190 		}
3191 	}
3192 
3193 	/*
3194 	 * Step through all entries in this region
3195 	 */
3196 	while (entry && entry->ba.start < end) {
3197 		vm_map_entry_t next;
3198 		vm_offset_t s, e;
3199 		vm_pindex_t offidxstart, offidxend, count;
3200 
3201 		/*
3202 		 * If we hit an in-transition entry we have to sleep and
3203 		 * retry.  It's easier (and not really slower) to just retry
3204 		 * since this case occurs so rarely and the hint is already
3205 		 * pointing at the right place.  We have to reset the
3206 		 * start offset so as not to accidentally delete an entry
3207 		 * another process just created in vacated space.
3208 		 */
3209 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3210 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3211 			start = entry->ba.start;
3212 			++mycpu->gd_cnt.v_intrans_coll;
3213 			++mycpu->gd_cnt.v_intrans_wait;
3214 			vm_map_transition_wait(map, 1);
3215 			goto again;
3216 		}
3217 		vm_map_clip_end(map, entry, end, countp);
3218 
3219 		s = entry->ba.start;
3220 		e = entry->ba.end;
3221 		next = vm_map_rb_tree_RB_NEXT(entry);
3222 
3223 		offidxstart = OFF_TO_IDX(entry->ba.offset);
3224 		count = OFF_TO_IDX(e - s);
3225 
3226 		switch(entry->maptype) {
3227 		case VM_MAPTYPE_NORMAL:
3228 		case VM_MAPTYPE_VPAGETABLE:
3229 		case VM_MAPTYPE_SUBMAP:
3230 			object = entry->ba.object;
3231 			break;
3232 		default:
3233 			object = NULL;
3234 			break;
3235 		}
3236 
3237 		/*
3238 		 * Unwire before removing addresses from the pmap; otherwise,
3239 		 * unwiring will put the entries back in the pmap.
3240 		 *
3241 		 * Generally speaking, doing a bulk pmap_remove() before
3242 		 * removing the pages from the VM object is better at
3243 		 * reducing unnecessary IPIs.  The pmap code is now optimized
3244 		 * to not blindly iterate the range when pt and pd pages
3245 		 * are missing.
3246 		 */
3247 		if (entry->wired_count != 0)
3248 			vm_map_entry_unwire(map, entry);
3249 
3250 		offidxend = offidxstart + count;
3251 
3252 		if (object == &kernel_object) {
3253 			pmap_remove(map->pmap, s, e);
3254 			vm_object_hold(object);
3255 			vm_object_page_remove(object, offidxstart,
3256 					      offidxend, FALSE);
3257 			vm_object_drop(object);
3258 		} else if (object && object->type != OBJT_DEFAULT &&
3259 			   object->type != OBJT_SWAP) {
3260 			/*
3261 			 * vnode object routines cannot be chain-locked,
3262 			 * but since we aren't removing pages from the
3263 			 * object here we can use a shared hold.
3264 			 */
3265 			vm_object_hold_shared(object);
3266 			pmap_remove(map->pmap, s, e);
3267 			vm_object_drop(object);
3268 		} else if (object) {
3269 			vm_object_hold(object);
3270 			pmap_remove(map->pmap, s, e);
3271 
3272 			if (object != NULL &&
3273 			    object->ref_count != 1 &&
3274 			    (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3275 			     OBJ_ONEMAPPING &&
3276 			    (object->type == OBJT_DEFAULT ||
3277 			     object->type == OBJT_SWAP)) {
3278 				/*
3279 				 * When ONEMAPPING is set we can destroy the
3280 				 * pages underlying the entry's range.
3281 				 */
3282 				vm_object_page_remove(object, offidxstart,
3283 						      offidxend, FALSE);
3284 				if (object->type == OBJT_SWAP) {
3285 					swap_pager_freespace(object,
3286 							     offidxstart,
3287 							     count);
3288 				}
3289 				if (offidxend >= object->size &&
3290 				    offidxstart < object->size) {
3291 					object->size = offidxstart;
3292 				}
3293 			}
3294 			vm_object_drop(object);
3295 		} else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3296 			pmap_remove(map->pmap, s, e);
3297 		}
3298 
3299 		/*
3300 		 * Delete the entry (which may delete the object) only after
3301 		 * removing all pmap entries pointing to its pages.
3302 		 * (Otherwise, its page frames may be reallocated, and any
3303 		 * modify bits will be set in the wrong object!)
3304 		 */
3305 		vm_map_entry_delete(map, entry, countp);
3306 		entry = next;
3307 	}
3308 
3309 	/*
3310 	 * We either reached the end and use vm_map_max as the end
3311 	 * address, or we didn't and we use the next entry as the
3312 	 * end address.
3313 	 */
3314 	if (entry == NULL) {
3315 		vm_map_freehint_hole(map, hole_start,
3316 				     vm_map_max(map) - hole_start);
3317 	} else {
3318 		vm_map_freehint_hole(map, hole_start,
3319 				     entry->ba.start - hole_start);
3320 	}
3321 
3322 	lwkt_reltoken(&map->token);
3323 
3324 	return (KERN_SUCCESS);
3325 }
3326 
3327 /*
3328  * Remove the given address range from the target map.
3329  * This is the exported form of vm_map_delete.
3330  *
3331  * No requirements.
3332  */
3333 int
3334 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3335 {
3336 	int result;
3337 	int count;
3338 
3339 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3340 	vm_map_lock(map);
3341 	VM_MAP_RANGE_CHECK(map, start, end);
3342 	result = vm_map_delete(map, start, end, &count);
3343 	vm_map_unlock(map);
3344 	vm_map_entry_release(count);
3345 
3346 	return (result);
3347 }
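
/*
 * Editorial sketch (hypothetical addresses): the munmap()-style path
 * removes a page-aligned range.
 *
 *	vm_map_remove(map, trunc_page(addr), round_page(addr + len));
 */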
3348 
3349 /*
3350  * Assert that the target map allows the specified privilege on the
3351  * entire address region given.  The entire region must be allocated.
3352  *
3353  * The caller must specify whether the vm_map is already locked or not.
3354  */
3355 boolean_t
3356 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3357 			vm_prot_t protection, boolean_t have_lock)
3358 {
3359 	vm_map_entry_t entry;
3360 	vm_map_entry_t tmp_entry;
3361 	boolean_t result;
3362 
3363 	if (have_lock == FALSE)
3364 		vm_map_lock_read(map);
3365 
3366 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3367 		if (have_lock == FALSE)
3368 			vm_map_unlock_read(map);
3369 		return (FALSE);
3370 	}
3371 	entry = tmp_entry;
3372 
3373 	result = TRUE;
3374 	while (start < end) {
3375 		if (entry == NULL) {
3376 			result = FALSE;
3377 			break;
3378 		}
3379 
3380 		/*
3381 		 * No holes allowed!
3382 		 */
3383 
3384 		if (start < entry->ba.start) {
3385 			result = FALSE;
3386 			break;
3387 		}
3388 		/*
3389 		 * Check protection associated with entry.
3390 		 */
3391 
3392 		if ((entry->protection & protection) != protection) {
3393 			result = FALSE;
3394 			break;
3395 		}
3396 		/* go to next entry */
3397 		start = entry->ba.end;
3398 		entry = vm_map_rb_tree_RB_NEXT(entry);
3399 	}
3400 	if (have_lock == FALSE)
3401 		vm_map_unlock_read(map);
3402 	return (result);
3403 }
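
/*
 * Editorial sketch (hypothetical range): verifying that an entire
 * region is at least readable before touching it, when the map lock is
 * not already held.
 *
 *	if (!vm_map_check_protection(map, start, end, VM_PROT_READ, FALSE))
 *		return (EFAULT);
 */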
3404 
3405 /*
3406  * vm_map_backing structures are not shared across forks and must be
3407  * replicated.
3408  *
3409  * Generally speaking we must reallocate the backing_ba sequence and
3410  * also adjust it for any changes made to the base entry->ba.start and
3411  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3412  * so we only need to adjust subsequent ba's start, end, and offset.
3413  *
3414  * MAP_BACK_CLIPPED	- Called as part of a clipping replication.
3415  *			  Do not clear OBJ_ONEMAPPING.
3416  *
3417  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3418  *			  has already been referenced.
3419  */
3420 static
3421 void
3422 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3423 {
3424 	vm_map_backing_t ba;
3425 	vm_map_backing_t nba;
3426 	vm_object_t object;
3427 
3428 	ba = &entry->ba;
3429 	for (;;) {
3430 		object = ba->object;
3431 		ba->pmap = map->pmap;
3432 		ba->refs = 1;
3433 		if (object &&
3434 		    (entry->maptype == VM_MAPTYPE_VPAGETABLE ||
3435 		     entry->maptype == VM_MAPTYPE_NORMAL)) {
3436 			if (ba != &entry->ba ||
3437 			    (flags & MAP_BACK_BASEOBJREFD) == 0) {
3438 				vm_object_reference_quick(object);
3439 			}
3440 			vm_map_backing_attach(ba);
3441 			if ((flags & MAP_BACK_CLIPPED) == 0 &&
3442 			    object->ref_count > 1) {
3443 				vm_object_clear_flag(object, OBJ_ONEMAPPING);
3444 			}
3445 		}
3446 		if (ba->backing_ba == NULL)
3447 			break;
3448 		nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3449 		*nba = *ba->backing_ba;
3450 		nba->offset += (ba->start - nba->start);  /* += (new - old) */
3451 		nba->start = ba->start;
3452 		nba->end = ba->end;
3453 		ba->backing_ba = nba;
3454 		ba = nba;
3455 		/* pmap is replaced at the top of the loop */
3456 	}
3457 	entry->ba.refs = 0;	/* base entry refs is 0 */
3458 }
3459 
3460 static
3461 void
3462 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3463 {
3464 	vm_map_backing_t ba;
3465 
3466 	if (entry->maptype == VM_MAPTYPE_VPAGETABLE ||
3467 	    entry->maptype == VM_MAPTYPE_NORMAL) {
3468 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3469 			if (ba->object) {
3470 				spin_lock(&ba->object->spin);
3471 				ba->offset += (start - ba->start);
3472 				ba->start = start;
3473 				spin_unlock(&ba->object->spin);
3474 			} else {
3475 				ba->offset += (start - ba->start);
3476 				ba->start = start;
3477 			}
3478 		}
3479 	} else {
3480 		/* not an object and can't be shadowed */
3481 	}
3482 }
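
/*
 * Editorial worked example (hypothetical values): the adjustment above
 * preserves the address-to-offset translation for the pages that
 * remain.  If a backing has start == 0x10000 and offset == 0x3000,
 * raising start to 0x12000 adds 0x2000 to the offset (now 0x5000), so
 * an address in the surviving range still resolves to the same object
 * offset as before.
 */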
3483 
3484 static
3485 void
3486 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3487 {
3488 	vm_map_backing_t ba;
3489 
3490 	if (entry->maptype == VM_MAPTYPE_VPAGETABLE ||
3491 	    entry->maptype == VM_MAPTYPE_NORMAL) {
3492 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3493 			if (ba->object) {
3494 				spin_lock(&ba->object->spin);
3495 				ba->end = end;
3496 				spin_unlock(&ba->object->spin);
3497 			} else {
3498 				ba->end = end;
3499 			}
3500 		}
3501 	} else {
3502 		/* not an object and can't be shadowed */
3503 	}
3504 }
3505 
3506 /*
3507  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3508  * after src_entry has been cloned to dst_entry.  For normal entries only.
3509  *
3510  * The vm_maps must be exclusively locked.
3511  * The vm_map's token must be held.
3512  *
3513  * Because the maps are locked no faults can be in progress during the
3514  * operation.
3515  */
3516 static void
3517 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3518 		  vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3519 {
3520 	vm_object_t obj;
3521 
3522 	KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL ||
3523 		 dst_entry->maptype == VM_MAPTYPE_VPAGETABLE);
3524 
3525 	if (src_entry->wired_count &&
3526 	    src_entry->maptype != VM_MAPTYPE_VPAGETABLE) {
3527 		/*
3528 		 * Of course, wired down pages can't be set copy-on-write.
3529 		 * Cause wired pages to be copied into the new map by
3530 		 * simulating faults (the new pages are pageable)
3531 		 *
3532 		 * Scrap ba.object (its ref-count has not yet been adjusted
3533 		 * so we can just NULL out the field).  Remove the backing
3534 		 * store.
3535 		 *
3536 		 * Then call vm_fault_copy_entry() to create a new object
3537 		 * in dst_entry and copy the wired pages from src to dst.
3538 		 *
3539 		 * The fault-copy code doesn't work with virtual page
3540 		 * tables.
3541 		 */
3542 		if ((obj = dst_entry->ba.object) != NULL) {
3543 			vm_map_backing_detach(&dst_entry->ba);
3544 			dst_entry->ba.object = NULL;
3545 			vm_map_entry_dispose_ba(dst_entry->ba.backing_ba);
3546 			dst_entry->ba.backing_ba = NULL;
3547 			dst_entry->ba.backing_count = 0;
3548 		}
3549 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3550 	} else {
3551 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3552 			/*
3553 			 * If the source entry is not already marked NEEDS_COPY
3554 			 * we need to write-protect the PTEs.
3555 			 */
3556 			pmap_protect(src_map->pmap,
3557 				     src_entry->ba.start,
3558 				     src_entry->ba.end,
3559 				     src_entry->protection & ~VM_PROT_WRITE);
3560 		}
3561 
3562 		/*
3563 		 * dst_entry->ba.object might be stale.  Update it (its
3564 		 * ref-count has not yet been updated so just overwrite
3565 		 * the field).
3566 		 *
3567 		 * If there is no object then we are golden.  Also, in
3568 		 * this situation if there are no backing_ba linkages then
3569 		 * we can set ba.offset to whatever we want.  For now we
3570 		 * set the offset to 0 to make debugging object sizes
3571 		 * easier.
3572 		 */
3573 		obj = src_entry->ba.object;
3574 
3575 		if (obj) {
3576 			src_entry->eflags |= (MAP_ENTRY_COW |
3577 					      MAP_ENTRY_NEEDS_COPY);
3578 			dst_entry->eflags |= (MAP_ENTRY_COW |
3579 					      MAP_ENTRY_NEEDS_COPY);
3580 			KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3581 		} else {
3582 			dst_entry->ba.offset = 0;
3583 		}
3584 
3585 		/*
3586 		 * Normal, allow the backing_ba link depth to
3587 		 * increase.
3588 		 */
3589 		pmap_copy(dst_map->pmap, src_map->pmap,
3590 			  dst_entry->ba.start,
3591 			  dst_entry->ba.end - dst_entry->ba.start,
3592 			  src_entry->ba.start);
3593 	}
3594 }
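
/*
 * Example (illustrative): after vm_map_copy_entry() both entries
 * normally carry MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY and keep
 * referencing the same backing object read-only; the first write
 * fault on either side is what actually produces private pages.
 * The caller-visible pattern, as used by vmspace_fork_normal_entry()
 * below, is roughly:
 *
 *	vm_map_backing_replicated(new_map, new_entry, 0);
 *	vm_map_entry_link(new_map, new_entry);
 *	vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
 */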
3595 
3596 /*
3597  * Create a vmspace for a new process and its related vm_map based on an
3598  * existing vmspace.  The new map inherits information from the old map
3599  * according to inheritance settings.
3600  *
3601  * The source map must not be locked.
3602  * No requirements.
3603  */
3604 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3605 			  vm_map_entry_t old_entry, int *countp);
3606 static void vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3607 			  vm_map_entry_t old_entry, int *countp);
3608 
3609 struct vmspace *
3610 vmspace_fork(struct vmspace *vm1)
3611 {
3612 	struct vmspace *vm2;
3613 	vm_map_t old_map = &vm1->vm_map;
3614 	vm_map_t new_map;
3615 	vm_map_entry_t old_entry;
3616 	int count;
3617 
3618 	lwkt_gettoken(&vm1->vm_map.token);
3619 	vm_map_lock(old_map);
3620 
3621 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3622 	lwkt_gettoken(&vm2->vm_map.token);
3623 
3624 	/*
3625 	 * We must bump the timestamp to force any concurrent fault
3626 	 * to retry.
3627 	 */
3628 	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3629 	      (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3630 	new_map = &vm2->vm_map;	/* XXX */
3631 	new_map->timestamp = 1;
3632 
3633 	vm_map_lock(new_map);
3634 
3635 	count = old_map->nentries;
3636 	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3637 
3638 	RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3639 		switch(old_entry->maptype) {
3640 		case VM_MAPTYPE_SUBMAP:
3641 			panic("vm_map_fork: encountered a submap");
3642 			break;
3643 		case VM_MAPTYPE_UKSMAP:
3644 			vmspace_fork_uksmap_entry(old_map, new_map,
3645 						  old_entry, &count);
3646 			break;
3647 		case VM_MAPTYPE_NORMAL:
3648 		case VM_MAPTYPE_VPAGETABLE:
3649 			vmspace_fork_normal_entry(old_map, new_map,
3650 						  old_entry, &count);
3651 			break;
3652 		}
3653 	}
3654 
3655 	new_map->size = old_map->size;
3656 	vm_map_unlock(new_map);
3657 	vm_map_unlock(old_map);
3658 	vm_map_entry_release(count);
3659 
3660 	lwkt_reltoken(&vm2->vm_map.token);
3661 	lwkt_reltoken(&vm1->vm_map.token);
3662 
3663 	return (vm2);
3664 }
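
/*
 * Usage sketch (illustrative, hypothetical process pointers): the
 * fork path hands the parent's vmspace to vmspace_fork() and installs
 * the returned copy in the child, roughly:
 *
 *	vm2 = vmspace_fork(p1->p_vmspace);
 *	p2->p_vmspace = vm2;
 *
 * The real caller also handles pmap finalization and accounting that
 * is not shown here.
 */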
3665 
3666 static
3667 void
3668 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3669 			  vm_map_entry_t old_entry, int *countp)
3670 {
3671 	vm_map_entry_t new_entry;
3672 	vm_map_backing_t ba;
3673 	vm_object_t object;
3674 
3675 	/*
3676 	 * If the backing_ba link list gets too long then fault it
3677 	 * all into the head object and dispose of the list.  We do
3678 	 * this in old_entry prior to cloning in order to benefit both
3679 	 * parent and child.
3680 	 *
3681 	 * We can test our fronting object's size against its
3682 	 * resident_page_count for a really cheap (but probably not perfect)
3683 	 * all-shadowed test, allowing us to disconnect the backing_ba
3684 	 * link list early.
3685 	 *
3686 	 * XXX Currently doesn't work for VPAGETABLEs (the entire object
3687 	 *     would have to be copied).
3688 	 */
3689 	object = old_entry->ba.object;
3690 	if (old_entry->ba.backing_ba &&
3691 	    old_entry->maptype != VM_MAPTYPE_VPAGETABLE &&
3692 	    (old_entry->ba.backing_count >= vm_map_backing_limit ||
3693 	     (vm_map_backing_shadow_test && object &&
3694 	      object->size == object->resident_page_count))) {
3695 		/*
3696 		 * If there are too many backing_ba linkages we
3697 		 * collapse everything into the head object.
3698 		 *
3699 		 * This will also remove all the pte's.
3700 		 */
3701 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3702 			vm_map_entry_shadow(old_entry);
3703 		if (object == NULL)
3704 			vm_map_entry_allocate_object(old_entry);
3705 		if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3706 			ba = old_entry->ba.backing_ba;
3707 			old_entry->ba.backing_ba = NULL;
3708 			old_entry->ba.backing_count = 0;
3709 			vm_map_entry_dispose_ba(ba);
3710 		}
3711 	}
3712 	object = NULL;	/* object variable is now invalid */
3713 
3714 	/*
3715 	 * Fork the entry
3716 	 */
3717 	switch (old_entry->inheritance) {
3718 	case VM_INHERIT_NONE:
3719 		break;
3720 	case VM_INHERIT_SHARE:
3721 		/*
3722 		 * Clone the entry as a shared entry.  This will look like
3723 		 * shared memory across the old and the new process.  We must
3724 		 * ensure that the object is allocated.
3725 		 */
3726 		if (old_entry->ba.object == NULL)
3727 			vm_map_entry_allocate_object(old_entry);
3728 
3729 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3730 			/*
3731 			 * Create the fronting vm_map_backing for
3732 			 * an entry which needs a copy, plus an extra
3733 			 * ref because we are going to duplicate it
3734 			 * in the fork.
3735 			 *
3736 			 * The call to vm_map_entry_shadow() will also clear
3737 			 * OBJ_ONEMAPPING.
3738 			 *
3739 			 * XXX no more collapse.  Still need extra ref
3740 			 * for the fork.
3741 			 */
3742 			vm_map_entry_shadow(old_entry);
3743 		} else if (old_entry->ba.object) {
3744 			object = old_entry->ba.object;
3745 		}
3746 
3747 		/*
3748 		 * Clone the entry.  We've already bumped the ref on
3749 		 * the vm_object for our new entry.
3750 		 */
3751 		new_entry = vm_map_entry_create(countp);
3752 		*new_entry = *old_entry;
3753 
3754 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3755 		new_entry->wired_count = 0;
3756 
3757 		/*
3758 		 * Replicate and index the vm_map_backing.  Don't share
3759 		 * the vm_map_backing across vm_map's (only across clips).
3760 		 *
3761 		 * Insert the entry into the new map -- we know we're
3762 		 * inserting at the end of the new map.
3763 		 */
3764 		vm_map_backing_replicated(new_map, new_entry, 0);
3765 		vm_map_entry_link(new_map, new_entry);
3766 
3767 		/*
3768 		 * Update the physical map
3769 		 */
3770 		pmap_copy(new_map->pmap, old_map->pmap,
3771 			  new_entry->ba.start,
3772 			  (old_entry->ba.end - old_entry->ba.start),
3773 			  old_entry->ba.start);
3774 		break;
3775 	case VM_INHERIT_COPY:
3776 		/*
3777 		 * Clone the entry and link the copy into the new map.
3778 		 *
3779 		 * Note that ref-counting adjustment for old_entry->ba.object
3780 		 * (if it isn't a special map that is) is handled by
3781 		 * vm_map_copy_entry().
3782 		 */
3783 		new_entry = vm_map_entry_create(countp);
3784 		*new_entry = *old_entry;
3785 
3786 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3787 		new_entry->wired_count = 0;
3788 
3789 		vm_map_backing_replicated(new_map, new_entry, 0);
3790 		vm_map_entry_link(new_map, new_entry);
3791 
3792 		/*
3793 		 * This does the actual dirty work of making both entries
3794 		 * copy-on-write, and will also handle the fronting object.
3795 		 */
3796 		vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3797 		break;
3798 	}
3799 }
3800 
3801 /*
3802  * When forking user-kernel shared maps, the map might change in the
3803  * child so do not try to copy the underlying pmap entries.
3804  */
3805 static
3806 void
3807 vmspace_fork_uksmap_entry(vm_map_t old_map, vm_map_t new_map,
3808 			  vm_map_entry_t old_entry, int *countp)
3809 {
3810 	vm_map_entry_t new_entry;
3811 
3812 	new_entry = vm_map_entry_create(countp);
3813 	*new_entry = *old_entry;
3814 
3815 	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3816 	new_entry->wired_count = 0;
3817 	KKASSERT(new_entry->ba.backing_ba == NULL);
3818 	vm_map_backing_replicated(new_map, new_entry, 0);
3819 
3820 	vm_map_entry_link(new_map, new_entry);
3821 }
3822 
3823 /*
3824  * Create an auto-grow stack entry
3825  *
3826  * No requirements.
3827  */
3828 int
3829 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3830 	      int flags, vm_prot_t prot, vm_prot_t max, int cow)
3831 {
3832 	vm_map_entry_t	prev_entry;
3833 	vm_map_entry_t	next;
3834 	vm_size_t	init_ssize;
3835 	int		rv;
3836 	int		count;
3837 	vm_offset_t	tmpaddr;
3838 
3839 	cow |= MAP_IS_STACK;
3840 
3841 	if (max_ssize < sgrowsiz)
3842 		init_ssize = max_ssize;
3843 	else
3844 		init_ssize = sgrowsiz;
3845 
3846 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3847 	vm_map_lock(map);
3848 
3849 	/*
3850 	 * Find space for the mapping
3851 	 */
3852 	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3853 		if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3854 				     flags, &tmpaddr)) {
3855 			vm_map_unlock(map);
3856 			vm_map_entry_release(count);
3857 			return (KERN_NO_SPACE);
3858 		}
3859 		*addrbos = tmpaddr;
3860 	}
3861 
3862 	/* If addr is already mapped, no go */
3863 	if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3864 		vm_map_unlock(map);
3865 		vm_map_entry_release(count);
3866 		return (KERN_NO_SPACE);
3867 	}
3868 
3869 #if 0
3870 	/* XXX already handled by kern_mmap() */
3871 	/* If we would blow our VMEM resource limit, no go */
3872 	if (map->size + init_ssize >
3873 	    curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3874 		vm_map_unlock(map);
3875 		vm_map_entry_release(count);
3876 		return (KERN_NO_SPACE);
3877 	}
3878 #endif
3879 
3880 	/*
3881 	 * If we can't accommodate max_ssize in the current mapping,
3882 	 * no go.  However, we need to be aware that subsequent user
3883 	 * mappings might map into the space we have reserved for
3884 	 * stack, and currently this space is not protected.
3885 	 *
3886 	 * Hopefully we will at least detect this condition
3887 	 * when we try to grow the stack.
3888 	 */
3889 	if (prev_entry)
3890 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
3891 	else
3892 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3893 
3894 	if (next && next->ba.start < *addrbos + max_ssize) {
3895 		vm_map_unlock(map);
3896 		vm_map_entry_release(count);
3897 		return (KERN_NO_SPACE);
3898 	}
3899 
3900 	/*
3901 	 * We initially map a stack of only init_ssize.  We will
3902 	 * grow as needed later.  Since this is to be a grow
3903 	 * down stack, we map at the top of the range.
3904 	 *
3905 	 * Note: we would normally expect prot and max to be
3906 	 * VM_PROT_ALL, and cow to be 0.  Possibly we should
3907 	 * eliminate these as input parameters, and just
3908 	 * pass these values here in the insert call.
3909 	 */
3910 	rv = vm_map_insert(map, &count, NULL, NULL,
3911 			   0, *addrbos + max_ssize - init_ssize,
3912 	                   *addrbos + max_ssize,
3913 			   VM_MAPTYPE_NORMAL,
3914 			   VM_SUBSYS_STACK, prot, max, cow);
3915 
3916 	/* Now set the avail_ssize amount */
3917 	if (rv == KERN_SUCCESS) {
3918 		if (prev_entry)
3919 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
3920 		else
3921 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3922 		if (prev_entry != NULL) {
3923 			vm_map_clip_end(map,
3924 					prev_entry,
3925 					*addrbos + max_ssize - init_ssize,
3926 					&count);
3927 		}
3928 		if (next->ba.end   != *addrbos + max_ssize ||
3929 		    next->ba.start != *addrbos + max_ssize - init_ssize){
3930 			panic ("Bad entry start/end for new stack entry");
3931 		} else {
3932 			next->aux.avail_ssize = max_ssize - init_ssize;
3933 		}
3934 	}
3935 
3936 	vm_map_unlock(map);
3937 	vm_map_entry_release(count);
3938 	return (rv);
3939 }
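
/*
 * Usage sketch (illustrative, hypothetical values): a caller setting
 * up a grow-down user stack reserves max_ssize of address space but
 * maps only the initial portion:
 *
 *	vm_offset_t bos = stack_bottom_hint;
 *
 *	rv = vm_map_stack(&vmspace->vm_map, &bos, maxssiz,
 *			  0, VM_PROT_ALL, VM_PROT_ALL, 0);
 *
 * On success only init_ssize bytes at the top of the range are
 * mapped; the remainder is recorded in aux.avail_ssize and grown
 * later via vm_map_growstack().
 */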
3940 
3941 /*
3942  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3943  * desired address is already mapped, or if we successfully grow
3944  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3945  * stack range (this is strange, but preserves compatibility with
3946  * the grow function in vm_machdep.c).
3947  *
3948  * No requirements.
3949  */
3950 int
3951 vm_map_growstack (vm_map_t map, vm_offset_t addr)
3952 {
3953 	vm_map_entry_t prev_entry;
3954 	vm_map_entry_t stack_entry;
3955 	vm_map_entry_t next;
3956 	struct vmspace *vm;
3957 	struct lwp *lp;
3958 	struct proc *p;
3959 	vm_offset_t    end;
3960 	int grow_amount;
3961 	int rv = KERN_SUCCESS;
3962 	int is_procstack;
3963 	int use_read_lock = 1;
3964 	int count;
3965 
3966 	/*
3967 	 * Find the vm
3968 	 */
3969 	lp = curthread->td_lwp;
3970 	p = curthread->td_proc;
3971 	KKASSERT(lp != NULL);
3972 	vm = lp->lwp_vmspace;
3973 
3974 	/*
3975 	 * Growstack is only allowed on the current process.  We disallow
3976 	 * other use cases, e.g. trying to access memory via procfs that
3977 	 * the stack hasn't grown into.
3978 	 */
3979 	if (map != &vm->vm_map) {
3980 		return KERN_FAILURE;
3981 	}
3982 
3983 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3984 Retry:
3985 	if (use_read_lock)
3986 		vm_map_lock_read(map);
3987 	else
3988 		vm_map_lock(map);
3989 
3990 	/*
3991 	 * If addr is already in the entry range, no need to grow.
3992 	 * prev_entry returns NULL if addr is at the head.
3993 	 */
3994 	if (vm_map_lookup_entry(map, addr, &prev_entry))
3995 		goto done;
3996 	if (prev_entry)
3997 		stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
3998 	else
3999 		stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4000 
4001 	if (stack_entry == NULL)
4002 		goto done;
4003 	if (prev_entry == NULL)
4004 		end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4005 	else
4006 		end = prev_entry->ba.end;
4007 
4008 	/*
4009 	 * This next test mimics the old grow function in vm_machdep.c.
4010 	 * It really doesn't quite make sense, but we do it anyway
4011 	 * for compatibility.
4012 	 *
4013 	 * If the stack is not growable, return success.  This signals the
4014 	 * caller to proceed as it normally would with normal vm.
4015 	 */
4016 	if (stack_entry->aux.avail_ssize < 1 ||
4017 	    addr >= stack_entry->ba.start ||
4018 	    addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4019 		goto done;
4020 	}
4021 
4022 	/* Find the minimum grow amount */
4023 	grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4024 	if (grow_amount > stack_entry->aux.avail_ssize) {
4025 		rv = KERN_NO_SPACE;
4026 		goto done;
4027 	}
4028 
4029 	/*
4030 	 * If there is no longer enough space between the entries, adjust
4031 	 * the available space and fail (nogo).  Note: this
4032 	 * should only happen if the user has mapped into the
4033 	 * stack area after the stack was created, and is
4034 	 * probably an error.
4035 	 *
4036 	 * This also effectively destroys any guard page the user
4037 	 * might have intended by limiting the stack size.
4038 	 */
4039 	if (grow_amount > stack_entry->ba.start - end) {
4040 		if (use_read_lock && vm_map_lock_upgrade(map)) {
4041 			/* lost lock */
4042 			use_read_lock = 0;
4043 			goto Retry;
4044 		}
4045 		use_read_lock = 0;
4046 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4047 		rv = KERN_NO_SPACE;
4048 		goto done;
4049 	}
4050 
4051 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4052 
4053 	/* If this is the main process stack, see if we're over the
4054 	 * stack limit.
4055 	 */
4056 	if (is_procstack && (vm->vm_ssize + grow_amount >
4057 			     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4058 		rv = KERN_NO_SPACE;
4059 		goto done;
4060 	}
4061 
4062 	/* Round up the grow amount modulo SGROWSIZ */
4063 	grow_amount = roundup (grow_amount, sgrowsiz);
4064 	if (grow_amount > stack_entry->aux.avail_ssize) {
4065 		grow_amount = stack_entry->aux.avail_ssize;
4066 	}
4067 	if (is_procstack && (vm->vm_ssize + grow_amount >
4068 	                     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4069 		grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4070 	}
4071 
4072 	/* If we would blow our VMEM resource limit, no go */
4073 	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4074 		rv = KERN_NO_SPACE;
4075 		goto done;
4076 	}
4077 
4078 	if (use_read_lock && vm_map_lock_upgrade(map)) {
4079 		/* lost lock */
4080 		use_read_lock = 0;
4081 		goto Retry;
4082 	}
4083 	use_read_lock = 0;
4084 
4085 	/* Get the preliminary new entry start value */
4086 	addr = stack_entry->ba.start - grow_amount;
4087 
4088 	/* If this puts us into the previous entry, cut back our growth
4089 	 * to the available space.  Also, see the note above.
4090 	 */
4091 	if (addr < end) {
4092 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4093 		addr = end;
4094 	}
4095 
4096 	rv = vm_map_insert(map, &count, NULL, NULL,
4097 			   0, addr, stack_entry->ba.start,
4098 			   VM_MAPTYPE_NORMAL,
4099 			   VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4100 
4101 	/* Adjust the available stack space by the amount we grew. */
4102 	if (rv == KERN_SUCCESS) {
4103 		if (prev_entry) {
4104 			vm_map_clip_end(map, prev_entry, addr, &count);
4105 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
4106 		} else {
4107 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4108 		}
4109 		if (next->ba.end != stack_entry->ba.start  ||
4110 		    next->ba.start != addr) {
4111 			panic ("Bad stack grow start/end in new stack entry");
4112 		} else {
4113 			next->aux.avail_ssize =
4114 				stack_entry->aux.avail_ssize -
4115 				(next->ba.end - next->ba.start);
4116 			if (is_procstack) {
4117 				vm->vm_ssize += next->ba.end -
4118 						next->ba.start;
4119 			}
4120 		}
4121 
4122 		if (map->flags & MAP_WIREFUTURE)
4123 			vm_map_unwire(map, next->ba.start, next->ba.end, FALSE);
4124 	}
4125 
4126 done:
4127 	if (use_read_lock)
4128 		vm_map_unlock_read(map);
4129 	else
4130 		vm_map_unlock(map);
4131 	vm_map_entry_release(count);
4132 	return (rv);
4133 }
4134 
4135 /*
4136  * Unshare the specified VM space for exec.  If other processes are
4137  * mapped to it, then create a new one.  The new vmspace is null.
4138  *
4139  * No requirements.
4140  */
4141 void
4142 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4143 {
4144 	struct vmspace *oldvmspace = p->p_vmspace;
4145 	struct vmspace *newvmspace;
4146 	vm_map_t map = &p->p_vmspace->vm_map;
4147 
4148 	/*
4149 	 * If we are execing a resident vmspace we fork it, otherwise
4150 	 * we create a new vmspace.  Note that exitingcnt is not
4151 	 * copied to the new vmspace.
4152 	 */
4153 	lwkt_gettoken(&oldvmspace->vm_map.token);
4154 	if (vmcopy)  {
4155 		newvmspace = vmspace_fork(vmcopy);
4156 		lwkt_gettoken(&newvmspace->vm_map.token);
4157 	} else {
4158 		newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4159 		lwkt_gettoken(&newvmspace->vm_map.token);
4160 		bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4161 		      (caddr_t)&oldvmspace->vm_endcopy -
4162 		       (caddr_t)&oldvmspace->vm_startcopy);
4163 	}
4164 
4165 	/*
4166 	 * Finish initializing the vmspace before assigning it
4167 	 * to the process.  The vmspace will become the current vmspace
4168 	 * if p == curproc.
4169 	 */
4170 	pmap_pinit2(vmspace_pmap(newvmspace));
4171 	pmap_replacevm(p, newvmspace, 0);
4172 	lwkt_reltoken(&newvmspace->vm_map.token);
4173 	lwkt_reltoken(&oldvmspace->vm_map.token);
4174 	vmspace_rel(oldvmspace);
4175 }
4176 
4177 /*
4178  * Unshare the specified VM space for forcing COW.  This
4179  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4180  */
4181 void
4182 vmspace_unshare(struct proc *p)
4183 {
4184 	struct vmspace *oldvmspace = p->p_vmspace;
4185 	struct vmspace *newvmspace;
4186 
4187 	lwkt_gettoken(&oldvmspace->vm_map.token);
4188 	if (vmspace_getrefs(oldvmspace) == 1) {
4189 		lwkt_reltoken(&oldvmspace->vm_map.token);
4190 		return;
4191 	}
4192 	newvmspace = vmspace_fork(oldvmspace);
4193 	lwkt_gettoken(&newvmspace->vm_map.token);
4194 	pmap_pinit2(vmspace_pmap(newvmspace));
4195 	pmap_replacevm(p, newvmspace, 0);
4196 	lwkt_reltoken(&newvmspace->vm_map.token);
4197 	lwkt_reltoken(&oldvmspace->vm_map.token);
4198 	vmspace_rel(oldvmspace);
4199 }
4200 
4201 /*
4202  * vm_map_hint: return the beginning of the best area suitable for
4203  * creating a new mapping with "prot" protection.
4204  *
4205  * No requirements.
4206  */
4207 vm_offset_t
4208 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4209 {
4210 	struct vmspace *vms = p->p_vmspace;
4211 	struct rlimit limit;
4212 	rlim_t dsiz;
4213 
4214 	/*
4215 	 * Acquire datasize limit for mmap() operation,
4216 	 * calculate nearest power of 2.
4217 	 */
4218 	if (kern_getrlimit(RLIMIT_DATA, &limit))
4219 		limit.rlim_cur = maxdsiz;
4220 	dsiz = limit.rlim_cur;
4221 
4222 	if (!randomize_mmap || addr != 0) {
4223 		/*
4224 		 * Set a reasonable start point for the hint if it was
4225 		 * not specified or if it falls within the heap space.
4226 		 * Hinted mmap()s do not allocate out of the heap space.
4227 		 */
4228 		if (addr == 0 ||
4229 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4230 		     addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4231 			addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4232 		}
4233 
4234 		return addr;
4235 	}
4236 
4237 	/*
4238 	 * randomize_mmap && addr == 0.  For now randomize the
4239 	 * address within a dsiz range beyond the data limit.
4240 	 */
4241 	addr = (vm_offset_t)vms->vm_daddr + dsiz;
4242 	if (dsiz)
4243 		addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4244 	return (round_page(addr));
4245 }
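
/*
 * Example (illustrative, hypothetical numbers): with vm_daddr at
 * 0x0000000001000000 and a 128MB data limit, an unhinted request
 * yields a hint of 0x0000000009000000; with randomize_mmap enabled
 * the hint instead lands page-aligned somewhere within the 128MB
 * window beyond that point.
 */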
4246 
4247 /*
4248  * Finds the VM object, offset, and protection for a given virtual address
4249  * in the specified map, assuming a page fault of the type specified.
4250  *
4251  * Leaves the map in question locked for read; return values are guaranteed
4252  * until a vm_map_lookup_done call is performed.  Note that the map argument
4253  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4254  *
4255  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4256  * that fast.
4257  *
4258  * If a lookup is requested with "write protection" specified, the map may
4259  * be changed to perform virtual copying operations, although the data
4260  * referenced will remain the same.
4261  *
4262  * No requirements.
4263  */
4264 int
4265 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4266 	      vm_offset_t vaddr,
4267 	      vm_prot_t fault_typea,
4268 	      vm_map_entry_t *out_entry,	/* OUT */
4269 	      struct vm_map_backing **bap,	/* OUT */
4270 	      vm_pindex_t *pindex,		/* OUT */
4271 	      vm_prot_t *out_prot,		/* OUT */
4272 	      int *wflags)			/* OUT */
4273 {
4274 	vm_map_entry_t entry;
4275 	vm_map_t map = *var_map;
4276 	vm_prot_t prot;
4277 	vm_prot_t fault_type = fault_typea;
4278 	int use_read_lock = 1;
4279 	int rv = KERN_SUCCESS;
4280 	int count;
4281 	thread_t td = curthread;
4282 
4283 	/*
4284 	 * vm_map_entry_reserve() implements an important mitigation
4285 	 * against mmap() spam running the kernel out of vm_map_entry
4286 	 * structures, but it can also cause an infinite call recursion.
4287 	 * Use td_nest_count to prevent an infinite recursion (allows
4288 	 * the vm_map code to dig into the pcpu vm_map_entry reserve).
4289 	 */
4290 	count = 0;
4291 	if (td->td_nest_count == 0) {
4292 		++td->td_nest_count;
4293 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4294 		--td->td_nest_count;
4295 	}
4296 RetryLookup:
4297 	if (use_read_lock)
4298 		vm_map_lock_read(map);
4299 	else
4300 		vm_map_lock(map);
4301 
4302 	/*
4303 	 * Always do a full lookup.  The hint doesn't get us much anymore
4304 	 * now that the map is RB'd.
4305 	 */
4306 	cpu_ccfence();
4307 	*out_entry = NULL;
4308 	*bap = NULL;
4309 
4310 	{
4311 		vm_map_entry_t tmp_entry;
4312 
4313 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4314 			rv = KERN_INVALID_ADDRESS;
4315 			goto done;
4316 		}
4317 		entry = tmp_entry;
4318 		*out_entry = entry;
4319 	}
4320 
4321 	/*
4322 	 * Handle submaps.
4323 	 */
4324 	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4325 		vm_map_t old_map = map;
4326 
4327 		*var_map = map = entry->ba.sub_map;
4328 		if (use_read_lock)
4329 			vm_map_unlock_read(old_map);
4330 		else
4331 			vm_map_unlock(old_map);
4332 		use_read_lock = 1;
4333 		goto RetryLookup;
4334 	}
4335 
4336 	/*
4337 	 * Check whether this task is allowed to have this page.
4338 	 * Note the special case for MAP_ENTRY_COW pages with an override.
4339 	 * This is to implement a forced COW for debuggers.
4340 	 */
4341 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
4342 		prot = entry->max_protection;
4343 	else
4344 		prot = entry->protection;
4345 
4346 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4347 	if ((fault_type & prot) != fault_type) {
4348 		rv = KERN_PROTECTION_FAILURE;
4349 		goto done;
4350 	}
4351 
4352 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4353 	    (entry->eflags & MAP_ENTRY_COW) &&
4354 	    (fault_type & VM_PROT_WRITE) &&
4355 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4356 		rv = KERN_PROTECTION_FAILURE;
4357 		goto done;
4358 	}
4359 
4360 	/*
4361 	 * If this page is not pageable, we have to get it for all possible
4362 	 * accesses.
4363 	 */
4364 	*wflags = 0;
4365 	if (entry->wired_count) {
4366 		*wflags |= FW_WIRED;
4367 		prot = fault_type = entry->protection;
4368 	}
4369 
4370 	/*
4371 	 * Virtual page tables may need to update the accessed (A) bit
4372 	 * in a page table entry.  Upgrade the fault to a write fault for
4373 	 * that case if the map will support it.  If the map does not support
4374 	 * it the page table entry simply will not be updated.
4375 	 */
4376 	if (entry->maptype == VM_MAPTYPE_VPAGETABLE) {
4377 		if (prot & VM_PROT_WRITE)
4378 			fault_type |= VM_PROT_WRITE;
4379 	}
4380 
4381 	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4382 	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4383 		if ((prot & VM_PROT_WRITE) == 0)
4384 			fault_type |= VM_PROT_WRITE;
4385 	}
4386 
4387 	/*
4388 	 * Only NORMAL and VPAGETABLE maps are object-based.  UKSMAPs are not.
4389 	 */
4390 	if (entry->maptype != VM_MAPTYPE_NORMAL &&
4391 	    entry->maptype != VM_MAPTYPE_VPAGETABLE) {
4392 		*bap = NULL;
4393 		goto skip;
4394 	}
4395 
4396 	/*
4397 	 * If the entry was copy-on-write, shadow it now or demote access.
4398 	 */
4399 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4400 		/*
4401 		 * If we want to write the page, we may as well handle that
4402 		 * now since we've got the map locked.
4403 		 *
4404 		 * If we don't need to write the page, we just demote the
4405 		 * permissions allowed.
4406 		 */
4407 		if (fault_type & VM_PROT_WRITE) {
4408 			/*
4409 			 * Not allowed if TDF_NOFAULT is set as the shadowing
4410 			 * operation can deadlock against the faulting
4411 			 * function due to the copy-on-write.
4412 			 */
4413 			if (curthread->td_flags & TDF_NOFAULT) {
4414 				rv = KERN_FAILURE_NOFAULT;
4415 				goto done;
4416 			}
4417 
4418 			/*
4419 			 * Make a new vm_map_backing + object, and place it
4420 			 * in the object chain.  Note that no new references
4421 			 * have appeared -- one just moved from the map to
4422 			 * the new object.
4423 			 */
4424 			if (use_read_lock && vm_map_lock_upgrade(map)) {
4425 				/* lost lock */
4426 				use_read_lock = 0;
4427 				goto RetryLookup;
4428 			}
4429 			use_read_lock = 0;
4430 			vm_map_entry_shadow(entry);
4431 			*wflags |= FW_DIDCOW;
4432 		} else {
4433 			/*
4434 			 * We're attempting to read a copy-on-write page --
4435 			 * don't allow writes.
4436 			 */
4437 			prot &= ~VM_PROT_WRITE;
4438 		}
4439 	}
4440 
4441 	/*
4442 	 * Create an object if necessary.  This code also handles
4443 	 * partitioning large entries to improve vm_fault performance.
4444 	 */
4445 	if (entry->ba.object == NULL && !map->system_map) {
4446 		if (use_read_lock && vm_map_lock_upgrade(map))  {
4447 			/* lost lock */
4448 			use_read_lock = 0;
4449 			goto RetryLookup;
4450 		}
4451 		use_read_lock = 0;
4452 
4453 		/*
4454 		 * Partition large entries, giving each its own VM object,
4455 		 * to improve concurrent fault performance.  This is only
4456 		 * applicable to userspace.
4457 		 */
4458 		if (map != &kernel_map &&
4459 		    entry->maptype == VM_MAPTYPE_NORMAL &&
4460 		    ((entry->ba.start ^ entry->ba.end) &
4461 		     ~MAP_ENTRY_PARTITION_MASK) &&
4462 		    vm_map_partition_enable) {
4463 			if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4464 				entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4465 				++mycpu->gd_cnt.v_intrans_coll;
4466 				++mycpu->gd_cnt.v_intrans_wait;
4467 				vm_map_transition_wait(map, 0);
4468 				goto RetryLookup;
4469 			}
4470 			vm_map_entry_partition(map, entry, vaddr, &count);
4471 		}
4472 		vm_map_entry_allocate_object(entry);
4473 	}
4474 
4475 	/*
4476 	 * Return the object/offset from this entry.  If the entry was
4477 	 * copy-on-write or empty, it has been fixed up.
4478 	 */
4479 	*bap = &entry->ba;
4480 
4481 skip:
4482 	*pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
4483 
4484 	/*
4485 	 * Return whether this is the only map sharing this data.  On
4486 	 * success we return with a read lock held on the map.  On failure
4487 	 * we return with the map unlocked.
4488 	 */
4489 	*out_prot = prot;
4490 done:
4491 	if (rv == KERN_SUCCESS) {
4492 		if (use_read_lock == 0)
4493 			vm_map_lock_downgrade(map);
4494 	} else if (use_read_lock) {
4495 		vm_map_unlock_read(map);
4496 	} else {
4497 		vm_map_unlock(map);
4498 	}
4499 	if (count > 0)
4500 		vm_map_entry_release(count);
4501 
4502 	return (rv);
4503 }
4504 
4505 /*
4506  * Releases locks acquired by a vm_map_lookup()
4507  * (according to the handle returned by that lookup).
4508  *
4509  * No other requirements.
4510  */
4511 void
4512 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4513 {
4514 	/*
4515 	 * Unlock the main-level map
4516 	 */
4517 	vm_map_unlock_read(map);
4518 	if (count)
4519 		vm_map_entry_release(count);
4520 }
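
/*
 * Usage sketch (illustrative): vm_map_lookup() and vm_map_lookup_done()
 * bracket a fault-style lookup.  A hypothetical caller:
 *
 *	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ,
 *			   &entry, &ba, &pindex, &prot, &wflags);
 *	if (rv == KERN_SUCCESS) {
 *		... use ba->object and pindex while the read lock is held ...
 *		vm_map_lookup_done(map, entry, 0);
 *	}
 *
 * On failure the map is returned unlocked and no _done call is needed.
 */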
4521 
4522 static void
4523 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4524 		       vm_offset_t vaddr, int *countp)
4525 {
4526 	vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4527 	vm_map_clip_start(map, entry, vaddr, countp);
4528 	vaddr += MAP_ENTRY_PARTITION_SIZE;
4529 	vm_map_clip_end(map, entry, vaddr, countp);
4530 }
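
/*
 * Example (illustrative): partitioning clips the entry to the
 * MAP_ENTRY_PARTITION_SIZE-aligned window containing vaddr,
 * intersected with the entry's own bounds.  If the partition size
 * were 16MB (hypothetical value) and vaddr were 0x0000000012345000,
 * a large entry would be clipped to
 * [0x0000000012000000, 0x0000000013000000), letting concurrent faults
 * in other windows work on their own entry and object.
 */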
4531 
4532 /*
4533  * Quick hack, needs some help to make it more SMP friendly.
4534  */
4535 void
4536 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4537 		 vm_offset_t ran_beg, vm_offset_t ran_end)
4538 {
4539 	struct vm_map_ilock *scan;
4540 
4541 	ilock->ran_beg = ran_beg;
4542 	ilock->ran_end = ran_end;
4543 	ilock->flags = 0;
4544 
4545 	spin_lock(&map->ilock_spin);
4546 restart:
4547 	for (scan = map->ilock_base; scan; scan = scan->next) {
4548 		if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4549 			scan->flags |= ILOCK_WAITING;
4550 			ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4551 			goto restart;
4552 		}
4553 	}
4554 	ilock->next = map->ilock_base;
4555 	map->ilock_base = ilock;
4556 	spin_unlock(&map->ilock_spin);
4557 }
4558 
4559 void
4560 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4561 {
4562 	struct vm_map_ilock *scan;
4563 	struct vm_map_ilock **scanp;
4564 
4565 	spin_lock(&map->ilock_spin);
4566 	scanp = &map->ilock_base;
4567 	while ((scan = *scanp) != NULL) {
4568 		if (scan == ilock) {
4569 			*scanp = ilock->next;
4570 			spin_unlock(&map->ilock_spin);
4571 			if (ilock->flags & ILOCK_WAITING)
4572 				wakeup(ilock);
4573 			return;
4574 		}
4575 		scanp = &scan->next;
4576 	}
4577 	spin_unlock(&map->ilock_spin);
4578 	panic("vm_map_deinterlock: missing ilock!");
4579 }
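
/*
 * Usage sketch (illustrative): the interlock pair serializes work on
 * overlapping address ranges:
 *
 *	struct vm_map_ilock ilock;
 *
 *	vm_map_interlock(map, &ilock, start, end);
 *	... operate on [start, end) ...
 *	vm_map_deinterlock(map, &ilock);
 *
 * A second caller whose range overlaps blocks in vm_map_interlock()
 * until the holder calls vm_map_deinterlock().
 */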
4580 
4581 #include "opt_ddb.h"
4582 #ifdef DDB
4583 #include <ddb/ddb.h>
4584 
4585 /*
4586  * Debugging only
4587  */
4588 DB_SHOW_COMMAND(map, vm_map_print)
4589 {
4590 	static int nlines;
4591 	/* XXX convert args. */
4592 	vm_map_t map = (vm_map_t)addr;
4593 	boolean_t full = have_addr;
4594 
4595 	vm_map_entry_t entry;
4596 
4597 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4598 	    (void *)map,
4599 	    (void *)map->pmap, map->nentries, map->timestamp);
4600 	nlines++;
4601 
4602 	if (!full && db_indent)
4603 		return;
4604 
4605 	db_indent += 2;
4606 	RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4607 		db_iprintf("map entry %p: start=%p, end=%p\n",
4608 		    (void *)entry,
4609 		    (void *)entry->ba.start, (void *)entry->ba.end);
4610 		nlines++;
4611 		{
4612 			static char *inheritance_name[4] =
4613 			{"share", "copy", "none", "donate_copy"};
4614 
4615 			db_iprintf(" prot=%x/%x/%s",
4616 			    entry->protection,
4617 			    entry->max_protection,
4618 			    inheritance_name[(int)(unsigned char)
4619 						entry->inheritance]);
4620 			if (entry->wired_count != 0)
4621 				db_printf(", wired");
4622 		}
4623 		switch(entry->maptype) {
4624 		case VM_MAPTYPE_SUBMAP:
4625 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4626 			db_printf(", share=%p, offset=0x%lx\n",
4627 			    (void *)entry->ba.sub_map,
4628 			    (long)entry->ba.offset);
4629 			nlines++;
4630 
4631 			db_indent += 2;
4632 			vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4633 				     full, 0, NULL);
4634 			db_indent -= 2;
4635 			break;
4636 		case VM_MAPTYPE_NORMAL:
4637 		case VM_MAPTYPE_VPAGETABLE:
4638 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4639 			db_printf(", object=%p, offset=0x%lx",
4640 			    (void *)entry->ba.object,
4641 			    (long)entry->ba.offset);
4642 			if (entry->eflags & MAP_ENTRY_COW)
4643 				db_printf(", copy (%s)",
4644 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4645 			db_printf("\n");
4646 			nlines++;
4647 
4648 			if (entry->ba.object) {
4649 				db_indent += 2;
4650 				vm_object_print((db_expr_t)(intptr_t)
4651 						entry->ba.object,
4652 						full, 0, NULL);
4653 				nlines += 4;
4654 				db_indent -= 2;
4655 			}
4656 			break;
4657 		case VM_MAPTYPE_UKSMAP:
4658 			db_printf(", uksmap=%p, offset=0x%lx",
4659 			    (void *)entry->ba.uksmap,
4660 			    (long)entry->ba.offset);
4661 			if (entry->eflags & MAP_ENTRY_COW)
4662 				db_printf(", copy (%s)",
4663 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4664 			db_printf("\n");
4665 			nlines++;
4666 			break;
4667 		default:
4668 			break;
4669 		}
4670 	}
4671 	db_indent -= 2;
4672 	if (db_indent == 0)
4673 		nlines = 0;
4674 }
4675 
4676 /*
4677  * Debugging only
4678  */
4679 DB_SHOW_COMMAND(procvm, procvm)
4680 {
4681 	struct proc *p;
4682 
4683 	if (have_addr) {
4684 		p = (struct proc *) addr;
4685 	} else {
4686 		p = curproc;
4687 	}
4688 
4689 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4690 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4691 	    (void *)vmspace_pmap(p->p_vmspace));
4692 
4693 	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4694 }
4695 
4696 #endif /* DDB */
4697