xref: /dflybsd-src/sys/vm/vm_object.c (revision 530e94fc9e8b4693c7e841a45371bdb6e76ee4cd)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91 
92 #include <vm/vm_page2.h>
93 
94 #include <machine/specialreg.h>
95 
96 #define EASY_SCAN_FACTOR	8
97 
98 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
99 					     int pagerflags);
100 static void	vm_object_lock_init(vm_object_t);
101 
102 /*
103  *	Virtual memory objects maintain the actual data
104  *	associated with allocated virtual memory.  A given
105  *	page of memory exists within exactly one object.
106  *
107  *	An object is only deallocated when all "references"
108  *	are given up.  Only one "reference" to a given
109  *	region of an object should be writeable.
110  *
111  *	Associated with each object is a list of all resident
112  *	memory pages belonging to that object; this list is
113  *	maintained by the "vm_page" module, and locked by the object's
114  *	lock.
115  *
116  *	Each object also records a "pager" routine which is
117  *	used to retrieve (and store) pages to the proper backing
118  *	storage.  In addition, objects may be backed by other
119  *	objects from which they were virtual-copied.
120  *
121  *	The only items within the object structure which are
122  *	modified after time of creation are:
123  *		reference count		locked by object's lock
124  *		pager routine		locked by object's lock
125  *
126  */
127 
128 struct vm_object kernel_object;
129 
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
131 
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
133 
134 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2	989042931893ULL
136 
137 int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
139 
140 static __inline
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
143 {
144 	uintptr_t hash1;
145 	uintptr_t hash2;
146 
147 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148 	hash1 %= VMOBJ_HASH_PRIME1;
149 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150 	hash2 %= VMOBJ_HASH_PRIME2;
151 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
152 }
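
/*
 * Illustrative sketch (not part of the build): the per-bucket token
 * protects that bucket's object list.  The usual pattern, as used by
 * _vm_object_allocate() and vm_object_terminate(), is:
 *
 *	struct vm_object_hash *hash;
 *
 *	hash = vmobj_hash(obj);
 *	lwkt_gettoken(&hash->token);
 *	TAILQ_INSERT_TAIL(&hash->list, obj, object_entry);
 *	lwkt_reltoken(&hash->token);
 */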
153 
154 #if defined(DEBUG_LOCKS)
155 
156 #define vm_object_vndeallocate(obj, vpp)	\
157                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
158 
159 /*
160  * Debug helper to track hold/drop/ref/deallocate calls.
161  */
162 static void
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
164 {
165 	int i;
166 
167 	i = atomic_fetchadd_int(&obj->debug_index, 1);
168 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169 	ksnprintf(obj->debug_hold_thrs[i],
170 		  sizeof(obj->debug_hold_thrs[i]),
171 		  "%c%d:(%d):%s",
172 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
174 		  obj->ref_count,
175 		  curthread->td_comm);
176 	obj->debug_hold_file[i] = file;
177 	obj->debug_hold_line[i] = line;
178 #if 0
179 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
180 	if (strcmp(curthread->td_comm, "sshd") == 0) {
181 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
183 			obj, obj->ref_count, addrem, file, line);
184 	}
185 #endif
186 }
187 
188 #endif
189 
190 /*
191  * Misc low level routines
192  */
193 static void
194 vm_object_lock_init(vm_object_t obj)
195 {
196 #if defined(DEBUG_LOCKS)
197 	int i;
198 
199 	obj->debug_index = 0;
200 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201 		obj->debug_hold_thrs[i][0] = 0;
202 		obj->debug_hold_file[i] = NULL;
203 		obj->debug_hold_line[i] = 0;
204 	}
205 #endif
206 }
207 
208 void
209 vm_object_lock_swap(void)
210 {
211 	lwkt_token_swap();
212 }
213 
214 void
215 vm_object_lock(vm_object_t obj)
216 {
217 	lwkt_gettoken(&obj->token);
218 }
219 
220 /*
221  * Returns TRUE on success
222  */
223 static int
224 vm_object_lock_try(vm_object_t obj)
225 {
226 	return(lwkt_trytoken(&obj->token));
227 }
228 
229 void
230 vm_object_lock_shared(vm_object_t obj)
231 {
232 	lwkt_gettoken_shared(&obj->token);
233 }
234 
235 void
236 vm_object_unlock(vm_object_t obj)
237 {
238 	lwkt_reltoken(&obj->token);
239 }
240 
241 void
242 vm_object_upgrade(vm_object_t obj)
243 {
244 	lwkt_reltoken(&obj->token);
245 	lwkt_gettoken(&obj->token);
246 }
247 
248 void
249 vm_object_downgrade(vm_object_t obj)
250 {
251 	lwkt_reltoken(&obj->token);
252 	lwkt_gettoken_shared(&obj->token);
253 }
254 
255 static __inline void
256 vm_object_assert_held(vm_object_t obj)
257 {
258 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
259 }
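
/*
 * Usage sketch (illustrative only): vm_object_upgrade() releases the
 * shared token before re-acquiring it exclusively, so the caller may
 * block and must revalidate any state sampled under the shared hold.
 *
 *	vm_object_hold_shared(obj);
 *	if (need_exclusive_access)	(hypothetical condition)
 *		vm_object_upgrade(obj);
 *	... recheck state, then operate on obj ...
 *	vm_object_drop(obj);
 */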
260 
261 int
262 vm_quickcolor(void)
263 {
264 	globaldata_t gd = mycpu;
265 	int pg_color;
266 
267 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268 	pg_color += gd->gd_quick_color;
269 	gd->gd_quick_color += PQ_PRIME2;
270 
271 	return pg_color;
272 }
273 
274 void
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
276 {
277 	KKASSERT(obj != NULL);
278 
279 	/*
280 	 * Object must be held (object allocation is stable due to the caller's
281 	 * context, typically already holding the token on a parent object)
282 	 * prior to potentially blocking on the lock, otherwise the object
283 	 * can get ripped away from us.
284 	 */
285 	refcount_acquire(&obj->hold_count);
286 	vm_object_lock(obj);
287 
288 #if defined(DEBUG_LOCKS)
289 	debugvm_object_add(obj, file, line, 1);
290 #endif
291 }
292 
293 int
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
295 {
296 	KKASSERT(obj != NULL);
297 
298 	/*
299 	 * Object must be held (object allocation is stable due to the caller's
300 	 * context, typically already holding the token on a parent object)
301 	 * prior to potentially blocking on the lock, otherwise the object
302 	 * can get ripped away from us.
303 	 */
304 	refcount_acquire(&obj->hold_count);
305 	if (vm_object_lock_try(obj) == 0) {
306 		if (refcount_release(&obj->hold_count)) {
307 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308 				kfree(obj, M_VM_OBJECT);
309 		}
310 		return(0);
311 	}
312 
313 #if defined(DEBUG_LOCKS)
314 	debugvm_object_add(obj, file, line, 1);
315 #endif
316 	return(1);
317 }
318 
319 void
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
321 {
322 	KKASSERT(obj != NULL);
323 
324 	/*
325 	 * Object must be held (object allocation is stable due to the caller's
326 	 * context, typically already holding the token on a parent object)
327 	 * prior to potentially blocking on the lock, otherwise the object
328 	 * can get ripped away from us.
329 	 */
330 	refcount_acquire(&obj->hold_count);
331 	vm_object_lock_shared(obj);
332 
333 #if defined(DEBUG_LOCKS)
334 	debugvm_object_add(obj, file, line, 1);
335 #endif
336 }
337 
338 /*
339  * Drop the token and hold_count on the object.
340  *
341  * WARNING! Token might be shared.
342  */
343 void
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
345 {
346 	if (obj == NULL)
347 		return;
348 
349 	/*
350 	 * No new holders should be possible once we drop hold_count 1->0 as
351 	 * there is no longer any way to reference the object.
352 	 */
353 	KKASSERT(obj->hold_count > 0);
354 	if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356 		debugvm_object_add(obj, file, line, -1);
357 #endif
358 
359 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360 			vm_object_unlock(obj);
361 			kfree(obj, M_VM_OBJECT);
362 		} else {
363 			vm_object_unlock(obj);
364 		}
365 	} else {
366 #if defined(DEBUG_LOCKS)
367 		debugvm_object_add(obj, file, line, -1);
368 #endif
369 		vm_object_unlock(obj);
370 	}
371 }
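
/*
 * Usage sketch (illustrative): holds are short-term and bracket any
 * section that dereferences the object.  The final vm_object_drop()
 * of a dead, unreferenced object frees it, so the pointer must not be
 * used afterwards.
 *
 *	vm_object_hold(obj);
 *	... examine or modify obj ...
 *	vm_object_drop(obj);	(obj may be freed at this point)
 */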
372 
373 /*
374  * Initialize a freshly allocated object, returning a held object.
375  *
376  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
377  *
378  * No requirements.
379  */
380 void
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
382 {
383 	struct vm_object_hash *hash;
384 
385 	RB_INIT(&object->rb_memq);
386 	lwkt_token_init(&object->token, "vmobj");
387 
388 	TAILQ_INIT(&object->backing_list);
389 	object->type = type;
390 	object->size = size;
391 	object->ref_count = 1;
392 	object->memattr = VM_MEMATTR_DEFAULT;
393 	object->hold_count = 0;
394 	object->flags = 0;
395 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
396 		vm_object_set_flag(object, OBJ_ONEMAPPING);
397 	object->paging_in_progress = 0;
398 	object->resident_page_count = 0;
399 	/* cpu localization twist */
400 	object->pg_color = vm_quickcolor();
401 	object->handle = NULL;
402 
403 	atomic_add_int(&object->generation, 1);
404 	object->swblock_count = 0;
405 	RB_INIT(&object->swblock_root);
406 	vm_object_lock_init(object);
407 	pmap_object_init(object);
408 
409 	vm_object_hold(object);
410 
411 	hash = vmobj_hash(object);
412 	lwkt_gettoken(&hash->token);
413 	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
414 	lwkt_reltoken(&hash->token);
415 }
416 
417 /*
418  * Initialize a VM object.
419  */
420 void
421 vm_object_init(vm_object_t object, vm_pindex_t size)
422 {
423 	_vm_object_allocate(OBJT_DEFAULT, size, object);
424 	vm_object_drop(object);
425 }
426 
427 /*
428  * Initialize the VM objects module.
429  *
430  * Called from the low level boot code only.  Note that this occurs before
431  * kmalloc is initialized so we cannot allocate any VM objects.
432  */
433 void
434 vm_object_init1(void)
435 {
436 	int i;
437 
438 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
439 		TAILQ_INIT(&vm_object_hash[i].list);
440 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
441 	}
442 
443 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
444 			    &kernel_object);
445 	vm_object_drop(&kernel_object);
446 }
447 
448 void
449 vm_object_init2(void)
450 {
451 	kmalloc_set_unlimited(M_VM_OBJECT);
452 }
453 
454 /*
455  * Allocate and return a new object of the specified type and size.
456  *
457  * No requirements.
458  */
459 vm_object_t
460 vm_object_allocate(objtype_t type, vm_pindex_t size)
461 {
462 	vm_object_t obj;
463 
464 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
465 	_vm_object_allocate(type, size, obj);
466 	vm_object_drop(obj);
467 
468 	return (obj);
469 }
470 
471 /*
472  * This version returns a held object, allowing further atomic initialization
473  * of the object.
474  */
475 vm_object_t
476 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
477 {
478 	vm_object_t obj;
479 
480 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
481 	_vm_object_allocate(type, size, obj);
482 
483 	return (obj);
484 }
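
/*
 * Usage sketch (illustrative): the _hold variant keeps the new object
 * locked so the caller can finish setting it up before anyone else can
 * lock it.  The extra field assignment below is hypothetical.
 *
 *	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
 *	obj->handle = my_handle;	(hypothetical initialization)
 *	vm_object_drop(obj);
 */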
485 
486 /*
487  * Add an additional reference to a vm_object.  The object must already be
488  * held.  The original non-lock version is no longer supported.  The object
489  * must NOT be chain locked by anyone at the time the reference is added.
490  *
491  * The object must be held, but may be held shared if desired (hence why
492  * we use an atomic op).
493  */
494 void
495 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
496 {
497 	KKASSERT(object != NULL);
498 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
499 	atomic_add_int(&object->ref_count, 1);
500 	if (object->type == OBJT_VNODE) {
501 		vref(object->handle);
502 		/* XXX what if the vnode is being destroyed? */
503 	}
504 #if defined(DEBUG_LOCKS)
505 	debugvm_object_add(object, file, line, 1);
506 #endif
507 }
508 
509 /*
510  * This version is only allowed in situations where the caller
511  * already knows that the object is deterministically referenced
512  * (usually because it is taken from a ref'd vnode, or during a map_entry
513  * replication).
514  */
515 void
516 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
517 {
518 	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
519 	atomic_add_int(&object->ref_count, 1);
520 	if (object->type == OBJT_VNODE)
521 		vref(object->handle);
522 #if defined(DEBUG_LOCKS)
523 	debugvm_object_add(object, file, line, 1);
524 #endif
525 }
526 
527 /*
528  * Dereference an object and its underlying vnode.  The object may be
529  * held shared.  On return the object will remain held.
530  *
531  * This function may return a vnode in *vpp which the caller must release
532  * after the caller drops its own lock.  If vpp is NULL, we assume that
533  * the caller was holding an exclusive lock on the object and we vrele()
534  * the vp ourselves.
535  */
536 static void
537 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
538 				   VMOBJDBARGS)
539 {
540 	struct vnode *vp = (struct vnode *) object->handle;
541 
542 	KASSERT(object->type == OBJT_VNODE,
543 	    ("vm_object_vndeallocate: not a vnode object"));
544 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
545 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
546 #ifdef INVARIANTS
547 	if (object->ref_count == 0) {
548 		vprint("vm_object_vndeallocate", vp);
549 		panic("vm_object_vndeallocate: bad object reference count");
550 	}
551 #endif
552 	for (;;) {
553 		int count = object->ref_count;
554 		cpu_ccfence();
555 		if (count == 1) {
556 			vm_object_upgrade(object);
557 			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
558 				vclrflags(vp, VTEXT);
559 				break;
560 			}
561 		} else {
562 			if (atomic_cmpset_int(&object->ref_count,
563 					      count, count - 1)) {
564 				break;
565 			}
566 		}
567 		/* retry */
568 	}
569 #if defined(DEBUG_LOCKS)
570 	debugvm_object_add(object, file, line, -1);
571 #endif
572 
573 	/*
574 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
575 	 * if the object was locked exclusively.  But there are two races
576 	 * here.
577 	 *
578 	 * We had to upgrade the object above to safely clear VTEXT
579 	 * but the alternative path where the shared lock is retained
580 	 * can STILL race to 0 in other paths and cause our own vrele()
581 	 * to terminate the vnode.  We can't allow that if the VM object
582 	 * is still locked shared.
583 	 */
584 	if (vpp)
585 		*vpp = vp;
586 	else
587 		vrele(vp);
588 }
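
/*
 * Sketch of the *vpp protocol described above (illustrative; the
 * in-file caller passes NULL): a caller holding the object shared
 * takes the vnode back and performs the vrele() only after releasing
 * its own object lock, avoiding a shared/exclusive livelock.
 *
 *	struct vnode *vp = NULL;
 *
 *	vm_object_vndeallocate(object, &vp);
 *	vm_object_drop(object);
 *	if (vp)
 *		vrele(vp);
 */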
589 
590 /*
591  * Release a reference to the specified object, gained either through a
592  * vm_object_allocate or a vm_object_reference call.  When all references
593  * are gone, storage associated with this object may be relinquished.
594  *
595  * The caller does not have to hold the object locked but must have control
596  * over the reference in question in order to guarantee that the object
597  * does not get ripped out from under us.
598  *
599  * XXX Currently all deallocations require an exclusive lock.
600  */
601 void
602 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
603 {
604 	struct vnode *vp;
605 	int count;
606 
607 	if (object == NULL)
608 		return;
609 
610 	for (;;) {
611 		count = object->ref_count;
612 		cpu_ccfence();
613 
614 		/*
615 		 * If decrementing the count enters into special handling
616 		 * territory (0, 1, or 2) we have to do it the hard way.
617 		 * Fortunately, objects with only a few refs like this
618 		 * are not likely to be heavily contended anyway.
619 		 *
620 		 * For vnode objects we only care about 1->0 transitions.
621 		 */
622 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
623 #if defined(DEBUG_LOCKS)
624 			debugvm_object_add(object, file, line, 0);
625 #endif
626 			vm_object_hold(object);
627 			vm_object_deallocate_locked(object);
628 			vm_object_drop(object);
629 			break;
630 		}
631 
632 		/*
633 		 * Try to decrement ref_count without acquiring a hold on
634 		 * the object.  This is particularly important for the exec*()
635 		 * and exit*() code paths because the program binary may
636 		 * have a great deal of sharing and an exclusive lock will
637 		 * crowbar performance in those circumstances.
638 		 */
639 		if (object->type == OBJT_VNODE) {
640 			vp = (struct vnode *)object->handle;
641 			if (atomic_cmpset_int(&object->ref_count,
642 					      count, count - 1)) {
643 #if defined(DEBUG_LOCKS)
644 				debugvm_object_add(object, file, line, -1);
645 #endif
646 
647 				vrele(vp);
648 				break;
649 			}
650 			/* retry */
651 		} else {
652 			if (atomic_cmpset_int(&object->ref_count,
653 					      count, count - 1)) {
654 #if defined(DEBUG_LOCKS)
655 				debugvm_object_add(object, file, line, -1);
656 #endif
657 				break;
658 			}
659 			/* retry */
660 		}
661 		/* retry */
662 	}
663 }
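
/*
 * Usage sketch (illustrative): references and holds are independent
 * counts.  A consumer typically takes a long-term reference while the
 * object is held and pairs it with a later deallocate; holds only
 * bracket individual operations.
 *
 *	vm_object_hold(obj);
 *	vm_object_reference_locked(obj);	(long-term reference)
 *	vm_object_drop(obj);
 *	...
 *	vm_object_deallocate(obj);		(release it later)
 */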
664 
665 void
666 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
667 {
668 	/*
669 	 * Degenerate case
670 	 */
671 	if (object == NULL)
672 		return;
673 
674 	/*
675 	 * vnode case, caller either locked the object exclusively
676 	 * or this is a recursion with must_drop != 0 and the vnode
677 	 * object will be locked shared.
678 	 *
679 	 * If locked shared we have to drop the object before we can
680 	 * call vrele() or risk a shared/exclusive livelock.
681 	 */
682 	if (object->type == OBJT_VNODE) {
683 		ASSERT_LWKT_TOKEN_HELD(&object->token);
684 		vm_object_vndeallocate(object, NULL);
685 		return;
686 	}
687 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
688 
689 	/*
690 	 * Normal case (object is locked exclusively)
691 	 */
692 	if (object->ref_count == 0) {
693 		panic("vm_object_deallocate: object deallocated "
694 		      "too many times: %d", object->type);
695 	}
696 	if (object->ref_count > 2) {
697 		atomic_add_int(&object->ref_count, -1);
698 #if defined(DEBUG_LOCKS)
699 		debugvm_object_add(object, file, line, -1);
700 #endif
701 		return;
702 	}
703 
704 	/*
705 	 * Drop the ref and handle termination on the 1->0 transition.
706 	 * We may have blocked above so we have to recheck.
707 	 */
708 	KKASSERT(object->ref_count != 0);
709 	if (object->ref_count >= 2) {
710 		atomic_add_int(&object->ref_count, -1);
711 #if defined(DEBUG_LOCKS)
712 		debugvm_object_add(object, file, line, -1);
713 #endif
714 		return;
715 	}
716 
717 	atomic_add_int(&object->ref_count, -1);
718 	if ((object->flags & OBJ_DEAD) == 0)
719 		vm_object_terminate(object);
720 }
721 
722 /*
723  * Destroy the specified object, freeing up related resources.
724  *
725  * The object must have zero references.
726  *
727  * The object must be held.  The caller is responsible for dropping the object
728  * after terminate returns.  Terminate does NOT drop the object.
729  */
730 static int vm_object_terminate_callback(vm_page_t p, void *data);
731 
732 void
733 vm_object_terminate(vm_object_t object)
734 {
735 	struct rb_vm_page_scan_info info;
736 	struct vm_object_hash *hash;
737 
738 	/*
739 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
740 	 * able to safely block.
741 	 */
742 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
743 	KKASSERT((object->flags & OBJ_DEAD) == 0);
744 	vm_object_set_flag(object, OBJ_DEAD);
745 
746 	/*
747 	 * Wait for the pageout daemon to be done with the object
748 	 */
749 	vm_object_pip_wait(object, "objtrm1");
750 
751 	KASSERT(!object->paging_in_progress,
752 		("vm_object_terminate: pageout in progress"));
753 
754 	/*
755 	 * Clean and free the pages, as appropriate. All references to the
756 	 * object are gone, so we don't need to lock it.
757 	 */
758 	if (object->type == OBJT_VNODE) {
759 		struct vnode *vp;
760 
761 		/*
762 		 * Clean pages and flush buffers.
763 		 *
764 		 * NOTE!  TMPFS buffer flushes do not typically flush the
765 		 *	  actual page to swap as this would be highly
766 		 *	  inefficient, and normal filesystems usually wrap
767 		 *	  page flushes with buffer cache buffers.
768 		 *
769 		 *	  To deal with this we have to call vinvalbuf() both
770 		 *	  before and after the vm_object_page_clean().
771 		 */
772 		vp = (struct vnode *) object->handle;
773 		vinvalbuf(vp, V_SAVE, 0, 0);
774 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
775 		vinvalbuf(vp, V_SAVE, 0, 0);
776 	}
777 
778 	/*
779 	 * Wait for any I/O to complete, after which there had better not
780 	 * be any references left on the object.
781 	 */
782 	vm_object_pip_wait(object, "objtrm2");
783 
784 	if (object->ref_count != 0) {
785 		panic("vm_object_terminate: object with references, "
786 		      "ref_count=%d", object->ref_count);
787 	}
788 
789 	/*
790 	 * Cleanup any shared pmaps associated with this object.
791 	 */
792 	pmap_object_free(object);
793 
794 	/*
795 	 * Now free any remaining pages. For internal objects, this also
796 	 * removes them from paging queues. Don't free wired pages, just
797 	 * remove them from the object.
798 	 */
799 	info.count = 0;
800 	info.object = object;
801 	do {
802 		info.error = 0;
803 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
804 					vm_object_terminate_callback, &info);
805 	} while (info.error);
806 
807 	/*
808 	 * Let the pager know object is dead.
809 	 */
810 	vm_pager_deallocate(object);
811 
812 	/*
813 	 * Wait for the object hold count to hit 1, clean out pages as
814 	 * we go.  vmobj_token interlocks any race conditions that might
815 	 * pick the object up from the vm_object_list after we have cleared
816 	 * rb_memq.
817 	 */
818 	for (;;) {
819 		if (RB_ROOT(&object->rb_memq) == NULL)
820 			break;
821 		kprintf("vm_object_terminate: Warning, object %p "
822 			"still has %ld pages\n",
823 			object, object->resident_page_count);
824 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
825 					vm_object_terminate_callback, &info);
826 	}
827 
828 	/*
829 	 * There had better not be any pages left
830 	 */
831 	KKASSERT(object->resident_page_count == 0);
832 
833 	/*
834 	 * Remove the object from the global object list.
835 	 */
836 	hash = vmobj_hash(object);
837 	lwkt_gettoken(&hash->token);
838 	TAILQ_REMOVE(&hash->list, object, object_entry);
839 	lwkt_reltoken(&hash->token);
840 
841 	if (object->ref_count != 0) {
842 		panic("vm_object_terminate2: object with references, "
843 		      "ref_count=%d", object->ref_count);
844 	}
845 
846 	/*
847 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
848 	 *	 the object here.  See vm_object_drop().
849 	 */
850 }
851 
852 /*
853  * The caller must hold the object.
854  */
855 static int
856 vm_object_terminate_callback(vm_page_t p, void *data)
857 {
858 	struct rb_vm_page_scan_info *info = data;
859 	vm_object_t object;
860 
861 	object = p->object;
862 	KKASSERT(object == info->object);
863 	if (vm_page_busy_try(p, TRUE)) {
864 		vm_page_sleep_busy(p, TRUE, "vmotrm");
865 		info->error = 1;
866 		return 0;
867 	}
868 	if (object != p->object) {
869 		/* XXX remove once we determine it can't happen */
870 		kprintf("vm_object_terminate: Warning: Encountered "
871 			"busied page %p on queue %d\n", p, p->queue);
872 		vm_page_wakeup(p);
873 		info->error = 1;
874 	} else if (p->wire_count == 0) {
875 		/*
876 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
877 		 */
878 		vm_page_free(p);
879 		mycpu->gd_cnt.v_pfree++;
880 	} else {
881 		if (p->queue != PQ_NONE) {
882 			kprintf("vm_object_terminate: Warning: Encountered "
883 				"wired page %p on queue %d\n", p, p->queue);
884 			if (vm_object_debug > 0) {
885 				--vm_object_debug;
886 				print_backtrace(10);
887 			}
888 		}
889 		vm_page_remove(p);
890 		vm_page_wakeup(p);
891 	}
892 
893 	/*
894 	 * Must be at end to avoid SMP races, caller holds object token
895 	 */
896 	if ((++info->count & 63) == 0)
897 		lwkt_user_yield();
898 	return(0);
899 }
900 
901 /*
902  * Clean all dirty pages in the specified range of the object.  Leaves pages
903  * on whatever queue they are currently on.  If NOSYNC is set then do not
904  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
905  * leaving the object dirty.
906  *
907  * When stuffing pages asynchronously, allow clustering.  XXX we need a
908  * synchronous clustering mode implementation.
909  *
910  * Odd semantics: if start == end, we clean everything.
911  * Odd semantics: if end == 0 we clean from start through the end of the object.
912  * The object must be locked? XXX
913  */
914 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
915 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
916 
917 void
918 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
919 		     int flags)
920 {
921 	struct rb_vm_page_scan_info info;
922 	struct vnode *vp;
923 	int wholescan;
924 	int pagerflags;
925 	int generation;
926 
927 	vm_object_hold(object);
928 	if (object->type != OBJT_VNODE ||
929 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
930 		vm_object_drop(object);
931 		return;
932 	}
933 
934 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
935 			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
936 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
937 
938 	vp = object->handle;
939 
940 	/*
941 	 * Interlock other major object operations.  This allows us to
942 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
943 	 */
944 	vm_object_set_flag(object, OBJ_CLEANING);
945 
946 	/*
947 	 * Handle 'entire object' case
948 	 */
949 	info.start_pindex = start;
950 	if (end == 0) {
951 		info.end_pindex = object->size - 1;
952 	} else {
953 		info.end_pindex = end - 1;
954 	}
955 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
956 	info.limit = flags;
957 	info.pagerflags = pagerflags;
958 	info.object = object;
959 
960 	/*
961 	 * If cleaning the entire object do a pass to mark the pages read-only.
962 	 * If everything worked out ok, clear OBJ_WRITEABLE and
963 	 * OBJ_MIGHTBEDIRTY.
964 	 */
965 	if (wholescan) {
966 		info.error = 0;
967 		info.count = 0;
968 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
969 					vm_object_page_clean_pass1, &info);
970 		if (info.error == 0) {
971 			vm_object_clear_flag(object,
972 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
973 			if (object->type == OBJT_VNODE &&
974 			    (vp = (struct vnode *)object->handle) != NULL) {
975 				/*
976 				 * Use new-style interface to clear VISDIRTY
977 				 * because the vnode is not necessarily removed
978 				 * from the syncer list(s) as often as it was
979 				 * under the old interface, which can leave
980 				 * the vnode on the syncer list after reclaim.
981 				 */
982 				vclrobjdirty(vp);
983 			}
984 		}
985 	}
986 
987 	/*
988 	 * Do a pass to clean all the dirty pages we find.
989 	 */
990 	do {
991 		info.error = 0;
992 		info.count = 0;
993 		generation = object->generation;
994 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995 					vm_object_page_clean_pass2, &info);
996 	} while (info.error || generation != object->generation);
997 
998 	vm_object_clear_flag(object, OBJ_CLEANING);
999 	vm_object_drop(object);
1000 }
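
/*
 * Usage sketch (illustrative): synchronously flush every dirty page of
 * a vnode object, as vm_object_terminate() does before tearing the
 * object down.  Passing start == end == 0 cleans the entire object.
 *
 *	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 */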
1001 
1002 /*
1003  * The caller must hold the object.
1004  */
1005 static
1006 int
1007 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1008 {
1009 	struct rb_vm_page_scan_info *info = data;
1010 
1011 	KKASSERT(p->object == info->object);
1012 
1013 	vm_page_flag_set(p, PG_CLEANCHK);
1014 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1015 		info->error = 1;
1016 	} else if (vm_page_busy_try(p, FALSE)) {
1017 		info->error = 1;
1018 	} else {
1019 		KKASSERT(p->object == info->object);
1020 		vm_page_protect(p, VM_PROT_READ);
1021 		vm_page_wakeup(p);
1022 	}
1023 
1024 	/*
1025 	 * Must be at end to avoid SMP races, caller holds object token
1026 	 */
1027 	if ((++info->count & 63) == 0)
1028 		lwkt_user_yield();
1029 	return(0);
1030 }
1031 
1032 /*
1033  * The caller must hold the object
1034  */
1035 static
1036 int
1037 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1038 {
1039 	struct rb_vm_page_scan_info *info = data;
1040 	int generation;
1041 
1042 	KKASSERT(p->object == info->object);
1043 
1044 	/*
1045 	 * Do not mess with pages that were inserted after we started
1046 	 * the cleaning pass.
1047 	 */
1048 	if ((p->flags & PG_CLEANCHK) == 0)
1049 		goto done;
1050 
1051 	generation = info->object->generation;
1052 
1053 	if (vm_page_busy_try(p, TRUE)) {
1054 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1055 		info->error = 1;
1056 		goto done;
1057 	}
1058 
1059 	KKASSERT(p->object == info->object &&
1060 		 info->object->generation == generation);
1061 
1062 	/*
1063 	 * Before wasting time traversing the pmaps, check for trivial
1064 	 * cases where the page cannot be dirty.
1065 	 */
1066 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1067 		KKASSERT((p->dirty & p->valid) == 0 &&
1068 			 (p->flags & PG_NEED_COMMIT) == 0);
1069 		vm_page_wakeup(p);
1070 		goto done;
1071 	}
1072 
1073 	/*
1074 	 * Check whether the page is dirty or not.  The page has been set
1075 	 * to be read-only so the check will not race a user dirtying the
1076 	 * page.
1077 	 */
1078 	vm_page_test_dirty(p);
1079 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1080 		vm_page_flag_clear(p, PG_CLEANCHK);
1081 		vm_page_wakeup(p);
1082 		goto done;
1083 	}
1084 
1085 	/*
1086 	 * If we have been asked to skip nosync pages and this is a
1087 	 * nosync page, skip it.  Note that the object flags were
1088 	 * not cleared in this case (because pass1 will have returned an
1089 	 * error), so we do not have to set them.
1090 	 */
1091 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1092 		vm_page_flag_clear(p, PG_CLEANCHK);
1093 		vm_page_wakeup(p);
1094 		goto done;
1095 	}
1096 
1097 	/*
1098 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1099 	 * the pages that get successfully flushed.  Set info->error if
1100 	 * we raced an object modification.
1101 	 */
1102 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1103 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1104 
1105 	/*
1106 	 * Must be at end to avoid SMP races, caller holds object token
1107 	 */
1108 done:
1109 	if ((++info->count & 63) == 0)
1110 		lwkt_user_yield();
1111 	return(0);
1112 }
1113 
1114 /*
1115  * Collect the specified page and nearby pages and flush them out.
1116  * The number of pages flushed is returned.  The passed page is busied
1117  * by the caller and we are responsible for its disposition.
1118  *
1119  * The caller must hold the object.
1120  */
1121 static void
1122 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1123 {
1124 	int error;
1125 	int is;
1126 	int ib;
1127 	int i;
1128 	int page_base;
1129 	vm_pindex_t pi;
1130 	vm_page_t ma[BLIST_MAX_ALLOC];
1131 
1132 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1133 
1134 	pi = p->pindex;
1135 	page_base = pi % BLIST_MAX_ALLOC;
1136 	ma[page_base] = p;
1137 	ib = page_base - 1;
1138 	is = page_base + 1;
1139 
1140 	while (ib >= 0) {
1141 		vm_page_t tp;
1142 
1143 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1144 					     TRUE, &error);
1145 		if (error)
1146 			break;
1147 		if (tp == NULL)
1148 			break;
1149 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1150 		    (tp->flags & PG_CLEANCHK) == 0) {
1151 			vm_page_wakeup(tp);
1152 			break;
1153 		}
1154 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1155 			vm_page_flag_clear(tp, PG_CLEANCHK);
1156 			vm_page_wakeup(tp);
1157 			break;
1158 		}
1159 		vm_page_test_dirty(tp);
1160 		if ((tp->dirty & tp->valid) == 0 &&
1161 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1162 			vm_page_flag_clear(tp, PG_CLEANCHK);
1163 			vm_page_wakeup(tp);
1164 			break;
1165 		}
1166 		ma[ib] = tp;
1167 		--ib;
1168 	}
1169 	++ib;	/* fixup */
1170 
1171 	while (is < BLIST_MAX_ALLOC &&
1172 	       pi - page_base + is < object->size) {
1173 		vm_page_t tp;
1174 
1175 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1176 					     TRUE, &error);
1177 		if (error)
1178 			break;
1179 		if (tp == NULL)
1180 			break;
1181 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1182 		    (tp->flags & PG_CLEANCHK) == 0) {
1183 			vm_page_wakeup(tp);
1184 			break;
1185 		}
1186 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1187 			vm_page_flag_clear(tp, PG_CLEANCHK);
1188 			vm_page_wakeup(tp);
1189 			break;
1190 		}
1191 		vm_page_test_dirty(tp);
1192 		if ((tp->dirty & tp->valid) == 0 &&
1193 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1194 			vm_page_flag_clear(tp, PG_CLEANCHK);
1195 			vm_page_wakeup(tp);
1196 			break;
1197 		}
1198 		ma[is] = tp;
1199 		++is;
1200 	}
1201 
1202 	/*
1203 	 * All pages in the ma[] array are busied now
1204 	 */
1205 	for (i = ib; i < is; ++i) {
1206 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1207 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1208 	}
1209 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1210 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1211 		vm_page_unhold(ma[i]);
1212 }
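
/*
 * Worked example (descriptive only): the passed page seeds slot
 * pi % BLIST_MAX_ALLOC of ma[].  The scan then walks downward (ib) and
 * upward (is) from that slot, stopping in each direction at the first
 * missing, busy, cached, or clean page, and the contiguous run
 * ma[ib..is-1] is handed to vm_pageout_flush() as a single cluster.
 */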
1213 
1214 /*
1215  * Implements the madvise function at the object/page level.
1216  *
1217  * MADV_WILLNEED	(any object)
1218  *
1219  *	Activate the specified pages if they are resident.
1220  *
1221  * MADV_DONTNEED	(any object)
1222  *
1223  *	Deactivate the specified pages if they are resident.
1224  *
1225  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1226  *
1227  *	Deactivate and clean the specified pages if they are
1228  *	resident.  This permits the process to reuse the pages
1229  *	without faulting or the kernel to reclaim the pages
1230  *	without I/O.
1231  *
1232  * No requirements.
1233  */
1234 void
1235 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1236 		  vm_pindex_t count, int advise)
1237 {
1238 	vm_pindex_t end;
1239 	vm_page_t m;
1240 	int error;
1241 
1242 	if (object == NULL)
1243 		return;
1244 
1245 	end = pindex + count;
1246 
1247 	vm_object_hold(object);
1248 
1249 	/*
1250 	 * Locate and adjust resident pages.  This only applies to the
1251 	 * primary object in the mapping.
1252 	 */
1253 	for (; pindex < end; pindex += 1) {
1254 relookup:
1255 		/*
1256 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1257 		 * and those pages must be OBJ_ONEMAPPING.
1258 		 */
1259 		if (advise == MADV_FREE) {
1260 			if ((object->type != OBJT_DEFAULT &&
1261 			     object->type != OBJT_SWAP) ||
1262 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1263 				continue;
1264 			}
1265 		}
1266 
1267 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1268 
1269 		if (error) {
1270 			vm_page_sleep_busy(m, TRUE, "madvpo");
1271 			goto relookup;
1272 		}
1273 		if (m == NULL) {
1274 			/*
1275 			 * There may be swap even if there is no backing page
1276 			 */
1277 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1278 				swap_pager_freespace(object, pindex, 1);
1279 			continue;
1280 		}
1281 
1282 		/*
1283 		 * If the page is not in a normal active state, we skip it.
1284 		 * If the page is not managed there are no page queues to
1285 		 * mess with.  Things can break if we mess with pages in
1286 		 * any of the below states.
1287 		 */
1288 		if (m->wire_count ||
1289 		    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1290 		    m->valid != VM_PAGE_BITS_ALL
1291 		) {
1292 			vm_page_wakeup(m);
1293 			continue;
1294 		}
1295 
1296 		/*
1297 		 * Theoretically once a page is known not to be busy, an
1298 		 * interrupt cannot come along and rip it out from under us.
1299 		 */
1300 		if (advise == MADV_WILLNEED) {
1301 			vm_page_activate(m);
1302 		} else if (advise == MADV_DONTNEED) {
1303 			vm_page_dontneed(m);
1304 		} else if (advise == MADV_FREE) {
1305 			/*
1306 			 * Mark the page clean.  This will allow the page
1307 			 * to be freed up by the system.  However, such pages
1308 			 * are often reused quickly by malloc()/free()
1309 			 * so we do not do anything that would cause
1310 			 * a page fault if we can help it.
1311 			 *
1312 			 * Specifically, we do not try to actually free
1313 			 * the page now nor do we try to put it in the
1314 			 * cache (which would cause a page fault on reuse).
1315 			 *
1316 			 * But we do make the page as freeable as we
1317 			 * can without actually taking the step of unmapping
1318 			 * it.
1319 			 */
1320 			pmap_clear_modify(m);
1321 			m->dirty = 0;
1322 			m->act_count = 0;
1323 			vm_page_dontneed(m);
1324 			if (object->type == OBJT_SWAP)
1325 				swap_pager_freespace(object, pindex, 1);
1326 		}
1327 		vm_page_wakeup(m);
1328 	}
1329 	vm_object_drop(object);
1330 }
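
/*
 * Usage sketch (illustrative): the madvise path resolves a user range
 * to (object, pindex, count) and then calls, for example:
 *
 *	vm_object_madvise(object, pindex, npages, MADV_FREE);
 *
 * As noted above, MADV_FREE only affects OBJT_DEFAULT/OBJT_SWAP
 * objects flagged OBJ_ONEMAPPING.
 */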
1331 
1332 /*
1333  * Removes all physical pages in the specified object range from the
1334  * object's list of pages.
1335  *
1336  * No requirements.
1337  */
1338 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1339 
1340 void
1341 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1342 		      boolean_t clean_only)
1343 {
1344 	struct rb_vm_page_scan_info info;
1345 	int all;
1346 
1347 	/*
1348 	 * Degenerate cases and assertions
1349 	 */
1350 	vm_object_hold(object);
1351 	if (object == NULL ||
1352 	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
1353 		vm_object_drop(object);
1354 		return;
1355 	}
1356 	KASSERT(object->type != OBJT_PHYS,
1357 		("attempt to remove pages from a physical object"));
1358 
1359 	/*
1360 	 * Indicate that paging is occurring on the object
1361 	 */
1362 	vm_object_pip_add(object, 1);
1363 
1364 	/*
1365 	 * Figure out the actual removal range and whether we are removing
1366 	 * the entire contents of the object or not.  If removing the entire
1367 	 * contents, be sure to get all pages, even those that might be
1368 	 * beyond the end of the object.
1369 	 */
1370 	info.object = object;
1371 	info.start_pindex = start;
1372 	if (end == 0)
1373 		info.end_pindex = (vm_pindex_t)-1;
1374 	else
1375 		info.end_pindex = end - 1;
1376 	info.limit = clean_only;
1377 	info.count = 0;
1378 	all = (start == 0 && info.end_pindex >= object->size - 1);
1379 
1380 	/*
1381 	 * Loop until we are sure we have gotten them all.
1382 	 */
1383 	do {
1384 		info.error = 0;
1385 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1386 					vm_object_page_remove_callback, &info);
1387 	} while (info.error);
1388 
1389 	/*
1390 	 * Remove any related swap if throwing away pages, or for
1391 	 * non-swap objects (the swap is a clean copy in that case).
1392 	 */
1393 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1394 		if (all)
1395 			swap_pager_freespace_all(object);
1396 		else
1397 			swap_pager_freespace(object, info.start_pindex,
1398 			     info.end_pindex - info.start_pindex + 1);
1399 	}
1400 
1401 	/*
1402 	 * Cleanup
1403 	 */
1404 	vm_object_pip_wakeup(object);
1405 	vm_object_drop(object);
1406 }
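
/*
 * Usage sketch (hypothetical caller): truncating an object to
 * 'newsize' bytes might remove everything from the new end onward.
 * Passing end == 0 removes through the end of the object.
 *
 *	vm_object_page_remove(object, OFF_TO_IDX(newsize), 0, FALSE);
 */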
1407 
1408 /*
1409  * The caller must hold the object.
1410  *
1411  * NOTE: User yields are allowed when removing more than one page, but not
1412  *	 allowed if only removing one page (the path for single page removals
1413  *	 might hold a spinlock).
1414  */
1415 static int
1416 vm_object_page_remove_callback(vm_page_t p, void *data)
1417 {
1418 	struct rb_vm_page_scan_info *info = data;
1419 
1420 	if (info->object != p->object ||
1421 	    p->pindex < info->start_pindex ||
1422 	    p->pindex > info->end_pindex) {
1423 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1424 			info->object, p);
1425 		return(0);
1426 	}
1427 	if (vm_page_busy_try(p, TRUE)) {
1428 		vm_page_sleep_busy(p, TRUE, "vmopar");
1429 		info->error = 1;
1430 		return(0);
1431 	}
1432 	if (info->object != p->object) {
1433 		/* this should never happen */
1434 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1435 			info->object, p);
1436 		vm_page_wakeup(p);
1437 		return(0);
1438 	}
1439 
1440 	/*
1441 	 * Wired pages cannot be destroyed, but they can be invalidated
1442 	 * and we do so if clean_only (limit) is not set.
1443 	 *
1444 	 * WARNING!  The page may be wired due to being part of a buffer
1445 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1446 	 *	     This is fine as part of a truncation but VFSs must be
1447 	 *	     sure to fix the buffer up when re-extending the file.
1448 	 *
1449 	 * NOTE!     PG_NEED_COMMIT is ignored.
1450 	 */
1451 	if (p->wire_count != 0) {
1452 		vm_page_protect(p, VM_PROT_NONE);
1453 		if (info->limit == 0)
1454 			p->valid = 0;
1455 		vm_page_wakeup(p);
1456 		goto done;
1457 	}
1458 
1459 	/*
1460 	 * limit is our clean_only flag.  If set and the page is dirty or
1461 	 * requires a commit, do not free it.  If set and the page is being
1462 	 * held by someone, do not free it.
1463 	 */
1464 	if (info->limit && p->valid) {
1465 		vm_page_test_dirty(p);
1466 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1467 			vm_page_wakeup(p);
1468 			goto done;
1469 		}
1470 	}
1471 
1472 	/*
1473 	 * Destroy the page
1474 	 */
1475 	vm_page_protect(p, VM_PROT_NONE);
1476 	vm_page_free(p);
1477 
1478 	/*
1479 	 * Must be at end to avoid SMP races, caller holds object token
1480 	 */
1481 done:
1482 	if ((++info->count & 63) == 0)
1483 		lwkt_user_yield();
1484 
1485 	return(0);
1486 }
1487 
1488 /*
1489  * Try to extend prev_object into an adjoining region of virtual
1490  * memory, return TRUE on success.
1491  *
1492  * The caller does not need to hold (prev_object) but must have a stable
1493  * pointer to it (typically by holding the vm_map locked).
1494  *
1495  * This function only works for anonymous memory objects which either
1496  * have (a) one reference or (b) we are extending the object's size.
1497  * Otherwise the related VM pages we want to use for the object might
1498  * be in use by another mapping.
1499  */
1500 boolean_t
1501 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1502 		   vm_size_t prev_size, vm_size_t next_size)
1503 {
1504 	vm_pindex_t next_pindex;
1505 
1506 	if (prev_object == NULL)
1507 		return (TRUE);
1508 
1509 	vm_object_hold(prev_object);
1510 
1511 	if (prev_object->type != OBJT_DEFAULT &&
1512 	    prev_object->type != OBJT_SWAP) {
1513 		vm_object_drop(prev_object);
1514 		return (FALSE);
1515 	}
1516 
1517 #if 0
1518 	/* caller now checks this */
1519 	/*
1520 	 * Try to collapse the object first
1521 	 */
1522 	vm_object_collapse(prev_object, NULL);
1523 #endif
1524 
1525 #if 0
1526 	/* caller now checks this */
1527 	/*
1528 	 * We can't coalesce if we shadow another object (figuring out the
1529 	 * relationships becomes too complex).
1530 	 */
1531 	if (prev_object->backing_object != NULL) {
1532 		vm_object_chain_release(prev_object);
1533 		vm_object_drop(prev_object);
1534 		return (FALSE);
1535 	}
1536 #endif
1537 
1538 	prev_size >>= PAGE_SHIFT;
1539 	next_size >>= PAGE_SHIFT;
1540 	next_pindex = prev_pindex + prev_size;
1541 
1542 	/*
1543 	 * We can't if the object has more than one ref count unless we
1544 	 * are extending it into newly minted space.
1545 	 */
1546 	if (prev_object->ref_count > 1 &&
1547 	    prev_object->size != next_pindex) {
1548 		vm_object_drop(prev_object);
1549 		return (FALSE);
1550 	}
1551 
1552 	/*
1553 	 * Remove any pages that may still be in the object from a previous
1554 	 * deallocation.
1555 	 */
1556 	if (next_pindex < prev_object->size) {
1557 		vm_object_page_remove(prev_object,
1558 				      next_pindex,
1559 				      next_pindex + next_size, FALSE);
1560 		if (prev_object->type == OBJT_SWAP)
1561 			swap_pager_freespace(prev_object,
1562 					     next_pindex, next_size);
1563 	}
1564 
1565 	/*
1566 	 * Extend the object if necessary.
1567 	 */
1568 	if (next_pindex + next_size > prev_object->size)
1569 		prev_object->size = next_pindex + next_size;
1570 	vm_object_drop(prev_object);
1571 
1572 	return (TRUE);
1573 }
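
/*
 * Usage sketch (illustrative): when extending an anonymous mapping,
 * the map code can ask whether the existing object may simply grow to
 * cover the new range instead of allocating a second object.
 *
 *	if (vm_object_coalesce(prev_object, prev_pindex,
 *			       prev_size, next_size)) {
 *		... reuse prev_object for the new range ...
 *	}
 */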
1574 
1575 /*
1576  * Make the object writable and flag it as being possibly dirty.
1577  *
1578  * The object might not be held (or might be held but held shared),
1579  * the related vnode is probably not held either.  Object and vnode are
1580  * stable by virtue of the vm_page busied by the caller preventing
1581  * destruction.
1582  *
1583  * If the related mount is flagged MNTK_THR_SYNC we need to call
1584  * vsetobjdirty().  Filesystems using this option usually shortcut
1585  * synchronization by only scanning the syncer list.
1586  */
1587 void
1588 vm_object_set_writeable_dirty(vm_object_t object)
1589 {
1590 	struct vnode *vp;
1591 
1592 	/*vm_object_assert_held(object);*/
1593 	/*
1594 	 * Avoid contention in vm fault path by checking the state before
1595 	 * issuing an atomic op on it.
1596 	 */
1597 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1598 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1599 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1600 	}
1601 	if (object->type == OBJT_VNODE &&
1602 	    (vp = (struct vnode *)object->handle) != NULL) {
1603 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1604 			if (vp->v_mount &&
1605 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1606 				/*
1607 				 * New style THR_SYNC places vnodes on the
1608 				 * syncer list more deterministically.
1609 				 */
1610 				vsetobjdirty(vp);
1611 			} else {
1612 				/*
1613 				 * Old style scan would not necessarily place
1614 				 * a vnode on the syncer list when possibly
1615 				 * modified via mmap.
1616 				 */
1617 				vsetflags(vp, VOBJDIRTY);
1618 			}
1619 		}
1620 	}
1621 }
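
/*
 * Caller sketch (hypothetical): the fault path calls this when a page
 * of a vnode object is about to be mapped writable, for example:
 *
 *	if (fault_type & VM_PROT_WRITE)
 *		vm_object_set_writeable_dirty(object);
 */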
1622 
1623 #include "opt_ddb.h"
1624 #ifdef DDB
1625 #include <sys/cons.h>
1626 
1627 #include <ddb/ddb.h>
1628 
1629 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1630 				       vm_map_entry_t entry);
1631 static int	vm_object_in_map (vm_object_t object);
1632 
1633 /*
1634  * The caller must hold the object.
1635  */
1636 static int
1637 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1638 {
1639 	vm_map_backing_t ba;
1640 	vm_map_t tmpm;
1641 	vm_map_entry_t tmpe;
1642 	int entcount;
1643 
1644 	if (map == NULL)
1645 		return 0;
1646 	if (entry == NULL) {
1647 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1648 		entcount = map->nentries;
1649 		while (entcount-- && tmpe) {
1650 			if( _vm_object_in_map(map, object, tmpe)) {
1651 				return 1;
1652 			}
1653 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1654 		}
1655 		return (0);
1656 	}
1657 	switch(entry->maptype) {
1658 	case VM_MAPTYPE_SUBMAP:
1659 		tmpm = entry->ba.sub_map;
1660 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1661 		entcount = tmpm->nentries;
1662 		while (entcount-- && tmpe) {
1663 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1664 				return 1;
1665 			}
1666 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1667 		}
1668 		break;
1669 	case VM_MAPTYPE_NORMAL:
1670 	case VM_MAPTYPE_VPAGETABLE:
1671 		ba = &entry->ba;
1672 		while (ba) {
1673 			if (ba->object == object)
1674 				return TRUE;
1675 			ba = ba->backing_ba;
1676 		}
1677 		break;
1678 	default:
1679 		break;
1680 	}
1681 	return 0;
1682 }
1683 
1684 static int vm_object_in_map_callback(struct proc *p, void *data);
1685 
1686 struct vm_object_in_map_info {
1687 	vm_object_t object;
1688 	int rv;
1689 };
1690 
1691 /*
1692  * Debugging only
1693  */
1694 static int
1695 vm_object_in_map(vm_object_t object)
1696 {
1697 	struct vm_object_in_map_info info;
1698 
1699 	info.rv = 0;
1700 	info.object = object;
1701 
1702 	allproc_scan(vm_object_in_map_callback, &info, 0);
1703 	if (info.rv)
1704 		return 1;
1705 	if( _vm_object_in_map(&kernel_map, object, 0))
1706 		return 1;
1707 	if( _vm_object_in_map(&pager_map, object, 0))
1708 		return 1;
1709 	if( _vm_object_in_map(&buffer_map, object, 0))
1710 		return 1;
1711 	return 0;
1712 }
1713 
1714 /*
1715  * Debugging only
1716  */
1717 static int
1718 vm_object_in_map_callback(struct proc *p, void *data)
1719 {
1720 	struct vm_object_in_map_info *info = data;
1721 
1722 	if (p->p_vmspace) {
1723 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1724 			info->rv = 1;
1725 			return -1;
1726 		}
1727 	}
1728 	return (0);
1729 }
1730 
1731 DB_SHOW_COMMAND(vmochk, vm_object_check)
1732 {
1733 	struct vm_object_hash *hash;
1734 	vm_object_t object;
1735 	int n;
1736 
1737 	/*
1738 	 * make sure that internal objs are in a map somewhere
1739 	 * and none have zero ref counts.
1740 	 */
1741 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1742 		hash = &vm_object_hash[n];
1743 		for (object = TAILQ_FIRST(&hash->list);
1744 				object != NULL;
1745 				object = TAILQ_NEXT(object, object_entry)) {
1746 			if (object->type == OBJT_MARKER)
1747 				continue;
1748 			if (object->handle != NULL ||
1749 			    (object->type != OBJT_DEFAULT &&
1750 			     object->type != OBJT_SWAP)) {
1751 				continue;
1752 			}
1753 			if (object->ref_count == 0) {
1754 				db_printf("vmochk: internal obj has "
1755 					  "zero ref count: %ld\n",
1756 					  (long)object->size);
1757 			}
1758 			if (vm_object_in_map(object))
1759 				continue;
1760 			db_printf("vmochk: internal obj is not in a map: "
1761 				  "ref: %d, size: %lu: 0x%lx\n",
1762 				  object->ref_count, (u_long)object->size,
1763 				  (u_long)object->size);
1764 		}
1765 	}
1766 }
1767 
1768 /*
1769  * Debugging only
1770  */
1771 DB_SHOW_COMMAND(object, vm_object_print_static)
1772 {
1773 	/* XXX convert args. */
1774 	vm_object_t object = (vm_object_t)addr;
1775 	boolean_t full = have_addr;
1776 
1777 	vm_page_t p;
1778 
1779 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1780 #define	count	was_count
1781 
1782 	int count;
1783 
1784 	if (object == NULL)
1785 		return;
1786 
1787 	db_iprintf(
1788 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1789 	    object, (int)object->type, (u_long)object->size,
1790 	    object->resident_page_count, object->ref_count, object->flags);
1791 	/*
1792 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1793 	 */
1794 	db_iprintf("\n");
1795 
1796 	if (!full)
1797 		return;
1798 
1799 	db_indent += 2;
1800 	count = 0;
1801 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1802 		if (count == 0)
1803 			db_iprintf("memory:=");
1804 		else if (count == 6) {
1805 			db_printf("\n");
1806 			db_iprintf(" ...");
1807 			count = 0;
1808 		} else
1809 			db_printf(",");
1810 		count++;
1811 
1812 		db_printf("(off=0x%lx,page=0x%lx)",
1813 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1814 	}
1815 	if (count != 0)
1816 		db_printf("\n");
1817 	db_indent -= 2;
1818 }
1819 
1820 /* XXX. */
1821 #undef count
1822 
1823 /*
1824  * XXX need this non-static entry for calling from vm_map_print.
1825  *
1826  * Debugging only
1827  */
1828 void
1829 vm_object_print(/* db_expr_t */ long addr,
1830 		boolean_t have_addr,
1831 		/* db_expr_t */ long count,
1832 		char *modif)
1833 {
1834 	vm_object_print_static(addr, have_addr, count, modif);
1835 }
1836 
1837 /*
1838  * Debugging only
1839  */
1840 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1841 {
1842 	struct vm_object_hash *hash;
1843 	vm_object_t object;
1844 	int nl = 0;
1845 	int c;
1846 	int n;
1847 
1848 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1849 		hash = &vm_object_hash[n];
1850 		for (object = TAILQ_FIRST(&hash->list);
1851 				object != NULL;
1852 				object = TAILQ_NEXT(object, object_entry)) {
1853 			vm_pindex_t idx, fidx;
1854 			vm_pindex_t osize;
1855 			vm_paddr_t pa = -1, padiff;
1856 			int rcount;
1857 			vm_page_t m;
1858 
1859 			if (object->type == OBJT_MARKER)
1860 				continue;
1861 			db_printf("new object: %p\n", (void *)object);
1862 			if ( nl > 18) {
1863 				c = cngetc();
1864 				if (c != ' ')
1865 					return;
1866 				nl = 0;
1867 			}
1868 			nl++;
1869 			rcount = 0;
1870 			fidx = 0;
1871 			osize = object->size;
1872 			if (osize > 128)
1873 				osize = 128;
1874 			for (idx = 0; idx < osize; idx++) {
1875 				m = vm_page_lookup(object, idx);
1876 				if (m == NULL) {
1877 					if (rcount) {
1878 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1879 							(long)fidx, rcount, (long)pa);
1880 						if ( nl > 18) {
1881 							c = cngetc();
1882 							if (c != ' ')
1883 								return;
1884 							nl = 0;
1885 						}
1886 						nl++;
1887 						rcount = 0;
1888 					}
1889 					continue;
1890 				}
1891 
1892 				if (rcount &&
1893 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1894 					++rcount;
1895 					continue;
1896 				}
1897 				if (rcount) {
1898 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1899 					padiff >>= PAGE_SHIFT;
1900 					padiff &= PQ_L2_MASK;
1901 					if (padiff == 0) {
1902 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1903 						++rcount;
1904 						continue;
1905 					}
1906 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
1907 						(long)fidx, rcount, (long)pa);
1908 					db_printf("pd(%ld)\n", (long)padiff);
1909 					if ( nl > 18) {
1910 						c = cngetc();
1911 						if (c != ' ')
1912 							return;
1913 						nl = 0;
1914 					}
1915 					nl++;
1916 				}
1917 				fidx = idx;
1918 				pa = VM_PAGE_TO_PHYS(m);
1919 				rcount = 1;
1920 			}
1921 			if (rcount) {
1922 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1923 					(long)fidx, rcount, (long)pa);
1924 				if ( nl > 18) {
1925 					c = cngetc();
1926 					if (c != ' ')
1927 						return;
1928 					nl = 0;
1929 				}
1930 				nl++;
1931 			}
1932 		}
1933 	}
1934 }
1935 #endif /* DDB */
1936