xref: /dflybsd-src/sys/vm/vm_object.c (revision 2b3f93ea6d1f70880f3e87f3c2cbe0dc0bfc9332)
1  /*
2   * Copyright (c) 1991, 1993, 2013
3   *	The Regents of the University of California.  All rights reserved.
4   *
5   * This code is derived from software contributed to Berkeley by
6   * The Mach Operating System project at Carnegie-Mellon University.
7   *
8   * Redistribution and use in source and binary forms, with or without
9   * modification, are permitted provided that the following conditions
10   * are met:
11   * 1. Redistributions of source code must retain the above copyright
12   *    notice, this list of conditions and the following disclaimer.
13   * 2. Redistributions in binary form must reproduce the above copyright
14   *    notice, this list of conditions and the following disclaimer in the
15   *    documentation and/or other materials provided with the distribution.
16   * 3. Neither the name of the University nor the names of its contributors
17   *    may be used to endorse or promote products derived from this software
18   *    without specific prior written permission.
19   *
20   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30   * SUCH DAMAGE.
31   *
32   *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33   *
34   *
35   * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36   * All rights reserved.
37   *
38   * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39   *
40   * Permission to use, copy, modify and distribute this software and
41   * its documentation is hereby granted, provided that both the copyright
42   * notice and this permission notice appear in all copies of the
43   * software, derivative works or modified versions, and any portions
44   * thereof, and that both notices appear in supporting documentation.
45   *
46   * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47   * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48   * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49   *
50   * Carnegie Mellon requests users of this software to return to
51   *
52   *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53   *  School of Computer Science
54   *  Carnegie Mellon University
55   *  Pittsburgh PA 15213-3890
56   *
57   * any improvements or extensions that they make and grant Carnegie the
58   * rights to redistribute these changes.
59   *
60   * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61   */
62  
63  /*
64   *	Virtual memory object module.
65   */
66  
67  #include <sys/param.h>
68  #include <sys/systm.h>
69  #include <sys/proc.h>		/* for curproc, pageproc */
70  #include <sys/thread.h>
71  #include <sys/vnode.h>
72  #include <sys/vmmeter.h>
73  #include <sys/mman.h>
74  #include <sys/mount.h>
75  #include <sys/kernel.h>
76  #include <sys/malloc.h>
77  #include <sys/sysctl.h>
78  #include <sys/refcount.h>
79  
80  #include <vm/vm.h>
81  #include <vm/vm_param.h>
82  #include <vm/pmap.h>
83  #include <vm/vm_map.h>
84  #include <vm/vm_object.h>
85  #include <vm/vm_page.h>
86  #include <vm/vm_pageout.h>
87  #include <vm/vm_pager.h>
88  #include <vm/swap_pager.h>
89  #include <vm/vm_kern.h>
90  #include <vm/vm_extern.h>
91  #include <vm/vm_zone.h>
92  
93  #include <vm/vm_page2.h>
94  
95  #include <machine/specialreg.h>
96  
97  #define EASY_SCAN_FACTOR	8
98  
99  static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100  					     int pagerflags);
101  static void	vm_object_lock_init(vm_object_t);
102  
103  /*
104   *	Virtual memory objects maintain the actual data
105   *	associated with allocated virtual memory.  A given
106   *	page of memory exists within exactly one object.
107   *
108   *	An object is only deallocated when all "references"
109   *	are given up.  Only one "reference" to a given
110   *	region of an object should be writeable.
111   *
112   *	Associated with each object is a list of all resident
113   *	memory pages belonging to that object; this list is
114   *	maintained by the "vm_page" module, and locked by the object's
115   *	lock.
116   *
117   *	Each object also records a "pager" routine which is
118   *	used to retrieve (and store) pages to the proper backing
119   *	storage.  In addition, objects may be backed by other
120   *	objects from which they were virtual-copied.
121   *
122   *	The only items within the object structure which are
123   *	modified after time of creation are:
124   *		reference count		locked by object's lock
125   *		pager routine		locked by object's lock
126   *
127   */
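
/*
 * Illustrative sketch (not from the original source): the typical
 * lifecycle a caller of this module goes through.  The helper name
 * example_object_lifecycle() is hypothetical; the vm_object_* calls
 * are the routines implemented below.
 */
#if 0
static void
example_object_lifecycle(void)
{
	vm_object_t obj;

	/* allocate: returns an unheld object with ref_count == 1 */
	obj = vm_object_allocate(OBJT_DEFAULT, 16);

	/* hold the object (token + hold_count) while manipulating it */
	vm_object_hold(obj);
	/* ... insert pages, adjust flags, etc ... */
	vm_object_drop(obj);

	/* release the reference; the 1->0 transition terminates it */
	vm_object_deallocate(obj);
}
#endif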
128  
129  static struct vm_object kernel_object_store;
130  struct vm_object *kernel_object = &kernel_object_store;
131  
132  struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
133  
134  static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
135  		"vm_object", "vm_object structures");
136  
137  #define VMOBJ_HASH_PRIME1	66555444443333333ULL
138  #define VMOBJ_HASH_PRIME2	989042931893ULL
139  
140  int vm_object_debug;
141  SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
142  
143  static __inline
144  struct vm_object_hash *
145  vmobj_hash(vm_object_t obj)
146  {
147  	uintptr_t hash1;
148  	uintptr_t hash2;
149  
150  	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
151  	hash1 %= VMOBJ_HASH_PRIME1;
152  	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
153  	hash2 %= VMOBJ_HASH_PRIME2;
154  	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
155  }
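
/*
 * Illustrative sketch (not from the original source): the hash simply
 * selects a bucket whose token serializes insertion/removal on that
 * bucket's object list, as done in _vm_object_allocate() and
 * vm_object_terminate() below.
 */
#if 0
	struct vm_object_hash *hash;

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	/* ... TAILQ_INSERT_TAIL()/TAILQ_REMOVE() on hash->list ... */
	lwkt_reltoken(&hash->token);
#endif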
156  
157  #if defined(DEBUG_LOCKS)
158  
159  #define vm_object_vndeallocate(obj, vpp)	\
160                  debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
161  
162  /*
163   * Debug helper to track hold/drop/ref/deallocate calls.
164   */
165  static void
166  debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
167  {
168  	int i;
169  
170  	i = atomic_fetchadd_int(&obj->debug_index, 1);
171  	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
172  	ksnprintf(obj->debug_hold_thrs[i],
173  		  sizeof(obj->debug_hold_thrs[i]),
174  		  "%c%d:(%d):%s",
175  		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
176  		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
177  		  obj->ref_count,
178  		  curthread->td_comm);
179  	obj->debug_hold_file[i] = file;
180  	obj->debug_hold_line[i] = line;
181  #if 0
182  	/* Uncomment for debugging obj refs/derefs in reproducible cases */

183  	if (strcmp(curthread->td_comm, "sshd") == 0) {
184  		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
185  			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
186  			obj, obj->ref_count, addrem, file, line);
187  	}
188  #endif
189  }
190  
191  #endif
192  
193  /*
194   * Misc low level routines
195   */
196  static void
197  vm_object_lock_init(vm_object_t obj)
198  {
199  #if defined(DEBUG_LOCKS)
200  	int i;
201  
202  	obj->debug_index = 0;
203  	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
204  		obj->debug_hold_thrs[i][0] = 0;
205  		obj->debug_hold_file[i] = NULL;
206  		obj->debug_hold_line[i] = 0;
207  	}
208  #endif
209  }
210  
211  void
212  vm_object_lock_swap(void)
213  {
214  	lwkt_token_swap();
215  }
216  
217  void
218  vm_object_lock(vm_object_t obj)
219  {
220  	lwkt_gettoken(&obj->token);
221  }
222  
223  /*
224   * Returns TRUE on success
225   */
226  static int
227  vm_object_lock_try(vm_object_t obj)
228  {
229  	return(lwkt_trytoken(&obj->token));
230  }
231  
232  void
233  vm_object_lock_shared(vm_object_t obj)
234  {
235  	lwkt_gettoken_shared(&obj->token);
236  }
237  
238  void
239  vm_object_unlock(vm_object_t obj)
240  {
241  	lwkt_reltoken(&obj->token);
242  }
243  
244  void
245  vm_object_upgrade(vm_object_t obj)
246  {
247  	lwkt_reltoken(&obj->token);
248  	lwkt_gettoken(&obj->token);
249  }
250  
251  void
252  vm_object_downgrade(vm_object_t obj)
253  {
254  	lwkt_reltoken(&obj->token);
255  	lwkt_gettoken_shared(&obj->token);
256  }
257  
258  static __inline void
259  vm_object_assert_held(vm_object_t obj)
260  {
261  	ASSERT_LWKT_TOKEN_HELD(&obj->token);
262  }
263  
264  /*
265   * Acquire a semi-random base page color for a new object.  Our main concern
266   * is that the color be spread out a bit.  Further spreading out occurs in
267   * bio_page_alloc().
268   */
269  int
270  vm_quickcolor(void)
271  {
272  	globaldata_t gd = mycpu;
273  	int pg_color;
274  
275  	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
276  	pg_color += gd->gd_quick_color;
277  	gd->gd_quick_color += PQ_PRIME2;
278  
279  	return pg_color;
280  }
281  
282  void
283  VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
284  {
285  	KKASSERT(obj != NULL);
286  
287  	/*
288  	 * Object must be held (object allocation is stable due to the caller's
289  	 * context, typically already holding the token on a parent object)
290  	 * prior to potentially blocking on the lock, otherwise the object
291  	 * can get ripped away from us.
292  	 */
293  	refcount_acquire(&obj->hold_count);
294  	vm_object_lock(obj);
295  
296  #if defined(DEBUG_LOCKS)
297  	debugvm_object_add(obj, file, line, 1);
298  #endif
299  }
300  
301  int
302  VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
303  {
304  	KKASSERT(obj != NULL);
305  
306  	/*
307  	 * Object must be held (object allocation is stable due to the caller's
308  	 * context, typically already holding the token on a parent object)
309  	 * prior to potentially blocking on the lock, otherwise the object
310  	 * can get ripped away from us.
311  	 */
312  	refcount_acquire(&obj->hold_count);
313  	if (vm_object_lock_try(obj) == 0) {
314  		if (refcount_release(&obj->hold_count)) {
315  			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
316  				kfree_obj(obj, M_VM_OBJECT);
317  		}
318  		return(0);
319  	}
320  
321  #if defined(DEBUG_LOCKS)
322  	debugvm_object_add(obj, file, line, 1);
323  #endif
324  	return(1);
325  }
326  
327  void
328  VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
329  {
330  	KKASSERT(obj != NULL);
331  
332  	/*
333  	 * Object must be held (object allocation is stable due to the caller's
334  	 * context, typically already holding the token on a parent object)
335  	 * prior to potentially blocking on the lock, otherwise the object
336  	 * can get ripped away from us.
337  	 */
338  	refcount_acquire(&obj->hold_count);
339  	vm_object_lock_shared(obj);
340  
341  #if defined(DEBUG_LOCKS)
342  	debugvm_object_add(obj, file, line, 1);
343  #endif
344  }
345  
346  /*
347   * Drop the token and hold_count on the object.
348   *
349   * WARNING! Token might be shared.
350   */
351  void
352  VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
353  {
354  	if (obj == NULL)
355  		return;
356  
357  	/*
358  	 * No new holders should be possible once we drop hold_count 1->0 as
359  	 * there is no longer any way to reference the object.
360  	 */
361  	KKASSERT(obj->hold_count > 0);
362  	if (refcount_release(&obj->hold_count)) {
363  #if defined(DEBUG_LOCKS)
364  		debugvm_object_add(obj, file, line, -1);
365  #endif
366  
367  		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
368  			vm_object_unlock(obj);
369  			kfree_obj(obj, M_VM_OBJECT);
370  		} else {
371  			vm_object_unlock(obj);
372  		}
373  	} else {
374  #if defined(DEBUG_LOCKS)
375  		debugvm_object_add(obj, file, line, -1);
376  #endif
377  		vm_object_unlock(obj);
378  	}
379  }
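
/*
 * Illustrative sketch (not from the original source): the usual
 * pairing of the hold/drop routines above.  A caller that must not
 * block can attempt vm_object_hold_try() and fall back to the
 * blocking hold; the surrounding code is hypothetical.
 */
#if 0
	if (vm_object_hold_try(obj) == 0) {
		/* token contention, do it the blocking way */
		vm_object_hold(obj);
	}
	/* ... obj is locked and cannot be freed here ... */
	vm_object_drop(obj);	/* may kfree_obj() a dead object */
#endif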
380  
381  /*
382   * Initialize a freshly allocated object, returning a held object.
383   *
384   * Used only by vm_object_allocate(), zinitna() and vm_object_init().
385   *
386   * No requirements.
387   */
388  void
389  _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
390  		    const char *ident)
391  {
392  	struct vm_object_hash *hash;
393  
394  	RB_INIT(&object->rb_memq);
395  	lwkt_token_init(&object->token, ident);
396  
397  	TAILQ_INIT(&object->backing_list);
398  	lockinit(&object->backing_lk, "baclk", 0, 0);
399  
400  	object->type = type;
401  	object->size = size;
402  	object->ref_count = 1;
403  	object->memattr = VM_MEMATTR_DEFAULT;
404  	object->hold_count = 0;
405  	object->flags = 0;
406  	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
407  		vm_object_set_flag(object, OBJ_ONEMAPPING);
408  	object->paging_in_progress = 0;
409  	object->resident_page_count = 0;
410  	/* cpu localization twist */
411  	object->pg_color = vm_quickcolor();
412  	object->handle = NULL;
413  
414  	atomic_add_int(&object->generation, 1);
415  	object->swblock_count = 0;
416  	RB_INIT(&object->swblock_root);
417  	vm_object_lock_init(object);
418  	pmap_object_init(object);
419  
420  	vm_object_hold(object);
421  
422  	hash = vmobj_hash(object);
423  	lwkt_gettoken(&hash->token);
424  	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
425  	lwkt_reltoken(&hash->token);
426  }
427  
428  /*
429   * Initialize a VM object.
430   */
431  void
432  vm_object_init(vm_object_t object, vm_pindex_t size)
433  {
434  	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
435  	vm_object_drop(object);
436  }
437  
438  /*
439   * Initialize the VM objects module.
440   *
441   * Called from the low level boot code only.  Note that this occurs before
442   * kmalloc is initialized so we cannot allocate any VM objects.
443   */
444  void
445  vm_object_init1(void)
446  {
447  	int i;
448  
449  	for (i = 0; i < VMOBJ_HSIZE; ++i) {
450  		TAILQ_INIT(&vm_object_hash[i].list);
451  		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
452  	}
453  
454  	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
455  			    kernel_object, "kobj");
456  	vm_object_drop(kernel_object);
457  }
458  
459  void
460  vm_object_init2(void)
461  {
462  	kmalloc_obj_set_unlimited(M_VM_OBJECT);
463  }
464  
465  /*
466   * Allocate and return a new object of the specified type and size.
467   *
468   * No requirements.
469   */
470  vm_object_t
471  vm_object_allocate(objtype_t type, vm_pindex_t size)
472  {
473  	vm_object_t obj;
474  
475  	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
476  	_vm_object_allocate(type, size, obj, "vmobj");
477  	vm_object_drop(obj);
478  
479  	return (obj);
480  }
481  
482  /*
483   * This version returns a held object, allowing further atomic initialization
484   * of the object.
485   */
486  vm_object_t
487  vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
488  {
489  	vm_object_t obj;
490  
491  	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
492  	_vm_object_allocate(type, size, obj, "vmobj");
493  
494  	return (obj);
495  }
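
/*
 * Illustrative sketch (not from the original source): using the held
 * variant so initialization completes before any other thread can
 * find the object.  The handle assignment is a hypothetical example.
 */
#if 0
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_SWAP, size);
	obj->handle = my_handle;	/* hypothetical extra setup */
	vm_object_drop(obj);
#endif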
496  
497  /*
498   * Add an additional reference to a vm_object.  The object must already be
499   * held.  The original non-lock version is no longer supported.  The object
500   * must NOT be chain locked by anyone at the time the reference is added.
501   *
502   * The object must be held, but may be held shared if desired (which is why
503   * we use an atomic op).
504   */
505  void
506  VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
507  {
508  	KKASSERT(object != NULL);
509  	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
510  	atomic_add_int(&object->ref_count, 1);
511  	if (object->type == OBJT_VNODE) {
512  		vref(object->handle);
513  		/* XXX what if the vnode is being destroyed? */
514  	}
515  #if defined(DEBUG_LOCKS)
516  	debugvm_object_add(object, file, line, 1);
517  #endif
518  }
519  
520  /*
521   * This version is only allowed in situations where the caller
522   * already knows that the object is deterministically referenced
523   * (usually because it's taken from a ref'd vnode, or during a map_entry
524   * replication).
525   */
526  void
527  VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
528  {
529  	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
530  	atomic_add_int(&object->ref_count, 1);
531  	if (object->type == OBJT_VNODE)
532  		vref(object->handle);
533  #if defined(DEBUG_LOCKS)
534  	debugvm_object_add(object, file, line, 1);
535  #endif
536  }
537  
538  /*
539   * Dereference an object and its underlying vnode.  The object may be
540   * held shared.  On return the object will remain held.
541   *
542   * This function may return a vnode in *vpp which the caller must release
543   * after the caller drops its own lock.  If vpp is NULL, we assume that
544   * the caller was holding an exclusive lock on the object and we vrele()
545   * the vp ourselves.
546   */
547  static void
548  VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
549  				   VMOBJDBARGS)
550  {
551  	struct vnode *vp = (struct vnode *) object->handle;
552  	int count;
553  
554  	KASSERT(object->type == OBJT_VNODE,
555  	    ("vm_object_vndeallocate: not a vnode object"));
556  	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
557  	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
558  #ifdef INVARIANTS
559  	if (object->ref_count == 0) {
560  		vprint("vm_object_vndeallocate", vp);
561  		panic("vm_object_vndeallocate: bad object reference count");
562  	}
563  #endif
564  	count = object->ref_count;
565  	cpu_ccfence();
566  	for (;;) {
567  		if (count == 1) {
568  			vm_object_upgrade(object);
569  			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
570  				vclrflags(vp, VTEXT);
571  				break;
572  			}
573  		} else {
574  			if (atomic_fcmpset_int(&object->ref_count,
575  					       &count, count - 1)) {
576  				break;
577  			}
578  		}
579  		cpu_pause();
580  		/* retry */
581  	}
582  #if defined(DEBUG_LOCKS)
583  	debugvm_object_add(object, file, line, -1);
584  #endif
585  
586  	/*
587  	 * Either vrele the vp ourselves or return it for the caller to
588  	 * vrele.  We can only safely vrele(vp) if the object was locked
589  	 * exclusively, but there are two races here.
590  	 *
591  	 * We had to upgrade the object above to safely clear VTEXT
592  	 * but the alternative path where the shared lock is retained
593  	 * can STILL race to 0 in other paths and cause our own vrele()
594  	 * to terminate the vnode.  We can't allow that if the VM object
595  	 * is still locked shared.
596  	 */
597  	if (vpp)
598  		*vpp = vp;
599  	else
600  		vrele(vp);
601  }
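
/*
 * Illustrative sketch (not from the original source): how a caller
 * holding the object shared uses the *vpp contract described above,
 * releasing its own hold before issuing the final vrele().
 */
#if 0
	struct vnode *vp = NULL;

	vm_object_vndeallocate(object, &vp);
	vm_object_drop(object);		/* drop our (shared) hold first */
	if (vp)
		vrele(vp);
#endif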
602  
603  /*
604   * Release a reference to the specified object, gained either through a
605   * vm_object_allocate or a vm_object_reference call.  When all references
606   * are gone, storage associated with this object may be relinquished.
607   *
608   * The caller does not have to hold the object locked but must have control
609   * over the reference in question in order to guarantee that the object
610   * does not get ripped out from under us.
611   *
612   * XXX Currently all deallocations require an exclusive lock.
613   */
614  void
615  VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
616  {
617  	struct vnode *vp;
618  	int count;
619  
620  	if (object == NULL)
621  		return;
622  
623  	count = object->ref_count;
624  	cpu_ccfence();
625  	for (;;) {
626  		/*
627  		 * If decrementing the count enters into special handling
628  		 * territory (0, 1, or 2) we have to do it the hard way.
629  		 * Fortunately, objects with only a few refs like this
630  		 * are not likely to be heavily contended anyway.
631  		 *
632  		 * For vnode objects we only care about 1->0 transitions.
633  		 */
634  		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
635  #if defined(DEBUG_LOCKS)
636  			debugvm_object_add(object, file, line, 0);
637  #endif
638  			vm_object_hold(object);
639  			vm_object_deallocate_locked(object);
640  			vm_object_drop(object);
641  			break;
642  		}
643  
644  		/*
645  		 * Try to decrement ref_count without acquiring a hold on
646  		 * the object.  This is particularly important for the exec*()
647  		 * and exit*() code paths because the program binary may
648  		 * have a great deal of sharing and an exclusive lock will
649  		 * crowbar performance in those circumstances.
650  		 */
651  		if (object->type == OBJT_VNODE) {
652  			vp = (struct vnode *)object->handle;
653  			if (atomic_fcmpset_int(&object->ref_count,
654  					       &count, count - 1)) {
655  #if defined(DEBUG_LOCKS)
656  				debugvm_object_add(object, file, line, -1);
657  #endif
658  
659  				vrele(vp);
660  				break;
661  			}
662  			/* retry */
663  		} else {
664  			if (atomic_fcmpset_int(&object->ref_count,
665  					       &count, count - 1)) {
666  #if defined(DEBUG_LOCKS)
667  				debugvm_object_add(object, file, line, -1);
668  #endif
669  				break;
670  			}
671  			/* retry */
672  		}
673  		cpu_pause();
674  		/* retry */
675  	}
676  }
677  
678  void
679  VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
680  {
681  	/*
682  	 * Degenerate case
683  	 */
684  	if (object == NULL)
685  		return;
686  
687  	/*
688  	 * vnode case, caller either locked the object exclusively
689  	 * or this is a recursion with must_drop != 0 and the vnode
690  	 * object will be locked shared.
691  	 *
692  	 * If locked shared we have to drop the object before we can
693  	 * call vrele() or risk a shared/exclusive livelock.
694  	 */
695  	if (object->type == OBJT_VNODE) {
696  		ASSERT_LWKT_TOKEN_HELD(&object->token);
697  		vm_object_vndeallocate(object, NULL);
698  		return;
699  	}
700  	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
701  
702  	/*
703  	 * Normal case (object is locked exclusively)
704  	 */
705  	if (object->ref_count == 0) {
706  		panic("vm_object_deallocate: object deallocated "
707  		      "too many times: %d", object->type);
708  	}
709  	if (object->ref_count > 2) {
710  		atomic_add_int(&object->ref_count, -1);
711  #if defined(DEBUG_LOCKS)
712  		debugvm_object_add(object, file, line, -1);
713  #endif
714  		return;
715  	}
716  
717  	/*
718  	 * Drop the ref and handle termination on the 1->0 transition.
719  	 * We may have blocked above so we have to recheck.
720  	 */
721  	KKASSERT(object->ref_count != 0);
722  	if (object->ref_count >= 2) {
723  		atomic_add_int(&object->ref_count, -1);
724  #if defined(DEBUG_LOCKS)
725  		debugvm_object_add(object, file, line, -1);
726  #endif
727  		return;
728  	}
729  
730  	atomic_add_int(&object->ref_count, -1);
731  	if ((object->flags & OBJ_DEAD) == 0)
732  		vm_object_terminate(object);
733  }
734  
735  /*
736   * Destroy the specified object, freeing up related resources.
737   *
738   * The object must have zero references.
739   *
740   * The object must be held.  The caller is responsible for dropping the object
741   * after terminate returns.  Terminate does NOT drop the object.
742   */
743  static int vm_object_terminate_callback(vm_page_t p, void *data);
744  
745  void
746  vm_object_terminate(vm_object_t object)
747  {
748  	struct rb_vm_page_scan_info info;
749  	struct vm_object_hash *hash;
750  
751  	/*
752  	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
753  	 * able to safely block.
754  	 */
755  	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
756  	KKASSERT((object->flags & OBJ_DEAD) == 0);
757  	vm_object_set_flag(object, OBJ_DEAD);
758  
759  	/*
760  	 * Wait for the pageout daemon to be done with the object
761  	 */
762  	vm_object_pip_wait(object, "objtrm1");
763  
764  	KASSERT(!object->paging_in_progress,
765  		("vm_object_terminate: pageout in progress"));
766  
767  	/*
768  	 * Clean and free the pages, as appropriate. All references to the
769  	 * object are gone, so we don't need to lock it.
770  	 */
771  	if (object->type == OBJT_VNODE) {
772  		struct vnode *vp;
773  
774  		/*
775  		 * Clean pages and flush buffers.
776  		 *
777  		 * NOTE!  TMPFS buffer flushes do not typically flush the
778  		 *	  actual page to swap as this would be highly
779  		 *	  inefficient, and normal filesystems usually wrap
780  		 *	  page flushes with buffer cache buffers.
781  		 *
782  		 *	  To deal with this we have to call vinvalbuf() both
783  		 *	  before and after the vm_object_page_clean().
784  		 */
785  		vp = (struct vnode *) object->handle;
786  		vinvalbuf(vp, V_SAVE, 0, 0);
787  		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
788  		vinvalbuf(vp, V_SAVE, 0, 0);
789  	}
790  
791  	/*
792  	 * Wait for any I/O to complete, after which there had better not
793  	 * be any references left on the object.
794  	 */
795  	vm_object_pip_wait(object, "objtrm2");
796  
797  	if (object->ref_count != 0) {
798  		panic("vm_object_terminate: object with references, "
799  		      "ref_count=%d", object->ref_count);
800  	}
801  
802  	/*
803  	 * Cleanup any shared pmaps associated with this object.
804  	 */
805  	pmap_object_free(object);
806  
807  	/*
808  	 * Now free any remaining pages. For internal objects, this also
809  	 * removes them from paging queues. Don't free wired pages, just
810  	 * remove them from the object.
811  	 */
812  	info.count = 0;
813  	info.object = object;
814  	do {
815  		info.error = 0;
816  		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
817  					vm_object_terminate_callback, &info);
818  	} while (info.error);
819  
820  	/*
821  	 * Let the pager know object is dead.
822  	 */
823  	vm_pager_deallocate(object);
824  
825  	/*
826  	 * Loop until rb_memq is empty, cleaning out remaining pages as
827  	 * we go.  The hash token interlocks any race conditions that might
828  	 * pick the object up from the hash list after we have cleared
829  	 * rb_memq.
830  	 */
831  	for (;;) {
832  		if (RB_ROOT(&object->rb_memq) == NULL)
833  			break;
834  		kprintf("vm_object_terminate: Warning, object %p "
835  			"still has %ld pages\n",
836  			object, object->resident_page_count);
837  		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
838  					vm_object_terminate_callback, &info);
839  	}
840  
841  	/*
842  	 * There had better not be any pages left
843  	 */
844  	KKASSERT(object->resident_page_count == 0);
845  
846  	/*
847  	 * Remove the object from the global object list.
848  	 */
849  	hash = vmobj_hash(object);
850  	lwkt_gettoken(&hash->token);
851  	TAILQ_REMOVE(&hash->list, object, object_entry);
852  	lwkt_reltoken(&hash->token);
853  
854  	if (object->ref_count != 0) {
855  		panic("vm_object_terminate2: object with references, "
856  		      "ref_count=%d", object->ref_count);
857  	}
858  
859  	/*
860  	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
861  	 *	 the object here.  See vm_object_drop().
862  	 */
863  }
864  
865  /*
866   * The caller must hold the object.
867   *
868   * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
869   *	 or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
870   *	 is called, due to normal pmap operations.  This is because only
871   *	 global pmap operations on the vm_page can clear the bits and not
872   *	 just local operations on individual pmaps.
873   *
874   *	 Most interactions that necessitate the clearing of these bits
875   *	 proactively call vm_page_protect(), and we must do so here as well.
876   */
877  static int
878  vm_object_terminate_callback(vm_page_t p, void *data)
879  {
880  	struct rb_vm_page_scan_info *info = data;
881  	vm_object_t object;
882  
883  	object = p->object;
884  	KKASSERT(object == info->object);
885  	if (vm_page_busy_try(p, TRUE)) {
886  		vm_page_sleep_busy(p, TRUE, "vmotrm");
887  		info->error = 1;
888  		return 0;
889  	}
890  	if (object != p->object) {
891  		/* XXX remove once we determine it can't happen */
892  		kprintf("vm_object_terminate: Warning: Encountered "
893  			"busied page %p on queue %d\n", p, p->queue);
894  		vm_page_wakeup(p);
895  		info->error = 1;
896  	} else if (p->wire_count == 0) {
897  		/*
898  		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
899  		 */
900  		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
901  			vm_page_protect(p, VM_PROT_NONE);
902  		vm_page_free(p);
903  		mycpu->gd_cnt.v_pfree++;
904  	} else {
905  		if (p->queue != PQ_NONE) {
906  			kprintf("vm_object_terminate: Warning: Encountered "
907  				"wired page %p on queue %d\n", p, p->queue);
908  			if (vm_object_debug > 0) {
909  				--vm_object_debug;
910  				print_backtrace(10);
911  			}
912  		}
913  		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
914  			vm_page_protect(p, VM_PROT_NONE);
915  		vm_page_remove(p);
916  		vm_page_wakeup(p);
917  	}
918  
919  	/*
920  	 * Must be at end to avoid SMP races, caller holds object token
921  	 */
922  	if ((++info->count & 63) == 0)
923  		lwkt_user_yield();
924  	return(0);
925  }
926  
927  /*
928   * Clean all dirty pages in the specified range of object.  Leaves page
929   * on whatever queue it is currently on.   If NOSYNC is set then do not
930   * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
931   * leaving the object dirty.
932   *
933   * When stuffing pages asynchronously, allow clustering.  XXX we need a
934   * synchronous clustering mode implementation.
935   *
936   * Odd semantics: if start == end, we clean everything.
937   *
938   * The object must be locked? XXX
939   */
940  static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
941  static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
942  
943  void
944  vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
945  		     int flags)
946  {
947  	struct rb_vm_page_scan_info info;
948  	struct vnode *vp;
949  	int wholescan;
950  	int pagerflags;
951  	int generation;
952  
953  	vm_object_hold(object);
954  	if (object->type != OBJT_VNODE ||
955  	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
956  		vm_object_drop(object);
957  		return;
958  	}
959  
960  	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
961  			OBJPC_SYNC : OBJPC_CLUSTER_OK;
962  	pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
963  
964  	vp = object->handle;
965  
966  	/*
967  	 * Interlock other major object operations.  This allows us to
968  	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
969  	 */
970  	vm_object_set_flag(object, OBJ_CLEANING);
971  
972  	/*
973  	 * Handle 'entire object' case
974  	 */
975  	info.start_pindex = start;
976  	if (end == 0) {
977  		info.end_pindex = object->size - 1;
978  	} else {
979  		info.end_pindex = end - 1;
980  	}
981  	wholescan = (start == 0 && info.end_pindex == object->size - 1);
982  	info.limit = flags;
983  	info.pagerflags = pagerflags;
984  	info.object = object;
985  
986  	/*
987  	 * If cleaning the entire object do a pass to mark the pages read-only.
988  	 * If everything worked out ok, clear OBJ_WRITEABLE and
989  	 * OBJ_MIGHTBEDIRTY.
990  	 */
991  	if (wholescan) {
992  		info.error = 0;
993  		info.count = 0;
994  		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995  					vm_object_page_clean_pass1, &info);
996  		if (info.error == 0) {
997  			vm_object_clear_flag(object,
998  					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
999  			if (object->type == OBJT_VNODE &&
1000  			    (vp = (struct vnode *)object->handle) != NULL) {
1001  				/*
1002  				 * Use new-style interface to clear VISDIRTY
1003  				 * because the vnode is not necessarily removed
1004  				 * from the syncer list(s) as often as it was
1005  				 * under the old interface, which can leave
1006  				 * the vnode on the syncer list after reclaim.
1007  				 */
1008  				vclrobjdirty(vp);
1009  			}
1010  		}
1011  	}
1012  
1013  	/*
1014  	 * Do a pass to clean all the dirty pages we find.
1015  	 */
1016  	do {
1017  		info.error = 0;
1018  		info.count = 0;
1019  		generation = object->generation;
1020  		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1021  					vm_object_page_clean_pass2, &info);
1022  	} while (info.error || generation != object->generation);
1023  
1024  	vm_object_clear_flag(object, OBJ_CLEANING);
1025  	vm_object_drop(object);
1026  }
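
/*
 * Illustrative sketch (not from the original source): common flag
 * combinations for vm_object_page_clean().  start == end == 0 means
 * "clean the entire object", as used by vm_object_terminate() above.
 */
#if 0
	/* synchronous full flush, e.g. during vnode object termination */
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);

	/* asynchronous flush skipping MAP_NOSYNC (PG_NOSYNC) pages */
	vm_object_page_clean(object, 0, 0, OBJPC_NOSYNC);
#endif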
1027  
1028  /*
1029   * The caller must hold the object.
1030   */
1031  static
1032  int
1033  vm_object_page_clean_pass1(struct vm_page *p, void *data)
1034  {
1035  	struct rb_vm_page_scan_info *info = data;
1036  
1037  	KKASSERT(p->object == info->object);
1038  
1039  	vm_page_flag_set(p, PG_CLEANCHK);
1040  	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1041  		info->error = 1;
1042  	} else if (vm_page_busy_try(p, FALSE)) {
1043  		info->error = 1;
1044  	} else {
1045  		KKASSERT(p->object == info->object);
1046  		vm_page_protect(p, VM_PROT_READ);
1047  		vm_page_wakeup(p);
1048  	}
1049  
1050  	/*
1051  	 * Must be at end to avoid SMP races, caller holds object token
1052  	 */
1053  	if ((++info->count & 63) == 0)
1054  		lwkt_user_yield();
1055  	return(0);
1056  }
1057  
1058  /*
1059   * The caller must hold the object
1060   */
1061  static
1062  int
1063  vm_object_page_clean_pass2(struct vm_page *p, void *data)
1064  {
1065  	struct rb_vm_page_scan_info *info = data;
1066  	int generation;
1067  
1068  	KKASSERT(p->object == info->object);
1069  
1070  	/*
1071  	 * Do not mess with pages that were inserted after we started
1072  	 * the cleaning pass.
1073  	 */
1074  	if ((p->flags & PG_CLEANCHK) == 0)
1075  		goto done;
1076  
1077  	generation = info->object->generation;
1078  
1079  	if (vm_page_busy_try(p, TRUE)) {
1080  		vm_page_sleep_busy(p, TRUE, "vpcwai");
1081  		info->error = 1;
1082  		goto done;
1083  	}
1084  
1085  	KKASSERT(p->object == info->object &&
1086  		 info->object->generation == generation);
1087  
1088  	/*
1089  	 * Before wasting time traversing the pmaps, check for trivial
1090  	 * cases where the page cannot be dirty.
1091  	 */
1092  	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1093  		KKASSERT((p->dirty & p->valid) == 0 &&
1094  			 (p->flags & PG_NEED_COMMIT) == 0);
1095  		vm_page_wakeup(p);
1096  		goto done;
1097  	}
1098  
1099  	/*
1100  	 * Check whether the page is dirty or not.  The page has been set
1101  	 * to be read-only so the check will not race a user dirtying the
1102  	 * page.
1103  	 */
1104  	vm_page_test_dirty(p);
1105  	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1106  		vm_page_flag_clear(p, PG_CLEANCHK);
1107  		vm_page_wakeup(p);
1108  		goto done;
1109  	}
1110  
1111  	/*
1112  	 * If we have been asked to skip nosync pages and this is a
1113  	 * nosync page, skip it.  Note that the object flags were
1114  	 * not cleared in this case (because pass1 will have returned an
1115  	 * error), so we do not have to set them.
1116  	 */
1117  	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1118  		vm_page_flag_clear(p, PG_CLEANCHK);
1119  		vm_page_wakeup(p);
1120  		goto done;
1121  	}
1122  
1123  	/*
1124  	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1125  	 * the pages that get successfully flushed.  Set info->error if
1126  	 * we raced an object modification.
1127  	 */
1128  	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1129  	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1130  
1131  	/*
1132  	 * Must be at end to avoid SMP races, caller holds object token
1133  	 */
1134  done:
1135  	if ((++info->count & 63) == 0)
1136  		lwkt_user_yield();
1137  	return(0);
1138  }
1139  
1140  /*
1141   * Collect the specified page and nearby pages and flush them out.
1142   * The number of pages flushed is returned.  The passed page is busied
1143   * by the caller and we are responsible for its disposition.
1144   *
1145   * The caller must hold the object.
1146   */
1147  static void
1148  vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1149  {
1150  	int error;
1151  	int is;
1152  	int ib;
1153  	int i;
1154  	int page_base;
1155  	vm_pindex_t pi;
1156  	vm_page_t ma[BLIST_MAX_ALLOC];
1157  
1158  	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1159  
1160  	pi = p->pindex;
1161  	page_base = pi % BLIST_MAX_ALLOC;
1162  	ma[page_base] = p;
1163  	ib = page_base - 1;
1164  	is = page_base + 1;
1165  
1166  	while (ib >= 0) {
1167  		vm_page_t tp;
1168  
1169  		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1170  					     TRUE, &error);
1171  		if (error)
1172  			break;
1173  		if (tp == NULL)
1174  			break;
1175  		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1176  		    (tp->flags & PG_CLEANCHK) == 0) {
1177  			vm_page_wakeup(tp);
1178  			break;
1179  		}
1180  		if ((tp->queue - tp->pc) == PQ_CACHE) {
1181  			vm_page_flag_clear(tp, PG_CLEANCHK);
1182  			vm_page_wakeup(tp);
1183  			break;
1184  		}
1185  		vm_page_test_dirty(tp);
1186  		if ((tp->dirty & tp->valid) == 0 &&
1187  		    (tp->flags & PG_NEED_COMMIT) == 0) {
1188  			vm_page_flag_clear(tp, PG_CLEANCHK);
1189  			vm_page_wakeup(tp);
1190  			break;
1191  		}
1192  		ma[ib] = tp;
1193  		--ib;
1194  	}
1195  	++ib;	/* fixup */
1196  
1197  	while (is < BLIST_MAX_ALLOC &&
1198  	       pi - page_base + is < object->size) {
1199  		vm_page_t tp;
1200  
1201  		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1202  					     TRUE, &error);
1203  		if (error)
1204  			break;
1205  		if (tp == NULL)
1206  			break;
1207  		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1208  		    (tp->flags & PG_CLEANCHK) == 0) {
1209  			vm_page_wakeup(tp);
1210  			break;
1211  		}
1212  		if ((tp->queue - tp->pc) == PQ_CACHE) {
1213  			vm_page_flag_clear(tp, PG_CLEANCHK);
1214  			vm_page_wakeup(tp);
1215  			break;
1216  		}
1217  		vm_page_test_dirty(tp);
1218  		if ((tp->dirty & tp->valid) == 0 &&
1219  		    (tp->flags & PG_NEED_COMMIT) == 0) {
1220  			vm_page_flag_clear(tp, PG_CLEANCHK);
1221  			vm_page_wakeup(tp);
1222  			break;
1223  		}
1224  		ma[is] = tp;
1225  		++is;
1226  	}
1227  
1228  	/*
1229  	 * All pages in the ma[] array are busied now
1230  	 */
1231  	for (i = ib; i < is; ++i) {
1232  		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1233  		vm_page_hold(ma[i]);	/* XXX need this any more? */
1234  	}
1235  	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1236  	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1237  		vm_page_unhold(ma[i]);
1238  }
1239  
1240  /*
1241   * Implements the madvise function at the object/page level.
1242   *
1243   * MADV_WILLNEED	(any object)
1244   *
1245   *	Activate the specified pages if they are resident.
1246   *
1247   * MADV_DONTNEED	(any object)
1248   *
1249   *	Deactivate the specified pages if they are resident.
1250   *
1251   * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1252   *
1253   *	Deactivate and clean the specified pages if they are
1254   *	resident.  This permits the process to reuse the pages
1255   *	without faulting or the kernel to reclaim the pages
1256   *	without I/O.
1257   *
1258   * No requirements.
1259   */
1260  void
1261  vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1262  		  vm_pindex_t count, int advise)
1263  {
1264  	vm_pindex_t end;
1265  	vm_page_t m;
1266  	int error;
1267  
1268  	if (object == NULL)
1269  		return;
1270  
1271  	end = pindex + count;
1272  
1273  	vm_object_hold(object);
1274  
1275  	/*
1276  	 * Locate and adjust resident pages.  This only applies to the
1277  	 * primary object in the mapping.
1278  	 */
1279  	for (; pindex < end; pindex += 1) {
1280  relookup:
1281  		/*
1282  		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1283  		 * and those pages must be OBJ_ONEMAPPING.
1284  		 */
1285  		if (advise == MADV_FREE) {
1286  			if ((object->type != OBJT_DEFAULT &&
1287  			     object->type != OBJT_SWAP) ||
1288  			    (object->flags & OBJ_ONEMAPPING) == 0) {
1289  				continue;
1290  			}
1291  		}
1292  
1293  		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1294  
1295  		if (error) {
1296  			vm_page_sleep_busy(m, TRUE, "madvpo");
1297  			goto relookup;
1298  		}
1299  		if (m == NULL) {
1300  			/*
1301  			 * There may be swap even if there is no backing page
1302  			 */
1303  			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1304  				swap_pager_freespace(object, pindex, 1);
1305  			continue;
1306  		}
1307  
1308  		/*
1309  		 * If the page is not in a normal active state, we skip it.
1310  		 * If the page is not managed there are no page queues to
1311  		 * mess with.  Things can break if we mess with pages in
1312  		 * any of the below states.
1313  		 */
1314  		if (m->wire_count ||
1315  		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1316  				 PG_NEED_COMMIT)) ||
1317  		    m->valid != VM_PAGE_BITS_ALL
1318  		) {
1319  			vm_page_wakeup(m);
1320  			continue;
1321  		}
1322  
1323  		/*
1324  		 * Theoretically once a page is known not to be busy, an
1325  		 * interrupt cannot come along and rip it out from under us.
1326  		 */
1327  		if (advise == MADV_WILLNEED) {
1328  			vm_page_activate(m);
1329  		} else if (advise == MADV_DONTNEED) {
1330  			vm_page_dontneed(m);
1331  		} else if (advise == MADV_FREE) {
1332  			/*
1333  			 * Mark the page clean.  This will allow the page
1334  			 * to be freed up by the system.  However, such pages
1335  			 * are often reused quickly by malloc()/free()
1336  			 * so we do not do anything that would cause
1337  			 * a page fault if we can help it.
1338  			 *
1339  			 * Specifically, we do not try to actually free
1340  			 * the page now nor do we try to put it in the
1341  			 * cache (which would cause a page fault on reuse).
1342  			 *
1343  			 * But we do make the page as freeable as we
1344  			 * can without actually taking the step of unmapping
1345  			 * it.
1346  			 */
1347  			pmap_clear_modify(m);
1348  			m->dirty = 0;
1349  			m->act_count = 0;
1350  			vm_page_dontneed(m);
1351  			if (object->type == OBJT_SWAP)
1352  				swap_pager_freespace(object, pindex, 1);
1353  		}
1354  		vm_page_wakeup(m);
1355  	}
1356  	vm_object_drop(object);
1357  }
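
/*
 * Illustrative sketch (not from the original source): how the madvise
 * advice values map onto this routine; pindex/npages are hypothetical.
 */
#if 0
	vm_object_madvise(object, pindex, npages, MADV_WILLNEED); /* activate */
	vm_object_madvise(object, pindex, npages, MADV_DONTNEED); /* deactivate */
	vm_object_madvise(object, pindex, npages, MADV_FREE);	   /* mark clean */
#endif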
1358  
1359  /*
1360   * Removes all physical pages in the specified object range from the
1361   * object's list of pages.
1362   *
1363   * No requirements.
1364   */
1365  static int vm_object_page_remove_callback(vm_page_t p, void *data);
1366  
1367  void
1368  vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1369  		      boolean_t clean_only)
1370  {
1371  	struct rb_vm_page_scan_info info;
1372  	int all;
1373  
1374  	/*
1375  	 * Degenerate cases and assertions.
1376  	 *
1377  	 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1378  	 *	 These objects do not have to have their pages entered into
1379  	 *	 them and are handled via their vm_map_backing lists.
1380  	 */
1381  	vm_object_hold(object);
1382  	if (object == NULL ||
1383  	    (object->type != OBJT_MGTDEVICE &&
1384  	     object->resident_page_count == 0 && object->swblock_count == 0)) {
1385  		vm_object_drop(object);
1386  		return;
1387  	}
1388  	KASSERT(object->type != OBJT_PHYS,
1389  		("attempt to remove pages from a physical object"));
1390  
1391  	/*
1392  	 * Indicate that paging is occurring on the object
1393  	 */
1394  	vm_object_pip_add(object, 1);
1395  
1396  	/*
1397  	 * Figure out the actual removal range and whether we are removing
1398  	 * the entire contents of the object or not.  If removing the entire
1399  	 * contents, be sure to get all pages, even those that might be
1400  	 * beyond the end of the object.
1401  	 *
1402  	 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1403  	 */
1404  	info.object = object;
1405  	info.start_pindex = start;
1406  	if (end == 0 || end == (vm_pindex_t)-1) {
1407  		info.end_pindex = (vm_pindex_t)-1;
1408  		end = object->size;
1409  	} else {
1410  		info.end_pindex = end - 1;
1411  	}
1412  	info.limit = clean_only;
1413  	info.count = 0;
1414  	all = (start == 0 && info.end_pindex >= object->size - 1);
1415  
1416  	/*
1417  	 * Efficiently remove pages from the pmap via a backing scan.
1418  	 *
1419  	 * NOTE: This is the only way pages can be removed and unwired
1420  	 *	 from OBJT_MGTDEVICE devices which typically do not enter
1421  	 *	 their pages into the vm_object's RB tree.  And possibly
1422  	 *	 other OBJT_* types in the future.
1423  	 */
1424  	{
1425  		vm_map_backing_t ba;
1426  		vm_pindex_t sba, eba;
1427  		vm_offset_t sva, eva;
1428  
1429  		lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1430  		TAILQ_FOREACH(ba, &object->backing_list, entry) {
1431  			/*
1432  			 * object offset range within the ba, intersected
1433  			 * with the page range specified for the object
1434  			 */
1435  			sba = OFF_TO_IDX(ba->offset);
1436  			eba = sba + OFF_TO_IDX(ba->end - ba->start);
1437  			if (sba < start)
1438  				sba = start;
1439  			if (eba > end)
1440  				eba = end;
1441  
1442  			/*
1443  			 * If the intersection is valid, remove the related
1444  			 * pages.
1445  			 *
1446  			 * NOTE! This may also remove other incidental pages
1447  			 *	 in the pmap, as the backing area may be
1448  			 *	 overloaded.
1449  			 *
1450  			 * NOTE! pages for MGTDEVICE objects are only removed
1451  			 *	 here, they aren't entered into rb_memq, so
1452  			 *	 we must use pmap_remove() instead of
1453  			 *	 the non-TLB-invalidating pmap_remove_pages().
1454  			 */
1455  			if (sba < eba) {
1456  				sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1457  				eva = sva + IDX_TO_OFF(eba - sba);
1458  #if 0
1459  				kprintf("VM_OBJECT_PAGE_REMOVE "
1460  					"%p[%016jx] %016jx-%016jx\n",
1461  					ba->pmap, ba->start, sva, eva);
1462  #endif
1463  				pmap_remove(ba->pmap, sva, eva);
1464  			}
1465  		}
1466  		lockmgr(&object->backing_lk, LK_RELEASE);
1467  	}
1468  
1469  	/*
1470  	 * Remove and free pages entered onto the object list.  Note that
1471  	 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1472  	 *
1473  	 * Loop until we are sure we have gotten them all.
1474  	 */
1475  	do {
1476  		info.error = 0;
1477  		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1478  					vm_object_page_remove_callback, &info);
1479  	} while (info.error);
1480  
1481  	/*
1482  	 * Remove any related swap if throwing away pages, or for
1483  	 * non-swap objects (the swap is a clean copy in that case).
1484  	 */
1485  	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1486  		if (all)
1487  			swap_pager_freespace_all(object);
1488  		else
1489  			swap_pager_freespace(object, info.start_pindex,
1490  			     info.end_pindex - info.start_pindex + 1);
1491  	}
1492  
1493  	/*
1494  	 * Cleanup
1495  	 */
1496  	vm_object_pip_wakeup(object);
1497  	vm_object_drop(object);
1498  }
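
/*
 * Illustrative sketch (not from the original source): the typical
 * truncation pattern, removing every page at or beyond the new end of
 * the file.  'newsize' is hypothetical; end == 0 means "to the end".
 */
#if 0
	vm_object_page_remove(object, OFF_TO_IDX(newsize + PAGE_MASK),
			      0, FALSE);
#endif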
1499  
1500  /*
1501   * The caller must hold the object.
1502   *
1503   * NOTE: User yields are allowed when removing more than one page, but not
1504   *	 allowed if only removing one page (the path for single page removals
1505   *	 might hold a spinlock).
1506   */
1507  static int
1508  vm_object_page_remove_callback(vm_page_t p, void *data)
1509  {
1510  	struct rb_vm_page_scan_info *info = data;
1511  
1512  	if (info->object != p->object ||
1513  	    p->pindex < info->start_pindex ||
1514  	    p->pindex > info->end_pindex) {
1515  		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1516  			info->object, p);
1517  		return(0);
1518  	}
1519  	if (vm_page_busy_try(p, TRUE)) {
1520  		vm_page_sleep_busy(p, TRUE, "vmopar");
1521  		info->error = 1;
1522  		return(0);
1523  	}
1524  	if (info->object != p->object) {
1525  		/* this should never happen */
1526  		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1527  			info->object, p);
1528  		vm_page_wakeup(p);
1529  		return(0);
1530  	}
1531  
1532  	/*
1533  	 * Wired pages cannot be destroyed, but they can be invalidated
1534  	 * and we do so if clean_only (limit) is not set.
1535  	 *
1536  	 * WARNING!  The page may be wired due to being part of a buffer
1537  	 *	     cache buffer, and the buffer might be marked B_CACHE.
1538  	 *	     This is fine as part of a truncation but VFSs must be
1539  	 *	     sure to fix the buffer up when re-extending the file.
1540  	 *
1541  	 * NOTE!     PG_NEED_COMMIT is ignored.
1542  	 */
1543  	if (p->wire_count != 0) {
1544  		vm_page_protect(p, VM_PROT_NONE);
1545  		if (info->limit == 0)
1546  			p->valid = 0;
1547  		vm_page_wakeup(p);
1548  		goto done;
1549  	}
1550  
1551  	/*
1552  	 * limit is our clean_only flag.  If set and the page is dirty or
1553  	 * requires a commit, do not free it.  If set and the page is being
1554  	 * held by someone, do not free it.
1555  	 */
1556  	if (info->limit && p->valid) {
1557  		vm_page_test_dirty(p);
1558  		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1559  			vm_page_wakeup(p);
1560  			goto done;
1561  		}
1562  	}
1563  
1564  	/*
1565  	 * Destroy the page.  But we have to re-test whether it's dirty after
1566  	 * removing it from its pmaps.
1567  	 */
1568  	vm_page_protect(p, VM_PROT_NONE);
1569  	if (info->limit && p->valid) {
1570  		vm_page_test_dirty(p);
1571  		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1572  			vm_page_wakeup(p);
1573  			goto done;
1574  		}
1575  	}
1576  	vm_page_free(p);
1577  
1578  	/*
1579  	 * Must be at end to avoid SMP races, caller holds object token
1580  	 */
1581  done:
1582  	if ((++info->count & 63) == 0)
1583  		lwkt_user_yield();
1584  
1585  	return(0);
1586  }
1587  
1588  /*
1589   * Try to extend prev_object into an adjoining region of virtual
1590   * memory, return TRUE on success.
1591   *
1592   * The caller does not need to hold (prev_object) but must have a stable
1593   * pointer to it (typically by holding the vm_map locked).
1594   *
1595   * This function only works for anonymous memory objects which either
1596   * have (a) one reference or (b) we are extending the object's size.
1597   * Otherwise the related VM pages we want to use for the object might
1598   * be in use by another mapping.
1599   */
1600  boolean_t
1601  vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1602  		   vm_size_t prev_size, vm_size_t next_size)
1603  {
1604  	vm_pindex_t next_pindex;
1605  
1606  	if (prev_object == NULL)
1607  		return (TRUE);
1608  
1609  	vm_object_hold(prev_object);
1610  
1611  	if (prev_object->type != OBJT_DEFAULT &&
1612  	    prev_object->type != OBJT_SWAP) {
1613  		vm_object_drop(prev_object);
1614  		return (FALSE);
1615  	}
1616  
1617  #if 0
1618  	/* caller now checks this */
1619  	/*
1620  	 * Try to collapse the object first
1621  	 */
1622  	vm_object_collapse(prev_object, NULL);
1623  #endif
1624  
1625  #if 0
1626  	/* caller now checks this */
1627  	/*
1628  	 * We can't coalesce if we shadow another object (figuring out the
1629  	 * relationships become too complex).
1630  	 * relationships becomes too complex).
1631  	if (prev_object->backing_object != NULL) {
1632  		vm_object_chain_release(prev_object);
1633  		vm_object_drop(prev_object);
1634  		return (FALSE);
1635  	}
1636  #endif
1637  
1638  	prev_size >>= PAGE_SHIFT;
1639  	next_size >>= PAGE_SHIFT;
1640  	next_pindex = prev_pindex + prev_size;
1641  
1642  	/*
1643  	 * We can't coalesce if the object has more than one reference unless we
1644  	 * are extending it into newly minted space.
1645  	 */
1646  	if (prev_object->ref_count > 1 &&
1647  	    prev_object->size != next_pindex) {
1648  		vm_object_drop(prev_object);
1649  		return (FALSE);
1650  	}
1651  
1652  	/*
1653  	 * Remove any pages that may still be in the object from a previous
1654  	 * deallocation.
1655  	 */
1656  	if (next_pindex < prev_object->size) {
1657  		vm_object_page_remove(prev_object,
1658  				      next_pindex,
1659  				      next_pindex + next_size, FALSE);
1660  		if (prev_object->type == OBJT_SWAP)
1661  			swap_pager_freespace(prev_object,
1662  					     next_pindex, next_size);
1663  	}
1664  
1665  	/*
1666  	 * Extend the object if necessary.
1667  	 */
1668  	if (next_pindex + next_size > prev_object->size)
1669  		prev_object->size = next_pindex + next_size;
1670  	vm_object_drop(prev_object);
1671  
1672  	return (TRUE);
1673  }
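
/*
 * Illustrative sketch (not from the original source): the map code
 * uses this to grow an anonymous mapping in place instead of creating
 * a new object; the variables shown are hypothetical.
 */
#if 0
	if (vm_object_coalesce(prev_object, prev_pindex,
			       prev_size, grow_size)) {
		/* extend the existing map entry over the new range */
	} else {
		/* allocate a fresh object for the new range */
	}
#endif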
1674  
1675  /*
1676   * Make the object writable and flag it as possibly being dirty.
1677   *
1678   * The object might not be held (or might be held but held shared),
1679   * the related vnode is probably not held either.  Object and vnode are
1680   * stable by virtue of the vm_page busied by the caller preventing
1681   * destruction.
1682   *
1683   * If the related mount is flagged MNTK_THR_SYNC we need to call
1684   * vsetobjdirty().  Filesystems using this option usually shortcut
1685   * synchronization by only scanning the syncer list.
1686   */
1687  void
1688  vm_object_set_writeable_dirty(vm_object_t object)
1689  {
1690  	struct vnode *vp;
1691  
1692  	/*vm_object_assert_held(object);*/
1693  	/*
1694  	 * Avoid contention in vm fault path by checking the state before
1695  	 * issuing an atomic op on it.
1696  	 */
1697  	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1698  	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1699  		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1700  	}
1701  	if (object->type == OBJT_VNODE &&
1702  	    (vp = (struct vnode *)object->handle) != NULL) {
1703  		if ((vp->v_flag & VOBJDIRTY) == 0) {
1704  			if (vp->v_mount &&
1705  			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1706  				/*
1707  				 * New style THR_SYNC places vnodes on the
1708  				 * syncer list more deterministically.
1709  				 */
1710  				vsetobjdirty(vp);
1711  			} else {
1712  				/*
1713  				 * Old style scan would not necessarily place
1714  				 * a vnode on the syncer list when possibly
1715  				 * modified via mmap.
1716  				 */
1717  				vsetflags(vp, VOBJDIRTY);
1718  			}
1719  		}
1720  	}
1721  }
1722  
1723  #include "opt_ddb.h"
1724  #ifdef DDB
1725  #include <sys/cons.h>
1726  
1727  #include <ddb/ddb.h>
1728  
1729  static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1730  				       vm_map_entry_t entry);
1731  static int	vm_object_in_map (vm_object_t object);
1732  
1733  /*
1734   * The caller must hold the object.
1735   */
1736  static int
1737  _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1738  {
1739  	vm_map_backing_t ba;
1740  	vm_map_t tmpm;
1741  	vm_map_entry_t tmpe;
1742  	int entcount;
1743  
1744  	if (map == NULL)
1745  		return 0;
1746  	if (entry == NULL) {
1747  		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1748  		entcount = map->nentries;
1749  		while (entcount-- && tmpe) {
1750  			if( _vm_object_in_map(map, object, tmpe)) {
1751  				return 1;
1752  			}
1753  			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1754  		}
1755  		return (0);
1756  	}
1757  	switch(entry->maptype) {
1758  	case VM_MAPTYPE_SUBMAP:
1759  		tmpm = entry->ba.sub_map;
1760  		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1761  		entcount = tmpm->nentries;
1762  		while (entcount-- && tmpe) {
1763  			if( _vm_object_in_map(tmpm, object, tmpe)) {
1764  				return 1;
1765  			}
1766  			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1767  		}
1768  		break;
1769  	case VM_MAPTYPE_NORMAL:
1770  		ba = &entry->ba;
1771  		while (ba) {
1772  			if (ba->object == object)
1773  				return TRUE;
1774  			ba = ba->backing_ba;
1775  		}
1776  		break;
1777  	default:
1778  		break;
1779  	}
1780  	return 0;
1781  }
1782  
1783  static int vm_object_in_map_callback(struct proc *p, void *data);
1784  
1785  struct vm_object_in_map_info {
1786  	vm_object_t object;
1787  	int rv;
1788  };
1789  
1790  /*
1791   * Debugging only
1792   */
1793  static int
1794  vm_object_in_map(vm_object_t object)
1795  {
1796  	struct vm_object_in_map_info info;
1797  
1798  	info.rv = 0;
1799  	info.object = object;
1800  
1801  	allproc_scan(vm_object_in_map_callback, &info, 0);
1802  	if (info.rv)
1803  		return 1;
1804  	if( _vm_object_in_map(kernel_map, object, 0))
1805  		return 1;
1806  	if( _vm_object_in_map(pager_map, object, 0))
1807  		return 1;
1808  	if( _vm_object_in_map(buffer_map, object, 0))
1809  		return 1;
1810  	return 0;
1811  }
1812  
1813  /*
1814   * Debugging only
1815   */
1816  static int
1817  vm_object_in_map_callback(struct proc *p, void *data)
1818  {
1819  	struct vm_object_in_map_info *info = data;
1820  
1821  	if (p->p_vmspace) {
1822  		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1823  			info->rv = 1;
1824  			return -1;
1825  		}
1826  	}
1827  	return (0);
1828  }
1829  
1830  DB_SHOW_COMMAND(vmochk, vm_object_check)
1831  {
1832  	struct vm_object_hash *hash;
1833  	vm_object_t object;
1834  	int n;
1835  
1836  	/*
1837  	 * make sure that internal objs are in a map somewhere
1838  	 * and none have zero ref counts.
1839  	 */
1840  	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1841  		hash = &vm_object_hash[n];
1842  		for (object = TAILQ_FIRST(&hash->list);
1843  				object != NULL;
1844  				object = TAILQ_NEXT(object, object_entry)) {
1845  			if (object->type == OBJT_MARKER)
1846  				continue;
1847  			if (object->handle != NULL ||
1848  			    (object->type != OBJT_DEFAULT &&
1849  			     object->type != OBJT_SWAP)) {
1850  				continue;
1851  			}
1852  			if (object->ref_count == 0) {
1853  				db_printf("vmochk: internal obj has "
1854  					  "zero ref count: %ld\n",
1855  					  (long)object->size);
1856  			}
1857  			if (vm_object_in_map(object))
1858  				continue;
1859  			db_printf("vmochk: internal obj is not in a map: "
1860  				  "ref: %d, size: %lu: 0x%lx\n",
1861  				  object->ref_count, (u_long)object->size,
1862  				  (u_long)object->size);
1863  		}
1864  	}
1865  }
1866  
1867  /*
1868   * Debugging only
1869   */
1870  DB_SHOW_COMMAND(object, vm_object_print_static)
1871  {
1872  	/* XXX convert args. */
1873  	vm_object_t object = (vm_object_t)addr;
1874  	boolean_t full = have_addr;
1875  
1876  	vm_page_t p;
1877  
1878  	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1879  #define	count	was_count
1880  
1881  	int count;
1882  
1883  	if (object == NULL)
1884  		return;
1885  
1886  	db_iprintf(
1887  	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1888  	    object, (int)object->type, (u_long)object->size,
1889  	    object->resident_page_count, object->ref_count, object->flags);
1890  	/*
1891  	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1892  	 */
1893  	db_iprintf("\n");
1894  
1895  	if (!full)
1896  		return;
1897  
1898  	db_indent += 2;
1899  	count = 0;
1900  	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1901  		if (count == 0)
1902  			db_iprintf("memory:=");
1903  		else if (count == 6) {
1904  			db_printf("\n");
1905  			db_iprintf(" ...");
1906  			count = 0;
1907  		} else
1908  			db_printf(",");
1909  		count++;
1910  
1911  		db_printf("(off=0x%lx,page=0x%lx)",
1912  		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1913  	}
1914  	if (count != 0)
1915  		db_printf("\n");
1916  	db_indent -= 2;
1917  }
1918  
1919  /* XXX. */
1920  #undef count
1921  
1922  /*
1923   * XXX need this non-static entry for calling from vm_map_print.
1924   *
1925   * Debugging only
1926   */
1927  void
1928  vm_object_print(/* db_expr_t */ long addr,
1929  		boolean_t have_addr,
1930  		/* db_expr_t */ long count,
1931  		char *modif)
1932  {
1933  	vm_object_print_static(addr, have_addr, count, modif);
1934  }
1935  
1936  /*
1937   * Debugging only
1938   */
1939  DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1940  {
1941  	struct vm_object_hash *hash;
1942  	vm_object_t object;
1943  	int nl = 0;
1944  	int c;
1945  	int n;
1946  
1947  	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1948  		hash = &vm_object_hash[n];
1949  		for (object = TAILQ_FIRST(&hash->list);
1950  				object != NULL;
1951  				object = TAILQ_NEXT(object, object_entry)) {
1952  			vm_pindex_t idx, fidx;
1953  			vm_pindex_t osize;
1954  			vm_paddr_t pa = -1, padiff;
1955  			int rcount;
1956  			vm_page_t m;
1957  
1958  			if (object->type == OBJT_MARKER)
1959  				continue;
1960  			db_printf("new object: %p\n", (void *)object);
1961  			if ( nl > 18) {
1962  				c = cngetc();
1963  				if (c != ' ')
1964  					return;
1965  				nl = 0;
1966  			}
1967  			nl++;
1968  			rcount = 0;
1969  			fidx = 0;
1970  			osize = object->size;
1971  			if (osize > 128)
1972  				osize = 128;
1973  			for (idx = 0; idx < osize; idx++) {
1974  				m = vm_page_lookup(object, idx);
1975  				if (m == NULL) {
1976  					if (rcount) {
1977  						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1978  							(long)fidx, rcount, (long)pa);
1979  						if ( nl > 18) {
1980  							c = cngetc();
1981  							if (c != ' ')
1982  								return;
1983  							nl = 0;
1984  						}
1985  						nl++;
1986  						rcount = 0;
1987  					}
1988  					continue;
1989  				}
1990  
1991  				if (rcount &&
1992  					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1993  					++rcount;
1994  					continue;
1995  				}
1996  				if (rcount) {
1997  					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1998  					padiff >>= PAGE_SHIFT;
1999  					padiff &= PQ_L2_MASK;
2000  					if (padiff == 0) {
2001  						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
2002  						++rcount;
2003  						continue;
2004  					}
2005  					db_printf(" index(%ld)run(%d)pa(0x%lx)",
2006  						(long)fidx, rcount, (long)pa);
2007  					db_printf("pd(%ld)\n", (long)padiff);
2008  					if ( nl > 18) {
2009  						c = cngetc();
2010  						if (c != ' ')
2011  							return;
2012  						nl = 0;
2013  					}
2014  					nl++;
2015  				}
2016  				fidx = idx;
2017  				pa = VM_PAGE_TO_PHYS(m);
2018  				rcount = 1;
2019  			}
2020  			if (rcount) {
2021  				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2022  					(long)fidx, rcount, (long)pa);
2023  				if ( nl > 18) {
2024  					c = cngetc();
2025  					if (c != ' ')
2026  						return;
2027  					nl = 0;
2028  				}
2029  				nl++;
2030  			}
2031  		}
2032  	}
2033  }
2034  #endif /* DDB */
2035