xref: /dflybsd-src/sys/vm/vm_page.c (revision 0087561d6d4d84b8ac1a312cc720339cbf66781d)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
33  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
34  */
35 
36 /*
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62 /*
63  * Resident memory management module.  The module manipulates 'VM pages'.
64  * A VM page is the core building block for memory management.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/malloc.h>
70 #include <sys/proc.h>
71 #include <sys/vmmeter.h>
72 #include <sys/vnode.h>
73 #include <sys/kernel.h>
74 #include <sys/alist.h>
75 #include <sys/sysctl.h>
76 #include <sys/cpu_topology.h>
77 
78 #include <vm/vm.h>
79 #include <vm/vm_param.h>
80 #include <sys/lock.h>
81 #include <vm/vm_kern.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/vm_extern.h>
89 #include <vm/swap_pager.h>
90 
91 #include <machine/inttypes.h>
92 #include <machine/md_var.h>
93 #include <machine/specialreg.h>
94 
95 #include <vm/vm_page2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Action hash for user umtx support.
100  */
101 #define VMACTION_HSIZE		256
102 #define VMACTION_HMASK		(VMACTION_HSIZE - 1)
103 
104 /*
105  * SET - Minimum required set associative size, must be a power of 2.  We
106  *	 want this to match or exceed the set associativity of the cpu.
107  *
108  * GRP - A larger set that allows bleed-over into the domains of other
109  *	 nearby cpus.  Also must be a power of 2.  Used by the page zeroing
110  *	 code to smooth things out a bit.
111  */
112 #define PQ_SET_ASSOC		16
113 #define PQ_SET_ASSOC_MASK	(PQ_SET_ASSOC - 1)
114 
115 #define PQ_GRP_ASSOC		(PQ_SET_ASSOC * 2)
116 #define PQ_GRP_ASSOC_MASK	(PQ_GRP_ASSOC - 1)
117 
118 static void vm_page_queue_init(void);
119 static void vm_page_free_wakeup(void);
120 static vm_page_t vm_page_select_cache(u_short pg_color);
121 static vm_page_t _vm_page_list_find2(int basequeue, int index);
122 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
123 
124 /*
125  * Array of tailq lists
126  */
127 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
128 
129 LIST_HEAD(vm_page_action_list, vm_page_action);
130 
131 struct vm_page_action_hash {
132 	struct vm_page_action_list list;
133 	struct lock	lk;
134 } __cachealign;
135 
136 struct vm_page_action_hash	action_hash[VMACTION_HSIZE];
137 static volatile int vm_pages_waiting;
138 
139 static struct alist vm_contig_alist;
140 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
141 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
142 
143 static u_long vm_dma_reserved = 0;
144 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
145 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
146 	    "Memory reserved for DMA");
147 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
148 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
149 
150 static int vm_contig_verbose = 0;
151 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
152 
153 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
154 	     vm_pindex_t, pindex);
155 
156 static void
157 vm_page_queue_init(void)
158 {
159 	int i;
160 
161 	for (i = 0; i < PQ_L2_SIZE; i++)
162 		vm_page_queues[PQ_FREE+i].cnt_offset =
163 			offsetof(struct vmstats, v_free_count);
164 	for (i = 0; i < PQ_L2_SIZE; i++)
165 		vm_page_queues[PQ_CACHE+i].cnt_offset =
166 			offsetof(struct vmstats, v_cache_count);
167 	for (i = 0; i < PQ_L2_SIZE; i++)
168 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
169 			offsetof(struct vmstats, v_inactive_count);
170 	for (i = 0; i < PQ_L2_SIZE; i++)
171 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
172 			offsetof(struct vmstats, v_active_count);
173 	for (i = 0; i < PQ_L2_SIZE; i++)
174 		vm_page_queues[PQ_HOLD+i].cnt_offset =
175 			offsetof(struct vmstats, v_active_count);
176 	/* PQ_NONE has no queue */
177 
178 	for (i = 0; i < PQ_COUNT; i++) {
179 		TAILQ_INIT(&vm_page_queues[i].pl);
180 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
181 	}
182 
183 	/*
184 	 * NOTE: Action lock might recurse due to callback, so allow
185 	 *	 recursion.
186 	 */
187 	for (i = 0; i < VMACTION_HSIZE; i++) {
188 		LIST_INIT(&action_hash[i].list);
189 		lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE);
190 	}
191 }
192 
193 /*
194  * note: place in initialized data section?  Is this necessary?
195  */
196 long first_page = 0;
197 int vm_page_array_size = 0;
198 vm_page_t vm_page_array = NULL;
199 vm_paddr_t vm_low_phys_reserved;
200 
201 /*
202  * (low level boot)
203  *
204  * Sets the page size, perhaps based upon the memory size.
205  * Must be called before any use of page-size dependent functions.
206  */
207 void
208 vm_set_page_size(void)
209 {
210 	if (vmstats.v_page_size == 0)
211 		vmstats.v_page_size = PAGE_SIZE;
212 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
213 		panic("vm_set_page_size: page size not a power of two");
214 }
215 
216 /*
217  * (low level boot)
218  *
219  * Add a new page to the freelist for use by the system.  New pages
220  * are added to the head of the associated free page queue in a
221  * bottom-up fashion, so nominal page requests pull 'recent' adds
222  * (higher physical addresses) first.
223  *
224  * Beware that the page zeroing daemon will also be running soon after
225  * boot, moving pages from the head to the tail of the PQ_FREE queues.
226  *
227  * Must be called in a critical section.
228  */
229 static void
230 vm_add_new_page(vm_paddr_t pa)
231 {
232 	struct vpgqueues *vpq;
233 	vm_page_t m;
234 
235 	m = PHYS_TO_VM_PAGE(pa);
236 	m->phys_addr = pa;
237 	m->flags = 0;
238 	m->pat_mode = PAT_WRITE_BACK;
239 	m->pc = (pa >> PAGE_SHIFT);
240 
241 	/*
242 	 * Twist for cpu localization in addition to page coloring, so
243 	 * different cpus selecting by m->queue get different page colors.
244 	 */
245 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
246 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
247 	m->pc &= PQ_L2_MASK;
248 
249 	/*
250 	 * Reserve a certain number of contiguous low memory pages for
251 	 * contigmalloc() to use.
252 	 */
253 	if (pa < vm_low_phys_reserved) {
254 		atomic_add_int(&vmstats.v_page_count, 1);
255 		atomic_add_int(&vmstats.v_dma_pages, 1);
256 		m->queue = PQ_NONE;
257 		m->wire_count = 1;
258 		atomic_add_int(&vmstats.v_wire_count, 1);
259 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
260 		return;
261 	}
262 
263 	/*
264 	 * General page
265 	 */
266 	m->queue = m->pc + PQ_FREE;
267 	KKASSERT(m->dirty == 0);
268 
269 	atomic_add_int(&vmstats.v_page_count, 1);
270 	atomic_add_int(&vmstats.v_free_count, 1);
271 	vpq = &vm_page_queues[m->queue];
272 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
273 	++vpq->lcnt;
274 }
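/*
 * Worked example of the color twist computed above (a sketch assuming
 * PQ_L2_SIZE == 256, i.e. PQ_L2_MASK == 255, as in this configuration;
 * the page number 0x12345 is illustrative only):
 *
 *	n  = pa >> PAGE_SHIFT;				n = 0x12345
 *	pc = n ^ (n / 256) ^ (n / 65536);		0x12345 ^ 0x123 ^ 0x1
 *	pc &= 255;					pc = 0x67
 *
 * Consecutive pages normally land in consecutive colors, while pages
 * that are exact multiples of PQ_L2_SIZE apart are generally spread
 * across different colors instead of always sharing one.
 */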
275 
276 /*
277  * (low level boot)
278  *
279  * Initializes the resident memory module.
280  *
281  * Preallocates memory for critical VM structures and arrays prior to
282  * kernel_map becoming available.
283  *
284  * Memory is allocated from (virtual2_start, virtual2_end) if available,
285  * otherwise memory is allocated from (virtual_start, virtual_end).
286  *
287  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
288  * large enough to hold vm_page_array & other structures for machines with
289  * large amounts of ram, so we want to use virtual2* when available.
290  */
291 void
292 vm_page_startup(void)
293 {
294 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
295 	vm_offset_t mapped;
296 	vm_size_t npages;
297 	vm_paddr_t page_range;
298 	vm_paddr_t new_end;
299 	int i;
300 	vm_paddr_t pa;
301 	vm_paddr_t last_pa;
302 	vm_paddr_t end;
303 	vm_paddr_t biggestone, biggestsize;
304 	vm_paddr_t total;
305 	vm_page_t m;
306 
307 	total = 0;
308 	biggestsize = 0;
309 	biggestone = 0;
310 	vaddr = round_page(vaddr);
311 
312 	/*
313 	 * Make sure ranges are page-aligned.
314 	 */
315 	for (i = 0; phys_avail[i].phys_end; ++i) {
316 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
317 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
318 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
319 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
320 	}
321 
322 	/*
323 	 * Locate largest block
324 	 */
325 	for (i = 0; phys_avail[i].phys_end; ++i) {
326 		vm_paddr_t size = phys_avail[i].phys_end -
327 				  phys_avail[i].phys_beg;
328 
329 		if (size > biggestsize) {
330 			biggestone = i;
331 			biggestsize = size;
332 		}
333 		total += size;
334 	}
335 	--i;	/* adjust to last entry for use down below */
336 
337 	end = phys_avail[biggestone].phys_end;
338 	end = trunc_page(end);
339 
340 	/*
341 	 * Initialize the queue headers for the free queue, the active queue
342 	 * and the inactive queue.
343 	 */
344 	vm_page_queue_init();
345 
346 #if !defined(_KERNEL_VIRTUAL)
347 	/*
348 	 * VKERNELs don't support minidumps and as such don't need
349 	 * vm_page_dump
350 	 *
351 	 * Allocate a bitmap to indicate that a random physical page
352 	 * needs to be included in a minidump.
353 	 *
354 	 * The amd64 port needs this to indicate which direct map pages
355 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
356 	 *
357 	 * However, i386 still needs this workspace internally within the
358 	 * minidump code.  In theory, they are not needed on i386, but are
359 	 * included should the sf_buf code decide to use them.
360 	 */
361 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
362 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
363 	end -= vm_page_dump_size;
364 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
365 					VM_PROT_READ | VM_PROT_WRITE);
366 	bzero((void *)vm_page_dump, vm_page_dump_size);
367 #endif
368 	/*
369 	 * Compute the number of pages of memory that will be available for
370 	 * use (taking into account the overhead of a page structure per
371 	 * page).
372 	 */
373 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
374 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
375 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
376 
377 #ifndef _KERNEL_VIRTUAL
378 	/*
379 	 * (only applies to real kernels)
380 	 *
381 	 * Reserve a large amount of low memory for potential 32-bit DMA
382 	 * space allocations.  Once device initialization is complete we
383 	 * release most of it, but keep (vm_dma_reserved) memory reserved
384 	 * for later use.  Typically for X / graphics.  Through trial and
385  * error we find that GPUs usually require ~60-100MB or so.
386 	 *
387 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
388 	 */
389 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
390 	if (vm_low_phys_reserved > total / 4)
391 		vm_low_phys_reserved = total / 4;
392 	if (vm_dma_reserved == 0) {
393 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
394 		if (vm_dma_reserved > total / 16)
395 			vm_dma_reserved = total / 16;
396 	}
397 #endif
398 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
399 		   ALIST_RECORDS_65536);
400 
401 	/*
402 	 * Initialize the mem entry structures now, and put them in the free
403 	 * queue.
404 	 */
405 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
406 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
407 	vm_page_array = (vm_page_t)mapped;
408 
409 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
410 	/*
411 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
412 	 * we have to manually add these pages to the minidump tracking so
413 	 * that they can be dumped, including the vm_page_array.
414 	 */
415 	for (pa = new_end;
416 	     pa < phys_avail[biggestone].phys_end;
417 	     pa += PAGE_SIZE) {
418 		dump_add_page(pa);
419 	}
420 #endif
421 
422 	/*
423 	 * Clear all of the page structures, run basic initialization so
424 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
425 	 * map.
426 	 */
427 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
428 	vm_page_array_size = page_range;
429 
430 	m = &vm_page_array[0];
431 	pa = ptoa(first_page);
432 	for (i = 0; i < page_range; ++i) {
433 		spin_init(&m->spin, "vm_page");
434 		m->phys_addr = pa;
435 		pa += PAGE_SIZE;
436 		++m;
437 	}
438 
439 	/*
440 	 * Construct the free queue(s) in ascending order (by physical
441 	 * address) so that the first 16MB of physical memory is allocated
442 	 * last rather than first.  On large-memory machines, this avoids
443 	 * the exhaustion of low physical memory before isa_dmainit has run.
444 	 */
445 	vmstats.v_page_count = 0;
446 	vmstats.v_free_count = 0;
447 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
448 		pa = phys_avail[i].phys_beg;
449 		if (i == biggestone)
450 			last_pa = new_end;
451 		else
452 			last_pa = phys_avail[i].phys_end;
453 		while (pa < last_pa && npages-- > 0) {
454 			vm_add_new_page(pa);
455 			pa += PAGE_SIZE;
456 		}
457 	}
458 	if (virtual2_start)
459 		virtual2_start = vaddr;
460 	else
461 		virtual_start = vaddr;
462 	mycpu->gd_vmstats = vmstats;
463 }
464 
465 /*
466  * Reorganize VM pages based on numa data.  May be called as many times as
467  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
468  * to allow vm_page_alloc() to choose pages based on socket affinity.
469  *
470  * NOTE: This function is only called while we are still in UP mode, so
471  *	 we only need a critical section to protect the queues (which
472  *	 saves a lot of time, there are likely a ton of pages).
473  */
474 void
475 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
476 {
477 	vm_paddr_t scan_beg;
478 	vm_paddr_t scan_end;
479 	vm_paddr_t ran_end;
480 	struct vpgqueues *vpq;
481 	vm_page_t m;
482 	vm_page_t mend;
483 	int i;
484 	int socket_mod;
485 	int socket_value;
486 
487 	/*
488 	 * If there is no physical topology information, or there is only
489 	 * one socket, there is nothing to reorganize, so return early.
490 	 */
491 	if (cpu_topology_phys_ids <= 1 ||
492 	    cpu_topology_core_ids == 0) {
493 		return;
494 	}
495 
496 	/*
497 	 * Setup for our iteration.  Note that ACPI may iterate CPU
498 	 * sockets starting at 0 or 1 or some other number.  The
499 	 * cpu_topology code mod's it against the socket count.
500 	 */
501 	ran_end = ran_beg + bytes;
502 	physid %= cpu_topology_phys_ids;
503 
504 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
505 	socket_value = physid * socket_mod;
506 	mend = &vm_page_array[vm_page_array_size];
507 
508 	crit_enter();
509 
510 	/*
511 	 * Adjust vm_page->pc and requeue all affected pages.  The
512 	 * allocator will then be able to localize memory allocations
513 	 * to some degree.
514 	 */
515 	for (i = 0; phys_avail[i].phys_end; ++i) {
516 		scan_beg = phys_avail[i].phys_beg;
517 		scan_end = phys_avail[i].phys_end;
518 		if (scan_end <= ran_beg)
519 			continue;
520 		if (scan_beg >= ran_end)
521 			continue;
522 		if (scan_beg < ran_beg)
523 			scan_beg = ran_beg;
524 		if (scan_end > ran_end)
525 			scan_end = ran_end;
526 		if (atop(scan_end) > first_page + vm_page_array_size)
527 			scan_end = ptoa(first_page + vm_page_array_size);
528 
529 		m = PHYS_TO_VM_PAGE(scan_beg);
530 		while (scan_beg < scan_end) {
531 			KKASSERT(m < mend);
532 			if (m->queue != PQ_NONE) {
533 				vpq = &vm_page_queues[m->queue];
534 				TAILQ_REMOVE(&vpq->pl, m, pageq);
535 				--vpq->lcnt;
536 				/* queue doesn't change, no need to adj cnt */
537 				m->queue -= m->pc;
538 				m->pc %= socket_mod;
539 				m->pc += socket_value;
540 				m->pc &= PQ_L2_MASK;
541 				m->queue += m->pc;
542 				vpq = &vm_page_queues[m->queue];
543 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
544 				++vpq->lcnt;
545 				/* queue doesn't change, no need to adj cnt */
546 			} else {
547 				m->pc %= socket_mod;
548 				m->pc += socket_value;
549 				m->pc &= PQ_L2_MASK;
550 			}
551 			scan_beg += PAGE_SIZE;
552 			++m;
553 		}
554 	}
555 	crit_exit();
556 }
557 
558 /*
559  * We tended to reserve a ton of memory for contigmalloc().  Now that most
560  * drivers have initialized we want to return most of the remaining free
561  * reserve back to the VM page queues so they can be used for normal
562  * allocations.
563  *
564  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
565  */
566 static void
567 vm_page_startup_finish(void *dummy __unused)
568 {
569 	alist_blk_t blk;
570 	alist_blk_t rblk;
571 	alist_blk_t count;
572 	alist_blk_t xcount;
573 	alist_blk_t bfree;
574 	vm_page_t m;
575 
576 	spin_lock(&vm_contig_spin);
577 	for (;;) {
578 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
579 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
580 			break;
581 		if (count == 0)
582 			break;
583 
584 		/*
585 		 * Figure out how much of the initial reserve we have to
586 		 * free in order to reach our target.
587 		 */
588 		bfree -= vm_dma_reserved / PAGE_SIZE;
589 		if (count > bfree) {
590 			blk += count - bfree;
591 			count = bfree;
592 		}
593 
594 		/*
595 		 * Calculate the nearest power of 2 <= count.
596 		 */
597 		for (xcount = 1; xcount <= count; xcount <<= 1)
598 			;
599 		xcount >>= 1;
600 		blk += count - xcount;
601 		count = xcount;
602 
603 		/*
604 		 * Allocate the pages from the alist, then free them to
605 		 * the normal VM page queues.
606 		 *
607 		 * Pages allocated from the alist are wired.  We have to
608 		 * busy, unwire, and free them.  We must also adjust
609 		 * vm_low_phys_reserved before freeing any pages to prevent
610 		 * confusion.
611 		 */
612 		rblk = alist_alloc(&vm_contig_alist, blk, count);
613 		if (rblk != blk) {
614 			kprintf("vm_page_startup_finish: Unable to return "
615 				"dma space @0x%08x/%d -> 0x%08x\n",
616 				blk, count, rblk);
617 			break;
618 		}
619 		atomic_add_int(&vmstats.v_dma_pages, -count);
620 		spin_unlock(&vm_contig_spin);
621 
622 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
623 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
624 		while (count) {
625 			vm_page_busy_wait(m, FALSE, "cpgfr");
626 			vm_page_unwire(m, 0);
627 			vm_page_free(m);
628 			--count;
629 			++m;
630 		}
631 		spin_lock(&vm_contig_spin);
632 	}
633 	spin_unlock(&vm_contig_spin);
634 
635 	/*
636 	 * Print out how much DMA space drivers have already allocated and
637 	 * how much is left over.
638 	 */
639 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
640 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
641 		(PAGE_SIZE / 1024),
642 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
643 }
644 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
645 	vm_page_startup_finish, NULL);
646 
647 
648 /*
649  * Scan comparison function for Red-Black tree scans.  An inclusive
650  * (start,end) is expected.  Other fields are not used.
651  */
652 int
653 rb_vm_page_scancmp(struct vm_page *p, void *data)
654 {
655 	struct rb_vm_page_scan_info *info = data;
656 
657 	if (p->pindex < info->start_pindex)
658 		return(-1);
659 	if (p->pindex > info->end_pindex)
660 		return(1);
661 	return(0);
662 }
663 
664 int
665 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
666 {
667 	if (p1->pindex < p2->pindex)
668 		return(-1);
669 	if (p1->pindex > p2->pindex)
670 		return(1);
671 	return(0);
672 }
673 
674 void
675 vm_page_init(vm_page_t m)
676 {
677 	/* do nothing for now.  Called from pmap_page_init() */
678 }
679 
680 /*
681  * Each page queue has its own spin lock, which is fairly optimal for
682  * allocating and freeing pages at least.
683  *
684  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
685  * queue spinlock via this function.  Also note that m->queue cannot change
686  * unless both the page and queue are locked.
687  */
688 static __inline
689 void
690 _vm_page_queue_spin_lock(vm_page_t m)
691 {
692 	u_short queue;
693 
694 	queue = m->queue;
695 	if (queue != PQ_NONE) {
696 		spin_lock(&vm_page_queues[queue].spin);
697 		KKASSERT(queue == m->queue);
698 	}
699 }
700 
701 static __inline
702 void
703 _vm_page_queue_spin_unlock(vm_page_t m)
704 {
705 	u_short queue;
706 
707 	queue = m->queue;
708 	cpu_ccfence();
709 	if (queue != PQ_NONE)
710 		spin_unlock(&vm_page_queues[queue].spin);
711 }
712 
713 static __inline
714 void
715 _vm_page_queues_spin_lock(u_short queue)
716 {
717 	cpu_ccfence();
718 	if (queue != PQ_NONE)
719 		spin_lock(&vm_page_queues[queue].spin);
720 }
721 
722 
723 static __inline
724 void
725 _vm_page_queues_spin_unlock(u_short queue)
726 {
727 	cpu_ccfence();
728 	if (queue != PQ_NONE)
729 		spin_unlock(&vm_page_queues[queue].spin);
730 }
731 
732 void
733 vm_page_queue_spin_lock(vm_page_t m)
734 {
735 	_vm_page_queue_spin_lock(m);
736 }
737 
738 void
739 vm_page_queues_spin_lock(u_short queue)
740 {
741 	_vm_page_queues_spin_lock(queue);
742 }
743 
744 void
745 vm_page_queue_spin_unlock(vm_page_t m)
746 {
747 	_vm_page_queue_spin_unlock(m);
748 }
749 
750 void
751 vm_page_queues_spin_unlock(u_short queue)
752 {
753 	_vm_page_queues_spin_unlock(queue);
754 }
755 
756 /*
757  * This locks the specified vm_page and its queue in the proper order
758  * (page first, then queue).  The queue may change so the caller must
759  * recheck on return.
760  */
761 static __inline
762 void
763 _vm_page_and_queue_spin_lock(vm_page_t m)
764 {
765 	vm_page_spin_lock(m);
766 	_vm_page_queue_spin_lock(m);
767 }
768 
769 static __inline
770 void
771 _vm_page_and_queue_spin_unlock(vm_page_t m)
772 {
773 	_vm_page_queues_spin_unlock(m->queue);
774 	vm_page_spin_unlock(m);
775 }
776 
777 void
778 vm_page_and_queue_spin_unlock(vm_page_t m)
779 {
780 	_vm_page_and_queue_spin_unlock(m);
781 }
782 
783 void
784 vm_page_and_queue_spin_lock(vm_page_t m)
785 {
786 	_vm_page_and_queue_spin_lock(m);
787 }
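/*
 * Minimal sketch of the lock-then-recheck idiom implied by the comments
 * above (illustrative caller only; "expected_queue" is hypothetical):
 *
 *	vm_page_and_queue_spin_lock(m);
 *	if (m->queue == expected_queue) {
 *		...		page and its current queue are both locked
 *	}
 *	vm_page_and_queue_spin_unlock(m);
 *
 * Because m->queue cannot change while the page spinlock is held, the
 * unlock always releases whichever queue the page is currently on
 * (or nothing at all if the page is on PQ_NONE).
 */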
788 
789 /*
790  * Helper function removes vm_page from its current queue.
791  * Returns the base queue the page used to be on.
792  *
793  * The vm_page and the queue must be spinlocked.
794  * This function will unlock the queue but leave the page spinlocked.
795  */
796 static __inline u_short
797 _vm_page_rem_queue_spinlocked(vm_page_t m)
798 {
799 	struct vpgqueues *pq;
800 	u_short queue;
801 	u_short oqueue;
802 	int *cnt;
803 
804 	queue = m->queue;
805 	if (queue != PQ_NONE) {
806 		pq = &vm_page_queues[queue];
807 		TAILQ_REMOVE(&pq->pl, m, pageq);
808 
809 		/*
810 		 * Adjust our pcpu stats.  In order for the nominal low-memory
811 		 * algorithms to work properly we don't let any pcpu stat get
812 		 * too negative before we force it to be rolled-up into the
813 		 * global stats.  Otherwise our pageout and vm_wait tests
814 		 * will fail badly.
815 		 *
816 		 * The idea here is to reduce unnecessary SMP cache
817 		 * mastership changes in the global vmstats, which can be
818 		 * particularly bad in multi-socket systems.
819 		 */
820 		cnt = (int *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
821 		atomic_add_int(cnt, -1);
822 		if (*cnt < -VMMETER_SLOP_COUNT) {
823 			u_int copy = atomic_swap_int(cnt, 0);
824 			cnt = (int *)((char *)&vmstats + pq->cnt_offset);
825 			atomic_add_int(cnt, copy);
826 			cnt = (int *)((char *)&mycpu->gd_vmstats +
827 				      pq->cnt_offset);
828 			atomic_add_int(cnt, copy);
829 		}
830 		pq->lcnt--;
831 		m->queue = PQ_NONE;
832 		oqueue = queue;
833 		queue -= m->pc;
834 		vm_page_queues_spin_unlock(oqueue);	/* intended */
835 	}
836 	return queue;
837 }
838 
839 /*
840  * Helper function places the vm_page on the specified queue.
841  *
842  * The vm_page must be spinlocked.
843  * This function will return with both the page and the queue locked.
844  */
845 static __inline void
846 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
847 {
848 	struct vpgqueues *pq;
849 	u_int *cnt;
850 
851 	KKASSERT(m->queue == PQ_NONE);
852 
853 	if (queue != PQ_NONE) {
854 		vm_page_queues_spin_lock(queue);
855 		pq = &vm_page_queues[queue];
856 		++pq->lcnt;
857 
858 		/*
859 		 * Adjust our pcpu stats.  If a system entity really needs
860 		 * to incorporate the count it will call vmstats_rollup()
861 		 * to roll it all up into the global vmstats structure.
862 		 */
863 		cnt = (int *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
864 		atomic_add_int(cnt, 1);
865 
866 		/*
867 		 * PQ_FREE is always handled LIFO style to try to provide
868 		 * cache-hot pages to programs.
869 		 */
870 		m->queue = queue;
871 		if (queue - m->pc == PQ_FREE) {
872 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
873 		} else if (athead) {
874 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
875 		} else {
876 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
877 		}
878 		/* leave the queue spinlocked */
879 	}
880 }
881 
882 /*
883  * Wait until the page is no longer PG_BUSY or, if also_m_busy is TRUE,
884  * until the page is no longer soft-busied either.  Only one sleep call
885  * will be made before returning.
886  *
887  * This function does NOT busy the page and on return the page is not
888  * guaranteed to be available.
889  */
890 void
891 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
892 {
893 	u_int32_t flags;
894 
895 	for (;;) {
896 		flags = m->flags;
897 		cpu_ccfence();
898 
899 		if ((flags & PG_BUSY) == 0 &&
900 		    (also_m_busy == 0 || (flags & PG_SBUSY) == 0)) {
901 			break;
902 		}
903 		tsleep_interlock(m, 0);
904 		if (atomic_cmpset_int(&m->flags, flags,
905 				      flags | PG_WANTED | PG_REFERENCED)) {
906 			tsleep(m, PINTERLOCKED, msg, 0);
907 			break;
908 		}
909 	}
910 }
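/*
 * Typical wait-and-recheck sketch for vm_page_sleep_busy() above
 * (hypothetical caller; assumes the object token is held for the
 * lookups and that "obj"/"pindex" are supplied by the caller):
 *
 *	vm_page_t m;
 *
 *	while ((m = vm_page_lookup(obj, pindex)) != NULL) {
 *		if ((m->flags & PG_BUSY) == 0)
 *			break;
 *		vm_page_sleep_busy(m, FALSE, "pgwait");
 *	}
 *
 * Since only one sleep is made per call and the page is not busied on
 * return, the caller must re-lookup and re-test after each sleep.
 */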
911 
912 /*
913  * This calculates and returns a page color given a cpu, an optional VM
914  * object, and a pindex.  We attempt to return a cpu-localized
915  * pg_color that is still roughly 16-way set-associative.  The CPU topology
916  * is used if it was probed.
917  *
918  * The caller may use the returned value to index into e.g. PQ_FREE when
919  * allocating a page in order to nominally obtain pages that are hopefully
920  * already localized to the requesting cpu.  This function is not able to
921  * provide any sort of guarantee of this, but does its best to improve
922  * hardware cache management performance.
923  *
924  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
925  */
926 u_short
927 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
928 {
929 	u_short pg_color;
930 	int phys_id;
931 	int core_id;
932 	int object_pg_color;
933 
934 	phys_id = get_cpu_phys_id(cpuid);
935 	core_id = get_cpu_core_id(cpuid);
936 	object_pg_color = object ? object->pg_color : 0;
937 
938 	if (cpu_topology_phys_ids && cpu_topology_core_ids) {
939 		int grpsize;
940 
941 		/*
942 		 * Break us down by socket and cpu
943 		 */
944 		pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
945 		pg_color += core_id * PQ_L2_SIZE /
946 			    (cpu_topology_core_ids * cpu_topology_phys_ids);
947 
948 		/*
949 		 * Calculate remaining component for object/queue color
950 		 */
951 		grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
952 					cpu_topology_phys_ids);
953 		if (grpsize >= 8) {
954 			pg_color += (pindex + object_pg_color) % grpsize;
955 		} else {
956 			if (grpsize <= 2) {
957 				grpsize = 8;
958 			} else {
959 				/* 3->12, 4->8, 5->10, 6->12, 7->14 */
960 				grpsize += grpsize;
961 				if (grpsize < 8)
962 					grpsize += grpsize;
963 			}
964 			pg_color += (pindex + object_pg_color) % grpsize;
965 		}
966 	} else {
967 		/*
968 		 * Unknown topology, distribute things evenly.
969 		 */
970 		pg_color = cpuid * PQ_L2_SIZE / ncpus;
971 		pg_color += pindex + object_pg_color;
972 	}
973 	return (pg_color & PQ_L2_MASK);
974 }
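/*
 * Usage sketch for the color calculation above (illustrative only,
 * mirroring how vm_page_alloc() later in this file computes pg_color):
 *
 *	u_short pg_color;
 *	vm_page_t m;
 *
 *	pg_color = vm_get_pg_color(mycpu->gd_cpuid, object, pindex);
 *	m = vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK, FALSE);
 *
 * Per the WARNING above, the color is masked with PQ_L2_MASK before it
 * is used as a queue index.
 */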
975 
976 /*
977  * Wait until PG_BUSY can be set, then set it.  If also_m_busy is TRUE we
978  * also wait for m->busy to become 0 before setting PG_BUSY.
979  */
980 void
981 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
982 				     int also_m_busy, const char *msg
983 				     VM_PAGE_DEBUG_ARGS)
984 {
985 	u_int32_t flags;
986 
987 	for (;;) {
988 		flags = m->flags;
989 		cpu_ccfence();
990 		if (flags & PG_BUSY) {
991 			tsleep_interlock(m, 0);
992 			if (atomic_cmpset_int(&m->flags, flags,
993 					  flags | PG_WANTED | PG_REFERENCED)) {
994 				tsleep(m, PINTERLOCKED, msg, 0);
995 			}
996 		} else if (also_m_busy && (flags & PG_SBUSY)) {
997 			tsleep_interlock(m, 0);
998 			if (atomic_cmpset_int(&m->flags, flags,
999 					  flags | PG_WANTED | PG_REFERENCED)) {
1000 				tsleep(m, PINTERLOCKED, msg, 0);
1001 			}
1002 		} else {
1003 			if (atomic_cmpset_int(&m->flags, flags,
1004 					      flags | PG_BUSY)) {
1005 #ifdef VM_PAGE_DEBUG
1006 				m->busy_func = func;
1007 				m->busy_line = lineno;
1008 #endif
1009 				break;
1010 			}
1011 		}
1012 	}
1013 }
1014 
1015 /*
1016  * Attempt to set PG_BUSY.  If also_m_busy is TRUE we only succeed if m->busy
1017  * is also 0.
1018  *
1019  * Returns non-zero on failure.
1020  */
1021 int
1022 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1023 				    VM_PAGE_DEBUG_ARGS)
1024 {
1025 	u_int32_t flags;
1026 
1027 	for (;;) {
1028 		flags = m->flags;
1029 		cpu_ccfence();
1030 		if (flags & PG_BUSY)
1031 			return TRUE;
1032 		if (also_m_busy && (flags & PG_SBUSY))
1033 			return TRUE;
1034 		if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
1035 #ifdef VM_PAGE_DEBUG
1036 			m->busy_func = func;
1037 			m->busy_line = lineno;
1038 #endif
1039 			return FALSE;
1040 		}
1041 	}
1042 }
1043 
1044 /*
1045  * Clear the PG_BUSY flag and return non-zero to indicate to the caller
1046  * that a wakeup() should be performed.
1047  *
1048  * The vm_page must be spinlocked and will remain spinlocked on return.
1049  * The related queue must NOT be spinlocked (which could deadlock us).
1050  *
1051  * (inline version)
1052  */
1053 static __inline
1054 int
1055 _vm_page_wakeup(vm_page_t m)
1056 {
1057 	u_int32_t flags;
1058 
1059 	for (;;) {
1060 		flags = m->flags;
1061 		cpu_ccfence();
1062 		if (atomic_cmpset_int(&m->flags, flags,
1063 				      flags & ~(PG_BUSY | PG_WANTED))) {
1064 			break;
1065 		}
1066 	}
1067 	return(flags & PG_WANTED);
1068 }
1069 
1070 /*
1071  * Clear the PG_BUSY flag and wakeup anyone waiting for the page.  This
1072  * is typically the last call you make on a page before moving onto
1073  * other things.
1074  */
1075 void
1076 vm_page_wakeup(vm_page_t m)
1077 {
1078 	KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
1079 	vm_page_spin_lock(m);
1080 	if (_vm_page_wakeup(m)) {
1081 		vm_page_spin_unlock(m);
1082 		wakeup(m);
1083 	} else {
1084 		vm_page_spin_unlock(m);
1085 	}
1086 }
1087 
1088 /*
1089  * Holding a page keeps it from being reused.  Other parts of the system
1090  * can still disassociate the page from its current object and free it, or
1091  * perform read or write I/O on it and/or otherwise manipulate the page,
1092  * but if the page is held the VM system will leave the page and its data
1093  * intact and not reuse the page for other purposes until the last hold
1094  * reference is released.  (see vm_page_wire() if you want to prevent the
1095  * page from being disassociated from its object too).
1096  *
1097  * The caller must still validate the contents of the page and, if necessary,
1098  * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
1099  * before manipulating the page.
1100  *
1101  * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
1102  */
1103 void
1104 vm_page_hold(vm_page_t m)
1105 {
1106 	vm_page_spin_lock(m);
1107 	atomic_add_int(&m->hold_count, 1);
1108 	if (m->queue - m->pc == PQ_FREE) {
1109 		_vm_page_queue_spin_lock(m);
1110 		_vm_page_rem_queue_spinlocked(m);
1111 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
1112 		_vm_page_queue_spin_unlock(m);
1113 	}
1114 	vm_page_spin_unlock(m);
1115 }
1116 
1117 /*
1118  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1119  * it was freed while held and must be moved back to the FREE queue.
1120  */
1121 void
1122 vm_page_unhold(vm_page_t m)
1123 {
1124 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1125 		("vm_page_unhold: pg %p illegal hold_count (%d) or on FREE queue (%d)",
1126 		 m, m->hold_count, m->queue - m->pc));
1127 	vm_page_spin_lock(m);
1128 	atomic_add_int(&m->hold_count, -1);
1129 	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1130 		_vm_page_queue_spin_lock(m);
1131 		_vm_page_rem_queue_spinlocked(m);
1132 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
1133 		_vm_page_queue_spin_unlock(m);
1134 	}
1135 	vm_page_spin_unlock(m);
1136 }
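/*
 * Hold/unhold usage sketch (illustrative only).  A hold keeps the page
 * and its data intact across a potentially blocking operation without
 * busying the page:
 *
 *	vm_page_hold(m);
 *	...		operations that may block or drop other locks
 *	vm_page_unhold(m);
 *
 * If the page is freed while held it is parked on PQ_HOLD, and the
 * final vm_page_unhold() moves it back to PQ_FREE as coded above.
 */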
1137 
1138 /*
1139  *	vm_page_initfake:
1140  *
1141  *	Create a fictitious page with the specified physical address and
1142  *	memory attribute.  The memory attribute is the only machine-
1143  *	dependent aspect of a fictitious page that must be initialized.
1144  */
1145 
1146 void
1147 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1148 {
1149 
1150 	if ((m->flags & PG_FICTITIOUS) != 0) {
1151 		/*
1152 		 * The page's memattr might have changed since the
1153 		 * previous initialization.  Update the pmap to the
1154 		 * new memattr.
1155 		 */
1156 		goto memattr;
1157 	}
1158 	m->phys_addr = paddr;
1159 	m->queue = PQ_NONE;
1160 	/* Fictitious pages don't use "segind". */
1161 	/* Fictitious pages don't use "order" or "pool". */
1162 	m->flags = PG_FICTITIOUS | PG_UNMANAGED | PG_BUSY;
1163 	m->wire_count = 1;
1164 	spin_init(&m->spin, "fake_page");
1165 	pmap_page_init(m);
1166 memattr:
1167 	pmap_page_set_memattr(m, memattr);
1168 }
1169 
1170 /*
1171  * Inserts the given vm_page into the object and object list.
1172  *
1173  * The pagetables are not updated but will presumably fault the page
1174  * in if necessary, or if a kernel page the caller will at some point
1175  * enter the page into the kernel's pmap.  We are not allowed to block
1176  * here so we *can't* do this anyway.
1177  *
1178  * This routine may not block.
1179  * This routine must be called with the vm_object held.
1180  * This routine must be called with a critical section held.
1181  *
1182  * This routine returns TRUE if the page was inserted into the object
1183  * successfully, and FALSE if the page already exists in the object.
1184  */
1185 int
1186 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1187 {
1188 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1189 	if (m->object != NULL)
1190 		panic("vm_page_insert: already inserted");
1191 
1192 	object->generation++;
1193 
1194 	/*
1195 	 * Record the object/offset pair in this page and add the
1196 	 * pv_list_count of the page to the object.
1197 	 *
1198 	 * The vm_page spin lock is required for interactions with the pmap.
1199 	 */
1200 	vm_page_spin_lock(m);
1201 	m->object = object;
1202 	m->pindex = pindex;
1203 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1204 		m->object = NULL;
1205 		m->pindex = 0;
1206 		vm_page_spin_unlock(m);
1207 		return FALSE;
1208 	}
1209 	++object->resident_page_count;
1210 	++mycpu->gd_vmtotal.t_rm;
1211 	/* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */
1212 	vm_page_spin_unlock(m);
1213 
1214 	/*
1215 	 * Since we are inserting a new and possibly dirty page,
1216 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1217 	 */
1218 	if ((m->valid & m->dirty) ||
1219 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1220 		vm_object_set_writeable_dirty(object);
1221 
1222 	/*
1223 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1224 	 */
1225 	swap_pager_page_inserted(m);
1226 	return TRUE;
1227 }
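/*
 * Collision-handling sketch for vm_page_insert() (hypothetical caller;
 * assumes the object token is held and that m is a newly allocated,
 * busied page not yet associated with any object):
 *
 *	if (vm_page_insert(m, object, pindex) == FALSE) {
 *		vm_page_free(m);
 *		m = vm_page_lookup(object, pindex);
 *		...		someone else won the race; use their page
 *	}
 */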
1228 
1229 /*
1230  * Removes the given vm_page_t from the (object,index) table
1231  *
1232  * The underlying pmap entry (if any) is NOT removed here.
1233  * This routine may not block.
1234  *
1235  * The page must be BUSY and will remain BUSY on return.
1236  * No other requirements.
1237  *
1238  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1239  *	 it busy.
1240  */
1241 void
1242 vm_page_remove(vm_page_t m)
1243 {
1244 	vm_object_t object;
1245 
1246 	if (m->object == NULL) {
1247 		return;
1248 	}
1249 
1250 	if ((m->flags & PG_BUSY) == 0)
1251 		panic("vm_page_remove: page not busy");
1252 
1253 	object = m->object;
1254 
1255 	vm_object_hold(object);
1256 
1257 	/*
1258 	 * Remove the page from the object and update the object.
1259 	 *
1260 	 * The vm_page spin lock is required for interactions with the pmap.
1261 	 */
1262 	vm_page_spin_lock(m);
1263 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1264 	--object->resident_page_count;
1265 	--mycpu->gd_vmtotal.t_rm;
1266 	/* atomic_add_int(&object->agg_pv_list_count, -m->md.pv_list_count); */
1267 	m->object = NULL;
1268 	vm_page_spin_unlock(m);
1269 
1270 	object->generation++;
1271 
1272 	vm_object_drop(object);
1273 }
1274 
1275 /*
1276  * Locate and return the page at (object, pindex), or NULL if the
1277  * page could not be found.
1278  *
1279  * The caller must hold the vm_object token.
1280  */
1281 vm_page_t
1282 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1283 {
1284 	vm_page_t m;
1285 
1286 	/*
1287 	 * Search the hash table for this object/offset pair
1288 	 */
1289 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1290 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1291 	KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
1292 	return(m);
1293 }
1294 
1295 vm_page_t
1296 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1297 					    vm_pindex_t pindex,
1298 					    int also_m_busy, const char *msg
1299 					    VM_PAGE_DEBUG_ARGS)
1300 {
1301 	u_int32_t flags;
1302 	vm_page_t m;
1303 
1304 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1305 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1306 	while (m) {
1307 		KKASSERT(m->object == object && m->pindex == pindex);
1308 		flags = m->flags;
1309 		cpu_ccfence();
1310 		if (flags & PG_BUSY) {
1311 			tsleep_interlock(m, 0);
1312 			if (atomic_cmpset_int(&m->flags, flags,
1313 					  flags | PG_WANTED | PG_REFERENCED)) {
1314 				tsleep(m, PINTERLOCKED, msg, 0);
1315 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1316 							      pindex);
1317 			}
1318 		} else if (also_m_busy && (flags & PG_SBUSY)) {
1319 			tsleep_interlock(m, 0);
1320 			if (atomic_cmpset_int(&m->flags, flags,
1321 					  flags | PG_WANTED | PG_REFERENCED)) {
1322 				tsleep(m, PINTERLOCKED, msg, 0);
1323 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1324 							      pindex);
1325 			}
1326 		} else if (atomic_cmpset_int(&m->flags, flags,
1327 					     flags | PG_BUSY)) {
1328 #ifdef VM_PAGE_DEBUG
1329 			m->busy_func = func;
1330 			m->busy_line = lineno;
1331 #endif
1332 			break;
1333 		}
1334 	}
1335 	return m;
1336 }
1337 
1338 /*
1339  * Attempt to lookup and busy a page.
1340  *
1341  * Returns NULL if the page could not be found
1342  *
1343  * Returns a vm_page and error == TRUE if the page exists but could not
1344  * be busied.
1345  *
1346  * Returns a vm_page and error == FALSE on success.
1347  */
1348 vm_page_t
1349 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1350 					   vm_pindex_t pindex,
1351 					   int also_m_busy, int *errorp
1352 					   VM_PAGE_DEBUG_ARGS)
1353 {
1354 	u_int32_t flags;
1355 	vm_page_t m;
1356 
1357 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1358 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1359 	*errorp = FALSE;
1360 	while (m) {
1361 		KKASSERT(m->object == object && m->pindex == pindex);
1362 		flags = m->flags;
1363 		cpu_ccfence();
1364 		if (flags & PG_BUSY) {
1365 			*errorp = TRUE;
1366 			break;
1367 		}
1368 		if (also_m_busy && (flags & PG_SBUSY)) {
1369 			*errorp = TRUE;
1370 			break;
1371 		}
1372 		if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
1373 #ifdef VM_PAGE_DEBUG
1374 			m->busy_func = func;
1375 			m->busy_line = lineno;
1376 #endif
1377 			break;
1378 		}
1379 	}
1380 	return m;
1381 }
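/*
 * Usage sketch for the try-lookup above (illustrative only):
 *
 *	int error;
 *	vm_page_t m;
 *
 *	m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
 *	if (m == NULL) {
 *		...		no page exists at this pindex
 *	} else if (error) {
 *		...		page exists but is busy and was NOT busied
 *				for us; typically sleep and retry
 *	} else {
 *		...		m is returned busied
 *	}
 */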
1382 
1383 /*
1384  * Attempt to repurpose the passed-in page.  If the passed-in page cannot
1385  * be repurposed it will be released, *must_reenter will be set to 1, and
1386  * this function will fall-through to vm_page_lookup_busy_try().
1387  *
1388  * The passed-in page must be wired and not busy.  A different page may
1389  * be returned.  In either case the returned page will be busied and
1390  * not wired.
1393  *
1394  * NULL can be returned.  If so, the required page could not be busied.
1395  * The passed-in page will be unwired.
1396  */
1397 vm_page_t
1398 vm_page_repurpose(struct vm_object *object, vm_pindex_t pindex,
1399 		  int also_m_busy, int *errorp, vm_page_t m,
1400 		  int *must_reenter, int *iswired)
1401 {
1402 	if (m) {
1403 		/*
1404 		 * Do not mess with pages in a complex state, such as pages
1405 		 * which are mapped, as repurposing such pages can be more
1406 		 * expensive than simply allocating a new one.
1407 		 *
1408 		 * NOTE: Soft-busying can deadlock against putpages or I/O
1409 		 *	 so we only allow hard-busying here.
1410 		 */
1411 		KKASSERT(also_m_busy == FALSE);
1412 		vm_page_busy_wait(m, also_m_busy, "biodep");
1413 
1414 		if ((m->flags & (PG_UNMANAGED | PG_MAPPED |
1415 				 PG_FICTITIOUS | PG_SBUSY)) ||
1416 		    m->busy || m->wire_count != 1 || m->hold_count) {
1417 			vm_page_unwire(m, 0);
1418 			vm_page_wakeup(m);
1419 			/* fall through to normal lookup */
1420 		} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
1421 			vm_page_unwire(m, 0);
1422 			vm_page_deactivate(m);
1423 			vm_page_wakeup(m);
1424 			/* fall through to normal lookup */
1425 		} else {
1426 			/*
1427 			 * We can safely repurpose the page.  It should
1428 			 * already be unqueued.
1429 			 */
1430 			KKASSERT(m->queue == PQ_NONE && m->dirty == 0);
1431 			vm_page_remove(m);
1432 			m->valid = 0;
1433 			m->act_count = 0;
1434 			if (vm_page_insert(m, object, pindex)) {
1435 				*errorp = 0;
1436 				*iswired = 1;
1437 
1438 				return m;
1439 			}
1440 			vm_page_unwire(m, 0);
1441 			vm_page_free(m);
1442 			/* fall through to normal lookup */
1443 		}
1444 	}
1445 
1446 	/*
1447 	 * Cannot repurpose page, attempt to locate the desired page.  May
1448 	 * return NULL.
1449 	 */
1450 	*must_reenter = 1;
1451 	*iswired = 0;
1452 	m = vm_page_lookup_busy_try(object, pindex, also_m_busy, errorp);
1453 
1454 	return m;
1455 }
1456 
1457 /*
1458  * Caller must hold the related vm_object
1459  */
1460 vm_page_t
1461 vm_page_next(vm_page_t m)
1462 {
1463 	vm_page_t next;
1464 
1465 	next = vm_page_rb_tree_RB_NEXT(m);
1466 	if (next && next->pindex != m->pindex + 1)
1467 		next = NULL;
1468 	return (next);
1469 }
1470 
1471 /*
1472  * vm_page_rename()
1473  *
1474  * Move the given vm_page from its current object to the specified
1475  * target object/offset.  The page must be busy and will remain so
1476  * on return.
1477  *
1478  * new_object must be held.
1479  * This routine might block. XXX ?
1480  *
1481  * NOTE: Swap associated with the page must be invalidated by the move.  We
1482  *       have to do this for several reasons:  (1) we aren't freeing the
1483  *       page, (2) we are dirtying the page, (3) the VM system is probably
1484  *       moving the page from object A to B, and will then later move
1485  *       the backing store from A to B and we can't have a conflict.
1486  *
1487  * NOTE: We *always* dirty the page.  It is necessary both for the
1488  *       fact that we moved it, and because we may be invalidating
1489  *	 swap.  If the page is on the cache, we have to deactivate it
1490  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1491  *	 on the cache.
1492  */
1493 void
1494 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1495 {
1496 	KKASSERT(m->flags & PG_BUSY);
1497 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1498 	if (m->object) {
1499 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1500 		vm_page_remove(m);
1501 	}
1502 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1503 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1504 		      new_object, new_pindex);
1505 	}
1506 	if (m->queue - m->pc == PQ_CACHE)
1507 		vm_page_deactivate(m);
1508 	vm_page_dirty(m);
1509 }
1510 
1511 /*
1512  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1513  * is to remain BUSY'd by the caller.
1514  *
1515  * This routine may not block.
1516  */
1517 void
1518 vm_page_unqueue_nowakeup(vm_page_t m)
1519 {
1520 	vm_page_and_queue_spin_lock(m);
1521 	(void)_vm_page_rem_queue_spinlocked(m);
1522 	vm_page_spin_unlock(m);
1523 }
1524 
1525 /*
1526  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
1527  * if necessary.
1528  *
1529  * This routine may not block.
1530  */
1531 void
1532 vm_page_unqueue(vm_page_t m)
1533 {
1534 	u_short queue;
1535 
1536 	vm_page_and_queue_spin_lock(m);
1537 	queue = _vm_page_rem_queue_spinlocked(m);
1538 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1539 		vm_page_spin_unlock(m);
1540 		pagedaemon_wakeup();
1541 	} else {
1542 		vm_page_spin_unlock(m);
1543 	}
1544 }
1545 
1546 /*
1547  * vm_page_list_find()
1548  *
1549  * Find a page on the specified queue with color optimization.
1550  *
1551  * The page coloring optimization attempts to locate a page that does
1552  * not overload other nearby pages in the object in the cpu's L1 or L2
1553  * caches.  We need this optimization because cpu caches tend to be
1554  * physical caches, while object spaces tend to be virtual.
1555  *
1556  * The page coloring optimization also, very importantly, tries to localize
1557  * memory to cpus and physical sockets.
1558  *
1559  * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
1560  * and the algorithm is adjusted to localize allocations on a per-core basis.
1561  * This is done by 'twisting' the colors.
1562  *
1563  * The page is returned spinlocked and removed from its queue (it will
1564  * be on PQ_NONE), or NULL. The page is not PG_BUSY'd.  The caller
1565  * is responsible for dealing with the busy-page case (usually by
1566  * deactivating the page and looping).
1567  *
1568  * NOTE:  This routine is carefully inlined.  A non-inlined version
1569  *	  is available for outside callers but the only critical path is
1570  *	  from within this source file.
1571  *
1572  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1573  *	  represent stable storage, allowing us to order our locks vm_page
1574  *	  first, then queue.
1575  */
1576 static __inline
1577 vm_page_t
1578 _vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
1579 {
1580 	vm_page_t m;
1581 
1582 	for (;;) {
1583 		if (prefer_zero) {
1584 			m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl,
1585 				       pglist);
1586 		} else {
1587 			m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
1588 		}
1589 		if (m == NULL) {
1590 			m = _vm_page_list_find2(basequeue, index);
1591 			return(m);
1592 		}
1593 		vm_page_and_queue_spin_lock(m);
1594 		if (m->queue == basequeue + index) {
1595 			_vm_page_rem_queue_spinlocked(m);
1596 			/* vm_page_t spin held, no queue spin */
1597 			break;
1598 		}
1599 		vm_page_and_queue_spin_unlock(m);
1600 	}
1601 	return(m);
1602 }
1603 
1604 /*
1605  * If we could not find a page in the desired queue, try to find one in
1606  * a nearby queue.
1607  */
1608 static vm_page_t
1609 _vm_page_list_find2(int basequeue, int index)
1610 {
1611 	struct vpgqueues *pq;
1612 	vm_page_t m = NULL;
1613 	int pqmask = PQ_SET_ASSOC_MASK >> 1;
1614 	int pqi;
1615 	int i;
1616 
1617 	index &= PQ_L2_MASK;
1618 	pq = &vm_page_queues[basequeue];
1619 
1620 	/*
1621 	 * Run local sets of 16, 32, 64, 128, and the whole queue if all
1622 	 * else fails (PQ_L2_MASK which is 255).
1623 	 */
1624 	do {
1625 		pqmask = (pqmask << 1) | 1;
1626 		for (i = 0; i <= pqmask; ++i) {
1627 			pqi = (index & ~pqmask) | ((index + i) & pqmask);
1628 			m = TAILQ_FIRST(&pq[pqi].pl);
1629 			if (m) {
1630 				_vm_page_and_queue_spin_lock(m);
1631 				if (m->queue == basequeue + pqi) {
1632 					_vm_page_rem_queue_spinlocked(m);
1633 					return(m);
1634 				}
1635 				_vm_page_and_queue_spin_unlock(m);
1636 				--i;
1637 				continue;
1638 			}
1639 		}
1640 	} while (pqmask != PQ_L2_MASK);
1641 
1642 	return(m);
1643 }
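/*
 * Worked example of the nearby-queue scan above: with index == 37 the
 * first pass uses pqmask == 15, so pqi visits 37, 38, ... 47, 32, 33,
 * ... 36, i.e. the aligned 16-entry group containing the requested
 * index.  Each subsequent pass doubles the group size (32, 64, 128)
 * until the final pass covers all PQ_L2_MASK + 1 queues.
 */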
1644 
1645 /*
1646  * Returns a vm_page candidate for allocation.  The page is not busied so
1647  * it can move around.  The caller must busy the page (and typically
1648  * deactivate it if it cannot be busied!)
1649  *
1650  * Returns a spinlocked vm_page that has been removed from its queue.
1651  */
1652 vm_page_t
1653 vm_page_list_find(int basequeue, int index, boolean_t prefer_zero)
1654 {
1655 	return(_vm_page_list_find(basequeue, index, prefer_zero));
1656 }
1657 
1658 /*
1659  * Find a page on the cache queue with color optimization, remove it
1660  * from the queue, and busy it.  The returned page will not be spinlocked.
1661  *
1662  * A candidate can fail, typically because it is busied by someone else;
1663  * failed candidates are deactivated and the search continues.
1664  *
1665  * This routine may not block.
1666  *
1667  */
1668 static vm_page_t
1669 vm_page_select_cache(u_short pg_color)
1670 {
1671 	vm_page_t m;
1672 
1673 	for (;;) {
1674 		m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK, FALSE);
1675 		if (m == NULL)
1676 			break;
1677 		/*
1678 		 * (m) has been removed from its queue and spinlocked
1679 		 */
1680 		if (vm_page_busy_try(m, TRUE)) {
1681 			_vm_page_deactivate_locked(m, 0);
1682 			vm_page_spin_unlock(m);
1683 		} else {
1684 			/*
1685 			 * We successfully busied the page
1686 			 */
1687 			if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
1688 			    m->hold_count == 0 &&
1689 			    m->wire_count == 0 &&
1690 			    (m->dirty & m->valid) == 0) {
1691 				vm_page_spin_unlock(m);
1692 				pagedaemon_wakeup();
1693 				return(m);
1694 			}
1695 
1696 			/*
1697 			 * The page cannot be recycled, deactivate it.
1698 			 */
1699 			_vm_page_deactivate_locked(m, 0);
1700 			if (_vm_page_wakeup(m)) {
1701 				vm_page_spin_unlock(m);
1702 				wakeup(m);
1703 			} else {
1704 				vm_page_spin_unlock(m);
1705 			}
1706 		}
1707 	}
1708 	return (m);
1709 }
1710 
1711 /*
1712  * Find a free or zero page, with specified preference.  We attempt to
1713  * inline the nominal case and fall back to _vm_page_list_find2()
1714  * otherwise.  A busied page is removed from the queue and returned.
1715  *
1716  * This routine may not block.
1717  */
1718 static __inline vm_page_t
1719 vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
1720 {
1721 	vm_page_t m;
1722 
1723 	for (;;) {
1724 		m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK,
1725 				       prefer_zero);
1726 		if (m == NULL)
1727 			break;
1728 		if (vm_page_busy_try(m, TRUE)) {
1729 			/*
1730 			 * Various mechanisms such as a pmap_collect can
1731 			 * result in a busy page on the free queue.  We
1732 			 * have to move the page out of the way so we can
1733 			 * retry the allocation.  If the other thread is not
1734 			 * allocating the page then m->valid will remain 0 and
1735 			 * the pageout daemon will free the page later on.
1736 			 *
1737 			 * Since we could not busy the page, however, we
1738 			 * cannot make assumptions as to whether the page
1739 			 * will be allocated by the other thread or not,
1740 			 * so all we can do is deactivate it to move it out
1741 			 * of the way.  In particular, if the other thread
1742 			 * wires the page it may wind up on the inactive
1743 			 * queue and the pageout daemon will have to deal
1744 			 * with that case too.
1745 			 */
1746 			_vm_page_deactivate_locked(m, 0);
1747 			vm_page_spin_unlock(m);
1748 		} else {
1749 			/*
1750 			 * Theoretically if we are able to busy the page
1751 			 * atomic with the queue removal (using the vm_page
1752 			 * lock) nobody else should be able to mess with the
1753 			 * page before us.
1754 			 */
1755 			KKASSERT((m->flags & (PG_UNMANAGED |
1756 					      PG_NEED_COMMIT)) == 0);
1757 			KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
1758 						     "pg %p q=%d flags=%08x hold=%d wire=%d",
1759 						     m, m->queue, m->flags, m->hold_count, m->wire_count));
1760 			KKASSERT(m->wire_count == 0);
1761 			vm_page_spin_unlock(m);
1762 			pagedaemon_wakeup();
1763 
1764 			/* return busied and removed page */
1765 			return(m);
1766 		}
1767 	}
1768 	return(m);
1769 }
1770 
1771 /*
1772  * vm_page_alloc()
1773  *
1774  * Allocate and return a memory cell associated with this VM object/offset
1775  * pair.  If object is NULL an unassociated page will be allocated.
1776  *
1777  * The returned page will be busied and removed from its queues.  This
1778  * routine can block and may return NULL if a race occurs and the page
1779  * is found to already exist at the specified (object, pindex).
1780  *
1781  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
1782  *	VM_ALLOC_QUICK		like normal but cannot use cache
1783  *	VM_ALLOC_SYSTEM		greater free drain
1784  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
1785  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
1786  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
1787  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
1788  *				(see vm_page_grab())
1789  *	VM_ALLOC_USE_GD		ok to use per-gd cache
1790  *
1791  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
1792  *
1793  * The object must be held if not NULL.
1794  * This routine may not block.
1795  *
1796  * Additional special handling is required when called from an interrupt
1797  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
1798  * in this case.
1799  */
1800 vm_page_t
1801 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
1802 {
1803 	globaldata_t gd;
1804 	vm_object_t obj;
1805 	vm_page_t m;
1806 	u_short pg_color;
1807 	int cpuid_local;
1808 
1809 #if 0
1810 	/*
1811 	 * Special per-cpu free VM page cache.  The pages are pre-busied
1812 	 * and pre-zeroed for us.
1813 	 */
1814 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
1815 		crit_enter_gd(gd);
1816 		if (gd->gd_vmpg_count) {
1817 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
1818 			crit_exit_gd(gd);
1819 			goto done;
1820 		}
1821 		crit_exit_gd(gd);
1822 	}
1823 #endif
1824 	m = NULL;
1825 
1826 	/*
1827 	 * CPU LOCALIZATION
1828 	 *
1829 	 * CPU localization algorithm.  Break the page queues up by physical
1830 	 * id and core id (note that two cpu threads will have the same core
1831 	 * id, and core_id != gd_cpuid).
1832 	 *
1833 	 * This is nowhere near perfect, for example the last pindex in a
1834 	 * subgroup will overflow into the next cpu or package.  But this
1835 	 * should get us good page reuse locality in heavy mixed loads.
1836 	 *
1837 	 * (may be executed before the APs are started, so other GDs might
1838 	 *  not exist!)
1839 	 */
1840 	if (page_req & VM_ALLOC_CPU_SPEC)
1841 		cpuid_local = VM_ALLOC_GETCPU(page_req);
1842 	else
1843 		cpuid_local = mycpu->gd_cpuid;
1844 
1845 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
1846 
1847 	KKASSERT(page_req &
1848 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
1849 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1850 
1851 	/*
1852 	 * Certain system threads (pageout daemon, buf_daemons) are
1853 	 * allowed to eat deeper into the free page list.
1854 	 */
1855 	if (curthread->td_flags & TDF_SYSTHREAD)
1856 		page_req |= VM_ALLOC_SYSTEM;
1857 
1858 	/*
1859 	 * Impose various limitations.  Note that the v_free_reserved test
1860 	 * must match the opposite of vm_page_count_target() to avoid
1861 	 * livelocks; be careful.
1862 	 */
1863 loop:
1864 	gd = mycpu;
1865 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
1866 	    ((page_req & VM_ALLOC_INTERRUPT) &&
1867 	     gd->gd_vmstats.v_free_count > 0) ||
1868 	    ((page_req & VM_ALLOC_SYSTEM) &&
1869 	     gd->gd_vmstats.v_cache_count == 0 &&
1870 		gd->gd_vmstats.v_free_count >
1871 		gd->gd_vmstats.v_interrupt_free_min)
1872 	) {
1873 		/*
1874 		 * The free queue has sufficient free pages to take one out.
1875 		 */
1876 		if (page_req & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO))
1877 			m = vm_page_select_free(pg_color, TRUE);
1878 		else
1879 			m = vm_page_select_free(pg_color, FALSE);
1880 	} else if (page_req & VM_ALLOC_NORMAL) {
1881 		/*
1882 		 * Allocatable from the cache (non-interrupt only).  On
1883 		 * success, we must free the page and try again, thus
1884 		 * ensuring that vmstats.v_*_free_min counters are replenished.
1885 		 */
1886 #ifdef INVARIANTS
1887 		if (curthread->td_preempted) {
1888 			kprintf("vm_page_alloc(): warning, attempt to allocate"
1889 				" cache page from preempting interrupt\n");
1890 			m = NULL;
1891 		} else {
1892 			m = vm_page_select_cache(pg_color);
1893 		}
1894 #else
1895 		m = vm_page_select_cache(pg_color);
1896 #endif
1897 		/*
1898 		 * On success move the page into the free queue and loop.
1899 		 *
1900 		 * Only do this if we can safely acquire the vm_object lock,
1901 		 * because this is effectively a random page and the caller
1902 		 * might be holding the lock shared; we don't want to
1903 		 * deadlock.
1904 		 */
1905 		if (m != NULL) {
1906 			KASSERT(m->dirty == 0,
1907 				("Found dirty cache page %p", m));
1908 			if ((obj = m->object) != NULL) {
1909 				if (vm_object_hold_try(obj)) {
1910 					vm_page_protect(m, VM_PROT_NONE);
1911 					vm_page_free(m);
1912 					/* m->object NULL here */
1913 					vm_object_drop(obj);
1914 				} else {
1915 					vm_page_deactivate(m);
1916 					vm_page_wakeup(m);
1917 				}
1918 			} else {
1919 				vm_page_protect(m, VM_PROT_NONE);
1920 				vm_page_free(m);
1921 			}
1922 			goto loop;
1923 		}
1924 
1925 		/*
1926 		 * On failure return NULL
1927 		 */
1928 		atomic_add_int(&vm_pageout_deficit, 1);
1929 		pagedaemon_wakeup();
1930 		return (NULL);
1931 	} else {
1932 		/*
1933 		 * No pages available, wakeup the pageout daemon and give up.
1934 		 */
1935 		atomic_add_int(&vm_pageout_deficit, 1);
1936 		pagedaemon_wakeup();
1937 		return (NULL);
1938 	}
1939 
1940 	/*
1941 	 * v_free_count can race so loop if we don't find the expected
1942 	 * page.
1943 	 */
1944 	if (m == NULL) {
1945 		vmstats_rollup();
1946 		goto loop;
1947 	}
1948 
1949 	/*
1950 	 * Good page found.  The page has already been busied for us and
1951 	 * removed from its queues.
1952 	 */
1953 	KASSERT(m->dirty == 0,
1954 		("vm_page_alloc: free/cache page %p was dirty", m));
1955 	KKASSERT(m->queue == PQ_NONE);
1956 
1957 #if 0
1958 done:
1959 #endif
1960 	/*
1961 	 * Initialize the structure, inheriting some flags but clearing
1962 	 * all the rest.  The page has already been busied for us.
1963 	 */
1964 	vm_page_flag_clear(m, ~(PG_BUSY | PG_SBUSY));
1965 	KKASSERT(m->wire_count == 0);
1966 	KKASSERT(m->busy == 0);
1967 	m->act_count = 0;
1968 	m->valid = 0;
1969 
1970 	/*
1971 	 * Caller must be holding the object lock (asserted by
1972 	 * vm_page_insert()).
1973 	 *
1974 	 * NOTE: Inserting a page here does not insert it into any pmaps
1975 	 *	 (which could cause us to block allocating memory).
1976 	 *
1977 	 * NOTE: If no object is specified, an unassociated page is allocated
1978 	 *	 and m->pindex can be used by the caller for any purpose.
1979 	 */
1980 	if (object) {
1981 		if (vm_page_insert(m, object, pindex) == FALSE) {
1982 			vm_page_free(m);
1983 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
1984 				panic("PAGE RACE %p[%ld]/%p",
1985 				      object, (long)pindex, m);
1986 			m = NULL;
1987 		}
1988 	} else {
1989 		m->pindex = pindex;
1990 	}
1991 
1992 	/*
1993 	 * Don't wakeup too often - wakeup the pageout daemon when
1994 	 * we would be nearly out of memory.
1995 	 */
1996 	pagedaemon_wakeup();
1997 
1998 	/*
1999 	 * A PG_BUSY page is returned.
2000 	 */
2001 	return (m);
2002 }
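
/*
 * Usage sketch (editor's illustration, not part of the original code):
 * a typical caller holds the object, knows the pindex is not yet
 * populated, and treats a NULL return as a memory shortage.  The names
 * "obj" and "pidx" are hypothetical.
 *
 *	vm_page_t m;
 *
 *	vm_object_hold(obj);
 *	while ((m = vm_page_alloc(obj, pidx, VM_ALLOC_NORMAL)) == NULL)
 *		vm_wait(0);
 *	... fill in the page and set m->valid as appropriate ...
 *	vm_page_wakeup(m);
 *	vm_object_drop(obj);
 *
 * Without VM_ALLOC_NULL_OK an insertion collision panics, so callers
 * that can race an existing page either pass VM_ALLOC_NULL_OK (and
 * handle the NULL) or use vm_page_grab() further below.
 */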
2003 
2004 /*
2005  * Returns the number of pages available in our DMA memory reserve
2006  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2007  */
2008 vm_size_t
2009 vm_contig_avail_pages(void)
2010 {
2011 	alist_blk_t blk;
2012 	alist_blk_t count;
2013 	alist_blk_t bfree;
2014 	spin_lock(&vm_contig_spin);
2015 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2016 	spin_unlock(&vm_contig_spin);
2017 
2018 	return bfree;
2019 }
2020 
2021 /*
2022  * Attempt to allocate contiguous physical memory with the specified
2023  * requirements.
2024  */
2025 vm_page_t
2026 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2027 		     unsigned long alignment, unsigned long boundary,
2028 		     unsigned long size, vm_memattr_t memattr)
2029 {
2030 	alist_blk_t blk;
2031 	vm_page_t m;
2032 	int i;
2033 
2034 	alignment >>= PAGE_SHIFT;
2035 	if (alignment == 0)
2036 		alignment = 1;
2037 	boundary >>= PAGE_SHIFT;
2038 	if (boundary == 0)
2039 		boundary = 1;
2040 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2041 
2042 	spin_lock(&vm_contig_spin);
2043 	blk = alist_alloc(&vm_contig_alist, 0, size);
2044 	if (blk == ALIST_BLOCK_NONE) {
2045 		spin_unlock(&vm_contig_spin);
2046 		if (bootverbose) {
2047 			kprintf("vm_page_alloc_contig: %ldk nospace\n",
2048 				size * (PAGE_SIZE / 1024));
2049 		}
2050 		return(NULL);
2051 	}
2052 	if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2053 		alist_free(&vm_contig_alist, blk, size);
2054 		spin_unlock(&vm_contig_spin);
2055 		if (bootverbose) {
2056 			kprintf("vm_page_alloc_contig: %ldk high "
2057 				"%016jx failed\n",
2058 				size * (PAGE_SIZE / 1024),
2059 				(intmax_t)high);
2060 		}
2061 		return(NULL);
2062 	}
2063 	spin_unlock(&vm_contig_spin);
2064 	if (vm_contig_verbose) {
2065 		kprintf("vm_page_alloc_contig: %016jx/%ldk\n",
2066 			(intmax_t)(vm_paddr_t)blk << PAGE_SHIFT,
2067 			size * (PAGE_SIZE / 1024));
2068 	}
2069 
2070 	m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2071 	if (memattr != VM_MEMATTR_DEFAULT)
2072 		for (i = 0; i < size; i++)
2073 			pmap_page_set_memattr(&m[i], memattr);
2074 	return m;
2075 }
2076 
2077 /*
2078  * Free contiguously allocated pages.  The pages will be wired but not busy.
2079  * When freeing to the alist we leave them wired and not busy.
2080  */
2081 void
2082 vm_page_free_contig(vm_page_t m, unsigned long size)
2083 {
2084 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2085 	vm_pindex_t start = pa >> PAGE_SHIFT;
2086 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2087 
2088 	if (vm_contig_verbose) {
2089 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2090 			(intmax_t)pa, size / 1024);
2091 	}
2092 	if (pa < vm_low_phys_reserved) {
2093 		KKASSERT(pa + size <= vm_low_phys_reserved);
2094 		spin_lock(&vm_contig_spin);
2095 		alist_free(&vm_contig_alist, start, pages);
2096 		spin_unlock(&vm_contig_spin);
2097 	} else {
2098 		while (pages) {
2099 			vm_page_busy_wait(m, FALSE, "cpgfr");
2100 			vm_page_unwire(m, 0);
2101 			vm_page_free(m);
2102 			--pages;
2103 			++m;
2104 		}
2105 
2106 	}
2107 }
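
/*
 * Usage sketch (editor's illustration, not part of the original code):
 * a driver needing a physically contiguous 64KB DMA buffer below 4GB
 * might do something like the following; error handling and the actual
 * mapping of the pages are omitted.
 *
 *	vm_page_t m;
 *
 *	m = vm_page_alloc_contig(0, 0xFFFFFFFFUL, 65536, 0,
 *				 65536, VM_MEMATTR_DEFAULT);
 *	if (m == NULL)
 *		return (ENOMEM);
 *	... use VM_PAGE_TO_PHYS(m) as the DMA base address ...
 *	vm_page_free_contig(m, 65536);
 *
 * The pages come back wired and not busy, matching the expectations of
 * vm_page_free_contig() above.
 */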
2108 
2109 
2110 /*
2111  * Wait for sufficient free memory for nominal heavy memory use kernel
2112  * operations.
2113  *
2114  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2115  *	     will trivially deadlock the system.
2116  */
2117 void
2118 vm_wait_nominal(void)
2119 {
2120 	while (vm_page_count_min(0))
2121 		vm_wait(0);
2122 }
2123 
2124 /*
2125  * Test if vm_wait_nominal() would block.
2126  */
2127 int
2128 vm_test_nominal(void)
2129 {
2130 	if (vm_page_count_min(0))
2131 		return(1);
2132 	return(0);
2133 }
2134 
2135 /*
2136  * Block until free pages are available for allocation, called in various
2137  * places before memory allocations.
2138  *
2139  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2140  * more generous than that.
2141  */
2142 void
2143 vm_wait(int timo)
2144 {
2145 	/*
2146 	 * never wait forever
2147 	 */
2148 	if (timo == 0)
2149 		timo = hz;
2150 	lwkt_gettoken(&vm_token);
2151 
2152 	if (curthread == pagethread) {
2153 		/*
2154 		 * The pageout daemon itself needs pages, this is bad.
2155 		 */
2156 		if (vm_page_count_min(0)) {
2157 			vm_pageout_pages_needed = 1;
2158 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2159 		}
2160 	} else {
2161 		/*
2162 		 * Wakeup the pageout daemon if necessary and wait.
2163 		 *
2164 		 * Do not wait indefinitely for the target to be reached,
2165 		 * as load might prevent it from being reached any time soon.
2166 		 * But wait a little to try to slow down page allocations
2167 		 * and to give more important threads (the pagedaemon)
2168 		 * allocation priority.
2169 		 */
2170 		if (vm_page_count_target()) {
2171 			if (vm_pages_needed == 0) {
2172 				vm_pages_needed = 1;
2173 				wakeup(&vm_pages_needed);
2174 			}
2175 			++vm_pages_waiting;	/* SMP race ok */
2176 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2177 		}
2178 	}
2179 	lwkt_reltoken(&vm_token);
2180 }
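
/*
 * Caller-side sketch (editor's illustration): a code path about to
 * perform a burst of allocations can pre-block here instead of
 * failing individual allocations:
 *
 *	vm_wait_nominal();
 *	for (i = 0; i < npages; ++i)
 *		... allocate and initialize page i ...
 *
 * The timo argument bounds each sleep in ticks; passing 0 defaults to
 * hz (about one second) as shown above, so callers normally loop on
 * their own condition rather than relying on a single call.
 */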
2181 
2182 /*
2183  * Block until free pages are available for allocation
2184  *
2185  * Called only from vm_fault so that processes page faulting can be
2186  * easily tracked.
2187  */
2188 void
2189 vm_wait_pfault(void)
2190 {
2191 	/*
2192 	 * Wakeup the pageout daemon if necessary and wait.
2193 	 *
2194 	 * Do not wait indefinitely for the target to be reached,
2195 	 * as load might prevent it from being reached any time soon.
2196 	 * But wait a little to try to slow down page allocations
2197 	 * and to give more important threads (the pagedaemon)
2198 	 * allocation priority.
2199 	 */
2200 	if (vm_page_count_min(0)) {
2201 		lwkt_gettoken(&vm_token);
2202 		while (vm_page_count_severe()) {
2203 			if (vm_page_count_target()) {
2204 				thread_t td;
2205 
2206 				if (vm_pages_needed == 0) {
2207 					vm_pages_needed = 1;
2208 					wakeup(&vm_pages_needed);
2209 				}
2210 				++vm_pages_waiting;	/* SMP race ok */
2211 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2212 
2213 				/*
2214 				 * Do not stay stuck in the loop if the
2215 				 * system is trying to kill the process.
2216 				 */
2217 				td = curthread;
2218 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2219 					break;
2220 			}
2221 		}
2222 		lwkt_reltoken(&vm_token);
2223 	}
2224 }
2225 
2226 /*
2227  * Put the specified page on the active list (if appropriate).  Ensure
2228  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2229  *
2230  * The caller should be holding the page busied? XXX
2231  * This routine may not block.
2232  */
2233 void
2234 vm_page_activate(vm_page_t m)
2235 {
2236 	u_short oqueue;
2237 
2238 	vm_page_spin_lock(m);
2239 	if (m->queue - m->pc != PQ_ACTIVE) {
2240 		_vm_page_queue_spin_lock(m);
2241 		oqueue = _vm_page_rem_queue_spinlocked(m);
2242 		/* page is left spinlocked, queue is unlocked */
2243 
2244 		if (oqueue == PQ_CACHE)
2245 			mycpu->gd_cnt.v_reactivated++;
2246 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2247 			if (m->act_count < ACT_INIT)
2248 				m->act_count = ACT_INIT;
2249 			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2250 		}
2251 		_vm_page_and_queue_spin_unlock(m);
2252 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2253 			pagedaemon_wakeup();
2254 	} else {
2255 		if (m->act_count < ACT_INIT)
2256 			m->act_count = ACT_INIT;
2257 		vm_page_spin_unlock(m);
2258 	}
2259 }
2260 
2261 /*
2262  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
2263  * routine is called when a page has been added to the cache or free
2264  * queues.
2265  *
2266  * This routine may not block.
2267  */
2268 static __inline void
2269 vm_page_free_wakeup(void)
2270 {
2271 	globaldata_t gd = mycpu;
2272 
2273 	/*
2274 	 * If the pageout daemon itself needs pages, then tell it that
2275 	 * there are some free.
2276 	 */
2277 	if (vm_pageout_pages_needed &&
2278 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
2279 	    gd->gd_vmstats.v_pageout_free_min
2280 	) {
2281 		vm_pageout_pages_needed = 0;
2282 		wakeup(&vm_pageout_pages_needed);
2283 	}
2284 
2285 	/*
2286 	 * Wakeup processes that are waiting on memory.
2287 	 *
2288 	 * Generally speaking we want to wakeup stuck processes as soon as
2289 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
2290 	 * where we can do this.  Wait a bit longer to reduce degenerate
2291 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
2292 	 * to make sure the min-check w/hysteresis does not exceed the
2293 	 * normal target.
2294 	 */
2295 	if (vm_pages_waiting) {
2296 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
2297 		    !vm_page_count_target()) {
2298 			vm_pages_waiting = 0;
2299 			wakeup(&vmstats.v_free_count);
2300 			++mycpu->gd_cnt.v_ppwakeups;
2301 		}
2302 #if 0
2303 		if (!vm_page_count_target()) {
2304 			/*
2305 			 * Plenty of pages are free, wakeup everyone.
2306 			 */
2307 			vm_pages_waiting = 0;
2308 			wakeup(&vmstats.v_free_count);
2309 			++mycpu->gd_cnt.v_ppwakeups;
2310 		} else if (!vm_page_count_min(0)) {
2311 			/*
2312 			 * Some pages are free, wakeup someone.
2313 			 */
2314 			int wcount = vm_pages_waiting;
2315 			if (wcount > 0)
2316 				--wcount;
2317 			vm_pages_waiting = wcount;
2318 			wakeup_one(&vmstats.v_free_count);
2319 			++mycpu->gd_cnt.v_ppwakeups;
2320 		}
2321 #endif
2322 	}
2323 }
2324 
2325 /*
2326  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
2327  * it from its VM object.
2328  *
2329  * The vm_page must be PG_BUSY on entry.  PG_BUSY will be released on
2330  * return (the page will have been freed).
2331  */
2332 void
2333 vm_page_free_toq(vm_page_t m)
2334 {
2335 	mycpu->gd_cnt.v_tfree++;
2336 	KKASSERT((m->flags & PG_MAPPED) == 0);
2337 	KKASSERT(m->flags & PG_BUSY);
2338 
2339 	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
2340 		kprintf("vm_page_free: pindex(%lu), busy(%d), "
2341 			"PG_BUSY(%d), hold(%d)\n",
2342 			(u_long)m->pindex, m->busy,
2343 			((m->flags & PG_BUSY) ? 1 : 0), m->hold_count);
2344 		if ((m->queue - m->pc) == PQ_FREE)
2345 			panic("vm_page_free: freeing free page");
2346 		else
2347 			panic("vm_page_free: freeing busy page");
2348 	}
2349 
2350 	/*
2351 	 * Remove from object, spinlock the page and its queues, and
2352 	 * remove it from any queue it is on.  No queue spinlock will be
2353 	 * held after this section (because the page has been removed
2354 	 * from all queues).
2355 	 */
2356 	vm_page_remove(m);
2357 	vm_page_and_queue_spin_lock(m);
2358 	_vm_page_rem_queue_spinlocked(m);
2359 
2360 	/*
2361 	 * No further management of fictitious pages occurs beyond object
2362 	 * and queue removal.
2363 	 */
2364 	if ((m->flags & PG_FICTITIOUS) != 0) {
2365 		vm_page_spin_unlock(m);
2366 		vm_page_wakeup(m);
2367 		return;
2368 	}
2369 
2370 	m->valid = 0;
2371 	vm_page_undirty(m);
2372 
2373 	if (m->wire_count != 0) {
2374 		if (m->wire_count > 1) {
2375 			panic("vm_page_free: invalid wire count (%d), "
2376 			      "pindex: 0x%lx",
2377 			      m->wire_count, (long)m->pindex);
2378 		}
2379 		panic("vm_page_free: freeing wired page");
2380 	}
2381 
2382 	/*
2383 	 * Clear the UNMANAGED flag when freeing an unmanaged page.
2384 	 * Clear the NEED_COMMIT flag.
2385 	 */
2386 	if (m->flags & PG_UNMANAGED)
2387 		vm_page_flag_clear(m, PG_UNMANAGED);
2388 	if (m->flags & PG_NEED_COMMIT)
2389 		vm_page_flag_clear(m, PG_NEED_COMMIT);
2390 
2391 	if (m->hold_count != 0) {
2392 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
2393 	} else {
2394 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 0);
2395 	}
2396 
2397 	/*
2398 	 * This sequence allows us to clear PG_BUSY while still holding
2399 	 * its spin lock, which reduces contention vs allocators.  We
2400 	 * must not leave the queue locked or _vm_page_wakeup() may
2401 	 * deadlock.
2402 	 */
2403 	_vm_page_queue_spin_unlock(m);
2404 	if (_vm_page_wakeup(m)) {
2405 		vm_page_spin_unlock(m);
2406 		wakeup(m);
2407 	} else {
2408 		vm_page_spin_unlock(m);
2409 	}
2410 	vm_page_free_wakeup();
2411 }
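
/*
 * Usage sketch (editor's illustration): callers normally reach this
 * function via vm_page_free() on a page they have busied, as is done
 * elsewhere in this file.  The wmesg string is arbitrary.
 *
 *	vm_page_busy_wait(m, FALSE, "pgfree");
 *	vm_page_protect(m, VM_PROT_NONE);
 *	vm_page_free(m);
 *
 * The PG_BUSY bit is consumed by the free; the caller must not touch
 * the page afterwards.
 */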
2412 
2413 /*
2414  * vm_page_unmanage()
2415  *
2416  * Prevent PV management from being done on the page.  The page is
2417  * removed from the paging queues as if it were wired, and as a
2418  * consequence of no longer being managed the pageout daemon will not
2419  * touch it (since there is no way to locate the pte mappings for the
2420  * page).  madvise() calls that mess with the pmap will also no longer
2421  * operate on the page.
2422  *
2423  * Beyond that the page is still reasonably 'normal'.  Freeing the page
2424  * will clear the flag.
2425  *
2426  * This routine is used by OBJT_PHYS objects - objects using unswappable
2427  * physical memory as backing store rather than swap-backed memory and
2428  * will eventually be extended to support 4MB unmanaged physical
2429  * mappings.
2430  *
2431  * Caller must be holding the page busy.
2432  */
2433 void
2434 vm_page_unmanage(vm_page_t m)
2435 {
2436 	KKASSERT(m->flags & PG_BUSY);
2437 	if ((m->flags & PG_UNMANAGED) == 0) {
2438 		if (m->wire_count == 0)
2439 			vm_page_unqueue(m);
2440 	}
2441 	vm_page_flag_set(m, PG_UNMANAGED);
2442 }
2443 
2444 /*
2445  * Mark this page as wired down by yet another map, removing it from
2446  * paging queues as necessary.
2447  *
2448  * Caller must be holding the page busy.
2449  */
2450 void
2451 vm_page_wire(vm_page_t m)
2452 {
2453 	/*
2454 	 * Only bump the wire statistics if the page is not already wired,
2455 	 * and only unqueue the page if it is on some queue (if it is unmanaged
2456 	 * it is already off the queues).  Don't do anything with fictitious
2457 	 * pages because they are always wired.
2458 	 */
2459 	KKASSERT(m->flags & PG_BUSY);
2460 	if ((m->flags & PG_FICTITIOUS) == 0) {
2461 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
2462 			if ((m->flags & PG_UNMANAGED) == 0)
2463 				vm_page_unqueue(m);
2464 			atomic_add_int(&mycpu->gd_vmstats_adj.v_wire_count, 1);
2465 		}
2466 		KASSERT(m->wire_count != 0,
2467 			("vm_page_wire: wire_count overflow m=%p", m));
2468 	}
2469 }
2470 
2471 /*
2472  * Release one wiring of this page, potentially enabling it to be paged again.
2473  *
2474  * Many pages placed on the inactive queue should actually go
2475  * into the cache, but it is difficult to figure out which.  What
2476  * we do instead, if the inactive target is well met, is to put
2477  * clean pages at the head of the inactive queue instead of the tail.
2478  * This will cause them to be moved to the cache more quickly and
2479  * if not actively re-referenced, freed more quickly.  If we just
2480  * stick these pages at the end of the inactive queue, heavy filesystem
2481  * meta-data accesses can cause an unnecessary paging load on memory bound
2482  * processes.  This optimization causes one-time-use metadata to be
2483  * reused more quickly.
2484  *
2485  * Pages marked PG_NEED_COMMIT are always activated and never placed on
2486  * the inactive queue.  This helps the pageout daemon determine memory
2487  * pressure and act on out-of-memory situations more quickly.
2488  *
2489  * BUT, if we are in a low-memory situation we have no choice but to
2490  * put clean pages on the cache queue.
2491  *
2492  * A number of routines use vm_page_unwire() to guarantee that the page
2493  * will go into either the inactive or active queues, and will NEVER
2494  * be placed in the cache - for example, just after dirtying a page.
2495  * Dirty pages in the cache are not allowed.
2496  *
2497  * This routine may not block.
2498  */
2499 void
2500 vm_page_unwire(vm_page_t m, int activate)
2501 {
2502 	KKASSERT(m->flags & PG_BUSY);
2503 	if (m->flags & PG_FICTITIOUS) {
2504 		/* do nothing */
2505 	} else if (m->wire_count <= 0) {
2506 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
2507 	} else {
2508 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
2509 			atomic_add_int(&mycpu->gd_vmstats_adj.v_wire_count, -1);
2510 			if (m->flags & PG_UNMANAGED) {
2511 				;
2512 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
2513 				vm_page_spin_lock(m);
2514 				_vm_page_add_queue_spinlocked(m,
2515 							PQ_ACTIVE + m->pc, 0);
2516 				_vm_page_and_queue_spin_unlock(m);
2517 			} else {
2518 				vm_page_spin_lock(m);
2519 				vm_page_flag_clear(m, PG_WINATCFLS);
2520 				_vm_page_add_queue_spinlocked(m,
2521 							PQ_INACTIVE + m->pc, 0);
2522 				++vm_swapcache_inactive_heuristic;
2523 				_vm_page_and_queue_spin_unlock(m);
2524 			}
2525 		}
2526 	}
2527 }
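
/*
 * Usage sketch (editor's illustration): a typical wiring sequence
 * brackets an I/O or copy operation, busying the page across both
 * transitions as required above.  The wmesg strings are arbitrary.
 *
 *	vm_page_busy_wait(m, FALSE, "pgwire");
 *	vm_page_wire(m);
 *	vm_page_wakeup(m);
 *	... the page cannot be paged out while wired ...
 *	vm_page_busy_wait(m, FALSE, "pgunwr");
 *	vm_page_unwire(m, 1);
 *	vm_page_wakeup(m);
 *
 * Passing activate == 0 sends the last-unwired page to the inactive
 * queue instead, subject to the PG_NEED_COMMIT override described
 * above.
 */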
2528 
2529 /*
2530  * Move the specified page to the inactive queue.  If the page has
2531  * any associated swap, the swap is deallocated.
2532  *
2533  * Normally athead is 0 resulting in LRU operation.  athead is set
2534  * to 1 if we want this page to be 'as if it were placed in the cache',
2535  * except without unmapping it from the process address space.
2536  *
2537  * vm_page's spinlock must be held on entry and will remain held on return.
2538  * This routine may not block.
2539  */
2540 static void
2541 _vm_page_deactivate_locked(vm_page_t m, int athead)
2542 {
2543 	u_short oqueue;
2544 
2545 	/*
2546 	 * Ignore if already inactive.
2547 	 */
2548 	if (m->queue - m->pc == PQ_INACTIVE)
2549 		return;
2550 	_vm_page_queue_spin_lock(m);
2551 	oqueue = _vm_page_rem_queue_spinlocked(m);
2552 
2553 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2554 		if (oqueue == PQ_CACHE)
2555 			mycpu->gd_cnt.v_reactivated++;
2556 		vm_page_flag_clear(m, PG_WINATCFLS);
2557 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
2558 		if (athead == 0)
2559 			++vm_swapcache_inactive_heuristic;
2560 	}
2561 	/* NOTE: PQ_NONE if condition not taken */
2562 	_vm_page_queue_spin_unlock(m);
2563 	/* leaves vm_page spinlocked */
2564 }
2565 
2566 /*
2567  * Attempt to deactivate a page.
2568  *
2569  * No requirements.
2570  */
2571 void
2572 vm_page_deactivate(vm_page_t m)
2573 {
2574 	vm_page_spin_lock(m);
2575 	_vm_page_deactivate_locked(m, 0);
2576 	vm_page_spin_unlock(m);
2577 }
2578 
2579 void
2580 vm_page_deactivate_locked(vm_page_t m)
2581 {
2582 	_vm_page_deactivate_locked(m, 0);
2583 }
2584 
2585 /*
2586  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
2587  *
2588  * This function returns non-zero if it successfully moved the page to
2589  * PQ_CACHE.
2590  *
2591  * This function unconditionally unbusies the page on return.
2592  */
2593 int
2594 vm_page_try_to_cache(vm_page_t m)
2595 {
2596 	vm_page_spin_lock(m);
2597 	if (m->dirty || m->hold_count || m->wire_count ||
2598 	    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
2599 		if (_vm_page_wakeup(m)) {
2600 			vm_page_spin_unlock(m);
2601 			wakeup(m);
2602 		} else {
2603 			vm_page_spin_unlock(m);
2604 		}
2605 		return(0);
2606 	}
2607 	vm_page_spin_unlock(m);
2608 
2609 	/*
2610 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
2611 	 * be moved to the cache.
2612 	 */
2613 	vm_page_test_dirty(m);
2614 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2615 		vm_page_wakeup(m);
2616 		return(0);
2617 	}
2618 	vm_page_cache(m);
2619 	return(1);
2620 }
2621 
2622 /*
2623  * Attempt to free the page.  If we cannot free it, we do nothing.
2624  * 1 is returned on success, 0 on failure.
2625  *
2626  * No requirements.
2627  */
2628 int
2629 vm_page_try_to_free(vm_page_t m)
2630 {
2631 	vm_page_spin_lock(m);
2632 	if (vm_page_busy_try(m, TRUE)) {
2633 		vm_page_spin_unlock(m);
2634 		return(0);
2635 	}
2636 
2637 	/*
2638 	 * The page can be in any state, including already being on the free
2639 	 * queue.  Check to see if it really can be freed.
2640 	 */
2641 	if (m->dirty ||				/* can't free if it is dirty */
2642 	    m->hold_count ||			/* or held (XXX may be wrong) */
2643 	    m->wire_count ||			/* or wired */
2644 	    (m->flags & (PG_UNMANAGED |		/* or unmanaged */
2645 			 PG_NEED_COMMIT)) ||	/* or needs a commit */
2646 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
2647 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
2648 		if (_vm_page_wakeup(m)) {
2649 			vm_page_spin_unlock(m);
2650 			wakeup(m);
2651 		} else {
2652 			vm_page_spin_unlock(m);
2653 		}
2654 		return(0);
2655 	}
2656 	vm_page_spin_unlock(m);
2657 
2658 	/*
2659 	 * We can probably free the page.
2660 	 *
2661 	 * Page busied by us and no longer spinlocked.  Dirty pages will
2662 	 * not be freed by this function.  We have to re-test the
2663 	 * dirty bit after cleaning out the pmaps.
2664 	 */
2665 	vm_page_test_dirty(m);
2666 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2667 		vm_page_wakeup(m);
2668 		return(0);
2669 	}
2670 	vm_page_protect(m, VM_PROT_NONE);
2671 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2672 		vm_page_wakeup(m);
2673 		return(0);
2674 	}
2675 	vm_page_free(m);
2676 	return(1);
2677 }
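
/*
 * Usage sketch (editor's illustration): a reclaim scan holding only a
 * pointer to a candidate page can try the cheap path first:
 *
 *	if (vm_page_try_to_free(m) == 0) {
 *		... the page was dirty, held, wired, or could not be
 *		    busied; leave it for the pageout daemon ...
 *	}
 *
 * vm_page_try_to_cache() is similar but expects the caller to have
 * already busied the page and only moves it to PQ_CACHE rather than
 * freeing it.
 */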
2678 
2679 /*
2680  * vm_page_cache
2681  *
2682  * Put the specified page onto the page cache queue (if appropriate).
2683  *
2684  * The page must be busy, and this routine will release the busy and
2685  * possibly even free the page.
2686  */
2687 void
2688 vm_page_cache(vm_page_t m)
2689 {
2690 	/*
2691 	 * Not suitable for the cache
2692 	 */
2693 	if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
2694 	    m->busy || m->wire_count || m->hold_count) {
2695 		vm_page_wakeup(m);
2696 		return;
2697 	}
2698 
2699 	/*
2700 	 * Already in the cache (and thus not mapped)
2701 	 */
2702 	if ((m->queue - m->pc) == PQ_CACHE) {
2703 		KKASSERT((m->flags & PG_MAPPED) == 0);
2704 		vm_page_wakeup(m);
2705 		return;
2706 	}
2707 
2708 	/*
2709 	 * Caller is required to test m->dirty, but note that the act of
2710 	 * removing the page from its maps can cause it to become dirty
2711 	 * on an SMP system due to another cpu running in usermode.
2712 	 */
2713 	if (m->dirty) {
2714 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
2715 			(long)m->pindex);
2716 	}
2717 
2718 	/*
2719 	 * Remove all pmaps and indicate that the page is not
2720 	 * writeable or mapped.  Our vm_page_protect() call may
2721 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
2722 	 * everything.
2723 	 */
2724 	vm_page_protect(m, VM_PROT_NONE);
2725 	if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
2726 	    m->busy || m->wire_count || m->hold_count) {
2727 		vm_page_wakeup(m);
2728 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2729 		vm_page_deactivate(m);
2730 		vm_page_wakeup(m);
2731 	} else {
2732 		_vm_page_and_queue_spin_lock(m);
2733 		_vm_page_rem_queue_spinlocked(m);
2734 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
2735 		_vm_page_queue_spin_unlock(m);
2736 		if (_vm_page_wakeup(m)) {
2737 			vm_page_spin_unlock(m);
2738 			wakeup(m);
2739 		} else {
2740 			vm_page_spin_unlock(m);
2741 		}
2742 		vm_page_free_wakeup();
2743 	}
2744 }
2745 
2746 /*
2747  * vm_page_dontneed()
2748  *
2749  * Cache, deactivate, or do nothing as appropriate.  This routine
2750  * is typically used by madvise() MADV_DONTNEED.
2751  *
2752  * Generally speaking we want to move the page into the cache so
2753  * it gets reused quickly.  However, this can result in a silly syndrome
2754  * due to the page recycling too quickly.  Small objects will not be
2755  * fully cached.  On the other hand, if we move the page to the inactive
2756  * queue we wind up with a problem whereby very large objects
2757  * unnecessarily blow away our inactive and cache queues.
2758  *
2759  * The solution is to move the pages based on a fixed weighting.  We
2760  * either leave them alone, deactivate them, or move them to the cache,
2761  * where moving them to the cache has the highest weighting.
2762  * By forcing some pages into other queues we eventually force the
2763  * system to balance the queues, potentially recovering other unrelated
2764  * space from active.  The idea is to not force this to happen too
2765  * often.
2766  *
2767  * The page must be busied.
2768  */
2769 void
2770 vm_page_dontneed(vm_page_t m)
2771 {
2772 	static int dnweight;
2773 	int dnw;
2774 	int head;
2775 
2776 	dnw = ++dnweight;
2777 
2778 	/*
2779 	 * occasionally leave the page alone
2780 	 */
2781 	if ((dnw & 0x01F0) == 0 ||
2782 	    m->queue - m->pc == PQ_INACTIVE ||
2783 	    m->queue - m->pc == PQ_CACHE
2784 	) {
2785 		if (m->act_count >= ACT_INIT)
2786 			--m->act_count;
2787 		return;
2788 	}
2789 
2790 	/*
2791 	 * If vm_page_dontneed() is inactivating a page, it must clear
2792 	 * the referenced flag; otherwise the pagedaemon will see references
2793 	 * on the page in the inactive queue and reactivate it. Until the
2794 	 * page can move to the cache queue, madvise's job is not done.
2795 	 */
2796 	vm_page_flag_clear(m, PG_REFERENCED);
2797 	pmap_clear_reference(m);
2798 
2799 	if (m->dirty == 0)
2800 		vm_page_test_dirty(m);
2801 
2802 	if (m->dirty || (dnw & 0x0070) == 0) {
2803 		/*
2804 		 * Deactivate the page 3 times out of 32.
2805 		 */
2806 		head = 0;
2807 	} else {
2808 		/*
2809 		 * Cache the page 28 times out of every 32.  Note that
2810 		 * the page is deactivated instead of cached, but placed
2811 		 * at the head of the queue instead of the tail.
2812 		 */
2813 		head = 1;
2814 	}
2815 	vm_page_spin_lock(m);
2816 	_vm_page_deactivate_locked(m, head);
2817 	vm_page_spin_unlock(m);
2818 }
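
/*
 * Usage sketch (editor's illustration): an madvise(MADV_DONTNEED)-style
 * loop busies each resident page before handing it to this routine and
 * unbusies it afterwards.  The wmesg is arbitrary.
 *
 *	vm_page_busy_wait(m, FALSE, "mdvdn");
 *	vm_page_dontneed(m);
 *	vm_page_wakeup(m);
 *
 * The weighting above pushes most pages toward the cache while a
 * minority are merely deactivated or left alone, avoiding
 * over-aggressive recycling of small objects.
 */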
2819 
2820 /*
2821  * These routines manipulate the 'soft busy' count for a page.  A soft busy
2822  * is almost like PG_BUSY except that it allows certain compatible operations
2823  * to occur on the page while it is busy.  For example, a page undergoing a
2824  * write can still be mapped read-only.
2825  *
2826  * Because vm_pages can overlap buffers m->busy can be > 1.  m->busy is only
2827  * adjusted while the vm_page is PG_BUSY so the flash will occur when the
2828  * busy bit is cleared.
2829  */
2830 void
2831 vm_page_io_start(vm_page_t m)
2832 {
2833 	KASSERT(m->flags & PG_BUSY, ("vm_page_io_start: page not busy!!!"));
2834 	atomic_add_char(&m->busy, 1);
2835 	vm_page_flag_set(m, PG_SBUSY);
2836 }
2837 
2838 void
2839 vm_page_io_finish(vm_page_t m)
2840 {
2841 	KASSERT(m->flags & PG_BUSY, ("vm_page_io_finish: page not busy!!!"));
2842 	atomic_subtract_char(&m->busy, 1);
2843 	if (m->busy == 0)
2844 		vm_page_flag_clear(m, PG_SBUSY);
2845 }
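
/*
 * Usage sketch (editor's illustration): both routines require the page
 * to be hard-busied (PG_BUSY) at the time of the call, per the KASSERTs
 * above; the soft-busy count itself survives the hard busy being
 * dropped in between.
 *
 *	vm_page_io_start(m);
 *	vm_page_wakeup(m);
 *	... issue the write; the page may remain mapped read-only
 *	    while m->busy is non-zero ...
 *	vm_page_busy_wait(m, FALSE, "iodone");
 *	vm_page_io_finish(m);
 *	vm_page_wakeup(m);
 */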
2846 
2847 /*
2848  * Indicate that a clean VM page requires a filesystem commit and cannot
2849  * be reused.  Used by tmpfs.
2850  */
2851 void
2852 vm_page_need_commit(vm_page_t m)
2853 {
2854 	vm_page_flag_set(m, PG_NEED_COMMIT);
2855 	vm_object_set_writeable_dirty(m->object);
2856 }
2857 
2858 void
2859 vm_page_clear_commit(vm_page_t m)
2860 {
2861 	vm_page_flag_clear(m, PG_NEED_COMMIT);
2862 }
2863 
2864 /*
2865  * Grab a page, blocking if it is busy and allocating a page if necessary.
2866  * A busy page is returned or NULL.  The page may or may not be valid and
2867  * might not be on a queue (the caller is responsible for the disposition of
2868  * the page).
2869  *
2870  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
2871  * page will be zero'd and marked valid.
2872  *
2873  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
2874  * valid even if it already exists.
2875  *
2876  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
2877  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
2878  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
2879  *
2880  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
2881  * always returned if we had to block.
2882  *
2883  * This routine may not be called from an interrupt.
2884  *
2885  * No other requirements.
2886  */
2887 vm_page_t
2888 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2889 {
2890 	vm_page_t m;
2891 	int error;
2892 	int shared = 1;
2893 
2894 	KKASSERT(allocflags &
2895 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2896 	vm_object_hold_shared(object);
2897 	for (;;) {
2898 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2899 		if (error) {
2900 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
2901 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
2902 				m = NULL;
2903 				break;
2904 			}
2905 			/* retry */
2906 		} else if (m == NULL) {
2907 			if (shared) {
2908 				vm_object_upgrade(object);
2909 				shared = 0;
2910 			}
2911 			if (allocflags & VM_ALLOC_RETRY)
2912 				allocflags |= VM_ALLOC_NULL_OK;
2913 			m = vm_page_alloc(object, pindex,
2914 					  allocflags & ~VM_ALLOC_RETRY);
2915 			if (m)
2916 				break;
2917 			vm_wait(0);
2918 			if ((allocflags & VM_ALLOC_RETRY) == 0)
2919 				goto failed;
2920 		} else {
2921 			/* m found */
2922 			break;
2923 		}
2924 	}
2925 
2926 	/*
2927 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
2928 	 *
2929 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
2930 	 * valid even if already valid.
2931 	 *
2932 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
2933 	 *	  removed the idle zeroing code.  These optimizations actually
2934 	 *	  slow things down on modern cpus because the zeroed area is
2935 	 *	  likely uncached, placing a memory-access burden on the
2936 	 *	  accessors taking the fault.
2937 	 *
2938 	 *	  By always zeroing the page in-line with the fault, no
2939 	 *	  dynamic ram reads are needed and the caches are hot, ready
2940 	 *	  for userland to access the memory.
2941 	 */
2942 	if (m->valid == 0) {
2943 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
2944 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
2945 			m->valid = VM_PAGE_BITS_ALL;
2946 		}
2947 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
2948 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
2949 		m->valid = VM_PAGE_BITS_ALL;
2950 	}
2951 failed:
2952 	vm_object_drop(object);
2953 	return(m);
2954 }
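
/*
 * Usage sketch (editor's illustration): grabbing a guaranteed, zeroed
 * page for a pindex that may or may not already be resident.  With
 * VM_ALLOC_RETRY the call cannot return NULL.
 *
 *	m = vm_page_grab(obj, pidx,
 *			 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
 *	... m is returned busied; fill or validate it as needed ...
 *	vm_page_wakeup(m);
 *
 * Note that VM_ALLOC_ZERO only zeroes the page while it is still wholly
 * invalid (m->valid == 0); use VM_ALLOC_FORCE_ZERO to zero it
 * unconditionally.
 */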
2955 
2956 /*
2957  * Mapping function for valid bits or for dirty bits in
2958  * a page.  May not block.
2959  *
2960  * Inputs are required to range within a page.
2961  *
2962  * No requirements.
2963  * Non blocking.
2964  */
2965 int
2966 vm_page_bits(int base, int size)
2967 {
2968 	int first_bit;
2969 	int last_bit;
2970 
2971 	KASSERT(
2972 	    base + size <= PAGE_SIZE,
2973 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
2974 	);
2975 
2976 	if (size == 0)		/* handle degenerate case */
2977 		return(0);
2978 
2979 	first_bit = base >> DEV_BSHIFT;
2980 	last_bit = (base + size - 1) >> DEV_BSHIFT;
2981 
2982 	return ((2 << last_bit) - (1 << first_bit));
2983 }
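
/*
 * Worked example (editor's illustration, assuming DEV_BSIZE = 512 and
 * PAGE_SIZE = 4096, i.e. 8 chunks per page):
 *
 *	vm_page_bits(0, 4096)   -> first_bit 0, last_bit 7 -> 0xff
 *	vm_page_bits(512, 1024) -> first_bit 1, last_bit 2 -> 0x06
 *	vm_page_bits(100, 1)    -> first_bit 0, last_bit 0 -> 0x01
 *
 * In other words, one bit per DEV_BSIZE chunk touched by the range
 * [base, base + size).
 */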
2984 
2985 /*
2986  * Sets portions of a page valid and clean.  The arguments are expected
2987  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2988  * of any partial chunks touched by the range.  The invalid portion of
2989  * such chunks will be zero'd.
2990  *
2991  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
2992  *	 align base to DEV_BSIZE so as not to mark clean a partially
2993  *	 truncated device block.  Otherwise the dirty page status might be
2994  *	 lost.
2995  *
2996  * This routine may not block.
2997  *
2998  * (base + size) must be less than or equal to PAGE_SIZE.
2999  */
3000 static void
3001 _vm_page_zero_valid(vm_page_t m, int base, int size)
3002 {
3003 	int frag;
3004 	int endoff;
3005 
3006 	if (size == 0)	/* handle degenerate case */
3007 		return;
3008 
3009 	/*
3010 	 * If the base is not DEV_BSIZE aligned and the valid
3011 	 * bit is clear, we have to zero out a portion of the
3012 	 * first block.
3013 	 */
3014 
3015 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
3016 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3017 	) {
3018 		pmap_zero_page_area(
3019 		    VM_PAGE_TO_PHYS(m),
3020 		    frag,
3021 		    base - frag
3022 		);
3023 	}
3024 
3025 	/*
3026 	 * If the ending offset is not DEV_BSIZE aligned and the
3027 	 * valid bit is clear, we have to zero out a portion of
3028 	 * the last block.
3029 	 */
3030 
3031 	endoff = base + size;
3032 
3033 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
3034 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3035 	) {
3036 		pmap_zero_page_area(
3037 		    VM_PAGE_TO_PHYS(m),
3038 		    endoff,
3039 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3040 		);
3041 	}
3042 }
3043 
3044 /*
3045  * Set valid, clear dirty bits.  If validating the entire
3046  * page we can safely clear the pmap modify bit.  We also
3047  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3048  * takes a write fault on a MAP_NOSYNC memory area the flag will
3049  * be set again.
3050  *
3051  * We set valid bits inclusive of any overlap, but we can only
3052  * clear dirty bits for DEV_BSIZE chunks that are fully within
3053  * the range.
3054  *
3055  * Page must be busied?
3056  * No other requirements.
3057  */
3058 void
3059 vm_page_set_valid(vm_page_t m, int base, int size)
3060 {
3061 	_vm_page_zero_valid(m, base, size);
3062 	m->valid |= vm_page_bits(base, size);
3063 }
3064 
3065 
3066 /*
3067  * Set valid bits and clear dirty bits.
3068  *
3069  * Page must be busied by caller.
3070  *
3071  * NOTE: This function does not clear the pmap modified bit.
3072  *	 Also note that e.g. NFS may use a byte-granular base
3073  *	 and size.
3074  *
3075  * No other requirements.
3076  */
3077 void
3078 vm_page_set_validclean(vm_page_t m, int base, int size)
3079 {
3080 	int pagebits;
3081 
3082 	_vm_page_zero_valid(m, base, size);
3083 	pagebits = vm_page_bits(base, size);
3084 	m->valid |= pagebits;
3085 	m->dirty &= ~pagebits;
3086 	if (base == 0 && size == PAGE_SIZE) {
3087 		/*pmap_clear_modify(m);*/
3088 		vm_page_flag_clear(m, PG_NOSYNC);
3089 	}
3090 }
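
/*
 * Usage sketch (editor's illustration): a pager that has just read the
 * first 2KB of a page from backing store marks only that portion valid
 * and clean, leaving the remainder invalid:
 *
 *	vm_page_set_validclean(m, 0, 2048);
 *
 * With DEV_BSIZE = 512 this sets bits 0x0f in m->valid and clears them
 * in m->dirty; a misaligned base or size would additionally zero the
 * invalid portions of partially covered chunks via _vm_page_zero_valid()
 * above.
 */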
3091 
3092 /*
3093  * Set valid & dirty.  Used by buwrite()
3094  *
3095  * Page must be busied by caller.
3096  */
3097 void
3098 vm_page_set_validdirty(vm_page_t m, int base, int size)
3099 {
3100 	int pagebits;
3101 
3102 	pagebits = vm_page_bits(base, size);
3103 	m->valid |= pagebits;
3104 	m->dirty |= pagebits;
3105 	if (m->object)
3106 	       vm_object_set_writeable_dirty(m->object);
3107 }
3108 
3109 /*
3110  * Clear dirty bits.
3111  *
3112  * NOTE: This function does not clear the pmap modified bit.
3113  *	 Also note that e.g. NFS may use a byte-granular base
3114  *	 and size.
3115  *
3116  * Page must be busied?
3117  * No other requirements.
3118  */
3119 void
3120 vm_page_clear_dirty(vm_page_t m, int base, int size)
3121 {
3122 	m->dirty &= ~vm_page_bits(base, size);
3123 	if (base == 0 && size == PAGE_SIZE) {
3124 		/*pmap_clear_modify(m);*/
3125 		vm_page_flag_clear(m, PG_NOSYNC);
3126 	}
3127 }
3128 
3129 /*
3130  * Make the page all-dirty.
3131  *
3132  * Also make sure the related object and vnode reflect the fact that the
3133  * object may now contain a dirty page.
3134  *
3135  * Page must be busied?
3136  * No other requirements.
3137  */
3138 void
3139 vm_page_dirty(vm_page_t m)
3140 {
3141 #ifdef INVARIANTS
3142 	int pqtype = m->queue - m->pc;
3143 #endif
3144 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3145 		("vm_page_dirty: page in free/cache queue!"));
3146 	if (m->dirty != VM_PAGE_BITS_ALL) {
3147 		m->dirty = VM_PAGE_BITS_ALL;
3148 		if (m->object)
3149 			vm_object_set_writeable_dirty(m->object);
3150 	}
3151 }
3152 
3153 /*
3154  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3155  * valid and dirty bits for the affected areas are cleared.
3156  *
3157  * Page must be busied?
3158  * Does not block.
3159  * No other requirements.
3160  */
3161 void
3162 vm_page_set_invalid(vm_page_t m, int base, int size)
3163 {
3164 	int bits;
3165 
3166 	bits = vm_page_bits(base, size);
3167 	m->valid &= ~bits;
3168 	m->dirty &= ~bits;
3169 	m->object->generation++;
3170 }
3171 
3172 /*
3173  * The kernel assumes that the invalid portions of a page contain
3174  * garbage, but such pages can be mapped into memory by user code.
3175  * When this occurs, we must zero out the non-valid portions of the
3176  * page so user code sees what it expects.
3177  *
3178  * Pages are most often semi-valid when the end of a file is mapped
3179  * into memory and the file's size is not page aligned.
3180  *
3181  * Page must be busied?
3182  * No other requirements.
3183  */
3184 void
3185 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3186 {
3187 	int b;
3188 	int i;
3189 
3190 	/*
3191 	 * Scan the valid bits looking for invalid sections that
3192 	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
3193 	 * valid bit may be set) have already been zeroed by
3194 	 * vm_page_set_validclean().
3195 	 */
3196 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3197 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3198 		    (m->valid & (1 << i))
3199 		) {
3200 			if (i > b) {
3201 				pmap_zero_page_area(
3202 				    VM_PAGE_TO_PHYS(m),
3203 				    b << DEV_BSHIFT,
3204 				    (i - b) << DEV_BSHIFT
3205 				);
3206 			}
3207 			b = i + 1;
3208 		}
3209 	}
3210 
3211 	/*
3212 	 * setvalid is TRUE when we can safely set the zero'd areas
3213 	 * as being valid.  We can do this if there are no cache consistency
3214 	 * issues, e.g. it is ok to do with UFS but not with NFS.
3215 	 */
3216 	if (setvalid)
3217 		m->valid = VM_PAGE_BITS_ALL;
3218 }
3219 
3220 /*
3221  * Is a (partial) page valid?  Note that the case where size == 0
3222  * will return FALSE in the degenerate case where the page is entirely
3223  * invalid, and TRUE otherwise.
3224  *
3225  * Does not block.
3226  * No other requirements.
3227  */
3228 int
3229 vm_page_is_valid(vm_page_t m, int base, int size)
3230 {
3231 	int bits = vm_page_bits(base, size);
3232 
3233 	if (m->valid && ((m->valid & bits) == bits))
3234 		return 1;
3235 	else
3236 		return 0;
3237 }
3238 
3239 /*
3240  * Update dirty bits from pmap/mmu.  May not block.
3241  *
3242  * Caller must hold the page busy
3243  */
3244 void
3245 vm_page_test_dirty(vm_page_t m)
3246 {
3247 	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
3248 		vm_page_dirty(m);
3249 	}
3250 }
3251 
3252 /*
3253  * Register an action, associating it with its vm_page
3254  */
3255 void
3256 vm_page_register_action(vm_page_action_t action, vm_page_event_t event)
3257 {
3258 	struct vm_page_action_hash *hash;
3259 	int hv;
3260 
3261 	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
3262 	hash = &action_hash[hv];
3263 
3264 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3265 	vm_page_flag_set(action->m, PG_ACTIONLIST);
3266 	action->event = event;
3267 	LIST_INSERT_HEAD(&hash->list, action, entry);
3268 	lockmgr(&hash->lk, LK_RELEASE);
3269 }
3270 
3271 /*
3272  * Unregister an action, disassociating it from its related vm_page
3273  */
3274 void
3275 vm_page_unregister_action(vm_page_action_t action)
3276 {
3277 	struct vm_page_action_hash *hash;
3278 	int hv;
3279 
3280 	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
3281 	hash = &action_hash[hv];
3282 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3283 	if (action->event != VMEVENT_NONE) {
3284 		action->event = VMEVENT_NONE;
3285 		LIST_REMOVE(action, entry);
3286 
3287 		if (LIST_EMPTY(&hash->list))
3288 			vm_page_flag_clear(action->m, PG_ACTIONLIST);
3289 	}
3290 	lockmgr(&hash->lk, LK_RELEASE);
3291 }
3292 
3293 /*
3294  * Issue an event on a VM page.  Corresponding action structures are
3295  * removed from the page's list and called.
3296  *
3297  * If the vm_page has no more pending action events we clear its
3298  * PG_ACTIONLIST flag.
3299  */
3300 void
3301 vm_page_event_internal(vm_page_t m, vm_page_event_t event)
3302 {
3303 	struct vm_page_action_hash *hash;
3304 	struct vm_page_action *scan;
3305 	struct vm_page_action *next;
3306 	int hv;
3307 	int all;
3308 
3309 	hv = (int)((intptr_t)m >> 8) & VMACTION_HMASK;
3310 	hash = &action_hash[hv];
3311 	all = 1;
3312 
3313 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3314 	LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) {
3315 		if (scan->m == m) {
3316 			if (scan->event == event) {
3317 				scan->event = VMEVENT_NONE;
3318 				LIST_REMOVE(scan, entry);
3319 				scan->func(m, scan);
3320 				/* XXX */
3321 			} else {
3322 				all = 0;
3323 			}
3324 		}
3325 	}
3326 	if (all)
3327 		vm_page_flag_clear(m, PG_ACTIONLIST);
3328 	lockmgr(&hash->lk, LK_RELEASE);
3329 }
3330 
3331 #include "opt_ddb.h"
3332 #ifdef DDB
3333 #include <sys/kernel.h>
3334 
3335 #include <ddb/ddb.h>
3336 
3337 DB_SHOW_COMMAND(page, vm_page_print_page_info)
3338 {
3339 	db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
3340 	db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
3341 	db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
3342 	db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
3343 	db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
3344 	db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
3345 	db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
3346 	db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
3347 	db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
3348 	db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
3349 }
3350 
3351 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3352 {
3353 	int i;
3354 	db_printf("PQ_FREE:");
3355 	for (i = 0; i < PQ_L2_SIZE; i++) {
3356 		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
3357 	}
3358 	db_printf("\n");
3359 
3360 	db_printf("PQ_CACHE:");
3361 	for(i = 0; i < PQ_L2_SIZE; i++) {
3362 		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
3363 	}
3364 	db_printf("\n");
3365 
3366 	db_printf("PQ_ACTIVE:");
3367 	for(i = 0; i < PQ_L2_SIZE; i++) {
3368 		db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
3369 	}
3370 	db_printf("\n");
3371 
3372 	db_printf("PQ_INACTIVE:");
3373 	for(i = 0; i < PQ_L2_SIZE; i++) {
3374 		db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
3375 	}
3376 	db_printf("\n");
3377 }
3378 #endif /* DDB */
3379