xref: /netbsd-src/sys/uvm/uvm_page.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: uvm_page.c,v 1.234 2020/03/17 18:31:39 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1997 Charles D. Cranor and Washington University.
34  * Copyright (c) 1991, 1993, The Regents of the University of California.
35  *
36  * All rights reserved.
37  *
38  * This code is derived from software contributed to Berkeley by
39  * The Mach Operating System project at Carnegie-Mellon University.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vm_page.c   8.3 (Berkeley) 3/21/94
66  * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67  *
68  *
69  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70  * All rights reserved.
71  *
72  * Permission to use, copy, modify and distribute this software and
73  * its documentation is hereby granted, provided that both the copyright
74  * notice and this permission notice appear in all copies of the
75  * software, derivative works or modified versions, and any portions
76  * thereof, and that both notices appear in supporting documentation.
77  *
78  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81  *
82  * Carnegie Mellon requests users of this software to return to
83  *
84  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
85  *  School of Computer Science
86  *  Carnegie Mellon University
87  *  Pittsburgh PA 15213-3890
88  *
89  * any improvements or extensions that they make and grant Carnegie the
90  * rights to redistribute these changes.
91  */
92 
93 /*
94  * uvm_page.c: page ops.
95  */
96 
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.234 2020/03/17 18:31:39 ad Exp $");
99 
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104 
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114 #include <sys/extent.h>
115 
116 #include <uvm/uvm.h>
117 #include <uvm/uvm_ddb.h>
118 #include <uvm/uvm_pdpolicy.h>
119 #include <uvm/uvm_pgflcache.h>
120 
121 /*
122  * Some supported CPUs in a given architecture don't support all
123  * of the things necessary to do idle page zero'ing efficiently.
124  * We therefore provide a way to enable it from machdep code here.
125  */
126 bool vm_page_zero_enable = false;
127 
128 /*
129  * number of pages per-CPU to reserve for the kernel.
130  */
131 #ifndef	UVM_RESERVED_PAGES_PER_CPU
132 #define	UVM_RESERVED_PAGES_PER_CPU	5
133 #endif
134 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
135 
136 /*
137  * physical memory size.
138  */
139 psize_t physmem;
140 
141 /*
142  * local variables
143  */
144 
145 /*
146  * these variables record the kernel virtual space range returned by
147  * pmap_virtual_space(), for debugging purposes.  The implementation of
148  * uvm_pageboot_alloc() here also uses them internally.
149  */
150 
151 static vaddr_t      virtual_space_start;
152 static vaddr_t      virtual_space_end;
153 
154 /*
155  * we allocate an initial number of page colors in uvm_page_init(),
156  * and remember them.  We may re-color pages as cache sizes are
157  * discovered during the autoconfiguration phase.  But we can never
158  * free the initial set of buckets, since they are allocated using
159  * uvm_pageboot_alloc().
160  */
161 
162 static size_t recolored_pages_memsize /* = 0 */;
163 static char *recolored_pages_mem;
164 
165 /*
166  * freelist locks - one per bucket.
167  */
168 
169 union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS]
170     __cacheline_aligned;
171 
172 /*
173  * basic NUMA information.
174  */
175 
176 static struct uvm_page_numa_region {
177 	struct uvm_page_numa_region	*next;
178 	paddr_t				start;
179 	paddr_t				size;
180 	u_int				numa_id;
181 } *uvm_page_numa_region;
182 
183 #ifdef DEBUG
184 kmutex_t uvm_zerochecklock __cacheline_aligned;
185 vaddr_t uvm_zerocheckkva;
186 #endif /* DEBUG */
187 
188 /*
189  * These functions are reserved for uvm(9) internal use and are not
190  * exported in the header file uvm_physseg.h
191  *
192  * Thus they are redefined here.
193  */
194 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
195 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
196 
197 /* returns a pgs array */
198 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
199 
200 /*
201  * inline functions
202  */
203 
204 /*
205  * uvm_pageinsert_object: insert a page in the object.
206  *
207  * => caller must lock object
208  * => caller should have already set pg's object and offset pointers
209  *    and bumped the version counter
210  */
211 
212 static inline void
213 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
214 {
215 
216 	KASSERT(uobj == pg->uobject);
217 	KASSERT(rw_write_held(uobj->vmobjlock));
218 	KASSERT((pg->flags & PG_TABLED) == 0);
219 
220 	if ((pg->flags & PG_STAT) != 0) {
221 		/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
222 		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
223 		const bool isaobj = (pg->flags & PG_AOBJ) != 0;
224 
225 		if (!isaobj) {
226 			KASSERT((pg->flags & PG_FILE) != 0);
227 			if (uobj->uo_npages == 0) {
228 				struct vnode *vp = (struct vnode *)uobj;
229 				mutex_enter(vp->v_interlock);
230 				KASSERT((vp->v_iflag & VI_PAGES) == 0);
231 				vp->v_iflag |= VI_PAGES;
232 				vholdl(vp);
233 				mutex_exit(vp->v_interlock);
234 			}
235 			kpreempt_disable();
236 			if (UVM_OBJ_IS_VTEXT(uobj)) {
237 				CPU_COUNT(CPU_COUNT_EXECPAGES, 1);
238 			} else {
239 				CPU_COUNT(CPU_COUNT_FILEPAGES, 1);
240 			}
241 			CPU_COUNT(CPU_COUNT_FILEUNKNOWN + status, 1);
242 		} else {
243 			kpreempt_disable();
244 			CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
245 			CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, 1);
246 		}
247 		kpreempt_enable();
248 	}
249 	pg->flags |= PG_TABLED;
250 	uobj->uo_npages++;
251 }
252 
253 static inline int
254 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
255 {
256 	const uint64_t idx = pg->offset >> PAGE_SHIFT;
257 	int error;
258 
259 	error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
260 	if (error != 0) {
261 		return error;
262 	}
263 	if ((pg->flags & PG_CLEAN) == 0) {
264 		radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG);
265 	}
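	/*
	 * Editor's note: the radix tree dirty tag mirrors the page's
	 * PG_CLEAN state so that dirty pages of an object can be found
	 * without walking every page.
	 */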
266 	KASSERT(((pg->flags & PG_CLEAN) == 0) ==
267 	    radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG));
268 	return 0;
269 }
270 
271 /*
272  * uvm_pageremove_object: remove page from object.
273  *
274  * => caller must lock object
275  */
276 
277 static inline void
278 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
279 {
280 
281 	KASSERT(uobj == pg->uobject);
282 	KASSERT(rw_write_held(uobj->vmobjlock));
283 	KASSERT(pg->flags & PG_TABLED);
284 
285 	if ((pg->flags & PG_STAT) != 0) {
286 		/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
287 		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
288 		const bool isaobj = (pg->flags & PG_AOBJ) != 0;
289 
290 		if (!isaobj) {
291 			KASSERT((pg->flags & PG_FILE) != 0);
292 			if (uobj->uo_npages == 1) {
293 				struct vnode *vp = (struct vnode *)uobj;
294 				mutex_enter(vp->v_interlock);
295 				KASSERT((vp->v_iflag & VI_PAGES) != 0);
296 				vp->v_iflag &= ~VI_PAGES;
297 				holdrelel(vp);
298 				mutex_exit(vp->v_interlock);
299 			}
300 			kpreempt_disable();
301 			if (UVM_OBJ_IS_VTEXT(uobj)) {
302 				CPU_COUNT(CPU_COUNT_EXECPAGES, -1);
303 			} else {
304 				CPU_COUNT(CPU_COUNT_FILEPAGES, -1);
305 			}
306 			CPU_COUNT(CPU_COUNT_FILEUNKNOWN + status, -1);
307 		} else {
308 			kpreempt_disable();
309 			CPU_COUNT(CPU_COUNT_ANONPAGES, -1);
310 			CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, -1);
311 		}
312 		kpreempt_enable();
313 	}
314 	uobj->uo_npages--;
315 	pg->flags &= ~PG_TABLED;
316 	pg->uobject = NULL;
317 }
318 
319 static inline void
320 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
321 {
322 	struct vm_page *opg __unused;
323 
324 	opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
325 	KASSERT(pg == opg);
326 }
327 
328 static void
329 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
330 {
331 	int i;
332 
333 	pgb->pgb_nfree = 0;
334 	for (i = 0; i < uvmexp.ncolors; i++) {
335 		LIST_INIT(&pgb->pgb_colors[i]);
336 	}
337 	pgfl->pgfl_buckets[num] = pgb;
338 }
339 
340 /*
341  * uvm_page_init: init the page system.   called from uvm_init().
342  *
343  * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
344  */
345 
346 void
347 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
348 {
349 	static struct uvm_cpu boot_cpu __cacheline_aligned;
350 	psize_t freepages, pagecount, bucketsize, n;
351 	struct pgflbucket *pgb;
352 	struct vm_page *pagearray;
353 	char *bucketarray;
354 	uvm_physseg_t bank;
355 	int fl, b;
356 
357 	KASSERT(ncpu <= 1);
358 
359 	/*
360 	 * init the page queues and free page queue locks, except the
361 	 * free list; we allocate that later (with the initial vm_page
362 	 * structures).
363 	 */
364 
365 	curcpu()->ci_data.cpu_uvm = &boot_cpu;
366 	uvmpdpol_init();
367 	for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
368 		mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
369 	}
370 
371 	/*
372 	 * allocate vm_page structures.
373 	 */
374 
375 	/*
376 	 * sanity check:
377 	 * before calling this function the MD code is expected to register
378 	 * some free RAM with the uvm_page_physload() function.   our job
379 	 * now is to allocate vm_page structures for this memory.
380 	 */
381 
382 	if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
383 		panic("uvm_page_init: no memory pre-allocated");
384 
385 	/*
386 	 * first calculate the number of free pages...
387 	 *
388 	 * note that we use start/end rather than avail_start/avail_end.
389 	 * this allows us to allocate extra vm_page structures in case we
390 	 * want to return some memory to the pool after booting.
391 	 */
392 
393 	freepages = 0;
394 
395 	for (bank = uvm_physseg_get_first();
396 	     uvm_physseg_valid_p(bank) ;
397 	     bank = uvm_physseg_get_next(bank)) {
398 		freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
399 	}
400 
401 	/*
402 	 * Let MD code initialize the number of colors, or default
403 	 * to 1 color if MD code doesn't care.
404 	 */
405 	if (uvmexp.ncolors == 0)
406 		uvmexp.ncolors = 1;
407 	uvmexp.colormask = uvmexp.ncolors - 1;
408 	KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
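	/*
	 * Editor's note: e.g. ncolors == 8 yields colormask == 0x7; the
	 * KASSERT above fires if ncolors is not a power of two.
	 */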
409 
410 	/* We always start with only 1 bucket. */
411 	uvm.bucketcount = 1;
412 
413 	/*
414 	 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
415 	 * use.   for each page of memory we use we need a vm_page structure.
416 	 * thus, the total number of pages we can use is the total size of
417 	 * the memory divided by the sum of PAGE_SIZE and the size of the
418 	 * vm_page structure.   we add one to freepages as a fudge factor to
419 	 * avoid truncation errors (since we can only allocate in terms of
420 	 * whole pages).
421 	 */
422 	pagecount = ((freepages + 1) << PAGE_SHIFT) /
423 	    (PAGE_SIZE + sizeof(struct vm_page));
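	/*
	 * Editor's illustration (assumed sizes): with 4 KiB pages and a
	 * vm_page structure of roughly 128 bytes, each managed page costs
	 * about 4224 bytes, so roughly 97% of the registered memory remains
	 * usable as pages; the rest holds the vm_page array itself.
	 */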
424 	bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
425 	bucketsize = roundup2(bucketsize, coherency_unit);
426 	bucketarray = (void *)uvm_pageboot_alloc(
427 	    bucketsize * VM_NFREELIST +
428 	    pagecount * sizeof(struct vm_page));
429 	pagearray = (struct vm_page *)
430 	    (bucketarray + bucketsize * VM_NFREELIST);
431 
432 	for (fl = 0; fl < VM_NFREELIST; fl++) {
433 		pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
434 		uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
435 	}
436 	memset(pagearray, 0, pagecount * sizeof(struct vm_page));
437 
438 	/*
439 	 * init the freelist cache in the disabled state.
440 	 */
441 	uvm_pgflcache_init();
442 
443 	/*
444 	 * init the vm_page structures and put them in the correct place.
445 	 */
446 	/* First init the extent */
447 
448 	for (bank = uvm_physseg_get_first(),
449 		 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
450 	     uvm_physseg_valid_p(bank);
451 	     bank = uvm_physseg_get_next(bank)) {
452 
453 		n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
454 		uvm_physseg_seg_alloc_from_slab(bank, n);
455 		uvm_physseg_init_seg(bank, pagearray);
456 
457 		/* set up page array pointers */
458 		pagearray += n;
459 		pagecount -= n;
460 	}
461 
462 	/*
463 	 * pass up the values of virtual_space_start and
464 	 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
465 	 * layers of the VM.
466 	 */
467 
468 	*kvm_startp = round_page(virtual_space_start);
469 	*kvm_endp = trunc_page(virtual_space_end);
470 #ifdef DEBUG
471 	/*
472 	 * steal kva for uvm_pagezerocheck().
473 	 */
474 	uvm_zerocheckkva = *kvm_startp;
475 	*kvm_startp += PAGE_SIZE;
476 	mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
477 #endif /* DEBUG */
478 
479 	/*
480 	 * init various thresholds.
481 	 */
482 
483 	uvmexp.reserve_pagedaemon = 1;
484 	uvmexp.reserve_kernel = vm_page_reserve_kernel;
485 
486 	/*
487 	 * done!
488 	 */
489 
490 	uvm.page_init_done = true;
491 }
492 
493 /*
494  * uvm_pgfl_lock: lock all freelist buckets
495  */
496 
497 void
498 uvm_pgfl_lock(void)
499 {
500 	int i;
501 
502 	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
503 		mutex_spin_enter(&uvm_freelist_locks[i].lock);
504 	}
505 }
506 
507 /*
508  * uvm_pgfl_unlock: unlock all freelist buckets
509  */
510 
511 void
512 uvm_pgfl_unlock(void)
513 {
514 	int i;
515 
516 	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
517 		mutex_spin_exit(&uvm_freelist_locks[i].lock);
518 	}
519 }
520 
521 /*
522  * uvm_setpagesize: set the page size
523  *
524  * => sets page_shift and page_mask from uvmexp.pagesize.
525  */
526 
527 void
528 uvm_setpagesize(void)
529 {
530 
531 	/*
532 	 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
533 	 * to be a constant (indicated by being a non-zero value).
534 	 */
535 	if (uvmexp.pagesize == 0) {
536 		if (PAGE_SIZE == 0)
537 			panic("uvm_setpagesize: uvmexp.pagesize not set");
538 		uvmexp.pagesize = PAGE_SIZE;
539 	}
540 	uvmexp.pagemask = uvmexp.pagesize - 1;
541 	if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
542 		panic("uvm_setpagesize: page size %u (%#x) not a power of two",
543 		    uvmexp.pagesize, uvmexp.pagesize);
544 	for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
545 		if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
546 			break;
547 }
548 
549 /*
550  * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
551  */
552 
553 vaddr_t
554 uvm_pageboot_alloc(vsize_t size)
555 {
556 	static bool initialized = false;
557 	vaddr_t addr;
558 #if !defined(PMAP_STEAL_MEMORY)
559 	vaddr_t vaddr;
560 	paddr_t paddr;
561 #endif
562 
563 	/*
564 	 * on first call to this function, initialize ourselves.
565 	 */
566 	if (initialized == false) {
567 		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
568 
569 		/* round it the way we like it */
570 		virtual_space_start = round_page(virtual_space_start);
571 		virtual_space_end = trunc_page(virtual_space_end);
572 
573 		initialized = true;
574 	}
575 
576 	/* round to page size */
577 	size = round_page(size);
578 	uvmexp.bootpages += atop(size);
579 
580 #if defined(PMAP_STEAL_MEMORY)
581 
582 	/*
583 	 * defer bootstrap allocation to MD code (it may want to allocate
584 	 * from a direct-mapped segment).  pmap_steal_memory should adjust
585 	 * virtual_space_start/virtual_space_end if necessary.
586 	 */
587 
588 	addr = pmap_steal_memory(size, &virtual_space_start,
589 	    &virtual_space_end);
590 
591 	return(addr);
592 
593 #else /* !PMAP_STEAL_MEMORY */
594 
595 	/*
596 	 * allocate virtual memory for this request
597 	 */
598 	if (virtual_space_start == virtual_space_end ||
599 	    (virtual_space_end - virtual_space_start) < size)
600 		panic("uvm_pageboot_alloc: out of virtual space");
601 
602 	addr = virtual_space_start;
603 
604 #ifdef PMAP_GROWKERNEL
605 	/*
606 	 * If the kernel pmap can't map the requested space,
607 	 * then allocate more resources for it.
608 	 */
609 	if (uvm_maxkaddr < (addr + size)) {
610 		uvm_maxkaddr = pmap_growkernel(addr + size);
611 		if (uvm_maxkaddr < (addr + size))
612 			panic("uvm_pageboot_alloc: pmap_growkernel() failed");
613 	}
614 #endif
615 
616 	virtual_space_start += size;
617 
618 	/*
619 	 * allocate and mapin physical pages to back new virtual pages
620 	 */
621 
622 	for (vaddr = round_page(addr) ; vaddr < addr + size ;
623 	    vaddr += PAGE_SIZE) {
624 
625 		if (!uvm_page_physget(&paddr))
626 			panic("uvm_pageboot_alloc: out of memory");
627 
628 		/*
629 		 * Note this memory is no longer managed, so using
630 		 * pmap_kenter is safe.
631 		 */
632 		pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
633 	}
634 	pmap_update(pmap_kernel());
635 	return(addr);
636 #endif	/* PMAP_STEAL_MEMORY */
637 }
638 
639 #if !defined(PMAP_STEAL_MEMORY)
640 /*
641  * uvm_page_physget: "steal" one page from the vm_physmem structure.
642  *
643  * => attempt to allocate it off the end of a segment in which the "avail"
644  *    values match the start/end values.   if we can't do that, then we
645  *    will advance both values (making them equal, and removing some
646  *    vm_page structures from the non-avail area).
647  * => return false if out of memory.
648  */
649 
650 /* subroutine: try to allocate from memory chunks on the specified freelist */
651 static bool uvm_page_physget_freelist(paddr_t *, int);
652 
653 static bool
654 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
655 {
656 	uvm_physseg_t lcv;
657 
658 	/* pass 1: try allocating from a matching end */
659 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
660 	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
661 #else
662 	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
663 #endif
664 	{
665 		if (uvm.page_init_done == true)
666 			panic("uvm_page_physget: called _after_ bootstrap");
667 
668 		/* Try to match at front or back on unused segment */
669 		if (uvm_page_physunload(lcv, freelist, paddrp))
670 			return true;
671 	}
672 
673 	/* pass 2: forget about matching ends, just allocate something */
674 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
675 	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
676 #else
677 	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
678 #endif
679 	{
680 		/* Try the front regardless. */
681 		if (uvm_page_physunload_force(lcv, freelist, paddrp))
682 			return true;
683 	}
684 	return false;
685 }
686 
687 bool
688 uvm_page_physget(paddr_t *paddrp)
689 {
690 	int i;
691 
692 	/* try in the order of freelist preference */
693 	for (i = 0; i < VM_NFREELIST; i++)
694 		if (uvm_page_physget_freelist(paddrp, i) == true)
695 			return (true);
696 	return (false);
697 }
698 #endif /* PMAP_STEAL_MEMORY */
699 
700 /*
701  * PHYS_TO_VM_PAGE: find vm_page for a PA.   used by MI code to get vm_pages
702  * back from an I/O mapping (ugh!).   used in some MD code as well.
703  */
704 struct vm_page *
705 uvm_phys_to_vm_page(paddr_t pa)
706 {
707 	paddr_t pf = atop(pa);
708 	paddr_t	off;
709 	uvm_physseg_t	upm;
710 
711 	upm = uvm_physseg_find(pf, &off);
712 	if (upm != UVM_PHYSSEG_TYPE_INVALID)
713 		return uvm_physseg_get_pg(upm, off);
714 	return(NULL);
715 }
716 
717 paddr_t
718 uvm_vm_page_to_phys(const struct vm_page *pg)
719 {
720 
721 	return pg->phys_addr & ~(PAGE_SIZE - 1);
722 }
723 
724 /*
725  * uvm_page_numa_load: load NUMA range description.
726  */
727 void
728 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
729 {
730 	struct uvm_page_numa_region *d;
731 
732 	KASSERT(numa_id < PGFL_MAX_BUCKETS);
733 
734 	d = kmem_alloc(sizeof(*d), KM_SLEEP);
735 	d->start = start;
736 	d->size = size;
737 	d->numa_id = numa_id;
738 	d->next = uvm_page_numa_region;
739 	uvm_page_numa_region = d;
740 }
741 
742 /*
743  * uvm_page_numa_lookup: lookup NUMA node for the given page.
744  */
745 static u_int
746 uvm_page_numa_lookup(struct vm_page *pg)
747 {
748 	struct uvm_page_numa_region *d;
749 	static bool warned;
750 	paddr_t pa;
751 
752 	KASSERT(uvm.numa_alloc);
753 	KASSERT(uvm_page_numa_region != NULL);
754 
755 	pa = VM_PAGE_TO_PHYS(pg);
756 	for (d = uvm_page_numa_region; d != NULL; d = d->next) {
757 		if (pa >= d->start && pa < d->start + d->size) {
758 			return d->numa_id;
759 		}
760 	}
761 
762 	if (!warned) {
763 		printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
764 		    PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
765 		warned = true;
766 	}
767 
768 	return 0;
769 }
770 
771 /*
772  * uvm_page_redim: adjust freelist dimensions if they have changed.
773  */
774 
775 static void
776 uvm_page_redim(int newncolors, int newnbuckets)
777 {
778 	struct pgfreelist npgfl;
779 	struct pgflbucket *opgb, *npgb;
780 	struct pgflist *ohead, *nhead;
781 	struct vm_page *pg;
782 	size_t bucketsize, bucketmemsize, oldbucketmemsize;
783 	int fl, ob, oc, nb, nc, obuckets, ocolors;
784 	char *bucketarray, *oldbucketmem, *bucketmem;
785 
786 	KASSERT(((newncolors - 1) & newncolors) == 0);
787 
788 	/* Anything to do? */
789 	if (newncolors <= uvmexp.ncolors &&
790 	    newnbuckets == uvm.bucketcount) {
791 		return;
792 	}
793 	if (uvm.page_init_done == false) {
794 		uvmexp.ncolors = newncolors;
795 		return;
796 	}
797 
798 	bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
799 	bucketsize = roundup2(bucketsize, coherency_unit);
800 	bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
801 	    coherency_unit - 1;
802 	bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
803 	bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
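	/*
	 * Editor's note: the allocation above is padded by coherency_unit - 1
	 * bytes so that bucketarray can be rounded up to a cache line
	 * boundary inside bucketmem; bucketmem is the pointer that is
	 * eventually freed.
	 */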
804 
805 	ocolors = uvmexp.ncolors;
806 	obuckets = uvm.bucketcount;
807 
808 	/* Freelist cache must not be enabled. */
809 	uvm_pgflcache_pause();
810 
811 	/* Make sure we should still do this. */
812 	uvm_pgfl_lock();
813 	if (newncolors <= uvmexp.ncolors &&
814 	    newnbuckets == uvm.bucketcount) {
815 		uvm_pgfl_unlock();
816 		uvm_pgflcache_resume();
817 		kmem_free(bucketmem, bucketmemsize);
818 		return;
819 	}
820 
821 	uvmexp.ncolors = newncolors;
822 	uvmexp.colormask = uvmexp.ncolors - 1;
823 	uvm.bucketcount = newnbuckets;
824 
825 	for (fl = 0; fl < VM_NFREELIST; fl++) {
826 		/* Init new buckets in new freelist. */
827 		memset(&npgfl, 0, sizeof(npgfl));
828 		for (nb = 0; nb < newnbuckets; nb++) {
829 			npgb = (struct pgflbucket *)bucketarray;
830 			uvm_page_init_bucket(&npgfl, npgb, nb);
831 			bucketarray += bucketsize;
832 		}
833 		/* Now transfer pages from the old freelist. */
834 		for (nb = ob = 0; ob < obuckets; ob++) {
835 			opgb = uvm.page_free[fl].pgfl_buckets[ob];
836 			for (oc = 0; oc < ocolors; oc++) {
837 				ohead = &opgb->pgb_colors[oc];
838 				while ((pg = LIST_FIRST(ohead)) != NULL) {
839 					LIST_REMOVE(pg, pageq.list);
840 					/*
841 					 * Here we decide on the NEW color &
842 					 * bucket for the page.  For NUMA
843 					 * we'll use the info that the
844 					 * hardware gave us.  For non-NUMA
845 					 * we take the physical page frame
846 					 * number and cache color into
847 					 * account.  We do this to try and
848 					 * avoid defeating any memory
849 					 * interleaving in the hardware.
850 					 */
851 					KASSERT(
852 					    uvm_page_get_bucket(pg) == ob);
853 					KASSERT(fl ==
854 					    uvm_page_get_freelist(pg));
855 					if (uvm.numa_alloc) {
856 						nb = uvm_page_numa_lookup(pg);
857 					} else {
858 						nb = atop(VM_PAGE_TO_PHYS(pg))
859 						    / uvmexp.ncolors / 8
860 						    % newnbuckets;
861 					}
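					/*
					 * Editor's illustration: with the
					 * formula above, runs of
					 * uvmexp.ncolors * 8 consecutive
					 * page frames share a bucket
					 * before rotating to the next.
					 */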
862 					uvm_page_set_bucket(pg, nb);
863 					npgb = npgfl.pgfl_buckets[nb];
864 					npgb->pgb_nfree++;
865 					nc = VM_PGCOLOR(pg);
866 					nhead = &npgb->pgb_colors[nc];
867 					LIST_INSERT_HEAD(nhead, pg, pageq.list);
868 				}
869 			}
870 		}
871 		/* Install the new freelist. */
872 		memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
873 	}
874 
875 	/* Unlock and free the old memory. */
876 	oldbucketmemsize = recolored_pages_memsize;
877 	oldbucketmem = recolored_pages_mem;
878 	recolored_pages_memsize = bucketmemsize;
879 	recolored_pages_mem = bucketmem;
880 
881 	uvm_pgfl_unlock();
882 	uvm_pgflcache_resume();
883 
884 	if (oldbucketmemsize) {
885 		kmem_free(oldbucketmem, oldbucketmemsize);
886 	}
887 
888 	/*
889 	 * this calls uvm_km_alloc() which may want to hold
890 	 * uvm_freelist_lock.
891 	 */
892 	uvm_pager_realloc_emerg();
893 }
894 
895 /*
896  * uvm_page_recolor: Recolor the pages if the new color count is
897  * larger than the old one.
898  */
899 
900 void
901 uvm_page_recolor(int newncolors)
902 {
903 
904 	uvm_page_redim(newncolors, uvm.bucketcount);
905 }
906 
907 /*
908  * uvm_page_rebucket: Determine a bucket structure and redim the free
909  * lists to match.
910  */
911 
912 void
913 uvm_page_rebucket(void)
914 {
915 	u_int min_numa, max_numa, npackage, shift;
916 	struct cpu_info *ci, *ci2, *ci3;
917 	CPU_INFO_ITERATOR cii;
918 
919 	/*
920 	 * If we have more than one NUMA node, and the maximum NUMA node ID
921 	 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
922 	 * for free pages.  uvm_pagefree() will not reassign pages to a
923 	 * different bucket on free.
924 	 */
925 	min_numa = (u_int)-1;
926 	max_numa = 0;
927 	for (CPU_INFO_FOREACH(cii, ci)) {
928 		if (ci->ci_numa_id < min_numa) {
929 			min_numa = ci->ci_numa_id;
930 		}
931 		if (ci->ci_numa_id > max_numa) {
932 			max_numa = ci->ci_numa_id;
933 		}
934 	}
935 	if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
936 #ifdef NUMA
937 		/*
938 		 * We can do this, and it seems to work well, but until
939 		 * further experiments are done we'll stick with the cache
940 		 * locality strategy.
941 		 */
942 		aprint_debug("UVM: using NUMA allocation scheme\n");
943 		for (CPU_INFO_FOREACH(cii, ci)) {
944 			ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
945 		}
946 		uvm.numa_alloc = true;
947 	 	uvm_page_redim(uvmexp.ncolors, max_numa + 1);
948 	 	return;
949 #endif
950 	}
951 
952 	/*
953 	 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
954 	 * and minimise lock contention.  Count the total number of CPU
955 	 * packages, and then try to distribute the buckets among CPU
956 	 * packages evenly.  uvm_pagefree() will reassign pages to the
957 	 * freeing CPU's preferred bucket on free.
958 	 */
959 	npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
960 
961 	/*
962 	 * Figure out how to arrange the packages & buckets, and the total
963 	 * number of buckets we need.  XXX 2 may not be the best factor.
964 	 */
965 	for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
966 		npackage >>= 1;
967 	}
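	/*
	 * Editor's illustration (assumed numbers): with 32 CPU packages and
	 * PGFL_MAX_BUCKETS == 8, the loop above ends with shift == 2 and
	 * npackage == 8, i.e. 1 << shift == 4 packages share each bucket.
	 */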
968  	uvm_page_redim(uvmexp.ncolors, npackage);
969 
970  	/*
971  	 * Now tell each CPU which bucket to use.  In the outer loop, scroll
972  	 * through all CPU packages.
973  	 */
974  	npackage = 0;
975 	ci = curcpu();
976 	ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
977 	do {
978 		/*
979 		 * In the inner loop, scroll through all CPUs in the package
980 		 * and assign the same bucket ID.
981 		 */
982 		ci3 = ci2;
983 		do {
984 			ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
985 			ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
986 		} while (ci3 != ci2);
987 		npackage++;
988 		ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
989 	} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
990 
991 	aprint_debug("UVM: using package allocation scheme, "
992 	    "%d package(s) per bucket\n", 1 << shift);
993 }
994 
995 /*
996  * uvm_cpu_attach: initialize per-CPU data structures.
997  */
998 
999 void
1000 uvm_cpu_attach(struct cpu_info *ci)
1001 {
1002 	struct uvm_cpu *ucpu;
1003 
1004 	/* Already done in uvm_page_init(). */
1005 	if (!CPU_IS_PRIMARY(ci)) {
1006 		/* Add more reserve pages for this CPU. */
1007 		uvmexp.reserve_kernel += vm_page_reserve_kernel;
1008 
1009 		/* Allocate per-CPU data structures. */
1010 		ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
1011 		    KM_SLEEP);
1012 		ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
1013 		    coherency_unit);
1014 		ci->ci_data.cpu_uvm = ucpu;
1015 	} else {
1016 		ucpu = ci->ci_data.cpu_uvm;
1017 	}
1018 
1019 	uvmpdpol_init_cpu(ucpu);
1020 
1021 	/*
1022 	 * Attach RNG source for this CPU's VM events
1023 	 */
1024 	rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
1025 	    RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
1026 	    RND_FLAG_ESTIMATE_VALUE);
1027 }
1028 
1029 /*
1030  * uvm_availmem: fetch the total amount of free memory in pages.  this can
1031  * have a detrimental effect on performance due to false sharing; don't call
1032  * unless needed.
1033  */
1034 
1035 int
1036 uvm_availmem(void)
1037 {
1038 	struct pgfreelist *pgfl;
1039 	int fl, b, fpages;
1040 
1041 	fpages = 0;
1042 	for (fl = 0; fl < VM_NFREELIST; fl++) {
1043 		pgfl = &uvm.page_free[fl];
1044 		for (b = 0; b < uvm.bucketcount; b++) {
1045 			fpages += pgfl->pgfl_buckets[b]->pgb_nfree;
1046 		}
1047 	}
1048 	return fpages;
1049 }
1050 
1051 /*
1052  * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1053  * specific freelist and specific bucket only.
1054  *
1055  * => must be at IPL_VM or higher to protect per-CPU data structures.
1056  */
1057 
1058 static struct vm_page *
1059 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1060 {
1061 	int c, trycolor, colormask;
1062 	struct pgflbucket *pgb;
1063 	struct vm_page *pg;
1064 	kmutex_t *lock;
1065 	bool fill;
1066 
1067 	/*
1068 	 * Skip the bucket if empty, no lock needed.  There could be many
1069 	 * empty freelists/buckets.
1070 	 */
1071 	pgb = uvm.page_free[f].pgfl_buckets[b];
1072 	if (pgb->pgb_nfree == 0) {
1073 		return NULL;
1074 	}
1075 
1076 	/* Skip bucket if low on memory. */
1077 	lock = &uvm_freelist_locks[b].lock;
1078 	mutex_spin_enter(lock);
1079 	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1080 		if ((flags & UVM_PGA_USERESERVE) == 0 ||
1081 		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1082 		     curlwp != uvm.pagedaemon_lwp)) {
1083 			mutex_spin_exit(lock);
1084 			return NULL;
1085 		}
1086 		fill = false;
1087 	} else {
1088 		fill = true;
1089 	}
1090 
1091 	/* Try all page colors as needed. */
1092 	c = trycolor = *trycolorp;
1093 	colormask = uvmexp.colormask;
1094 	do {
1095 		pg = LIST_FIRST(&pgb->pgb_colors[c]);
1096 		if (__predict_true(pg != NULL)) {
1097 			/*
1098 			 * Got a free page!  PG_FREE must be cleared under
1099 			 * lock because of uvm_pglistalloc().
1100 			 */
1101 			LIST_REMOVE(pg, pageq.list);
1102 			KASSERT(pg->flags & PG_FREE);
1103 			pg->flags &= PG_ZERO;
1104 			pgb->pgb_nfree--;
1105 
1106 			/*
1107 			 * While we have the bucket locked and our data
1108 			 * structures fresh in L1 cache, we have an ideal
1109 			 * opportunity to grab some pages for the freelist
1110 			 * cache without causing extra contention.  Only do
1111 			 * so if we found pages in this CPU's preferred
1112 			 * bucket.
1113 			 */
1114 			if (__predict_true(b == ucpu->pgflbucket && fill)) {
1115 				uvm_pgflcache_fill(ucpu, f, b, c);
1116 			}
1117 			mutex_spin_exit(lock);
1118 			KASSERT(uvm_page_get_bucket(pg) == b);
1119 			CPU_COUNT(c == trycolor ?
1120 			    CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1121 			CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1122 			*trycolorp = c;
1123 			return pg;
1124 		}
1125 		c = (c + 1) & colormask;
1126 	} while (c != trycolor);
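	/* No page of any color was found in this bucket; drop the lock. */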
1127 	mutex_spin_exit(lock);
1128 
1129 	return NULL;
1130 }
1131 
1132 /*
1133  * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1134  * any color from any bucket, in a specific freelist.
1135  *
1136  * => must be at IPL_VM or higher to protect per-CPU data structures.
1137  */
1138 
1139 static struct vm_page *
1140 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1141 {
1142 	int b, trybucket, bucketcount;
1143 	struct vm_page *pg;
1144 
1145 	/* Try for the exact thing in the per-CPU cache. */
1146 	if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1147 		CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1148 		CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1149 		return pg;
1150 	}
1151 
1152 	/* Walk through all buckets, trying our preferred bucket first. */
1153 	trybucket = ucpu->pgflbucket;
1154 	b = trybucket;
1155 	bucketcount = uvm.bucketcount;
1156 	do {
1157 		pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1158 		if (pg != NULL) {
1159 			return pg;
1160 		}
1161 		b = (b + 1 == bucketcount ? 0 : b + 1);
1162 	} while (b != trybucket);
1163 
1164 	return NULL;
1165 }
1166 
1167 /*
1168  * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1169  *
1170  * => return null if no pages free
1171  * => wake up pagedaemon if number of free pages drops below low water mark
1172  * => if obj != NULL, obj must be locked (to put in obj's tree)
1173  * => if anon != NULL, anon must be locked (to put in anon)
1174  * => only one of obj or anon can be non-null
1175  * => caller must activate/deactivate page if it is not wired.
1176  * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1177  * => policy decision: it is more important to pull a page off of the
1178  *	appropriate priority free list than it is to get a zero'd or
1179  *	unknown contents page.  This is because we live with the
1180  *	consequences of a bad free list decision for the entire
1181  *	lifetime of the page, e.g. if the page comes from memory that
1182  *	is slower to access.
1183  */
1184 
1185 struct vm_page *
1186 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1187     int flags, int strat, int free_list)
1188 {
1189 	int zeroit = 0, color;
1190 	int lcv, error, s;
1191 	struct uvm_cpu *ucpu;
1192 	struct vm_page *pg;
1193 	lwp_t *l;
1194 
1195 	KASSERT(obj == NULL || anon == NULL);
1196 	KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1197 	KASSERT(off == trunc_page(off));
1198 	KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1199 	KASSERT(anon == NULL || anon->an_lock == NULL ||
1200 	    rw_write_held(anon->an_lock));
1201 
1202 	/*
1203 	 * This implements a global round-robin page coloring
1204 	 * algorithm.
1205 	 */
1206 
1207 	s = splvm();
1208 	ucpu = curcpu()->ci_data.cpu_uvm;
1209 	if (flags & UVM_FLAG_COLORMATCH) {
1210 		color = atop(off) & uvmexp.colormask;
1211 	} else {
1212 		color = ucpu->pgflcolor;
1213 	}
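	/*
	 * Editor's illustration (assumed sizes): with UVM_FLAG_COLORMATCH,
	 * 16 colors and 4 KiB pages, an offset of 0x5000 requests color
	 * atop(0x5000) & 0xf == 5; otherwise the per-CPU round-robin color
	 * is used.
	 */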
1214 
1215 	/*
1216 	 * fail if any of these conditions is true:
1217 	 * [1]  there really are no free pages, or
1218 	 * [2]  only kernel "reserved" pages remain and
1219 	 *        reserved pages have not been requested, or
1220 	 * [3]  only pagedaemon "reserved" pages remain and
1221 	 *        the requestor isn't the pagedaemon.
1222 	 * we make kernel reserve pages available if called by a
1223 	 * kernel thread or a realtime thread.
1224 	 */
1225 	l = curlwp;
1226 	if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
1227 		flags |= UVM_PGA_USERESERVE;
1228 	}
1229 
1230 	/* If the allocator's running in NUMA mode, go with NUMA strategy. */
1231 	if (uvm.numa_alloc && strat == UVM_PGA_STRAT_NORMAL) {
1232 		strat = UVM_PGA_STRAT_NUMA;
1233 	}
1234 
1235  again:
1236 	switch (strat) {
1237 	case UVM_PGA_STRAT_NORMAL:
1238 		/* Check freelists: descending priority (ascending id) order. */
1239 		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1240 			pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1241 			if (pg != NULL) {
1242 				goto gotit;
1243 			}
1244 		}
1245 
1246 		/* No pages free!  Have pagedaemon free some memory. */
1247 		splx(s);
1248 		uvm_kick_pdaemon();
1249 		return NULL;
1250 
1251 	case UVM_PGA_STRAT_ONLY:
1252 	case UVM_PGA_STRAT_FALLBACK:
1253 		/* Attempt to allocate from the specified free list. */
1254 		KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1255 		pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1256 		if (pg != NULL) {
1257 			goto gotit;
1258 		}
1259 
1260 		/* Fall back, if possible. */
1261 		if (strat == UVM_PGA_STRAT_FALLBACK) {
1262 			strat = UVM_PGA_STRAT_NORMAL;
1263 			goto again;
1264 		}
1265 
1266 		/* No pages free!  Have pagedaemon free some memory. */
1267 		splx(s);
1268 		uvm_kick_pdaemon();
1269 		return NULL;
1270 
1271 	case UVM_PGA_STRAT_NUMA:
1272 		/*
1273 		 * NUMA strategy: allocating from the correct bucket is more
1274 		 * important than observing freelist priority.  Look only to
1275 		 * the current NUMA node; if that fails, we need to look to
1276 		 * other NUMA nodes, so retry with the normal strategy.
1277 		 */
1278 		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1279 			pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1280 			if (pg != NULL) {
1281 				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1282 				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1283 				goto gotit;
1284 			}
1285 			pg = uvm_pagealloc_pgb(ucpu, lcv,
1286 			    ucpu->pgflbucket, &color, flags);
1287 			if (pg != NULL) {
1288 				goto gotit;
1289 			}
1290 		}
1291 		strat = UVM_PGA_STRAT_NORMAL;
1292 		goto again;
1293 
1294 	default:
1295 		panic("uvm_pagealloc_strat: bad strat %d", strat);
1296 		/* NOTREACHED */
1297 	}
1298 
1299  gotit:
1300 	/*
1301 	 * We now know which color we actually allocated from; set
1302 	 * the next color accordingly.
1303 	 */
1304 
1305 	ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1306 
1307 	/*
1308 	 * while still at IPL_VM, update allocation statistics and remember
1309 	 * if we have to zero the page
1310 	 */
1311 
1312 	if (flags & UVM_PGA_ZERO) {
1313 		if (pg->flags & PG_ZERO) {
1314 		    	CPU_COUNT(CPU_COUNT_PGA_ZEROHIT, 1);
1315 			zeroit = 0;
1316 		} else {
1317 		    	CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
1318 			zeroit = 1;
1319 		}
1320 	}
1321 	if (pg->flags & PG_ZERO) {
1322 	    	CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
1323 	}
1324 	if (anon) {
1325 		CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
1326 		CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1327 	}
1328 	splx(s);
1329 	KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
1330 
1331 	/*
1332 	 * assign the page to the object.  as the page was free, we know
1333 	 * that pg->uobject and pg->uanon are NULL.  we only need to take
1334 	 * the page's interlock if we are changing the values.
1335 	 */
1336 	if (anon != NULL || obj != NULL) {
1337 		mutex_enter(&pg->interlock);
1338 	}
1339 	pg->offset = off;
1340 	pg->uobject = obj;
1341 	pg->uanon = anon;
1342 	KASSERT(uvm_page_owner_locked_p(pg, true));
1343 	pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
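	/*
	 * Editor's note: PG_FAKE marks the contents as not yet valid and
	 * PG_BUSY keeps other threads off the page until the caller has
	 * finished initializing it.
	 */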
1344 	if (anon) {
1345 		anon->an_page = pg;
1346 		pg->flags |= PG_ANON;
1347 		mutex_exit(&pg->interlock);
1348 	} else if (obj) {
1349 		/*
1350 		 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1351 		 */
1352 		if (UVM_OBJ_IS_VNODE(obj)) {
1353 			pg->flags |= PG_FILE;
1354 		} else {
1355 			pg->flags |= PG_AOBJ;
1356 		}
1357 		uvm_pageinsert_object(obj, pg);
1358 		mutex_exit(&pg->interlock);
1359 		error = uvm_pageinsert_tree(obj, pg);
1360 		if (error != 0) {
1361 			mutex_enter(&pg->interlock);
1362 			uvm_pageremove_object(obj, pg);
1363 			mutex_exit(&pg->interlock);
1364 			uvm_pagefree(pg);
1365 			return NULL;
1366 		}
1367 	}
1368 
1369 #if defined(UVM_PAGE_TRKOWN)
1370 	pg->owner_tag = NULL;
1371 #endif
1372 	UVM_PAGE_OWN(pg, "new alloc");
1373 
1374 	if (flags & UVM_PGA_ZERO) {
1375 		/*
1376 		 * A zero'd page is not clean.  If we got a page not already
1377 		 * zero'd, then we have to zero it ourselves.
1378 		 */
1379 		if (obj != NULL || anon != NULL) {
1380 			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1381 		}
1382 		if (zeroit) {
1383 			pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1384 		}
1385 	}
1386 
1387 	return(pg);
1388 }
1389 
1390 /*
1391  * uvm_pagereplace: replace a page with another
1392  *
1393  * => object must be locked
1394  * => page interlocks must be held
1395  */
1396 
1397 void
1398 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1399 {
1400 	struct uvm_object *uobj = oldpg->uobject;
1401 	struct vm_page *pg __diagused;
1402 	uint64_t idx;
1403 
1404 	KASSERT((oldpg->flags & PG_TABLED) != 0);
1405 	KASSERT(uobj != NULL);
1406 	KASSERT((newpg->flags & PG_TABLED) == 0);
1407 	KASSERT(newpg->uobject == NULL);
1408 	KASSERT(rw_write_held(uobj->vmobjlock));
1409 	KASSERT(mutex_owned(&oldpg->interlock));
1410 	KASSERT(mutex_owned(&newpg->interlock));
1411 
1412 	newpg->uobject = uobj;
1413 	newpg->offset = oldpg->offset;
1414 	idx = newpg->offset >> PAGE_SHIFT;
1415 	pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1416 	KASSERT(pg == oldpg);
1417 	if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1418 		if ((newpg->flags & PG_CLEAN) != 0) {
1419 			radix_tree_clear_tag(&uobj->uo_pages, idx,
1420 			    UVM_PAGE_DIRTY_TAG);
1421 		} else {
1422 			radix_tree_set_tag(&uobj->uo_pages, idx,
1423 			    UVM_PAGE_DIRTY_TAG);
1424 		}
1425 	}
1426 	/*
1427 	 * oldpg's PG_STAT is stable.  newpg is not reachable by others yet.
1428 	 */
1429 	newpg->flags =
1430 	    (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1431 	uvm_pageinsert_object(uobj, newpg);
1432 	uvm_pageremove_object(uobj, oldpg);
1433 }
1434 
1435 /*
1436  * uvm_pagerealloc: reallocate a page from one object to another
1437  *
1438  * => both objects must be locked
1439  * => both interlocks must be held
1440  */
1441 
1442 void
1443 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1444 {
1445 	/*
1446 	 * remove it from the old object
1447 	 */
1448 
1449 	if (pg->uobject) {
1450 		uvm_pageremove_tree(pg->uobject, pg);
1451 		uvm_pageremove_object(pg->uobject, pg);
1452 	}
1453 
1454 	/*
1455 	 * put it in the new object
1456 	 */
1457 
1458 	if (newobj) {
1459 		/*
1460 		 * XXX we have no in-tree users of this functionality
1461 		 */
1462 		panic("uvm_pagerealloc: no impl");
1463 	}
1464 }
1465 
1466 #ifdef DEBUG
1467 /*
1468  * check if page is zero-filled
1469  */
1470 void
1471 uvm_pagezerocheck(struct vm_page *pg)
1472 {
1473 	int *p, *ep;
1474 
1475 	KASSERT(uvm_zerocheckkva != 0);
1476 
1477 	/*
1478 	 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1479 	 * uvm page allocator.
1480 	 *
1481 	 * it might be better to have "CPU-local temporary map" pmap interface.
1482 	 */
1483 	mutex_spin_enter(&uvm_zerochecklock);
1484 	pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1485 	p = (int *)uvm_zerocheckkva;
1486 	ep = (int *)((char *)p + PAGE_SIZE);
1487 	pmap_update(pmap_kernel());
1488 	while (p < ep) {
1489 		if (*p != 0)
1490 			panic("PG_ZERO page isn't zero-filled");
1491 		p++;
1492 	}
1493 	pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1494 	mutex_spin_exit(&uvm_zerochecklock);
1495 	/*
1496 	 * pmap_update() is not necessary here because no one except us
1497 	 * uses this VA.
1498 	 */
1499 }
1500 #endif /* DEBUG */
1501 
1502 /*
1503  * uvm_pagefree: free page
1504  *
1505  * => erase page's identity (i.e. remove from object)
1506  * => put page on free list
1507  * => caller must lock owning object (either anon or uvm_object)
1508  * => assumes all valid mappings of pg are gone
1509  */
1510 
1511 void
1512 uvm_pagefree(struct vm_page *pg)
1513 {
1514 	struct pgfreelist *pgfl;
1515 	struct pgflbucket *pgb;
1516 	struct uvm_cpu *ucpu;
1517 	kmutex_t *lock;
1518 	int bucket, s;
1519 	bool locked;
1520 
1521 #ifdef DEBUG
1522 	if (pg->uobject == (void *)0xdeadbeef &&
1523 	    pg->uanon == (void *)0xdeadbeef) {
1524 		panic("uvm_pagefree: freeing free page %p", pg);
1525 	}
1526 #endif /* DEBUG */
1527 
1528 	KASSERT((pg->flags & PG_PAGEOUT) == 0);
1529 	KASSERT(!(pg->flags & PG_FREE));
1530 	KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1531 	KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1532 		rw_write_held(pg->uanon->an_lock));
1533 
1534 	/*
1535 	 * remove the page from the object's tree before acquiring any page
1536 	 * interlocks: this can acquire locks to free radixtree nodes.
1537 	 */
1538 	if (pg->uobject != NULL) {
1539 		uvm_pageremove_tree(pg->uobject, pg);
1540 	}
1541 
1542 	/*
1543 	 * if the page is loaned, resolve the loan instead of freeing.
1544 	 */
1545 
1546 	if (pg->loan_count) {
1547 		KASSERT(pg->wire_count == 0);
1548 
1549 		/*
1550 		 * if the page is owned by an anon then we just want to
1551 		 * drop anon ownership.  the kernel will free the page when
1552 		 * it is done with it.  if the page is owned by an object,
1553 		 * remove it from the object and mark it dirty for the benefit
1554 		 * of possible anon owners.
1555 		 *
1556 		 * regardless of previous ownership, wakeup any waiters,
1557 		 * unbusy the page, and we're done.
1558 		 */
1559 
1560 		uvm_pagelock(pg);
1561 		locked = true;
1562 		if (pg->uobject != NULL) {
1563 			uvm_pageremove_object(pg->uobject, pg);
1564 			pg->flags &= ~(PG_FILE|PG_AOBJ);
1565 		} else if (pg->uanon != NULL) {
1566 			if ((pg->flags & PG_ANON) == 0) {
1567 				pg->loan_count--;
1568 			} else {
1569 				pg->flags &= ~PG_ANON;
1570 				cpu_count(CPU_COUNT_ANONPAGES, -1);
1571 			}
1572 			pg->uanon->an_page = NULL;
1573 			pg->uanon = NULL;
1574 		}
1575 		if (pg->pqflags & PQ_WANTED) {
1576 			wakeup(pg);
1577 		}
1578 		pg->pqflags &= ~PQ_WANTED;
1579 		pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1580 #ifdef UVM_PAGE_TRKOWN
1581 		pg->owner_tag = NULL;
1582 #endif
1583 		KASSERT((pg->flags & PG_STAT) == 0);
1584 		if (pg->loan_count) {
1585 			KASSERT(pg->uobject == NULL);
1586 			if (pg->uanon == NULL) {
1587 				uvm_pagedequeue(pg);
1588 			}
1589 			uvm_pageunlock(pg);
1590 			return;
1591 		}
1592 	} else if (pg->uobject != NULL || pg->uanon != NULL ||
1593 	           pg->wire_count != 0) {
1594 		uvm_pagelock(pg);
1595 		locked = true;
1596 	} else {
1597 		locked = false;
1598 	}
1599 
1600 	/*
1601 	 * remove page from its object or anon.
1602 	 */
1603 	if (pg->uobject != NULL) {
1604 		uvm_pageremove_object(pg->uobject, pg);
1605 	} else if (pg->uanon != NULL) {
1606 		const unsigned int status = uvm_pagegetdirty(pg);
1607 		pg->uanon->an_page = NULL;
1608 		pg->uanon = NULL;
1609 		kpreempt_disable();
1610 		CPU_COUNT(CPU_COUNT_ANONPAGES, -1);
1611 		CPU_COUNT(CPU_COUNT_ANONUNKNOWN + status, -1);
1612 		kpreempt_enable();
1613 	}
1614 
1615 	/*
1616 	 * if the page was wired, unwire it now.
1617 	 */
1618 
1619 	if (pg->wire_count) {
1620 		pg->wire_count = 0;
1621 		atomic_dec_uint(&uvmexp.wired);
1622 	}
1623 	if (locked) {
1624 		/*
1625 		 * wake anyone waiting on the page.
1626 		 */
1627 		if ((pg->pqflags & PQ_WANTED) != 0) {
1628 			pg->pqflags &= ~PQ_WANTED;
1629 			wakeup(pg);
1630 		}
1631 
1632 		/*
1633 		 * now remove the page from the queues.
1634 		 */
1635 		uvm_pagedequeue(pg);
1636 		uvm_pageunlock(pg);
1637 	} else {
1638 		KASSERT(!uvmpdpol_pageisqueued_p(pg));
1639 	}
1640 
1641 	/*
1642 	 * and put on free queue
1643 	 */
1644 
1645 #ifdef DEBUG
1646 	pg->uobject = (void *)0xdeadbeef;
1647 	pg->uanon = (void *)0xdeadbeef;
1648 	if (pg->flags & PG_ZERO)
1649 		uvm_pagezerocheck(pg);
1650 #endif /* DEBUG */
1651 
1652 	/* Try to send the page to the per-CPU cache. */
1653 	s = splvm();
1654 	if (pg->flags & PG_ZERO) {
1655 	    	CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
1656 	}
1657 	ucpu = curcpu()->ci_data.cpu_uvm;
1658 	bucket = uvm_page_get_bucket(pg);
1659 	if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1660 		splx(s);
1661 		return;
1662 	}
1663 
1664 	/* Didn't work.  Never mind, send it to a global bucket. */
1665 	pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1666 	pgb = pgfl->pgfl_buckets[bucket];
1667 	lock = &uvm_freelist_locks[bucket].lock;
1668 
1669 	mutex_spin_enter(lock);
1670 	/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1671 	pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
1672 	LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1673 	pgb->pgb_nfree++;
1674 	mutex_spin_exit(lock);
1675 	splx(s);
1676 }
1677 
1678 /*
1679  * uvm_page_unbusy: unbusy an array of pages.
1680  *
1681  * => pages must either all belong to the same object, or all belong to anons.
1682  * => if pages are object-owned, object must be locked.
1683  * => if pages are anon-owned, anons must be locked.
1684  * => caller must make sure that anon-owned pages are not PG_RELEASED.
1685  */
1686 
1687 void
1688 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1689 {
1690 	struct vm_page *pg;
1691 	int i;
1692 	UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);
1693 
1694 	for (i = 0; i < npgs; i++) {
1695 		pg = pgs[i];
1696 		if (pg == NULL || pg == PGO_DONTCARE) {
1697 			continue;
1698 		}
1699 
1700 		KASSERT(uvm_page_owner_locked_p(pg, true));
1701 		KASSERT(pg->flags & PG_BUSY);
1702 		KASSERT((pg->flags & PG_PAGEOUT) == 0);
1703 		if (pg->flags & PG_RELEASED) {
1704 			UVMHIST_LOG(ubchist, "releasing pg %#jx",
1705 			    (uintptr_t)pg, 0, 0, 0);
1706 			KASSERT(pg->uobject != NULL ||
1707 			    (pg->uanon != NULL && pg->uanon->an_ref > 0));
1708 			pg->flags &= ~PG_RELEASED;
1709 			uvm_pagefree(pg);
1710 		} else {
1711 			UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1712 			    (uintptr_t)pg, 0, 0, 0);
1713 			KASSERT((pg->flags & PG_FAKE) == 0);
1714 			pg->flags &= ~PG_BUSY;
1715 			uvm_pagelock(pg);
1716 			uvm_pagewakeup(pg);
1717 			uvm_pageunlock(pg);
1718 			UVM_PAGE_OWN(pg, NULL);
1719 		}
1720 	}
1721 }
1722 
1723 /*
1724  * uvm_pagewait: wait for a busy page
1725  *
1726  * => page must be known PG_BUSY
1727  * => object must be read or write locked
1728  * => object will be unlocked on return
1729  */
1730 
1731 void
1732 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1733 {
1734 
1735 	KASSERT(rw_lock_held(lock));
1736 	KASSERT((pg->flags & PG_BUSY) != 0);
1737 	KASSERT(uvm_page_owner_locked_p(pg, false));
1738 
1739 	mutex_enter(&pg->interlock);
1740 	rw_exit(lock);
1741 	pg->pqflags |= PQ_WANTED;
1742 	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1743 }
1744 
1745 /*
1746  * uvm_pagewakeup: wake anyone waiting on a page
1747  *
1748  * => page interlock must be held
1749  */
1750 
1751 void
1752 uvm_pagewakeup(struct vm_page *pg)
1753 {
1754 	UVMHIST_FUNC("uvm_pagewakeup"); UVMHIST_CALLED(ubchist);
1755 
1756 	KASSERT(mutex_owned(&pg->interlock));
1757 
1758 	UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1759 
1760 	if ((pg->pqflags & PQ_WANTED) != 0) {
1761 		wakeup(pg);
1762 		pg->pqflags &= ~PQ_WANTED;
1763 	}
1764 }
1765 
1766 #if defined(UVM_PAGE_TRKOWN)
1767 /*
1768  * uvm_page_own: set or release page ownership
1769  *
1770  * => this is a debugging function that keeps track of who sets PG_BUSY
1771  *	and where they do it.   it can be used to track down problems
1772  *	such as a process setting "PG_BUSY" and never releasing it.
1773  * => page's object [if any] must be locked
1774  * => if "tag" is NULL then we are releasing page ownership
1775  */
1776 void
1777 uvm_page_own(struct vm_page *pg, const char *tag)
1778 {
1779 
1780 	KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1781 	KASSERT(uvm_page_owner_locked_p(pg, true));
1782 
1783 	/* gain ownership? */
1784 	if (tag) {
1785 		KASSERT((pg->flags & PG_BUSY) != 0);
1786 		if (pg->owner_tag) {
1787 			printf("uvm_page_own: page %p already owned "
1788 			    "by proc %d [%s]\n", pg,
1789 			    pg->owner, pg->owner_tag);
1790 			panic("uvm_page_own");
1791 		}
1792 		pg->owner = curproc->p_pid;
1793 		pg->lowner = curlwp->l_lid;
1794 		pg->owner_tag = tag;
1795 		return;
1796 	}
1797 
1798 	/* drop ownership */
1799 	KASSERT((pg->flags & PG_BUSY) == 0);
1800 	if (pg->owner_tag == NULL) {
1801 		printf("uvm_page_own: dropping ownership of a non-owned "
1802 		    "page (%p)\n", pg);
1803 		panic("uvm_page_own");
1804 	}
1805 	pg->owner_tag = NULL;
1806 }
1807 #endif
1808 
1809 /*
1810  * uvm_pageidlezero: zero free pages while the system is idle.
1811  */
1812 void
1813 uvm_pageidlezero(void)
1814 {
1815 
1816 	/*
1817 	 * Disabled for the moment.  Previous strategy too cache heavy.  In
1818 	 * the future we may experiment with zeroing the pages held in the
1819 	 * per-CPU cache (uvm_pgflcache).
1820 	 */
1821 }
1822 
1823 /*
1824  * uvm_pagelookup: look up a page
1825  *
1826  * => caller should lock object to keep someone from pulling the page
1827  *	out from under it
1828  */
1829 
1830 struct vm_page *
1831 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1832 {
1833 	struct vm_page *pg;
1834 
1835 	/* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */
1836 
1837 	pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1838 
1839 	KASSERT(pg == NULL || obj->uo_npages != 0);
1840 	KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1841 		(pg->flags & PG_BUSY) != 0);
1842 	return pg;
1843 }
1844 
1845 /*
1846  * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1847  *
1848  * => caller must lock objects
1849  * => caller must hold pg->interlock
1850  */
1851 
1852 void
1853 uvm_pagewire(struct vm_page *pg)
1854 {
1855 
1856 	KASSERT(uvm_page_owner_locked_p(pg, true));
1857 	KASSERT(mutex_owned(&pg->interlock));
1858 #if defined(READAHEAD_STATS)
1859 	if ((pg->flags & PG_READAHEAD) != 0) {
1860 		uvm_ra_hit.ev_count++;
1861 		pg->flags &= ~PG_READAHEAD;
1862 	}
1863 #endif /* defined(READAHEAD_STATS) */
1864 	if (pg->wire_count == 0) {
1865 		uvm_pagedequeue(pg);
1866 		atomic_inc_uint(&uvmexp.wired);
1867 	}
1868 	pg->wire_count++;
1869 	KASSERT(pg->wire_count > 0);	/* detect wraparound */
1870 }
1871 
1872 /*
1873  * uvm_pageunwire: unwire the page.
1874  *
1875  * => activate if wire count goes to zero.
1876  * => caller must lock objects
1877  * => caller must hold pg->interlock
1878  */
1879 
1880 void
1881 uvm_pageunwire(struct vm_page *pg)
1882 {
1883 
1884 	KASSERT(uvm_page_owner_locked_p(pg, true));
1885 	KASSERT(pg->wire_count != 0);
1886 	KASSERT(!uvmpdpol_pageisqueued_p(pg));
1887 	KASSERT(mutex_owned(&pg->interlock));
1888 	pg->wire_count--;
1889 	if (pg->wire_count == 0) {
1890 		uvm_pageactivate(pg);
1891 		KASSERT(uvmexp.wired != 0);
1892 		atomic_dec_uint(&uvmexp.wired);
1893 	}
1894 }
1895 
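/*
 * Illustrative sketch (not part of the source): wiring a page for the
 * duration of an operation and unwiring it afterwards.  Both calls need
 * the owning object (or anon) write locked and the page interlock held;
 * the interlock is taken with uvm_pagelock()/uvm_pageunlock() so that any
 * pending replacement intent is realized on release.  "uobj" and "pg" are
 * hypothetical.
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	uvm_pagelock(pg);
 *	uvm_pagewire(pg);
 *	uvm_pageunlock(pg);
 *	rw_exit(uobj->vmobjlock);
 *
 *	... the page is now exempt from the pagedaemon ...
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	uvm_pagelock(pg);
 *	uvm_pageunwire(pg);
 *	uvm_pageunlock(pg);
 *	rw_exit(uobj->vmobjlock);
 */
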
1896 /*
1897  * uvm_pagedeactivate: deactivate page
1898  *
1899  * => caller must lock objects
1900  * => caller must check to make sure page is not wired
1901  * => object that page belongs to must be locked (so we can adjust pg->flags)
1902  * => caller must clear the reference on the page before calling
1903  * => caller must hold pg->interlock
1904  */
1905 
1906 void
1907 uvm_pagedeactivate(struct vm_page *pg)
1908 {
1909 
1910 	KASSERT(uvm_page_owner_locked_p(pg, false));
1911 	KASSERT(mutex_owned(&pg->interlock));
1912 	if (pg->wire_count == 0) {
1913 		KASSERT(uvmpdpol_pageisqueued_p(pg));
1914 		uvmpdpol_pagedeactivate(pg);
1915 	}
1916 }
1917 
1918 /*
1919  * uvm_pageactivate: activate page
1920  *
1921  * => caller must lock objects
1922  * => caller must hold pg->interlock
1923  */
1924 
1925 void
1926 uvm_pageactivate(struct vm_page *pg)
1927 {
1928 
1929 	KASSERT(uvm_page_owner_locked_p(pg, false));
1930 	KASSERT(mutex_owned(&pg->interlock));
1931 #if defined(READAHEAD_STATS)
1932 	if ((pg->flags & PG_READAHEAD) != 0) {
1933 		uvm_ra_hit.ev_count++;
1934 		pg->flags &= ~PG_READAHEAD;
1935 	}
1936 #endif /* defined(READAHEAD_STATS) */
1937 	if (pg->wire_count == 0) {
1938 		uvmpdpol_pageactivate(pg);
1939 	}
1940 }
1941 
1942 /*
1943  * uvm_pagedequeue: remove a page from any paging queue
1944  *
1945  * => caller must lock objects
1946  * => caller must hold pg->interlock
1947  */
1948 void
1949 uvm_pagedequeue(struct vm_page *pg)
1950 {
1951 
1952 	KASSERT(uvm_page_owner_locked_p(pg, true));
1953 	KASSERT(mutex_owned(&pg->interlock));
1954 	if (uvmpdpol_pageisqueued_p(pg)) {
1955 		uvmpdpol_pagedequeue(pg);
1956 	}
1957 }
1958 
1959 /*
1960  * uvm_pageenqueue: add a page to a paging queue without activating.
1961  * Used where a page is not really demanded (yet), e.g. read-ahead.
1962  *
1963  * => caller must lock objects
1964  * => caller must hold pg->interlock
1965  */
1966 void
1967 uvm_pageenqueue(struct vm_page *pg)
1968 {
1969 
1970 	KASSERT(uvm_page_owner_locked_p(pg, false));
1971 	KASSERT(mutex_owned(&pg->interlock));
1972 	if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1973 		uvmpdpol_pageenqueue(pg);
1974 	}
1975 }
1976 
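/*
 * Illustrative sketch (not part of the source): how a read-ahead style
 * caller might hand a page to the pagedaemon without marking it active.
 * uvm_pageenqueue() needs the object/anon lock (shared is enough) plus
 * the page interlock, and is a no-op for wired or already queued pages.
 * "pg" is hypothetical.
 *
 *	uvm_pagelock(pg);
 *	uvm_pageenqueue(pg);
 *	uvm_pageunlock(pg);
 *
 * Once the page is actually referenced, uvm_pageactivate() would be used
 * in the same way.
 */
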
1977 /*
1978  * uvm_pagelock: acquire page interlock
1979  */
1980 void
1981 uvm_pagelock(struct vm_page *pg)
1982 {
1983 
1984 	mutex_enter(&pg->interlock);
1985 }
1986 
1987 /*
1988  * uvm_pagelock2: acquire two page interlocks
1989  */
1990 void
1991 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1992 {
1993 
1994 	if (pg1 < pg2) {
1995 		mutex_enter(&pg1->interlock);
1996 		mutex_enter(&pg2->interlock);
1997 	} else {
1998 		mutex_enter(&pg2->interlock);
1999 		mutex_enter(&pg1->interlock);
2000 	}
2001 }
2002 
2003 /*
2004  * uvm_pageunlock: release page interlock, and if a page replacement intent
2005  * is set on the page, pass it to uvmpdpol to make real.
2006  *
2007  * => caller must hold pg->interlock
2008  */
2009 void
2010 uvm_pageunlock(struct vm_page *pg)
2011 {
2012 
2013 	if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
2014 	    (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
2015 	    	mutex_exit(&pg->interlock);
2016 	    	return;
2017 	}
2018 	pg->pqflags |= PQ_INTENT_QUEUED;
2019 	mutex_exit(&pg->interlock);
2020 	uvmpdpol_pagerealize(pg);
2021 }
2022 
2023 /*
2024  * uvm_pageunlock2: release two page interlocks, and for both pages if a
2025  * page replacement intent is set on the page, pass it to uvmpdpol to make
2026  * real.
2027  *
2028  * => caller must hold the interlock of both pages
2029  */
2030 void
2031 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
2032 {
2033 
2034 	if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
2035 	    (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
2036 	    	mutex_exit(&pg1->interlock);
2037 	    	pg1 = NULL;
2038 	} else {
2039 		pg1->pqflags |= PQ_INTENT_QUEUED;
2040 		mutex_exit(&pg1->interlock);
2041 	}
2042 
2043 	if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
2044 	    (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
2045 	    	mutex_exit(&pg2->interlock);
2046 	    	pg2 = NULL;
2047 	} else {
2048 		pg2->pqflags |= PQ_INTENT_QUEUED;
2049 		mutex_exit(&pg2->interlock);
2050 	}
2051 
2052 	if (pg1 != NULL) {
2053 		uvmpdpol_pagerealize(pg1);
2054 	}
2055 	if (pg2 != NULL) {
2056 		uvmpdpol_pagerealize(pg2);
2057 	}
2058 }
2059 
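/*
 * Illustrative sketch (not part of the source): updating the queue state
 * of two related pages (e.g. during loaning) while holding both
 * interlocks.  uvm_pagelock2() orders acquisition by address to avoid
 * deadlock against another thread locking the same pair, and
 * uvm_pageunlock2() realizes any replacement intent that was set while
 * the interlocks were held.  "opg" and "npg" are hypothetical.
 *
 *	uvm_pagelock2(opg, npg);
 *	... adjust pqflags, loan/wire counts, etc. on both pages ...
 *	uvm_pageunlock2(opg, npg);
 */
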
2060 /*
2061  * uvm_pagezero: zero fill a page
2062  *
2063  * => if page is part of an object then the object should be locked
2064  *	to protect pg->flags.
2065  */
2066 
2067 void
2068 uvm_pagezero(struct vm_page *pg)
2069 {
2070 
2071 	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2072 	pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2073 }
2074 
2075 /*
2076  * uvm_pagecopy: copy a page
2077  *
2078  * => if page is part of an object then the object should be locked
2079  *	to protect pg->flags.
2080  */
2081 
2082 void
2083 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2084 {
2085 
2086 	uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2087 	pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2088 }
2089 
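/*
 * Illustrative sketch (not part of the source): the usual sequence of
 * allocating a fresh page and copying an existing one into it, e.g. to
 * break a copy-on-write share.  "uobj", "off" and "opg" are hypothetical,
 * and the relevant object/anon locks are assumed to be held.  Note that
 * uvm_pagecopy() marks the destination dirty itself, so no separate
 * uvm_pagemarkdirty() call is needed.
 *
 *	struct vm_page *npg;
 *
 *	npg = uvm_pagealloc(uobj, off, NULL, 0);
 *	if (npg != NULL) {
 *		uvm_pagecopy(opg, npg);
 *		... install npg in place of opg ...
 *	}
 */
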
2090 /*
2091  * uvm_pageismanaged: test to see if a page (specified by PA) is managed.
2092  */
2093 
2094 bool
2095 uvm_pageismanaged(paddr_t pa)
2096 {
2097 
2098 	return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2099 }
2100 
2101 /*
2102  * uvm_page_lookup_freelist: look up the free list for the specified page
2103  */
2104 
2105 int
2106 uvm_page_lookup_freelist(struct vm_page *pg)
2107 {
2108 	uvm_physseg_t upm;
2109 
2110 	upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2111 	KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2112 	return uvm_physseg_get_free_list(upm);
2113 }
2114 
2115 /*
2116  * uvm_page_owner_locked_p: return true if object associated with page is
2117  * locked.  this is a weak check for runtime assertions only.
2118  */
2119 
2120 bool
2121 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2122 {
2123 
2124 	if (pg->uobject != NULL) {
2125 		return exclusive
2126 		    ? rw_write_held(pg->uobject->vmobjlock)
2127 		    : rw_lock_held(pg->uobject->vmobjlock);
2128 	}
2129 	if (pg->uanon != NULL) {
2130 		return exclusive
2131 		    ? rw_write_held(pg->uanon->an_lock)
2132 		    : rw_lock_held(pg->uanon->an_lock);
2133 	}
2134 	return true;
2135 }
2136 
2137 /*
2138  * uvm_pagereadonly_p: return true if the page should be mapped read-only
2139  */
2140 
2141 bool
2142 uvm_pagereadonly_p(struct vm_page *pg)
2143 {
2144 	struct uvm_object * const uobj = pg->uobject;
2145 
2146 	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2147 	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2148 	if ((pg->flags & PG_RDONLY) != 0) {
2149 		return true;
2150 	}
2151 	if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2152 		return true;
2153 	}
2154 	if (uobj == NULL) {
2155 		return false;
2156 	}
2157 	return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2158 }
2159 
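/*
 * Illustrative sketch (not part of the source): these predicates are meant
 * for assertions and for choosing mapping protections, e.g. when entering
 * a mapping in a fault handler.  "pmap", "va" and "flags" are hypothetical
 * stand-ins for the caller's state.
 *
 *	vm_prot_t prot = VM_PROT_READ | VM_PROT_WRITE;
 *
 *	KASSERT(uvm_page_owner_locked_p(pg, false));
 *	if (uvm_pagereadonly_p(pg))
 *		prot &= ~VM_PROT_WRITE;
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags);
 */
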
2160 #ifdef PMAP_DIRECT
2161 /*
2162  * Call pmap to translate a physical address into a virtual address and to
2163  * run a callback for it.  Used to avoid actually mapping the pages; the
2164  * pmap most likely uses a direct map or equivalent.
2165  */
2166 int
2167 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2168             int (*process)(void *, size_t, void *), void *arg)
2169 {
2170 	int error = 0;
2171 	paddr_t pa;
2172 	size_t todo;
2173 	voff_t pgoff = (off & PAGE_MASK);
2174 	struct vm_page *pg;
2175 
2176 	KASSERT(npages > 0 && len > 0);
2177 
2178 	for (int i = 0; i < npages; i++) {
2179 		pg = pgs[i];
2180 
2181 		KASSERT(len > 0);
2182 
2183 		/*
2184 		 * Caller is responsible for ensuring all the pages are
2185 		 * available.
2186 		 */
2187 		KASSERT(pg != NULL && pg != PGO_DONTCARE);
2188 
2189 		pa = VM_PAGE_TO_PHYS(pg);
2190 		todo = MIN(len, PAGE_SIZE - pgoff);
2191 
2192 		error = pmap_direct_process(pa, pgoff, todo, process, arg);
2193 		if (error)
2194 			break;
2195 
2196 		pgoff = 0;
2197 		len -= todo;
2198 	}
2199 
2200 	KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2201 	return error;
2202 }
2203 #endif /* PMAP_DIRECT */
2204 
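/*
 * Illustrative sketch (not part of the source): a hypothetical callback
 * for uvm_direct_process() that zeroes the requested range.  For each
 * page-sized (or smaller) chunk the callback receives a direct-mapped
 * kernel virtual address and a length; a non-zero return aborts the walk
 * and is handed back to the caller of uvm_direct_process().
 *
 *	static int
 *	zero_cb(void *va, size_t len, void *arg)
 *	{
 *
 *		memset(va, 0, len);
 *		return 0;
 *	}
 *
 *	error = uvm_direct_process(pgs, npages, off, len, zero_cb, NULL);
 */
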
2205 #if defined(DDB) || defined(DEBUGPRINT)
2206 
2207 /*
2208  * uvm_page_printit: actually print the page
2209  */
2210 
2211 static const char page_flagbits[] = UVM_PGFLAGBITS;
2212 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2213 
2214 void
2215 uvm_page_printit(struct vm_page *pg, bool full,
2216     void (*pr)(const char *, ...))
2217 {
2218 	struct vm_page *tpg;
2219 	struct uvm_object *uobj;
2220 	struct pgflbucket *pgb;
2221 	struct pgflist *pgl;
2222 	char pgbuf[128];
2223 
2224 	(*pr)("PAGE %p:\n", pg);
2225 	snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2226 	(*pr)("  flags=%s\n", pgbuf);
2227 	snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2228 	(*pr)("  pqflags=%s\n", pgbuf);
2229 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
2230 	    pg->uobject, pg->uanon, (long long)pg->offset);
2231 	(*pr)("  loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2232 	    pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2233 	    uvm_page_get_freelist(pg));
2234 	(*pr)("  pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2235 #if defined(UVM_PAGE_TRKOWN)
2236 	if (pg->flags & PG_BUSY)
2237 		(*pr)("  owning process = %d, tag=%s\n",
2238 		    pg->owner, pg->owner_tag);
2239 	else
2240 		(*pr)("  page not busy, no owner\n");
2241 #else
2242 	(*pr)("  [page ownership tracking disabled]\n");
2243 #endif
2244 
2245 	if (!full)
2246 		return;
2247 
2248 	/* cross-verify object/anon */
2249 	if ((pg->flags & PG_FREE) == 0) {
2250 		if (pg->flags & PG_ANON) {
2251 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
2252 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2253 				(pg->uanon) ? pg->uanon->an_page : NULL);
2254 			else
2255 				(*pr)("  anon backpointer is OK\n");
2256 		} else {
2257 			uobj = pg->uobject;
2258 			if (uobj) {
2259 				(*pr)("  checking object list\n");
2260 				tpg = uvm_pagelookup(uobj, pg->offset);
2261 				if (tpg)
2262 					(*pr)("  page found on object list\n");
2263 				else
2264 			(*pr)("  >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2265 			}
2266 		}
2267 	}
2268 
2269 	/* cross-verify page queue */
2270 	if (pg->flags & PG_FREE) {
2271 		int fl = uvm_page_get_freelist(pg);
2272 		int b = uvm_page_get_bucket(pg);
2273 		pgb = uvm.page_free[fl].pgfl_buckets[b];
2274 		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2275 		(*pr)("  checking pageq list\n");
2276 		LIST_FOREACH(tpg, pgl, pageq.list) {
2277 			if (tpg == pg) {
2278 				break;
2279 			}
2280 		}
2281 		if (tpg)
2282 			(*pr)("  page found on pageq list\n");
2283 		else
2284 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2285 	}
2286 }
2287 
2288 /*
2289  * uvm_page_printall - print a summary of all managed pages
2290  */
2291 
2292 void
2293 uvm_page_printall(void (*pr)(const char *, ...))
2294 {
2295 	uvm_physseg_t i;
2296 	paddr_t pfn;
2297 	struct vm_page *pg;
2298 
2299 	(*pr)("%18s %4s %4s %18s %18s"
2300 #ifdef UVM_PAGE_TRKOWN
2301 	    " OWNER"
2302 #endif
2303 	    "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2304 	for (i = uvm_physseg_get_first();
2305 	     uvm_physseg_valid_p(i);
2306 	     i = uvm_physseg_get_next(i)) {
2307 		for (pfn = uvm_physseg_get_start(i);
2308 		     pfn < uvm_physseg_get_end(i);
2309 		     pfn++) {
2310 			pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2311 
2312 			(*pr)("%18p %04x %08x %18p %18p",
2313 			    pg, pg->flags, pg->pqflags, pg->uobject,
2314 			    pg->uanon);
2315 #ifdef UVM_PAGE_TRKOWN
2316 			if (pg->flags & PG_BUSY)
2317 				(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2318 #endif
2319 			(*pr)("\n");
2320 		}
2321 	}
2322 }
2323 
2324 /*
2325  * uvm_page_print_freelists - print a summary of the freelists
2326  */
2327 
2328 void
2329 uvm_page_print_freelists(void (*pr)(const char *, ...))
2330 {
2331 	struct pgfreelist *pgfl;
2332 	struct pgflbucket *pgb;
2333 	int fl, b, c;
2334 
2335 	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2336 	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2337 
2338 	for (fl = 0; fl < VM_NFREELIST; fl++) {
2339 		pgfl = &uvm.page_free[fl];
2340 		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
2341 		for (b = 0; b < uvm.bucketcount; b++) {
2342 			pgb = uvm.page_free[fl].pgfl_buckets[b];
2343 			(*pr)("    bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2344 			    b, pgb, pgb->pgb_nfree,
2345 			    &uvm_freelist_locks[b].lock);
2346 			for (c = 0; c < uvmexp.ncolors; c++) {
2347 				(*pr)("        color(%d) @ %p, ", c,
2348 				    &pgb->pgb_colors[c]);
2349 				(*pr)("first page = %p\n",
2350 				    LIST_FIRST(&pgb->pgb_colors[c]));
2351 			}
2352 		}
2353 	}
2354 }
2355 
2356 #endif /* DDB || DEBUGPRINT */
2357