1 /*	$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1997 Charles D. Cranor and Washington University.
34  * Copyright (c) 1991, 1993, The Regents of the University of California.
35  *
36  * All rights reserved.
37  *
38  * This code is derived from software contributed to Berkeley by
39  * The Mach Operating System project at Carnegie-Mellon University.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  *	@(#)vm_page.c   8.3 (Berkeley) 3/21/94
66  * from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
67  *
68  *
69  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
70  * All rights reserved.
71  *
72  * Permission to use, copy, modify and distribute this software and
73  * its documentation is hereby granted, provided that both the copyright
74  * notice and this permission notice appear in all copies of the
75  * software, derivative works or modified versions, and any portions
76  * thereof, and that both notices appear in supporting documentation.
77  *
78  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
79  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
80  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
81  *
82  * Carnegie Mellon requests users of this software to return to
83  *
84  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
85  *  School of Computer Science
86  *  Carnegie Mellon University
87  *  Pittsburgh PA 15213-3890
88  *
89  * any improvements or extensions that they make and grant Carnegie the
90  * rights to redistribute these changes.
91  */
92 
93 /*
94  * uvm_page.c: page ops.
95  */
96 
97 #include <sys/cdefs.h>
98 __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.244 2020/07/09 05:57:15 skrll Exp $");
99 
100 #include "opt_ddb.h"
101 #include "opt_uvm.h"
102 #include "opt_uvmhist.h"
103 #include "opt_readahead.h"
104 
105 #include <sys/param.h>
106 #include <sys/systm.h>
107 #include <sys/sched.h>
108 #include <sys/kernel.h>
109 #include <sys/vnode.h>
110 #include <sys/proc.h>
111 #include <sys/radixtree.h>
112 #include <sys/atomic.h>
113 #include <sys/cpu.h>
114 
115 #include <uvm/uvm.h>
116 #include <uvm/uvm_ddb.h>
117 #include <uvm/uvm_pdpolicy.h>
118 #include <uvm/uvm_pgflcache.h>
119 
120 /*
121  * number of pages per-CPU to reserve for the kernel.
122  */
123 #ifndef	UVM_RESERVED_PAGES_PER_CPU
124 #define	UVM_RESERVED_PAGES_PER_CPU	5
125 #endif
126 int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
127 
128 /*
129  * physical memory size.
130  */
131 psize_t physmem;
132 
133 /*
134  * local variables
135  */
136 
137 /*
138  * these variables record the kernel virtual address range reported by
139  * pmap_virtual_space(), for debugging purposes.  The implementation of
140  * uvm_pageboot_alloc here also uses them internally.
141  */
142 
143 static vaddr_t      virtual_space_start;
144 static vaddr_t      virtual_space_end;
145 
146 /*
147  * we allocate an initial number of page colors in uvm_page_init(),
148  * and remember them.  We may re-color pages as cache sizes are
149  * discovered during the autoconfiguration phase.  But we can never
150  * free the initial set of buckets, since they are allocated using
151  * uvm_pageboot_alloc().
152  */
153 
154 static size_t recolored_pages_memsize /* = 0 */;
155 static char *recolored_pages_mem;
156 
157 /*
158  * freelist locks - one per bucket.
159  */
160 
161 union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS]
162     __cacheline_aligned;
163 
164 /*
165  * basic NUMA information.
166  */
167 
168 static struct uvm_page_numa_region {
169 	struct uvm_page_numa_region	*next;
170 	paddr_t				start;
171 	paddr_t				size;
172 	u_int				numa_id;
173 } *uvm_page_numa_region;
174 
175 #ifdef DEBUG
176 kmutex_t uvm_zerochecklock __cacheline_aligned;
177 vaddr_t uvm_zerocheckkva;
178 #endif /* DEBUG */
179 
180 /*
181  * These functions are reserved for uvm(9) internal use and are not
182  * exported in the header file uvm_physseg.h.
183  *
184  * Thus they are re-declared here.
185  */
186 void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
187 void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
188 
189 /* returns a pgs array */
190 struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
191 
192 /*
193  * inline functions
194  */
195 
196 /*
197  * uvm_pageinsert: insert a page in the object.
198  *
199  * => caller must lock object
200  * => caller should have already set pg's object and offset pointers
201  *    and bumped the version counter
202  */
203 
204 static inline void
205 uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
206 {
207 
208 	KASSERT(uobj == pg->uobject);
209 	KASSERT(rw_write_held(uobj->vmobjlock));
210 	KASSERT((pg->flags & PG_TABLED) == 0);
211 
212 	if ((pg->flags & PG_STAT) != 0) {
213 		/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
214 		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
215 
216 		if ((pg->flags & PG_FILE) != 0) {
217 			if (uobj->uo_npages == 0) {
218 				struct vnode *vp = (struct vnode *)uobj;
219 				mutex_enter(vp->v_interlock);
220 				KASSERT((vp->v_iflag & VI_PAGES) == 0);
221 				vp->v_iflag |= VI_PAGES;
222 				vholdl(vp);
223 				mutex_exit(vp->v_interlock);
224 			}
225 			if (UVM_OBJ_IS_VTEXT(uobj)) {
226 				cpu_count(CPU_COUNT_EXECPAGES, 1);
227 			}
228 			cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
229 		} else {
230 			cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
231 		}
232 	}
233 	pg->flags |= PG_TABLED;
234 	uobj->uo_npages++;
235 }
236 
237 static inline int
238 uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
239 {
240 	const uint64_t idx = pg->offset >> PAGE_SHIFT;
241 	int error;
242 
243 	error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
244 	if (error != 0) {
245 		return error;
246 	}
247 	if ((pg->flags & PG_CLEAN) == 0) {
248 		radix_tree_set_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG);
249 	}
250 	KASSERT(((pg->flags & PG_CLEAN) == 0) ==
251 	    radix_tree_get_tag(&uobj->uo_pages, idx, UVM_PAGE_DIRTY_TAG));
252 	return 0;
253 }
254 
255 /*
256  * uvm_page_remove: remove page from object.
257  *
258  * => caller must lock object
259  */
260 
261 static inline void
262 uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
263 {
264 
265 	KASSERT(uobj == pg->uobject);
266 	KASSERT(rw_write_held(uobj->vmobjlock));
267 	KASSERT(pg->flags & PG_TABLED);
268 
269 	if ((pg->flags & PG_STAT) != 0) {
270 		/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
271 		const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
272 
273 		if ((pg->flags & PG_FILE) != 0) {
274 			if (uobj->uo_npages == 1) {
275 				struct vnode *vp = (struct vnode *)uobj;
276 				mutex_enter(vp->v_interlock);
277 				KASSERT((vp->v_iflag & VI_PAGES) != 0);
278 				vp->v_iflag &= ~VI_PAGES;
279 				holdrelel(vp);
280 				mutex_exit(vp->v_interlock);
281 			}
282 			if (UVM_OBJ_IS_VTEXT(uobj)) {
283 				cpu_count(CPU_COUNT_EXECPAGES, -1);
284 			}
285 			cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
286 		} else {
287 			cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
288 		}
289 	}
290 	uobj->uo_npages--;
291 	pg->flags &= ~PG_TABLED;
292 	pg->uobject = NULL;
293 }
294 
295 static inline void
296 uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
297 {
298 	struct vm_page *opg __unused;
299 
300 	opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
301 	KASSERT(pg == opg);
302 }
303 
304 static void
305 uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
306 {
307 	int i;
308 
309 	pgb->pgb_nfree = 0;
310 	for (i = 0; i < uvmexp.ncolors; i++) {
311 		LIST_INIT(&pgb->pgb_colors[i]);
312 	}
313 	pgfl->pgfl_buckets[num] = pgb;
314 }
315 
316 /*
317  * uvm_page_init: init the page system.   called from uvm_init().
318  *
319  * => we return the range of kernel virtual memory in kvm_startp/kvm_endp
320  */
321 
322 void
323 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
324 {
325 	static struct uvm_cpu boot_cpu __cacheline_aligned;
326 	psize_t freepages, pagecount, bucketsize, n;
327 	struct pgflbucket *pgb;
328 	struct vm_page *pagearray;
329 	char *bucketarray;
330 	uvm_physseg_t bank;
331 	int fl, b;
332 
333 	KASSERT(ncpu <= 1);
334 
335 	/*
336 	 * init the page queues and free page queue locks, except the
337 	 * free list; we allocate that later (with the initial vm_page
338 	 * structures).
339 	 */
340 
341 	curcpu()->ci_data.cpu_uvm = &boot_cpu;
342 	uvmpdpol_init();
343 	for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
344 		mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
345 	}
346 
347 	/*
348 	 * allocate vm_page structures.
349 	 */
350 
351 	/*
352 	 * sanity check:
353 	 * before calling this function the MD code is expected to register
354 	 * some free RAM with the uvm_page_physload() function.   our job
355 	 * now is to allocate vm_page structures for this memory.
356 	 */
357 
358 	if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
359 		panic("uvm_page_bootstrap: no memory pre-allocated");
360 
361 	/*
362 	 * first calculate the number of free pages...
363 	 *
364 	 * note that we use start/end rather than avail_start/avail_end.
365 	 * this allows us to allocate extra vm_page structures in case we
366 	 * want to return some memory to the pool after booting.
367 	 */
368 
369 	freepages = 0;
370 
371 	for (bank = uvm_physseg_get_first();
372 	     uvm_physseg_valid_p(bank) ;
373 	     bank = uvm_physseg_get_next(bank)) {
374 		freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
375 	}
376 
377 	/*
378 	 * Let MD code initialize the number of colors, or default
379 	 * to 1 color if MD code doesn't care.
380 	 */
381 	if (uvmexp.ncolors == 0)
382 		uvmexp.ncolors = 1;
383 	uvmexp.colormask = uvmexp.ncolors - 1;
384 	KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
385 
386 	/* We always start with only 1 bucket. */
387 	uvm.bucketcount = 1;
388 
389 	/*
390 	 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
391 	 * use.   for each page of memory we use we need a vm_page structure.
392 	 * thus, the total number of pages we can use is the total size of
393 	 * the memory divided by the PAGE_SIZE plus the size of the vm_page
394 	 * structure.   we add one to freepages as a fudge factor to avoid
395 	 * truncation errors (since we can only allocate in terms of whole
396 	 * pages).
397 	 */
398 	pagecount = ((freepages + 1) << PAGE_SHIFT) /
399 	    (PAGE_SIZE + sizeof(struct vm_page));
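	/*
	 * Worked example (illustrative figures only, added for clarity):
	 * with 4 KiB pages and a vm_page of roughly 128 bytes, each usable
	 * page costs PAGE_SIZE + 128 bytes of the boot memory, so pagecount
	 * comes out at about 97% of freepages; the remainder is consumed by
	 * the vm_page array itself.
	 */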
400 	bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
401 	bucketsize = roundup2(bucketsize, coherency_unit);
402 	bucketarray = (void *)uvm_pageboot_alloc(
403 	    bucketsize * VM_NFREELIST +
404 	    pagecount * sizeof(struct vm_page));
405 	pagearray = (struct vm_page *)
406 	    (bucketarray + bucketsize * VM_NFREELIST);
407 
408 	for (fl = 0; fl < VM_NFREELIST; fl++) {
409 		pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
410 		uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
411 	}
412 	memset(pagearray, 0, pagecount * sizeof(struct vm_page));
413 
414 	/*
415 	 * init the freelist cache in the disabled state.
416 	 */
417 	uvm_pgflcache_init();
418 
419 	/*
420 	 * init the vm_page structures and put them in the correct place.
421 	 */
422 	/* First init the extent */
423 
424 	for (bank = uvm_physseg_get_first(),
425 		 uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
426 	     uvm_physseg_valid_p(bank);
427 	     bank = uvm_physseg_get_next(bank)) {
428 
429 		n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
430 		uvm_physseg_seg_alloc_from_slab(bank, n);
431 		uvm_physseg_init_seg(bank, pagearray);
432 
433 		/* set up page array pointers */
434 		pagearray += n;
435 		pagecount -= n;
436 	}
437 
438 	/*
439 	 * pass up the values of virtual_space_start and
440 	 * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
441 	 * layers of the VM.
442 	 */
443 
444 	*kvm_startp = round_page(virtual_space_start);
445 	*kvm_endp = trunc_page(virtual_space_end);
446 #ifdef DEBUG
447 	/*
448 	 * steal kva for uvm_pagezerocheck().
449 	 */
450 	uvm_zerocheckkva = *kvm_startp;
451 	*kvm_startp += PAGE_SIZE;
452 	mutex_init(&uvm_zerochecklock, MUTEX_DEFAULT, IPL_VM);
453 #endif /* DEBUG */
454 
455 	/*
456 	 * init various thresholds.
457 	 */
458 
459 	uvmexp.reserve_pagedaemon = 1;
460 	uvmexp.reserve_kernel = vm_page_reserve_kernel;
461 
462 	/*
463 	 * done!
464 	 */
465 
466 	uvm.page_init_done = true;
467 }
468 
469 /*
470  * uvm_pgfl_lock: lock all freelist buckets
471  */
472 
473 void
474 uvm_pgfl_lock(void)
475 {
476 	int i;
477 
478 	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
479 		mutex_spin_enter(&uvm_freelist_locks[i].lock);
480 	}
481 }
482 
483 /*
484  * uvm_pgfl_unlock: unlock all freelist buckets
485  */
486 
487 void
488 uvm_pgfl_unlock(void)
489 {
490 	int i;
491 
492 	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
493 		mutex_spin_exit(&uvm_freelist_locks[i].lock);
494 	}
495 }
496 
497 /*
498  * uvm_setpagesize: set the page size
499  *
500  * => sets uvmexp.pageshift and uvmexp.pagemask from uvmexp.pagesize.
501  */
502 
503 void
504 uvm_setpagesize(void)
505 {
506 
507 	/*
508 	 * If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
509 	 * to be a constant (indicated by being a non-zero value).
510 	 */
511 	if (uvmexp.pagesize == 0) {
512 		if (PAGE_SIZE == 0)
513 			panic("uvm_setpagesize: uvmexp.pagesize not set");
514 		uvmexp.pagesize = PAGE_SIZE;
515 	}
516 	uvmexp.pagemask = uvmexp.pagesize - 1;
517 	if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
518 		panic("uvm_setpagesize: page size %u (%#x) not a power of two",
519 		    uvmexp.pagesize, uvmexp.pagesize);
520 	for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
521 		if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
522 			break;
523 }
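/*
 * Example (added for illustration): with uvmexp.pagesize == 4096 the code
 * above yields uvmexp.pagemask == 0xfff and uvmexp.pageshift == 12.
 */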
524 
525 /*
526  * uvm_pageboot_alloc: steal memory from physmem for bootstrapping
527  */
528 
529 vaddr_t
530 uvm_pageboot_alloc(vsize_t size)
531 {
532 	static bool initialized = false;
533 	vaddr_t addr;
534 #if !defined(PMAP_STEAL_MEMORY)
535 	vaddr_t vaddr;
536 	paddr_t paddr;
537 #endif
538 
539 	/*
540 	 * on first call to this function, initialize ourselves.
541 	 */
542 	if (initialized == false) {
543 		pmap_virtual_space(&virtual_space_start, &virtual_space_end);
544 
545 		/* round it the way we like it */
546 		virtual_space_start = round_page(virtual_space_start);
547 		virtual_space_end = trunc_page(virtual_space_end);
548 
549 		initialized = true;
550 	}
551 
552 	/* round to page size */
553 	size = round_page(size);
554 	uvmexp.bootpages += atop(size);
555 
556 #if defined(PMAP_STEAL_MEMORY)
557 
558 	/*
559 	 * defer bootstrap allocation to MD code (it may want to allocate
560 	 * from a direct-mapped segment).  pmap_steal_memory should adjust
561 	 * virtual_space_start/virtual_space_end if necessary.
562 	 */
563 
564 	addr = pmap_steal_memory(size, &virtual_space_start,
565 	    &virtual_space_end);
566 
567 	return(addr);
568 
569 #else /* !PMAP_STEAL_MEMORY */
570 
571 	/*
572 	 * allocate virtual memory for this request
573 	 */
574 	if (virtual_space_start == virtual_space_end ||
575 	    (virtual_space_end - virtual_space_start) < size)
576 		panic("uvm_pageboot_alloc: out of virtual space");
577 
578 	addr = virtual_space_start;
579 
580 #ifdef PMAP_GROWKERNEL
581 	/*
582 	 * If the kernel pmap can't map the requested space,
583 	 * then allocate more resources for it.
584 	 */
585 	if (uvm_maxkaddr < (addr + size)) {
586 		uvm_maxkaddr = pmap_growkernel(addr + size);
587 		if (uvm_maxkaddr < (addr + size))
588 			panic("uvm_pageboot_alloc: pmap_growkernel() failed");
589 	}
590 #endif
591 
592 	virtual_space_start += size;
593 
594 	/*
595 	 * allocate and mapin physical pages to back new virtual pages
596 	 */
597 
598 	for (vaddr = round_page(addr) ; vaddr < addr + size ;
599 	    vaddr += PAGE_SIZE) {
600 
601 		if (!uvm_page_physget(&paddr))
602 			panic("uvm_pageboot_alloc: out of memory");
603 
604 		/*
605 		 * Note this memory is no longer managed, so using
606 		 * pmap_kenter is safe.
607 		 */
608 		pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
609 	}
610 	pmap_update(pmap_kernel());
611 	return(addr);
612 #endif	/* PMAP_STEAL_MEMORY */
613 }
614 
615 #if !defined(PMAP_STEAL_MEMORY)
616 /*
617  * uvm_page_physget: "steal" one page from the vm_physmem structure.
618  *
619  * => attempt to allocate it off the end of a segment in which the "avail"
620  *    values match the start/end values.   if we can't do that, then we
621  *    will advance both values (making them equal, and removing some
622  *    vm_page structures from the non-avail area).
623  * => return false if out of memory.
624  */
625 
626 /* subroutine: try to allocate from memory chunks on the specified freelist */
627 static bool uvm_page_physget_freelist(paddr_t *, int);
628 
629 static bool
630 uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
631 {
632 	uvm_physseg_t lcv;
633 
634 	/* pass 1: try allocating from a matching end */
635 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
636 	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
637 #else
638 	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
639 #endif
640 	{
641 		if (uvm.page_init_done == true)
642 			panic("uvm_page_physget: called _after_ bootstrap");
643 
644 		/* Try to match at front or back on unused segment */
645 		if (uvm_page_physunload(lcv, freelist, paddrp))
646 			return true;
647 	}
648 
649 	/* pass 2: forget about matching ends, just allocate something */
650 #if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
651 	for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
652 #else
653 	for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
654 #endif
655 	{
656 		/* Try the front regardless. */
657 		if (uvm_page_physunload_force(lcv, freelist, paddrp))
658 			return true;
659 	}
660 	return false;
661 }
662 
663 bool
664 uvm_page_physget(paddr_t *paddrp)
665 {
666 	int i;
667 
668 	/* try in the order of freelist preference */
669 	for (i = 0; i < VM_NFREELIST; i++)
670 		if (uvm_page_physget_freelist(paddrp, i) == true)
671 			return (true);
672 	return (false);
673 }
674 #endif /* PMAP_STEAL_MEMORY */
675 
676 /*
677  * PHYS_TO_VM_PAGE: find vm_page for a PA.   used by MI code to get vm_pages
678  * back from an I/O mapping (ugh!).   used in some MD code as well.
679  */
680 struct vm_page *
681 uvm_phys_to_vm_page(paddr_t pa)
682 {
683 	paddr_t pf = atop(pa);
684 	paddr_t	off;
685 	uvm_physseg_t	upm;
686 
687 	upm = uvm_physseg_find(pf, &off);
688 	if (upm != UVM_PHYSSEG_TYPE_INVALID)
689 		return uvm_physseg_get_pg(upm, off);
690 	return(NULL);
691 }
692 
693 paddr_t
694 uvm_vm_page_to_phys(const struct vm_page *pg)
695 {
696 
697 	return pg->phys_addr & ~(PAGE_SIZE - 1);
698 }
699 
700 /*
701  * uvm_page_numa_load: load NUMA range description.
702  */
703 void
704 uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
705 {
706 	struct uvm_page_numa_region *d;
707 
708 	KASSERT(numa_id < PGFL_MAX_BUCKETS);
709 
710 	d = kmem_alloc(sizeof(*d), KM_SLEEP);
711 	d->start = start;
712 	d->size = size;
713 	d->numa_id = numa_id;
714 	d->next = uvm_page_numa_region;
715 	uvm_page_numa_region = d;
716 }
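/*
 * Illustrative sketch (added, not part of the original file): MD code would
 * describe each memory range and its NUMA node during early boot, before
 * uvm_page_rebucket() runs, e.g. two 2 GB nodes:
 *
 *	uvm_page_numa_load(0x000000000, 0x80000000, 0);
 *	uvm_page_numa_load(0x080000000, 0x80000000, 1);
 */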
717 
718 /*
719  * uvm_page_numa_lookup: lookup NUMA node for the given page.
720  */
721 static u_int
722 uvm_page_numa_lookup(struct vm_page *pg)
723 {
724 	struct uvm_page_numa_region *d;
725 	static bool warned;
726 	paddr_t pa;
727 
728 	KASSERT(uvm_page_numa_region != NULL);
729 
730 	pa = VM_PAGE_TO_PHYS(pg);
731 	for (d = uvm_page_numa_region; d != NULL; d = d->next) {
732 		if (pa >= d->start && pa < d->start + d->size) {
733 			return d->numa_id;
734 		}
735 	}
736 
737 	if (!warned) {
738 		printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
739 		    PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
740 		warned = true;
741 	}
742 
743 	return 0;
744 }
745 
746 /*
747  * uvm_page_redim: adjust freelist dimensions if they have changed.
748  */
749 
750 static void
751 uvm_page_redim(int newncolors, int newnbuckets)
752 {
753 	struct pgfreelist npgfl;
754 	struct pgflbucket *opgb, *npgb;
755 	struct pgflist *ohead, *nhead;
756 	struct vm_page *pg;
757 	size_t bucketsize, bucketmemsize, oldbucketmemsize;
758 	int fl, ob, oc, nb, nc, obuckets, ocolors;
759 	char *bucketarray, *oldbucketmem, *bucketmem;
760 
761 	KASSERT(((newncolors - 1) & newncolors) == 0);
762 
763 	/* Anything to do? */
764 	if (newncolors <= uvmexp.ncolors &&
765 	    newnbuckets == uvm.bucketcount) {
766 		return;
767 	}
768 	if (uvm.page_init_done == false) {
769 		uvmexp.ncolors = newncolors;
770 		return;
771 	}
772 
773 	bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
774 	bucketsize = roundup2(bucketsize, coherency_unit);
775 	bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
776 	    coherency_unit - 1;
777 	bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
778 	bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
779 
780 	ocolors = uvmexp.ncolors;
781 	obuckets = uvm.bucketcount;
782 
783 	/* Freelist cache mustn't be enabled. */
784 	uvm_pgflcache_pause();
785 
786 	/* Make sure we should still do this. */
787 	uvm_pgfl_lock();
788 	if (newncolors <= uvmexp.ncolors &&
789 	    newnbuckets == uvm.bucketcount) {
790 		uvm_pgfl_unlock();
791 		uvm_pgflcache_resume();
792 		kmem_free(bucketmem, bucketmemsize);
793 		return;
794 	}
795 
796 	uvmexp.ncolors = newncolors;
797 	uvmexp.colormask = uvmexp.ncolors - 1;
798 	uvm.bucketcount = newnbuckets;
799 
800 	for (fl = 0; fl < VM_NFREELIST; fl++) {
801 		/* Init new buckets in new freelist. */
802 		memset(&npgfl, 0, sizeof(npgfl));
803 		for (nb = 0; nb < newnbuckets; nb++) {
804 			npgb = (struct pgflbucket *)bucketarray;
805 			uvm_page_init_bucket(&npgfl, npgb, nb);
806 			bucketarray += bucketsize;
807 		}
808 		/* Now transfer pages from the old freelist. */
809 		for (nb = ob = 0; ob < obuckets; ob++) {
810 			opgb = uvm.page_free[fl].pgfl_buckets[ob];
811 			for (oc = 0; oc < ocolors; oc++) {
812 				ohead = &opgb->pgb_colors[oc];
813 				while ((pg = LIST_FIRST(ohead)) != NULL) {
814 					LIST_REMOVE(pg, pageq.list);
815 					/*
816 					 * Here we decide on the NEW color &
817 					 * bucket for the page.  For NUMA
818 					 * we'll use the info that the
819 					 * hardware gave us.  For non-NUMA
820 					 * we take the physical page frame
821 					 * number and cache color into
822 					 * account.  We do this to try and
823 					 * avoid defeating any memory
824 					 * interleaving in the hardware.
825 					 */
826 					KASSERT(
827 					    uvm_page_get_bucket(pg) == ob);
828 					KASSERT(fl ==
829 					    uvm_page_get_freelist(pg));
830 					if (uvm_page_numa_region != NULL) {
831 						nb = uvm_page_numa_lookup(pg);
832 					} else {
833 						nb = atop(VM_PAGE_TO_PHYS(pg))
834 						    / uvmexp.ncolors / 8
835 						    % newnbuckets;
836 					}
837 					uvm_page_set_bucket(pg, nb);
838 					npgb = npgfl.pgfl_buckets[nb];
839 					npgb->pgb_nfree++;
840 					nc = VM_PGCOLOR(pg);
841 					nhead = &npgb->pgb_colors[nc];
842 					LIST_INSERT_HEAD(nhead, pg, pageq.list);
843 				}
844 			}
845 		}
846 		/* Install the new freelist. */
847 		memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
848 	}
849 
850 	/* Unlock and free the old memory. */
851 	oldbucketmemsize = recolored_pages_memsize;
852 	oldbucketmem = recolored_pages_mem;
853 	recolored_pages_memsize = bucketmemsize;
854 	recolored_pages_mem = bucketmem;
855 
856 	uvm_pgfl_unlock();
857 	uvm_pgflcache_resume();
858 
859 	if (oldbucketmemsize) {
860 		kmem_free(oldbucketmem, oldbucketmemsize);
861 	}
862 
863 	/*
864 	 * this calls uvm_km_alloc() which may want to hold
865 	 * uvm_freelist_lock.
866 	 */
867 	uvm_pager_realloc_emerg();
868 }
869 
870 /*
871  * uvm_page_recolor: Recolor the pages if the new color count is
872  * larger than the old one.
873  */
874 
875 void
876 uvm_page_recolor(int newncolors)
877 {
878 
879 	uvm_page_redim(newncolors, uvm.bucketcount);
880 }
881 
882 /*
883  * uvm_page_rebucket: Determine a bucket structure and redim the free
884  * lists to match.
885  */
886 
887 void
888 uvm_page_rebucket(void)
889 {
890 	u_int min_numa, max_numa, npackage, shift;
891 	struct cpu_info *ci, *ci2, *ci3;
892 	CPU_INFO_ITERATOR cii;
893 
894 	/*
895 	 * If we have more than one NUMA node, and the maximum NUMA node ID
896 	 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
897 	 * for free pages.
898 	 */
899 	min_numa = (u_int)-1;
900 	max_numa = 0;
901 	for (CPU_INFO_FOREACH(cii, ci)) {
902 		if (ci->ci_numa_id < min_numa) {
903 			min_numa = ci->ci_numa_id;
904 		}
905 		if (ci->ci_numa_id > max_numa) {
906 			max_numa = ci->ci_numa_id;
907 		}
908 	}
909 	if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
910 		aprint_debug("UVM: using NUMA allocation scheme\n");
911 		for (CPU_INFO_FOREACH(cii, ci)) {
912 			ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
913 		}
914 	 	uvm_page_redim(uvmexp.ncolors, max_numa + 1);
915 	 	return;
916 	}
917 
918 	/*
919 	 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
920 	 * and minimise lock contention.  Count the total number of CPU
921 	 * packages, and then try to distribute the buckets among CPU
922 	 * packages evenly.
923 	 */
924 	npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
925 
926 	/*
927 	 * Figure out how to arrange the packages & buckets, and the total
928 	 * number of buckets we need.  XXX 2 may not be the best factor.
929 	 */
930 	for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
931 		npackage >>= 1;
932 	}
933  	uvm_page_redim(uvmexp.ncolors, npackage);
934 
935  	/*
936  	 * Now tell each CPU which bucket to use.  In the outer loop, scroll
937  	 * through all CPU packages.
938  	 */
939  	npackage = 0;
940 	ci = curcpu();
941 	ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
942 	do {
943 		/*
944 		 * In the inner loop, scroll through all CPUs in the package
945 		 * and assign the same bucket ID.
946 		 */
947 		ci3 = ci2;
948 		do {
949 			ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
950 			ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
951 		} while (ci3 != ci2);
952 		npackage++;
953 		ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
954 	} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
955 
956 	aprint_debug("UVM: using package allocation scheme, "
957 	    "%d package(s) per bucket\n", 1 << shift);
958 }
959 
960 /*
961  * uvm_cpu_attach: initialize per-CPU data structures.
962  */
963 
964 void
965 uvm_cpu_attach(struct cpu_info *ci)
966 {
967 	struct uvm_cpu *ucpu;
968 
969 	/* Already done in uvm_page_init(). */
970 	if (!CPU_IS_PRIMARY(ci)) {
971 		/* Add more reserve pages for this CPU. */
972 		uvmexp.reserve_kernel += vm_page_reserve_kernel;
973 
974 		/* Allocate per-CPU data structures. */
975 		ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
976 		    KM_SLEEP);
977 		ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
978 		    coherency_unit);
979 		ci->ci_data.cpu_uvm = ucpu;
980 	} else {
981 		ucpu = ci->ci_data.cpu_uvm;
982 	}
983 
984 	uvmpdpol_init_cpu(ucpu);
985 
986 	/*
987 	 * Attach RNG source for this CPU's VM events
988 	 */
989         rnd_attach_source(&ucpu->rs, ci->ci_data.cpu_name, RND_TYPE_VM,
990 	    RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
991 	    RND_FLAG_ESTIMATE_VALUE);
992 }
993 
994 /*
995  * uvm_availmem: fetch the total amount of free memory in pages.  this can
996  * have a detrimental effect on performance due to false sharing; don't call
997  * unless needed.
998  *
999  * some users can request the amount of free memory so often that it begins
1000  * to impact upon performance.  if calling frequently and an inexact value
1001  * is okay, call with cached = true.
1002  */
1003 
1004 int
1005 uvm_availmem(bool cached)
1006 {
1007 	int64_t fp;
1008 
1009 	cpu_count_sync(cached);
1010 	if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
1011 		/*
1012 		 * XXXAD this can briefly go negative because it's impossible
1013 		 * to get a clean snapshot.  Address this for other counters
1014 		 * used as running totals before NetBSD 10, although it is
1015 		 * less important for those.
1016 		 */
1017 		fp = 0;
1018 	}
1019 	return (int)fp;
1020 }
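/*
 * Illustrative sketch (added, not part of the original file): callers that
 * only need a rough figure, e.g. for periodic reporting, can pass
 * cached = true; code making a paging decision wants the exact count:
 *
 *	if (uvm_availmem(false) < uvmexp.freetarg)
 *		uvm_kick_pdaemon();
 */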
1021 
1022 /*
1023  * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
1024  * specific freelist and specific bucket only.
1025  *
1026  * => must be at IPL_VM or higher to protect per-CPU data structures.
1027  */
1028 
1029 static struct vm_page *
1030 uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
1031 {
1032 	int c, trycolor, colormask;
1033 	struct pgflbucket *pgb;
1034 	struct vm_page *pg;
1035 	kmutex_t *lock;
1036 	bool fill;
1037 
1038 	/*
1039 	 * Skip the bucket if empty, no lock needed.  There could be many
1040 	 * empty freelists/buckets.
1041 	 */
1042 	pgb = uvm.page_free[f].pgfl_buckets[b];
1043 	if (pgb->pgb_nfree == 0) {
1044 		return NULL;
1045 	}
1046 
1047 	/* Skip bucket if low on memory. */
1048 	lock = &uvm_freelist_locks[b].lock;
1049 	mutex_spin_enter(lock);
1050 	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
1051 		if ((flags & UVM_PGA_USERESERVE) == 0 ||
1052 		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
1053 		     curlwp != uvm.pagedaemon_lwp)) {
1054 			mutex_spin_exit(lock);
1055 		     	return NULL;
1056 		}
1057 		fill = false;
1058 	} else {
1059 		fill = true;
1060 	}
1061 
1062 	/* Try all page colors as needed. */
1063 	c = trycolor = *trycolorp;
1064 	colormask = uvmexp.colormask;
1065 	do {
1066 		pg = LIST_FIRST(&pgb->pgb_colors[c]);
1067 		if (__predict_true(pg != NULL)) {
1068 			/*
1069 			 * Got a free page!  PG_FREE must be cleared under
1070 			 * lock because of uvm_pglistalloc().
1071 			 */
1072 			LIST_REMOVE(pg, pageq.list);
1073 			KASSERT(pg->flags == PG_FREE);
1074 			pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
1075 			pgb->pgb_nfree--;
1076 
1077 			/*
1078 			 * While we have the bucket locked and our data
1079 			 * structures fresh in L1 cache, we have an ideal
1080 			 * opportunity to grab some pages for the freelist
1081 			 * cache without causing extra contention.  Only do
1082 			 * so if we found pages in this CPU's preferred
1083 			 * bucket.
1084 			 */
1085 			if (__predict_true(b == ucpu->pgflbucket && fill)) {
1086 				uvm_pgflcache_fill(ucpu, f, b, c);
1087 			}
1088 			mutex_spin_exit(lock);
1089 			KASSERT(uvm_page_get_bucket(pg) == b);
1090 			CPU_COUNT(c == trycolor ?
1091 			    CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
1092 			CPU_COUNT(CPU_COUNT_CPUMISS, 1);
1093 			*trycolorp = c;
1094 			return pg;
1095 		}
1096 		c = (c + 1) & colormask;
1097 	} while (c != trycolor);
1098 	mutex_spin_exit(lock);
1099 
1100 	return NULL;
1101 }
1102 
1103 /*
1104  * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
1105  * any color from any bucket, in a specific freelist.
1106  *
1107  * => must be at IPL_VM or higher to protect per-CPU data structures.
1108  */
1109 
1110 static struct vm_page *
1111 uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
1112 {
1113 	int b, trybucket, bucketcount;
1114 	struct vm_page *pg;
1115 
1116 	/* Try for the exact thing in the per-CPU cache. */
1117 	if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
1118 		CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1119 		CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1120 		return pg;
1121 	}
1122 
1123 	/* Walk through all buckets, trying our preferred bucket first. */
1124 	trybucket = ucpu->pgflbucket;
1125 	b = trybucket;
1126 	bucketcount = uvm.bucketcount;
1127 	do {
1128 		pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
1129 		if (pg != NULL) {
1130 			return pg;
1131 		}
1132 		b = (b + 1 == bucketcount ? 0 : b + 1);
1133 	} while (b != trybucket);
1134 
1135 	return NULL;
1136 }
1137 
1138 /*
1139  * uvm_pagealloc_strat: allocate vm_page from a particular free list.
1140  *
1141  * => return null if no pages free
1142  * => wake up pagedaemon if number of free pages drops below low water mark
1143  * => if obj != NULL, obj must be locked (to put in obj's tree)
1144  * => if anon != NULL, anon must be locked (to put in anon)
1145  * => only one of obj or anon can be non-null
1146  * => caller must activate/deactivate page if it is not wired.
1147  * => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
1148  * => policy decision: it is more important to pull a page off of the
1149  *	appropriate priority free list than it is to get a page from the
1150  *	correct bucket or color bin.  This is because we live with the
1151  *	consequences of a bad free list decision for the entire
1152  *	lifetime of the page, e.g. if the page comes from memory that
1153  *	is slower to access.
1154  */
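/*
 * Illustrative sketch (added, not part of the original file): a typical
 * caller allocates through the uvm_pagealloc() wrapper with the object
 * locked, and sleeps for the pagedaemon if no page is available; "uobj"
 * and "off" here stand for the caller's object and offset.
 *
 *	struct vm_page *pg;
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO)) == NULL) {
 *		rw_exit(uobj->vmobjlock);
 *		uvm_wait("pgalloc");
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *	}
 *	...
 *	rw_exit(uobj->vmobjlock);
 */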
1155 
1156 struct vm_page *
1157 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
1158     int flags, int strat, int free_list)
1159 {
1160 	int color, lcv, error, s;
1161 	struct uvm_cpu *ucpu;
1162 	struct vm_page *pg;
1163 	lwp_t *l;
1164 
1165 	KASSERT(obj == NULL || anon == NULL);
1166 	KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
1167 	KASSERT(off == trunc_page(off));
1168 	KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
1169 	KASSERT(anon == NULL || anon->an_lock == NULL ||
1170 	    rw_write_held(anon->an_lock));
1171 
1172 	/*
1173 	 * This implements a global round-robin page coloring
1174 	 * algorithm.
1175 	 */
1176 
1177 	s = splvm();
1178 	ucpu = curcpu()->ci_data.cpu_uvm;
1179 	if (flags & UVM_FLAG_COLORMATCH) {
1180 		color = atop(off) & uvmexp.colormask;
1181 	} else {
1182 		color = ucpu->pgflcolor;
1183 	}
1184 
1185 	/*
1186 	 * fail if any of these conditions is true:
1187 	 * [1]  there really are no free pages, or
1188 	 * [2]  only kernel "reserved" pages remain and
1189 	 *        reserved pages have not been requested.
1190 	 * [3]  only pagedaemon "reserved" pages remain and
1191 	 *        the requestor isn't the pagedaemon.
1192 	 * we make kernel reserve pages available if called by a
1193 	 * kernel thread.
1194 	 */
1195 	l = curlwp;
1196 	if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
1197 		flags |= UVM_PGA_USERESERVE;
1198 	}
1199 
1200  again:
1201 	switch (strat) {
1202 	case UVM_PGA_STRAT_NORMAL:
1203 		/* Check freelists: descending priority (ascending id) order. */
1204 		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1205 			pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
1206 			if (pg != NULL) {
1207 				goto gotit;
1208 			}
1209 		}
1210 
1211 		/* No pages free!  Have pagedaemon free some memory. */
1212 		splx(s);
1213 		uvm_kick_pdaemon();
1214 		return NULL;
1215 
1216 	case UVM_PGA_STRAT_ONLY:
1217 	case UVM_PGA_STRAT_FALLBACK:
1218 		/* Attempt to allocate from the specified free list. */
1219 		KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
1220 		pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
1221 		if (pg != NULL) {
1222 			goto gotit;
1223 		}
1224 
1225 		/* Fall back, if possible. */
1226 		if (strat == UVM_PGA_STRAT_FALLBACK) {
1227 			strat = UVM_PGA_STRAT_NORMAL;
1228 			goto again;
1229 		}
1230 
1231 		/* No pages free!  Have pagedaemon free some memory. */
1232 		splx(s);
1233 		uvm_kick_pdaemon();
1234 		return NULL;
1235 
1236 	case UVM_PGA_STRAT_NUMA:
1237 		/*
1238 		 * NUMA strategy (experimental): allocating from the correct
1239 		 * bucket is more important than observing freelist
1240 		 * priority.  Look only to the current NUMA node; if that
1241 		 * fails, we need to look to other NUMA nodes, so retry with
1242 		 * the normal strategy.
1243 		 */
1244 		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
1245 			pg = uvm_pgflcache_alloc(ucpu, lcv, color);
1246 			if (pg != NULL) {
1247 				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
1248 				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
1249 				goto gotit;
1250 			}
1251 			pg = uvm_pagealloc_pgb(ucpu, lcv,
1252 			    ucpu->pgflbucket, &color, flags);
1253 			if (pg != NULL) {
1254 				goto gotit;
1255 			}
1256 		}
1257 		strat = UVM_PGA_STRAT_NORMAL;
1258 		goto again;
1259 
1260 	default:
1261 		panic("uvm_pagealloc_strat: bad strat %d", strat);
1262 		/* NOTREACHED */
1263 	}
1264 
1265  gotit:
1266 	/*
1267 	 * We now know which color we actually allocated from; set
1268 	 * the next color accordingly.
1269 	 */
1270 
1271 	ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
1272 
1273 	/*
1274 	 * while still at IPL_VM, update allocation statistics.
1275 	 */
1276 
1277     	CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
1278 	if (anon) {
1279 		CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
1280 	}
1281 	splx(s);
1282 	KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
1283 
1284 	/*
1285 	 * assign the page to the object.  as the page was free, we know
1286 	 * that pg->uobject and pg->uanon are NULL.  we only need to take
1287 	 * the page's interlock if we are changing the values.
1288 	 */
1289 	if (anon != NULL || obj != NULL) {
1290 		mutex_enter(&pg->interlock);
1291 	}
1292 	pg->offset = off;
1293 	pg->uobject = obj;
1294 	pg->uanon = anon;
1295 	KASSERT(uvm_page_owner_locked_p(pg, true));
1296 	if (anon) {
1297 		anon->an_page = pg;
1298 		pg->flags |= PG_ANON;
1299 		mutex_exit(&pg->interlock);
1300 	} else if (obj) {
1301 		/*
1302 		 * set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
1303 		 */
1304 		if (UVM_OBJ_IS_VNODE(obj)) {
1305 			pg->flags |= PG_FILE;
1306 		} else if (UVM_OBJ_IS_AOBJ(obj)) {
1307 			pg->flags |= PG_AOBJ;
1308 		}
1309 		uvm_pageinsert_object(obj, pg);
1310 		mutex_exit(&pg->interlock);
1311 		error = uvm_pageinsert_tree(obj, pg);
1312 		if (error != 0) {
1313 			mutex_enter(&pg->interlock);
1314 			uvm_pageremove_object(obj, pg);
1315 			mutex_exit(&pg->interlock);
1316 			uvm_pagefree(pg);
1317 			return NULL;
1318 		}
1319 	}
1320 
1321 #if defined(UVM_PAGE_TRKOWN)
1322 	pg->owner_tag = NULL;
1323 #endif
1324 	UVM_PAGE_OWN(pg, "new alloc");
1325 
1326 	if (flags & UVM_PGA_ZERO) {
1327 		/* A zero'd page is not clean. */
1328 		if (obj != NULL || anon != NULL) {
1329 			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1330 		}
1331 		pmap_zero_page(VM_PAGE_TO_PHYS(pg));
1332 	}
1333 
1334 	return(pg);
1335 }
1336 
1337 /*
1338  * uvm_pagereplace: replace a page with another
1339  *
1340  * => object must be locked
1341  * => page interlocks must be held
1342  */
1343 
1344 void
1345 uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
1346 {
1347 	struct uvm_object *uobj = oldpg->uobject;
1348 	struct vm_page *pg __diagused;
1349 	uint64_t idx;
1350 
1351 	KASSERT((oldpg->flags & PG_TABLED) != 0);
1352 	KASSERT(uobj != NULL);
1353 	KASSERT((newpg->flags & PG_TABLED) == 0);
1354 	KASSERT(newpg->uobject == NULL);
1355 	KASSERT(rw_write_held(uobj->vmobjlock));
1356 	KASSERT(mutex_owned(&oldpg->interlock));
1357 	KASSERT(mutex_owned(&newpg->interlock));
1358 
1359 	newpg->uobject = uobj;
1360 	newpg->offset = oldpg->offset;
1361 	idx = newpg->offset >> PAGE_SHIFT;
1362 	pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
1363 	KASSERT(pg == oldpg);
1364 	if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
1365 		if ((newpg->flags & PG_CLEAN) != 0) {
1366 			radix_tree_clear_tag(&uobj->uo_pages, idx,
1367 			    UVM_PAGE_DIRTY_TAG);
1368 		} else {
1369 			radix_tree_set_tag(&uobj->uo_pages, idx,
1370 			    UVM_PAGE_DIRTY_TAG);
1371 		}
1372 	}
1373 	/*
1374 	 * oldpg's PG_STAT is stable.  newpg is not reachable by others yet.
1375 	 */
1376 	newpg->flags =
1377 	    (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
1378 	uvm_pageinsert_object(uobj, newpg);
1379 	uvm_pageremove_object(uobj, oldpg);
1380 }
1381 
1382 /*
1383  * uvm_pagerealloc: reallocate a page from one object to another
1384  *
1385  * => both objects must be locked
1386  */
1387 
1388 int
1389 uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
1390 {
1391 	int error = 0;
1392 
1393 	/*
1394 	 * remove it from the old object
1395 	 */
1396 
1397 	if (pg->uobject) {
1398 		uvm_pageremove_tree(pg->uobject, pg);
1399 		uvm_pageremove_object(pg->uobject, pg);
1400 	}
1401 
1402 	/*
1403 	 * put it in the new object
1404 	 */
1405 
1406 	if (newobj) {
1407 		mutex_enter(&pg->interlock);
1408 		pg->uobject = newobj;
1409 		pg->offset = newoff;
1410 		if (UVM_OBJ_IS_VNODE(newobj)) {
1411 			pg->flags |= PG_FILE;
1412 		} else if (UVM_OBJ_IS_AOBJ(newobj)) {
1413 			pg->flags |= PG_AOBJ;
1414 		}
1415 		uvm_pageinsert_object(newobj, pg);
1416 		mutex_exit(&pg->interlock);
1417 		error = uvm_pageinsert_tree(newobj, pg);
1418 		if (error != 0) {
1419 			mutex_enter(&pg->interlock);
1420 			uvm_pageremove_object(newobj, pg);
1421 			mutex_exit(&pg->interlock);
1422 		}
1423 	}
1424 
1425 	return error;
1426 }
1427 
1428 #ifdef DEBUG
1429 /*
1430  * check if page is zero-filled
1431  */
1432 void
1433 uvm_pagezerocheck(struct vm_page *pg)
1434 {
1435 	int *p, *ep;
1436 
1437 	KASSERT(uvm_zerocheckkva != 0);
1438 
1439 	/*
1440 	 * XXX assuming pmap_kenter_pa and pmap_kremove never call
1441 	 * uvm page allocator.
1442 	 *
1443 	 * it might be better to have "CPU-local temporary map" pmap interface.
1444 	 */
1445 	mutex_spin_enter(&uvm_zerochecklock);
1446 	pmap_kenter_pa(uvm_zerocheckkva, VM_PAGE_TO_PHYS(pg), VM_PROT_READ, 0);
1447 	p = (int *)uvm_zerocheckkva;
1448 	ep = (int *)((char *)p + PAGE_SIZE);
1449 	pmap_update(pmap_kernel());
1450 	while (p < ep) {
1451 		if (*p != 0)
1452 			panic("zero page isn't zero-filled");
1453 		p++;
1454 	}
1455 	pmap_kremove(uvm_zerocheckkva, PAGE_SIZE);
1456 	mutex_spin_exit(&uvm_zerochecklock);
1457 	/*
1458 	 * pmap_update() is not necessary here because no one except us
1459 	 * uses this VA.
1460 	 */
1461 }
1462 #endif /* DEBUG */
1463 
1464 /*
1465  * uvm_pagefree: free page
1466  *
1467  * => erase page's identity (i.e. remove from object)
1468  * => put page on free list
1469  * => caller must lock owning object (either anon or uvm_object)
1470  * => assumes all valid mappings of pg are gone
1471  */
1472 
1473 void
1474 uvm_pagefree(struct vm_page *pg)
1475 {
1476 	struct pgfreelist *pgfl;
1477 	struct pgflbucket *pgb;
1478 	struct uvm_cpu *ucpu;
1479 	kmutex_t *lock;
1480 	int bucket, s;
1481 	bool locked;
1482 
1483 #ifdef DEBUG
1484 	if (pg->uobject == (void *)0xdeadbeef &&
1485 	    pg->uanon == (void *)0xdeadbeef) {
1486 		panic("uvm_pagefree: freeing free page %p", pg);
1487 	}
1488 #endif /* DEBUG */
1489 
1490 	KASSERT((pg->flags & PG_PAGEOUT) == 0);
1491 	KASSERT(!(pg->flags & PG_FREE));
1492 	KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
1493 	KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
1494 		rw_write_held(pg->uanon->an_lock));
1495 
1496 	/*
1497 	 * remove the page from the object's tree before acquiring any page
1498 	 * interlocks: this can acquire locks to free radixtree nodes.
1499 	 */
1500 	if (pg->uobject != NULL) {
1501 		uvm_pageremove_tree(pg->uobject, pg);
1502 	}
1503 
1504 	/*
1505 	 * if the page is loaned, resolve the loan instead of freeing.
1506 	 */
1507 
1508 	if (pg->loan_count) {
1509 		KASSERT(pg->wire_count == 0);
1510 
1511 		/*
1512 		 * if the page is owned by an anon then we just want to
1513 		 * drop anon ownership.  the kernel will free the page when
1514 		 * it is done with it.  if the page is owned by an object,
1515 		 * remove it from the object and mark it dirty for the benefit
1516 		 * of possible anon owners.
1517 		 *
1518 		 * regardless of previous ownership, wakeup any waiters,
1519 		 * unbusy the page, and we're done.
1520 		 */
1521 
1522 		uvm_pagelock(pg);
1523 		locked = true;
1524 		if (pg->uobject != NULL) {
1525 			uvm_pageremove_object(pg->uobject, pg);
1526 			pg->flags &= ~(PG_FILE|PG_AOBJ);
1527 		} else if (pg->uanon != NULL) {
1528 			if ((pg->flags & PG_ANON) == 0) {
1529 				pg->loan_count--;
1530 			} else {
1531 				const unsigned status = uvm_pagegetdirty(pg);
1532 				pg->flags &= ~PG_ANON;
1533 				cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1534 			}
1535 			pg->uanon->an_page = NULL;
1536 			pg->uanon = NULL;
1537 		}
1538 		if (pg->pqflags & PQ_WANTED) {
1539 			wakeup(pg);
1540 		}
1541 		pg->pqflags &= ~PQ_WANTED;
1542 		pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
1543 #ifdef UVM_PAGE_TRKOWN
1544 		pg->owner_tag = NULL;
1545 #endif
1546 		KASSERT((pg->flags & PG_STAT) == 0);
1547 		if (pg->loan_count) {
1548 			KASSERT(pg->uobject == NULL);
1549 			if (pg->uanon == NULL) {
1550 				uvm_pagedequeue(pg);
1551 			}
1552 			uvm_pageunlock(pg);
1553 			return;
1554 		}
1555 	} else if (pg->uobject != NULL || pg->uanon != NULL ||
1556 	           pg->wire_count != 0) {
1557 		uvm_pagelock(pg);
1558 		locked = true;
1559 	} else {
1560 		locked = false;
1561 	}
1562 
1563 	/*
1564 	 * remove page from its object or anon.
1565 	 */
1566 	if (pg->uobject != NULL) {
1567 		uvm_pageremove_object(pg->uobject, pg);
1568 	} else if (pg->uanon != NULL) {
1569 		const unsigned int status = uvm_pagegetdirty(pg);
1570 		pg->uanon->an_page = NULL;
1571 		pg->uanon = NULL;
1572 		cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
1573 	}
1574 
1575 	/*
1576 	 * if the page was wired, unwire it now.
1577 	 */
1578 
1579 	if (pg->wire_count) {
1580 		pg->wire_count = 0;
1581 		atomic_dec_uint(&uvmexp.wired);
1582 	}
1583 	if (locked) {
1584 		/*
1585 		 * wake anyone waiting on the page.
1586 		 */
1587 		if ((pg->pqflags & PQ_WANTED) != 0) {
1588 			pg->pqflags &= ~PQ_WANTED;
1589 			wakeup(pg);
1590 		}
1591 
1592 		/*
1593 		 * now remove the page from the queues.
1594 		 */
1595 		uvm_pagedequeue(pg);
1596 		uvm_pageunlock(pg);
1597 	} else {
1598 		KASSERT(!uvmpdpol_pageisqueued_p(pg));
1599 	}
1600 
1601 	/*
1602 	 * and put on free queue
1603 	 */
1604 
1605 #ifdef DEBUG
1606 	pg->uobject = (void *)0xdeadbeef;
1607 	pg->uanon = (void *)0xdeadbeef;
1608 #endif /* DEBUG */
1609 
1610 	/* Try to send the page to the per-CPU cache. */
1611 	s = splvm();
1612     	CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
1613 	ucpu = curcpu()->ci_data.cpu_uvm;
1614 	bucket = uvm_page_get_bucket(pg);
1615 	if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
1616 		splx(s);
1617 		return;
1618 	}
1619 
1620 	/* Didn't work.  Never mind, send it to a global bucket. */
1621 	pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
1622 	pgb = pgfl->pgfl_buckets[bucket];
1623 	lock = &uvm_freelist_locks[bucket].lock;
1624 
1625 	mutex_spin_enter(lock);
1626 	/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
1627 	pg->flags = PG_FREE;
1628 	LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
1629 	pgb->pgb_nfree++;
1630 	mutex_spin_exit(lock);
1631 	splx(s);
1632 }
1633 
1634 /*
1635  * uvm_page_unbusy: unbusy an array of pages.
1636  *
1637  * => pages must either all belong to the same object, or all belong to anons.
1638  * => if pages are object-owned, object must be locked.
1639  * => if pages are anon-owned, anons must be locked.
1640  * => caller must make sure that anon-owned pages are not PG_RELEASED.
1641  */
1642 
1643 void
1644 uvm_page_unbusy(struct vm_page **pgs, int npgs)
1645 {
1646 	struct vm_page *pg;
1647 	int i;
1648 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1649 
1650 	for (i = 0; i < npgs; i++) {
1651 		pg = pgs[i];
1652 		if (pg == NULL || pg == PGO_DONTCARE) {
1653 			continue;
1654 		}
1655 
1656 		KASSERT(uvm_page_owner_locked_p(pg, true));
1657 		KASSERT(pg->flags & PG_BUSY);
1658 		KASSERT((pg->flags & PG_PAGEOUT) == 0);
1659 		if (pg->flags & PG_RELEASED) {
1660 			UVMHIST_LOG(ubchist, "releasing pg %#jx",
1661 			    (uintptr_t)pg, 0, 0, 0);
1662 			KASSERT(pg->uobject != NULL ||
1663 			    (pg->uanon != NULL && pg->uanon->an_ref > 0));
1664 			pg->flags &= ~PG_RELEASED;
1665 			uvm_pagefree(pg);
1666 		} else {
1667 			UVMHIST_LOG(ubchist, "unbusying pg %#jx",
1668 			    (uintptr_t)pg, 0, 0, 0);
1669 			KASSERT((pg->flags & PG_FAKE) == 0);
1670 			pg->flags &= ~PG_BUSY;
1671 			uvm_pagelock(pg);
1672 			uvm_pagewakeup(pg);
1673 			uvm_pageunlock(pg);
1674 			UVM_PAGE_OWN(pg, NULL);
1675 		}
1676 	}
1677 }
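/*
 * Illustrative sketch (added for clarity): a pager typically unbusies the
 * whole array it was handed once I/O completes, with the owning object
 * still locked; "uobj", "pgs" and "npages" stand for the caller's state.
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	uvm_page_unbusy(pgs, npages);
 *	rw_exit(uobj->vmobjlock);
 */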
1678 
1679 /*
1680  * uvm_pagewait: wait for a busy page
1681  *
1682  * => page must be known PG_BUSY
1683  * => object must be read or write locked
1684  * => object will be unlocked on return
1685  */
1686 
1687 void
1688 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
1689 {
1690 
1691 	KASSERT(rw_lock_held(lock));
1692 	KASSERT((pg->flags & PG_BUSY) != 0);
1693 	KASSERT(uvm_page_owner_locked_p(pg, false));
1694 
1695 	mutex_enter(&pg->interlock);
1696 	pg->pqflags |= PQ_WANTED;
1697 	rw_exit(lock);
1698 	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
1699 }
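/*
 * Illustrative sketch (added, not from the original): the usual retry
 * pattern around a busy page.  Because uvm_pagewait() drops the object
 * lock, the caller re-takes it and looks the page up again.
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	for (;;) {
 *		pg = uvm_pagelookup(uobj, off);
 *		if (pg == NULL || (pg->flags & PG_BUSY) == 0)
 *			break;
 *		uvm_pagewait(pg, uobj->vmobjlock, "pgwait");
 *		rw_enter(uobj->vmobjlock, RW_WRITER);
 *	}
 *	... use or allocate the page with the object locked ...
 *	rw_exit(uobj->vmobjlock);
 */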
1700 
1701 /*
1702  * uvm_pagewakeup: wake anyone waiting on a page
1703  *
1704  * => page interlock must be held
1705  */
1706 
1707 void
1708 uvm_pagewakeup(struct vm_page *pg)
1709 {
1710 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
1711 
1712 	KASSERT(mutex_owned(&pg->interlock));
1713 
1714 	UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
1715 
1716 	if ((pg->pqflags & PQ_WANTED) != 0) {
1717 		wakeup(pg);
1718 		pg->pqflags &= ~PQ_WANTED;
1719 	}
1720 }
1721 
1722 /*
1723  * uvm_pagewanted_p: return true if someone is waiting on the page
1724  *
1725  * => object must be write locked (lock out all concurrent access)
1726  */
1727 
1728 bool
1729 uvm_pagewanted_p(struct vm_page *pg)
1730 {
1731 
1732 	KASSERT(uvm_page_owner_locked_p(pg, true));
1733 
1734 	return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
1735 }
1736 
1737 #if defined(UVM_PAGE_TRKOWN)
1738 /*
1739  * uvm_page_own: set or release page ownership
1740  *
1741  * => this is a debugging function that keeps track of who sets PG_BUSY
1742  *	and where they do it.   it can be used to track down problems
1743  *	such as a process setting "PG_BUSY" and never releasing it.
1744  * => page's object [if any] must be locked
1745  * => if "tag" is NULL then we are releasing page ownership
1746  */
1747 void
1748 uvm_page_own(struct vm_page *pg, const char *tag)
1749 {
1750 
1751 	KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
1752 	KASSERT(uvm_page_owner_locked_p(pg, true));
1753 
1754 	/* gain ownership? */
1755 	if (tag) {
1756 		KASSERT((pg->flags & PG_BUSY) != 0);
1757 		if (pg->owner_tag) {
1758 			printf("uvm_page_own: page %p already owned "
1759 			    "by proc %d.%d [%s]\n", pg,
1760 			    pg->owner, pg->lowner, pg->owner_tag);
1761 			panic("uvm_page_own");
1762 		}
1763 		pg->owner = curproc->p_pid;
1764 		pg->lowner = curlwp->l_lid;
1765 		pg->owner_tag = tag;
1766 		return;
1767 	}
1768 
1769 	/* drop ownership */
1770 	KASSERT((pg->flags & PG_BUSY) == 0);
1771 	if (pg->owner_tag == NULL) {
1772 		printf("uvm_page_own: dropping ownership of a non-owned "
1773 		    "page (%p)\n", pg);
1774 		panic("uvm_page_own");
1775 	}
1776 	pg->owner_tag = NULL;
1777 }
1778 #endif
1779 
1780 /*
1781  * uvm_pagelookup: look up a page
1782  *
1783  * => caller should lock object to keep someone from pulling the page
1784  *	out from under it
1785  */
1786 
1787 struct vm_page *
1788 uvm_pagelookup(struct uvm_object *obj, voff_t off)
1789 {
1790 	struct vm_page *pg;
1791 
1792 	/* No - used from DDB. KASSERT(rw_lock_held(obj->vmobjlock)); */
1793 
1794 	pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
1795 
1796 	KASSERT(pg == NULL || obj->uo_npages != 0);
1797 	KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1798 		(pg->flags & PG_BUSY) != 0);
1799 	return pg;
1800 }
1801 
1802 /*
1803  * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
1804  *
1805  * => caller must lock objects
1806  * => caller must hold pg->interlock
1807  */
1808 
1809 void
1810 uvm_pagewire(struct vm_page *pg)
1811 {
1812 
1813 	KASSERT(uvm_page_owner_locked_p(pg, true));
1814 	KASSERT(mutex_owned(&pg->interlock));
1815 #if defined(READAHEAD_STATS)
1816 	if ((pg->flags & PG_READAHEAD) != 0) {
1817 		uvm_ra_hit.ev_count++;
1818 		pg->flags &= ~PG_READAHEAD;
1819 	}
1820 #endif /* defined(READAHEAD_STATS) */
1821 	if (pg->wire_count == 0) {
1822 		uvm_pagedequeue(pg);
1823 		atomic_inc_uint(&uvmexp.wired);
1824 	}
1825 	pg->wire_count++;
1826 	KASSERT(pg->wire_count > 0);	/* detect wraparound */
1827 }
1828 
1829 /*
1830  * uvm_pageunwire: unwire the page.
1831  *
1832  * => activate if wire count goes to zero.
1833  * => caller must lock objects
1834  * => caller must hold pg->interlock
1835  */
1836 
1837 void
1838 uvm_pageunwire(struct vm_page *pg)
1839 {
1840 
1841 	KASSERT(uvm_page_owner_locked_p(pg, true));
1842 	KASSERT(pg->wire_count != 0);
1843 	KASSERT(!uvmpdpol_pageisqueued_p(pg));
1844 	KASSERT(mutex_owned(&pg->interlock));
1845 	pg->wire_count--;
1846 	if (pg->wire_count == 0) {
1847 		uvm_pageactivate(pg);
1848 		KASSERT(uvmexp.wired != 0);
1849 		atomic_dec_uint(&uvmexp.wired);
1850 	}
1851 }
1852 
1853 /*
1854  * uvm_pagedeactivate: deactivate page
1855  *
1856  * => caller must lock objects
1857  * => caller must check to make sure page is not wired
1858  * => object that page belongs to must be locked (so we can adjust pg->flags)
1859  * => caller must clear the reference on the page before calling
1860  * => caller must hold pg->interlock
1861  */
1862 
1863 void
1864 uvm_pagedeactivate(struct vm_page *pg)
1865 {
1866 
1867 	KASSERT(uvm_page_owner_locked_p(pg, false));
1868 	KASSERT(mutex_owned(&pg->interlock));
1869 	if (pg->wire_count == 0) {
1870 		KASSERT(uvmpdpol_pageisqueued_p(pg));
1871 		uvmpdpol_pagedeactivate(pg);
1872 	}
1873 }
1874 
1875 /*
1876  * uvm_pageactivate: activate page
1877  *
1878  * => caller must lock objects
1879  * => caller must hold pg->interlock
1880  */
1881 
1882 void
1883 uvm_pageactivate(struct vm_page *pg)
1884 {
1885 
1886 	KASSERT(uvm_page_owner_locked_p(pg, false));
1887 	KASSERT(mutex_owned(&pg->interlock));
1888 #if defined(READAHEAD_STATS)
1889 	if ((pg->flags & PG_READAHEAD) != 0) {
1890 		uvm_ra_hit.ev_count++;
1891 		pg->flags &= ~PG_READAHEAD;
1892 	}
1893 #endif /* defined(READAHEAD_STATS) */
1894 	if (pg->wire_count == 0) {
1895 		uvmpdpol_pageactivate(pg);
1896 	}
1897 }
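
/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller that has just referenced a page and wants the pagedaemon to see
 * it as recently used could, with the owning object (or anon) lock held
 * at least shared, do:
 *
 *	uvm_pagelock(pg);
 *	uvm_pageactivate(pg);
 *	uvm_pageunlock(pg);
 *
 * uvm_pagedeactivate() is called the same way to signal the opposite.
 */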
1898 
1899 /*
1900  * uvm_pagedequeue: remove a page from any paging queue
1901  *
1902  * => caller must lock objects
1903  * => caller must hold pg->interlock
1904  */
1905 void
1906 uvm_pagedequeue(struct vm_page *pg)
1907 {
1908 
1909 	KASSERT(uvm_page_owner_locked_p(pg, true));
1910 	KASSERT(mutex_owned(&pg->interlock));
1911 	if (uvmpdpol_pageisqueued_p(pg)) {
1912 		uvmpdpol_pagedequeue(pg);
1913 	}
1914 }
1915 
1916 /*
1917  * uvm_pageenqueue: add a page to a paging queue without activating.
1918  * used where a page is not really demanded (yet), e.g. read-ahead.
1919  *
1920  * => caller must lock objects
1921  * => caller must hold pg->interlock
1922  */
1923 void
1924 uvm_pageenqueue(struct vm_page *pg)
1925 {
1926 
1927 	KASSERT(uvm_page_owner_locked_p(pg, false));
1928 	KASSERT(mutex_owned(&pg->interlock));
1929 	if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
1930 		uvmpdpol_pageenqueue(pg);
1931 	}
1932 }
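
/*
 * Illustrative sketch (not part of the original source): read-ahead style
 * code that has brought in a page it does not yet know will be used can
 * hand it to the pagedaemon without activating it, again with the owning
 * object locked at least shared:
 *
 *	uvm_pagelock(pg);
 *	uvm_pageenqueue(pg);
 *	uvm_pageunlock(pg);
 */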
1933 
1934 /*
1935  * uvm_pagelock: acquire page interlock
1936  */
1937 void
1938 uvm_pagelock(struct vm_page *pg)
1939 {
1940 
1941 	mutex_enter(&pg->interlock);
1942 }
1943 
1944 /*
1945  * uvm_pagelock2: acquire two page interlocks
1946  */
1947 void
1948 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
1949 {
1950 
1951 	if (pg1 < pg2) {
1952 		mutex_enter(&pg1->interlock);
1953 		mutex_enter(&pg2->interlock);
1954 	} else {
1955 		mutex_enter(&pg2->interlock);
1956 		mutex_enter(&pg1->interlock);
1957 	}
1958 }
1959 
1960 /*
1961  * uvm_pageunlock: release page interlock, and if a page replacement intent
1962  * is set on the page, pass it to uvmpdpol to make real.
1963  *
1964  * => caller must hold pg->interlock
1965  */
1966 void
1967 uvm_pageunlock(struct vm_page *pg)
1968 {
1969 
1970 	if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
1971 	    (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
1972 		mutex_exit(&pg->interlock);
1973 		return;
1974 	}
1975 	pg->pqflags |= PQ_INTENT_QUEUED;
1976 	mutex_exit(&pg->interlock);
1977 	uvmpdpol_pagerealize(pg);
1978 }
1979 
1980 /*
1981  * uvm_pageunlock2: release two page interlocks, and for each page, if a
1982  * page replacement intent is set on it, pass it to uvmpdpol to make
1983  * real.
1984  *
1985  * => caller must hold pg1->interlock and pg2->interlock
1986  */
1987 void
1988 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
1989 {
1990 
1991 	if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
1992 	    (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
1993 		mutex_exit(&pg1->interlock);
1994 		pg1 = NULL;
1995 	} else {
1996 		pg1->pqflags |= PQ_INTENT_QUEUED;
1997 		mutex_exit(&pg1->interlock);
1998 	}
1999 
2000 	if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
2001 	    (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
2002 		mutex_exit(&pg2->interlock);
2003 		pg2 = NULL;
2004 	} else {
2005 		pg2->pqflags |= PQ_INTENT_QUEUED;
2006 		mutex_exit(&pg2->interlock);
2007 	}
2008 
2009 	if (pg1 != NULL) {
2010 		uvmpdpol_pagerealize(pg1);
2011 	}
2012 	if (pg2 != NULL) {
2013 		uvmpdpol_pagerealize(pg2);
2014 	}
2015 }
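
/*
 * Illustrative sketch (not part of the original source): code that works
 * on a pair of pages at once (for example, when moving state from an old
 * page to a new one) can take both interlocks without worrying about lock
 * ordering, since uvm_pagelock2() orders them by address internally:
 *
 *	uvm_pagelock2(opg, npg);
 *	... update the queue state of both pages ...
 *	uvm_pageunlock2(opg, npg);
 */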
2016 
2017 /*
2018  * uvm_pagezero: zero fill a page
2019  *
2020  * => if page is part of an object then the object should be locked
2021  *	to protect pg->flags.
2022  */
2023 
2024 void
2025 uvm_pagezero(struct vm_page *pg)
2026 {
2027 
2028 	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
2029 	pmap_zero_page(VM_PAGE_TO_PHYS(pg));
2030 }
2031 
2032 /*
2033  * uvm_pagecopy: copy a page
2034  *
2035  * => if page is part of an object then the object should be locked
2036  *	to protect pg->flags.
2037  */
2038 
2039 void
2040 uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
2041 {
2042 
2043 	uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
2044 	pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
2045 }
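
/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * copy-on-write style path, with the object lock held for writing, might
 * replace a shared page with a private copy roughly as follows:
 *
 *	npg = uvm_pagealloc(uobj, off, NULL, 0);
 *	if (npg != NULL)
 *		uvm_pagecopy(opg, npg);
 *
 * Both uvm_pagecopy() and uvm_pagezero() mark the destination page dirty
 * before touching its contents, so callers need not do so themselves.
 */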
2046 
2047 /*
2048  * uvm_pageismanaged: test to see whether a page (specified by PA) is managed.
2049  */
2050 
2051 bool
2052 uvm_pageismanaged(paddr_t pa)
2053 {
2054 
2055 	return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
2056 }
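
/*
 * Illustrative sketch (not part of the original source): code handed a raw
 * physical address can check that it belongs to a managed page before
 * trying to get at its vm_page:
 *
 *	if (uvm_pageismanaged(pa))
 *		pg = PHYS_TO_VM_PAGE(pa);
 *	else
 *		pg = NULL;
 */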
2057 
2058 /*
2059  * uvm_page_lookup_freelist: look up the free list for the specified page
2060  */
2061 
2062 int
2063 uvm_page_lookup_freelist(struct vm_page *pg)
2064 {
2065 	uvm_physseg_t upm;
2066 
2067 	upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
2068 	KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
2069 	return uvm_physseg_get_free_list(upm);
2070 }
2071 
2072 /*
2073  * uvm_page_owner_locked_p: return true if object associated with page is
2074  * locked.  this is a weak check for runtime assertions only.
2075  */
2076 
2077 bool
2078 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
2079 {
2080 
2081 	if (pg->uobject != NULL) {
2082 		return exclusive
2083 		    ? rw_write_held(pg->uobject->vmobjlock)
2084 		    : rw_lock_held(pg->uobject->vmobjlock);
2085 	}
2086 	if (pg->uanon != NULL) {
2087 		return exclusive
2088 		    ? rw_write_held(pg->uanon->an_lock)
2089 		    : rw_lock_held(pg->uanon->an_lock);
2090 	}
2091 	return true;
2092 }
2093 
2094 /*
2095  * uvm_pagereadonly_p: return true if the page should be mapped read-only
2096  */
2097 
2098 bool
2099 uvm_pagereadonly_p(struct vm_page *pg)
2100 {
2101 	struct uvm_object * const uobj = pg->uobject;
2102 
2103 	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
2104 	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));
2105 	if ((pg->flags & PG_RDONLY) != 0) {
2106 		return true;
2107 	}
2108 	if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
2109 		return true;
2110 	}
2111 	if (uobj == NULL) {
2112 		return false;
2113 	}
2114 	return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
2115 }
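
/*
 * Illustrative sketch (not part of the original source): fault-handling
 * style code might consult this when deciding what protection to hand to
 * pmap_enter(); the exact flags used here are an assumption:
 *
 *	prot = VM_PROT_READ;
 *	if (!uvm_pagereadonly_p(pg))
 *		prot |= VM_PROT_WRITE;
 *	(void)pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot,
 *	    prot | PMAP_CANFAIL);
 */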
2116 
2117 #ifdef PMAP_DIRECT
2118 /*
2119  * Call pmap to translate a physical address into a virtual one and to run
2120  * a callback on it.  Used to avoid actually mapping the pages; the pmap most
2121  * likely uses a direct map or equivalent.
2122  */
2123 int
2124 uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
2125             int (*process)(void *, size_t, void *), void *arg)
2126 {
2127 	int error = 0;
2128 	paddr_t pa;
2129 	size_t todo;
2130 	voff_t pgoff = (off & PAGE_MASK);
2131 	struct vm_page *pg;
2132 
2133 	KASSERT(npages > 0 && len > 0);
2134 
2135 	for (int i = 0; i < npages; i++) {
2136 		pg = pgs[i];
2137 
2138 		KASSERT(len > 0);
2139 
2140 		/*
2141 		 * Caller is responsible for ensuring all the pages are
2142 		 * available.
2143 		 */
2144 		KASSERT(pg != NULL && pg != PGO_DONTCARE);
2145 
2146 		pa = VM_PAGE_TO_PHYS(pg);
2147 		todo = MIN(len, PAGE_SIZE - pgoff);
2148 
2149 		error = pmap_direct_process(pa, pgoff, todo, process, arg);
2150 		if (error)
2151 			break;
2152 
2153 		pgoff = 0;
2154 		len -= todo;
2155 	}
2156 
2157 	KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
2158 	return error;
2159 }
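
/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * callback that checksums data straight out of the direct map, without
 * ever entering the pages into a pmap:
 *
 *	static int
 *	sum_cb(void *ptr, size_t len, void *arg)
 *	{
 *		const uint8_t *p = ptr;
 *		uint32_t *sump = arg;
 *
 *		while (len--)
 *			*sump += *p++;
 *		return 0;
 *	}
 *
 *	...
 *	error = uvm_direct_process(pgs, npages, off, len, sum_cb, &sum);
 */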
2160 #endif /* PMAP_DIRECT */
2161 
2162 #if defined(DDB) || defined(DEBUGPRINT)
2163 
2164 /*
2165  * uvm_page_printit: actually print the page
2166  */
2167 
2168 static const char page_flagbits[] = UVM_PGFLAGBITS;
2169 static const char page_pqflagbits[] = UVM_PQFLAGBITS;
2170 
2171 void
2172 uvm_page_printit(struct vm_page *pg, bool full,
2173     void (*pr)(const char *, ...))
2174 {
2175 	struct vm_page *tpg;
2176 	struct uvm_object *uobj;
2177 	struct pgflbucket *pgb;
2178 	struct pgflist *pgl;
2179 	char pgbuf[128];
2180 
2181 	(*pr)("PAGE %p:\n", pg);
2182 	snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
2183 	(*pr)("  flags=%s\n", pgbuf);
2184 	snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
2185 	(*pr)("  pqflags=%s\n", pgbuf);
2186 	(*pr)("  uobject=%p, uanon=%p, offset=0x%llx\n",
2187 	    pg->uobject, pg->uanon, (long long)pg->offset);
2188 	(*pr)("  loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
2189 	    pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
2190 	    uvm_page_get_freelist(pg));
2191 	(*pr)("  pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
2192 #if defined(UVM_PAGE_TRKOWN)
2193 	if (pg->flags & PG_BUSY)
2194 		(*pr)("  owning process = %d.%d, tag=%s\n",
2195 		    pg->owner, pg->lowner, pg->owner_tag);
2196 	else
2197 		(*pr)("  page not busy, no owner\n");
2198 #else
2199 	(*pr)("  [page ownership tracking disabled]\n");
2200 #endif
2201 
2202 	if (!full)
2203 		return;
2204 
2205 	/* cross-verify object/anon */
2206 	if ((pg->flags & PG_FREE) == 0) {
2207 		if (pg->flags & PG_ANON) {
2208 			if (pg->uanon == NULL || pg->uanon->an_page != pg)
2209 			    (*pr)("  >>> ANON DOES NOT POINT HERE <<< (%p)\n",
2210 				(pg->uanon) ? pg->uanon->an_page : NULL);
2211 			else
2212 				(*pr)("  anon backpointer is OK\n");
2213 		} else {
2214 			uobj = pg->uobject;
2215 			if (uobj) {
2216 				(*pr)("  checking object list\n");
2217 				tpg = uvm_pagelookup(uobj, pg->offset);
2218 				if (tpg)
2219 					(*pr)("  page found on object list\n");
2220 				else
2221 			(*pr)("  >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
2222 			}
2223 		}
2224 	}
2225 
2226 	/* cross-verify page queue */
2227 	if (pg->flags & PG_FREE) {
2228 		int fl = uvm_page_get_freelist(pg);
2229 		int b = uvm_page_get_bucket(pg);
2230 		pgb = uvm.page_free[fl].pgfl_buckets[b];
2231 		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
2232 		(*pr)("  checking pageq list\n");
2233 		LIST_FOREACH(tpg, pgl, pageq.list) {
2234 			if (tpg == pg) {
2235 				break;
2236 			}
2237 		}
2238 		if (tpg)
2239 			(*pr)("  page found on pageq list\n");
2240 		else
2241 			(*pr)("  >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
2242 	}
2243 }
2244 
2245 /*
2246  * uvm_page_printall - print a summary of all managed pages
2247  */
2248 
2249 void
2250 uvm_page_printall(void (*pr)(const char *, ...))
2251 {
2252 	uvm_physseg_t i;
2253 	paddr_t pfn;
2254 	struct vm_page *pg;
2255 
2256 	(*pr)("%18s %4s %4s %18s %18s"
2257 #ifdef UVM_PAGE_TRKOWN
2258 	    " OWNER"
2259 #endif
2260 	    "\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
2261 	for (i = uvm_physseg_get_first();
2262 	     uvm_physseg_valid_p(i);
2263 	     i = uvm_physseg_get_next(i)) {
2264 		for (pfn = uvm_physseg_get_start(i);
2265 		     pfn < uvm_physseg_get_end(i);
2266 		     pfn++) {
2267 			pg = PHYS_TO_VM_PAGE(ptoa(pfn));
2268 
2269 			(*pr)("%18p %04x %08x %18p %18p",
2270 			    pg, pg->flags, pg->pqflags, pg->uobject,
2271 			    pg->uanon);
2272 #ifdef UVM_PAGE_TRKOWN
2273 			if (pg->flags & PG_BUSY)
2274 				(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
2275 #endif
2276 			(*pr)("\n");
2277 		}
2278 	}
2279 }
2280 
2281 /*
2282  * uvm_page_print_freelists - print a summary of the freelists
2283  */
2284 
2285 void
2286 uvm_page_print_freelists(void (*pr)(const char *, ...))
2287 {
2288 	struct pgfreelist *pgfl;
2289 	struct pgflbucket *pgb;
2290 	int fl, b, c;
2291 
2292 	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
2293 	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
2294 
2295 	for (fl = 0; fl < VM_NFREELIST; fl++) {
2296 		pgfl = &uvm.page_free[fl];
2297 		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
2298 		for (b = 0; b < uvm.bucketcount; b++) {
2299 			pgb = uvm.page_free[fl].pgfl_buckets[b];
2300 			(*pr)("    bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
2301 			    b, pgb, pgb->pgb_nfree,
2302 			    &uvm_freelist_locks[b].lock);
2303 			for (c = 0; c < uvmexp.ncolors; c++) {
2304 				(*pr)("        color(%d) @ %p, ", c,
2305 				    &pgb->pgb_colors[c]);
2306 				(*pr)("first page = %p\n",
2307 				    LIST_FIRST(&pgb->pgb_colors[c]));
2308 			}
2309 		}
2310 	}
2311 }
2312 
2313 #endif /* DDB || DEBUGPRINT */
2314