1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  * Copyright (c) 2014 Andrew Turner
15  * All rights reserved.
16  * Copyright (c) 2014 The FreeBSD Foundation
17  * All rights reserved.
18  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19  * All rights reserved.
20  *
21  * This code is derived from software contributed to Berkeley by
22  * the Systems Programming Group of the University of Utah Computer
23  * Science Department and William Jolitz of UUNET Technologies Inc.
24  *
25  * Portions of this software were developed by Andrew Turner under
26  * sponsorship from The FreeBSD Foundation.
27  *
28  * Portions of this software were developed by SRI International and the
29  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31  *
32  * Portions of this software were developed by the University of Cambridge
33  * Computer Laboratory as part of the CTSRD Project, with support from the
34  * UK Higher Education Innovation Fund (HEIF).
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  */
64 /*-
65  * Copyright (c) 2003 Networks Associates Technology, Inc.
66  * All rights reserved.
67  *
68  * This software was developed for the FreeBSD Project by Jake Burkholder,
69  * Safeport Network Services, and Network Associates Laboratories, the
70  * Security Research Division of Network Associates, Inc. under
71  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
72  * CHATS research program.
73  *
74  * Redistribution and use in source and binary forms, with or without
75  * modification, are permitted provided that the following conditions
76  * are met:
77  * 1. Redistributions of source code must retain the above copyright
78  *    notice, this list of conditions and the following disclaimer.
79  * 2. Redistributions in binary form must reproduce the above copyright
80  *    notice, this list of conditions and the following disclaimer in the
81  *    documentation and/or other materials provided with the distribution.
82  *
83  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93  * SUCH DAMAGE.
94  */
95 
96 /*
97  *	Manages physical address maps.
98  *
99  *	Since the information managed by this module is
100  *	also stored by the logical address mapping module,
101  *	this module may throw away valid virtual-to-physical
102  *	mappings at almost any time.  However, invalidations
103  *	of virtual-to-physical mappings must be done as
104  *	requested.
105  *
106  *	In order to cope with hardware architectures which
107  *	make virtual-to-physical map invalidations expensive,
108  *	this module may delay invalidation or reduced-protection
109  *	operations until such time as they are actually
110  *	necessary.  This module is given full information as
111  *	to which processors are currently using which maps,
112  *	and as to when physical maps must be made correct.
113  */
114 
115 #include "opt_pmap.h"
116 
117 #include <sys/param.h>
118 #include <sys/systm.h>
119 #include <sys/bitstring.h>
120 #include <sys/bus.h>
121 #include <sys/cpuset.h>
122 #include <sys/kernel.h>
123 #include <sys/ktr.h>
124 #include <sys/lock.h>
125 #include <sys/malloc.h>
126 #include <sys/mman.h>
127 #include <sys/msgbuf.h>
128 #include <sys/mutex.h>
129 #include <sys/physmem.h>
130 #include <sys/proc.h>
131 #include <sys/rwlock.h>
132 #include <sys/sbuf.h>
133 #include <sys/sx.h>
134 #include <sys/vmem.h>
135 #include <sys/vmmeter.h>
136 #include <sys/sched.h>
137 #include <sys/sysctl.h>
138 #include <sys/smp.h>
139 
140 #include <vm/vm.h>
141 #include <vm/vm_param.h>
142 #include <vm/vm_kern.h>
143 #include <vm/vm_page.h>
144 #include <vm/vm_map.h>
145 #include <vm/vm_object.h>
146 #include <vm/vm_extern.h>
147 #include <vm/vm_pageout.h>
148 #include <vm/vm_pager.h>
149 #include <vm/vm_phys.h>
150 #include <vm/vm_radix.h>
151 #include <vm/vm_reserv.h>
152 #include <vm/vm_dumpset.h>
153 #include <vm/uma.h>
154 
155 #include <machine/machdep.h>
156 #include <machine/md_var.h>
157 #include <machine/pcb.h>
158 #include <machine/sbi.h>
159 #include <machine/thead.h>
160 
161 /*
162  * Boundary values for the page table page index space:
163  *
164  * L3 pages: [0, NUL2E)
165  * L2 pages: [NUL2E, NUL2E + NUL1E)
166  * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
167  *
168  * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode the
169  * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
170  * in a set of page tables.
171  */
172 #define	NUL0E		Ln_ENTRIES
173 #define	NUL1E		(Ln_ENTRIES * NUL0E)
174 #define	NUL2E		(Ln_ENTRIES * NUL1E)
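
/*
 * An illustrative sizing of this index space (assuming Ln_ENTRIES == 512,
 * i.e. 4KB pages with 9-bit VPN fields):
 *
 *	NUL0E = 512				possible L1 pages (SV48 only)
 *	NUL1E = 512 * 512    = 262144		possible L2 pages
 *	NUL2E = 512 * 262144 = 134217728	possible L3 pages
 *
 * The L3 page table page covering a virtual address va thus has pindex
 * (va >> L2_SHIFT), and the L2 page covering va has pindex
 * NUL2E + (va >> L1_SHIFT); see pmap_l2_pindex() and pmap_l1_pindex() below.
 */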
175 
176 #ifdef PV_STATS
177 #define PV_STAT(x)	do { x ; } while (0)
178 #define	__pv_stat_used
179 #else
180 #define PV_STAT(x)	do { } while (0)
181 #define	__pv_stat_used	__unused
182 #endif
183 
184 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
185 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
186 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
187 
188 #define	NPV_LIST_LOCKS	MAXCPU
189 
190 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
191 			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
192 
193 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
194 	struct rwlock **_lockp = (lockp);		\
195 	struct rwlock *_new_lock;			\
196 							\
197 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
198 	if (_new_lock != *_lockp) {			\
199 		if (*_lockp != NULL)			\
200 			rw_wunlock(*_lockp);		\
201 		*_lockp = _new_lock;			\
202 		rw_wlock(*_lockp);			\
203 	}						\
204 } while (0)
205 
206 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
207 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
208 
209 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
210 	struct rwlock **_lockp = (lockp);		\
211 							\
212 	if (*_lockp != NULL) {				\
213 		rw_wunlock(*_lockp);			\
214 		*_lockp = NULL;				\
215 	}						\
216 } while (0)
217 
218 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
219 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
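
/*
 * A minimal usage sketch for the PV list lock helpers above (illustrative
 * only): callers start with a NULL lock pointer, switch it to the lock
 * covering whichever page they are currently working on, and drop it when
 * finished.
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's PV list ...
 *	RELEASE_PV_LIST_LOCK(&lock);
 */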
220 
221 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
222     "VM/pmap parameters");
223 
224 /* The list of all the user pmaps */
225 LIST_HEAD(pmaplist, pmap);
226 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(allpmaps);
227 
228 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
229 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
230     &pmap_mode, 0,
231     "translation mode, 0 = SV39, 1 = SV48");
232 
233 struct pmap kernel_pmap_store;
234 
235 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
236 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
237 vm_offset_t kernel_vm_end = 0;
238 
239 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
240 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
241 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
242 
243 /* This code assumes all L1 DMAP entries will be used */
244 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
245 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
246 
247 /*
248  * This code assumes that the early DEVMAP is L2_SIZE aligned.
249  */
250 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
251 
252 static struct rwlock_padalign pvh_global_lock;
253 static struct mtx_padalign allpmaps_lock;
254 
255 static int __read_frequently superpages_enabled = 1;
256 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
257     CTLFLAG_RDTUN, &superpages_enabled, 0,
258     "Enable support for transparent superpages");
259 
260 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
261     "2MB page mapping counters");
262 
263 static u_long pmap_l2_demotions;
264 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
265     &pmap_l2_demotions, 0,
266     "2MB page demotions");
267 
268 static u_long pmap_l2_mappings;
269 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
270     &pmap_l2_mappings, 0,
271     "2MB page mappings");
272 
273 static u_long pmap_l2_p_failures;
274 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
275     &pmap_l2_p_failures, 0,
276     "2MB page promotion failures");
277 
278 static u_long pmap_l2_promotions;
279 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
280     &pmap_l2_promotions, 0,
281     "2MB page promotions");
282 
283 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
284     "L1 (1GB) page mapping counters");
285 
286 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
287 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
288     &pmap_l1_demotions, "L1 (1GB) page demotions");
289 
290 /*
291  * Data for the pv entry allocation mechanism
292  */
293 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
294 static struct mtx pv_chunks_mutex;
295 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
296 static struct md_page *pv_table;
297 static struct md_page pv_dummy;
298 
299 extern cpuset_t all_harts;
300 
301 /*
302  * Internal flags for pmap_enter()'s helper functions.
303  */
304 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
305 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
306 
307 static void	free_pv_chunk(struct pv_chunk *pc);
308 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
309 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
310 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
311 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
312 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
313 		    vm_offset_t va);
314 static bool	pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va);
315 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
316 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
317 		    vm_offset_t va, struct rwlock **lockp);
318 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
319 		    u_int flags, vm_page_t m, struct rwlock **lockp);
320 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
321     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
322 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
323     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
324 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
325     vm_page_t m, struct rwlock **lockp);
326 
327 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
328 		struct rwlock **lockp);
329 
330 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
331     struct spglist *free);
332 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
333 
334 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
335 
336 static uint64_t pmap_satp_mode(void);
337 
338 #define	pmap_clear(pte)			pmap_store(pte, 0)
339 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
340 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
341 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
342 #define	pmap_load(pte)			atomic_load_64(pte)
343 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
344 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
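
/*
 * All PTE updates go through the 64-bit atomic wrappers above so that the
 * hardware page-table walker and other harts never observe a torn entry.
 * For example (illustrative), pmap_clear_bits(pte, PTE_W) removes write
 * permission with an atomic and-not, while pmap_load_clear(pte) atomically
 * swaps in an invalid (all-zero) entry and returns the old one for
 * inspection.
 */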
345 
346 /********************/
347 /* Inline functions */
348 /********************/
349 
350 static __inline void
351 pagecopy(void *s, void *d)
352 {
353 
354 	memcpy(d, s, PAGE_SIZE);
355 }
356 
357 static __inline void
358 pagezero(void *p)
359 {
360 
361 	bzero(p, PAGE_SIZE);
362 }
363 
364 #define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
365 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
366 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
367 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
368 
369 #define	PTE_TO_PHYS(pte) \
370     ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
371 #define	L2PTE_TO_PHYS(l2) \
372     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
373 #define	L1PTE_TO_PHYS(l1) \
374     ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
375 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
376 
377 /*
378  * Construct a page table entry of the specified level pointing to physical
379  * address pa, with PTE bits 'bits'.
380  *
381  * A leaf PTE of any level must point to an address matching its alignment,
382  * e.g. L2 pages must be 2MB aligned in memory.
383  */
384 #define	L1_PTE(pa, bits)	((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
385 #define	L2_PTE(pa, bits)	((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
386 #define	L3_PTE(pa, bits)	((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
387 
388 /*
389  * Construct a page directory entry (PDE), pointing to next level entry at pa,
390  * with PTE bits 'bits'.
391  *
392  * Unlike PTEs, page directory entries can point to any 4K-aligned physical
393  * address.
394  */
395 #define	L0_PDE(pa, bits)	L3_PTE(pa, bits)
396 #define	L1_PDE(pa, bits)	L3_PTE(pa, bits)
397 #define	L2_PDE(pa, bits)	L3_PTE(pa, bits)
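
/*
 * For example (illustrative, assuming L2_SHIFT == 21 and PTE_PPN1_S == 19):
 * a 2MB kernel superpage over the 2MB-aligned physical address pa is built
 * as L2_PTE(pa, PTE_KERN | PTE_X), which expands to
 * ((pa >> 21) << 19) | PTE_KERN | PTE_X and therefore places the page
 * number in PPN[1] and above while leaving PPN[0] zero, as a 2MB leaf
 * mapping requires.
 */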
398 
399 static __inline pd_entry_t *
400 pmap_l0(pmap_t pmap, vm_offset_t va)
401 {
402 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
403 	KASSERT(VIRT_IS_VALID(va),
404 	    ("%s: malformed virtual address %#lx", __func__, va));
405 	return (&pmap->pm_top[pmap_l0_index(va)]);
406 }
407 
408 static __inline pd_entry_t *
409 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
410 {
411 	vm_paddr_t phys;
412 	pd_entry_t *l1;
413 
414 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
415 	phys = PTE_TO_PHYS(pmap_load(l0));
416 	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
417 
418 	return (&l1[pmap_l1_index(va)]);
419 }
420 
421 static __inline pd_entry_t *
422 pmap_l1(pmap_t pmap, vm_offset_t va)
423 {
424 	pd_entry_t *l0;
425 
426 	KASSERT(VIRT_IS_VALID(va),
427 	    ("%s: malformed virtual address %#lx", __func__, va));
428 	if (pmap_mode == PMAP_MODE_SV39) {
429 		return (&pmap->pm_top[pmap_l1_index(va)]);
430 	} else {
431 		l0 = pmap_l0(pmap, va);
432 		if ((pmap_load(l0) & PTE_V) == 0)
433 			return (NULL);
434 		if ((pmap_load(l0) & PTE_RX) != 0)
435 			return (NULL);
436 		return (pmap_l0_to_l1(l0, va));
437 	}
438 }
439 
440 static __inline pd_entry_t *
441 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
442 {
443 	vm_paddr_t phys;
444 	pd_entry_t *l2;
445 
446 	phys = PTE_TO_PHYS(pmap_load(l1));
447 	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
448 
449 	return (&l2[pmap_l2_index(va)]);
450 }
451 
452 static __inline pd_entry_t *
453 pmap_l2(pmap_t pmap, vm_offset_t va)
454 {
455 	pd_entry_t *l1;
456 
457 	l1 = pmap_l1(pmap, va);
458 	if (l1 == NULL)
459 		return (NULL);
460 	if ((pmap_load(l1) & PTE_V) == 0)
461 		return (NULL);
462 	if ((pmap_load(l1) & PTE_RX) != 0)
463 		return (NULL);
464 
465 	return (pmap_l1_to_l2(l1, va));
466 }
467 
468 static __inline pt_entry_t *
469 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
470 {
471 	vm_paddr_t phys;
472 	pt_entry_t *l3;
473 
474 	phys = PTE_TO_PHYS(pmap_load(l2));
475 	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
476 
477 	return (&l3[pmap_l3_index(va)]);
478 }
479 
480 static __inline pt_entry_t *
481 pmap_l3(pmap_t pmap, vm_offset_t va)
482 {
483 	pd_entry_t *l2;
484 
485 	l2 = pmap_l2(pmap, va);
486 	if (l2 == NULL)
487 		return (NULL);
488 	if ((pmap_load(l2) & PTE_V) == 0)
489 		return (NULL);
490 	if ((pmap_load(l2) & PTE_RX) != 0)
491 		return (NULL);
492 
493 	return (pmap_l2_to_l3(l2, va));
494 }
495 
496 static __inline void
497 pmap_resident_count_inc(pmap_t pmap, int count)
498 {
499 
500 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
501 	pmap->pm_stats.resident_count += count;
502 }
503 
504 static __inline void
505 pmap_resident_count_dec(pmap_t pmap, int count)
506 {
507 
508 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
509 	KASSERT(pmap->pm_stats.resident_count >= count,
510 	    ("pmap %p resident count underflow %ld %d", pmap,
511 	    pmap->pm_stats.resident_count, count));
512 	pmap->pm_stats.resident_count -= count;
513 }
514 
515 static void
516 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
517     pt_entry_t entry)
518 {
519 	struct pmap *user_pmap;
520 	pd_entry_t *l1;
521 
522 	/*
523 	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
524 	 * necessary with three-level paging configured: with four-level paging
525 	 * the kernel's half of the top-level page table page is static and can
526 	 * simply be copied at pmap initialization time.
527 	 */
528 	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
529 		return;
530 
531 	mtx_lock(&allpmaps_lock);
532 	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
533 		l1 = &user_pmap->pm_top[l1index];
534 		pmap_store(l1, entry);
535 	}
536 	mtx_unlock(&allpmaps_lock);
537 }
538 
539 /*
540  * Holds the PTE mode bits (defined in pte.h) for defining e.g. cacheability.
541  *
542  * The indices correspond to the VM_MEMATTR_* defines in riscv/include/vm.h.
543  *
544  * The array entries will all be zero if no mode bits are supported by the
545  * CPU, e.g. when the Svpbmt extension is lacking.
546  */
547 static __read_frequently pt_entry_t memattr_bits[VM_MEMATTR_TOTAL];
548 static __read_frequently pt_entry_t memattr_mask;
549 
550 static __inline pt_entry_t
551 pmap_memattr_bits(vm_memattr_t mode)
552 {
553 	KASSERT(pmap_is_valid_memattr(kernel_pmap, mode),
554 	    ("invalid memory mode %u\n", mode));
555 	return (memattr_bits[(int)mode]);
556 }
557 
558 /*
559  * This should only be used during pmap bootstrap, e.g. by
560  * pmap_create_pagetables().
561  */
562 static pt_entry_t *
563 pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
564 {
565 	pt_entry_t *pt;
566 
567 	pt = (pt_entry_t *)*freemempos;
568 	*freemempos += npages * PAGE_SIZE;
569 	bzero(pt, npages * PAGE_SIZE);
570 
571 	return (pt);
572 }
573 
574 /*
575  *	Construct the direct map -- a linear mapping of physical memory into
576  *	the kernel address space.
577  *
578  *	We walk the list of physical memory segments (of arbitrary size and
579  *	address) mapping each appropriately using L2 and L1 superpages.
580  *	Consequently, the DMAP address space will have unmapped regions
581  *	corresponding to any holes between physical memory segments.
582  *
583  *	The lowest usable physical address will always be mapped to
584  *	DMAP_MIN_ADDRESS.
585  */
586 static vm_paddr_t
587 pmap_bootstrap_dmap(pd_entry_t *l1, vm_paddr_t freemempos)
588 {
589 	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
590 	vm_offset_t va;
591 	vm_paddr_t min_pa, max_pa, pa, endpa;
592 	pd_entry_t *l2;
593 	pt_entry_t memattr;
594 	u_int l1slot, l2slot;
595 	int physmap_idx;
596 
597 	physmap_idx = physmem_avail(physmap, nitems(physmap));
598 	min_pa = physmap[0];
599 	max_pa = physmap[physmap_idx - 1];
600 
601 	printf("physmap_idx %u\n", physmap_idx);
602 	printf("min_pa %lx\n", min_pa);
603 	printf("max_pa %lx\n", max_pa);
604 
605 	/* Set the limits of the DMAP region. */
606 	dmap_phys_base = rounddown(min_pa, L1_SIZE);
607 	dmap_phys_max = max_pa;
608 
609 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
610 
611 	/* Walk the physmap table. */
612 	l2 = NULL;
613 	l1slot = Ln_ENTRIES; /* sentinel value */
614 	for (int idx = 0; idx < physmap_idx; idx += 2) {
615 		pa = rounddown(physmap[idx], L2_SIZE);
616 		endpa = physmap[idx + 1];
617 
618 		/* Virtual address for this range. */
619 		va = PHYS_TO_DMAP(pa);
620 
621 		/* Can any of this range be mapped with 1GB superpages? */
622 		if (roundup(pa, L1_SIZE) + L1_SIZE > endpa)
623 			goto l2end;
624 
625 		/* Loop until the next 1GB boundary. */
626 		while ((pa & L1_OFFSET) != 0) {
627 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
628 				/* Need to alloc another page table. */
629 				l2 = pmap_early_alloc_tables(&freemempos, 1);
630 
631 				/* Link it. */
632 				l1slot = pmap_l1_index(va);
633 				pmap_store(&l1[l1slot],
634 				    L1_PDE((vm_paddr_t)l2, PTE_V));
635 			}
636 
637 			/* map l2 pages */
638 			l2slot = pmap_l2_index(va);
639 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
640 
641 			pa += L2_SIZE;
642 			va += L2_SIZE;
643 		}
644 
645 		/* Map what we can with 1GB superpages. */
646 		while (pa + L1_SIZE - 1 < endpa) {
647 			/* map l1 pages */
648 			l1slot = pmap_l1_index(va);
649 			pmap_store(&l1[l1slot], L1_PTE(pa, PTE_KERN | memattr));
650 
651 			pa += L1_SIZE;
652 			va += L1_SIZE;
653 		}
654 
655 l2end:
656 		while (pa < endpa) {
657 			if (l2 == NULL || pmap_l1_index(va) != l1slot) {
658 				/* Need to alloc another page table. */
659 				l2 = pmap_early_alloc_tables(&freemempos, 1);
660 
661 				/* Link it. */
662 				l1slot = pmap_l1_index(va);
663 				pmap_store(&l1[l1slot],
664 				    L1_PDE((vm_paddr_t)l2, PTE_V));
665 			}
666 
667 			/* map l2 pages */
668 			l2slot = pmap_l2_index(va);
669 			pmap_store(&l2[l2slot], L2_PTE(pa, PTE_KERN | memattr));
670 
671 			pa += L2_SIZE;
672 			va += L2_SIZE;
673 		}
674 	}
675 
676 	/* And finally, the limit on DMAP VA. */
677 	dmap_max_addr = va;
678 
679 	return (freemempos);
680 }
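
/*
 * A concrete illustration of the above (hypothetical memory map): given a
 * single physical segment [0x80200000, 0x100000000), dmap_phys_base becomes
 * 0x80000000 (rounded down to a 1GB boundary) and dmap_phys_max becomes
 * 0x100000000.  The range [0x80200000, 0xc0000000) is mapped with 2MB L2
 * pages, the remaining [0xc0000000, 0x100000000) with a single 1GB L1 page,
 * and dmap_max_addr ends up just past the last mapped virtual address.
 */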
681 
682 /*
683  *	Create a new set of pagetables to run the kernel with.
684  *
685  *	An initial, temporary setup was created in locore.S, which serves well
686  *	enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
687  *	superpages, and created a 1GB identity map, which allows this function
688  *	to dereference physical addresses.
689  *
690  *	The memory backing these page tables is allocated in the space
691  *	immediately following the kernel's preload area. Depending on the size
692  *	of this area, some, all, or none of these pages can be implicitly
693  *	mapped by the kernel's 2MB mappings. This memory will only ever be
694  *	accessed through the direct map, however.
695  */
696 static vm_paddr_t
697 pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
698     vm_paddr_t *root_pt_phys)
699 {
700 	pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
701 	pt_entry_t memattr;
702 	pd_entry_t *devmap_l2;
703 	vm_paddr_t kernend, freemempos, pa;
704 	int nkernl2, nkernl3, ndevmapl3;
705 	int i, slot;
706 	int mode;
707 
708 	kernend = kernstart + kernlen;
709 
710 	/* Static allocations begin after the kernel staging area. */
711 	freemempos = roundup2(kernend, PAGE_SIZE);
712 
713 	/* Detect Sv48 mode. */
714 	mode = PMAP_MODE_SV39;
715 	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
716 
717 	if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
718 		/*
719 		 * Sv48 mode: allocate an L0 page table to be the root. The
720 		 * layout of KVA is otherwise identical to Sv39.
721 		 */
722 		l0 = pmap_early_alloc_tables(&freemempos, 1);
723 		*root_pt_phys = (vm_paddr_t)l0;
724 		pmap_mode = PMAP_MODE_SV48;
725 	} else {
726 		l0 = NULL;
727 	}
728 
729 	/*
730 	 * Allocate an L1 page table.
731 	 */
732 	l1 = pmap_early_alloc_tables(&freemempos, 1);
733 	if (pmap_mode == PMAP_MODE_SV39)
734 		*root_pt_phys = (vm_paddr_t)l1;
735 
736 	/*
737 	 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
738 	 * needed.
739 	 */
740 	nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
741 	kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
742 
743 	/*
744 	 * Allocate an L2 page table for the static devmap, located at the end
745 	 * of KVA. We can expect that the devmap will always be less than 1GB
746 	 * in size.
747 	 */
748 	devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
749 
750 	/* Allocate L3 page tables for the devmap. */
751 	ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
752 	    Ln_ENTRIES);
753 	devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
754 
755 	/*
756 	 * Allocate some L3 bootstrap pages, for early KVA allocations before
757 	 * vm_mem_init() has run. For example, the message buffer.
758 	 *
759 	 * A somewhat arbitrary choice of 32MB. This should be more than enough
760 	 * for any early allocations. There is no need to worry about waste, as
761 	 * whatever is not used will be consumed by later calls to
762 	 * pmap_growkernel().
763 	 */
764 	nkernl3 = 16;
765 	kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
766 
767 	/* Bootstrap the direct map. */
768 	freemempos = pmap_bootstrap_dmap(l1, freemempos);
769 
770 	/* Allocations are done. */
771 	if (freemempos < roundup2(kernend, L2_SIZE))
772 		freemempos = roundup2(kernend, L2_SIZE);
773 
774 	/* Memory attributes for standard/main memory. */
775 	memattr = pmap_memattr_bits(VM_MEMATTR_DEFAULT);
776 
777 	/*
778 	 * Map the kernel (and preloaded modules or data) using L2 superpages.
779 	 *
780 	 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
781 	 * by locore assembly.
782 	 *
783 	 * TODO: eventually, this should be done with proper permissions for
784 	 * each segment, rather than mapping the entire kernel and preloaded
785 	 * modules RWX.
786 	 */
787 	slot = pmap_l2_index(KERNBASE);
788 	for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
789 		pmap_store(&kern_l2[slot],
790 		    L2_PTE(pa, PTE_KERN | PTE_X | memattr));
791 	}
792 
793 	/*
794 	 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
795 	 * themselves are invalid.
796 	 */
797 	slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
798 	for (i = 0; i < nkernl3; i++, slot++) {
799 		pa = (vm_paddr_t)kern_l3 + ptoa(i);
800 		pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
801 	}
802 
803 	/* Connect the L2 tables to the L1 table. */
804 	slot = pmap_l1_index(KERNBASE);
805 	for (i = 0; i < nkernl2; i++, slot++) {
806 		pa = (vm_paddr_t)kern_l2 + ptoa(i);
807 		pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
808 	}
809 
810 	/* Connect the L1 table to L0, if in use. */
811 	if (pmap_mode == PMAP_MODE_SV48) {
812 		slot = pmap_l0_index(KERNBASE);
813 		pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
814 	}
815 
816 	/*
817 	 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
818 	 * themselves are invalid.
819 	 */
820 	slot = pmap_l2_index(DEVMAP_MIN_VADDR);
821 	for (i = 0; i < ndevmapl3; i++, slot++) {
822 		pa = (vm_paddr_t)devmap_l3 + ptoa(i);
823 		pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
824 	}
825 
826 	/* Connect the devmap L2 pages to the L1 table. */
827 	slot = pmap_l1_index(DEVMAP_MIN_VADDR);
828 	pa = (vm_paddr_t)devmap_l2;
829 	pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
830 
831 	/* Return the next position of free memory */
832 	return (freemempos);
833 }
834 
835 /*
836  *	Bootstrap the system enough to run with virtual memory.
837  */
838 void
839 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
840 {
841 	vm_paddr_t freemempos, pa;
842 	vm_paddr_t root_pt_phys;
843 	vm_offset_t freeva;
844 	vm_offset_t dpcpu, msgbufpv;
845 	pt_entry_t *pte;
846 	int i;
847 
848 	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
849 
850 	PMAP_LOCK_INIT(kernel_pmap);
851 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
852 	vm_radix_init(&kernel_pmap->pm_root);
853 
854 	rw_init(&pvh_global_lock, "pmap pv global");
855 
856 	/*
857 	 * Set the current CPU as active in the kernel pmap. Secondary cores
858 	 * will add themselves later in init_secondary(). The SBI firmware
859 	 * may rely on this mask being precise, so CPU_FILL() is not used.
860 	 */
861 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
862 
863 	/*
864 	 * Set up the memory attribute bits.
865 	 */
866 	if (has_svpbmt) {
867 		memattr_bits[VM_MEMATTR_PMA] = PTE_MA_NONE;
868 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_MA_NC;
869 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_MA_IO;
870 		memattr_mask = PTE_MA_MASK;
871 	} else if (has_errata_thead_pbmt) {
872 		memattr_bits[VM_MEMATTR_PMA] = PTE_THEAD_MA_NONE;
873 		memattr_bits[VM_MEMATTR_UNCACHEABLE] = PTE_THEAD_MA_NC;
874 		memattr_bits[VM_MEMATTR_DEVICE] = PTE_THEAD_MA_IO;
875 		memattr_mask = PTE_THEAD_MA_MASK;
876 	}
877 
878 	/* Create a new set of pagetables to run the kernel in. */
879 	freemempos = pmap_create_pagetables(kernstart, kernlen, &root_pt_phys);
880 
881 	/* Switch to the newly created page tables. */
882 	kernel_pmap->pm_stage = PM_STAGE1;
883 	kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys);
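	/*
	 * The satp value encodes the translation mode (Sv39 or Sv48) in its
	 * MODE field and the physical page number of the root page-table
	 * page in its PPN field; the ASID field is left at zero for the
	 * kernel pmap.
	 */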
884 	kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
885 	csr_write(satp, kernel_pmap->pm_satp);
886 	sfence_vma();
887 
888 	/*
889 	 * Now, we need to make a few more static reservations from KVA.
890 	 *
891 	 * Set freeva to freemempos virtual address, and be sure to advance
892 	 * Set freeva to the virtual address corresponding to freemempos, and
893 	 * be sure to advance the two together.
894 	freeva = freemempos - kernstart + KERNBASE;
895 #define reserve_space(var, pa, size)					\
896 	do {								\
897 		var = freeva;						\
898 		pa = freemempos;					\
899 		freeva += size;						\
900 		freemempos += size;					\
901 	} while (0)
902 
903 	/* Allocate the dynamic per-cpu area. */
904 	reserve_space(dpcpu, pa, DPCPU_SIZE);
905 
906 	/* Map it. */
907 	pte = pmap_l3(kernel_pmap, dpcpu);
908 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
909 	for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
910 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
911 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
912 
913 	/* Now, it can be initialized. */
914 	dpcpu_init((void *)dpcpu, 0);
915 
916 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
917 	reserve_space(msgbufpv, pa, round_page(msgbufsize));
918 	msgbufp = (void *)msgbufpv;
919 
920 	/* Map it. */
921 	pte = pmap_l3(kernel_pmap, msgbufpv);
922 	KASSERT(pte != NULL, ("Bootstrap pages missing"));
923 	for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
924 		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN |
925 		    pmap_memattr_bits(VM_MEMATTR_DEFAULT)));
926 
927 #undef	reserve_space
928 
929 	/* Mark the bounds of our available virtual address space */
930 	virtual_avail = kernel_vm_end = freeva;
931 	virtual_end = DEVMAP_MIN_VADDR;
932 
933 	/* Exclude the reserved physical memory from allocations. */
934 	physmem_exclude_region(kernstart, freemempos - kernstart,
935 	    EXFLAG_NOALLOC);
936 }
937 
938 /*
939  *	Initialize a vm_page's machine-dependent fields.
940  */
941 void
942 pmap_page_init(vm_page_t m)
943 {
944 
945 	TAILQ_INIT(&m->md.pv_list);
946 	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
947 }
948 
949 /*
950  *	Initialize the pmap module.
951  *
952  *	Called by vm_mem_init(), to initialize any structures that the pmap
953  *	system needs to map virtual memory.
954  */
955 void
956 pmap_init(void)
957 {
958 	vm_size_t s;
959 	int i, pv_npg;
960 
961 	/*
962 	 * Initialize the pv chunk and pmap list mutexes.
963 	 */
964 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
965 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
966 
967 	/*
968 	 * Initialize the pool of pv list locks.
969 	 */
970 	for (i = 0; i < NPV_LIST_LOCKS; i++)
971 		rw_init(&pv_list_locks[i], "pmap pv list");
972 
973 	/*
974 	 * Calculate the size of the pv head table for superpages.
975 	 */
976 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
977 
978 	/*
979 	 * Allocate memory for the pv head table for superpages.
980 	 */
981 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
982 	s = round_page(s);
983 	pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
984 	for (i = 0; i < pv_npg; i++)
985 		TAILQ_INIT(&pv_table[i].pv_list);
986 	TAILQ_INIT(&pv_dummy.pv_list);
987 
988 	if (superpages_enabled)
989 		pagesizes[1] = L2_SIZE;
990 }
991 
992 #ifdef SMP
993 /*
994  * For SMP, these functions have to use IPIs for coherence.
995  *
996  * In general, the calling thread uses a plain fence to order the
997  * writes to the page tables before invoking an SBI callback to invoke
998  * sfence_vma() on remote CPUs.
999  */
1000 static void
1001 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1002 {
1003 	cpuset_t mask;
1004 
1005 	sched_pin();
1006 	mask = pmap->pm_active;
1007 	CPU_CLR(PCPU_GET(hart), &mask);
1008 	fence();
1009 	if (!CPU_EMPTY(&mask) && smp_started)
1010 		sbi_remote_sfence_vma(mask.__bits, va, 1);
1011 	sfence_vma_page(va);
1012 	sched_unpin();
1013 }
1014 
1015 static void
1016 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1017 {
1018 	cpuset_t mask;
1019 
1020 	sched_pin();
1021 	mask = pmap->pm_active;
1022 	CPU_CLR(PCPU_GET(hart), &mask);
1023 	fence();
1024 	if (!CPU_EMPTY(&mask) && smp_started)
1025 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
1026 
1027 	/*
1028 	 * Might consider a loop of sfence_vma_page() for a small
1029 	 * number of pages in the future.
1030 	 */
1031 	sfence_vma();
1032 	sched_unpin();
1033 }
1034 
1035 static void
1036 pmap_invalidate_all(pmap_t pmap)
1037 {
1038 	cpuset_t mask;
1039 
1040 	sched_pin();
1041 	mask = pmap->pm_active;
1042 	CPU_CLR(PCPU_GET(hart), &mask);
1043 
1044 	/*
1045 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
1046 	 * address to perform a global fence.  BBL currently treats
1047 	 * all sfence_vma requests as global however.
1048 	 */
1049 	fence();
1050 	if (!CPU_EMPTY(&mask) && smp_started)
1051 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
1052 	sfence_vma();
1053 	sched_unpin();
1054 }
1055 #else
1056 /*
1057  * Normal, non-SMP, invalidation functions.
1058  * We inline these within pmap.c for speed.
1059  */
1060 static __inline void
1061 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1062 {
1063 
1064 	sfence_vma_page(va);
1065 }
1066 
1067 static __inline void
1068 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1069 {
1070 
1071 	/*
1072 	 * Might consider a loop of sfence_vma_page() for a small
1073 	 * number of pages in the future.
1074 	 */
1075 	sfence_vma();
1076 }
1077 
1078 static __inline void
1079 pmap_invalidate_all(pmap_t pmap)
1080 {
1081 
1082 	sfence_vma();
1083 }
1084 #endif
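
/*
 * Illustrative usage note: callers are expected to update the page-table
 * entry first, using one of the pmap_store()/pmap_clear() primitives, and
 * only then call a pmap_invalidate_*() function, so that stale TLB entries
 * for the affected range are discarded on every hart that may have cached
 * them.
 */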
1085 
1086 /*
1087  *	Routine:	pmap_extract
1088  *	Function:
1089  *		Extract the physical page address associated
1090  *		with the given map/virtual_address pair.
1091  */
1092 vm_paddr_t
1093 pmap_extract(pmap_t pmap, vm_offset_t va)
1094 {
1095 	pd_entry_t *l2p, l2;
1096 	pt_entry_t *l3p;
1097 	vm_paddr_t pa;
1098 
1099 	pa = 0;
1100 
1101 	/*
1102 	 * Start with an L2 lookup; L1 superpages are currently not implemented.
1103 	 */
1104 	PMAP_LOCK(pmap);
1105 	l2p = pmap_l2(pmap, va);
1106 	if (l2p != NULL && ((l2 = pmap_load(l2p)) & PTE_V) != 0) {
1107 		if ((l2 & PTE_RWX) == 0) {
1108 			l3p = pmap_l2_to_l3(l2p, va);
1109 			pa = PTE_TO_PHYS(pmap_load(l3p));
1110 			pa |= (va & L3_OFFSET);
1111 		} else {
1112 			/* L2 is a superpage mapping. */
1113 			pa = L2PTE_TO_PHYS(l2);
1114 			pa |= (va & L2_OFFSET);
1115 		}
1116 	}
1117 	PMAP_UNLOCK(pmap);
1118 	return (pa);
1119 }
1120 
1121 /*
1122  *	Routine:	pmap_extract_and_hold
1123  *	Function:
1124  *		Atomically extract and hold the physical page
1125  *		with the given pmap and virtual address pair
1126  *		if that mapping permits the given protection.
1127  */
1128 vm_page_t
1129 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1130 {
1131 	pt_entry_t *l3p, l3;
1132 	vm_page_t m;
1133 
1134 	m = NULL;
1135 	PMAP_LOCK(pmap);
1136 	l3p = pmap_l3(pmap, va);
1137 	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
1138 		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
1139 			m = PTE_TO_VM_PAGE(l3);
1140 			if (!vm_page_wire_mapped(m))
1141 				m = NULL;
1142 		}
1143 	}
1144 	PMAP_UNLOCK(pmap);
1145 	return (m);
1146 }
1147 
1148 /*
1149  *	Routine:	pmap_kextract
1150  *	Function:
1151  *		Extract the physical page address associated with the given kernel
1152  *		virtual address.
1153  */
1154 vm_paddr_t
1155 pmap_kextract(vm_offset_t va)
1156 {
1157 	pd_entry_t *l2, l2e;
1158 	pt_entry_t *l3;
1159 	vm_paddr_t pa;
1160 
1161 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1162 		pa = DMAP_TO_PHYS(va);
1163 	} else {
1164 		l2 = pmap_l2(kernel_pmap, va);
1165 		if (l2 == NULL)
1166 			panic("pmap_kextract: No l2");
1167 		l2e = pmap_load(l2);
1168 		/*
1169 		 * Beware of concurrent promotion and demotion! We must
1170 		 * use l2e rather than loading from l2 multiple times to
1171 		 * ensure we see a consistent state, including the
1172 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
1173 		 * to use an old l2e because the L3 page is preserved by
1174 		 * promotion.
1175 		 */
1176 		if ((l2e & PTE_RX) != 0) {
1177 			/* superpages */
1178 			pa = L2PTE_TO_PHYS(l2e);
1179 			pa |= (va & L2_OFFSET);
1180 			return (pa);
1181 		}
1182 
1183 		l3 = pmap_l2_to_l3(&l2e, va);
1184 		pa = PTE_TO_PHYS(pmap_load(l3));
1185 		pa |= (va & PAGE_MASK);
1186 	}
1187 	return (pa);
1188 }
1189 
1190 /***************************************************
1191  * Low level mapping routines.....
1192  ***************************************************/
1193 
1194 void
1195 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1196 {
1197 	pt_entry_t entry;
1198 	pt_entry_t *l3;
1199 	pt_entry_t memattr;
1200 	vm_offset_t va;
1201 	pn_t pn;
1202 
1203 	KASSERT((pa & L3_OFFSET) == 0,
1204 	   ("pmap_kenter: Invalid physical address"));
1205 	KASSERT((sva & L3_OFFSET) == 0,
1206 	   ("pmap_kenter: Invalid virtual address"));
1207 	KASSERT((size & PAGE_MASK) == 0,
1208 	    ("pmap_kenter: Mapping is not page-sized"));
1209 
1210 	memattr = pmap_memattr_bits(mode);
1211 	va = sva;
1212 	while (size != 0) {
1213 		l3 = pmap_l3(kernel_pmap, va);
1214 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1215 
1216 		pn = (pa / PAGE_SIZE);
1217 		entry = PTE_KERN;
1218 		entry |= memattr;
1219 		entry |= (pn << PTE_PPN0_S);
1220 		pmap_store(l3, entry);
1221 
1222 		va += PAGE_SIZE;
1223 		pa += PAGE_SIZE;
1224 		size -= PAGE_SIZE;
1225 	}
1226 	pmap_invalidate_range(kernel_pmap, sva, va);
1227 }
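
/*
 * A minimal usage sketch (illustrative): mapping a page-aligned, two-page
 * device register window at physical address pa would be
 * pmap_kenter(va, 2 * PAGE_SIZE, pa, VM_MEMATTR_DEVICE), which is exactly
 * what pmap_kenter_device() below does on behalf of its callers.
 */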
1228 
1229 void
1230 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1231 {
1232 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1233 }
1234 
1235 /*
1236  * Remove a page from the kernel pagetables.
1237  * Note: not SMP coherent.
1238  */
1239 void
1240 pmap_kremove(vm_offset_t va)
1241 {
1242 	pt_entry_t *l3;
1243 
1244 	l3 = pmap_l3(kernel_pmap, va);
1245 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1246 
1247 	pmap_clear(l3);
1248 	sfence_vma();
1249 }
1250 
1251 void
1252 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1253 {
1254 	pt_entry_t *l3;
1255 	vm_offset_t va;
1256 
1257 	KASSERT((sva & L3_OFFSET) == 0,
1258 	   ("pmap_kremove_device: Invalid virtual address"));
1259 	KASSERT((size & PAGE_MASK) == 0,
1260 	    ("pmap_kremove_device: Mapping is not page-sized"));
1261 
1262 	va = sva;
1263 	while (size != 0) {
1264 		l3 = pmap_l3(kernel_pmap, va);
1265 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1266 		pmap_clear(l3);
1267 
1268 		va += PAGE_SIZE;
1269 		size -= PAGE_SIZE;
1270 	}
1271 
1272 	pmap_invalidate_range(kernel_pmap, sva, va);
1273 }
1274 
1275 /*
1276  *	Used to map a range of physical addresses into kernel
1277  *	virtual address space.
1278  *
1279  *	The value passed in '*virt' is a suggested virtual address for
1280  *	the mapping. Architectures which can support a direct-mapped
1281  *	physical to virtual region can return the appropriate address
1282  *	within that region, leaving '*virt' unchanged. Other
1283  *	architectures should map the pages starting at '*virt' and
1284  *	update '*virt' with the first usable address after the mapped
1285  *	region.
1286  */
1287 vm_offset_t
1288 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1289 {
1290 
1291 	return PHYS_TO_DMAP(start);
1292 }
1293 
1294 /*
1295  * Add a list of wired pages to the kva.
1296  * This routine is only used for temporary
1297  * kernel mappings that do not need to have
1298  * page modification or references recorded.
1299  * Note that old mappings are simply written
1300  * over.  The page *must* be wired.
1301  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1302  */
1303 void
1304 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1305 {
1306 	pt_entry_t *l3;
1307 	vm_paddr_t pa;
1308 	vm_offset_t va;
1309 	vm_page_t m;
1310 	pt_entry_t entry;
1311 	pn_t pn;
1312 	int i;
1313 
1314 	va = sva;
1315 	for (i = 0; i < count; i++) {
1316 		m = ma[i];
1317 		pa = VM_PAGE_TO_PHYS(m);
1318 		pn = (pa / PAGE_SIZE);
1319 		l3 = pmap_l3(kernel_pmap, va);
1320 
1321 		entry = PTE_KERN;
1322 		entry |= pmap_memattr_bits(m->md.pv_memattr);
1323 		entry |= (pn << PTE_PPN0_S);
1324 		pmap_store(l3, entry);
1325 
1326 		va += L3_SIZE;
1327 	}
1328 	pmap_invalidate_range(kernel_pmap, sva, va);
1329 }
1330 
1331 /*
1332  * This routine tears out page mappings from the
1333  * kernel -- it is meant only for temporary mappings.
1334  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1335  */
1336 void
1337 pmap_qremove(vm_offset_t sva, int count)
1338 {
1339 	pt_entry_t *l3;
1340 	vm_offset_t va;
1341 
1342 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1343 
1344 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
1345 		l3 = pmap_l3(kernel_pmap, va);
1346 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
1347 		pmap_clear(l3);
1348 	}
1349 	pmap_invalidate_range(kernel_pmap, sva, va);
1350 }
1351 
1352 bool
1353 pmap_ps_enabled(pmap_t pmap __unused)
1354 {
1355 
1356 	return (superpages_enabled);
1357 }
1358 
1359 /***************************************************
1360  * Page table page management routines.....
1361  ***************************************************/
1362 /*
1363  * Schedule the specified unused page table page to be freed.  Specifically,
1364  * add the page to the specified list of pages that will be released to the
1365  * physical memory manager after the TLB has been updated.
1366  */
1367 static __inline void
1368 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
1369 {
1370 
1371 	if (set_PG_ZERO)
1372 		m->flags |= PG_ZERO;
1373 	else
1374 		m->flags &= ~PG_ZERO;
1375 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1376 }
1377 
1378 /*
1379  * Inserts the specified page table page into the specified pmap's collection
1380  * of idle page table pages.  Each of a pmap's page table pages is responsible
1381  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1382  * ordered by this virtual address range.
1383  *
1384  * If "promoted" is false, then the page table page "mpte" must be zero filled;
1385  * "mpte"'s valid field will be set to 0.
1386  *
1387  * If "promoted" is true and "all_l3e_PTE_A_set" is false, then "mpte" must
1388  * contain valid mappings with identical attributes except for PTE_A;
1389  * "mpte"'s valid field will be set to 1.
1390  *
1391  * If "promoted" and "all_l3e_PTE_A_set" are both true, then "mpte" must contain
1392  * valid mappings with identical attributes including PTE_A; "mpte"'s valid
1393  * field will be set to VM_PAGE_BITS_ALL.
1394  */
1395 static __inline int
1396 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
1397     bool all_l3e_PTE_A_set)
1398 {
1399 
1400 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1401 	KASSERT(promoted || !all_l3e_PTE_A_set,
1402 	    ("a zero-filled PTP can't have PTE_A set in every PTE"));
1403 	mpte->valid = promoted ? (all_l3e_PTE_A_set ? VM_PAGE_BITS_ALL : 1) : 0;
1404 	return (vm_radix_insert(&pmap->pm_root, mpte));
1405 }
1406 
1407 /*
1408  * Removes the page table page mapping the specified virtual address from the
1409  * specified pmap's collection of idle page table pages, and returns it.
1410  * Otherwise, returns NULL if there is no page table page corresponding to the
1411  * specified virtual address.
1412  */
1413 static __inline vm_page_t
1414 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1415 {
1416 
1417 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1418 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1419 }
1420 
1421 /*
1422  * Decrements a page table page's reference count, which is used to record the
1423  * number of valid page table entries within the page.  If the reference count
1424  * drops to zero, then the page table page is unmapped.  Returns true if the
1425  * page table page was unmapped and false otherwise.
1426  */
1427 static inline bool
1428 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1429 {
1430 	KASSERT(m->ref_count > 0,
1431 	    ("%s: page %p ref count underflow", __func__, m));
1432 
1433 	--m->ref_count;
1434 	if (m->ref_count == 0) {
1435 		_pmap_unwire_ptp(pmap, va, m, free);
1436 		return (true);
1437 	} else {
1438 		return (false);
1439 	}
1440 }
1441 
1442 static void
1443 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1444 {
1445 
1446 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1447 	if (m->pindex >= NUL2E + NUL1E) {
1448 		pd_entry_t *l0;
1449 		l0 = pmap_l0(pmap, va);
1450 		pmap_clear(l0);
1451 	} else if (m->pindex >= NUL2E) {
1452 		pd_entry_t *l1;
1453 		l1 = pmap_l1(pmap, va);
1454 		pmap_clear(l1);
1455 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1456 	} else {
1457 		pd_entry_t *l2;
1458 		l2 = pmap_l2(pmap, va);
1459 		pmap_clear(l2);
1460 	}
1461 	pmap_resident_count_dec(pmap, 1);
1462 	if (m->pindex < NUL2E) {
1463 		pd_entry_t *l1;
1464 		vm_page_t pdpg;
1465 
1466 		l1 = pmap_l1(pmap, va);
1467 		pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1468 		pmap_unwire_ptp(pmap, va, pdpg, free);
1469 	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1470 		pd_entry_t *l0;
1471 		vm_page_t pdpg;
1472 
1473 		l0 = pmap_l0(pmap, va);
1474 		pdpg = PTE_TO_VM_PAGE(pmap_load(l0));
1475 		pmap_unwire_ptp(pmap, va, pdpg, free);
1476 	}
1477 	pmap_invalidate_page(pmap, va);
1478 
1479 	vm_wire_sub(1);
1480 
1481 	/*
1482 	 * Put page on a list so that it is released after
1483 	 * *ALL* TLB shootdown is done
1484 	 */
1485 	pmap_add_delayed_free_list(m, free, true);
1486 }
1487 
1488 /*
1489  * After removing a page table entry, this routine is used to
1490  * conditionally free the page, and manage the reference count.
1491  */
1492 static int
1493 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1494     struct spglist *free)
1495 {
1496 	vm_page_t mpte;
1497 
1498 	if (va >= VM_MAXUSER_ADDRESS)
1499 		return (0);
1500 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1501 	mpte = PTE_TO_VM_PAGE(ptepde);
1502 	return (pmap_unwire_ptp(pmap, va, mpte, free));
1503 }
1504 
1505 static uint64_t
1506 pmap_satp_mode(void)
1507 {
1508 	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1509 }
1510 
1511 void
1512 pmap_pinit0(pmap_t pmap)
1513 {
1514 	PMAP_LOCK_INIT(pmap);
1515 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1516 	pmap->pm_stage = PM_STAGE1;
1517 	pmap->pm_top = kernel_pmap->pm_top;
1518 	pmap->pm_satp = pmap_satp_mode() |
1519 	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1520 	CPU_ZERO(&pmap->pm_active);
1521 	TAILQ_INIT(&pmap->pm_pvchunk);
1522 	vm_radix_init(&pmap->pm_root);
1523 	pmap_activate_boot(pmap);
1524 }
1525 
1526 int
1527 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
1528 {
1529 	vm_paddr_t topphys;
1530 	vm_page_t m;
1531 	size_t i;
1532 
1533 	/*
1534 	 * The top-level directory is 4 pages in the hypervisor (PM_STAGE2) case.
1535 	 * The current address space layout leaves 3 of them unused.
1536 	 */
1537 	if (stage == PM_STAGE1)
1538 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1539 		    VM_ALLOC_WAITOK);
1540 	else
1541 		m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
1542 		    4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
1543 
1544 	topphys = VM_PAGE_TO_PHYS(m);
1545 	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
1546 	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1547 	pmap->pm_stage = stage;
1548 
1549 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1550 
1551 	CPU_ZERO(&pmap->pm_active);
1552 
1553 	if (stage == PM_STAGE2)
1554 		goto finish;
1555 
1556 	if (pmap_mode == PMAP_MODE_SV39) {
1557 		/*
1558 		 * Copy L1 entries from the kernel pmap.  This must be done with
1559 		 * the allpmaps lock held to avoid races with
1560 		 * pmap_distribute_l1().
1561 		 */
1562 		mtx_lock(&allpmaps_lock);
1563 		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1564 		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1565 		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1566 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1567 		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1568 		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1569 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1570 		mtx_unlock(&allpmaps_lock);
1571 	} else {
1572 		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1573 		pmap->pm_top[i] = kernel_pmap->pm_top[i];
1574 	}
1575 
1576 finish:
1577 	TAILQ_INIT(&pmap->pm_pvchunk);
1578 	vm_radix_init(&pmap->pm_root);
1579 
1580 	return (1);
1581 }
1582 
1583 int
1584 pmap_pinit(pmap_t pmap)
1585 {
1586 
1587 	return (pmap_pinit_stage(pmap, PM_STAGE1));
1588 }
1589 
1590 /*
1591  * This routine is called if the desired page table page does not exist.
1592  *
1593  * If page table page allocation fails, this routine may sleep before
1594  * returning NULL.  It sleeps only if a lock pointer was given.
1595  *
1596  * Note: If a page allocation fails at page table level two or three,
1597  * one or two pages may be held during the wait, only to be released
1598  * afterwards.  This conservative approach is easily argued to avoid
1599  * race conditions.
1600  */
1601 static vm_page_t
1602 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1603 {
1604 	vm_page_t m, pdpg;
1605 	pt_entry_t entry;
1606 	vm_paddr_t phys;
1607 	pn_t pn;
1608 
1609 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1610 
1611 	/*
1612 	 * Allocate a page table page.
1613 	 */
1614 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1615 	if (m == NULL) {
1616 		if (lockp != NULL) {
1617 			RELEASE_PV_LIST_LOCK(lockp);
1618 			PMAP_UNLOCK(pmap);
1619 			rw_runlock(&pvh_global_lock);
1620 			vm_wait(NULL);
1621 			rw_rlock(&pvh_global_lock);
1622 			PMAP_LOCK(pmap);
1623 		}
1624 
1625 		/*
1626 		 * Indicate the need to retry.  While waiting, the page table
1627 		 * page may have been allocated.
1628 		 */
1629 		return (NULL);
1630 	}
1631 	m->pindex = ptepindex;
1632 
1633 	/*
1634 	 * Map the pagetable page into the process address space, if
1635 	 * it isn't already there.
1636 	 */
1637 	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1638 	if (ptepindex >= NUL2E + NUL1E) {
1639 		pd_entry_t *l0;
1640 		vm_pindex_t l0index;
1641 
1642 		KASSERT(pmap_mode != PMAP_MODE_SV39,
1643 		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1644 		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1645 		    ("%s: pindex %#lx out of range", __func__, ptepindex));
1646 
1647 		l0index = ptepindex - (NUL2E + NUL1E);
1648 		l0 = &pmap->pm_top[l0index];
1649 		KASSERT((pmap_load(l0) & PTE_V) == 0,
1650 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1651 
1652 		entry = PTE_V | (pn << PTE_PPN0_S);
1653 		pmap_store(l0, entry);
1654 	} else if (ptepindex >= NUL2E) {
1655 		pd_entry_t *l0, *l1;
1656 		vm_pindex_t l0index, l1index;
1657 
1658 		l1index = ptepindex - NUL2E;
1659 		if (pmap_mode == PMAP_MODE_SV39) {
1660 			l1 = &pmap->pm_top[l1index];
1661 		} else {
1662 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1663 			l0 = &pmap->pm_top[l0index];
1664 			if (pmap_load(l0) == 0) {
1665 				/* Recurse to allocate the L1 page. */
1666 				if (_pmap_alloc_l3(pmap,
1667 				    NUL2E + NUL1E + l0index, lockp) == NULL)
1668 					goto fail;
1669 				phys = PTE_TO_PHYS(pmap_load(l0));
1670 			} else {
1671 				phys = PTE_TO_PHYS(pmap_load(l0));
1672 				pdpg = PHYS_TO_VM_PAGE(phys);
1673 				pdpg->ref_count++;
1674 			}
1675 			l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1676 			l1 = &l1[ptepindex & Ln_ADDR_MASK];
1677 		}
1678 		KASSERT((pmap_load(l1) & PTE_V) == 0,
1679 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1680 
1681 		entry = PTE_V | (pn << PTE_PPN0_S);
1682 		pmap_store(l1, entry);
1683 		pmap_distribute_l1(pmap, l1index, entry);
1684 	} else {
1685 		vm_pindex_t l0index, l1index;
1686 		pd_entry_t *l0, *l1, *l2;
1687 
1688 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1689 		if (pmap_mode == PMAP_MODE_SV39) {
1690 			l1 = &pmap->pm_top[l1index];
1691 			if (pmap_load(l1) == 0) {
1692 				/* recurse for allocating page dir */
1693 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1694 				    lockp) == NULL)
1695 					goto fail;
1696 			} else {
1697 				pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1698 				pdpg->ref_count++;
1699 			}
1700 		} else {
1701 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1702 			l0 = &pmap->pm_top[l0index];
1703 			if (pmap_load(l0) == 0) {
1704 				/* Recurse to allocate the L1 entry. */
1705 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1706 				    lockp) == NULL)
1707 					goto fail;
1708 				phys = PTE_TO_PHYS(pmap_load(l0));
1709 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1710 				l1 = &l1[l1index & Ln_ADDR_MASK];
1711 			} else {
1712 				phys = PTE_TO_PHYS(pmap_load(l0));
1713 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1714 				l1 = &l1[l1index & Ln_ADDR_MASK];
1715 				if (pmap_load(l1) == 0) {
1716 					/* Recurse to allocate the L2 page. */
1717 					if (_pmap_alloc_l3(pmap,
1718 					    NUL2E + l1index, lockp) == NULL)
1719 						goto fail;
1720 				} else {
1721 					pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
1722 					pdpg->ref_count++;
1723 				}
1724 			}
1725 		}
1726 
1727 		phys = PTE_TO_PHYS(pmap_load(l1));
1728 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1729 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1730 		KASSERT((pmap_load(l2) & PTE_V) == 0,
1731 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1732 
1733 		entry = PTE_V | (pn << PTE_PPN0_S);
1734 		pmap_store(l2, entry);
1735 	}
1736 
1737 	pmap_resident_count_inc(pmap, 1);
1738 
1739 	return (m);
1740 
1741 fail:
1742 	vm_page_unwire_noq(m);
1743 	vm_page_free_zero(m);
1744 	return (NULL);
1745 }
1746 
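/*
 * Note on page table page indices: _pmap_alloc_l3() and the helpers below
 * partition the pindex space by level.  Indices in [0, NUL2E) name L3
 * (leaf) page table pages, [NUL2E, NUL2E + NUL1E) name L2 page table
 * pages, and [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) name L1 page table
 * pages (SV48 mode only).
 */
#if 0
/*
 * Illustrative sketch only (not compiled): recovering the level from a
 * page table page index, mirroring the branches in _pmap_alloc_l3().
 */
static __inline int
ptepindex_level(vm_pindex_t ptepindex)
{

	if (ptepindex >= NUL2E + NUL1E)
		return (1);		/* an L1 page table page (SV48) */
	else if (ptepindex >= NUL2E)
		return (2);		/* an L2 page table page */
	else
		return (3);		/* an L3 (leaf) page table page */
}
#endif
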
1747 static vm_page_t
1748 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1749 {
1750 	pd_entry_t *l1;
1751 	vm_page_t l2pg;
1752 	vm_pindex_t pindex;
1753 
1754 retry:
1755 	l1 = pmap_l1(pmap, va);
1756 	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1757 		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1758 		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1759 		    pmap_load(l1), va));
1760 		/* Add a reference to the L2 page. */
1761 		l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
1762 		l2pg->ref_count++;
1763 	} else {
1764 		/* Allocate a L2 page. */
1765 		pindex = pmap_l1_pindex(va);
1766 		l2pg = _pmap_alloc_l3(pmap, pindex, lockp);
1767 		if (l2pg == NULL && lockp != NULL)
1768 			goto retry;
1769 	}
1770 	return (l2pg);
1771 }
1772 
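/*
 * In both pmap_alloc_l2() above and pmap_alloc_l3() below, a NULL return
 * from _pmap_alloc_l3() with a non-NULL lock pointer means that the
 * allocator may have slept and dropped the pmap lock, so the page table
 * walk is simply retried; another thread may have installed the needed
 * page table page in the meantime.
 */
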
1773 static vm_page_t
1774 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1775 {
1776 	vm_pindex_t ptepindex;
1777 	pd_entry_t *l2;
1778 	vm_page_t m;
1779 
1780 	/*
1781 	 * Calculate the page table page index.
1782 	 */
1783 	ptepindex = pmap_l2_pindex(va);
1784 retry:
1785 	/*
1786 	 * Get the page directory entry
1787 	 */
1788 	l2 = pmap_l2(pmap, va);
1789 
1790 	/*
1791 	 * If the page table page is already mapped, just increment its
1792 	 * reference count.
1793 	 */
1794 	if (l2 != NULL && pmap_load(l2) != 0) {
1795 		m = PTE_TO_VM_PAGE(pmap_load(l2));
1796 		m->ref_count++;
1797 	} else {
1798 		/*
1799 		 * The page table page is not mapped, or it has been
1800 		 * deallocated; allocate a new one.
1801 		 */
1802 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1803 		if (m == NULL && lockp != NULL)
1804 			goto retry;
1805 	}
1806 	return (m);
1807 }
1808 
1809 /***************************************************
1810  * Pmap allocation/deallocation routines.
1811  ***************************************************/
1812 
1813 /*
1814  * Release any resources held by the given physical map.
1815  * Called when a pmap initialized by pmap_pinit is being released.
1816  * Should only be called if the map contains no valid mappings.
1817  */
1818 void
1819 pmap_release(pmap_t pmap)
1820 {
1821 	vm_page_t m;
1822 	int npages;
1823 	int i;
1824 
1825 	KASSERT(pmap->pm_stats.resident_count == 0,
1826 	    ("pmap_release: pmap resident count %ld != 0",
1827 	    pmap->pm_stats.resident_count));
1828 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1829 	    ("releasing active pmap %p", pmap));
1830 
1831 	if (pmap->pm_stage == PM_STAGE2)
1832 		goto finish;
1833 
1834 	if (pmap_mode == PMAP_MODE_SV39) {
1835 		mtx_lock(&allpmaps_lock);
1836 		LIST_REMOVE(pmap, pm_list);
1837 		mtx_unlock(&allpmaps_lock);
1838 	}
1839 
1840 finish:
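	/*
	 * A stage 2 pmap uses the Sv39x4/Sv48x4 format, whose root page
	 * table is 16KB, i.e. four physically contiguous pages; a stage 1
	 * root table occupies a single page.
	 */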
1841 	npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
1842 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
1843 	for (i = 0; i < npages; i++) {
1844 		vm_page_unwire_noq(m);
1845 		vm_page_free(m);
1846 		m++;
1847 	}
1848 }
1849 
1850 static int
1851 kvm_size(SYSCTL_HANDLER_ARGS)
1852 {
1853 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1854 
1855 	return sysctl_handle_long(oidp, &ksize, 0, req);
1856 }
1857 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1858     0, 0, kvm_size, "LU",
1859     "Size of KVM");
1860 
1861 static int
1862 kvm_free(SYSCTL_HANDLER_ARGS)
1863 {
1864 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1865 
1866 	return sysctl_handle_long(oidp, &kfree, 0, req);
1867 }
1868 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1869     0, 0, kvm_free, "LU",
1870     "Amount of KVM free");
1871 
1872 /*
1873  * Grow the kernel page tables, if needed, to map addresses up to "addr".
1874  */
1875 void
1876 pmap_growkernel(vm_offset_t addr)
1877 {
1878 	vm_paddr_t paddr;
1879 	vm_page_t nkpg;
1880 	pd_entry_t *l1, *l2;
1881 	pt_entry_t entry;
1882 	pn_t pn;
1883 
1884 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1885 
1886 	addr = roundup2(addr, L2_SIZE);
1887 	if (addr - 1 >= vm_map_max(kernel_map))
1888 		addr = vm_map_max(kernel_map);
1889 	while (kernel_vm_end < addr) {
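		/*
		 * Each iteration ensures that an L3 page table page backs
		 * the 2MB region containing kernel_vm_end, first installing
		 * the L1 entry (a new L2 page table page) if it is missing,
		 * and then advances kernel_vm_end by L2_SIZE.
		 */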
1890 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1891 		if (pmap_load(l1) == 0) {
1892 			/* We need a new L1 entry (a new L2 page table page). */
1893 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1894 			    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1895 			if (nkpg == NULL)
1896 				panic("%s: no memory to grow kernel", __func__);
1897 			nkpg->pindex = pmap_l1_pindex(kernel_vm_end);
1898 			paddr = VM_PAGE_TO_PHYS(nkpg);
1899 
1900 			pn = (paddr / PAGE_SIZE);
1901 			entry = (PTE_V);
1902 			entry |= (pn << PTE_PPN0_S);
1903 			pmap_store(l1, entry);
1904 			pmap_distribute_l1(kernel_pmap,
1905 			    pmap_l1_index(kernel_vm_end), entry);
1906 			continue; /* try again */
1907 		}
1908 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1909 		if ((pmap_load(l2) & PTE_V) != 0 &&
1910 		    (pmap_load(l2) & PTE_RWX) == 0) {
1911 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1912 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1913 				kernel_vm_end = vm_map_max(kernel_map);
1914 				break;
1915 			}
1916 			continue;
1917 		}
1918 
1919 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1920 		    VM_ALLOC_NOFREE | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1921 		if (nkpg == NULL)
1922 			panic("%s: no memory to grow kernel", __func__);
1923 		nkpg->pindex = pmap_l2_pindex(kernel_vm_end);
1924 		paddr = VM_PAGE_TO_PHYS(nkpg);
1925 
1926 		pn = (paddr / PAGE_SIZE);
1927 		entry = (PTE_V);
1928 		entry |= (pn << PTE_PPN0_S);
1929 		pmap_store(l2, entry);
1930 
1931 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1932 
1933 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1934 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1935 			kernel_vm_end = vm_map_max(kernel_map);
1936 			break;
1937 		}
1938 	}
1939 }
1940 
1941 /***************************************************
1942  * page management routines.
1943  ***************************************************/
1944 
1945 static const uint64_t pc_freemask[_NPCM] = {
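/*
 * pc_freemask describes a pv_chunk with every entry free: each set bit in
 * pc_map[] marks a free pv entry.  The final word is only the partial mask
 * PC_FREEL because _NPCPV is not a multiple of 64.
 */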
1946 	[0 ... _NPCM - 2] = PC_FREEN,
1947 	[_NPCM - 1] = PC_FREEL
1948 };
1949 
1950 #ifdef PV_STATS
1951 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1952 
1953 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1954 	"Current number of pv entry chunks");
1955 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1956 	"Total number of pv entry chunks allocated");
1957 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1958 	"Total number of pv entry chunks freed");
1959 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1960 	"Number of times a pv chunk page allocation failed");
1961 
1962 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1963 static int pv_entry_spare;
1964 
1965 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1966 	"Total number of pv entry frees");
1967 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1968 	"Total number of pv entry allocs");
1969 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1970 	"Current number of pv entries");
1971 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1972 	"Current number of spare pv entries");
1973 #endif
1974 
1975 /*
1976  * We are in a serious low memory condition.  Resort to
1977  * drastic measures to free some pages so we can allocate
1978  * another pv entry chunk.
1979  *
1980  * Returns NULL if PV entries were reclaimed from the specified pmap.
1981  *
1982  * We do not, however, unmap 2mpages because subsequent accesses will
1983  * allocate per-page pv entries until repromotion occurs, thereby
1984  * exacerbating the shortage of free pv entries.
1985  */
1986 static vm_page_t
1987 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1988 {
1989 
1990 	panic("RISCVTODO: reclaim_pv_chunk");
1991 }
1992 
1993 /*
1994  * free the pv_entry back to the free list
1995  */
1996 static void
1997 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1998 {
1999 	struct pv_chunk *pc;
2000 	int idx, field, bit;
2001 
2002 	rw_assert(&pvh_global_lock, RA_LOCKED);
2003 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2004 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2005 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2006 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2007 	pc = pv_to_chunk(pv);
2008 	idx = pv - &pc->pc_pventry[0];
2009 	field = idx / 64;
2010 	bit = idx % 64;
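	/* For example, the pv entry at index 69 maps to pc_map[1], bit 5. */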
2011 	pc->pc_map[field] |= 1ul << bit;
2012 	if (!pc_is_free(pc)) {
2013 		/* 98% of the time, pc is already at the head of the list. */
2014 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2015 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2016 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2017 		}
2018 		return;
2019 	}
2020 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2021 	free_pv_chunk(pc);
2022 }
2023 
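#if 0
/*
 * Illustrative sketch only (not compiled): a set bit in pc_map[] means that
 * the corresponding pv entry is free, as maintained by free_pv_entry() and
 * get_pv_entry().
 */
static __inline bool
pv_chunk_entry_is_free(struct pv_chunk *pc, int idx)
{

	return ((pc->pc_map[idx / 64] & (1ul << (idx % 64))) != 0);
}
#endif
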
2024 static void
2025 free_pv_chunk(struct pv_chunk *pc)
2026 {
2027 	vm_page_t m;
2028 
2029 	mtx_lock(&pv_chunks_mutex);
2030 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2031 	mtx_unlock(&pv_chunks_mutex);
2032 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2033 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2034 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2035 	/* entire chunk is free, return it */
2036 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2037 	dump_drop_page(m->phys_addr);
2038 	vm_page_unwire_noq(m);
2039 	vm_page_free(m);
2040 }
2041 
2042 /*
2043  * Returns a new PV entry, allocating a new PV chunk from the system when
2044  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2045  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2046  * returned.
2047  *
2048  * The given PV list lock may be released.
2049  */
2050 static pv_entry_t
2051 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2052 {
2053 	int bit, field;
2054 	pv_entry_t pv;
2055 	struct pv_chunk *pc;
2056 	vm_page_t m;
2057 
2058 	rw_assert(&pvh_global_lock, RA_LOCKED);
2059 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2060 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2061 retry:
2062 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2063 	if (pc != NULL) {
2064 		for (field = 0; field < _NPCM; field++) {
2065 			if (pc->pc_map[field]) {
2066 				bit = ffsl(pc->pc_map[field]) - 1;
2067 				break;
2068 			}
2069 		}
2070 		if (field < _NPCM) {
2071 			pv = &pc->pc_pventry[field * 64 + bit];
2072 			pc->pc_map[field] &= ~(1ul << bit);
2073 			/* If this was the last free entry, move the chunk to the tail. */
2074 			if (pc_is_full(pc)) {
2075 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2076 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2077 				    pc_list);
2078 			}
2079 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2080 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2081 			return (pv);
2082 		}
2083 	}
2084 	/* No free items, allocate another chunk */
2085 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2086 	if (m == NULL) {
2087 		if (lockp == NULL) {
2088 			PV_STAT(pc_chunk_tryfail++);
2089 			return (NULL);
2090 		}
2091 		m = reclaim_pv_chunk(pmap, lockp);
2092 		if (m == NULL)
2093 			goto retry;
2094 	}
2095 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2096 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2097 	dump_add_page(m->phys_addr);
2098 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2099 	pc->pc_pmap = pmap;
2100 	pc->pc_map[0] = PC_FREEN & ~1ul;	/* preallocated bit 0 */
2101 	pc->pc_map[1] = PC_FREEN;
2102 	pc->pc_map[2] = PC_FREEL;
2103 	mtx_lock(&pv_chunks_mutex);
2104 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2105 	mtx_unlock(&pv_chunks_mutex);
2106 	pv = &pc->pc_pventry[0];
2107 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2108 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2109 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2110 	return (pv);
2111 }
2112 
2113 /*
2114  * Ensure that the number of spare PV entries in the specified pmap meets or
2115  * exceeds the given count, "needed".
2116  *
2117  * The given PV list lock may be released.
2118  */
2119 static void
2120 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2121 {
2122 	struct pch new_tail;
2123 	struct pv_chunk *pc;
2124 	vm_page_t m;
2125 	int avail, free;
2126 	bool reclaimed;
2127 
2128 	rw_assert(&pvh_global_lock, RA_LOCKED);
2129 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2130 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2131 
2132 	/*
2133 	 * Newly allocated PV chunks must be stored in a private list until
2134 	 * the required number of PV chunks have been allocated.  Otherwise,
2135 	 * reclaim_pv_chunk() could recycle one of these chunks.  In contrast,
2136 	 * the chunks are added to the pmap's own list as soon as they are allocated.
2137 	 */
2138 	TAILQ_INIT(&new_tail);
2139 retry:
2140 	avail = 0;
2141 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2142 		bit_count((bitstr_t *)pc->pc_map, 0,
2143 		    sizeof(pc->pc_map) * NBBY, &free);
2144 		if (free == 0)
2145 			break;
2146 		avail += free;
2147 		if (avail >= needed)
2148 			break;
2149 	}
2150 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2151 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2152 		if (m == NULL) {
2153 			m = reclaim_pv_chunk(pmap, lockp);
2154 			if (m == NULL)
2155 				goto retry;
2156 			reclaimed = true;
2157 		}
2158 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2159 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2160 		dump_add_page(m->phys_addr);
2161 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2162 		pc->pc_pmap = pmap;
2163 		pc->pc_map[0] = PC_FREEN;
2164 		pc->pc_map[1] = PC_FREEN;
2165 		pc->pc_map[2] = PC_FREEL;
2166 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2167 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2168 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2169 
2170 		/*
2171 		 * The reclaim might have freed a chunk from the current pmap.
2172 		 * If that chunk contained available entries, we need to
2173 		 * re-count the number of available entries.
2174 		 */
2175 		if (reclaimed)
2176 			goto retry;
2177 	}
2178 	if (!TAILQ_EMPTY(&new_tail)) {
2179 		mtx_lock(&pv_chunks_mutex);
2180 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2181 		mtx_unlock(&pv_chunks_mutex);
2182 	}
2183 }
2184 
2185 /*
2186  * First find and then remove the pv entry for the specified pmap and virtual
2187  * address from the specified pv list.  Returns the pv entry if found and NULL
2188  * otherwise.  This operation can be performed on pv lists for either 4KB or
2189  * 2MB page mappings.
2190  */
2191 static __inline pv_entry_t
2192 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2193 {
2194 	pv_entry_t pv;
2195 
2196 	rw_assert(&pvh_global_lock, RA_LOCKED);
2197 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2198 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2199 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2200 			pvh->pv_gen++;
2201 			break;
2202 		}
2203 	}
2204 	return (pv);
2205 }
2206 
2207 /*
2208  * First find and then destroy the pv entry for the specified pmap and virtual
2209  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2210  * page mappings.
2211  */
2212 static void
2213 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2214 {
2215 	pv_entry_t pv;
2216 
2217 	pv = pmap_pvh_remove(pvh, pmap, va);
2218 
2219 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2220 	free_pv_entry(pmap, pv);
2221 }
2222 
2223 /*
2224  * Conditionally create the PV entry for a 4KB page mapping if the required
2225  * memory can be allocated without resorting to reclamation.
2226  */
2227 static bool
2228 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2229     struct rwlock **lockp)
2230 {
2231 	pv_entry_t pv;
2232 
2233 	rw_assert(&pvh_global_lock, RA_LOCKED);
2234 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2235 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2236 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2237 		pv->pv_va = va;
2238 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2239 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2240 		m->md.pv_gen++;
2241 		return (true);
2242 	} else
2243 		return (false);
2244 }
2245 
2246 /*
2247  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2248  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2249  * entries for each of the 4KB page mappings.
2250  */
2251 static void __unused
2252 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2253     struct rwlock **lockp)
2254 {
2255 	struct md_page *pvh;
2256 	struct pv_chunk *pc;
2257 	pv_entry_t pv;
2258 	vm_page_t m;
2259 	vm_offset_t va_last;
2260 	int bit, field;
2261 
2262 	rw_assert(&pvh_global_lock, RA_LOCKED);
2263 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2264 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2265 
2266 	/*
2267 	 * Transfer the 2mpage's pv entry for this mapping to the first
2268 	 * page's pv list.  Once this transfer begins, the pv list lock
2269 	 * must not be released until the last pv entry is reinstantiated.
2270 	 */
2271 	pvh = pa_to_pvh(pa);
2272 	va &= ~L2_OFFSET;
2273 	pv = pmap_pvh_remove(pvh, pmap, va);
2274 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2275 	m = PHYS_TO_VM_PAGE(pa);
2276 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2277 	m->md.pv_gen++;
2278 	/* Instantiate the remaining 511 pv entries. */
2279 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2280 	va_last = va + L2_SIZE - PAGE_SIZE;
2281 	for (;;) {
2282 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2283 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
2284 		for (field = 0; field < _NPCM; field++) {
2285 			while (pc->pc_map[field] != 0) {
2286 				bit = ffsl(pc->pc_map[field]) - 1;
2287 				pc->pc_map[field] &= ~(1ul << bit);
2288 				pv = &pc->pc_pventry[field * 64 + bit];
2289 				va += PAGE_SIZE;
2290 				pv->pv_va = va;
2291 				m++;
2292 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2293 				    ("pmap_pv_demote_l2: page %p is not managed", m));
2294 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2295 				m->md.pv_gen++;
2296 				if (va == va_last)
2297 					goto out;
2298 			}
2299 		}
2300 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2301 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2302 	}
2303 out:
2304 	if (pc_is_full(pc)) {
2305 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2306 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2307 	}
2308 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2309 	PV_STAT(atomic_add_int(&pv_entry_spare, -(Ln_ENTRIES - 1)));
2310 }
2311 
2312 #if VM_NRESERVLEVEL > 0
2313 static void
2314 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2315     struct rwlock **lockp)
2316 {
2317 	struct md_page *pvh;
2318 	pv_entry_t pv;
2319 	vm_page_t m;
2320 	vm_offset_t va_last;
2321 
2322 	rw_assert(&pvh_global_lock, RA_LOCKED);
2323 	KASSERT((pa & L2_OFFSET) == 0,
2324 	    ("pmap_pv_promote_l2: misaligned pa %#lx", pa));
2325 
2326 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2327 
2328 	m = PHYS_TO_VM_PAGE(pa);
2329 	va = va & ~L2_OFFSET;
2330 	pv = pmap_pvh_remove(&m->md, pmap, va);
2331 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2332 	pvh = pa_to_pvh(pa);
2333 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2334 	pvh->pv_gen++;
2335 
2336 	va_last = va + L2_SIZE - PAGE_SIZE;
2337 	do {
2338 		m++;
2339 		va += PAGE_SIZE;
2340 		pmap_pvh_free(&m->md, pmap, va);
2341 	} while (va < va_last);
2342 }
2343 #endif /* VM_NRESERVLEVEL > 0 */
2344 
2345 /*
2346  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2347  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2348  * false if the PV entry cannot be allocated without resorting to reclamation.
2349  */
2350 static bool
2351 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2352     struct rwlock **lockp)
2353 {
2354 	struct md_page *pvh;
2355 	pv_entry_t pv;
2356 	vm_paddr_t pa;
2357 
2358 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2359 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2360 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2361 	    NULL : lockp)) == NULL)
2362 		return (false);
2363 	pv->pv_va = va;
2364 	pa = PTE_TO_PHYS(l2e);
2365 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2366 	pvh = pa_to_pvh(pa);
2367 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2368 	pvh->pv_gen++;
2369 	return (true);
2370 }
2371 
2372 static void
2373 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2374 {
2375 	pt_entry_t newl2, oldl2 __diagused;
2376 	vm_page_t ml3;
2377 	vm_paddr_t ml3pa;
2378 
2379 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2380 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2381 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2382 
2383 	ml3 = pmap_remove_pt_page(pmap, va);
2384 	if (ml3 == NULL)
2385 		panic("pmap_remove_kernel_l2: Missing pt page");
2386 
2387 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2388 	newl2 = ml3pa | PTE_V;
2389 
2390 	/*
2391 	 * If this page table page was unmapped by a promotion, then it
2392 	 * contains valid mappings.  Zero it to invalidate those mappings.
2393 	 */
2394 	if (vm_page_any_valid(ml3))
2395 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2396 
2397 	/*
2398 	 * Demote the mapping.
2399 	 */
2400 	oldl2 = pmap_load_store(l2, newl2);
2401 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2402 	    __func__, l2, oldl2));
2403 }
2404 
2405 /*
2406  * pmap_remove_l2: Unmap a 2MB (level 2) superpage mapping.
2407  */
2408 static int
2409 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2410     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2411 {
2412 	struct md_page *pvh;
2413 	pt_entry_t oldl2;
2414 	vm_offset_t eva, va;
2415 	vm_page_t m, ml3;
2416 
2417 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2418 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2419 	oldl2 = pmap_load_clear(l2);
2420 	KASSERT((oldl2 & PTE_RWX) != 0,
2421 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2422 
2423 	/*
2424 	 * The sfence.vma documentation states that it is sufficient to specify
2425 	 * a single address within a superpage mapping.  However, since we do
2426 	 * not perform any invalidation upon promotion, TLBs may still be
2427 	 * caching 4KB mappings within the superpage, so we must invalidate the
2428 	 * entire range.
2429 	 */
2430 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2431 	if ((oldl2 & PTE_SW_WIRED) != 0)
2432 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2433 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2434 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
2435 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2436 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2437 		pmap_pvh_free(pvh, pmap, sva);
2438 		eva = sva + L2_SIZE;
2439 		for (va = sva, m = PTE_TO_VM_PAGE(oldl2);
2440 		    va < eva; va += PAGE_SIZE, m++) {
2441 			if ((oldl2 & PTE_D) != 0)
2442 				vm_page_dirty(m);
2443 			if ((oldl2 & PTE_A) != 0)
2444 				vm_page_aflag_set(m, PGA_REFERENCED);
2445 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2446 			    TAILQ_EMPTY(&pvh->pv_list))
2447 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2448 		}
2449 	}
2450 	if (pmap == kernel_pmap) {
2451 		pmap_remove_kernel_l2(pmap, l2, sva);
2452 	} else {
2453 		ml3 = pmap_remove_pt_page(pmap, sva);
2454 		if (ml3 != NULL) {
2455 			KASSERT(vm_page_any_valid(ml3),
2456 			    ("pmap_remove_l2: l3 page not promoted"));
2457 			pmap_resident_count_dec(pmap, 1);
2458 			KASSERT(ml3->ref_count == Ln_ENTRIES,
2459 			    ("pmap_remove_l2: l3 page ref count error"));
2460 			ml3->ref_count = 1;
2461 			vm_page_unwire_noq(ml3);
2462 			pmap_add_delayed_free_list(ml3, free, false);
2463 		}
2464 	}
2465 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2466 }
2467 
2468 /*
2469  * pmap_remove_l3: Unmap a single 4KB page mapping in a process.
2470  */
2471 static int
2472 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2473     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2474 {
2475 	struct md_page *pvh;
2476 	pt_entry_t old_l3;
2477 	vm_page_t m;
2478 
2479 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2480 	old_l3 = pmap_load_clear(l3);
2481 	pmap_invalidate_page(pmap, va);
2482 	if (old_l3 & PTE_SW_WIRED)
2483 		pmap->pm_stats.wired_count -= 1;
2484 	pmap_resident_count_dec(pmap, 1);
2485 	if (old_l3 & PTE_SW_MANAGED) {
2486 		m = PTE_TO_VM_PAGE(old_l3);
2487 		if ((old_l3 & PTE_D) != 0)
2488 			vm_page_dirty(m);
2489 		if (old_l3 & PTE_A)
2490 			vm_page_aflag_set(m, PGA_REFERENCED);
2491 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2492 		pmap_pvh_free(&m->md, pmap, va);
2493 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2494 		    (m->flags & PG_FICTITIOUS) == 0) {
2495 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2496 			if (TAILQ_EMPTY(&pvh->pv_list))
2497 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2498 		}
2499 	}
2500 
2501 	return (pmap_unuse_pt(pmap, va, l2e, free));
2502 }
2503 
2504 /*
2505  *	Remove the given range of addresses from the specified map.
2506  *
2507  *	It is assumed that the start and end are properly
2508  *	rounded to the page size.
2509  */
2510 void
2511 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2512 {
2513 	struct spglist free;
2514 	struct rwlock *lock;
2515 	vm_offset_t va, va_next;
2516 	pd_entry_t *l0, *l1, *l2, l2e;
2517 	pt_entry_t *l3;
2518 
2519 	/*
2520 	 * This unsynchronized read is safe; it merely avoids useless work.
2521 	 */
2522 	if (pmap->pm_stats.resident_count == 0)
2523 		return;
2524 
2525 	SLIST_INIT(&free);
2526 
2527 	rw_rlock(&pvh_global_lock);
2528 	PMAP_LOCK(pmap);
2529 
2530 	lock = NULL;
2531 	for (; sva < eva; sva = va_next) {
2532 		if (pmap->pm_stats.resident_count == 0)
2533 			break;
2534 
2535 		if (pmap_mode == PMAP_MODE_SV48) {
2536 			l0 = pmap_l0(pmap, sva);
2537 			if (pmap_load(l0) == 0) {
2538 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2539 				if (va_next < sva)
2540 					va_next = eva;
2541 				continue;
2542 			}
2543 			l1 = pmap_l0_to_l1(l0, sva);
2544 		} else {
2545 			l1 = pmap_l1(pmap, sva);
2546 		}
2547 
2548 		if (pmap_load(l1) == 0) {
2549 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2550 			if (va_next < sva)
2551 				va_next = eva;
2552 			continue;
2553 		}
2554 
2555 		/*
2556 		 * Compute the start of the next page table page's VA range.
2557 		 */
2558 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2559 		if (va_next < sva)
2560 			va_next = eva;
2561 
2562 		l2 = pmap_l1_to_l2(l1, sva);
2563 		if ((l2e = pmap_load(l2)) == 0)
2564 			continue;
2565 		if ((l2e & PTE_RWX) != 0) {
2566 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2567 				(void)pmap_remove_l2(pmap, l2, sva,
2568 				    pmap_load(l1), &free, &lock);
2569 				continue;
2570 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
2571 			    &lock)) {
2572 				/*
2573 				 * The large page mapping was destroyed.
2574 				 */
2575 				continue;
2576 			}
2577 			l2e = pmap_load(l2);
2578 		}
2579 
2580 		/*
2581 		 * Limit our scan to either the end of the va represented
2582 		 * by the current page table page, or to the end of the
2583 		 * range being removed.
2584 		 */
2585 		if (va_next > eva)
2586 			va_next = eva;
2587 
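		/*
		 * "va" tracks the start of a run of removed 4KB mappings so
		 * that their TLB invalidations can be batched into a single
		 * pmap_invalidate_range() call.
		 */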
2588 		va = va_next;
2589 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2590 		    sva += L3_SIZE) {
2591 			if (pmap_load(l3) == 0) {
2592 				if (va != va_next) {
2593 					pmap_invalidate_range(pmap, va, sva);
2594 					va = va_next;
2595 				}
2596 				continue;
2597 			}
2598 			if (va == va_next)
2599 				va = sva;
2600 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2601 				sva += L3_SIZE;
2602 				break;
2603 			}
2604 		}
2605 		if (va != va_next)
2606 			pmap_invalidate_range(pmap, va, sva);
2607 	}
2608 	if (lock != NULL)
2609 		rw_wunlock(lock);
2610 	rw_runlock(&pvh_global_lock);
2611 	PMAP_UNLOCK(pmap);
2612 	vm_page_free_pages_toq(&free, false);
2613 }
2614 
2615 /*
2616  *	Routine:	pmap_remove_all
2617  *	Function:
2618  *		Removes this physical page from
2619  *		all physical maps in which it resides.
2620  *		Reflects back modify bits to the pager.
2621  *
2622  *	Notes:
2623  *		Original versions of this routine were very
2624  *		inefficient because they iteratively called
2625  *		pmap_remove (slow...)
2626  */
2627 
2628 void
2629 pmap_remove_all(vm_page_t m)
2630 {
2631 	struct spglist free;
2632 	struct md_page *pvh;
2633 	pmap_t pmap;
2634 	pt_entry_t *l3, l3e;
2635 	pd_entry_t *l2, l2e __diagused;
2636 	pv_entry_t pv;
2637 	vm_offset_t va;
2638 
2639 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2640 	    ("pmap_remove_all: page %p is not managed", m));
2641 	SLIST_INIT(&free);
2642 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2643 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2644 
2645 	rw_wlock(&pvh_global_lock);
2646 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2647 		pmap = PV_PMAP(pv);
2648 		PMAP_LOCK(pmap);
2649 		va = pv->pv_va;
2650 		l2 = pmap_l2(pmap, va);
2651 		(void)pmap_demote_l2(pmap, l2, va);
2652 		PMAP_UNLOCK(pmap);
2653 	}
2654 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2655 		pmap = PV_PMAP(pv);
2656 		PMAP_LOCK(pmap);
2657 		pmap_resident_count_dec(pmap, 1);
2658 		l2 = pmap_l2(pmap, pv->pv_va);
2659 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2660 		l2e = pmap_load(l2);
2661 
2662 		KASSERT((l2e & PTE_RX) == 0,
2663 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
2664 
2665 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2666 		l3e = pmap_load_clear(l3);
2667 		pmap_invalidate_page(pmap, pv->pv_va);
2668 		if (l3e & PTE_SW_WIRED)
2669 			pmap->pm_stats.wired_count--;
2670 		if ((l3e & PTE_A) != 0)
2671 			vm_page_aflag_set(m, PGA_REFERENCED);
2672 
2673 		/*
2674 		 * Update the vm_page_t clean and reference bits.
2675 		 */
2676 		if ((l3e & PTE_D) != 0)
2677 			vm_page_dirty(m);
2678 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2679 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2680 		m->md.pv_gen++;
2681 		free_pv_entry(pmap, pv);
2682 		PMAP_UNLOCK(pmap);
2683 	}
2684 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2685 	rw_wunlock(&pvh_global_lock);
2686 	vm_page_free_pages_toq(&free, false);
2687 }
2688 
2689 /*
2690  *	Set the physical protection on the
2691  *	specified range of this map as requested.
2692  */
2693 void
2694 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2695 {
2696 	pd_entry_t *l0, *l1, *l2, l2e;
2697 	pt_entry_t *l3, l3e, mask;
2698 	vm_page_t m, mt;
2699 	vm_offset_t va_next;
2700 	bool anychanged, pv_lists_locked;
2701 
2702 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2703 		pmap_remove(pmap, sva, eva);
2704 		return;
2705 	}
2706 
2707 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2708 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2709 		return;
2710 
2711 	anychanged = false;
2712 	pv_lists_locked = false;
2713 	mask = 0;
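	/*
	 * When revoking write access, clear PTE_D along with PTE_W so that
	 * the PTE is not left with a stale dirty bit; the dirty state is
	 * transferred to the vm_page via vm_page_dirty() below, and a later
	 * write must set PTE_D again (in hardware or via pmap_fault()).
	 */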
2714 	if ((prot & VM_PROT_WRITE) == 0)
2715 		mask |= PTE_W | PTE_D;
2716 	if ((prot & VM_PROT_EXECUTE) == 0)
2717 		mask |= PTE_X;
2718 resume:
2719 	PMAP_LOCK(pmap);
2720 	for (; sva < eva; sva = va_next) {
2721 		if (pmap_mode == PMAP_MODE_SV48) {
2722 			l0 = pmap_l0(pmap, sva);
2723 			if (pmap_load(l0) == 0) {
2724 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2725 				if (va_next < sva)
2726 					va_next = eva;
2727 				continue;
2728 			}
2729 			l1 = pmap_l0_to_l1(l0, sva);
2730 		} else {
2731 			l1 = pmap_l1(pmap, sva);
2732 		}
2733 
2734 		if (pmap_load(l1) == 0) {
2735 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2736 			if (va_next < sva)
2737 				va_next = eva;
2738 			continue;
2739 		}
2740 
2741 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2742 		if (va_next < sva)
2743 			va_next = eva;
2744 
2745 		l2 = pmap_l1_to_l2(l1, sva);
2746 		if ((l2e = pmap_load(l2)) == 0)
2747 			continue;
2748 		if ((l2e & PTE_RWX) != 0) {
2749 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2750 retryl2:
2751 				if ((prot & VM_PROT_WRITE) == 0 &&
2752 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2753 				    (PTE_SW_MANAGED | PTE_D)) {
2754 					m = PTE_TO_VM_PAGE(l2e);
2755 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2756 						vm_page_dirty(mt);
2757 				}
2758 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2759 					goto retryl2;
2760 				anychanged = true;
2761 				continue;
2762 			} else {
2763 				if (!pv_lists_locked) {
2764 					pv_lists_locked = true;
2765 					if (!rw_try_rlock(&pvh_global_lock)) {
2766 						if (anychanged)
2767 							pmap_invalidate_all(
2768 							    pmap);
2769 						PMAP_UNLOCK(pmap);
2770 						rw_rlock(&pvh_global_lock);
2771 						goto resume;
2772 					}
2773 				}
2774 				if (!pmap_demote_l2(pmap, l2, sva)) {
2775 					/*
2776 					 * The large page mapping was destroyed.
2777 					 */
2778 					continue;
2779 				}
2780 			}
2781 		}
2782 
2783 		if (va_next > eva)
2784 			va_next = eva;
2785 
2786 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2787 		    sva += L3_SIZE) {
2788 			l3e = pmap_load(l3);
2789 retryl3:
2790 			if ((l3e & PTE_V) == 0)
2791 				continue;
2792 			if ((prot & VM_PROT_WRITE) == 0 &&
2793 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2794 			    (PTE_SW_MANAGED | PTE_D)) {
2795 				m = PTE_TO_VM_PAGE(l3e);
2796 				vm_page_dirty(m);
2797 			}
2798 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2799 				goto retryl3;
2800 			anychanged = true;
2801 		}
2802 	}
2803 	if (anychanged)
2804 		pmap_invalidate_all(pmap);
2805 	if (pv_lists_locked)
2806 		rw_runlock(&pvh_global_lock);
2807 	PMAP_UNLOCK(pmap);
2808 }
2809 
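/*
 * Handle a fault on the given virtual address that may simply require the
 * accessed (PTE_A) and/or dirty (PTE_D) bits to be set, for example on
 * implementations that raise a page fault rather than updating A/D bits in
 * hardware.  Returns 1 if the fault was resolved here and 0 if it must be
 * handled by the VM fault path.
 */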
2810 int
2811 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2812 {
2813 	pd_entry_t *l2, l2e;
2814 	pt_entry_t bits, *pte, oldpte;
2815 	int rv;
2816 
2817 	KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2818 
2819 	rv = 0;
2820 	PMAP_LOCK(pmap);
2821 	l2 = pmap_l2(pmap, va);
2822 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2823 		goto done;
2824 	if ((l2e & PTE_RWX) == 0) {
2825 		pte = pmap_l2_to_l3(l2, va);
2826 		if (((oldpte = pmap_load(pte)) & PTE_V) == 0)
2827 			goto done;
2828 	} else {
2829 		pte = l2;
2830 		oldpte = l2e;
2831 	}
2832 
2833 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2834 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2835 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2836 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2837 		goto done;
2838 
2839 	bits = PTE_A;
2840 	if (ftype == VM_PROT_WRITE)
2841 		bits |= PTE_D;
2842 
2843 	/*
2844 	 * Spurious faults can occur if the implementation caches invalid
2845 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2846 	 * race with each other.
2847 	 */
2848 	if ((oldpte & bits) != bits)
2849 		pmap_store_bits(pte, bits);
2850 	sfence_vma();
2851 	rv = 1;
2852 done:
2853 	PMAP_UNLOCK(pmap);
2854 	return (rv);
2855 }
2856 
2857 /*
2858  *	Demote the specified L1 page to separate L2 pages.
2859  *	Currently only used for DMAP entries.
2860  */
2861 static bool
2862 pmap_demote_l1(pmap_t pmap, pd_entry_t *l1, vm_offset_t va)
2863 {
2864 	vm_page_t m;
2865 	pt_entry_t *l2, oldl1, newl2;
2866 	pd_entry_t newl1;
2867 	vm_paddr_t l2phys;
2868 
2869 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2870 
2871 	oldl1 = pmap_load(l1);
2872 	KASSERT((oldl1 & PTE_RWX) != 0,
2873 	    ("pmap_demote_l1: oldl1 is not a leaf PTE"));
2874 	KASSERT((oldl1 & PTE_A) != 0,
2875 	    ("pmap_demote_l1: oldl1 is missing PTE_A"));
2876 	KASSERT((oldl1 & (PTE_D | PTE_W)) != PTE_W,
2877 	    ("pmap_demote_l1: not dirty!"));
2878 	KASSERT((oldl1 & PTE_SW_MANAGED) == 0,
2879 	    ("pmap_demote_l1: L1 table shouldn't be managed"));
2880 	KASSERT(VIRT_IN_DMAP(va),
2881 	    ("pmap_demote_l1: is unsupported for non-DMAP va=%#lx", va));
2882 
2883 	/* Demoting L1 means we need to allocate a new page-table page. */
2884 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
2885 	if (m == NULL) {
2886 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx in pmap %p",
2887 		    va, pmap);
2888 		return (false);
2889 	}
2890 
2891 	l2phys = VM_PAGE_TO_PHYS(m);
2892 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
2893 
2894 	/*
2895 	 * Create new entries, relying on the fact that only the low bits
2896 	 * (index) of the physical address are changing.
2897 	 */
2898 	newl2 = oldl1;
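	/*
	 * Adding i << PTE_PPN1_S to the PTE advances the mapped physical
	 * address by i * L2_SIZE (2MB).
	 */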
2899 	for (int i = 0; i < Ln_ENTRIES; i++)
2900 		pmap_store(&l2[i], newl2 | (i << PTE_PPN1_S));
2901 
2902 	/*
2903 	 * And update the L1 entry.
2904 	 *
2905 	 * NB: flushing the TLB is the responsibility of the caller. Cached
2906 	 * translations are still "correct" for demoted mappings until some
2907 	 * subset of the demoted range is modified.
2908 	 */
2909 	newl1 = ((l2phys / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2910 	pmap_store(l1, newl1);
2911 
2912 	counter_u64_add(pmap_l1_demotions, 1);
2913 	CTR2(KTR_PMAP, "pmap_demote_l1: success for va %#lx in pmap %p",
2914 	    va, pmap);
2915 	return (true);
2916 }
2917 
2918 static bool
2919 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
2920 {
2921 	struct rwlock *lock;
2922 	bool rv;
2923 
2924 	lock = NULL;
2925 	rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
2926 	if (lock != NULL)
2927 		rw_wunlock(lock);
2928 	return (rv);
2929 }
2930 
2931 /*
2932  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
2933  * mapping is invalidated.
2934  */
2935 static bool
2936 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2937     struct rwlock **lockp)
2938 {
2939 	struct spglist free;
2940 	vm_page_t mpte;
2941 	pd_entry_t newl2, oldl2;
2942 	pt_entry_t *firstl3, newl3;
2943 	vm_paddr_t mptepa;
2944 	int i;
2945 
2946 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2947 
2948 	oldl2 = pmap_load(l2);
2949 	KASSERT((oldl2 & PTE_RWX) != 0,
2950 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
2951 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2952 	    NULL) {
2953 		KASSERT((oldl2 & PTE_SW_WIRED) == 0,
2954 		    ("pmap_demote_l2_locked: page table page for a wired mapping is missing"));
2955 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
2956 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
2957 		    VM_ALLOC_WIRED)) == NULL) {
2958 			SLIST_INIT(&free);
2959 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
2960 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
2961 			vm_page_free_pages_toq(&free, true);
2962 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
2963 			    "failure for va %#lx in pmap %p", va, pmap);
2964 			return (false);
2965 		}
2966 		mpte->pindex = pmap_l2_pindex(va);
2967 		if (va < VM_MAXUSER_ADDRESS) {
2968 			mpte->ref_count = Ln_ENTRIES;
2969 			pmap_resident_count_inc(pmap, 1);
2970 		}
2971 	}
2972 	mptepa = VM_PAGE_TO_PHYS(mpte);
2973 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2974 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2975 	KASSERT((oldl2 & PTE_A) != 0,
2976 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
2977 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
2978 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
2979 	newl3 = oldl2;
2980 
2981 	/*
2982 	 * If the page table page is not leftover from an earlier promotion,
2983 	 * initialize it.
2984 	 */
2985 	if (!vm_page_all_valid(mpte)) {
2986 		for (i = 0; i < Ln_ENTRIES; i++)
2987 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2988 	}
2989 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
2990 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
2991 	    "addresses"));
2992 
2993 	/*
2994 	 * If the mapping has changed attributes, update the PTEs.
2995 	 */
2996 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
2997 		for (i = 0; i < Ln_ENTRIES; i++)
2998 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2999 
3000 	/*
3001 	 * The spare PV entries must be reserved prior to demoting the
3002 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
3003 	 * state of the L2 entry and the PV lists will be inconsistent, which
3004 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
3005 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
3006 	 * expected PV entry for the 2MB page mapping that is being demoted.
3007 	 */
3008 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3009 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
3010 
3011 	/*
3012 	 * Demote the mapping.
3013 	 */
3014 	pmap_store(l2, newl2);
3015 
3016 	/*
3017 	 * Demote the PV entry.
3018 	 */
3019 	if ((oldl2 & PTE_SW_MANAGED) != 0)
3020 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
3021 
3022 	atomic_add_long(&pmap_l2_demotions, 1);
3023 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
3024 	    va, pmap);
3025 	return (true);
3026 }
3027 
3028 #if VM_NRESERVLEVEL > 0
3029 static bool
3030 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t ml3,
3031     struct rwlock **lockp)
3032 {
3033 	pt_entry_t all_l3e_PTE_A, *firstl3, firstl3e, *l3, l3e;
3034 	vm_paddr_t pa;
3035 
3036 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3037 	if (!pmap_ps_enabled(pmap))
3038 		return (false);
3039 
3040 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3041 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
3042 
3043 	/*
3044 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
3045 	 * ineligible for promotion or does not map the first 4KB physical page
3046 	 * within a 2MB page.
3047 	 */
3048 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
3049 	firstl3e = pmap_load(firstl3);
3050 	pa = PTE_TO_PHYS(firstl3e);
3051 	if ((pa & L2_OFFSET) != 0) {
3052 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3053 		    va, pmap);
3054 		atomic_add_long(&pmap_l2_p_failures, 1);
3055 		return (false);
3056 	}
3057 
3058 	/*
3059 	 * Downgrade a clean, writable mapping to read-only to ensure that the
3060 	 * hardware does not set PTE_D while we are comparing PTEs.
3061 	 *
3062 	 * Upon a write access to a clean mapping, the implementation will
3063 	 * either atomically check protections and set PTE_D, or raise a page
3064 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
3065 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
3066 	 * to do so lazily.
3067 	 */
3068 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
3069 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
3070 			firstl3e &= ~PTE_W;
3071 			break;
3072 		}
3073 	}
3074 
3075 	/*
3076 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
3077 	 * PTE maps an unexpected 4KB physical page or does not have identical
3078 	 * characteristics to the first PTE.
3079 	 */
3080 	all_l3e_PTE_A = firstl3e & PTE_A;
3081 	pa += L2_SIZE - PAGE_SIZE;
3082 	for (l3 = firstl3 + Ln_ENTRIES - 1; l3 > firstl3; l3--) {
3083 		l3e = pmap_load(l3);
3084 		if (PTE_TO_PHYS(l3e) != pa) {
3085 			CTR2(KTR_PMAP,
3086 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3087 			    va, pmap);
3088 			atomic_add_long(&pmap_l2_p_failures, 1);
3089 			return (false);
3090 		}
3091 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
3092 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
3093 				l3e &= ~PTE_W;
3094 				break;
3095 			}
3096 		}
3097 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
3098 			CTR2(KTR_PMAP,
3099 			    "pmap_promote_l2: failure for va %#lx pmap %p",
3100 			    va, pmap);
3101 			atomic_add_long(&pmap_l2_p_failures, 1);
3102 			return (false);
3103 		}
3104 		all_l3e_PTE_A &= l3e;
3105 		pa -= PAGE_SIZE;
3106 	}
3107 
3108 	/*
3109 	 * Unless all PTEs have PTE_A set, clear it from the superpage
3110 	 * mapping, so that promotions triggered by speculative mappings,
3111 	 * such as pmap_enter_quick(), don't automatically mark the
3112 	 * underlying pages as referenced.
3113 	 */
3114 	firstl3e &= ~PTE_A | all_l3e_PTE_A;
3115 
3116 	/*
3117 	 * Save the page table page in its current state until the L2
3118 	 * mapping the superpage is demoted by pmap_demote_l2() or
3119 	 * destroyed by pmap_remove_l3().
3120 	 */
3121 	if (ml3 == NULL)
3122 		ml3 = PTE_TO_VM_PAGE(pmap_load(l2));
3123 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
3124 	    ("pmap_promote_l2: page table page's pindex is wrong"));
3125 	if (pmap_insert_pt_page(pmap, ml3, true, all_l3e_PTE_A != 0)) {
3126 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
3127 		    va, pmap);
3128 		atomic_add_long(&pmap_l2_p_failures, 1);
3129 		return (false);
3130 	}
3131 
3132 	if ((firstl3e & PTE_SW_MANAGED) != 0)
3133 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
3134 
3135 	pmap_store(l2, firstl3e);
3136 
3137 	atomic_add_long(&pmap_l2_promotions, 1);
3138 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3139 	    pmap);
3140 	return (true);
3141 }
3142 #endif
3143 
3144 /*
3145  *	Insert the given physical page (p) at
3146  *	the specified virtual address (v) in the
3147  *	target physical map with the protection requested.
3148  *
3149  *	If specified, the page will be wired down, meaning
3150  *	that the related pte can not be reclaimed.
3151  *
3152  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3153  *	or lose information.  That is, this routine must actually
3154  *	insert this page into the given map NOW.
3155  */
3156 int
3157 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3158     u_int flags, int8_t psind)
3159 {
3160 	struct rwlock *lock;
3161 	pd_entry_t *l2, l2e;
3162 	pt_entry_t new_l3, orig_l3;
3163 	pt_entry_t *l3;
3164 	pv_entry_t pv;
3165 	vm_paddr_t opa, pa;
3166 	vm_page_t mpte, om;
3167 	pn_t pn;
3168 	int rv;
3169 	bool nosleep;
3170 
3171 	va = trunc_page(va);
3172 	if ((m->oflags & VPO_UNMANAGED) == 0)
3173 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
3174 	pa = VM_PAGE_TO_PHYS(m);
3175 	pn = (pa / PAGE_SIZE);
3176 
3177 	new_l3 = PTE_V | PTE_R | PTE_A;
3178 	if (prot & VM_PROT_EXECUTE)
3179 		new_l3 |= PTE_X;
3180 	if (flags & VM_PROT_WRITE)
3181 		new_l3 |= PTE_D;
3182 	if (prot & VM_PROT_WRITE)
3183 		new_l3 |= PTE_W;
3184 	if (va < VM_MAX_USER_ADDRESS)
3185 		new_l3 |= PTE_U;
3186 
3187 	new_l3 |= (pn << PTE_PPN0_S);
3188 	if ((flags & PMAP_ENTER_WIRED) != 0)
3189 		new_l3 |= PTE_SW_WIRED;
3190 	new_l3 |= pmap_memattr_bits(m->md.pv_memattr);
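	/*
	 * For example, a wired, writable user mapping entered with
	 * VM_PROT_WRITE in "flags" now has PTE_V | PTE_R | PTE_W | PTE_A |
	 * PTE_D | PTE_U | PTE_SW_WIRED set, in addition to the PPN and the
	 * memory attribute bits; PTE_SW_MANAGED may be added below.
	 */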
3191 
3192 	/*
3193 	 * Set modified bit gratuitously for writeable mappings if
3194 	 * the page is unmanaged. We do not want to take a fault
3195 	 * to do the dirty bit accounting for these mappings.
3196 	 */
3197 	if ((m->oflags & VPO_UNMANAGED) != 0) {
3198 		if (prot & VM_PROT_WRITE)
3199 			new_l3 |= PTE_D;
3200 	} else
3201 		new_l3 |= PTE_SW_MANAGED;
3202 
3203 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3204 
3205 	lock = NULL;
3206 	mpte = NULL;
3207 	rw_rlock(&pvh_global_lock);
3208 	PMAP_LOCK(pmap);
3209 	if (psind == 1) {
3210 		/* Assert the required virtual and physical alignment. */
3211 		KASSERT((va & L2_OFFSET) == 0,
3212 		    ("pmap_enter: va %#lx unaligned", va));
3213 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3214 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
3215 		goto out;
3216 	}
3217 
3218 	l2 = pmap_l2(pmap, va);
3219 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
3220 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
3221 	    va, &lock))) {
3222 		l3 = pmap_l2_to_l3(l2, va);
3223 		if (va < VM_MAXUSER_ADDRESS) {
3224 			mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3225 			mpte->ref_count++;
3226 		}
3227 	} else if (va < VM_MAXUSER_ADDRESS) {
3228 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3229 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
3230 		if (mpte == NULL && nosleep) {
3231 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3232 			if (lock != NULL)
3233 				rw_wunlock(lock);
3234 			rw_runlock(&pvh_global_lock);
3235 			PMAP_UNLOCK(pmap);
3236 			return (KERN_RESOURCE_SHORTAGE);
3237 		}
3238 		l3 = pmap_l3(pmap, va);
3239 	} else {
3240 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3241 	}
3242 
3243 	orig_l3 = pmap_load(l3);
3244 	opa = PTE_TO_PHYS(orig_l3);
3245 	pv = NULL;
3246 
3247 	/*
3248 	 * Is the specified virtual address already mapped?
3249 	 */
3250 	if ((orig_l3 & PTE_V) != 0) {
3251 		/*
3252 		 * Wiring change, just update stats. We don't worry about
3253 		 * wiring PT pages as they remain resident as long as there
3254 		 * are valid mappings in them. Hence, if a user page is wired,
3255 		 * the PT page will be also.
3256 		 */
3257 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
3258 		    (orig_l3 & PTE_SW_WIRED) == 0)
3259 			pmap->pm_stats.wired_count++;
3260 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3261 		    (orig_l3 & PTE_SW_WIRED) != 0)
3262 			pmap->pm_stats.wired_count--;
3263 
3264 		/*
3265 		 * Remove the extra PT page reference.
3266 		 */
3267 		if (mpte != NULL) {
3268 			mpte->ref_count--;
3269 			KASSERT(mpte->ref_count > 0,
3270 			    ("pmap_enter: missing reference to page table page,"
3271 			     " va: 0x%lx", va));
3272 		}
3273 
3274 		/*
3275 		 * Has the physical page changed?
3276 		 */
3277 		if (opa == pa) {
3278 			/*
3279 			 * No, might be a protection or wiring change.
3280 			 */
3281 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3282 			    (new_l3 & PTE_W) != 0)
3283 				vm_page_aflag_set(m, PGA_WRITEABLE);
3284 			goto validate;
3285 		}
3286 
3287 		/*
3288 		 * The physical page has changed.  Temporarily invalidate
3289 		 * the mapping.  This ensures that all threads sharing the
3290 		 * pmap keep a consistent view of the mapping, which is
3291 		 * necessary for the correct handling of COW faults.  It
3292 		 * also permits reuse of the old mapping's PV entry,
3293 		 * avoiding an allocation.
3294 		 *
3295 		 * For consistency, handle unmanaged mappings the same way.
3296 		 */
3297 		orig_l3 = pmap_load_clear(l3);
3298 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3299 		    ("pmap_enter: unexpected pa update for %#lx", va));
3300 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3301 			om = PHYS_TO_VM_PAGE(opa);
3302 
3303 			/*
3304 			 * The pmap lock is sufficient to synchronize with
3305 			 * concurrent calls to pmap_page_test_mappings() and
3306 			 * pmap_ts_referenced().
3307 			 */
3308 			if ((orig_l3 & PTE_D) != 0)
3309 				vm_page_dirty(om);
3310 			if ((orig_l3 & PTE_A) != 0)
3311 				vm_page_aflag_set(om, PGA_REFERENCED);
3312 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3313 			pv = pmap_pvh_remove(&om->md, pmap, va);
3314 			KASSERT(pv != NULL,
3315 			    ("pmap_enter: no PV entry for %#lx", va));
3316 			if ((new_l3 & PTE_SW_MANAGED) == 0)
3317 				free_pv_entry(pmap, pv);
3318 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3319 			    TAILQ_EMPTY(&om->md.pv_list) &&
3320 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3321 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3322 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3323 		}
3324 		pmap_invalidate_page(pmap, va);
3325 		orig_l3 = 0;
3326 	} else {
3327 		/*
3328 		 * Increment the counters.
3329 		 */
3330 		if ((new_l3 & PTE_SW_WIRED) != 0)
3331 			pmap->pm_stats.wired_count++;
3332 		pmap_resident_count_inc(pmap, 1);
3333 	}
3334 	/*
3335 	 * Enter on the PV list if part of our managed memory.
3336 	 */
3337 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
3338 		if (pv == NULL) {
3339 			pv = get_pv_entry(pmap, &lock);
3340 			pv->pv_va = va;
3341 		}
3342 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3343 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3344 		m->md.pv_gen++;
3345 		if ((new_l3 & PTE_W) != 0)
3346 			vm_page_aflag_set(m, PGA_WRITEABLE);
3347 	}
3348 
3349 validate:
3350 	/*
3351 	 * Sync the i-cache on all harts before updating the PTE
3352 	 * if the new PTE is executable.
3353 	 */
3354 	if (prot & VM_PROT_EXECUTE)
3355 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3356 
3357 	/*
3358 	 * Update the L3 entry.
3359 	 */
3360 	if (orig_l3 != 0) {
3361 		orig_l3 = pmap_load_store(l3, new_l3);
3362 		pmap_invalidate_page(pmap, va);
3363 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3364 		    ("pmap_enter: invalid update"));
3365 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3366 		    (PTE_D | PTE_SW_MANAGED))
3367 			vm_page_dirty(m);
3368 	} else {
3369 		pmap_store(l3, new_l3);
3370 	}
3371 
3372 #if VM_NRESERVLEVEL > 0
3373 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3374 	    (m->flags & PG_FICTITIOUS) == 0 &&
3375 	    vm_reserv_level_iffullpop(m) == 0)
3376 		(void)pmap_promote_l2(pmap, l2, va, mpte, &lock);
3377 #endif
3378 
3379 	rv = KERN_SUCCESS;
3380 out:
3381 	if (lock != NULL)
3382 		rw_wunlock(lock);
3383 	rw_runlock(&pvh_global_lock);
3384 	PMAP_UNLOCK(pmap);
3385 	return (rv);
3386 }
3387 
3388 /*
3389  * Release a page table page reference after a failed attempt to create a
3390  * mapping.
3391  */
3392 static void
3393 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t l2pg)
3394 {
3395 	struct spglist free;
3396 
3397 	SLIST_INIT(&free);
3398 	if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3399 		/*
3400 		 * Although "va" is not mapped, paging-structure
3401 		 * caches could nonetheless have entries that
3402 		 * refer to the freed page table pages.
3403 		 * Invalidate those entries.
3404 		 */
3405 		pmap_invalidate_page(pmap, va);
3406 		vm_page_free_pages_toq(&free, true);
3407 	}
3408 }
3409 
3410 /*
3411  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
3412  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
3413  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
3414  * "no replace", and "no reclaim" are specified.
3415  */
3416 static int
3417 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3418     struct rwlock **lockp)
3419 {
3420 	pd_entry_t new_l2;
3421 	pn_t pn;
3422 
3423 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3424 
3425 	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3426 	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V |
3427 	    pmap_memattr_bits(m->md.pv_memattr));
3428 	if ((m->oflags & VPO_UNMANAGED) == 0)
3429 		new_l2 |= PTE_SW_MANAGED;
3430 	if ((prot & VM_PROT_EXECUTE) != 0)
3431 		new_l2 |= PTE_X;
3432 	if (va < VM_MAXUSER_ADDRESS)
3433 		new_l2 |= PTE_U;
3434 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3435 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp));
3436 }
3437 
3438 /*
3439  * Returns true if every page table entry in the specified page table is
3440  * zero.
3441  */
3442 static bool
3443 pmap_every_pte_zero(vm_paddr_t pa)
3444 {
3445 	pt_entry_t *pt_end, *pte;
3446 
3447 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
3448 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
3449 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
3450 		if (*pte != 0)
3451 			return (false);
3452 	}
3453 	return (true);
3454 }
3455 
3456 /*
3457  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3458  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
3459  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
3460  * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
3461  * within the 2MB virtual address range starting at the specified virtual
3462  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
3463  * 2MB page mapping already exists at the specified virtual address.  Returns
3464  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
3465  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
3466  * and a PV entry allocation failed.
3467  *
3468  * The parameter "m" is only used when creating a managed, writeable mapping.
3469  */
3470 static int
3471 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3472     vm_page_t m, struct rwlock **lockp)
3473 {
3474 	struct spglist free;
3475 	pd_entry_t *l2, *l3, oldl2;
3476 	vm_offset_t sva;
3477 	vm_page_t l2pg, mt;
3478 	vm_page_t uwptpg;
3479 
3480 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3481 
3482 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3483 	    NULL : lockp)) == NULL) {
3484 		CTR2(KTR_PMAP, "pmap_enter_l2: failed to allocate PT page"
3485 		    " for va %#lx in pmap %p", va, pmap);
3486 		return (KERN_RESOURCE_SHORTAGE);
3487 	}
3488 
3489 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
3490 	l2 = &l2[pmap_l2_index(va)];
3491 	if ((oldl2 = pmap_load(l2)) != 0) {
3492 		KASSERT(l2pg->ref_count > 1,
3493 		    ("pmap_enter_l2: l2pg's ref count is too low"));
3494 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3495 			if ((oldl2 & PTE_RWX) != 0) {
3496 				l2pg->ref_count--;
3497 				CTR2(KTR_PMAP,
3498 				    "pmap_enter_l2: no space for va %#lx"
3499 				    " in pmap %p", va, pmap);
3500 				return (KERN_NO_SPACE);
3501 			} else if (va < VM_MAXUSER_ADDRESS ||
3502 			    !pmap_every_pte_zero(L2PTE_TO_PHYS(oldl2))) {
3503 				l2pg->ref_count--;
3504 				CTR2(KTR_PMAP, "pmap_enter_l2:"
3505 				    " failed to replace existing mapping"
3506 				    " for va %#lx in pmap %p", va, pmap);
3507 				return (KERN_FAILURE);
3508 			}
3509 		}
3510 		SLIST_INIT(&free);
3511 		if ((oldl2 & PTE_RWX) != 0)
3512 			(void)pmap_remove_l2(pmap, l2, va,
3513 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3514 		else
3515 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3516 				l3 = pmap_l2_to_l3(l2, sva);
3517 				if ((pmap_load(l3) & PTE_V) != 0 &&
3518 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3519 				    lockp) != 0)
3520 					break;
3521 			}
3522 		vm_page_free_pages_toq(&free, true);
3523 		if (va >= VM_MAXUSER_ADDRESS) {
3524 			/*
3525 			 * Both pmap_remove_l2() and pmap_remove_l3() will
3526 			 * leave the kernel page table page zero filled.
3527 			 */
3528 			mt = PTE_TO_VM_PAGE(pmap_load(l2));
3529 			if (pmap_insert_pt_page(pmap, mt, false, false))
3530 				panic("pmap_enter_l2: trie insert failed");
3531 		} else
3532 			KASSERT(pmap_load(l2) == 0,
3533 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
3534 	}
3535 
3536 	/*
3537 	 * Allocate a leaf page table page for wired userspace pages.
3538 	 */
3539 	uwptpg = NULL;
3540 	if ((new_l2 & PTE_SW_WIRED) != 0 && pmap != kernel_pmap) {
3541 		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3542 		if (uwptpg == NULL) {
3543 			pmap_abort_ptp(pmap, va, l2pg);
3544 			return (KERN_RESOURCE_SHORTAGE);
3545 		}
3546 		uwptpg->pindex = pmap_l2_pindex(va);
3547 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
3548 			vm_page_unwire_noq(uwptpg);
3549 			vm_page_free(uwptpg);
3550 			pmap_abort_ptp(pmap, va, l2pg);
3551 			return (KERN_RESOURCE_SHORTAGE);
3552 		}
3553 		pmap_resident_count_inc(pmap, 1);
3554 		uwptpg->ref_count = Ln_ENTRIES;
3555 	}
3556 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
3557 		/*
3558 		 * Abort this mapping if its PV entry could not be created.
3559 		 */
3560 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3561 			pmap_abort_ptp(pmap, va, l2pg);
3562 			if (uwptpg != NULL) {
3563 				mt = pmap_remove_pt_page(pmap, va);
3564 				KASSERT(mt == uwptpg,
3565 				    ("removed pt page %p, expected %p", mt,
3566 				    uwptpg));
3567 				pmap_resident_count_dec(pmap, 1);
3568 				uwptpg->ref_count = 1;
3569 				vm_page_unwire_noq(uwptpg);
3570 				vm_page_free(uwptpg);
3571 			}
3572 			CTR2(KTR_PMAP,
3573 			    "pmap_enter_l2: failed to create PV entry"
3574 			    " for va %#lx in pmap %p", va, pmap);
3575 			return (KERN_RESOURCE_SHORTAGE);
3576 		}
3577 		if ((new_l2 & PTE_W) != 0)
3578 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3579 				vm_page_aflag_set(mt, PGA_WRITEABLE);
3580 	}
3581 
3582 	/*
3583 	 * Increment counters.
3584 	 */
3585 	if ((new_l2 & PTE_SW_WIRED) != 0)
3586 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3587 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3588 
3589 	/*
3590 	 * Map the superpage.
3591 	 */
3592 	pmap_store(l2, new_l2);
3593 
3594 	atomic_add_long(&pmap_l2_mappings, 1);
3595 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3596 	    va, pmap);
3597 
3598 	return (KERN_SUCCESS);
3599 }
3600 
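/*
 * Illustrative summary (not part of the original code) of the return
 * contract implemented above:
 *
 *	PMAP_ENTER_NOREPLACE, existing 2MB mapping        -> KERN_NO_SPACE
 *	PMAP_ENTER_NOREPLACE, existing 4KB mapping(s)     -> KERN_FAILURE
 *	PMAP_ENTER_NOSLEEP, PT page allocation failure    -> KERN_RESOURCE_SHORTAGE
 *	PMAP_ENTER_NORECLAIM, PV entry allocation failure -> KERN_RESOURCE_SHORTAGE
 *	otherwise                                         -> KERN_SUCCESS
 */
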
3601 /*
3602  * Maps a sequence of resident pages belonging to the same object.
3603  * The sequence begins with the given page m_start.  This page is
3604  * mapped at the given virtual address start.  Each subsequent page is
3605  * mapped at a virtual address that is offset from start by the same
3606  * amount as the page is offset from m_start within the object.  The
3607  * last page in the sequence is the page with the largest offset from
3608  * m_start that can be mapped at a virtual address less than the given
3609  * virtual address end.  Not every virtual page between start and end
3610  * is mapped; only those for which a resident page exists with the
3611  * corresponding offset from m_start are mapped.
3612  */
3613 void
3614 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3615     vm_page_t m_start, vm_prot_t prot)
3616 {
3617 	struct rwlock *lock;
3618 	vm_offset_t va;
3619 	vm_page_t m, mpte;
3620 	vm_pindex_t diff, psize;
3621 	int rv;
3622 
3623 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3624 
3625 	psize = atop(end - start);
3626 	mpte = NULL;
3627 	m = m_start;
3628 	lock = NULL;
3629 	rw_rlock(&pvh_global_lock);
3630 	PMAP_LOCK(pmap);
3631 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3632 		va = start + ptoa(diff);
3633 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3634 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
3635 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
3636 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
3637 			m = &m[L2_SIZE / PAGE_SIZE - 1];
3638 		else
3639 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3640 			    &lock);
3641 		m = TAILQ_NEXT(m, listq);
3642 	}
3643 	if (lock != NULL)
3644 		rw_wunlock(lock);
3645 	rw_runlock(&pvh_global_lock);
3646 	PMAP_UNLOCK(pmap);
3647 }
3648 
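/*
 * Illustrative example (values are hypothetical): for a call with
 * start == 0x101f0000 and end == 0x10800000, the first iteration that can
 * take the 2MB path is the one with va == 0x10200000, since that address
 * is L2-aligned and va + L2_SIZE == 0x10400000 <= end; it also requires
 * that the corresponding page has psind == 1 and that superpages are
 * enabled.  Pages that do not satisfy these conditions are entered
 * individually via pmap_enter_quick_locked().
 */
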
3649 /*
3650  * This code makes some *MAJOR* assumptions:
3651  * 1. The current pmap and the given pmap exist.
3652  * 2. Not wired.
3653  * 3. Read access.
3654  * 4. No page table pages.
3655  * but is *MUCH* faster than pmap_enter...
3656  */
3657 
3658 void
3659 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3660 {
3661 	struct rwlock *lock;
3662 
3663 	lock = NULL;
3664 	rw_rlock(&pvh_global_lock);
3665 	PMAP_LOCK(pmap);
3666 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3667 	if (lock != NULL)
3668 		rw_wunlock(lock);
3669 	rw_runlock(&pvh_global_lock);
3670 	PMAP_UNLOCK(pmap);
3671 }
3672 
3673 static vm_page_t
3674 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3675     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3676 {
3677 	struct spglist free;
3678 	pd_entry_t *l2;
3679 	pt_entry_t *l3, newl3;
3680 
3681 	KASSERT(!VA_IS_CLEANMAP(va) ||
3682 	    (m->oflags & VPO_UNMANAGED) != 0,
3683 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3684 	rw_assert(&pvh_global_lock, RA_LOCKED);
3685 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3686 	l2 = NULL;
3687 
3688 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3689 	/*
3690 	 * If a page table page is not resident, we create it here.
3692 	 */
3693 	if (va < VM_MAXUSER_ADDRESS) {
3694 		vm_pindex_t l2pindex;
3695 
3696 		/*
3697 		 * Calculate the page table page index.
3698 		 */
3699 		l2pindex = pmap_l2_pindex(va);
3700 		if (mpte && (mpte->pindex == l2pindex)) {
3701 			mpte->ref_count++;
3702 		} else {
3703 			/*
3704 			 * Get the l2 entry
3705 			 */
3706 			l2 = pmap_l2(pmap, va);
3707 
3708 			/*
3709 			 * If the page table page is mapped, we just increment
3710 			 * its reference count.  Otherwise, we
3711 			 * attempt to allocate a page table page.  If this
3712 			 * attempt fails, we don't retry.  Instead, we give up.
3713 			 */
3714 			if (l2 != NULL && pmap_load(l2) != 0) {
3715 				if ((pmap_load(l2) & PTE_RWX) != 0)
3716 					return (NULL);
3717 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
3718 				mpte->ref_count++;
3719 			} else {
3720 				/*
3721 				 * Pass NULL instead of the PV list lock
3722 				 * pointer, because we don't intend to sleep.
3723 				 */
3724 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3725 				if (mpte == NULL)
3726 					return (mpte);
3727 			}
3728 		}
3729 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3730 		l3 = &l3[pmap_l3_index(va)];
3731 	} else {
3732 		mpte = NULL;
3733 		l3 = pmap_l3(kernel_pmap, va);
3734 	}
3735 	if (l3 == NULL)
3736 		panic("pmap_enter_quick_locked: No l3");
3737 	if (pmap_load(l3) != 0) {
3738 		if (mpte != NULL)
3739 			mpte->ref_count--;
3740 		return (NULL);
3741 	}
3742 
3743 	/*
3744 	 * Enter on the PV list if part of our managed memory.
3745 	 */
3746 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3747 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3748 		if (mpte != NULL) {
3749 			SLIST_INIT(&free);
3750 			if (pmap_unwire_ptp(pmap, va, mpte, &free))
3751 				vm_page_free_pages_toq(&free, false);
3752 		}
3753 		return (NULL);
3754 	}
3755 
3756 	/*
3757 	 * Increment counters
3758 	 */
3759 	pmap_resident_count_inc(pmap, 1);
3760 
3761 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3762 	    PTE_V | PTE_R | pmap_memattr_bits(m->md.pv_memattr);
3763 	if ((prot & VM_PROT_EXECUTE) != 0)
3764 		newl3 |= PTE_X;
3765 	if ((m->oflags & VPO_UNMANAGED) == 0)
3766 		newl3 |= PTE_SW_MANAGED;
3767 	if (va < VM_MAX_USER_ADDRESS)
3768 		newl3 |= PTE_U;
3769 
3770 	/*
3771 	 * Sync the i-cache on all harts before updating the PTE
3772 	 * if the new PTE is executable.
3773 	 */
3774 	if (prot & VM_PROT_EXECUTE)
3775 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3776 
3777 	pmap_store(l3, newl3);
3778 
3779 #if VM_NRESERVLEVEL > 0
3780 	/*
3781 	 * If both the PTP and the reservation are fully populated, then attempt
3782 	 * promotion.
3783 	 */
3784 	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
3785 	    (mpte == NULL || mpte->ref_count == Ln_ENTRIES) &&
3786 	    (m->flags & PG_FICTITIOUS) == 0 &&
3787 	    vm_reserv_level_iffullpop(m) == 0) {
3788 		if (l2 == NULL)
3789 			l2 = pmap_l2(pmap, va);
3790 
3791 		/*
3792 		 * If promotion succeeds, then the next call to this function
3793 		 * should not be given the unmapped PTP as a hint.
3794 		 */
3795 		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
3796 			mpte = NULL;
3797 	}
3798 #endif
3799 
3800 	return (mpte);
3801 }
3802 
3803 /*
3804  * This code would map large physical mmap regions into the
3805  * processor address space.  On riscv it is currently a no-op
3806  * aside from the assertions below.
3807  */
3808 void
3809 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3810     vm_pindex_t pindex, vm_size_t size)
3811 {
3812 
3813 	VM_OBJECT_ASSERT_WLOCKED(object);
3814 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3815 	    ("pmap_object_init_pt: non-device object"));
3816 }
3817 
3818 /*
3819  *	Clear the wired attribute from the mappings for the specified range of
3820  *	addresses in the given pmap.  Every valid mapping within that range
3821  *	must have the wired attribute set.  In contrast, invalid mappings
3822  *	cannot have the wired attribute set, so they are ignored.
3823  *
3824  *	The wired attribute of the page table entry is not a hardware feature,
3825  *	so there is no need to invalidate any TLB entries.
3826  */
3827 void
3828 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3829 {
3830 	vm_offset_t va_next;
3831 	pd_entry_t *l0, *l1, *l2, l2e;
3832 	pt_entry_t *l3, l3e;
3833 	bool pv_lists_locked;
3834 
3835 	pv_lists_locked = false;
3836 retry:
3837 	PMAP_LOCK(pmap);
3838 	for (; sva < eva; sva = va_next) {
3839 		if (pmap_mode == PMAP_MODE_SV48) {
3840 			l0 = pmap_l0(pmap, sva);
3841 			if (pmap_load(l0) == 0) {
3842 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3843 				if (va_next < sva)
3844 					va_next = eva;
3845 				continue;
3846 			}
3847 			l1 = pmap_l0_to_l1(l0, sva);
3848 		} else {
3849 			l1 = pmap_l1(pmap, sva);
3850 		}
3851 
3852 		if (pmap_load(l1) == 0) {
3853 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3854 			if (va_next < sva)
3855 				va_next = eva;
3856 			continue;
3857 		}
3858 
3859 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3860 		if (va_next < sva)
3861 			va_next = eva;
3862 
3863 		l2 = pmap_l1_to_l2(l1, sva);
3864 		if ((l2e = pmap_load(l2)) == 0)
3865 			continue;
3866 		if ((l2e & PTE_RWX) != 0) {
3867 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3868 				if ((l2e & PTE_SW_WIRED) == 0)
3869 					panic("pmap_unwire: l2 %#jx is missing "
3870 					    "PTE_SW_WIRED", (uintmax_t)l2e);
3871 				pmap_clear_bits(l2, PTE_SW_WIRED);
3872 				continue;
3873 			} else {
3874 				if (!pv_lists_locked) {
3875 					pv_lists_locked = true;
3876 					if (!rw_try_rlock(&pvh_global_lock)) {
3877 						PMAP_UNLOCK(pmap);
3878 						rw_rlock(&pvh_global_lock);
3879 						/* Repeat sva. */
3880 						goto retry;
3881 					}
3882 				}
3883 				if (!pmap_demote_l2(pmap, l2, sva))
3884 					panic("pmap_unwire: demotion failed");
3885 			}
3886 		}
3887 
3888 		if (va_next > eva)
3889 			va_next = eva;
3890 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3891 		    sva += L3_SIZE) {
3892 			if ((l3e = pmap_load(l3)) == 0)
3893 				continue;
3894 			if ((l3e & PTE_SW_WIRED) == 0)
3895 				panic("pmap_unwire: l3 %#jx is missing "
3896 				    "PTE_SW_WIRED", (uintmax_t)l3e);
3897 
3898 			/*
3899 			 * PTE_SW_WIRED must be cleared atomically.  Although
3900 			 * the pmap lock synchronizes access to it, another
3901 			 * processor could set PTE_D and/or PTE_A concurrently.
3902 			 */
3903 			pmap_clear_bits(l3, PTE_SW_WIRED);
3904 			pmap->pm_stats.wired_count--;
3905 		}
3906 	}
3907 	if (pv_lists_locked)
3908 		rw_runlock(&pvh_global_lock);
3909 	PMAP_UNLOCK(pmap);
3910 }
3911 
3912 /*
3913  *	Copy the range specified by src_addr/len
3914  *	from the source map to the range dst_addr/len
3915  *	in the destination map.
3916  *
3917  *	This routine is only advisory and need not do anything.
3918  */
3919 
3920 void
3921 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3922     vm_offset_t src_addr)
3923 {
3924 
3925 }
3926 
3927 /*
3928  *	pmap_zero_page zeros the specified hardware page through the
3929  *	page's direct map address.
3930  */
3931 void
3932 pmap_zero_page(vm_page_t m)
3933 {
3934 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3935 
3936 	pagezero((void *)va);
3937 }
3938 
3939 /*
3940  *	pmap_zero_page_area zeros the specified portion of a hardware page
3941  *	through the page's direct map address.
3942  *
3943  *	off and size may not cover an area beyond a single hardware page.
3944  */
3945 void
3946 pmap_zero_page_area(vm_page_t m, int off, int size)
3947 {
3948 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3949 
3950 	if (off == 0 && size == PAGE_SIZE)
3951 		pagezero((void *)va);
3952 	else
3953 		bzero((char *)va + off, size);
3954 }
3955 
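/*
 * Usage sketch (illustrative): zeroing half of a page takes the bzero()
 * path above, while covering the whole page takes the pagezero() path:
 *
 *	pmap_zero_page_area(m, PAGE_SIZE / 2, PAGE_SIZE / 2);
 *	pmap_zero_page_area(m, 0, PAGE_SIZE);
 */
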
3956 /*
3957  *	pmap_copy_page copies the specified (machine independent) page
3958  *	using the direct map addresses of the source and destination
3959  *	pages.
3961  */
3962 void
3963 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3964 {
3965 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3966 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3967 
3968 	pagecopy((void *)src, (void *)dst);
3969 }
3970 
3971 int unmapped_buf_allowed = 1;
3972 
3973 void
3974 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3975     vm_offset_t b_offset, int xfersize)
3976 {
3977 	void *a_cp, *b_cp;
3978 	vm_page_t m_a, m_b;
3979 	vm_paddr_t p_a, p_b;
3980 	vm_offset_t a_pg_offset, b_pg_offset;
3981 	int cnt;
3982 
3983 	while (xfersize > 0) {
3984 		a_pg_offset = a_offset & PAGE_MASK;
3985 		m_a = ma[a_offset >> PAGE_SHIFT];
3986 		p_a = m_a->phys_addr;
3987 		b_pg_offset = b_offset & PAGE_MASK;
3988 		m_b = mb[b_offset >> PAGE_SHIFT];
3989 		p_b = m_b->phys_addr;
3990 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3991 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3992 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3993 			panic("!DMAP a %lx", p_a);
3994 		} else {
3995 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3996 		}
3997 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3998 			panic("!DMAP b %lx", p_b);
3999 		} else {
4000 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4001 		}
4002 		bcopy(a_cp, b_cp, cnt);
4003 		a_offset += cnt;
4004 		b_offset += cnt;
4005 		xfersize -= cnt;
4006 	}
4007 }
4008 
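/*
 * Illustrative example: with PAGE_SIZE == 4096, a transfer of
 * xfersize == 0x300 bytes at a_offset == 0xf00 and b_offset == 0x100 is
 * split into two chunks.  The first iteration copies
 * cnt = min(0x300, 0x1000 - 0xf00, 0x1000 - 0x100) = 0x100 bytes, limited
 * by the source page boundary; the second copies the remaining 0x200
 * bytes starting at the beginning of the next source page.
 */
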
4009 vm_offset_t
4010 pmap_quick_enter_page(vm_page_t m)
4011 {
4012 
4013 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4014 }
4015 
4016 void
4017 pmap_quick_remove_page(vm_offset_t addr)
4018 {
4019 }
4020 
4021 /*
4022  * Returns true if the pmap's pv is one of the first
4023  * 16 pvs linked to from this page.  This count may
4024  * be changed upwards or downwards in the future; it
4025  * is only necessary that true be returned for a small
4026  * subset of pmaps for proper page aging.
4027  */
4028 bool
4029 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4030 {
4031 	struct md_page *pvh;
4032 	struct rwlock *lock;
4033 	pv_entry_t pv;
4034 	int loops = 0;
4035 	bool rv;
4036 
4037 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4038 	    ("pmap_page_exists_quick: page %p is not managed", m));
4039 	rv = false;
4040 	rw_rlock(&pvh_global_lock);
4041 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4042 	rw_rlock(lock);
4043 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4044 		if (PV_PMAP(pv) == pmap) {
4045 			rv = true;
4046 			break;
4047 		}
4048 		loops++;
4049 		if (loops >= 16)
4050 			break;
4051 	}
4052 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4053 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4054 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4055 			if (PV_PMAP(pv) == pmap) {
4056 				rv = true;
4057 				break;
4058 			}
4059 			loops++;
4060 			if (loops >= 16)
4061 				break;
4062 		}
4063 	}
4064 	rw_runlock(lock);
4065 	rw_runlock(&pvh_global_lock);
4066 	return (rv);
4067 }
4068 
4069 /*
4070  *	pmap_page_wired_mappings:
4071  *
4072  *	Return the number of managed mappings to the given physical page
4073  *	that are wired.
4074  */
4075 int
4076 pmap_page_wired_mappings(vm_page_t m)
4077 {
4078 	struct md_page *pvh;
4079 	struct rwlock *lock;
4080 	pmap_t pmap;
4081 	pd_entry_t *l2;
4082 	pt_entry_t *l3;
4083 	pv_entry_t pv;
4084 	int count, md_gen, pvh_gen;
4085 
4086 	if ((m->oflags & VPO_UNMANAGED) != 0)
4087 		return (0);
4088 	rw_rlock(&pvh_global_lock);
4089 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4090 	rw_rlock(lock);
4091 restart:
4092 	count = 0;
4093 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4094 		pmap = PV_PMAP(pv);
4095 		if (!PMAP_TRYLOCK(pmap)) {
4096 			md_gen = m->md.pv_gen;
4097 			rw_runlock(lock);
4098 			PMAP_LOCK(pmap);
4099 			rw_rlock(lock);
4100 			if (md_gen != m->md.pv_gen) {
4101 				PMAP_UNLOCK(pmap);
4102 				goto restart;
4103 			}
4104 		}
4105 		l2 = pmap_l2(pmap, pv->pv_va);
4106 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4107 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4108 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4109 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
4110 			count++;
4111 		PMAP_UNLOCK(pmap);
4112 	}
4113 	if ((m->flags & PG_FICTITIOUS) == 0) {
4114 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4115 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4116 			pmap = PV_PMAP(pv);
4117 			if (!PMAP_TRYLOCK(pmap)) {
4118 				md_gen = m->md.pv_gen;
4119 				pvh_gen = pvh->pv_gen;
4120 				rw_runlock(lock);
4121 				PMAP_LOCK(pmap);
4122 				rw_rlock(lock);
4123 				if (md_gen != m->md.pv_gen ||
4124 				    pvh_gen != pvh->pv_gen) {
4125 					PMAP_UNLOCK(pmap);
4126 					goto restart;
4127 				}
4128 			}
4129 			l2 = pmap_l2(pmap, pv->pv_va);
4130 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
4131 				count++;
4132 			PMAP_UNLOCK(pmap);
4133 		}
4134 	}
4135 	rw_runlock(lock);
4136 	rw_runlock(&pvh_global_lock);
4137 	return (count);
4138 }
4139 
4140 /*
4141  * Returns true if the given page is mapped individually or as part of
4142  * a 2mpage.  Otherwise, returns false.
4143  */
4144 bool
4145 pmap_page_is_mapped(vm_page_t m)
4146 {
4147 	struct rwlock *lock;
4148 	bool rv;
4149 
4150 	if ((m->oflags & VPO_UNMANAGED) != 0)
4151 		return (false);
4152 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4153 	rw_rlock(lock);
4154 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4155 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4156 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4157 	rw_runlock(lock);
4158 	return (rv);
4159 }
4160 
4161 static void
4162 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
4163     struct spglist *free, bool superpage)
4164 {
4165 	struct md_page *pvh;
4166 	vm_page_t mpte, mt;
4167 
4168 	if (superpage) {
4169 		pmap_resident_count_dec(pmap, Ln_ENTRIES);
4170 		pvh = pa_to_pvh(m->phys_addr);
4171 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4172 		pvh->pv_gen++;
4173 		if (TAILQ_EMPTY(&pvh->pv_list)) {
4174 			for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
4175 				if (TAILQ_EMPTY(&mt->md.pv_list) &&
4176 				    (mt->a.flags & PGA_WRITEABLE) != 0)
4177 					vm_page_aflag_clear(mt, PGA_WRITEABLE);
4178 		}
4179 		mpte = pmap_remove_pt_page(pmap, pv->pv_va);
4180 		if (mpte != NULL) {
4181 			KASSERT(vm_page_any_valid(mpte),
4182 			    ("pmap_remove_pages: pte page not promoted"));
4183 			pmap_resident_count_dec(pmap, 1);
4184 			KASSERT(mpte->ref_count == Ln_ENTRIES,
4185 			    ("pmap_remove_pages: pte page ref count error"));
4186 			mpte->ref_count = 0;
4187 			pmap_add_delayed_free_list(mpte, free, false);
4188 		}
4189 	} else {
4190 		pmap_resident_count_dec(pmap, 1);
4191 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4192 		m->md.pv_gen++;
4193 		if (TAILQ_EMPTY(&m->md.pv_list) &&
4194 		    (m->a.flags & PGA_WRITEABLE) != 0) {
4195 			pvh = pa_to_pvh(m->phys_addr);
4196 			if (TAILQ_EMPTY(&pvh->pv_list))
4197 				vm_page_aflag_clear(m, PGA_WRITEABLE);
4198 		}
4199 	}
4200 }
4201 
4202 /*
4203  * Destroy all managed, non-wired mappings in the given user-space
4204  * pmap.  This pmap cannot be active on any processor besides the
4205  * caller.
4206  *
4207  * This function cannot be applied to the kernel pmap.  Moreover, it
4208  * is not intended for general use.  It is only to be used during
4209  * process termination.  Consequently, it can be implemented in ways
4210  * that make it faster than pmap_remove().  First, it can more quickly
4211  * destroy mappings by iterating over the pmap's collection of PV
4212  * entries, rather than searching the page table.  Second, it doesn't
4213  * have to test and clear the page table entries atomically, because
4214  * no processor is currently accessing the user address space.  In
4215  * particular, a page table entry's dirty bit won't change state once
4216  * this function starts.
4217  */
4218 void
4219 pmap_remove_pages(pmap_t pmap)
4220 {
4221 	struct spglist free;
4222 	pd_entry_t ptepde;
4223 	pt_entry_t *pte, tpte;
4224 	vm_page_t m, mt;
4225 	pv_entry_t pv;
4226 	struct pv_chunk *pc, *npc;
4227 	struct rwlock *lock;
4228 	int64_t bit;
4229 	uint64_t inuse, bitmask;
4230 	int allfree, field, freed __pv_stat_used, idx;
4231 	bool superpage;
4232 
4233 	lock = NULL;
4234 
4235 	SLIST_INIT(&free);
4236 	rw_rlock(&pvh_global_lock);
4237 	PMAP_LOCK(pmap);
4238 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4239 		allfree = 1;
4240 		freed = 0;
4241 		for (field = 0; field < _NPCM; field++) {
4242 			inuse = ~pc->pc_map[field] & pc_freemask[field];
4243 			while (inuse != 0) {
4244 				bit = ffsl(inuse) - 1;
4245 				bitmask = 1UL << bit;
4246 				idx = field * 64 + bit;
4247 				pv = &pc->pc_pventry[idx];
4248 				inuse &= ~bitmask;
4249 
4250 				pte = pmap_l1(pmap, pv->pv_va);
4251 				ptepde = pmap_load(pte);
4252 				pte = pmap_l1_to_l2(pte, pv->pv_va);
4253 				tpte = pmap_load(pte);
4254 
4255 				KASSERT((tpte & PTE_V) != 0,
4256 				    ("L2 PTE is invalid... bogus PV entry? "
4257 				    "va=%#lx, pte=%#lx", pv->pv_va, tpte));
4258 				if ((tpte & PTE_RWX) != 0) {
4259 					superpage = true;
4260 				} else {
4261 					ptepde = tpte;
4262 					pte = pmap_l2_to_l3(pte, pv->pv_va);
4263 					tpte = pmap_load(pte);
4264 					superpage = false;
4265 				}
4266 
4267 				/*
4268 				 * We cannot remove wired pages from a
4269 				 * process' mapping at this time.
4270 				 */
4271 				if (tpte & PTE_SW_WIRED) {
4272 					allfree = 0;
4273 					continue;
4274 				}
4275 
4276 				m = PTE_TO_VM_PAGE(tpte);
4277 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
4278 				    m < &vm_page_array[vm_page_array_size],
4279 				    ("pmap_remove_pages: bad pte %#jx",
4280 				    (uintmax_t)tpte));
4281 
4282 				pmap_clear(pte);
4283 
4284 				/*
4285 				 * Update the vm_page_t clean/reference bits.
4286 				 */
4287 				if ((tpte & (PTE_D | PTE_W)) ==
4288 				    (PTE_D | PTE_W)) {
4289 					if (superpage)
4290 						for (mt = m;
4291 						    mt < &m[Ln_ENTRIES]; mt++)
4292 							vm_page_dirty(mt);
4293 					else
4294 						vm_page_dirty(m);
4295 				}
4296 
4297 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
4298 
4299 				/* Mark free */
4300 				pc->pc_map[field] |= bitmask;
4301 
4302 				pmap_remove_pages_pv(pmap, m, pv, &free,
4303 				    superpage);
4304 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
4305 				freed++;
4306 			}
4307 		}
4308 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
4309 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
4310 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
4311 		if (allfree) {
4312 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
4313 			free_pv_chunk(pc);
4314 		}
4315 	}
4316 	if (lock != NULL)
4317 		rw_wunlock(lock);
4318 	pmap_invalidate_all(pmap);
4319 	rw_runlock(&pvh_global_lock);
4320 	PMAP_UNLOCK(pmap);
4321 	vm_page_free_pages_toq(&free, false);
4322 }
4323 
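/*
 * Illustrative example of the PV chunk iteration above (assuming
 * pc_freemask[1] has all bits set): if pc_map[1] has only bits 0 and 3
 * clear (i.e., those entries are in use), then inuse == 0x9 for that
 * field, and the inner loop visits the PV entries at
 * idx == 1 * 64 + 0 == 64 and idx == 1 * 64 + 3 == 67.
 */
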
4324 static bool
4325 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
4326 {
4327 	struct md_page *pvh;
4328 	struct rwlock *lock;
4329 	pd_entry_t *l2;
4330 	pt_entry_t *l3, mask;
4331 	pv_entry_t pv;
4332 	pmap_t pmap;
4333 	int md_gen, pvh_gen;
4334 	bool rv;
4335 
4336 	mask = 0;
4337 	if (modified)
4338 		mask |= PTE_D;
4339 	if (accessed)
4340 		mask |= PTE_A;
4341 
4342 	rv = false;
4343 	rw_rlock(&pvh_global_lock);
4344 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4345 	rw_rlock(lock);
4346 restart:
4347 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4348 		pmap = PV_PMAP(pv);
4349 		if (!PMAP_TRYLOCK(pmap)) {
4350 			md_gen = m->md.pv_gen;
4351 			rw_runlock(lock);
4352 			PMAP_LOCK(pmap);
4353 			rw_rlock(lock);
4354 			if (md_gen != m->md.pv_gen) {
4355 				PMAP_UNLOCK(pmap);
4356 				goto restart;
4357 			}
4358 		}
4359 		l2 = pmap_l2(pmap, pv->pv_va);
4360 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4361 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4362 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4363 		rv = (pmap_load(l3) & mask) == mask;
4364 		PMAP_UNLOCK(pmap);
4365 		if (rv)
4366 			goto out;
4367 	}
4368 	if ((m->flags & PG_FICTITIOUS) == 0) {
4369 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4370 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4371 			pmap = PV_PMAP(pv);
4372 			if (!PMAP_TRYLOCK(pmap)) {
4373 				md_gen = m->md.pv_gen;
4374 				pvh_gen = pvh->pv_gen;
4375 				rw_runlock(lock);
4376 				PMAP_LOCK(pmap);
4377 				rw_rlock(lock);
4378 				if (md_gen != m->md.pv_gen ||
4379 				    pvh_gen != pvh->pv_gen) {
4380 					PMAP_UNLOCK(pmap);
4381 					goto restart;
4382 				}
4383 			}
4384 			l2 = pmap_l2(pmap, pv->pv_va);
4385 			rv = (pmap_load(l2) & mask) == mask;
4386 			PMAP_UNLOCK(pmap);
4387 			if (rv)
4388 				goto out;
4389 		}
4390 	}
4391 out:
4392 	rw_runlock(lock);
4393 	rw_runlock(&pvh_global_lock);
4394 	return (rv);
4395 }
4396 
4397 /*
4398  *	pmap_is_modified:
4399  *
4400  *	Return whether or not the specified physical page was modified
4401  *	in any physical maps.
4402  */
4403 bool
4404 pmap_is_modified(vm_page_t m)
4405 {
4406 
4407 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4408 	    ("pmap_is_modified: page %p is not managed", m));
4409 
4410 	/*
4411 	 * If the page is not busied then this check is racy.
4412 	 */
4413 	if (!pmap_page_is_write_mapped(m))
4414 		return (false);
4415 	return (pmap_page_test_mappings(m, false, true));
4416 }
4417 
4418 /*
4419  *	pmap_is_prefaultable:
4420  *
4421  *	Return whether or not the specified virtual address is eligible
4422  *	for prefault.
4423  */
4424 bool
4425 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4426 {
4427 	pt_entry_t *l3;
4428 	bool rv;
4429 
4430 	/*
4431 	 * Return true if and only if the L3 entry for the specified virtual
4432 	 * address is allocated but invalid.
4433 	 */
4434 	rv = false;
4435 	PMAP_LOCK(pmap);
4436 	l3 = pmap_l3(pmap, addr);
4437 	if (l3 != NULL && pmap_load(l3) == 0) {
4438 		rv = true;
4439 	}
4440 	PMAP_UNLOCK(pmap);
4441 	return (rv);
4442 }
4443 
4444 /*
4445  *	pmap_is_referenced:
4446  *
4447  *	Return whether or not the specified physical page was referenced
4448  *	in any physical maps.
4449  */
4450 bool
4451 pmap_is_referenced(vm_page_t m)
4452 {
4453 
4454 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4455 	    ("pmap_is_referenced: page %p is not managed", m));
4456 	return (pmap_page_test_mappings(m, true, false));
4457 }
4458 
4459 /*
4460  * Clear the write and modified bits in each of the given page's mappings.
4461  */
4462 void
4463 pmap_remove_write(vm_page_t m)
4464 {
4465 	struct md_page *pvh;
4466 	struct rwlock *lock;
4467 	pmap_t pmap;
4468 	pd_entry_t *l2;
4469 	pt_entry_t *l3, oldl3, newl3;
4470 	pv_entry_t next_pv, pv;
4471 	vm_offset_t va;
4472 	int md_gen, pvh_gen;
4473 
4474 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4475 	    ("pmap_remove_write: page %p is not managed", m));
4476 	vm_page_assert_busied(m);
4477 
4478 	if (!pmap_page_is_write_mapped(m))
4479 		return;
4480 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4481 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4482 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4483 	rw_rlock(&pvh_global_lock);
4484 retry_pv_loop:
4485 	rw_wlock(lock);
4486 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4487 		pmap = PV_PMAP(pv);
4488 		if (!PMAP_TRYLOCK(pmap)) {
4489 			pvh_gen = pvh->pv_gen;
4490 			rw_wunlock(lock);
4491 			PMAP_LOCK(pmap);
4492 			rw_wlock(lock);
4493 			if (pvh_gen != pvh->pv_gen) {
4494 				PMAP_UNLOCK(pmap);
4495 				rw_wunlock(lock);
4496 				goto retry_pv_loop;
4497 			}
4498 		}
4499 		va = pv->pv_va;
4500 		l2 = pmap_l2(pmap, va);
4501 		if ((pmap_load(l2) & PTE_W) != 0)
4502 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4503 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4504 		    ("inconsistent pv lock %p %p for page %p",
4505 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4506 		PMAP_UNLOCK(pmap);
4507 	}
4508 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4509 		pmap = PV_PMAP(pv);
4510 		if (!PMAP_TRYLOCK(pmap)) {
4511 			pvh_gen = pvh->pv_gen;
4512 			md_gen = m->md.pv_gen;
4513 			rw_wunlock(lock);
4514 			PMAP_LOCK(pmap);
4515 			rw_wlock(lock);
4516 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4517 				PMAP_UNLOCK(pmap);
4518 				rw_wunlock(lock);
4519 				goto retry_pv_loop;
4520 			}
4521 		}
4522 		l2 = pmap_l2(pmap, pv->pv_va);
4523 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4524 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4525 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4526 		oldl3 = pmap_load(l3);
4527 retry:
4528 		if ((oldl3 & PTE_W) != 0) {
4529 			newl3 = oldl3 & ~(PTE_D | PTE_W);
4530 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4531 				goto retry;
4532 			if ((oldl3 & PTE_D) != 0)
4533 				vm_page_dirty(m);
4534 			pmap_invalidate_page(pmap, pv->pv_va);
4535 		}
4536 		PMAP_UNLOCK(pmap);
4537 	}
4538 	rw_wunlock(lock);
4539 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4540 	rw_runlock(&pvh_global_lock);
4541 }
4542 
4543 /*
4544  *	pmap_ts_referenced:
4545  *
4546  *	Return a count of reference bits for a page, clearing those bits.
4547  *	It is not necessary for every reference bit to be cleared, but it
4548  *	is necessary that 0 only be returned when there are truly no
4549  *	reference bits set.
4550  *
4551  *	As an optimization, update the page's dirty field if a modified bit is
4552  *	found while counting reference bits.  This opportunistic update can be
4553  *	performed at low cost and can eliminate the need for some future calls
4554  *	to pmap_is_modified().  However, since this function stops after
4555  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4556  *	dirty pages.  Those dirty pages will only be detected by a future call
4557  *	to pmap_is_modified().
4558  */
4559 int
4560 pmap_ts_referenced(vm_page_t m)
4561 {
4562 	struct spglist free;
4563 	struct md_page *pvh;
4564 	struct rwlock *lock;
4565 	pv_entry_t pv, pvf;
4566 	pmap_t pmap;
4567 	pd_entry_t *l2, l2e;
4568 	pt_entry_t *l3, l3e;
4569 	vm_paddr_t pa;
4570 	vm_offset_t va;
4571 	int cleared, md_gen, not_cleared, pvh_gen;
4572 
4573 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4574 	    ("pmap_ts_referenced: page %p is not managed", m));
4575 	SLIST_INIT(&free);
4576 	cleared = 0;
4577 	pa = VM_PAGE_TO_PHYS(m);
4578 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4579 
4580 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4581 	rw_rlock(&pvh_global_lock);
4582 	rw_wlock(lock);
4583 retry:
4584 	not_cleared = 0;
4585 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4586 		goto small_mappings;
4587 	pv = pvf;
4588 	do {
4589 		pmap = PV_PMAP(pv);
4590 		if (!PMAP_TRYLOCK(pmap)) {
4591 			pvh_gen = pvh->pv_gen;
4592 			rw_wunlock(lock);
4593 			PMAP_LOCK(pmap);
4594 			rw_wlock(lock);
4595 			if (pvh_gen != pvh->pv_gen) {
4596 				PMAP_UNLOCK(pmap);
4597 				goto retry;
4598 			}
4599 		}
4600 		va = pv->pv_va;
4601 		l2 = pmap_l2(pmap, va);
4602 		l2e = pmap_load(l2);
4603 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4604 			/*
4605 			 * Although l2e is mapping a 2MB page, because
4606 			 * this function is called at a 4KB page granularity,
4607 			 * we only update the 4KB page under test.
4608 			 */
4609 			vm_page_dirty(m);
4610 		}
4611 		if ((l2e & PTE_A) != 0) {
4612 			/*
4613 			 * Since this reference bit is shared by 512 4KB
4614 			 * pages, it should not be cleared every time it is
4615 			 * tested.  Apply a simple "hash" function on the
4616 			 * physical page number, the virtual superpage number,
4617 			 * and the pmap address to select one 4KB page out of
4618 			 * the 512 on which testing the reference bit will
4619 			 * result in clearing that reference bit.  This
4620 			 * function is designed to avoid the selection of the
4621 			 * same 4KB page for every 2MB page mapping.
4622 			 *
4623 			 * On demotion, a mapping that hasn't been referenced
4624 			 * is simply destroyed.  To avoid the possibility of a
4625 			 * subsequent page fault on a demoted wired mapping,
4626 			 * always leave its reference bit set.  Moreover,
4627 			 * since the superpage is wired, the current state of
4628 			 * its reference bit won't affect page replacement.
4629 			 */
4630 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4631 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4632 			    (l2e & PTE_SW_WIRED) == 0) {
4633 				pmap_clear_bits(l2, PTE_A);
4634 				pmap_invalidate_page(pmap, va);
4635 				cleared++;
4636 			} else
4637 				not_cleared++;
4638 		}
4639 		PMAP_UNLOCK(pmap);
4640 		/* Rotate the PV list if it has more than one entry. */
4641 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4642 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4643 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4644 			pvh->pv_gen++;
4645 		}
4646 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4647 			goto out;
4648 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4649 small_mappings:
4650 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4651 		goto out;
4652 	pv = pvf;
4653 	do {
4654 		pmap = PV_PMAP(pv);
4655 		if (!PMAP_TRYLOCK(pmap)) {
4656 			pvh_gen = pvh->pv_gen;
4657 			md_gen = m->md.pv_gen;
4658 			rw_wunlock(lock);
4659 			PMAP_LOCK(pmap);
4660 			rw_wlock(lock);
4661 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4662 				PMAP_UNLOCK(pmap);
4663 				goto retry;
4664 			}
4665 		}
4666 		l2 = pmap_l2(pmap, pv->pv_va);
4667 
4668 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
4669 		    ("pmap_ts_referenced: found an invalid l2 table"));
4670 
4671 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4672 		l3e = pmap_load(l3);
4673 		if ((l3e & PTE_D) != 0)
4674 			vm_page_dirty(m);
4675 		if ((l3e & PTE_A) != 0) {
4676 			if ((l3e & PTE_SW_WIRED) == 0) {
4677 				/*
4678 				 * Wired pages cannot be paged out so
4679 				 * doing accessed bit emulation for
4680 				 * them is wasted effort. We do the
4681 				 * hard work for unwired pages only.
4682 				 */
4683 				pmap_clear_bits(l3, PTE_A);
4684 				pmap_invalidate_page(pmap, pv->pv_va);
4685 				cleared++;
4686 			} else
4687 				not_cleared++;
4688 		}
4689 		PMAP_UNLOCK(pmap);
4690 		/* Rotate the PV list if it has more than one entry. */
4691 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4692 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4693 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4694 			m->md.pv_gen++;
4695 		}
4696 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4697 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4698 out:
4699 	rw_wunlock(lock);
4700 	rw_runlock(&pvh_global_lock);
4701 	vm_page_free_pages_toq(&free, false);
4702 	return (cleared + not_cleared);
4703 }
4704 
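/*
 * Illustrative example of the superpage "hash" used above: with
 * PAGE_SHIFT == 12, L2_SHIFT == 21, and Ln_ENTRIES == 512, the expression
 *
 *	((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & 511
 *
 * is zero for exactly one of the 512 4KB pages covered by a given 2MB
 * mapping, so only a pmap_ts_referenced() call on that particular page
 * clears the mapping's PTE_A bit, and the chosen page varies with the
 * superpage's virtual address and the pmap.
 */
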
4705 /*
4706  *	Apply the given advice to the specified range of addresses within the
4707  *	given pmap.  Depending on the advice, clear the referenced and/or
4708  *	modified flags in each mapping and set the mapped page's dirty field.
4709  */
4710 void
4711 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4712 {
4713 }
4714 
4715 /*
4716  *	Clear the modify bits on the specified physical page.
4717  */
4718 void
4719 pmap_clear_modify(vm_page_t m)
4720 {
4721 	struct md_page *pvh;
4722 	struct rwlock *lock;
4723 	pmap_t pmap;
4724 	pv_entry_t next_pv, pv;
4725 	pd_entry_t *l2, oldl2;
4726 	pt_entry_t *l3;
4727 	vm_offset_t va;
4728 	int md_gen, pvh_gen;
4729 
4730 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4731 	    ("%s: page %p is not managed", __func__, m));
4732 	vm_page_assert_busied(m);
4733 
4734 	if (!pmap_page_is_write_mapped(m))
4735 		return;
4736 
4737 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4738 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4739 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4740 	rw_rlock(&pvh_global_lock);
4741 	rw_wlock(lock);
4742 restart:
4743 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4744 		pmap = PV_PMAP(pv);
4745 		if (!PMAP_TRYLOCK(pmap)) {
4746 			pvh_gen = pvh->pv_gen;
4747 			rw_wunlock(lock);
4748 			PMAP_LOCK(pmap);
4749 			rw_wlock(lock);
4750 			if (pvh_gen != pvh->pv_gen) {
4751 				PMAP_UNLOCK(pmap);
4752 				goto restart;
4753 			}
4754 		}
4755 		va = pv->pv_va;
4756 		l2 = pmap_l2(pmap, va);
4757 		oldl2 = pmap_load(l2);
4758 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
4759 		if ((oldl2 & PTE_W) != 0 &&
4760 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4761 		    (oldl2 & PTE_SW_WIRED) == 0) {
4762 			/*
4763 			 * Write protect the mapping to a single page so that
4764 			 * a subsequent write access may repromote.
4765 			 */
4766 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4767 			l3 = pmap_l2_to_l3(l2, va);
4768 			pmap_clear_bits(l3, PTE_D | PTE_W);
4769 			vm_page_dirty(m);
4770 			pmap_invalidate_page(pmap, va);
4771 		}
4772 		PMAP_UNLOCK(pmap);
4773 	}
4774 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4775 		pmap = PV_PMAP(pv);
4776 		if (!PMAP_TRYLOCK(pmap)) {
4777 			md_gen = m->md.pv_gen;
4778 			pvh_gen = pvh->pv_gen;
4779 			rw_wunlock(lock);
4780 			PMAP_LOCK(pmap);
4781 			rw_wlock(lock);
4782 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4783 				PMAP_UNLOCK(pmap);
4784 				goto restart;
4785 			}
4786 		}
4787 		l2 = pmap_l2(pmap, pv->pv_va);
4788 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4789 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4790 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4791 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4792 			pmap_clear_bits(l3, PTE_D | PTE_W);
4793 			pmap_invalidate_page(pmap, pv->pv_va);
4794 		}
4795 		PMAP_UNLOCK(pmap);
4796 	}
4797 	rw_wunlock(lock);
4798 	rw_runlock(&pvh_global_lock);
4799 }
4800 
4801 void *
4802 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4803 {
4804 
4805 	return ((void *)PHYS_TO_DMAP(pa));
4806 }
4807 
4808 void
4809 pmap_unmapbios(void *p, vm_size_t size)
4810 {
4811 }
4812 
4813 /*
4814  * Sets the memory attribute for the specified page.
4815  */
4816 void
4817 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4818 {
4819 
4820 	m->md.pv_memattr = ma;
4821 
4822 	/*
4823 	 * If "m" is a normal page, update its direct mapping.  This update
4824 	 * can be relied upon to perform any cache operations that are
4825 	 * required for data coherence.
4826 	 */
4827 	if ((m->flags & PG_FICTITIOUS) == 0 &&
4828 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4829 	    m->md.pv_memattr) != 0)
4830 		panic("memory attribute change on the direct map failed");
4831 }
4832 
4833 /*
4834  * Changes the specified virtual address range's memory type to that given by
4835  * the parameter "mode".  The specified virtual address range must be
4836  * completely contained within either the direct map or the kernel map.
4837  *
4838  * Returns zero if the change completed successfully, and either EINVAL or
4839  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4840  * of the virtual address range was not mapped, and ENOMEM is returned if
4841  * there was insufficient memory available to complete the change.  In the
4842  * latter case, the memory type may have been changed on some part of the
4843  * virtual address range.
4844  */
4845 int
4846 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4847 {
4848 	int error;
4849 
4850 	PMAP_LOCK(kernel_pmap);
4851 	error = pmap_change_attr_locked(va, size, mode);
4852 	PMAP_UNLOCK(kernel_pmap);
4853 	return (error);
4854 }
4855 
4856 static int
4857 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4858 {
4859 	vm_offset_t base, offset, tmpva;
4860 	vm_paddr_t phys;
4861 	pd_entry_t *l1, l1e;
4862 	pd_entry_t *l2, l2e;
4863 	pt_entry_t *l3, l3e;
4864 	pt_entry_t bits, mask;
4865 	bool anychanged = false;
4866 	int error = 0;
4867 
4868 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4869 	base = trunc_page(va);
4870 	offset = va & PAGE_MASK;
4871 	size = round_page(offset + size);
4872 
4873 	if (!VIRT_IN_DMAP(base) &&
4874 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4875 		return (EINVAL);
4876 
4877 	bits = pmap_memattr_bits(mode);
4878 	mask = memattr_mask;
4879 
4880 	/* First loop: perform PTE validation and demotions as necessary. */
4881 	for (tmpva = base; tmpva < base + size; ) {
4882 		l1 = pmap_l1(kernel_pmap, tmpva);
4883 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4884 			return (EINVAL);
4885 		if ((l1e & PTE_RWX) != 0) {
4886 			/*
4887 			 * If the existing PTE has the correct attributes, then
4888 			 * no need to demote.
4889 			 */
4890 			if ((l1e & mask) == bits) {
4891 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4892 				continue;
4893 			}
4894 
4895 			/*
4896 			 * If the 1GB page fits in the remaining range, we
4897 			 * don't need to demote.
4898 			 */
4899 			if ((tmpva & L1_OFFSET) == 0 &&
4900 			    tmpva + L1_SIZE <= base + size) {
4901 				tmpva += L1_SIZE;
4902 				continue;
4903 			}
4904 
4905 			if (!pmap_demote_l1(kernel_pmap, l1, tmpva))
4906 				return (EINVAL);
4907 		}
4908 		l2 = pmap_l1_to_l2(l1, tmpva);
4909 		if (((l2e = pmap_load(l2)) & PTE_V) == 0)
4910 			return (EINVAL);
4911 		if ((l2e & PTE_RWX) != 0) {
4912 			/*
4913 			 * If the existing PTE has the correct attributes, then
4914 			 * no need to demote.
4915 			 */
4916 			if ((l2e & mask) == bits) {
4917 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4918 				continue;
4919 			}
4920 
4921 			/*
4922 			 * If the 2MB page fits in the remaining range, we
4923 			 * don't need to demote.
4924 			 */
4925 			if ((tmpva & L2_OFFSET) == 0 &&
4926 			    tmpva + L2_SIZE <= base + size) {
4927 				tmpva += L2_SIZE;
4928 				continue;
4929 			}
4930 
4931 			if (!pmap_demote_l2(kernel_pmap, l2, tmpva))
4932 				panic("l2 demotion failed");
4933 		}
4934 		l3 = pmap_l2_to_l3(l2, tmpva);
4935 		if (((l3e = pmap_load(l3)) & PTE_V) == 0)
4936 			return (EINVAL);
4937 
4938 		tmpva += PAGE_SIZE;
4939 	}
4940 
4941 	/* Second loop: perform PTE updates. */
4942 	for (tmpva = base; tmpva < base + size; ) {
4943 		l1 = pmap_l1(kernel_pmap, tmpva);
4944 		l1e = pmap_load(l1);
4945 		if ((l1e & PTE_RWX) != 0) {
4946 			/* Unchanged. */
4947 			if ((l1e & mask) == bits) {
4948 				tmpva += L1_SIZE;
4949 				continue;
4950 			}
4951 
4952 			l1e &= ~mask;
4953 			l1e |= bits;
4954 			pmap_store(l1, l1e);
4955 			anychanged = true;
4956 
4957 			/* Update corresponding DMAP entry */
4958 			phys = L1PTE_TO_PHYS(l1e);
4959 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
4960 				error = pmap_change_attr_locked(
4961 				    PHYS_TO_DMAP(phys), L1_SIZE, mode);
4962 				if (error != 0)
4963 					break;
4964 			}
4965 			tmpva += L1_SIZE;
4966 			continue;
4967 		}
4968 
4969 		l2 = pmap_l1_to_l2(l1, tmpva);
4970 		l2e = pmap_load(l2);
4971 		if ((l2e & PTE_RWX) != 0) {
4972 			/* Unchanged. */
4973 			if ((l2e & mask) == bits) {
4974 				tmpva += L2_SIZE;
4975 				continue;
4976 			}
4977 
4978 			l2e &= ~mask;
4979 			l2e |= bits;
4980 			pmap_store(l2, l2e);
4981 			anychanged = true;
4982 
4983 			/* Update corresponding DMAP entry */
4984 			phys = L2PTE_TO_PHYS(l2e);
4985 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
4986 				error = pmap_change_attr_locked(
4987 				    PHYS_TO_DMAP(phys), L2_SIZE, mode);
4988 				if (error != 0)
4989 					break;
4990 			}
4991 			tmpva += L2_SIZE;
4992 			continue;
4993 		}
4994 
4995 		l3 = pmap_l2_to_l3(l2, tmpva);
4996 		l3e = pmap_load(l3);
4997 
4998 		/* Unchanged. */
4999 		if ((l3e & mask) == bits) {
5000 			tmpva += PAGE_SIZE;
5001 			continue;
5002 		}
5003 
5004 		l3e &= ~mask;
5005 		l3e |= bits;
5006 		pmap_store(l3, l3e);
5007 		anychanged = true;
5008 
5009 		phys = PTE_TO_PHYS(l3e);
5010 		if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(phys)) {
5011 			error = pmap_change_attr_locked(PHYS_TO_DMAP(phys),
5012 			    L3_SIZE, mode);
5013 			if (error != 0)
5014 				break;
5015 		}
5016 		tmpva += PAGE_SIZE;
5017 	}
5018 
5019 	if (anychanged) {
5020 		pmap_invalidate_range(kernel_pmap, base, tmpva);
5021 		if (mode == VM_MEMATTR_UNCACHEABLE)
5022 			cpu_dcache_wbinv_range(base, size);
5023 	}
5024 
5025 	return (error);
5026 }
5027 
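/*
 * Usage sketch (illustrative; "pa" and "len" are hypothetical): a driver
 * needing an uncached view of a direct-mapped buffer might do
 *
 *	error = pmap_change_attr(PHYS_TO_DMAP(pa), len,
 *	    VM_MEMATTR_UNCACHEABLE);
 *
 * Any 1GB or 2MB mappings that overlap the range without fitting entirely
 * inside it are demoted by the first loop above, and when the range lies
 * in the kernel map the matching direct map entries are updated as well.
 */
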
5028 /*
5029  * Perform the pmap work for mincore(2).  If the page is not both referenced
5030  * and modified by this pmap, and the page is managed, this function returns
5031  * its physical address so that the caller can find other mappings.
5032  */
5033 int
5034 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
5035 {
5036 	pt_entry_t *l2, *l3, tpte;
5037 	vm_paddr_t pa;
5038 	int val;
5039 	bool managed;
5040 
5041 	PMAP_LOCK(pmap);
5042 	l2 = pmap_l2(pmap, addr);
5043 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
5044 		if ((tpte & PTE_RWX) != 0) {
5045 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
5046 			val = MINCORE_INCORE | MINCORE_PSIND(1);
5047 		} else {
5048 			l3 = pmap_l2_to_l3(l2, addr);
5049 			tpte = pmap_load(l3);
5050 			if ((tpte & PTE_V) == 0) {
5051 				PMAP_UNLOCK(pmap);
5052 				return (0);
5053 			}
5054 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
5055 			val = MINCORE_INCORE;
5056 		}
5057 
5058 		if ((tpte & PTE_D) != 0)
5059 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5060 		if ((tpte & PTE_A) != 0)
5061 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5062 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
5063 	} else {
5064 		managed = false;
5065 		val = 0;
5066 	}
5067 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5068 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
5069 		*pap = pa;
5070 	}
5071 	PMAP_UNLOCK(pmap);
5072 	return (val);
5073 }
5074 
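/*
 * Illustrative example: a dirty, accessed 4KB mapping of a managed page
 * yields
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *
 * and *pap is left untouched; for a clean but accessed mapping of a
 * managed page, the physical address is also stored in *pap so that the
 * mincore(2) code can examine the page's other mappings.
 */
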
5075 void
5076 pmap_activate_sw(struct thread *td)
5077 {
5078 	pmap_t oldpmap, pmap;
5079 	u_int hart;
5080 
5081 	oldpmap = PCPU_GET(curpmap);
5082 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5083 	if (pmap == oldpmap)
5084 		return;
5085 	csr_write(satp, pmap->pm_satp);
5086 
5087 	hart = PCPU_GET(hart);
5088 #ifdef SMP
5089 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5090 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
5091 #else
5092 	CPU_SET(hart, &pmap->pm_active);
5093 	CPU_CLR(hart, &oldpmap->pm_active);
5094 #endif
5095 	PCPU_SET(curpmap, pmap);
5096 
5097 	sfence_vma();
5098 }
5099 
5100 void
5101 pmap_activate(struct thread *td)
5102 {
5103 
5104 	critical_enter();
5105 	pmap_activate_sw(td);
5106 	critical_exit();
5107 }
5108 
5109 void
5110 pmap_activate_boot(pmap_t pmap)
5111 {
5112 	u_int hart;
5113 
5114 	hart = PCPU_GET(hart);
5115 #ifdef SMP
5116 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
5117 #else
5118 	CPU_SET(hart, &pmap->pm_active);
5119 #endif
5120 	PCPU_SET(curpmap, pmap);
5121 }
5122 
5123 void
5124 pmap_active_cpus(pmap_t pmap, cpuset_t *res)
5125 {
5126 	*res = pmap->pm_active;
5127 }
5128 
5129 void
5130 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5131 {
5132 	cpuset_t mask;
5133 
5134 	/*
5135 	 * From the RISC-V User-Level ISA V2.2:
5136 	 *
5137 	 * "To make a store to instruction memory visible to all
5138 	 * RISC-V harts, the writing hart has to execute a data FENCE
5139 	 * before requesting that all remote RISC-V harts execute a
5140 	 * FENCE.I."
5141 	 *
5142 	 * However, this is slightly misleading; we still need to
5143 	 * perform a FENCE.I for the local hart, as FENCE does nothing
5144 	 * for its icache. FENCE.I alone is also sufficient for the
5145 	 * local hart.
5146 	 */
5147 	sched_pin();
5148 	mask = all_harts;
5149 	CPU_CLR(PCPU_GET(hart), &mask);
5150 	fence_i();
5151 	if (!CPU_EMPTY(&mask) && smp_started) {
5152 		fence();
5153 		sbi_remote_fence_i(mask.__bits);
5154 	}
5155 	sched_unpin();
5156 }
5157 
5158 /*
5159  *	Increase the starting virtual address of the given mapping if a
5160  *	different alignment might result in more superpage mappings.
5161  */
5162 void
5163 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5164     vm_offset_t *addr, vm_size_t size)
5165 {
5166 	vm_offset_t superpage_offset;
5167 
5168 	if (size < L2_SIZE)
5169 		return;
5170 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5171 		offset += ptoa(object->pg_color);
5172 	superpage_offset = offset & L2_OFFSET;
5173 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5174 	    (*addr & L2_OFFSET) == superpage_offset)
5175 		return;
5176 	if ((*addr & L2_OFFSET) < superpage_offset)
5177 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
5178 	else
5179 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5180 }
5181 
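/*
 * Illustrative example: for an 8MB mapping of an object at offset
 * 0x1234000, superpage_offset == 0x34000.  If the caller proposed
 * *addr == 0x10010000, then (*addr & L2_OFFSET) == 0x10000 is less than
 * 0x34000, so *addr is advanced to 0x10034000.  Object offset 0x1400000
 * then lands at virtual address 0x10200000, and both are 2MB aligned,
 * which allows superpage mappings within the region.
 */
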
5182 /**
5183  * Get the kernel virtual address of a set of physical pages. If there are
5184  * physical addresses not covered by the DMAP perform a transient mapping
5185  * that will be removed when calling pmap_unmap_io_transient.
5186  *
5187  * \param page        The pages the caller wishes to obtain the virtual
5188  *                    address on the kernel memory map.
5189  * \param vaddr       On return contains the kernel virtual memory address
5190  *                    of the pages passed in the page parameter.
5191  * \param count       Number of pages passed in.
5192  * \param can_fault   true if the thread using the mapped pages can take
5193  *                    page faults, false otherwise.
5194  *
5195  * \returns true if the caller must call pmap_unmap_io_transient when
5196  *          finished or false otherwise.
5197  *
5198  */
5199 bool
5200 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5201     bool can_fault)
5202 {
5203 	vm_paddr_t paddr;
5204 	bool needs_mapping;
5205 	int error __diagused, i;
5206 
5207 	/*
5208 	 * Allocate any KVA space that we need, this is done in a separate
5209 	 * loop to prevent calling vmem_alloc while pinned.
5210 	 */
5211 	needs_mapping = false;
5212 	for (i = 0; i < count; i++) {
5213 		paddr = VM_PAGE_TO_PHYS(page[i]);
5214 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
5215 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
5216 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
5217 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5218 			needs_mapping = true;
5219 		} else {
5220 			vaddr[i] = PHYS_TO_DMAP(paddr);
5221 		}
5222 	}
5223 
5224 	/* Exit early if everything is covered by the DMAP */
5225 	if (!needs_mapping)
5226 		return (false);
5227 
5228 	if (!can_fault)
5229 		sched_pin();
5230 	for (i = 0; i < count; i++) {
5231 		paddr = VM_PAGE_TO_PHYS(page[i]);
5232 		if (paddr >= DMAP_MAX_PHYSADDR) {
5233 			panic(
5234 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
5235 		}
5236 	}
5237 
5238 	return (needs_mapping);
5239 }
5240 
5241 void
5242 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5243     bool can_fault)
5244 {
5245 	vm_paddr_t paddr;
5246 	int i;
5247 
5248 	if (!can_fault)
5249 		sched_unpin();
5250 	for (i = 0; i < count; i++) {
5251 		paddr = VM_PAGE_TO_PHYS(page[i]);
5252 		if (paddr >= DMAP_MAX_PHYSADDR) {
5253 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
5254 		}
5255 	}
5256 }
5257 
5258 bool
5259 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
5260 {
5261 
5262 	return (mode >= VM_MEMATTR_DEFAULT && mode <= VM_MEMATTR_LAST);
5263 }
5264 
5265 bool
5266 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
5267     pt_entry_t **l3)
5268 {
5269 	pd_entry_t *l1p, *l2p;
5270 
5271 	/* Get l1 directory entry. */
5272 	l1p = pmap_l1(pmap, va);
5273 	*l1 = l1p;
5274 
5275 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
5276 		return (false);
5277 
5278 	if ((pmap_load(l1p) & PTE_RX) != 0) {
5279 		*l2 = NULL;
5280 		*l3 = NULL;
5281 		return (true);
5282 	}
5283 
5284 	/* Get l2 directory entry. */
5285 	l2p = pmap_l1_to_l2(l1p, va);
5286 	*l2 = l2p;
5287 
5288 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
5289 		return (false);
5290 
5291 	if ((pmap_load(l2p) & PTE_RX) != 0) {
5292 		*l3 = NULL;
5293 		return (true);
5294 	}
5295 
5296 	/* Get l3 page table entry. */
5297 	*l3 = pmap_l2_to_l3(l2p, va);
5298 
5299 	return (true);
5300 }
5301 
5302 /*
5303  * Track a range of the kernel's virtual address space that is contiguous
5304  * in various mapping attributes.
5305  */
5306 struct pmap_kernel_map_range {
5307 	vm_offset_t sva;
5308 	pt_entry_t attrs;
5309 	int l3pages;
5310 	int l2pages;
5311 	int l1pages;
5312 };
5313 
5314 static void
5315 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
5316     vm_offset_t eva)
5317 {
5318 	char *mode;
5319 	int i;
5320 
5321 	if (eva <= range->sva)
5322 		return;
5323 
5324 	for (i = 0; i < nitems(memattr_bits); i++)
5325 		if ((range->attrs & memattr_mask) == memattr_bits[i])
5326 			break;
5327 
5328 	switch (i) {
5329 	case VM_MEMATTR_PMA:
5330 		mode = "PMA";
5331 		break;
5332 	case VM_MEMATTR_UNCACHEABLE:
5333 		mode = "NC ";
5334 		break;
5335 	case VM_MEMATTR_DEVICE:
5336 		mode = "IO ";
5337 		break;
5338 	default:
5339 		mode = "???";
5340 		break;
5341 	}
5342 
5343 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %s %d %d %d\n",
5344 	    range->sva, eva,
5345 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
5346 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
5347 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
5348 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
5349 	    mode, range->l1pages, range->l2pages, range->l3pages);
5350 
5351 	/* Reset to sentinel value. */
5352 	range->sva = 0xfffffffffffffffful;
5353 }
5354 
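/*
 * Illustrative example of one line of the resulting output (the addresses
 * and counts are hypothetical):
 *
 *	0xffffffc000000000-0xffffffc040000000 rw-sg PMA 1 0 0
 *
 * i.e. a readable, writable, non-executable, supervisor-only, global
 * range backed by a single 1GB mapping with the default (PMA) attribute.
 */
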
5355 /*
5356  * Determine whether the attributes specified by a page table entry match those
5357  * being tracked by the current range.
5358  */
5359 static bool
5360 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
5361 {
5362 
5363 	return (range->attrs == attrs);
5364 }
5365 
5366 static void
5367 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
5368     pt_entry_t attrs)
5369 {
5370 
5371 	memset(range, 0, sizeof(*range));
5372 	range->sva = va;
5373 	range->attrs = attrs;
5374 }
5375 
5376 /*
5377  * Given a leaf PTE, derive the mapping's attributes. If they do not match
5378  * those of the current run, dump the address range and its attributes, and
5379  * begin a new run.
5380  */
5381 static void
5382 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
5383     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
5384 {
5385 	pt_entry_t attrs;
5386 
5387 	/* The PTE global bit is inherited by lower levels. */
5388 	attrs = l1e & PTE_G;
5389 	if ((l1e & PTE_RWX) != 0) {
5390 		attrs |= l1e & (PTE_RWX | PTE_U);
5391 		attrs |= l1e & memattr_mask;
5392 	} else if (l2e != 0)
5393 		attrs |= l2e & PTE_G;
5394 
5395 	if ((l2e & PTE_RWX) != 0) {
5396 		attrs |= l2e & (PTE_RWX | PTE_U);
5397 		attrs |= l2e & memattr_mask;
5398 	} else if (l3e != 0) {
5399 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
5400 		attrs |= l3e & memattr_mask;
5401 	}
5402 
5403 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
5404 		sysctl_kmaps_dump(sb, range, va);
5405 		sysctl_kmaps_reinit(range, va, attrs);
5406 	}
5407 }
5408 
5409 static int
5410 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
5411 {
5412 	struct pmap_kernel_map_range range;
5413 	struct sbuf sbuf, *sb;
5414 	pd_entry_t *l1, l1e, *l2, l2e;
5415 	pt_entry_t *l3, l3e;
5416 	vm_offset_t sva;
5417 	vm_paddr_t pa;
5418 	int error, i, j, k;
5419 
5420 	error = sysctl_wire_old_buffer(req, 0);
5421 	if (error != 0)
5422 		return (error);
5423 	sb = &sbuf;
5424 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
5425 
5426 	/* Sentinel value. */
5427 	range.sva = 0xfffffffffffffffful;
5428 
5429 	/*
5430 	 * Iterate over the kernel page tables without holding the kernel pmap
5431 	 * lock. Kernel page table pages are never freed, so at worst we will
5432 	 * observe inconsistencies in the output.
5433 	 */
5434 	sva = VM_MIN_KERNEL_ADDRESS;
5435 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
5436 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
5437 			sbuf_printf(sb, "\nDirect map:\n");
5438 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
5439 			sbuf_printf(sb, "\nKernel map:\n");
5440 
5441 		l1 = pmap_l1(kernel_pmap, sva);
5442 		l1e = pmap_load(l1);
5443 		if ((l1e & PTE_V) == 0) {
5444 			sysctl_kmaps_dump(sb, &range, sva);
5445 			sva += L1_SIZE;
5446 			continue;
5447 		}
5448 		if ((l1e & PTE_RWX) != 0) {
5449 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
5450 			range.l1pages++;
5451 			sva += L1_SIZE;
5452 			continue;
5453 		}
5454 		pa = PTE_TO_PHYS(l1e);
5455 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5456 
5457 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
5458 			l2e = l2[j];
5459 			if ((l2e & PTE_V) == 0) {
5460 				sysctl_kmaps_dump(sb, &range, sva);
5461 				sva += L2_SIZE;
5462 				continue;
5463 			}
5464 			if ((l2e & PTE_RWX) != 0) {
5465 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
5466 				range.l2pages++;
5467 				sva += L2_SIZE;
5468 				continue;
5469 			}
5470 			pa = PTE_TO_PHYS(l2e);
5471 			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
5472 
5473 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
5474 			    sva += L3_SIZE) {
5475 				l3e = l3[k];
5476 				if ((l3e & PTE_V) == 0) {
5477 					sysctl_kmaps_dump(sb, &range, sva);
5478 					continue;
5479 				}
5480 				sysctl_kmaps_check(sb, &range, sva,
5481 				    l1e, l2e, l3e);
5482 				range.l3pages++;
5483 			}
5484 		}
5485 	}
5486 
5487 	error = sbuf_finish(sb);
5488 	sbuf_delete(sb);
5489 	return (error);
5490 }
5491 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
5492     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
5493     NULL, 0, sysctl_kmaps, "A",
5494     "Dump kernel address layout");
5495