xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 7d62b00eb9ad855ffcd7da46b41e23feb5476fac)
1 /*	$NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.423 2022/09/24 11:05:47 riastradh Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142 
143 #define	__MUTEX_PRIVATE	/* for assertions */
144 
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159 
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162 
163 #include <dev/isa/isareg.h>
164 
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171 
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175 
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178 
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184 
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188 
189 /*
190  * general info:
191  *
192  *  - for an explanation of how the x86 MMU hardware works see
193  *    the comments in <machine/pte.h>.
194  *
195  *  - for an explanation of the general memory structure used by
196  *    this pmap (including the recursive mapping), see the comments
197  *    in <machine/pmap.h>.
198  *
199  * this file contains the code for the "pmap module."   the module's
200  * job is to manage the hardware's virtual to physical address mappings.
201  * note that there are two levels of mapping in the VM system:
202  *
203  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204  *      to map ranges of virtual address space to objects/files.  for
205  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206  *      to the file /bin/ls starting at offset zero."   note that
207  *      the upper layer mapping is not concerned with how individual
208  *      vm_pages are mapped.
209  *
210  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
211  *      from virtual addresses.   it is concerned with which vm_page is
212  *      mapped where.   for example, when you run /bin/ls and start
213  *      at page 0x1000 the fault routine may lookup the correct page
214  *      of the /bin/ls file and then ask the pmap layer to establish
215  *      a mapping for it.
216  *
217  * note that information in the lower layer of the VM system can be
218  * thrown away since it can easily be reconstructed from the info
219  * in the upper layer.
220  *
221  * data structures we use include:
222  *
223  *  - struct pmap: describes the address space of one thread
224  *  - struct pmap_page: describes one pv-tracked page, without
225  *    necessarily a corresponding vm_page
226  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228  *    physical memory.   the pp_pvlist points to a list of pv_entry
229  *    structures which describe all the <PMAP,VA> pairs that this
230  *    page is mapped in.    this is critical for page based operations
231  *    such as pmap_page_protect() [change protection on _all_ mappings
232  *    of a page]
233  */
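
/*
 * Illustrative sketch (not code from this file) of how the two layers
 * meet: when the fault handler resolves a page of /bin/ls at VA 0x1000,
 * the upper layer looks up the vm_page and then calls something like
 *
 *	pmap_enter(pmap, 0x1000, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ | VM_PROT_EXECUTE, flags);
 *
 * which installs the PTE and links a pv_entry for that <PMAP,VA> pair
 * onto the page's pp_pvlist.  A later pmap_page_protect(pg, VM_PROT_READ)
 * walks the same list to downgrade every mapping of the page at once.
 */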
234 
235 /*
236  * Locking
237  *
238  * We have the following locks that we must deal with, listed in the order
239  * that they are acquired:
240  *
241  * pg->uobject->vmobjlock, pg->uanon->an_lock
242  *
243  *	For managed pages, these per-object locks are taken by the VM system
244  *	before calling into the pmap module - either a read or write hold.
245  *	The lock hold prevents pages from changing identity while the pmap is
246  *	operating on them.  For example, the same lock is held across a call
247  *	to pmap_remove() and the following call to pmap_update(), so that a
248  *	page does not gain a new identity while its TLB visibility is stale.
249  *
250  * pmap->pm_lock
251  *
252  *	This lock protects the fields in the pmap structure including the
253  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
255  *	kernel PDEs are never freed, and the kernel is expected to be self
256  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
257  *	because they can be modified from interrupt context).
258  *
259  * pmaps_lock
260  *
261  *	This lock protects the list of active pmaps (headed by "pmaps").
262  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263  *
264  * pp_lock
265  *
266  *	This per-page lock protects PV entry lists and the embedded PV entry
267  *	in each vm_page, allowing for concurrent operation on pages by
268  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
269  *	points it is taken context switching is usually not tolerable, and
270  *	spin mutexes must block out interrupts that could take kernel_lock.
271  */
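
/*
 * Illustrative ordering for a typical unmap of a managed page (a sketch,
 * not a verbatim call path from this file):
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);	keep the page's identity
 *	pmap_remove(pmap, sva, eva);		takes pmap->pm_lock, then the
 *						per-page pp_lock as needed
 *	pmap_update(pmap);			process deferred TLB shootdowns
 *	rw_exit(uobj->vmobjlock);
 */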
272 
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define	PMAP_DUMMY_LOCK(pm)
279 #define	PMAP_DUMMY_UNLOCK(pm)
280 #endif
281 
282 static const struct uvm_pagerops pmap_pager = {
283 	/* nothing */
284 };
285 
286 /*
287  * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288  */
289 #define pl_i(va, lvl) \
290         (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291 
292 #define	pl_i_roundup(va, lvl)	pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
293 
294 /*
295  * PTP macros:
296  *   a PTP's index is the PD index of the PDE that points to it
297  *   a PTP's offset is the byte-offset in the PTE space that this PTP is at
298  *   a PTP's VA is the first VA mapped by that PTP
299  */
300 
301 #define ptp_va2o(va, lvl)	(pl_i(va, (lvl)+1) * PAGE_SIZE)
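
/*
 * Worked example of the index arithmetic above, using the amd64 constants
 * (PAGE_SHIFT 12 and L2/L3/L4 shifts of 21/30/39; illustrative only):
 * for va = 0x40201000,
 *
 *	pl_i(va, 1) == pl1_i(va) == va >> 12 == 0x40201
 *	pl_i(va, 2) == pl2_i(va) == va >> 21 == 513
 *	pl_i(va, 3) == pl3_i(va) == va >> 30 == 1
 *	pl_i(va, 4) == pl4_i(va) == va >> 39 == 0
 *	pl_i_roundup(va, 2) == 514	(va is not 2MB aligned)
 *	ptp_va2o(va, 1) == pl_i(va, 2) * PAGE_SIZE == 0x201000
 *
 * i.e. the level 1 PTP covering va sits at index 513 in the linear PTE
 * window, and its first byte is at offset 0x201000 within that window.
 */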
302 
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313 
314 long nkptp[] = NKPTP_INITIALIZER;
315 
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318 
319 struct pcpu_area *pcpuarea __read_mostly;
320 
321 static vaddr_t pmap_maxkvaddr;
322 
323 /*
324  * Misc. event counters.
325  */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328 
329 /*
330  * PAT
331  */
332 static bool cpu_pat_enabled __read_mostly = false;
333 
334 /*
335  * Global data structures
336  */
337 
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341 
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344 
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347 
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350 
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353 
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357 
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
361 #endif
362 
363 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
364 #define	PMAP_CHECK_PP(pp) \
365     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366 
367 #define PAGE_ALIGNED(pp)	\
368 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369 
370 /*
371  * Other data structures
372  */
373 
374 static pt_entry_t protection_codes[8] __read_mostly;
375 
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377 
378 /*
379  * The following two vaddr_t's are used during system startup to keep track of
380  * how much of the kernel's VM space we have used. Once the system is started,
381  * the management of the remaining kernel VM space is turned over to the
382  * kernel_map vm_map.
383  */
384 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
386 
387 #ifndef XENPV
388 /*
389  * LAPIC virtual address, and fake physical address.
390  */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394 
395 /*
396  * pool that pmap structures are allocated from
397  */
398 struct pool_cache pmap_cache;
399 static int  pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401 
402 /*
403  * pv_page cache
404  */
405 static struct pool_cache pmap_pvp_cache;
406 
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411 
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414  * Special VAs and the PTEs that map them
415  */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424 
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426 
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431 
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437 	.pa_alloc = pmap_pdp_alloc,
438 	.pa_free = pmap_pdp_free,
439 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442 
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449 
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454 
455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
456 struct pmap_ptparray {
457 	struct vm_page *pg[PTP_LEVELS + 1];
458 	bool alloced[PTP_LEVELS + 1];
459 };
460 
461 /*
462  * PV entries are allocated in page-sized chunks and cached per-pmap to
463  * avoid intense pressure on memory allocators.
464  */
465 
466 struct pv_page {
467 	LIST_HEAD(, pv_entry)	pvp_pves;
468 	LIST_ENTRY(pv_page)	pvp_list;
469 	long			pvp_nfree;
470 	struct pmap		*pvp_pmap;
471 };
472 
473 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
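
/*
 * Illustrative arithmetic (the sizes are hypothetical): with a 4096-byte
 * page and a 64-byte pv_entry, 64 entries would fit; the "- 1" leaves one
 * entry's worth of space for the struct pv_page header at the start of
 * the page, so PVE_PER_PVP would evaluate to 63 pv_entry's per chunk.
 */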
474 
475 /*
476  * PV tree prototypes
477  */
478 
479 static int	pmap_compare_key(void *, const void *, const void *);
480 static int	pmap_compare_nodes(void *, const void *, const void *);
481 
482 /* Red-black tree */
483 static const rb_tree_ops_t pmap_rbtree_ops = {
484 	.rbto_compare_nodes = pmap_compare_nodes,
485 	.rbto_compare_key = pmap_compare_key,
486 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
487 	.rbto_context = NULL
488 };
489 
490 /*
491  * Local prototypes
492  */
493 
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507 
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509     struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512     pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516     pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518     vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520     vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526 
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528 
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531 
532 long
533 pmap_resident_count(struct pmap *pmap)
534 {
535 
536 	return pmap->pm_stats.resident_count;
537 }
538 
539 long
540 pmap_wired_count(struct pmap *pmap)
541 {
542 
543 	return pmap->pm_stats.wired_count;
544 }
545 
546 /*
547  * p m a p   h e l p e r   f u n c t i o n s
548  */
549 
550 static inline void
551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553 
554 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
555 	pmap->pm_stats.resident_count += resid_diff;
556 	pmap->pm_stats.wired_count += wired_diff;
557 }
558 
559 static inline void
560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564 
565 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567 
568 	pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
570 
571 /*
572  * ptp_to_pmap: lookup pmap by ptp
573  */
574 static inline struct pmap *
575 ptp_to_pmap(struct vm_page *ptp)
576 {
577 	struct pmap *pmap;
578 
579 	if (ptp == NULL) {
580 		return pmap_kernel();
581 	}
582 	pmap = (struct pmap *)ptp->uobject;
583 	KASSERT(pmap != NULL);
584 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585 	return pmap;
586 }
587 
588 static inline struct pv_pte *
589 pve_to_pvpte(struct pv_entry *pve)
590 {
591 
592 	if (pve == NULL)
593 		return NULL;
594 	KASSERT((void *)&pve->pve_pte == (void *)pve);
595 	return &pve->pve_pte;
596 }
597 
598 static inline struct pv_entry *
599 pvpte_to_pve(struct pv_pte *pvpte)
600 {
601 	struct pv_entry *pve = (void *)pvpte;
602 
603 	KASSERT(pve_to_pvpte(pve) == pvpte);
604 	return pve;
605 }
606 
607 /*
608  * Return true if the pmap page has an embedded PV entry.
609  */
610 static inline bool
611 pv_pte_embedded(struct pmap_page *pp)
612 {
613 
614 	KASSERT(mutex_owned(&pp->pp_lock));
615 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617 
618 /*
619  * pv_pte_first, pv_pte_next: PV list iterator.
620  */
621 static inline struct pv_pte *
622 pv_pte_first(struct pmap_page *pp)
623 {
624 
625 	KASSERT(mutex_owned(&pp->pp_lock));
626 	if (pv_pte_embedded(pp)) {
627 		return &pp->pp_pte;
628 	}
629 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631 
632 static inline struct pv_pte *
633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635 
636 	KASSERT(mutex_owned(&pp->pp_lock));
637 	KASSERT(pvpte != NULL);
638 	if (pvpte == &pp->pp_pte) {
639 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640 	}
641 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
643 
644 static inline uint8_t
645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647 	uint8_t ret = 0;
648 	if (pte & PTE_D)
649 		ret |= PP_ATTRS_D;
650 	if (pte & PTE_A)
651 		ret |= PP_ATTRS_A;
652 	if (pte & PTE_W)
653 		ret |= PP_ATTRS_W;
654 	return ret;
655 }
656 
657 static inline pt_entry_t
658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660 	pt_entry_t pte = 0;
661 	if (attrs & PP_ATTRS_D)
662 		pte |= PTE_D;
663 	if (attrs & PP_ATTRS_A)
664 		pte |= PTE_A;
665 	if (attrs & PP_ATTRS_W)
666 		pte |= PTE_W;
667 	return pte;
668 }
669 
670 /*
671  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672  * of course the kernel is always loaded
673  */
674 bool
675 pmap_is_curpmap(struct pmap *pmap)
676 {
677 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679 
680 inline void
681 pmap_reference(struct pmap *pmap)
682 {
683 
684 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686 
687 /*
688  * rbtree: compare two nodes.
689  */
690 static int
691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693 	const struct pv_entry *pve1 = n1;
694 	const struct pv_entry *pve2 = n2;
695 
696 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697 
698 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699 		return -1;
700 	}
701 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702 		return 1;
703 	}
704 	return 0;
705 }
706 
707 /*
708  * rbtree: compare a node and a key.
709  */
710 static int
711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713 	const struct pv_entry *pve = n;
714 	const vaddr_t key = (vaddr_t)k;
715 
716 	if (pve->pve_pte.pte_va < key) {
717 		return -1;
718 	}
719 	if (pve->pve_pte.pte_va > key) {
720 		return 1;
721 	}
722 	return 0;
723 }
724 
725 /*
726  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727  */
728 static inline void
729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
732 
733 	if (va < *min) {
734 		*min = va;
735 	}
736 }
737 
738 /*
739  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740  */
741 static inline void
742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744 	vaddr_t sclip;
745 
746 	if (ptp == NULL) {
747 		return;
748 	}
749 
750 	sclip = (vaddr_t)ptp->uanon;
751 	sclip = (*startva < sclip ? sclip : *startva);
752 	*pte += (sclip - *startva) / PAGE_SIZE;
753 	*startva = sclip;
754 }
755 
756 /*
757  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758  *
759  * there are several pmaps involved.  some or all of them might be the same.
760  *
761  *	- the pmap given by the first argument
762  *		our caller wants to access this pmap's PTEs.
763  *
764  *	- pmap_kernel()
765  *		the kernel pmap.  note that it only contains the kernel part
766  *		of the address space which is shared by any pmap.  ie. any
767  *		pmap can be used instead of pmap_kernel() for our purpose.
768  *
769  *	- ci->ci_pmap
770  *		pmap currently loaded on the cpu.
771  *
772  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
773  *		current process' pmap.
774  *
775  * => caller must lock pmap first (if not the kernel pmap)
776  * => must be undone with pmap_unmap_ptes before returning
777  * => disables kernel preemption
778  */
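/*
 * A typical caller, sketched (the real users are the pmap_enter/
 * pmap_remove family further down in this file):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... examine or update ptes[pl1_i(va)] / pdes[lvl - 2][pl_i(va, lvl)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */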
779 void
780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781     pd_entry_t * const **pdeppp)
782 {
783 	struct pmap *curpmap;
784 	struct cpu_info *ci;
785 	lwp_t *l;
786 
787 	kpreempt_disable();
788 
789 	/* The kernel's pmap is always accessible. */
790 	if (pmap == pmap_kernel()) {
791 		*pmap2 = NULL;
792 		*ptepp = PTE_BASE;
793 		*pdeppp = normal_pdes;
794 		return;
795 	}
796 
797 	KASSERT(mutex_owned(&pmap->pm_lock));
798 
799 	l = curlwp;
800 	ci = l->l_cpu;
801 	curpmap = ci->ci_pmap;
802 	if (pmap == curpmap) {
803 		/*
804 		 * Already on the CPU: make it valid.  This is very
805 		 * often the case during exit(), when we have switched
806 		 * to the kernel pmap in order to destroy a user pmap.
807 		 */
808 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809 			pmap_reactivate(pmap);
810 		}
811 		*pmap2 = NULL;
812 	} else {
813 		/*
814 		 * Toss current pmap from CPU and install new pmap, but keep
815 		 * a reference to the old one.  Dropping the reference can
816 		 * block as it needs to take locks, so defer that to
817 		 * pmap_unmap_ptes().
818 		 */
819 		pmap_reference(pmap);
820 		pmap_load1(l, pmap, curpmap);
821 		*pmap2 = curpmap;
822 	}
823 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825 	pmap->pm_ncsw = lwp_pctr();
826 #endif
827 	*ptepp = PTE_BASE;
828 
829 #if defined(XENPV) && defined(__x86_64__)
830 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832 	*pdeppp = ci->ci_normal_pdes;
833 #else
834 	*pdeppp = normal_pdes;
835 #endif
836 }
837 
838 /*
839  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840  *
841  * => we cannot tolerate context switches while mapped in: assert this.
842  * => reenables kernel preemption.
843  * => does not unlock pmap.
844  */
845 void
846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
847 {
848 	struct cpu_info *ci;
849 	struct pmap *mypmap;
850 	struct lwp *l;
851 
852 	KASSERT(kpreempt_disabled());
853 
854 	/* The kernel's pmap is always accessible. */
855 	if (pmap == pmap_kernel()) {
856 		kpreempt_enable();
857 		return;
858 	}
859 
860 	l = curlwp;
861 	ci = l->l_cpu;
862 
863 	KASSERT(mutex_owned(&pmap->pm_lock));
864 	KASSERT(pmap->pm_ncsw == lwp_pctr());
865 
866 #if defined(XENPV) && defined(__x86_64__)
867 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870 
871 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
872 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874 	if (ci->ci_pmap == mypmap) {
875 		ci->ci_want_pmapload = 0;
876 	} else {
877 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
878 		ci->ci_tlbstate = TLBSTATE_LAZY;
879 	}
880 
881 	/* Now safe to re-enable preemption. */
882 	kpreempt_enable();
883 
884 	/* Toss reference to other pmap taken earlier. */
885 	if (pmap2 != NULL) {
886 		pmap_destroy(pmap2);
887 	}
888 }
889 
890 inline static void
891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893 
894 #if !defined(__x86_64__)
895 	if (curproc == NULL || curproc->p_vmspace == NULL ||
896 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897 		return;
898 
899 	if ((opte ^ npte) & PTE_X)
900 		pmap_update_pg(va);
901 
902 	/*
903 	 * Executability was removed on the last executable change.
904 	 * Reset the code segment to something conservative and
905 	 * let the trap handler deal with setting the right limit.
906 	 * We can't do that because of locking constraints on the vm map.
907 	 */
908 
909 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910 		struct trapframe *tf = curlwp->l_md.md_regs;
911 
912 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
914 	}
915 #endif /* !defined(__x86_64__) */
916 }
917 
918 #if !defined(__x86_64__)
919 /*
920  * Fixup the code segment to cover all potential executable mappings.
921  * returns 0 if no changes to the code segment were made.
922  */
923 int
924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
925 {
926 	struct vm_map_entry *ent;
927 	struct pmap *pm = vm_map_pmap(map);
928 	vaddr_t va = 0;
929 
930 	vm_map_lock_read(map);
931 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
932 		/*
933 		 * This entry has greater va than the entries before.
934 		 * We need to make it point to the last page, not past it.
935 		 */
936 		if (ent->protection & VM_PROT_EXECUTE)
937 			va = trunc_page(ent->end) - PAGE_SIZE;
938 	}
939 	vm_map_unlock_read(map);
940 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
941 		return 0;
942 
943 	pm->pm_hiexec = va;
944 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
945 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
946 	} else {
947 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
948 		return 0;
949 	}
950 	return 1;
951 }
952 #endif /* !defined(__x86_64__) */
953 
954 void
955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958 	uint64_t pat;
959 
960 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
961 		return;
962 
963 	/* We change WT to WC. Leave all other entries the default values. */
964 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968 
969 	wrmsr(MSR_CR_PAT, pat);
970 	cpu_pat_enabled = true;
971 #endif
972 }
973 
974 static pt_entry_t
975 pmap_pat_flags(u_int flags)
976 {
977 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
978 
979 	if (!cpu_pat_enabled) {
980 		switch (cacheflags) {
981 		case PMAP_NOCACHE:
982 		case PMAP_NOCACHE_OVR:
983 			/* results in PGC_UCMINUS on cpus which have
984 			 * the cpuid PAT but PAT "disabled"
985 			 */
986 			return PTE_PCD;
987 		default:
988 			return 0;
989 		}
990 	}
991 
992 	switch (cacheflags) {
993 	case PMAP_NOCACHE:
994 		return PGC_UC;
995 	case PMAP_WRITE_COMBINE:
996 		return PGC_WC;
997 	case PMAP_WRITE_BACK:
998 		return PGC_WB;
999 	case PMAP_NOCACHE_OVR:
1000 		return PGC_UCMINUS;
1001 	}
1002 
1003 	return 0;
1004 }
1005 
1006 /*
1007  * p m a p   k e n t e r   f u n c t i o n s
1008  *
1009  * functions to quickly enter/remove pages from the kernel address
1010  * space.   pmap_kremove is exported to MI kernel.  we make use of
1011  * the recursive PTE mappings.
1012  */
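
/*
 * Usage sketch (illustrative): wire a known physical page into an
 * already-allocated kernel VA and tear the mapping down again later:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */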
1013 
1014 /*
1015  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016  *
1017  * => no need to lock anything, assume va is already allocated
1018  * => should be faster than normal pmap enter function
1019  */
1020 void
1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023 	pt_entry_t *pte, opte, npte;
1024 
1025 	KASSERT(!(prot & ~VM_PROT_ALL));
1026 
1027 	if (va < VM_MIN_KERNEL_ADDRESS)
1028 		pte = vtopte(va);
1029 	else
1030 		pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035 		    " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037 		npte = pa;
1038 	} else
1039 #endif /* XENPV && DOM0OPS */
1040 		npte = pmap_pa2pte(pa);
1041 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042 	npte |= pmap_pat_flags(flags);
1043 	opte = pmap_pte_testset(pte, npte); /* zap! */
1044 
1045 	/*
1046 	 * XXX: make sure we are not dealing with a large page, since the only
1047 	 * large pages created are for the kernel image, and they should never
1048 	 * be kentered.
1049 	 */
1050 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051 
1052 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053 		/* This should not happen. */
1054 		printf_nolog("%s: mapping already present\n", __func__);
1055 		kpreempt_disable();
1056 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057 		kpreempt_enable();
1058 	}
1059 }
1060 
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062 
1063 #if defined(__x86_64__)
1064 /*
1065  * Change protection for a virtual address. Local for a CPU only, don't
1066  * care about TLB shootdowns.
1067  *
1068  * => must be called with preemption disabled
1069  */
1070 void
1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073 	pt_entry_t *pte, opte, npte;
1074 
1075 	KASSERT(kpreempt_disabled());
1076 
1077 	if (va < VM_MIN_KERNEL_ADDRESS)
1078 		pte = vtopte(va);
1079 	else
1080 		pte = kvtopte(va);
1081 
1082 	npte = opte = *pte;
1083 
1084 	if ((prot & VM_PROT_WRITE) != 0)
1085 		npte |= PTE_W;
1086 	else
1087 		npte &= ~(PTE_W|PTE_D);
1088 
1089 	if (opte != npte) {
1090 		pmap_pte_set(pte, npte);
1091 		pmap_pte_flush();
1092 		invlpg(va);
1093 	}
1094 }
1095 #endif /* defined(__x86_64__) */
1096 
1097 /*
1098  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099  *
1100  * => no need to lock anything
1101  * => caller must dispose of any vm_page mapped in the va range
1102  * => note: not an inline function
1103  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104  * => we assume kernel only unmaps valid addresses and thus don't bother
1105  *    checking the valid bit before doing TLB flushing
1106  * => must be followed by call to pmap_update() before reuse of page
1107  */
1108 static void
1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111 	pt_entry_t *pte, opte;
1112 	vaddr_t va, eva;
1113 
1114 	eva = sva + len;
1115 
1116 	kpreempt_disable();
1117 	for (va = sva; va < eva; va += PAGE_SIZE) {
1118 		pte = kvtopte(va);
1119 		opte = pmap_pte_testset(pte, 0); /* zap! */
1120 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122 			    TLBSHOOT_KREMOVE);
1123 		}
1124 		KASSERTMSG((opte & PTE_PS) == 0,
1125 		    "va %#" PRIxVADDR " is a large page", va);
1126 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1127 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1128 	}
1129 	if (localonly) {
1130 		tlbflushg();
1131 	}
1132 	kpreempt_enable();
1133 }
1134 
1135 void
1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138 
1139 	pmap_kremove1(sva, len, false);
1140 }
1141 
1142 /*
1143  * pmap_kremove_local: like pmap_kremove(), but only worry about
1144  * TLB invalidations on the current CPU.  this is only intended
1145  * for use while writing kernel crash dumps, either after panic
1146  * or via reboot -d.
1147  */
1148 void
1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151 
1152 	pmap_kremove1(sva, len, true);
1153 }
1154 
1155 /*
1156  * p m a p   i n i t   f u n c t i o n s
1157  *
1158  * pmap_bootstrap and pmap_init are called during system startup
1159  * to init the pmap module.   pmap_bootstrap() does a low level
1160  * init just to get things rolling.   pmap_init() finishes the job.
1161  */
1162 
1163 /*
1164  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165  * This function is to be used before any VM system has been set up.
1166  *
1167  * The va is taken from virtual_avail.
1168  */
1169 static vaddr_t
1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172 	vaddr_t va = virtual_avail;
1173 	virtual_avail += npages * PAGE_SIZE;
1174 	return va;
1175 }
1176 
1177 /*
1178  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179  * This function is to be used before any VM system has been set up.
1180  *
1181  * The pa is taken from avail_start.
1182  */
1183 static paddr_t
1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186 	paddr_t pa = avail_start;
1187 	avail_start += npages * PAGE_SIZE;
1188 	return pa;
1189 }
1190 
1191 /*
1192  * pmap_bootstrap: get the system in a state where it can run with VM properly
1193  * enabled (called before main()). The VM system is fully init'd later.
1194  *
1195  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1196  *    kernel, and nkpde PTP's for the kernel.
1197  * => kva_start is the first free virtual address in kernel space.
1198  */
1199 void
1200 pmap_bootstrap(vaddr_t kva_start)
1201 {
1202 	struct pmap *kpm;
1203 	int i;
1204 	vaddr_t kva;
1205 
1206 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1207 
1208 	/*
1209 	 * Set up our local static global vars that keep track of the usage of
1210 	 * KVM before kernel_map is set up.
1211 	 */
1212 	virtual_avail = kva_start;		/* first free KVA */
1213 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1214 
1215 	/*
1216 	 * Set up protection_codes: we need to be able to convert from a MI
1217 	 * protection code (some combo of VM_PROT...) to something we can jam
1218 	 * into a x86 PTE.
1219 	 */
1220 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1221 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1222 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1223 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1224 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1225 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1226 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1227 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1228 
1229 	/*
1230 	 * Now we init the kernel's pmap.
1231 	 *
1232 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1233 	 * the pm_obj contains the list of active PTPs.
1234 	 */
1235 	kpm = pmap_kernel();
1236 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1237 	rw_init(&kpm->pm_dummy_lock);
1238 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1239 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1240 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1241 		kpm->pm_ptphint[i] = NULL;
1242 	}
1243 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1244 
1245 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1246 	for (i = 0; i < PDP_SIZE; i++)
1247 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1248 
1249 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1250 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1251 
1252 	kcpuset_create(&kpm->pm_cpus, true);
1253 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1254 
1255 	kpm->pm_ldt = NULL;
1256 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1257 
1258 	/*
1259 	 * the above is just a rough estimate and not critical to the proper
1260 	 * operation of the system.
1261 	 */
1262 
1263 #if !defined(XENPV)
1264 	/*
1265 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1266 	 * attribute to already mapped kernel pages. Do that only if SVS is
1267 	 * disabled.
1268 	 *
1269 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1270 	 * happens later in cpu_init().
1271 	 */
1272 #ifdef SVS
1273 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1274 #else
1275 	if (cpu_feature[0] & CPUID_PGE) {
1276 #endif
1277 		pmap_pg_g = PTE_G;
1278 		pmap_remap_global();
1279 	}
1280 #endif
1281 
1282 #ifndef XENPV
1283 	/*
1284 	 * Enable large pages if they are supported.
1285 	 */
1286 	if (cpu_feature[0] & CPUID_PSE) {
1287 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1288 		pmap_largepages = 1;	/* enable software */
1289 
1290 		/*
1291 		 * The TLB must be flushed after enabling large pages on Pentium
1292 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1293 		 * Software Developer's Manual, Volume 3: System Programming".
1294 		 */
1295 		tlbflushg();
1296 
1297 		/* Remap the kernel. */
1298 		pmap_remap_largepages();
1299 	}
1300 	pmap_init_lapic();
1301 #endif /* !XENPV */
1302 
1303 #ifdef __HAVE_PCPU_AREA
1304 	pmap_init_pcpu();
1305 #endif
1306 
1307 #ifdef __HAVE_DIRECT_MAP
1308 	pmap_init_directmap(kpm);
1309 #else
1310 	pmap_vpage_cpualloc(&cpu_info_primary);
1311 
1312 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1313 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1314 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1315 	} else { /* amd64 */
1316 		/*
1317 		 * zero_pte is stuck at the end of mapped space for the kernel
1318 		 * image (disjunct from kva space). This is done so that it
1319 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1320 		 * when it's called for the first time.
1321 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1322 		 */
1323 #ifdef XENPV
1324 		/* early_zerop initialized in xen_locore() */
1325 #else
1326 		early_zerop = (void *)bootspace.spareva;
1327 #endif
1328 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1329 	}
1330 #endif
1331 
1332 #if defined(XENPV) && defined(__x86_64__)
1333 	extern vaddr_t xen_dummy_page;
1334 	paddr_t xen_dummy_user_pgd;
1335 
1336 	/*
1337 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1338 	 * Xen will still consider it active. So we set user PGD to this one
1339 	 * to lift all protection on the now inactive page tables set.
1340 	 */
1341 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1342 
1343 	/* Zero-fill it; the fewer checks Xen has to make, the better. */
1344 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1345 	/* Mark read-only */
1346 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1347 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1348 	    UVMF_INVLPG);
1349 	/* Pin as L4 */
1350 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1351 #endif
1352 
1353 	/*
1354 	 * Allocate space for the IDT, GDT and LDT.
1355 	 */
1356 	idt_vaddr = pmap_bootstrap_valloc(1);
1357 	idt_paddr = pmap_bootstrap_palloc(1);
1358 
1359 	gdt_vaddr = pmap_bootstrap_valloc(1);
1360 	gdt_paddr = pmap_bootstrap_palloc(1);
1361 
1362 #ifdef __HAVE_PCPU_AREA
1363 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1364 #else
1365 	ldt_vaddr = pmap_bootstrap_valloc(1);
1366 #endif
1367 	ldt_paddr = pmap_bootstrap_palloc(1);
1368 
1369 #if !defined(__x86_64__)
1370 	/* pentium f00f bug stuff */
1371 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1372 #endif
1373 
1374 #if defined(XENPVHVM)
1375 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1376 	extern paddr_t HYPERVISOR_shared_info_pa;
1377 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1378 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1379 
1380 	if (vm_guest != VM_GUEST_XENPVH) {
1381 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1382 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1383 	}
1384 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1385 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1386 #endif
1387 	/*
1388 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1389 	 */
1390 	virtual_avail = reserve_dumppages(virtual_avail);
1391 
1392 	/*
1393 	 * Init the global lock and global list.
1394 	 */
1395 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1396 	LIST_INIT(&pmaps);
1397 
1398 	/*
1399 	 * Ensure the TLB is sync'd with reality by flushing it...
1400 	 */
1401 	tlbflushg();
1402 
1403 	/*
1404 	 * Calculate pmap_maxkvaddr from nkptp[].
1405 	 */
1406 	kva = VM_MIN_KERNEL_ADDRESS;
1407 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1408 		kva += nkptp[i] * nbpd[i];
1409 	}
1410 	pmap_maxkvaddr = kva;
1411 }
1412 
1413 #ifndef XENPV
1414 static void
1415 pmap_init_lapic(void)
1416 {
1417 	/*
1418 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1419 	 * x86 implementation relies a lot on this address to be valid; so just
1420 	 * allocate a fake physical page that will be kentered into
1421 	 * local_apic_va by machdep.
1422 	 *
1423 	 * If the LAPIC is present, the va will be remapped somewhere else
1424 	 * later in lapic_map.
1425 	 */
1426 	local_apic_va = pmap_bootstrap_valloc(1);
1427 	local_apic_pa = pmap_bootstrap_palloc(1);
1428 }
1429 #endif
1430 
1431 #ifdef __x86_64__
1432 static size_t
1433 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1434 {
1435 	size_t npages;
1436 	npages = (roundup(endva, pgsz) / pgsz) -
1437 	    (rounddown(startva, pgsz) / pgsz);
1438 	return npages;
1439 }
1440 #endif
1441 
1442 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1443 static inline void
1444 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1445 {
1446 	size_t sslot = slotspace.area[type].sslot;
1447 	size_t nslot = slotspace.area[type].nslot;
1448 
1449 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1450 }
1451 #endif
1452 
1453 #ifdef __x86_64__
1454 /*
1455  * Randomize the location of an area. We count the holes in the VM space. We
1456  * randomly select one hole, and then randomly select an area within that hole.
1457  * Finally we update the associated entry in the slotspace structure.
1458  */
1459 vaddr_t
1460 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1461     vaddr_t randva)
1462 {
1463 	struct {
1464 		int start;
1465 		int end;
1466 	} holes[SLSPACE_NAREAS+1];
1467 	size_t i, nholes, hole;
1468 	size_t startsl, endsl, nslots, winsize;
1469 	vaddr_t startva, va;
1470 
1471 	sz = roundup(sz, align);
1472 
1473 	/*
1474 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1475 	 * an area that crosses slots:
1476 	 *     +------+------+------+
1477 	 *     | Slot | Slot | Slot |
1478 	 *     +------+------+------+
1479 	 *        [Chosen Area]
1480 	 * And in that case we must take into account the additional slot
1481 	 * consumed.
1482 	 */
1483 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
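	/*
	 * Worked example (illustrative, with NBPD_L4 == 512GB as on amd64):
	 * for sz = 600GB, nslots = roundup(600 + 512, 512) / 512 = 3, which
	 * covers the worst case of a 600GB area straddling two slot
	 * boundaries and therefore touching three slots.
	 */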
1484 
1485 	/* Get the holes. */
1486 	nholes = 0;
1487 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1488 	while (1) {
1489 		/*
1490 		 * Find the first occupied slot after the current one.
1491 		 * The area between the two is a hole.
1492 		 */
1493 		size_t minsslot = 512;
1494 		size_t minnslot = 0;
1495 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1496 			if (!slotspace.area[i].active)
1497 				continue;
1498 			if (slotspace.area[i].sslot >= curslot &&
1499 			    slotspace.area[i].sslot < minsslot) {
1500 				minsslot = slotspace.area[i].sslot;
1501 				minnslot = slotspace.area[i].nslot;
1502 			}
1503 		}
1504 
1505 		/* No hole anymore, stop here. */
1506 		if (minsslot == 512) {
1507 			break;
1508 		}
1509 
1510 		/* Register the hole. */
1511 		if (minsslot - curslot >= nslots) {
1512 			holes[nholes].start = curslot;
1513 			holes[nholes].end = minsslot;
1514 			nholes++;
1515 		}
1516 
1517 		/* Skip that hole, and iterate again. */
1518 		curslot = minsslot + minnslot;
1519 	}
1520 
1521 	if (nholes == 0) {
1522 		panic("%s: impossible", __func__);
1523 	}
1524 
1525 	/* Select a hole. */
1526 	hole = randhole;
1527 #ifdef NO_X86_ASLR
1528 	hole = 0;
1529 #endif
1530 	hole %= nholes;
1531 	startsl = holes[hole].start;
1532 	endsl = holes[hole].end;
1533 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1534 
1535 	/* Select an area within the hole. */
1536 	va = randva;
1537 #ifdef NO_X86_ASLR
1538 	va = 0;
1539 #endif
1540 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1541 	va %= winsize;
1542 	va = rounddown(va, align);
1543 	va += startva;
1544 
1545 	/* Update the entry. */
1546 	slotspace.area[type].sslot = pl4_i(va);
1547 	slotspace.area[type].nslot =
1548 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1549 	slotspace.area[type].active = true;
1550 
1551 	return va;
1552 }
1553 #endif
1554 
1555 #ifdef __HAVE_PCPU_AREA
1556 static void
1557 pmap_init_pcpu(void)
1558 {
1559 	const vaddr_t startva = PMAP_PCPU_BASE;
1560 	size_t nL4e, nL3e, nL2e, nL1e;
1561 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1562 	paddr_t pa;
1563 	vaddr_t endva;
1564 	vaddr_t tmpva;
1565 	pt_entry_t *pte;
1566 	size_t size;
1567 	int i;
1568 
1569 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1570 
1571 	size = sizeof(struct pcpu_area);
1572 
1573 	endva = startva + size;
1574 
1575 	/* We will use this temporary va. */
1576 	tmpva = bootspace.spareva;
1577 	pte = PTE_BASE + pl1_i(tmpva);
1578 
1579 	/* Build L4 */
1580 	L4e_idx = pl4_i(startva);
1581 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1582 	KASSERT(nL4e == 1);
1583 	for (i = 0; i < nL4e; i++) {
1584 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1585 
1586 		pa = pmap_bootstrap_palloc(1);
1587 		*pte = (pa & PTE_FRAME) | pteflags;
1588 		pmap_update_pg(tmpva);
1589 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1590 
1591 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1592 	}
1593 
1594 	/* Build L3 */
1595 	L3e_idx = pl3_i(startva);
1596 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1597 	for (i = 0; i < nL3e; i++) {
1598 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1599 
1600 		pa = pmap_bootstrap_palloc(1);
1601 		*pte = (pa & PTE_FRAME) | pteflags;
1602 		pmap_update_pg(tmpva);
1603 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1604 
1605 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1606 	}
1607 
1608 	/* Build L2 */
1609 	L2e_idx = pl2_i(startva);
1610 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1611 	for (i = 0; i < nL2e; i++) {
1612 
1613 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1614 
1615 		pa = pmap_bootstrap_palloc(1);
1616 		*pte = (pa & PTE_FRAME) | pteflags;
1617 		pmap_update_pg(tmpva);
1618 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1619 
1620 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1621 	}
1622 
1623 	/* Build L1 */
1624 	L1e_idx = pl1_i(startva);
1625 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1626 	for (i = 0; i < nL1e; i++) {
1627 		/*
1628 		 * Nothing to do, the PTEs will be entered via
1629 		 * pmap_kenter_pa.
1630 		 */
1631 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1632 	}
1633 
1634 	*pte = 0;
1635 	pmap_update_pg(tmpva);
1636 
1637 	pcpuarea = (struct pcpu_area *)startva;
1638 
1639 	tlbflush();
1640 }
1641 #endif
1642 
1643 #ifdef __HAVE_DIRECT_MAP
1644 static void
1645 randomize_hole(size_t *randholep, vaddr_t *randvap)
1646 {
1647 	struct nist_hash_drbg drbg;
1648 	uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1649 	const char p[] = "x86/directmap";
1650 	int error;
1651 
1652 	entropy_extract(seed, sizeof(seed), 0);
1653 
1654 	error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1655 	    /*nonce*/NULL, 0,
1656 	    /*personalization*/p, strlen(p));
1657 	KASSERTMSG(error == 0, "error=%d", error);
1658 
1659 	error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1660 	    /*additional*/NULL, 0);
1661 	KASSERTMSG(error == 0, "error=%d", error);
1662 
1663 	error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1664 	    /*additional*/NULL, 0);
1665 	KASSERTMSG(error == 0, "error=%d", error);
1666 
1667 	explicit_memset(seed, 0, sizeof(seed));
1668 	explicit_memset(&drbg, 0, sizeof(drbg));
1669 }
1670 
1671 /*
1672  * Create the amd64 direct map. Called only once at boot time. We map all of
1673  * the physical memory contiguously using 2MB large pages, with RW permissions.
1674  * However there is a hole: the kernel is mapped with RO permissions.
1675  */
1676 static void
1677 pmap_init_directmap(struct pmap *kpm)
1678 {
1679 	extern phys_ram_seg_t mem_clusters[];
1680 	extern int mem_cluster_cnt;
1681 
1682 	vaddr_t startva;
1683 	size_t nL4e, nL3e, nL2e;
1684 	size_t L4e_idx, L3e_idx, L2e_idx;
1685 	size_t spahole, epahole;
1686 	paddr_t lastpa, pa;
1687 	vaddr_t endva;
1688 	vaddr_t tmpva;
1689 	pt_entry_t *pte;
1690 	phys_ram_seg_t *mc;
1691 	int i;
1692 	size_t randhole;
1693 	vaddr_t randva;
1694 
1695 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1696 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1697 
1698 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1699 
1700 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1701 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1702 
1703 	/* Get the last physical address available */
1704 	lastpa = 0;
1705 	for (i = 0; i < mem_cluster_cnt; i++) {
1706 		mc = &mem_clusters[i];
1707 		lastpa = MAX(lastpa, mc->start + mc->size);
1708 	}
1709 
1710 	/*
1711 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1712 	 */
1713 	if (lastpa > MAXPHYSMEM) {
1714 		panic("pmap_init_directmap: lastpa incorrect");
1715 	}
1716 
1717 	randomize_hole(&randhole, &randva);
1718 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1719 	    randhole, randva);
1720 	endva = startva + lastpa;
1721 
1722 	/* We will use this temporary va. */
1723 	tmpva = bootspace.spareva;
1724 	pte = PTE_BASE + pl1_i(tmpva);
1725 
1726 	/* Build L4 */
1727 	L4e_idx = pl4_i(startva);
1728 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1729 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1730 	for (i = 0; i < nL4e; i++) {
1731 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1732 
1733 		pa = pmap_bootstrap_palloc(1);
1734 		*pte = (pa & PTE_FRAME) | pteflags;
1735 		pmap_update_pg(tmpva);
1736 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1737 
1738 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1739 	}
1740 
1741 	/* Build L3 */
1742 	L3e_idx = pl3_i(startva);
1743 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1744 	for (i = 0; i < nL3e; i++) {
1745 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1746 
1747 		pa = pmap_bootstrap_palloc(1);
1748 		*pte = (pa & PTE_FRAME) | pteflags;
1749 		pmap_update_pg(tmpva);
1750 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1751 
1752 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1753 	}
1754 
1755 	/* Build L2 */
1756 	L2e_idx = pl2_i(startva);
1757 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1758 	for (i = 0; i < nL2e; i++) {
1759 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1760 
1761 		pa = (paddr_t)(i * NBPD_L2);
1762 
1763 		if (spahole <= pa && pa < epahole) {
1764 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1765 			    PTE_PS | pmap_pg_g;
1766 		} else {
1767 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1768 			    PTE_PS | pmap_pg_g;
1769 		}
1770 	}
1771 
1772 	*pte = 0;
1773 	pmap_update_pg(tmpva);
1774 
1775 	pmap_direct_base = startva;
1776 	pmap_direct_end = endva;
1777 
1778 	tlbflush();
1779 }
1780 #endif /* __HAVE_DIRECT_MAP */
1781 
1782 #if !defined(XENPV)
1783 /*
1784  * Remap all of the virtual pages created so far with the PTE_G bit.
1785  */
1786 static void
1787 pmap_remap_global(void)
1788 {
1789 	vaddr_t kva, kva_end;
1790 	unsigned long p1i;
1791 	size_t i;
1792 
1793 	/* head */
1794 	kva = bootspace.head.va;
1795 	kva_end = kva + bootspace.head.sz;
1796 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1797 		p1i = pl1_i(kva);
1798 		if (pmap_valid_entry(PTE_BASE[p1i]))
1799 			PTE_BASE[p1i] |= pmap_pg_g;
1800 	}
1801 
1802 	/* kernel segments */
1803 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1804 		if (bootspace.segs[i].type == BTSEG_NONE) {
1805 			continue;
1806 		}
1807 		kva = bootspace.segs[i].va;
1808 		kva_end = kva + bootspace.segs[i].sz;
1809 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1810 			p1i = pl1_i(kva);
1811 			if (pmap_valid_entry(PTE_BASE[p1i]))
1812 				PTE_BASE[p1i] |= pmap_pg_g;
1813 		}
1814 	}
1815 
1816 	/* boot space */
1817 	kva = bootspace.boot.va;
1818 	kva_end = kva + bootspace.boot.sz;
1819 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1820 		p1i = pl1_i(kva);
1821 		if (pmap_valid_entry(PTE_BASE[p1i]))
1822 			PTE_BASE[p1i] |= pmap_pg_g;
1823 	}
1824 }
1825 #endif
1826 
1827 #ifndef XENPV
1828 /*
1829  * Remap several kernel segments with large pages. We cover as many pages as we
1830  * can. Called only once at boot time, if the CPU supports large pages.
1831  */
1832 static void
1833 pmap_remap_largepages(void)
1834 {
1835 	pd_entry_t *pde;
1836 	vaddr_t kva, kva_end;
1837 	paddr_t pa;
1838 	size_t i;
1839 
1840 	/* Remap the kernel text using large pages. */
1841 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1842 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1843 			continue;
1844 		}
1845 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
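		/* roundup() can only go backwards if it wrapped; nothing to remap then. */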
1846 		if (kva < bootspace.segs[i].va) {
1847 			continue;
1848 		}
1849 		kva_end = rounddown(bootspace.segs[i].va +
1850 			bootspace.segs[i].sz, NBPD_L2);
1851 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1852 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1853 			pde = &L2_BASE[pl2_i(kva)];
1854 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1855 			tlbflushg();
1856 		}
1857 	}
1858 
1859 	/* Remap the kernel rodata using large pages. */
1860 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1861 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1862 			continue;
1863 		}
1864 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1865 		if (kva < bootspace.segs[i].va) {
1866 			continue;
1867 		}
1868 		kva_end = rounddown(bootspace.segs[i].va +
1869 			bootspace.segs[i].sz, NBPD_L2);
1870 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1871 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1872 			pde = &L2_BASE[pl2_i(kva)];
1873 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1874 			tlbflushg();
1875 		}
1876 	}
1877 
1878 	/* Remap the kernel data+bss using large pages. */
1879 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1880 		if (bootspace.segs[i].type != BTSEG_DATA) {
1881 			continue;
1882 		}
1883 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1884 		if (kva < bootspace.segs[i].va) {
1885 			continue;
1886 		}
1887 		kva_end = rounddown(bootspace.segs[i].va +
1888 			bootspace.segs[i].sz, NBPD_L2);
1889 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1890 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1891 			pde = &L2_BASE[pl2_i(kva)];
1892 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1893 			tlbflushg();
1894 		}
1895 	}
1896 }
1897 #endif /* !XENPV */
1898 
1899 /*
1900  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1901  * to manage mappings.
1902  */
1903 void
1904 pmap_init(void)
1905 {
1906 	int flags;
1907 
1908 	/*
1909 	 * initialize caches.
1910 	 */
1911 
1912 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1913 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1914 
1915 #ifdef XENPV
1916 	/*
1917 	 * pool_cache(9) should not touch cached objects, since they
1918 	 * are pinned on xen and R/O for the domU
1919 	 */
1920 	flags = PR_NOTOUCH;
1921 #else
1922 	flags = 0;
1923 #endif
1924 
1925 #ifdef PAE
1926 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1927 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1928 #else
1929 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1930 	    "pdppl", NULL, IPL_NONE);
1931 #endif
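	/*
	 * PV pages are page-sized and page-aligned so that pmap_free_pv()
	 * can recover the owning pv_page with trunc_page().
	 */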
1932 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1933 	     0, 0, "pvpage", &pool_allocator_kmem,
1934 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1935 
1936 	pmap_tlb_init();
1937 
1938 	/* XXX: cpu_hatch() runs only on secondary CPUs, so do the boot CPU here. */
1939 	pmap_tlb_cpu_init(curcpu());
1940 
1941 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1942 	    NULL, "x86", "io bitmap copy");
1943 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1944 	    NULL, "x86", "ldt sync");
1945 
1946 	/*
1947 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1948 	 * to hang a tree of pv_entry records.  Dynamically allocated
1949 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1950 	 * usual case is embedded), so cop out and use a single RB tree
1951 	 * to cover them.
1952 	 */
1953 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1954 
1955 	/*
1956 	 * done: pmap module is up (and ready for business)
1957 	 */
1958 
1959 	pmap_initialized = true;
1960 }
1961 
1962 #ifndef XENPV
1963 /*
1964  * pmap_cpu_init_late: perform late per-CPU initialization.
1965  */
1966 void
1967 pmap_cpu_init_late(struct cpu_info *ci)
1968 {
1969 	/*
1970 	 * The BP already has its own PD page, allocated during early
1971 	 * MD startup.
1972 	 */
1973 	if (ci == &cpu_info_primary)
1974 		return;
1975 #ifdef PAE
1976 	cpu_alloc_l3_page(ci);
1977 #endif
1978 }
1979 #endif
1980 
1981 #ifndef __HAVE_DIRECT_MAP
1982 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1983 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1984 
1985 static void
1986 pmap_vpage_cpualloc(struct cpu_info *ci)
1987 {
1988 	bool primary = (ci == &cpu_info_primary);
1989 	size_t i, npages;
1990 	vaddr_t vabase;
1991 	vsize_t vrange;
1992 
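	/*
	 * The vpage PTEs for one CPU span exactly one cache line (see the
	 * CTASSERTs above); combined with the alignment check below, this
	 * keeps each CPU's vpage PTEs on a cache line of their own.
	 */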
1993 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1994 	KASSERT(npages >= VPAGE_MAX);
1995 	vrange = npages * PAGE_SIZE;
1996 
1997 	if (primary) {
1998 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1999 			/* Waste some pages to align properly */
2000 		}
2001 		/* The base is aligned, allocate the rest (contiguous) */
2002 		pmap_bootstrap_valloc(npages - 1);
2003 	} else {
2004 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2005 		    UVM_KMF_VAONLY);
2006 		if (vabase == 0) {
2007 			panic("%s: failed to allocate tmp VA for CPU %d\n",
2008 			    __func__, cpu_index(ci));
2009 		}
2010 	}
2011 
2012 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2013 
2014 	for (i = 0; i < VPAGE_MAX; i++) {
2015 		ci->vpage[i] = vabase + i * PAGE_SIZE;
2016 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2017 	}
2018 }
2019 
2020 void
2021 pmap_vpage_cpu_init(struct cpu_info *ci)
2022 {
2023 	if (ci == &cpu_info_primary) {
2024 		/* cpu0 already taken care of in pmap_bootstrap */
2025 		return;
2026 	}
2027 
2028 	pmap_vpage_cpualloc(ci);
2029 }
2030 #endif
2031 
2032 /*
2033  * p v _ e n t r y   f u n c t i o n s
2034  */
2035 
2036 /*
2037  * pmap_pvp_dtor: pool_cache constructor for PV pages.
2038  * pmap_pvp_ctor: pool_cache constructor for PV pages.
2039 static int
2040 pmap_pvp_ctor(void *arg, void *obj, int flags)
2041 {
2042 	struct pv_page *pvp = (struct pv_page *)obj;
2043 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
2044 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
2045 
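	/*
	 * Layout: the pv_page header occupies the first pv_entry-sized slot
	 * of the page, followed by PVE_PER_PVP usable entries; hence the
	 * size and alignment assertions below.
	 */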
2046 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2047 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2048 
2049 	LIST_INIT(&pvp->pvp_pves);
2050 	pvp->pvp_nfree = PVE_PER_PVP;
2051 	pvp->pvp_pmap = NULL;
2052 
2053 	for (; pve < maxpve; pve++) {
2054 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2055 	}
2056 
2057 	return 0;
2058 }
2059 
2060 /*
2061  * pmap_pvp_dtor: pool_cache destructor for PV pages.
2062  */
2063 static void
2064 pmap_pvp_dtor(void *arg, void *obj)
2065 {
2066 	struct pv_page *pvp __diagused = obj;
2067 
2068 	KASSERT(pvp->pvp_pmap == NULL);
2069 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2070 }
2071 
2072 /*
2073  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
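 *
 * The per-pmap pv_page lists are named for how many free entries they
 * hold: pm_pvp_full (all free), pm_pvp_part (partially used) and
 * pm_pvp_empty (none free).  Allocation takes from a partial page
 * first, then a full page, and only then the pool cache.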
2074  */
2075 static struct pv_entry *
2076 pmap_alloc_pv(struct pmap *pmap)
2077 {
2078 	struct pv_entry *pve;
2079 	struct pv_page *pvp;
2080 
2081 	KASSERT(mutex_owned(&pmap->pm_lock));
2082 
2083 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2084 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2085 			LIST_REMOVE(pvp, pvp_list);
2086 		} else {
2087 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2088 		}
2089 		if (__predict_false(pvp == NULL)) {
2090 			return NULL;
2091 		}
2092 		/* full -> part */
2093 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2094 		pvp->pvp_pmap = pmap;
2095 	}
2096 
2097 	KASSERT(pvp->pvp_pmap == pmap);
2098 	KASSERT(pvp->pvp_nfree > 0);
2099 
2100 	pve = LIST_FIRST(&pvp->pvp_pves);
2101 	LIST_REMOVE(pve, pve_list);
2102 	pvp->pvp_nfree--;
2103 
2104 	if (__predict_false(pvp->pvp_nfree == 0)) {
2105 		/* part -> empty */
2106 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2107 		LIST_REMOVE(pvp, pvp_list);
2108 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2109 	} else {
2110 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2111 	}
2112 
2113 	return pve;
2114 }
2115 
2116 /*
2117  * pmap_free_pv: delayed free of a PV entry.
2118  */
2119 static void
2120 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2121 {
2122 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2123 
2124 	KASSERT(mutex_owned(&pmap->pm_lock));
2125 	KASSERT(pvp->pvp_pmap == pmap);
2126 	KASSERT(pvp->pvp_nfree >= 0);
2127 
2128 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2129 	pvp->pvp_nfree++;
2130 
2131 	if (__predict_false(pvp->pvp_nfree == 1)) {
2132 		/* empty -> part */
2133 		LIST_REMOVE(pvp, pvp_list);
2134 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2135 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2136 		/* part -> full */
2137 		LIST_REMOVE(pvp, pvp_list);
2138 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2139 	}
2140 }
2141 
2142 /*
2143  * pmap_drain_pv: free full PV pages.
2144  */
2145 static void
2146 pmap_drain_pv(struct pmap *pmap)
2147 {
2148 	struct pv_page *pvp;
2149 
2150 	KASSERT(mutex_owned(&pmap->pm_lock));
2151 
2152 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2153 		LIST_REMOVE(pvp, pvp_list);
2154 		KASSERT(pvp->pvp_pmap == pmap);
2155 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2156 		pvp->pvp_pmap = NULL;
2157 		pool_cache_put(&pmap_pvp_cache, pvp);
2158 	}
2159 }
2160 
2161 /*
2162  * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page
2163  * pmap_check_pv: verify that a {VA, PTP} pair is tracked (or not) by the page, as expected
2164 static void
2165 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2166     vaddr_t va, bool tracked)
2167 {
2168 #ifdef DEBUG
2169 	struct pv_pte *pvpte;
2170 
2171 	PMAP_CHECK_PP(pp);
2172 
2173 	mutex_spin_enter(&pp->pp_lock);
2174 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2175 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2176 			break;
2177 		}
2178 	}
2179 	mutex_spin_exit(&pp->pp_lock);
2180 
2181 	if (pvpte && !tracked) {
2182 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2183 	} else if (!pvpte && tracked) {
2184 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2185 	}
2186 #endif
2187 }
2188 
2189 /*
2190  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2191  *
2192  * => pmap must be locked
2193  */
2194 static struct pv_entry *
2195 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2196     const rb_tree_t *tree, const vaddr_t va)
2197 {
2198 	struct pv_entry *pve;
2199 	rb_node_t *node;
2200 
2201 	/*
2202 	 * An inlined lookup tailored to exactly what's needed here; it is
2203 	 * quite a bit faster than using rb_tree_find_node().
2204 	 */
2205 	for (node = tree->rbt_root;;) {
2206 		if (__predict_false(RB_SENTINEL_P(node))) {
2207 			return NULL;
2208 		}
2209 		pve = (struct pv_entry *)
2210 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2211 		if (pve->pve_pte.pte_va == va) {
2212 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2213 			return pve;
2214 		}
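		/* Descend using the comparison result as the child index. */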
2215 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2216 	}
2217 }
2218 
2219 /*
2220  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2221  *
2222  * => a PV entry must be known present (doesn't check for existence)
2223  * => pmap must be locked
2224  */
2225 static struct pv_entry *
2226 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2227     const struct pmap_page * const old_pp, const vaddr_t va)
2228 {
2229 	struct pv_entry *pve;
2230 	const rb_tree_t *tree;
2231 
2232 	KASSERT(mutex_owned(&pmap->pm_lock));
2233 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2234 
2235 	/*
2236 	 * [This mostly deals with the case of process-private pages, i.e.
2237 	 * anonymous memory allocations or COW.]
2238 	 *
2239 	 * If the page is tracked with an embedded entry then the tree
2240 	 * lookup can be avoided.  It's safe to check for this specific
2241 	 * set of values without pp_lock because both will only ever be
2242 	 * set together for this pmap.
2243 	 *
2244 	 */
2245 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2246 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2247 		return NULL;
2248 	}
2249 
2250 	/*
2251 	 * [This mostly deals with shared mappings, for example shared libs
2252 	 * and executables.]
2253 	 *
2254 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2255 	 * look at the lowest numbered node in the tree first.  The tree is
2256 	 * known non-empty because of the check above.  For short lived
2257 	 * processes where pmap_remove() isn't used much this gets close to
2258 	 * a 100% hit rate.
2259 	 */
2260 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2261 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2262 	pve = (struct pv_entry *)
2263 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2264 	    offsetof(struct pv_entry, pve_rb));
2265 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2266 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2267 		return pve;
2268 	}
2269 
2270 	/* Search the RB tree for the key (uncommon). */
2271 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2272 }
2273 
2274 /*
2275  * pmap_enter_pv: enter a mapping onto a pmap_page list
2276  *
2277  * => pmap must be locked
2278  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2279  */
2280 static int
2281 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2282     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2283     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2284 {
2285 	struct pv_entry *pve;
2286 	int error;
2287 
2288 	KASSERT(mutex_owned(&pmap->pm_lock));
2289 	KASSERT(ptp_to_pmap(ptp) == pmap);
2290 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2291 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2292 	PMAP_CHECK_PP(pp);
2293 
2294 	/*
2295 	 * If entering the same page and it's already tracked with an
2296 	 * embedded entry, we can avoid the expense below.  It's safe
2297 	 * to check for this very specific set of values without a lock
2298 	 * because both will only ever be set together for this pmap.
2299 	 */
2300 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2301 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2302 		*samepage = true;
2303 		pmap_check_pv(pmap, ptp, pp, va, true);
2304 		return 0;
2305 	}
2306 
2307 	/*
2308 	 * Check for an existing dynamic mapping at this address.  If it's
2309 	 * for the same page, then it will be reused and nothing needs to be
2310 	 * changed.
2311 	 */
2312 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2313 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2314 		*samepage = true;
2315 		pmap_check_pv(pmap, ptp, pp, va, true);
2316 		return 0;
2317 	}
2318 
2319 	/*
2320 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2321 	 * case it's needed; won't know for sure until the lock is taken.
2322 	 */
2323 	if (pmap->pm_pve == NULL) {
2324 		pmap->pm_pve = pmap_alloc_pv(pmap);
2325 	}
2326 
2327 	error = 0;
2328 	pmap_check_pv(pmap, ptp, pp, va, false);
2329 	mutex_spin_enter(&pp->pp_lock);
2330 	if (!pv_pte_embedded(pp)) {
2331 		/*
2332 		 * Embedded PV tracking available - easy.
2333 		 */
2334 		pp->pp_pte.pte_ptp = ptp;
2335 		pp->pp_pte.pte_va = va;
2336 		*new_embedded = true;
2337 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2338 		/*
2339 		 * No memory.
2340 		 */
2341 		error = ENOMEM;
2342 	} else {
2343 		/*
2344 		 * Install new pv_entry on the page.
2345 		 */
2346 		pve = pmap->pm_pve;
2347 		pmap->pm_pve = NULL;
2348 		*new_pve = pve;
2349 		pve->pve_pte.pte_ptp = ptp;
2350 		pve->pve_pte.pte_va = va;
2351 		pve->pve_pp = pp;
2352 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2353 	}
2354 	mutex_spin_exit(&pp->pp_lock);
2355 	if (error == 0) {
2356 		pmap_check_pv(pmap, ptp, pp, va, true);
2357 	}
2358 
2359 	return error;
2360 }
2361 
2362 /*
2363  * pmap_remove_pv: try to remove a mapping from a pv_list
2364  *
2365  * => pmap must be locked
2366  * => removes dynamic entries from tree and frees them
2367  * => caller should adjust ptp's wire_count and free PTP if needed
2368  */
2369 static void
2370 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2371     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2372 {
2373 	rb_tree_t *tree = (ptp != NULL ?
2374 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2375 
2376 	KASSERT(mutex_owned(&pmap->pm_lock));
2377 	KASSERT(ptp_to_pmap(ptp) == pmap);
2378 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2379 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2380 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2381 
2382 	pmap_check_pv(pmap, ptp, pp, va, true);
2383 
2384 	if (pve == NULL) {
2385 		mutex_spin_enter(&pp->pp_lock);
2386 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2387 		KASSERT(pp->pp_pte.pte_va == va);
2388 		pp->pp_attrs |= oattrs;
2389 		pp->pp_pte.pte_ptp = NULL;
2390 		pp->pp_pte.pte_va = 0;
2391 		mutex_spin_exit(&pp->pp_lock);
2392 	} else {
2393 		mutex_spin_enter(&pp->pp_lock);
2394 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2395 		    pp->pp_pte.pte_va != va);
2396 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2397 		KASSERT(pve->pve_pte.pte_va == va);
2398 		KASSERT(pve->pve_pp == pp);
2399 		pp->pp_attrs |= oattrs;
2400 		LIST_REMOVE(pve, pve_list);
2401 		mutex_spin_exit(&pp->pp_lock);
2402 
2403 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2404 		rb_tree_remove_node(tree, pve);
2405 #ifdef DIAGNOSTIC
2406 		memset(pve, 0, sizeof(*pve));
2407 #endif
2408 		pmap_free_pv(pmap, pve);
2409 	}
2410 
2411 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2412 	pmap_check_pv(pmap, ptp, pp, va, false);
2413 }
2414 
2415 /*
2416  * p t p   f u n c t i o n s
2417  */
2418 
2419 static struct vm_page *
2420 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2421 {
2422 	int lidx = level - 1;
2423 	off_t off = ptp_va2o(va, level);
2424 	struct vm_page *pg;
2425 
2426 	KASSERT(mutex_owned(&pmap->pm_lock));
2427 
2428 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2429 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2430 		pg = pmap->pm_ptphint[lidx];
2431 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2432 		return pg;
2433 	}
2434 	PMAP_DUMMY_LOCK(pmap);
2435 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2436 	PMAP_DUMMY_UNLOCK(pmap);
2437 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2438 		/* This page is queued to be freed - ignore. */
2439 		pg = NULL;
2440 	}
2441 	if (pg != NULL) {
2442 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2443 	}
2444 	pmap->pm_ptphint[lidx] = pg;
2445 	return pg;
2446 }
2447 
2448 static inline void
2449 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2450 {
2451 	int lidx;
2452 
2453 	KASSERT(ptp->wire_count <= 1);
2454 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2455 
2456 	lidx = level - 1;
2457 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2458 	if (pmap->pm_ptphint[lidx] == ptp)
2459 		pmap->pm_ptphint[lidx] = NULL;
2460 	ptp->wire_count = 0;
2461 	ptp->uanon = NULL;
2462 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2463 
2464 	/*
2465 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2466 	 * the page from the uvm_object, as that can take further locks
2467 	 * (intolerable right now because the PTEs are likely mapped in).
2468 	 * Instead mark the PTP as free and if we bump into it again, we'll
2469 	 * either ignore or reuse (depending on what's useful at the time).
2470 	 */
2471 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2472 }
2473 
2474 static void
2475 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2476 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2477 {
2478 	unsigned long index;
2479 	int level;
2480 	vaddr_t invaladdr;
2481 	pd_entry_t opde;
2482 
2483 	KASSERT(pmap != pmap_kernel());
2484 	KASSERT(mutex_owned(&pmap->pm_lock));
2485 	KASSERT(kpreempt_disabled());
2486 
2487 	level = 1;
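	/*
	 * Walk up the paging hierarchy: unhook this PTP from its parent,
	 * shoot down its mapping in the recursive PTE area, and free it.
	 * If the parent's wire count then drops to 1, the parent has become
	 * empty and is freed on the next iteration.
	 */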
2488 	do {
2489 		index = pl_i(va, level + 1);
2490 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2491 
2492 		/*
2493 		 * On Xen-amd64 or SVS, we need to sync the top level page
2494 		 * directory on each CPU.
2495 		 */
2496 #if defined(XENPV) && defined(__x86_64__)
2497 		if (level == PTP_LEVELS - 1) {
2498 			xen_kpm_sync(pmap, index);
2499 		}
2500 #elif defined(SVS)
2501 		if (svs_enabled && level == PTP_LEVELS - 1 &&
2502 		    pmap_is_user(pmap)) {
2503 			svs_pmap_sync(pmap, index);
2504 		}
2505 #endif
2506 
2507 		invaladdr = level == 1 ? (vaddr_t)ptes :
2508 		    (vaddr_t)pdes[level - 2];
2509 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2510 		    opde, TLBSHOOT_FREE_PTP);
2511 
2512 #if defined(XENPV)
2513 		pmap_tlb_shootnow();
2514 #endif
2515 
2516 		pmap_freepage(pmap, ptp, level);
2517 		if (level < PTP_LEVELS - 1) {
2518 			ptp = pmap_find_ptp(pmap, va, level + 1);
2519 			ptp->wire_count--;
2520 			if (ptp->wire_count > 1)
2521 				break;
2522 		}
2523 	} while (++level < PTP_LEVELS);
2524 	pmap_pte_flush();
2525 }
2526 
2527 /*
2528  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2529  *
2530  * => pmap should NOT be pmap_kernel()
2531  * => pmap should be locked
2532  * => we are not touching any PTEs yet, so they need not be mapped in
2533  */
2534 static int
2535 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2536     int flags, struct vm_page **resultp)
2537 {
2538 	struct vm_page *ptp;
2539 	int i, aflags;
2540 	struct uvm_object *obj;
2541 	voff_t off;
2542 
2543 	KASSERT(pmap != pmap_kernel());
2544 	KASSERT(mutex_owned(&pmap->pm_lock));
2545 
2546 	/*
2547 	 * Loop through all page table levels allocating a page
2548 	 * for any level where we don't already have one.
2549 	 */
2550 	memset(pt, 0, sizeof(*pt));
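	/*
	 * Unless the caller can tolerate failure (PMAP_CANFAIL), allow the
	 * allocation to dip into the UVM reserve; new PTPs are zeroed.
	 */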
2551 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2552 		UVM_PGA_ZERO;
2553 	for (i = PTP_LEVELS; i > 1; i--) {
2554 		obj = &pmap->pm_obj[i - 2];
2555 		off = ptp_va2o(va, i - 1);
2556 
2557 		PMAP_DUMMY_LOCK(pmap);
2558 		pt->pg[i] = uvm_pagelookup(obj, off);
2559 
2560 		if (pt->pg[i] == NULL) {
2561 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2562 			pt->alloced[i] = (pt->pg[i] != NULL);
2563 		} else if (pt->pg[i]->wire_count == 0) {
2564 			/* This page was queued to be freed; dequeue it. */
2565 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2566 			pt->alloced[i] = true;
2567 		}
2568 		PMAP_DUMMY_UNLOCK(pmap);
2569 		if (pt->pg[i] == NULL) {
2570 			pmap_unget_ptp(pmap, pt);
2571 			return ENOMEM;
2572 		} else if (pt->alloced[i]) {
2573 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2574 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2575 			    &pmap_rbtree_ops);
2576 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2577 		}
2578 	}
2579 	ptp = pt->pg[2];
2580 	KASSERT(ptp != NULL);
2581 	*resultp = ptp;
2582 	pmap->pm_ptphint[0] = ptp;
2583 	return 0;
2584 }
2585 
2586 /*
2587  * pmap_install_ptp: install any freshly allocated PTPs
2588  *
2589  * => pmap should NOT be pmap_kernel()
2590  * => pmap should be locked
2591  * => PTEs must be mapped
2592  * => preemption must be disabled
2593  */
2594 static void
2595 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2596     pd_entry_t * const *pdes)
2597 {
2598 	struct vm_page *ptp;
2599 	unsigned long index;
2600 	pd_entry_t *pva;
2601 	paddr_t pa;
2602 	int i;
2603 
2604 	KASSERT(pmap != pmap_kernel());
2605 	KASSERT(mutex_owned(&pmap->pm_lock));
2606 	KASSERT(kpreempt_disabled());
2607 
2608 	/*
2609 	 * Now that we have all the pages looked up or allocated,
2610 	 * loop through again installing any new ones into the tree.
2611 	 */
2612 	for (i = PTP_LEVELS; i > 1; i--) {
2613 		index = pl_i(va, i);
2614 		pva = pdes[i - 2];
2615 
2616 		if (pmap_valid_entry(pva[index])) {
2617 			KASSERT(!pt->alloced[i]);
2618 			continue;
2619 		}
2620 
2621 		ptp = pt->pg[i];
2622 		ptp->flags &= ~PG_BUSY; /* never busy */
2623 		ptp->wire_count = 1;
2624 		pmap->pm_ptphint[i - 2] = ptp;
2625 		pa = VM_PAGE_TO_PHYS(ptp);
2626 		pmap_pte_set(&pva[index], (pd_entry_t)
2627 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2628 
2629 		/*
2630 		 * On Xen-amd64 or SVS, we need to sync the top level page
2631 		 * directory on each CPU.
2632 		 */
2633 #if defined(XENPV) && defined(__x86_64__)
2634 		if (i == PTP_LEVELS) {
2635 			xen_kpm_sync(pmap, index);
2636 		}
2637 #elif defined(SVS)
2638 		if (svs_enabled && i == PTP_LEVELS &&
2639 		    pmap_is_user(pmap)) {
2640 			svs_pmap_sync(pmap, index);
2641 		}
2642 #endif
2643 
2644 		pmap_pte_flush();
2645 		pmap_stats_update(pmap, 1, 0);
2646 
2647 		/*
2648 		 * If we're not in the top level, increase the
2649 		 * wire count of the parent page.
2650 		 */
2651 		if (i < PTP_LEVELS) {
2652 			pt->pg[i + 1]->wire_count++;
2653 		}
2654 	}
2655 }
2656 
2657 /*
2658  * pmap_unget_ptp: free unused PTPs
2659  *
2660  * => pmap should NOT be pmap_kernel()
2661  * => pmap should be locked
2662  */
2663 static void
2664 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2665 {
2666 	int i;
2667 
2668 	KASSERT(pmap != pmap_kernel());
2669 	KASSERT(mutex_owned(&pmap->pm_lock));
2670 
2671 	for (i = PTP_LEVELS; i > 1; i--) {
2672 		if (!pt->alloced[i]) {
2673 			continue;
2674 		}
2675 		KASSERT(pt->pg[i]->wire_count == 0);
2676 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2677 		pmap_freepage(pmap, pt->pg[i], i - 1);
2678 	}
2679 }
2680 
2681 /*
2682  * p m a p   l i f e c y c l e   f u n c t i o n s
2683  */
2684 
2685 /*
2686  * pmap_pdp_init: construct a new PDP.
2687  */
2688 static void
2689 pmap_pdp_init(pd_entry_t *pdir)
2690 {
2691 	paddr_t pdirpa = 0;
2692 	vaddr_t object;
2693 	int i;
2694 
2695 #if !defined(XENPV) || !defined(__x86_64__)
2696 	int npde;
2697 #endif
2698 #ifdef XENPV
2699 	int s;
2700 #endif
2701 
2702 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2703 
2704 	/*
2705 	 * NOTE: This is all done unlocked, but we will check afterwards
2706 	 * if we have raced with pmap_growkernel().
2707 	 */
2708 
2709 #if defined(XENPV) && defined(__x86_64__)
2710 	/* Fetch the physical address of the page directory */
2711 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2712 
2713 	/*
2714 	 * This pdir will NEVER be active in kernel mode, so mark
2715 	 * recursive entry invalid.
2716 	 */
2717 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2718 
2719 	/*
2720 	 * PDP constructed this way won't be for the kernel, hence we
2721 	 * don't put kernel mappings on Xen.
2722 	 *
2723 	 * But we need to make pmap_create() happy, so put a dummy
2724 	 * (without PTE_P) value at the right place.
2725 	 */
2726 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2727 	     (pd_entry_t)-1 & PTE_FRAME;
2728 #else /* XENPV && __x86_64__*/
2729 	object = (vaddr_t)pdir;
2730 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2731 		/* Fetch the physical address of the page directory */
2732 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2733 
2734 		/* Put in recursive PDE to map the PTEs */
2735 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2736 		    pmap_pg_nx;
2737 #ifndef XENPV
2738 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2739 #endif
2740 	}
2741 
2742 	/* Copy the kernel's top level PDE */
2743 	npde = nkptp[PTP_LEVELS - 1];
2744 
2745 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2746 	    npde * sizeof(pd_entry_t));
2747 
2748 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2749 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2750 		pdir[idx] = PDP_BASE[idx];
2751 	}
2752 
2753 #ifdef __HAVE_PCPU_AREA
2754 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2755 #endif
2756 #ifdef __HAVE_DIRECT_MAP
2757 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2758 #endif
2759 #ifdef KASAN
2760 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2761 #endif
2762 #ifdef KMSAN
2763 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2764 #endif
2765 #endif /* XENPV  && __x86_64__*/
2766 
2767 #ifdef XENPV
2768 	s = splvm();
2769 	object = (vaddr_t)pdir;
2770 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2771 	    VM_PROT_READ);
2772 	pmap_update(pmap_kernel());
2773 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2774 		/*
2775 		 * pin as L2/L4 page, we have to do the page with the
2776 		 * PDIR_SLOT_PTE entries last
2777 		 */
2778 #ifdef PAE
2779 		if (i == l2tol3(PDIR_SLOT_PTE))
2780 			continue;
2781 #endif
2782 
2783 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2784 #ifdef __x86_64__
2785 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2786 #else
2787 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2788 #endif
2789 	}
2790 #ifdef PAE
2791 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2792 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2793 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2794 #endif
2795 	splx(s);
2796 #endif /* XENPV */
2797 }
2798 
2799 /*
2800  * pmap_pdp_fini: destructor for the PDPs.
2801  */
2802 static void
2803 pmap_pdp_fini(pd_entry_t *pdir)
2804 {
2805 #ifdef XENPV
2806 	paddr_t pdirpa = 0;	/* XXX: GCC */
2807 	vaddr_t object = (vaddr_t)pdir;
2808 	int i;
2809 	int s = splvm();
2810 	pt_entry_t *pte;
2811 
2812 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2813 		/* fetch the physical address of the page directory. */
2814 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2815 		/* unpin page table */
2816 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2817 	}
2818 	object = (vaddr_t)pdir;
2819 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2820 		/* Set page RW again */
2821 		pte = kvtopte(object);
2822 		pmap_pte_set(pte, *pte | PTE_W);
2823 		xen_bcast_invlpg((vaddr_t)object);
2824 	}
2825 	splx(s);
2826 #endif  /* XENPV */
2827 }
2828 
2829 #ifdef PAE
2830 static void *
2831 pmap_pdp_alloc(struct pool *pp, int flags)
2832 {
2833 	return (void *)uvm_km_alloc(kernel_map,
2834 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2835 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2836 	    UVM_KMF_WIRED);
2837 }
2838 
2839 static void
2840 pmap_pdp_free(struct pool *pp, void *v)
2841 {
2842 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2843 	    UVM_KMF_WIRED);
2844 }
2845 #endif /* PAE */
2846 
2847 /*
2848  * pmap_ctor: constructor for the pmap cache.
2849  */
2850 static int
2851 pmap_ctor(void *arg, void *obj, int flags)
2852 {
2853 	struct pmap *pmap = obj;
2854 	pt_entry_t p;
2855 	int i;
2856 
2857 	KASSERT((flags & PR_WAITOK) != 0);
2858 
2859 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2860 	rw_init(&pmap->pm_dummy_lock);
2861 	kcpuset_create(&pmap->pm_cpus, true);
2862 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2863 #ifdef XENPV
2864 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2865 #endif
2866 	LIST_INIT(&pmap->pm_gc_ptp);
2867 	pmap->pm_pve = NULL;
2868 	LIST_INIT(&pmap->pm_pvp_full);
2869 	LIST_INIT(&pmap->pm_pvp_part);
2870 	LIST_INIT(&pmap->pm_pvp_empty);
2871 
2872 	/* allocate and init PDP */
2873 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2874 
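	/*
	 * pmap_pdp_init() runs unlocked; if the last kernel PDE (by the
	 * current nkptp) is zero once pmaps_lock is held, pmap_growkernel()
	 * ran in the meantime and the initialization must be redone.
	 */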
2875 	for (;;) {
2876 		pmap_pdp_init(pmap->pm_pdir);
2877 		mutex_enter(&pmaps_lock);
2878 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2879 		if (__predict_true(p != 0)) {
2880 			break;
2881 		}
2882 		mutex_exit(&pmaps_lock);
2883 	}
2884 
2885 	for (i = 0; i < PDP_SIZE; i++)
2886 		pmap->pm_pdirpa[i] =
2887 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2888 
2889 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2890 	mutex_exit(&pmaps_lock);
2891 
2892 	return 0;
2893 }
2894 
2895 /*
2896  * pmap_dtor: destructor for the pmap cache.
2897  */
2898 static void
2899 pmap_dtor(void *arg, void *obj)
2900 {
2901 	struct pmap *pmap = obj;
2902 
2903 	mutex_enter(&pmaps_lock);
2904 	LIST_REMOVE(pmap, pm_list);
2905 	mutex_exit(&pmaps_lock);
2906 
2907 	pmap_pdp_fini(pmap->pm_pdir);
2908 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2909 	mutex_destroy(&pmap->pm_lock);
2910 	rw_destroy(&pmap->pm_dummy_lock);
2911 	kcpuset_destroy(pmap->pm_cpus);
2912 	kcpuset_destroy(pmap->pm_kernel_cpus);
2913 #ifdef XENPV
2914 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2915 #endif
2916 }
2917 
2918 /*
2919  * pmap_create: create a pmap object.
2920  */
2921 struct pmap *
2922 pmap_create(void)
2923 {
2924 	struct pmap *pmap;
2925 	int i;
2926 
2927 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2928 
2929 	/* init uvm_object */
2930 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2931 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2932 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2933 		pmap->pm_ptphint[i] = NULL;
2934 	}
2935 	pmap->pm_stats.wired_count = 0;
2936 	/* count the PDP allocated below */
2937 	pmap->pm_stats.resident_count = PDP_SIZE;
2938 #if !defined(__x86_64__)
2939 	pmap->pm_hiexec = 0;
2940 #endif
2941 
2942 	/* Used by NVMM and Xen */
2943 	pmap->pm_enter = NULL;
2944 	pmap->pm_extract = NULL;
2945 	pmap->pm_remove = NULL;
2946 	pmap->pm_sync_pv = NULL;
2947 	pmap->pm_pp_remove_ent = NULL;
2948 	pmap->pm_write_protect = NULL;
2949 	pmap->pm_unwire = NULL;
2950 	pmap->pm_tlb_flush = NULL;
2951 	pmap->pm_data = NULL;
2952 
2953 	/* init the LDT */
2954 	pmap->pm_ldt = NULL;
2955 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2956 
2957 	return pmap;
2958 }
2959 
2960 /*
2961  * pmap_check_ptps: verify that none of the pmap's page table objects
2962  * have any pages allocated to them.
2963  */
2964 static void
2965 pmap_check_ptps(struct pmap *pmap)
2966 {
2967 	int i;
2968 
2969 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2970 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2971 		    "pmap %p level %d still has %d pages",
2972 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2973 	}
2974 }
2975 
2976 static void
2977 pmap_check_inuse(struct pmap *pmap)
2978 {
2979 #ifdef DEBUG
2980 	CPU_INFO_ITERATOR cii;
2981 	struct cpu_info *ci;
2982 
2983 	for (CPU_INFO_FOREACH(cii, ci)) {
2984 		if (ci->ci_pmap == pmap)
2985 			panic("destroying pmap being used");
2986 #if defined(XENPV) && defined(__x86_64__)
2987 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2988 			if (pmap->pm_pdir[i] != 0 &&
2989 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2990 				printf("pmap_destroy(%p) pmap_kernel %p "
2991 				    "curcpu %d cpu %d ci_pmap %p "
2992 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2993 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2994 				    pmap, pmap_kernel(), curcpu()->ci_index,
2995 				    ci->ci_index, ci->ci_pmap,
2996 				    i, ci->ci_kpm_pdir[i],
2997 				    i, pmap->pm_pdir[i]);
2998 				panic("%s: used pmap", __func__);
2999 			}
3000 		}
3001 #endif
3002 	}
3003 #endif /* DEBUG */
3004 }
3005 
3006 /*
3007  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
3008  * count goes to zero.
3009  *
3010  * => we can be called from pmap_unmap_ptes() with a different, unrelated
3011  *    pmap's lock held.  be careful!
3012  */
3013 void
3014 pmap_destroy(struct pmap *pmap)
3015 {
3016 	int i;
3017 
3018 	/*
3019 	 * drop reference count and verify not in use.
3020 	 */
3021 
3022 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
3023 		return;
3024 	}
3025 	pmap_check_inuse(pmap);
3026 
3027 	/*
3028 	 * handle any deferred frees.
3029 	 */
3030 
3031 	mutex_enter(&pmap->pm_lock);
3032 	if (pmap->pm_pve != NULL) {
3033 		pmap_free_pv(pmap, pmap->pm_pve);
3034 		pmap->pm_pve = NULL;
3035 	}
3036 	pmap_drain_pv(pmap);
3037 	mutex_exit(&pmap->pm_lock);
3038 	pmap_update(pmap);
3039 
3040 	/*
3041 	 * Reference count is zero, free pmap resources and then free pmap.
3042 	 */
3043 
3044 	pmap_check_ptps(pmap);
3045 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3046 
3047 #ifdef USER_LDT
3048 	if (pmap->pm_ldt != NULL) {
3049 		/*
3050 		 * No need to switch the LDT; this address space is gone,
3051 		 * nothing is using it.
3052 		 *
3053 		 * No need to lock the pmap for ldt_free (or anything else),
3054 		 * we're the last one to use it.
3055 		 */
3056 		/* XXXAD can't take cpu_lock here - fix soon. */
3057 		mutex_enter(&cpu_lock);
3058 		ldt_free(pmap->pm_ldt_sel);
3059 		mutex_exit(&cpu_lock);
3060 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3061 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3062 	}
3063 #endif
3064 
3065 	for (i = 0; i < PTP_LEVELS - 1; i++) {
3066 		uvm_obj_destroy(&pmap->pm_obj[i], false);
3067 	}
3068 	kcpuset_zero(pmap->pm_cpus);
3069 	kcpuset_zero(pmap->pm_kernel_cpus);
3070 #ifdef XENPV
3071 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3072 #endif
3073 
3074 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3075 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3076 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3077 
3078 	pmap_check_ptps(pmap);
3079 	if (__predict_false(pmap->pm_enter != NULL)) {
3080 		/* XXX make this a different cache */
3081 		pool_cache_destruct_object(&pmap_cache, pmap);
3082 	} else {
3083 		pool_cache_put(&pmap_cache, pmap);
3084 	}
3085 }
3086 
3087 /*
3088  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3089  *
3090  * => caller must hold pmap's lock
3091  * => PTP must be mapped into KVA
3092  * => must be called with kernel preemption disabled
3093  * => does as little work as possible
3094  */
3095 static void
3096 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3097     vaddr_t startva, vaddr_t blkendva)
3098 {
3099 #ifndef XENPV
3100 	struct pv_entry *pve;
3101 	struct vm_page *pg;
3102 	struct pmap_page *pp;
3103 	pt_entry_t opte;
3104 	rb_tree_t *tree;
3105 	vaddr_t va;
3106 	int wired;
3107 	uint8_t oattrs;
3108 	u_int cnt;
3109 
3110 	KASSERT(mutex_owned(&pmap->pm_lock));
3111 	KASSERT(kpreempt_disabled());
3112 	KASSERT(pmap != pmap_kernel());
3113 	KASSERT(ptp->wire_count > 1);
3114 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3115 
3116 	/*
3117 	 * Start at the lowest entered VA, and scan until there are no more
3118 	 * PTEs in the PTP.
3119 	 */
3120 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3121 	pve = RB_TREE_MIN(tree);
3122 	wired = 0;
3123 	va = (vaddr_t)ptp->uanon;
3124 	pte += ((va - startva) >> PAGE_SHIFT);
3125 
3126 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3127 		/*
3128 		 * No need for an atomic to clear the PTE.  Nothing else can
3129 		 * see the address space any more, and speculative accesses (if
3130 		 * any) won't modify it.  Therefore there's no need to
3131 		 * track the accessed/dirty bits.
3132 		 */
3133 		opte = *pte;
3134 		if (!pmap_valid_entry(opte)) {
3135 			continue;
3136 		}
3137 
3138 		/*
3139 		 * Count the PTE.  If it's not for a managed mapping
3140 		 * there's nothing more to do.
3141 		 */
3142 		cnt--;
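		/*
		 * "wired" accumulates -PTE_WIRED per wired mapping; dividing
		 * by PTE_WIRED below turns it into the wired-count delta
		 * passed to pmap_stats_update().
		 */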
3143 		wired -= (opte & PTE_WIRED);
3144 		if ((opte & PTE_PVLIST) == 0) {
3145 #ifndef DOM0OPS
3146 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3147 			    "managed page without PTE_PVLIST for %#"
3148 			    PRIxVADDR, va);
3149 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3150 			    "pv-tracked page without PTE_PVLIST for %#"
3151 			    PRIxVADDR, va);
3152 #endif
3153 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3154 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3155 			    va) == NULL);
3156 			continue;
3157 		}
3158 
3159 		/*
3160 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3161 		 * in the PTP.  If it's for this VA, take advantage of it to
3162 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3163 		 * tree by skipping to the next VA in the tree whenever
3164 		 * there is a match here.  The tree will be cleared out in
3165 		 * one pass before return to pmap_remove_all().
3166 		 */
3167 		oattrs = pmap_pte_to_pp_attrs(opte);
3168 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3169 			pp = pve->pve_pp;
3170 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3171 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3172 			    pp->pp_pte.pte_va != va);
3173 			mutex_spin_enter(&pp->pp_lock);
3174 			pp->pp_attrs |= oattrs;
3175 			LIST_REMOVE(pve, pve_list);
3176 			mutex_spin_exit(&pp->pp_lock);
3177 
3178 			/*
3179 			 * pve won't be touched again until pmap_drain_pv(),
3180 			 * so it's still safe to traverse the tree.
3181 			 */
3182 			pmap_free_pv(pmap, pve);
3183 			pve = RB_TREE_NEXT(tree, pve);
3184 			continue;
3185 		}
3186 
3187 		/*
3188 		 * No entry in the tree so it must be embedded.  Look up the
3189 		 * page and cancel the embedded entry.
3190 		 */
3191 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3192 			pp = VM_PAGE_TO_PP(pg);
3193 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3194 			paddr_t pa = pmap_pte2pa(opte);
3195 			panic("%s: PTE_PVLIST with pv-untracked page"
3196 			    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR
3197 			    " (%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3198 		}
3199 		mutex_spin_enter(&pp->pp_lock);
3200 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3201 		KASSERT(pp->pp_pte.pte_va == va);
3202 		pp->pp_attrs |= oattrs;
3203 		pp->pp_pte.pte_ptp = NULL;
3204 		pp->pp_pte.pte_va = 0;
3205 		mutex_spin_exit(&pp->pp_lock);
3206 	}
3207 
3208 	/* PTP now empty - adjust the tree & stats to match. */
3209 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3210 	ptp->wire_count = 1;
3211 #ifdef DIAGNOSTIC
3212 	rb_tree_init(tree, &pmap_rbtree_ops);
3213 #endif
3214 #else	/* !XENPV */
3215 	/*
3216 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3217 	 * I guess the hypervisor keeps track of PTEs too.
3218 	 */
3219 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3220 #endif	/* !XENPV */
3221 }
3222 
3223 /*
3224  * pmap_remove_all: remove all mappings from pmap in bulk.
3225  *
3226  * Ordinarily when removing mappings it's important to hold the UVM object's
3227  * lock, so that pages do not gain a new identity while retaining stale TLB
3228  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3229  * Here it's known that the address space is no longer visible to any user
3230  * process, so we don't need to worry about that.
3231  */
3232 bool
3233 pmap_remove_all(struct pmap *pmap)
3234 {
3235 	struct vm_page *ptps[32];
3236 	vaddr_t va, blkendva;
3237 	struct pmap *pmap2;
3238 	pt_entry_t *ptes;
3239 	pd_entry_t pde __diagused;
3240 	pd_entry_t * const *pdes;
3241 	int lvl __diagused, i, n;
3242 
3243 	/* XXX Can't handle EPT just yet. */
3244 	if (pmap->pm_remove != NULL) {
3245 		return false;
3246 	}
3247 
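	/*
	 * The PTPs are processed in batches of up to 32: the pmap lock is
	 * dropped and preempt_point() is reached between batches.
	 */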
3248 	for (;;) {
3249 		/* Fetch a block of PTPs from tree. */
3250 		mutex_enter(&pmap->pm_lock);
3251 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3252 		    (void **)ptps, __arraycount(ptps), false);
3253 		if (n == 0) {
3254 			mutex_exit(&pmap->pm_lock);
3255 			break;
3256 		}
3257 
3258 		/* Remove all mappings in the set of PTPs. */
3259 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3260 		for (i = 0; i < n; i++) {
3261 			if (ptps[i]->wire_count == 0) {
3262 				/* It's dead: pmap_update() will expunge. */
3263 				continue;
3264 			}
3265 
3266 			/* Determine range of block. */
3267 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3268 			blkendva = x86_round_pdr(va + 1);
3269 
3270 			/* Make sure everything squares up... */
3271 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3272 			KASSERT(lvl == 1);
3273 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3274 
3275 			/* Zap! */
3276 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3277 			    blkendva);
3278 
3279 			/* PTP should now be unused - free it. */
3280 			KASSERT(ptps[i]->wire_count == 1);
3281 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3282 		}
3283 		pmap_unmap_ptes(pmap, pmap2);
3284 		pmap_drain_pv(pmap);
3285 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3286 		mutex_exit(&pmap->pm_lock);
3287 
3288 		/* Process deferred frees. */
3289 		pmap_update(pmap);
3290 
3291 		/* A breathing point. */
3292 		preempt_point();
3293 	}
3294 
3295 	/* Verify that the pmap is now completely empty. */
3296 	pmap_check_ptps(pmap);
3297 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3298 	    "pmap %p not empty", pmap);
3299 
3300 	return true;
3301 }
3302 
3303 #if defined(PMAP_FORK)
3304 /*
3305  * pmap_fork: perform any necessary data structure manipulation when
3306  * a VM space is forked.
3307  */
3308 void
3309 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3310 {
3311 #ifdef USER_LDT
3312 	union descriptor *new_ldt;
3313 	int sel;
3314 
3315 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3316 		return;
3317 	}
3318 
3319 	/*
3320 	 * Copy the LDT into the new process.
3321 	 *
3322 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3323 	 * we'll retry. This will starve if there's a stream of LDT changes
3324 	 * in another thread but that should not happen.
3325 	 */
3326 
3327 retry:
3328 	if (pmap1->pm_ldt != NULL) {
3329 		/* Allocate space for the new process's LDT */
3330 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3331 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3332 		if (new_ldt == NULL) {
3333 			printf("WARNING: %s: unable to allocate LDT space\n",
3334 			    __func__);
3335 			return;
3336 		}
3337 		mutex_enter(&cpu_lock);
3338 		/* Get a GDT slot for it */
3339 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3340 		if (sel == -1) {
3341 			mutex_exit(&cpu_lock);
3342 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3343 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3344 			printf("WARNING: %s: unable to allocate LDT selector\n",
3345 			    __func__);
3346 			return;
3347 		}
3348 	} else {
3349 		/* Wasn't anything there after all. */
3350 		new_ldt = NULL;
3351 		sel = -1;
3352 		mutex_enter(&cpu_lock);
3353 	}
3354 
3355 	/*
3356 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3357 	 */
3358 	if (pmap1->pm_ldt != NULL) {
3359 		if (new_ldt == NULL) {
3360 			/* A wild LDT just appeared. */
3361 			mutex_exit(&cpu_lock);
3362 			goto retry;
3363 		}
3364 
3365 		/* Copy the LDT data and install it in pmap2 */
3366 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3367 		pmap2->pm_ldt = new_ldt;
3368 		pmap2->pm_ldt_sel = sel;
3369 		mutex_exit(&cpu_lock);
3370 	} else {
3371 		if (new_ldt != NULL) {
3372 			/* The LDT disappeared, drop what we did. */
3373 			ldt_free(sel);
3374 			mutex_exit(&cpu_lock);
3375 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3376 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3377 			return;
3378 		}
3379 
3380 		/* We're good, just leave. */
3381 		mutex_exit(&cpu_lock);
3382 	}
3383 #endif /* USER_LDT */
3384 }
3385 #endif /* PMAP_FORK */
3386 
3387 #ifdef USER_LDT
3388 
3389 /*
3390  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3391  * is active, reload LDTR.
3392  */
3393 static void
3394 pmap_ldt_xcall(void *arg1, void *arg2)
3395 {
3396 	struct pmap *pm;
3397 
3398 	kpreempt_disable();
3399 	pm = arg1;
3400 	if (curcpu()->ci_pmap == pm) {
3401 #if defined(SVS)
3402 		if (svs_enabled) {
3403 			svs_ldt_sync(pm);
3404 		} else
3405 #endif
3406 		lldt(pm->pm_ldt_sel);
3407 	}
3408 	kpreempt_enable();
3409 }
3410 
3411 /*
3412  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3413  * in the new selector on all CPUs.
3414  */
3415 void
3416 pmap_ldt_sync(struct pmap *pm)
3417 {
3418 	uint64_t where;
3419 
3420 	KASSERT(mutex_owned(&cpu_lock));
3421 
3422 	pmap_ldt_evcnt.ev_count++;
3423 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3424 	xc_wait(where);
3425 }
3426 
3427 /*
3428  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3429  * restore the default.
3430  */
3431 void
3432 pmap_ldt_cleanup(struct lwp *l)
3433 {
3434 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3435 	union descriptor *ldt;
3436 	int sel;
3437 
3438 	if (__predict_true(pmap->pm_ldt == NULL)) {
3439 		return;
3440 	}
3441 
3442 	mutex_enter(&cpu_lock);
3443 	if (pmap->pm_ldt != NULL) {
3444 		sel = pmap->pm_ldt_sel;
3445 		ldt = pmap->pm_ldt;
3446 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3447 		pmap->pm_ldt = NULL;
3448 		pmap_ldt_sync(pmap);
3449 		ldt_free(sel);
3450 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3451 		    UVM_KMF_WIRED);
3452 	}
3453 	mutex_exit(&cpu_lock);
3454 }
3455 #endif /* USER_LDT */
3456 
3457 /*
3458  * pmap_activate: activate a process' pmap
3459  *
3460  * => must be called with kernel preemption disabled
3461  * => if lwp is the curlwp, then set ci_want_pmapload so that
3462  *    actual MMU context switch will be done by pmap_load() later
3463  */
3464 void
3465 pmap_activate(struct lwp *l)
3466 {
3467 	struct cpu_info *ci;
3468 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3469 
3470 	KASSERT(kpreempt_disabled());
3471 
3472 	ci = curcpu();
3473 
3474 	if (l != ci->ci_curlwp)
3475 		return;
3476 
3477 	KASSERT(ci->ci_want_pmapload == 0);
3478 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3479 
3480 	/*
3481 	 * no need to switch to kernel vmspace because
3482 	 * it's a subset of any vmspace.
3483 	 */
3484 
3485 	if (pmap == pmap_kernel()) {
3486 		ci->ci_want_pmapload = 0;
3487 		return;
3488 	}
3489 
3490 	ci->ci_want_pmapload = 1;
3491 }
3492 
3493 #if defined(XENPV) && defined(__x86_64__)
3494 #define	KASSERT_PDIRPA(pmap) \
3495 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3496 	    pmap == pmap_kernel())
3497 #elif defined(PAE)
3498 #define	KASSERT_PDIRPA(pmap) \
3499 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3500 #elif !defined(XENPV)
3501 #define	KASSERT_PDIRPA(pmap) \
3502 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3503 #else
3504 #define	KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
3505 #endif
3506 
3507 /*
3508  * pmap_reactivate: try to regain reference to the pmap.
3509  *
3510  * => Must be called with kernel preemption disabled.
3511  */
3512 static void
3513 pmap_reactivate(struct pmap *pmap)
3514 {
3515 	struct cpu_info * const ci = curcpu();
3516 	const cpuid_t cid = cpu_index(ci);
3517 
3518 	KASSERT(kpreempt_disabled());
3519 	KASSERT_PDIRPA(pmap);
3520 
3521 	/*
3522 	 * If we still have a lazy reference to this pmap, we can assume
3523 	 * that there was no TLB shootdown for this pmap in the meantime.
3524 	 *
3525 	 * The order of events here is important as we must synchronize
3526 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3527 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3528 	 * change only when the state is TLBSTATE_LAZY.
3529 	 */
3530 
3531 	ci->ci_tlbstate = TLBSTATE_VALID;
3532 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3533 
3534 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3535 		/* We have the reference, state is valid. */
3536 	} else {
3537 		/*
3538 		 * Must reload the TLB: the pmap has been changed while it
3539 		 * was deactivated.
3540 		 */
3541 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3542 
3543 		tlbflush();
3544 	}
3545 }
3546 
3547 /*
3548  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3549  * and relevant LDT info.
3550  *
3551  * Ensures that the current process' pmap is loaded on the current CPU's
3552  * MMU and that there are no stale TLB entries.
3553  *
3554  * => The caller should disable kernel preemption or do check-and-retry
3555  *    to prevent a preemption from undoing our efforts.
3556  * => This function may block.
3557  */
3558 void
3559 pmap_load(void)
3560 {
3561 	struct cpu_info *ci;
3562 	struct pmap *pmap, *oldpmap;
3563 	struct lwp *l;
3564 	uint64_t ncsw;
3565 
3566 	kpreempt_disable();
3567  retry:
3568 	ci = curcpu();
3569 	if (!ci->ci_want_pmapload) {
3570 		kpreempt_enable();
3571 		return;
3572 	}
3573 	l = ci->ci_curlwp;
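	/* Snapshot the context-switch count so blocking can be detected below. */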
3574 	ncsw = l->l_ncsw;
3575 	__insn_barrier();
3576 
3577 	/* should be able to take ipis. */
3578 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3579 #ifdef XENPV
3580 	/* Check to see if interrupts are enabled (i.e. no events are masked) */
3581 	KASSERT(x86_read_psl() == 0);
3582 #else
3583 	KASSERT((x86_read_psl() & PSL_I) != 0);
3584 #endif
3585 
3586 	KASSERT(l != NULL);
3587 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3588 	KASSERT(pmap != pmap_kernel());
3589 	oldpmap = ci->ci_pmap;
3590 
3591 	if (pmap == oldpmap) {
3592 		pmap_reactivate(pmap);
3593 		ci->ci_want_pmapload = 0;
3594 		kpreempt_enable();
3595 		return;
3596 	}
3597 
3598 	/*
3599 	 * Acquire a reference to the new pmap and perform the switch.
3600 	 */
3601 
3602 	pmap_reference(pmap);
3603 	pmap_load1(l, pmap, oldpmap);
3604 	ci->ci_want_pmapload = 0;
3605 
3606 	/*
3607 	 * we're now running with the new pmap.  drop the reference
3608 	 * to the old pmap.  if we block, we need to go around again.
3609 	 */
3610 
3611 	pmap_destroy(oldpmap);
3612 	__insn_barrier();
3613 	if (l->l_ncsw != ncsw) {
3614 		goto retry;
3615 	}
3616 
3617 	kpreempt_enable();
3618 }
3619 
3620 /*
3621  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3622  * pmap_load().  It's critically important that this function does not
3623  * block.
3624  */
3625 static void
3626 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3627 {
3628 	struct cpu_info *ci;
3629 	struct pcb *pcb;
3630 	cpuid_t cid;
3631 
3632 	KASSERT(kpreempt_disabled());
3633 
3634 	pcb = lwp_getpcb(l);
3635 	ci = l->l_cpu;
3636 	cid = cpu_index(ci);
3637 
3638 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3639 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3640 
3641 	KASSERT_PDIRPA(oldpmap);
3642 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3643 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3644 
3645 	/*
3646 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3647 	 * with TLB shootdown interrupts, so set the state VALID first,
3648 	 * then register us for shootdown events on this pmap.
3649 	 */
3650 	ci->ci_tlbstate = TLBSTATE_VALID;
3651 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3652 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3653 	ci->ci_pmap = pmap;
3654 
3655 	/*
3656 	 * update tss.  now that we have registered for invalidations
3657 	 * from other CPUs, we're good to load the page tables.
3658 	 */
3659 #ifdef PAE
3660 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3661 #else
3662 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3663 #endif
3664 
3665 #ifdef i386
3666 #ifndef XENPV
3667 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3668 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3669 #endif
3670 #endif
3671 
3672 #if defined(SVS) && defined(USER_LDT)
3673 	if (svs_enabled) {
3674 		svs_ldt_sync(pmap);
3675 	} else
3676 #endif
3677 	lldt(pmap->pm_ldt_sel);
3678 
3679 	cpu_load_pmap(pmap, oldpmap);
3680 }
3681 
3682 /*
3683  * pmap_deactivate: deactivate a process' pmap.
3684  *
3685  * => Must be called with kernel preemption disabled (high IPL is enough).
3686  */
3687 void
3688 pmap_deactivate(struct lwp *l)
3689 {
3690 	struct pmap *pmap;
3691 	struct cpu_info *ci;
3692 
3693 	KASSERT(kpreempt_disabled());
3694 
3695 	if (l != curlwp) {
3696 		return;
3697 	}
3698 
3699 	/*
3700 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3701 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3702 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3703 	 * to kernel preemption or a blocking routine in between.
3704 	 */
3705 	pmap_tlb_shootnow();
3706 
3707 	ci = curcpu();
3708 
3709 	if (ci->ci_want_pmapload) {
3710 		/*
3711 		 * ci_want_pmapload means that our pmap is not loaded on
3712 		 * the CPU, or the TLB might be stale.  note that pmap_kernel()
3713 		 * is always considered loaded.
3714 		 */
3715 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3716 		    != pmap_kernel());
3717 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3718 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3719 
3720 		/*
3721 		 * userspace has not been touched.
3722 		 * nothing to do here.
3723 		 */
3724 
3725 		ci->ci_want_pmapload = 0;
3726 		return;
3727 	}
3728 
3729 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3730 
3731 	if (pmap == pmap_kernel()) {
3732 		return;
3733 	}
3734 
3735 	KASSERT_PDIRPA(pmap);
3736 	KASSERT(ci->ci_pmap == pmap);
3737 
3738 	/*
3739 	 * we aren't interested in TLB invalidations for this pmap,
3740 	 * at least for the time being.
3741 	 */
3742 
3743 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3744 	ci->ci_tlbstate = TLBSTATE_LAZY;
3745 }
3746 
3747 #ifdef EFI_RUNTIME
3748 
3749 extern struct pmap *efi_runtime_pmap;
3750 
3751 /*
3752  * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3753  * for an unprivileged user process
3754  */
3755 bool
3756 pmap_is_user(struct pmap *pmap)
3757 {
3758 
3759 	KASSERT(pmap != pmap_kernel());
3760 	return (pmap != efi_runtime_pmap);
3761 }
3762 
3763 /*
3764  * pmap_activate_sync: synchronously activate specified pmap.
3765  *
3766  * => Must be called with kernel preemption disabled (high IPL is enough).
3767  * => Must not sleep before pmap_deactivate_sync.
3768  */
3769 void *
3770 pmap_activate_sync(struct pmap *pmap)
3771 {
3772 	struct cpu_info *ci = curcpu();
3773 	struct pmap *oldpmap = ci->ci_pmap;
3774 	unsigned cid = cpu_index(ci);
3775 
3776 	KASSERT(kpreempt_disabled());
3777 	KASSERT(pmap != pmap_kernel());
3778 
3779 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3780 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3781 
3782 	if (oldpmap) {
3783 		KASSERT_PDIRPA(oldpmap);
3784 		kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3785 		kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3786 	}
3787 
3788 	ci->ci_tlbstate = TLBSTATE_VALID;
3789 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3790 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3791 	ci->ci_pmap = pmap;
3792 
3793 #if defined(SVS) && defined(USER_LDT)
3794 	if (svs_enabled) {
3795 		svs_ldt_sync(pmap);
3796 	} else
3797 #endif
3798 	lldt(pmap->pm_ldt_sel);
3799 
3800 	cpu_load_pmap(pmap, oldpmap);
3801 
3802 	return oldpmap;
3803 }
3804 
3805 /*
3806  * pmap_deactivate_sync: synchronously deactivate specified pmap and
3807  * restore whatever was active before pmap_activate_sync.
3808  *
3809  * => Must be called with kernel preemption disabled (high IPL is enough).
3810  * => Must not have slept since pmap_activate_sync.
3811  */
3812 void
3813 pmap_deactivate_sync(struct pmap *pmap, void *cookie)
3814 {
3815 	struct cpu_info *ci = curcpu();
3816 	struct pmap *oldpmap = cookie;
3817 	unsigned cid = cpu_index(ci);
3818 
3819 	KASSERT(kpreempt_disabled());
3820 	KASSERT(pmap != pmap_kernel());
3821 	KASSERT(ci->ci_pmap == pmap);
3822 
3823 	KASSERT_PDIRPA(pmap);
3824 
3825 	KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
3826 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3827 
3828 	pmap_tlb_shootnow();
3829 
3830 	kcpuset_atomic_clear(pmap->pm_cpus, cid);
3831 	kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
3832 
3833 	ci->ci_tlbstate = TLBSTATE_VALID;
3834 	ci->ci_pmap = oldpmap;
3835 	if (oldpmap) {
3836 		kcpuset_atomic_set(oldpmap->pm_cpus, cid);
3837 		kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
3838 #if defined(SVS) && defined(USER_LDT)
3839 		if (svs_enabled) {
3840 			svs_ldt_sync(oldpmap);
3841 		} else
3842 #endif
3843 		lldt(oldpmap->pm_ldt_sel);
3844 		cpu_load_pmap(oldpmap, pmap);
3845 	} else {
3846 		lcr3(pmap_pdirpa(pmap_kernel(), 0));
3847 	}
3848 }
3849 
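
/*
 * Illustrative sketch (not taken from an actual caller): the EFI
 * runtime glue is expected to wrap a runtime service call roughly
 * like the following, without sleeping in between:
 *
 *	void *cookie;
 *
 *	kpreempt_disable();
 *	cookie = pmap_activate_sync(efi_runtime_pmap);
 *	... invoke the EFI runtime service ...
 *	pmap_deactivate_sync(efi_runtime_pmap, cookie);
 *	kpreempt_enable();
 */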
3850 #endif	/* EFI_RUNTIME */
3851 
3852 /*
3853  * some misc. functions
3854  */
3855 
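/*
 * pmap_pdes_valid: walk the page directory levels that map va.
 *
 * => returns true if every directory entry along the walk is present;
 *    *lastlvl is then 1 (va may be mapped by a regular L1 PTE, which
 *    the caller still has to inspect) or the level of a large-page
 *    entry (PTE_PS, normally 2).  *lastpde, if not NULL, receives the
 *    last entry examined.
 * => returns false if the walk hits a non-present directory entry;
 *    *lastlvl is then the level of that entry.
 */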
3856 bool
3857 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3858     int *lastlvl)
3859 {
3860 	unsigned long index;
3861 	pd_entry_t pde;
3862 	int i;
3863 
3864 	for (i = PTP_LEVELS; i > 1; i--) {
3865 		index = pl_i(va, i);
3866 		pde = pdes[i - 2][index];
3867 		if ((pde & PTE_P) == 0) {
3868 			*lastlvl = i;
3869 			return false;
3870 		}
3871 		if (pde & PTE_PS)
3872 			break;
3873 	}
3874 	if (lastpde != NULL)
3875 		*lastpde = pde;
3876 	*lastlvl = i;
3877 	return true;
3878 }
3879 
3880 /*
3881  * pmap_extract: extract a PA for the given VA
3882  */
3883 bool
3884 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3885 {
3886 	pt_entry_t *ptes, pte;
3887 	pd_entry_t pde;
3888 	pd_entry_t * const *pdes;
3889 	struct pmap *pmap2;
3890 	paddr_t pa;
3891 	bool rv;
3892 	int lvl;
3893 
3894 	if (__predict_false(pmap->pm_extract != NULL)) {
3895 		return (*pmap->pm_extract)(pmap, va, pap);
3896 	}
3897 
3898 #ifdef __HAVE_DIRECT_MAP
3899 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3900 		if (pap != NULL) {
3901 			*pap = PMAP_DIRECT_UNMAP(va);
3902 		}
3903 		return true;
3904 	}
3905 #endif
3906 
3907 	rv = false;
3908 	pa = 0;
3909 
3910 	if (pmap != pmap_kernel()) {
3911 		mutex_enter(&pmap->pm_lock);
3912 	}
3913 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3914 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3915 		if (lvl == 2) {
3916 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3917 			rv = true;
3918 		} else {
3919 			KASSERT(lvl == 1);
3920 			pte = ptes[pl1_i(va)];
3921 			if (__predict_true((pte & PTE_P) != 0)) {
3922 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3923 				rv = true;
3924 			}
3925 		}
3926 	}
3927 	pmap_unmap_ptes(pmap, pmap2);
3928 	if (pmap != pmap_kernel()) {
3929 		mutex_exit(&pmap->pm_lock);
3930 	}
3931 	if (pap != NULL) {
3932 		*pap = pa;
3933 	}
3934 
3935 	return rv;
3936 }
3937 
3938 /*
3939  * vtophys: virtual address to physical address.  For use by
3940  * machine-dependent code only.
3941  */
3942 paddr_t
3943 vtophys(vaddr_t va)
3944 {
3945 	paddr_t pa;
3946 
3947 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3948 		return pa;
3949 	return 0;
3950 }
3951 
3952 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3953 
3954 #ifdef XENPV
3955 /*
3956  * vtomach: virtual address to machine address.  For use by
3957  * machine-dependent code only.
3958  */
3959 paddr_t
3960 vtomach(vaddr_t va)
3961 {
3962 	paddr_t pa;
3963 
3964 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3965 		return pa;
3966 	return 0;
3967 }
3968 #endif
3969 
3970 /*
3971  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3972  * determine the bounds of the kernel virtual address space.
3973  */
3974 void
3975 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3976 {
3977 	*startp = virtual_avail;
3978 	*endp = virtual_end;
3979 }
3980 
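/*
 * pmap_zero_page, pmap_copy_page: zero or copy a physical page.
 *
 * Without a direct map, these borrow the per-CPU "vpage" KVA slots
 * (ci->vpage[] / ci->vpage_pte[]): kernel preemption is disabled so
 * the slot cannot be reused underneath us, and the target page is
 * temporarily entered through the slot's reserved PTE.
 */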
3981 void
3982 pmap_zero_page(paddr_t pa)
3983 {
3984 #if defined(__HAVE_DIRECT_MAP)
3985 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
3986 #else
3987 #if defined(XENPV)
3988 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3989 		xen_pagezero(pa);
3990 		return;
3991 	}
3992 #endif
3993 	struct cpu_info *ci;
3994 	pt_entry_t *zpte;
3995 	vaddr_t zerova;
3996 
3997 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3998 
3999 	kpreempt_disable();
4000 
4001 	ci = curcpu();
4002 	zerova = ci->vpage[VPAGE_ZER];
4003 	zpte = ci->vpage_pte[VPAGE_ZER];
4004 
4005 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
4006 
4007 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
4008 	pmap_pte_flush();
4009 	pmap_update_pg(zerova);		/* flush TLB */
4010 
4011 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
4012 
4013 #if defined(DIAGNOSTIC) || defined(XENPV)
4014 	pmap_pte_set(zpte, 0);				/* zap ! */
4015 	pmap_pte_flush();
4016 #endif
4017 
4018 	kpreempt_enable();
4019 #endif /* defined(__HAVE_DIRECT_MAP) */
4020 }
4021 
4022 void
4023 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
4024 {
4025 #if defined(__HAVE_DIRECT_MAP)
4026 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
4027 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
4028 
4029 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4030 #else
4031 #if defined(XENPV)
4032 	if (XEN_VERSION_SUPPORTED(3, 4)) {
4033 		xen_copy_page(srcpa, dstpa);
4034 		return;
4035 	}
4036 #endif
4037 	struct cpu_info *ci;
4038 	pt_entry_t *srcpte, *dstpte;
4039 	vaddr_t srcva, dstva;
4040 
4041 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
4042 
4043 	kpreempt_disable();
4044 
4045 	ci = curcpu();
4046 	srcva = ci->vpage[VPAGE_SRC];
4047 	dstva = ci->vpage[VPAGE_DST];
4048 	srcpte = ci->vpage_pte[VPAGE_SRC];
4049 	dstpte = ci->vpage_pte[VPAGE_DST];
4050 
4051 	KASSERT(*srcpte == 0 && *dstpte == 0);
4052 
4053 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
4054 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
4055 	pmap_pte_flush();
4056 	pmap_update_pg(srcva);
4057 	pmap_update_pg(dstva);
4058 
4059 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4060 
4061 #if defined(DIAGNOSTIC) || defined(XENPV)
4062 	pmap_pte_set(srcpte, 0);
4063 	pmap_pte_set(dstpte, 0);
4064 	pmap_pte_flush();
4065 #endif
4066 
4067 	kpreempt_enable();
4068 #endif /* defined(__HAVE_DIRECT_MAP) */
4069 }
4070 
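/*
 * pmap_map_ptp: map a PTP into the local CPU's KVA for temporary use.
 *
 * => with a direct map this is only an address computation; otherwise
 *    the per-CPU VPAGE_PTP slot is used and pmap_unmap_ptp() must be
 *    called when done.
 * => must be called with kernel preemption disabled.
 */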
4071 static pt_entry_t *
4072 pmap_map_ptp(struct vm_page *ptp)
4073 {
4074 #ifdef __HAVE_DIRECT_MAP
4075 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
4076 #else
4077 	struct cpu_info *ci;
4078 	pt_entry_t *ptppte;
4079 	vaddr_t ptpva;
4080 
4081 	KASSERT(kpreempt_disabled());
4082 
4083 #ifndef XENPV
4084 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
4085 #else
4086 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
4087 #endif
4088 
4089 	ci = curcpu();
4090 	ptpva = ci->vpage[VPAGE_PTP];
4091 	ptppte = ci->vpage_pte[VPAGE_PTP];
4092 
4093 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
4094 
4095 	pmap_pte_flush();
4096 	pmap_update_pg(ptpva);
4097 
4098 	return (pt_entry_t *)ptpva;
4099 #endif
4100 }
4101 
4102 static void
4103 pmap_unmap_ptp(void)
4104 {
4105 #ifndef __HAVE_DIRECT_MAP
4106 #if defined(DIAGNOSTIC) || defined(XENPV)
4107 	struct cpu_info *ci;
4108 	pt_entry_t *pte;
4109 
4110 	KASSERT(kpreempt_disabled());
4111 
4112 	ci = curcpu();
4113 	pte = ci->vpage_pte[VPAGE_PTP];
4114 
4115 	if (*pte != 0) {
4116 		pmap_pte_set(pte, 0);
4117 		pmap_pte_flush();
4118 	}
4119 #endif
4120 #endif
4121 }
4122 
4123 static pt_entry_t *
4124 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4125 {
4126 
4127 	KASSERT(kpreempt_disabled());
4128 	if (pmap_is_curpmap(pmap)) {
4129 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4130 	}
4131 	KASSERT(ptp != NULL);
4132 	return pmap_map_ptp(ptp) + pl1_pi(va);
4133 }
4134 
4135 static void
4136 pmap_unmap_pte(void)
4137 {
4138 
4139 	KASSERT(kpreempt_disabled());
4140 
4141 	pmap_unmap_ptp();
4142 }
4143 
4144 /*
4145  * p m a p   r e m o v e   f u n c t i o n s
4146  *
4147  * functions that remove mappings
4148  */
4149 
4150 /*
4151  * pmap_remove_ptes: remove PTEs from a PTP
4152  *
4153  * => caller must hold pmap's lock
4154  * => PTP must be mapped into KVA
4155  * => PTP should be null if pmap == pmap_kernel()
4156  * => must be called with kernel preemption disabled
4157  * => issues TLB shootdowns as needed for the mappings removed
4158  */
4159 static void
4160 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4161     vaddr_t startva, vaddr_t endva)
4162 {
4163 	pt_entry_t *pte = (pt_entry_t *)ptpva;
4164 
4165 	KASSERT(mutex_owned(&pmap->pm_lock));
4166 	KASSERT(kpreempt_disabled());
4167 
4168 	/*
4169 	 * mappings are very often sparse, so clip the given range to the
4170 	 * range of PTEs that are known to be present in the PTP.
4171 	 */
4172 	pmap_ptp_range_clip(ptp, &startva, &pte);
4173 
4174 	/*
4175 	 * note that ptpva points to the PTE that maps startva.   this may
4176 	 * or may not be the first PTE in the PTP.
4177 	 *
4178 	 * we loop through the PTP while there are still PTEs to look at
4179 	 * and the wire_count is greater than 1 (because we use the wire_count
4180 	 * to keep track of the number of real PTEs in the PTP).
4181 	 */
4182 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4183 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4184 		startva += PAGE_SIZE;
4185 		pte++;
4186 	}
4187 }
4188 
4189 /*
4190  * pmap_remove_pte: remove a single PTE from a PTP.
4191  *
4192  * => caller must hold pmap's lock
4193  * => PTP must be mapped into KVA
4194  * => PTP should be null if pmap == pmap_kernel()
4195  * => returns true if we removed a mapping
4196  * => must be called with kernel preemption disabled
4197  */
4198 static bool
4199 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4200     vaddr_t va)
4201 {
4202 	struct pv_entry *pve;
4203 	struct vm_page *pg;
4204 	struct pmap_page *pp;
4205 	pt_entry_t opte;
4206 
4207 	KASSERT(mutex_owned(&pmap->pm_lock));
4208 	KASSERT(kpreempt_disabled());
4209 
4210 	if (!pmap_valid_entry(*pte)) {
4211 		/* VA not mapped. */
4212 		return false;
4213 	}
4214 
4215 	/* Atomically save the old PTE and zap it. */
4216 	opte = pmap_pte_testset(pte, 0);
4217 	if (!pmap_valid_entry(opte)) {
4218 		return false;
4219 	}
4220 
4221 	pmap_exec_account(pmap, va, opte, 0);
4222 	pmap_stats_update_bypte(pmap, 0, opte);
4223 
4224 	if (ptp) {
4225 		/*
4226 		 * Dropping a PTE.  If this empties the PTP, force a shootdown
 		 * below (via PTE_A) so that any cached PDE is flushed as well.
4227 		 */
4228 		ptp->wire_count--;
4229 		if (ptp->wire_count <= 1) {
4230 			opte |= PTE_A;
4231 		}
4232 	}
4233 
4234 	if ((opte & PTE_A) != 0) {
4235 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4236 	}
4237 
4238 	/*
4239 	 * If the mapping is not on a pv list, we are done.
4240 	 */
4241 	if ((opte & PTE_PVLIST) == 0) {
4242 #ifndef DOM0OPS
4243 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4244 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4245 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4246 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4247 #endif
4248 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4249 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4250 		return true;
4251 	}
4252 
4253 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4254 		pp = VM_PAGE_TO_PP(pg);
4255 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4256 		paddr_t pa = pmap_pte2pa(opte);
4257 		panic("%s: PTE_PVLIST with pv-untracked page"
4258 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4259 		    __func__, va, pa, atop(pa));
4260 	}
4261 
4262 	/* Sync R/M bits. */
4263 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4264 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4265 	return true;
4266 }
4267 
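/*
 * pmap_remove_locked: remove a range of mappings from a pmap whose
 * lock is already held.
 *
 * => a single-page range takes the shortcut path; larger ranges are
 *    walked one PDE-covered block at a time, skipping blocks whose
 *    PDE is not valid.
 * => user PTPs emptied by the removal are freed; kernel PTPs never are.
 */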
4268 static void
4269 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4270 {
4271 	pt_entry_t *ptes;
4272 	pd_entry_t pde;
4273 	pd_entry_t * const *pdes;
4274 	bool result;
4275 	vaddr_t blkendva, va = sva;
4276 	struct vm_page *ptp;
4277 	struct pmap *pmap2;
4278 	int lvl;
4279 
4280 	KASSERT(mutex_owned(&pmap->pm_lock));
4281 
4282 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4283 
4284 	/*
4285 	 * removing one page?  take the shortcut path.
4286 	 */
4287 
4288 	if (va + PAGE_SIZE == eva) {
4289 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4290 			KASSERT(lvl == 1);
4291 
4292 			/* Get PTP if non-kernel mapping. */
4293 			if (pmap != pmap_kernel()) {
4294 				ptp = pmap_find_ptp(pmap, va, 1);
4295 				KASSERTMSG(ptp != NULL,
4296 				    "%s: unmanaged PTP detected", __func__);
4297 			} else {
4298 				/* Never free kernel PTPs. */
4299 				ptp = NULL;
4300 			}
4301 
4302 			result = pmap_remove_pte(pmap, ptp,
4303 			    &ptes[pl1_i(va)], va);
4304 
4305 			/*
4306 			 * if mapping removed and the PTP is no longer
4307 			 * being used, free it!
4308 			 */
4309 
4310 			if (result && ptp && ptp->wire_count <= 1)
4311 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4312 		}
4313 	} else for (/* null */ ; va < eva ; va = blkendva) {
4314 		/* determine range of block */
4315 		blkendva = x86_round_pdr(va+1);
4316 		if (blkendva > eva)
4317 			blkendva = eva;
4318 
4319 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4320 			/* Skip a range corresponding to an invalid pde. */
4321 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4322 			continue;
4323 		}
4324 		KASSERT(lvl == 1);
4325 
4326 		/* Get PTP if non-kernel mapping. */
4327 		if (pmap != pmap_kernel()) {
4328 			ptp = pmap_find_ptp(pmap, va, 1);
4329 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4330 			    __func__);
4331 		} else {
4332 			/* Never free kernel PTPs. */
4333 			ptp = NULL;
4334 		}
4335 
4336 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4337 		    blkendva);
4338 
4339 		/* If PTP is no longer being used, free it. */
4340 		if (ptp && ptp->wire_count <= 1) {
4341 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4342 		}
4343 	}
4344 	pmap_unmap_ptes(pmap, pmap2);
4345 	pmap_drain_pv(pmap);
4346 }
4347 
4348 /*
4349  * pmap_remove: mapping removal function.
4350  *
4351  * => caller should not be holding any pmap locks
4352  */
4353 void
4354 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4355 {
4356 	if (__predict_false(pmap->pm_remove != NULL)) {
4357 		(*pmap->pm_remove)(pmap, sva, eva);
4358 		return;
4359 	}
4360 
4361 	mutex_enter(&pmap->pm_lock);
4362 	pmap_remove_locked(pmap, sva, eva);
4363 	mutex_exit(&pmap->pm_lock);
4364 }
4365 
4366 /*
4367  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4368  *
4369  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4370  * => Caller should disable kernel preemption.
4371  * => issues tlb shootdowns if necessary.
4372  */
4373 static int
4374 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4375     pt_entry_t *optep)
4376 {
4377 	struct pmap *pmap;
4378 	struct vm_page *ptp;
4379 	vaddr_t va;
4380 	pt_entry_t *ptep;
4381 	pt_entry_t opte;
4382 	pt_entry_t npte;
4383 	pt_entry_t expect;
4384 	bool need_shootdown;
4385 
4386 	ptp = pvpte->pte_ptp;
4387 	va = pvpte->pte_va;
4388 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4389 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4390 	pmap = ptp_to_pmap(ptp);
4391 	KASSERT(kpreempt_disabled());
4392 
4393 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4394 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4395 		    optep);
4396 	}
4397 
4398 	expect = pmap_pa2pte(pa) | PTE_P;
4399 
4400 	if (clearbits != ~0) {
4401 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4402 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4403 	}
4404 
4405 	ptep = pmap_map_pte(pmap, ptp, va);
4406 	do {
4407 		opte = *ptep;
4408 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4409 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4410 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4411 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4412 			/*
4413 			 * We lost a race with a V->P operation like
4414 			 * pmap_remove().  Wait for the competitor to finish
4415 			 * reflecting the pte bits into pp_attrs.
4416 			 */
4417 			pmap_unmap_pte();
4418 			return EAGAIN;
4419 		}
4420 
4421 		/*
4422 		 * Check if there's anything to do on this PTE.
4423 		 */
4424 		if ((opte & clearbits) == 0) {
4425 			need_shootdown = false;
4426 			break;
4427 		}
4428 
4429 		/*
4430 		 * We need a shootdown if the PTE may be cached in the TLB
4431 		 * (PTE_A set), unless we are only clearing PTE_W and the
4432 		 * entry was never cached as writable (PTE_D clear).
4433 		 */
4434 		need_shootdown = (opte & PTE_A) != 0 &&
4435 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4436 
4437 		npte = opte & ~clearbits;
4438 
4439 		/*
4440 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4441 		 */
4442 		if (need_shootdown) {
4443 			npte &= ~(PTE_A | PTE_D);
4444 		}
4445 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4446 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4447 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4448 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4449 
4450 	if (need_shootdown) {
4451 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4452 	}
4453 	pmap_unmap_pte();
4454 
4455 	*oattrs = pmap_pte_to_pp_attrs(opte);
4456 	if (optep != NULL)
4457 		*optep = opte;
4458 	return 0;
4459 }
4460 
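/*
 * pmap_pp_remove_ent: finish removing a user mapping discovered via a
 * PV entry: account for the already-zapped PTE and drop the PTP's wire
 * count, freeing the PTP if this was its last mapping.
 */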
4461 static void
4462 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4463     vaddr_t va)
4464 {
4465 	struct pmap *pmap2;
4466 	pt_entry_t *ptes;
4467 	pd_entry_t * const *pdes;
4468 
4469 	KASSERT(mutex_owned(&pmap->pm_lock));
4470 
4471 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4472 	pmap_stats_update_bypte(pmap, 0, opte);
4473 	ptp->wire_count--;
4474 	if (ptp->wire_count <= 1) {
4475 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4476 	}
4477 	pmap_unmap_ptes(pmap, pmap2);
4478 }
4479 
4480 static void
4481 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4482 {
4483 	struct pv_pte *pvpte;
4484 	struct vm_page *ptp;
4485 	uintptr_t sum;
4486 	uint8_t oattrs;
4487 	bool locked;
4488 
4489 	/*
4490 	 * Do an unlocked check to see if the page has no mappings; this is
4491 	 * common, e.g. when pmap_remove_all() was called before amap_wipeout()
4492 	 * for a process-private amap.  The page being removed must be on the way
4493 	 * out, so we don't have to worry about concurrent attempts to enter
4494 	 * it (otherwise the caller either doesn't care or has screwed up).
4495 	 */
4496 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4497 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4498 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4499 	if (sum == 0) {
4500 		return;
4501 	}
4502 
4503 	kpreempt_disable();
4504 	for (;;) {
4505 		struct pmap *pmap;
4506 		struct pv_entry *pve;
4507 		pt_entry_t opte;
4508 		vaddr_t va;
4509 
4510 		mutex_spin_enter(&pp->pp_lock);
4511 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4512 			mutex_spin_exit(&pp->pp_lock);
4513 			break;
4514 		}
4515 
4516 		/*
4517 		 * Add a reference to the pmap before clearing the pte.
4518 		 * Otherwise the pmap can disappear behind us.
4519 		 */
4520 		ptp = pvpte->pte_ptp;
4521 		pmap = ptp_to_pmap(ptp);
4522 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4523 		if (ptp != NULL) {
4524 			pmap_reference(pmap);
4525 		}
4526 
4527 		/*
4528 		 * Now try to lock it.  We need a direct handoff between
4529 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4530 		 * and kept associated with this pmap.  If that can't be
4531 		 * had, wait for the pmap's lock to become free and then
4532 		 * retry.
4533 		 */
4534 		locked = mutex_tryenter(&pmap->pm_lock);
4535 		mutex_spin_exit(&pp->pp_lock);
4536 		if (!locked) {
4537 			mutex_enter(&pmap->pm_lock);
4538 			/* nothing, just wait for it */
4539 			mutex_exit(&pmap->pm_lock);
4540 			if (ptp != NULL) {
4541 				pmap_destroy(pmap);
4542 			}
4543 			continue;
4544 		}
4545 		va = pvpte->pte_va;
4546 
4547 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4548 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4549 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4550 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4551 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4552 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4553 
4554 #ifdef DEBUG
4555 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4556 		rb_tree_t *tree = (ptp != NULL ?
4557 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4558 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4559 		if (pve == NULL) {
4560 			KASSERTMSG(&pp->pp_pte == pvpte,
4561 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4562 			    va, pmap, ptp, pvpte, pve);
4563 		} else {
4564 			KASSERTMSG(&pve->pve_pte == pvpte,
4565 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4566 			    va, pmap, ptp, pvpte, pve);
4567 		}
4568 #endif
4569 
4570 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4571 			panic("pmap_pp_remove: mapping not present");
4572 		}
4573 
4574 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4575 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4576 
4577 		/* Update the PTP reference count. Free if last reference. */
4578 		if (ptp != NULL) {
4579 			KASSERT(pmap != pmap_kernel());
4580 			pmap_tlb_shootnow();
4581 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4582 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4583 			} else {
4584 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4585 			}
4586 		} else {
4587 			KASSERT(pmap == pmap_kernel());
4588 			pmap_stats_update_bypte(pmap, 0, opte);
4589 		}
4590 		pmap_tlb_shootnow();
4591 		pmap_drain_pv(pmap);
4592 		mutex_exit(&pmap->pm_lock);
4593 		if (ptp != NULL) {
4594 			pmap_destroy(pmap);
4595 		}
4596 	}
4597 	kpreempt_enable();
4598 }
4599 
4600 /*
4601  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4602  *
4603  * => R/M bits are sync'd back to attrs
4604  */
4605 void
4606 pmap_page_remove(struct vm_page *pg)
4607 {
4608 	struct pmap_page *pp;
4609 	paddr_t pa;
4610 
4611 	pp = VM_PAGE_TO_PP(pg);
4612 	pa = VM_PAGE_TO_PHYS(pg);
4613 	pmap_pp_remove(pp, pa);
4614 }
4615 
4616 /*
4617  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4618  * that map it
4619  */
4620 void
4621 pmap_pv_remove(paddr_t pa)
4622 {
4623 	struct pmap_page *pp;
4624 
4625 	pp = pmap_pv_tracked(pa);
4626 	if (pp == NULL)
4627 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4628 	pmap_pp_remove(pp, pa);
4629 }
4630 
4631 /*
4632  * p m a p   a t t r i b u t e  f u n c t i o n s
4633  * functions that test/change managed page's attributes
4634  * since a page can be mapped multiple times we must check each PTE that
4635  * maps it by going down the pv lists.
4636  */
4637 
4638 /*
4639  * pmap_test_attrs: test a page's attributes
4640  */
4641 bool
4642 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4643 {
4644 	struct pmap_page *pp;
4645 	struct pv_pte *pvpte;
4646 	struct pmap *pmap;
4647 	uint8_t oattrs;
4648 	u_int result;
4649 	paddr_t pa;
4650 
4651 	pp = VM_PAGE_TO_PP(pg);
4652 	if ((pp->pp_attrs & testbits) != 0) {
4653 		return true;
4654 	}
4655 	pa = VM_PAGE_TO_PHYS(pg);
4656  startover:
4657 	mutex_spin_enter(&pp->pp_lock);
4658 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4659 		if ((pp->pp_attrs & testbits) != 0) {
4660 			break;
4661 		}
4662 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4663 			/*
4664 			 * raced with a V->P operation.  wait for the other
4665 			 * side to finish by acquiring pmap's lock.  if we
4666 			 * did not wait, updates to pp_attrs by the other
4667 			 * side could go unseen.
4668 			 */
4669 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4670 			pmap_reference(pmap);
4671 			mutex_spin_exit(&pp->pp_lock);
4672 			mutex_enter(&pmap->pm_lock);
4673 			/* nothing. */
4674 			mutex_exit(&pmap->pm_lock);
4675 			pmap_destroy(pmap);
4676 			goto startover;
4677 		}
4678 		pp->pp_attrs |= oattrs;
4679 	}
4680 	result = pp->pp_attrs & testbits;
4681 	mutex_spin_exit(&pp->pp_lock);
4682 
4683 	/*
4684 	 * note that the loop above exits early (leaving pvpte non-NULL)
4685 	 * once the bits being tested for have been found.
4686 	 */
4687 
4688 	return result != 0;
4689 }
4690 
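/*
 * pmap_pp_clear_attrs: clear the given attribute bits in every mapping
 * of the page described by pp, restarting from scratch whenever a
 * racing V->P operation (e.g. pmap_remove()) is detected.
 *
 * => returns true if any of the bits to be cleared were set.
 */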
4691 static bool
4692 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4693 {
4694 	struct pv_pte *pvpte;
4695 	struct pmap *pmap;
4696 	uint8_t oattrs;
4697 	u_int result;
4698 
4699 startover:
4700 	mutex_spin_enter(&pp->pp_lock);
4701 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4702 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4703 			/*
4704 			 * raced with a V->P operation.  wait for the other
4705 			 * side to finish by acquiring pmap's lock.  it is
4706 			 * probably unmapping the page, and it will be gone
4707 			 * when the loop is restarted.
4708 			 */
4709 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4710 			pmap_reference(pmap);
4711 			mutex_spin_exit(&pp->pp_lock);
4712 			mutex_enter(&pmap->pm_lock);
4713 			/* nothing. */
4714 			mutex_exit(&pmap->pm_lock);
4715 			pmap_destroy(pmap);
4716 			goto startover;
4717 		}
4718 		pp->pp_attrs |= oattrs;
4719 	}
4720 	result = pp->pp_attrs & clearbits;
4721 	pp->pp_attrs &= ~clearbits;
4722 	pmap_tlb_shootnow();
4723 	mutex_spin_exit(&pp->pp_lock);
4724 
4725 	return result != 0;
4726 }
4727 
4728 /*
4729  * pmap_clear_attrs: clear the specified attribute for a page.
4730  *
4731  * => we return true if we cleared one of the bits we were asked to
4732  */
4733 bool
4734 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4735 {
4736 	struct pmap_page *pp;
4737 	paddr_t pa;
4738 
4739 	pp = VM_PAGE_TO_PP(pg);
4740 	pa = VM_PAGE_TO_PHYS(pg);
4741 
4742 	/*
4743 	 * If this is a new page, assert it has no mappings and simply zap
4744 	 * the stored attributes without taking any locks.
4745 	 */
4746 	if ((pg->flags & PG_FAKE) != 0) {
4747 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4748 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4749 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4750 		atomic_store_relaxed(&pp->pp_attrs, 0);
4751 		return false;
4752 	} else {
4753 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4754 	}
4755 }
4756 
4757 /*
4758  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4759  * pv-tracked page.
4760  */
4761 bool
4762 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4763 {
4764 	struct pmap_page *pp;
4765 
4766 	pp = pmap_pv_tracked(pa);
4767 	if (pp == NULL)
4768 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4769 
4770 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4771 }
4772 
4773 /*
4774  * p m a p   p r o t e c t i o n   f u n c t i o n s
4775  */
4776 
4777 /*
4778  * pmap_page_protect: change the protection of all recorded mappings
4779  * of a managed page
4780  *
4781  * => NOTE: this is an inline function in pmap.h
4782  */
4783 
4784 /* see pmap.h */
4785 
4786 /*
4787  * pmap_pv_protect: change the protection of all recorded mappings
4788  * of an unmanaged pv-tracked page
4789  *
4790  * => NOTE: this is an inline function in pmap.h
4791  */
4792 
4793 /* see pmap.h */
4794 
4795 /*
4796  * pmap_protect: set the protection of the pages in a pmap
4797  *
4798  * => NOTE: this is an inline function in pmap.h
4799  */
4800 
4801 /* see pmap.h */
4802 
4803 /*
4804  * pmap_write_protect: write-protect pages in a pmap.
4805  *
4806  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4807  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4808  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4809  * present the page will still be considered a kernel page, and the privilege
4810  * separation will be enforced correctly.
4811  */
4812 void
4813 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4814 {
4815 	pt_entry_t bit_rem, bit_put;
4816 	pt_entry_t *ptes;
4817 	pt_entry_t * const *pdes;
4818 	struct pmap *pmap2;
4819 	vaddr_t blockend, va;
4820 	int lvl, i;
4821 
4822 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4823 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4824 		return;
4825 	}
4826 
4827 	bit_rem = 0;
4828 	if (!(prot & VM_PROT_WRITE))
4829 		bit_rem = PTE_W;
4830 
4831 	bit_put = 0;
4832 	if (!(prot & VM_PROT_EXECUTE))
4833 		bit_put = pmap_pg_nx;
4834 
4835 	sva &= ~PAGE_MASK;
4836 	eva &= ~PAGE_MASK;
4837 
4838 	/*
4839 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4840 	 * be touching PV entries nor stats and kernel PDEs aren't
4841 	 * freed.
4842 	 */
4843 	if (pmap != pmap_kernel()) {
4844 		mutex_enter(&pmap->pm_lock);
4845 	}
4846 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4847 
4848 	for (va = sva ; va < eva; va = blockend) {
4849 		pt_entry_t *spte, *epte;
4850 
4851 		blockend = x86_round_pdr(va + 1);
4852 		if (blockend > eva)
4853 			blockend = eva;
4854 
4855 		/* Is it a valid block? */
4856 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4857 			continue;
4858 		}
4859 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4860 		KASSERT(lvl == 1);
4861 
4862 		spte = &ptes[pl1_i(va)];
4863 		epte = &ptes[pl1_i(blockend)];
4864 
4865 		for (i = 0; spte < epte; spte++, i++) {
4866 			pt_entry_t opte, npte;
4867 
4868 			do {
4869 				opte = *spte;
4870 				if (!pmap_valid_entry(opte)) {
4871 					goto next;
4872 				}
4873 				npte = (opte & ~bit_rem) | bit_put;
4874 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4875 
4876 			if ((opte & PTE_D) != 0) {
4877 				vaddr_t tva = va + x86_ptob(i);
4878 				pmap_tlb_shootdown(pmap, tva, opte,
4879 				    TLBSHOOT_WRITE_PROTECT);
4880 			}
4881 next:;
4882 		}
4883 	}
4884 
4885 	/* Release pmap. */
4886 	pmap_unmap_ptes(pmap, pmap2);
4887 	if (pmap != pmap_kernel()) {
4888 		mutex_exit(&pmap->pm_lock);
4889 	}
4890 }
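
/*
 * Illustrative example (not a required calling convention): revoking
 * write access to a range while keeping it readable and executable
 * would be
 *
 *	pmap_write_protect(pmap, sva, eva, VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * which strips PTE_W from every valid PTE in [sva, eva) and issues TLB
 * shootdowns only for entries that were dirty (PTE_D set).
 */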
4891 
4892 /*
4893  * pmap_unwire: clear the wired bit in the PTE.
4894  *
4895  * => Mapping should already be present.
4896  */
4897 void
4898 pmap_unwire(struct pmap *pmap, vaddr_t va)
4899 {
4900 	pt_entry_t *ptes, *ptep, opte;
4901 	pd_entry_t * const *pdes;
4902 	struct pmap *pmap2;
4903 	int lvl;
4904 
4905 	if (__predict_false(pmap->pm_unwire != NULL)) {
4906 		(*pmap->pm_unwire)(pmap, va);
4907 		return;
4908 	}
4909 
4910 	/*
4911 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4912 	 * statistics.
4913 	 */
4914 	mutex_enter(&pmap->pm_lock);
4915 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4916 
4917 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4918 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4919 	}
4920 	KASSERT(lvl == 1);
4921 
4922 	ptep = &ptes[pl1_i(va)];
4923 	opte = *ptep;
4924 	KASSERT(pmap_valid_entry(opte));
4925 
4926 	if (opte & PTE_WIRED) {
4927 		pt_entry_t npte = opte & ~PTE_WIRED;
4928 
4929 		opte = pmap_pte_testset(ptep, npte);
4930 		pmap_stats_update_bypte(pmap, npte, opte);
4931 	} else {
4932 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4933 		    " did not change!\n", __func__, pmap, va);
4934 	}
4935 
4936 	/* Release pmap. */
4937 	pmap_unmap_ptes(pmap, pmap2);
4938 	mutex_exit(&pmap->pm_lock);
4939 }
4940 
4941 /*
4942  * pmap_copy: copy mappings from one pmap to another
4943  *
4944  * => optional function
4945  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4946  */
4947 
4948 /*
4949  * defined as macro in pmap.h
4950  */
4951 
4952 __strict_weak_alias(pmap_enter, pmap_enter_default);
4953 
4954 int
4955 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4956     u_int flags)
4957 {
4958 	if (__predict_false(pmap->pm_enter != NULL)) {
4959 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4960 	}
4961 
4962 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4963 }
4964 
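/*
 * Illustrative sketch of a typical (assumed) caller, in the style of a
 * fault handler establishing a writable mapping:
 *
 *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_CANFAIL | VM_PROT_READ | VM_PROT_WRITE);
 *	if (error != 0) {
 *		... back off, wait for memory, then retry ...
 *	}
 *
 * The VM_PROT_* access-type bits passed in "flags" pre-set PTE_A (and
 * PTE_D for writes) so the first access does not fault again, and
 * PMAP_CANFAIL turns resource shortages into an error return instead
 * of a panic.
 */
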
4965 /*
4966  * pmap_enter: enter a mapping into a pmap
4967  *
4968  * => must be done "now" ... no lazy-evaluation
4969  */
4970 int
4971 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4972 	   vm_prot_t prot, u_int flags, int domid)
4973 {
4974 	pt_entry_t *ptes, opte, npte;
4975 	pt_entry_t *ptep;
4976 	pd_entry_t * const *pdes;
4977 	struct vm_page *ptp;
4978 	struct vm_page *new_pg, *old_pg;
4979 	struct pmap_page *new_pp, *old_pp;
4980 	struct pv_entry *old_pve, *new_pve;
4981 	bool wired = (flags & PMAP_WIRED) != 0;
4982 	struct pmap *pmap2;
4983 	struct pmap_ptparray pt;
4984 	int error;
4985 	bool getptp, samepage, new_embedded;
4986 	rb_tree_t *tree;
4987 
4988 	KASSERT(pmap_initialized);
4989 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4990 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4991 	    PRIxVADDR " over PDP!", __func__, va);
4992 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4993 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4994 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4995 
4996 #ifdef XENPV
4997 	KASSERT(domid == DOMID_SELF || pa == 0);
4998 #endif
4999 
5000 	npte = ma | protection_codes[prot] | PTE_P;
5001 	npte |= pmap_pat_flags(flags);
5002 	if (wired)
5003 		npte |= PTE_WIRED;
5004 	if (va < VM_MAXUSER_ADDRESS) {
5005 		KASSERTMSG(pmap != pmap_kernel(),
5006 		    "entering user va %#"PRIxVADDR" into kernel pmap",
5007 		    va);
5008 		if (pmap_is_user(pmap))
5009 			npte |= PTE_U;
5010 	}
5011 
5012 	if (pmap == pmap_kernel())
5013 		npte |= pmap_pg_g;
5014 	if (flags & VM_PROT_ALL) {
5015 		npte |= PTE_A;
5016 		if (flags & VM_PROT_WRITE) {
5017 			KASSERT((npte & PTE_W) != 0);
5018 			npte |= PTE_D;
5019 		}
5020 	}
5021 
5022 #ifdef XENPV
5023 	if (domid != DOMID_SELF)
5024 		new_pg = NULL;
5025 	else
5026 #endif
5027 		new_pg = PHYS_TO_VM_PAGE(pa);
5028 
5029 	if (new_pg != NULL) {
5030 		/* This is a managed page */
5031 		npte |= PTE_PVLIST;
5032 		new_pp = VM_PAGE_TO_PP(new_pg);
5033 		PMAP_CHECK_PP(new_pp);
5034 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5035 		/* This is an unmanaged pv-tracked page */
5036 		npte |= PTE_PVLIST;
5037 		PMAP_CHECK_PP(new_pp);
5038 	} else {
5039 		new_pp = NULL;
5040 	}
5041 
5042 	/* Begin by locking the pmap. */
5043 	mutex_enter(&pmap->pm_lock);
5044 
5045 	/* Look up the PTP.  Allocate if none present. */
5046 	ptp = NULL;
5047 	getptp = false;
5048 	if (pmap != pmap_kernel()) {
5049 		ptp = pmap_find_ptp(pmap, va, 1);
5050 		if (ptp == NULL) {
5051 			getptp = true;
5052 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5053 			if (error != 0) {
5054 				if (flags & PMAP_CANFAIL) {
5055 					mutex_exit(&pmap->pm_lock);
5056 					return error;
5057 				}
5058 				panic("%s: get ptp failed, error=%d", __func__,
5059 				    error);
5060 			}
5061 		}
5062 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5063 	} else {
5064 		/* Embedded PV entries rely on this. */
5065 		KASSERT(va != 0);
5066 		tree = &pmap_kernel_rb;
5067 	}
5068 
5069 	/*
5070 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5071 	 * entry if required for the new mapping.  Temporarily track the old
5072 	 * and new mappings concurrently.  Only after the old mapping is
5073 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5074 	 * our picture of modified/accessed state for either page could get
5075 	 * out of sync (we need any P->V operation for either page to stall
5076 	 * on pmap->pm_lock until done here).
5077 	 */
5078 	new_pve = NULL;
5079 	old_pve = NULL;
5080 	samepage = false;
5081 	new_embedded = false;
5082 
5083 	if (new_pp != NULL) {
5084 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5085 		    &old_pve, &samepage, &new_embedded, tree);
5086 
5087 		/*
5088 		 * If a new pv_entry was needed and none was available, we
5089 		 * can go no further.
5090 		 */
5091 		if (error != 0) {
5092 			if (flags & PMAP_CANFAIL) {
5093 				if (getptp) {
5094 					pmap_unget_ptp(pmap, &pt);
5095 				}
5096 				mutex_exit(&pmap->pm_lock);
5097 				return error;
5098 			}
5099 			panic("%s: alloc pve failed", __func__);
5100 		}
5101 	} else {
5102 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5103 	}
5104 
5105 	/* Map PTEs into address space. */
5106 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5107 
5108 	/* Install any newly allocated PTPs. */
5109 	if (getptp) {
5110 		pmap_install_ptp(pmap, &pt, va, pdes);
5111 	}
5112 
5113 	/* Check if there is an existing mapping. */
5114 	ptep = &ptes[pl1_i(va)];
5115 	opte = *ptep;
5116 	bool have_oldpa = pmap_valid_entry(opte);
5117 	paddr_t oldpa = pmap_pte2pa(opte);
5118 
5119 	/*
5120 	 * Update the pte.
5121 	 */
5122 	do {
5123 		opte = *ptep;
5124 
5125 		/*
5126 		 * if the same page, inherit PTE_A and PTE_D.
5127 		 */
5128 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5129 			npte |= opte & (PTE_A | PTE_D);
5130 		}
5131 #if defined(XENPV)
5132 		if (domid != DOMID_SELF) {
5133 			/* pmap_pte_cas with error handling */
5134 			int s = splvm();
5135 			if (opte != *ptep) {
5136 				splx(s);
5137 				continue;
5138 			}
5139 			error = xpq_update_foreign(
5140 			    vtomach((vaddr_t)ptep), npte, domid, flags);
5141 			splx(s);
5142 			if (error) {
5143 				/* Undo pv_entry tracking - oof. */
5144 				if (new_pp != NULL) {
5145 					mutex_spin_enter(&new_pp->pp_lock);
5146 					if (new_pve != NULL) {
5147 						LIST_REMOVE(new_pve, pve_list);
5148 						KASSERT(pmap->pm_pve == NULL);
5149 						pmap->pm_pve = new_pve;
5150 					} else if (new_embedded) {
5151 						new_pp->pp_pte.pte_ptp = NULL;
5152 						new_pp->pp_pte.pte_va = 0;
5153 					}
5154 					mutex_spin_exit(&new_pp->pp_lock);
5155 				}
5156 				pmap_unmap_ptes(pmap, pmap2);
5157 				/* Free new PTP. */
5158 				if (ptp != NULL && ptp->wire_count <= 1) {
5159 					pmap_free_ptp(pmap, ptp, va, ptes,
5160 					    pdes);
5161 				}
5162 				mutex_exit(&pmap->pm_lock);
5163 				return error;
5164 			}
5165 			break;
5166 		}
5167 #endif /* defined(XENPV) */
5168 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
5169 
5170 	/*
5171 	 * Done with the PTEs: they can now be unmapped.
5172 	 */
5173 	pmap_unmap_ptes(pmap, pmap2);
5174 
5175 	/*
5176 	 * Update statistics and PTP's reference count.
5177 	 */
5178 	pmap_stats_update_bypte(pmap, npte, opte);
5179 	if (ptp != NULL) {
5180 		if (!have_oldpa) {
5181 			ptp->wire_count++;
5182 		}
5183 		/* Remember minimum VA in PTP. */
5184 		pmap_ptp_range_set(ptp, va);
5185 	}
5186 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5187 
5188 	/*
5189 	 * If the same page, we can skip pv_entry handling.
5190 	 */
5191 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5192 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5193 		if ((npte & PTE_PVLIST) != 0) {
5194 			KASSERT(samepage);
5195 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5196 		}
5197 		goto same_pa;
5198 	} else if ((npte & PTE_PVLIST) != 0) {
5199 		KASSERT(!samepage);
5200 	}
5201 
5202 	/*
5203 	 * If old page is pv-tracked, remove pv_entry from its list.
5204 	 */
5205 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5206 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5207 			old_pp = VM_PAGE_TO_PP(old_pg);
5208 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5209 			panic("%s: PTE_PVLIST with pv-untracked page"
5210 			    " va = %#"PRIxVADDR
5211 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5212 			    __func__, va, oldpa, atop(pa));
5213 		}
5214 
5215 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5216 		    pmap_pte_to_pp_attrs(opte));
5217 	} else {
5218 		KASSERT(old_pve == NULL);
5219 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5220 	}
5221 
5222 	/*
5223 	 * If new page is dynamically PV tracked, insert to tree.
5224 	 */
5225 	if (new_pve != NULL) {
5226 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5227 		old_pve = rb_tree_insert_node(tree, new_pve);
5228 		KASSERT(old_pve == new_pve);
5229 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5230 	}
5231 
5232 same_pa:
5233 	/*
5234 	 * shootdown tlb if necessary.
5235 	 */
5236 
5237 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5238 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5239 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5240 	}
5241 	pmap_drain_pv(pmap);
5242 	mutex_exit(&pmap->pm_lock);
5243 	return 0;
5244 }
5245 
5246 #if defined(XEN) && defined(DOM0OPS)
5247 
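/*
 * Grant-table mappings (Xen dom0): a pmap that maps foreign grant
 * pages keeps a list of pmap_data_gnt descriptors hanging off its
 * pm_data, one per contiguous VA range, recording the
 * gnttab_map_grant_ref ops so that pmap_remove_gnt() can later unmap
 * the grants again.
 */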
5248 struct pmap_data_gnt {
5249 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5250 	vaddr_t pd_gnt_sva;
5251 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5252 	int pd_gnt_refs; /* ref counter */
5253 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5254 };
5255 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5256 
5257 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5258 
5259 static struct pmap_data_gnt *
5260 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5261 {
5262 	struct pmap_data_gnt_head *headp;
5263 	struct pmap_data_gnt *pgnt;
5264 
5265 	KASSERT(mutex_owned(&pmap->pm_lock));
5266 	headp = pmap->pm_data;
5267 	KASSERT(headp != NULL);
5268 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5269 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5270 			return pgnt;
5271 		/* check that we're not overlapping part of a region */
5272 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5273 	}
5274 	return NULL;
5275 }
5276 
5277 static void
5278 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5279     const struct gnttab_map_grant_ref *ops)
5280 {
5281 	struct pmap_data_gnt_head *headp;
5282 	struct pmap_data_gnt *pgnt;
5283 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5284 	KASSERT(mutex_owned(&pmap->pm_lock));
5285 	KASSERT(nentries >= 1);
5286 	if (pmap->pm_remove == NULL) {
5287 		pmap->pm_remove = pmap_remove_gnt;
5288 		KASSERT(pmap->pm_data == NULL);
5289 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5290 		SLIST_INIT(headp);
5291 		pmap->pm_data = headp;
5292 	} else {
5293 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5294 		KASSERT(pmap->pm_data != NULL);
5295 		headp = pmap->pm_data;
5296 	}
5297 
5298 	pgnt = pmap_find_gnt(pmap, sva, eva);
5299 	if (pgnt != NULL) {
5300 		KASSERT(pgnt->pd_gnt_sva == sva);
5301 		KASSERT(pgnt->pd_gnt_eva == eva);
5302 		return;
5303 	}
5304 
5305 	/* new entry */
5306 	pgnt = kmem_alloc(sizeof(*pgnt) +
5307 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5308 	pgnt->pd_gnt_sva = sva;
5309 	pgnt->pd_gnt_eva = eva;
5310 	pgnt->pd_gnt_refs = 0;
5311 	memcpy(pgnt->pd_gnt_ops, ops,
5312 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5313 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5314 }
5315 
5316 static void
5317 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5318 {
5319 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5320 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5321 	KASSERT(nentries >= 1);
5322 	KASSERT(mutex_owned(&pmap->pm_lock));
5323 	KASSERT(pgnt->pd_gnt_refs == 0);
5324 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5325 	kmem_free(pgnt, sizeof(*pgnt) +
5326 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5327 	if (SLIST_EMPTY(headp)) {
5328 		kmem_free(headp, sizeof(*headp));
5329 		pmap->pm_data = NULL;
5330 		pmap->pm_remove = NULL;
5331 	}
5332 }
5333 
5334 /*
5335  * pmap_enter_gnt: enter a grant entry into a pmap
5336  *
5337  * => must be done "now" ... no lazy-evaluation
5338  */
5339 int
5340 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5341     const struct gnttab_map_grant_ref *oops)
5342 {
5343 	struct pmap_data_gnt *pgnt;
5344 	pt_entry_t *ptes, opte;
5345 #ifndef XENPV
5346 	pt_entry_t npte;
5347 #endif
5348 	pt_entry_t *ptep;
5349 	pd_entry_t * const *pdes;
5350 	struct vm_page *ptp;
5351 	struct vm_page *old_pg;
5352 	struct pmap_page *old_pp;
5353 	struct pv_entry *old_pve;
5354 	struct pmap *pmap2;
5355 	struct pmap_ptparray pt;
5356 	int error;
5357 	bool getptp;
5358 	rb_tree_t *tree;
5359 	struct gnttab_map_grant_ref *op;
5360 	int ret;
5361 	int idx;
5362 
5363 	KASSERT(pmap_initialized);
5364 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5365 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5366 	    PRIxVADDR " over PDP!", __func__, va);
5367 	KASSERT(pmap != pmap_kernel());
5368 
5369 	/* Begin by locking the pmap. */
5370 	mutex_enter(&pmap->pm_lock);
5371 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5372 
5373 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5374 	KASSERT(pgnt != NULL);
5375 
5376 	/* Look up the PTP.  Allocate if none present. */
5377 	ptp = NULL;
5378 	getptp = false;
5379 	ptp = pmap_find_ptp(pmap, va, 1);
5380 	if (ptp == NULL) {
5381 		getptp = true;
5382 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5383 		if (error != 0) {
5384 			mutex_exit(&pmap->pm_lock);
5385 			return error;
5386 		}
5387 	}
5388 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5389 
5390 	/*
5391 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5392 	 * entry if required for the new mapping.  Temporarily track the old
5393 	 * and new mappings concurrently.  Only after the old mapping is
5394 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5395 	 * our picture of modified/accessed state for either page could get
5396 	 * out of sync (we need any P->V operation for either page to stall
5397 	 * on pmap->pm_lock until done here).
5398 	 */
5399 	old_pve = NULL;
5400 
5401 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5402 
5403 	/* Map PTEs into address space. */
5404 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5405 
5406 	/* Install any newly allocated PTPs. */
5407 	if (getptp) {
5408 		pmap_install_ptp(pmap, &pt, va, pdes);
5409 	}
5410 
5411 	/* Check if there is an existing mapping. */
5412 	ptep = &ptes[pl1_i(va)];
5413 	opte = *ptep;
5414 	bool have_oldpa = pmap_valid_entry(opte);
5415 	paddr_t oldpa = pmap_pte2pa(opte);
5416 
5417 	/*
5418 	 * Update the pte.
5419 	 */
5420 
5421 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5422 	op = &pgnt->pd_gnt_ops[idx];
5423 
5424 #ifdef XENPV
5425 	KASSERT(op->flags & GNTMAP_contains_pte);
5426 	op->host_addr = xpmap_ptetomach(ptep);
5427 #else
5428 	KASSERT((op->flags & GNTMAP_contains_pte) == 0);
5429 	KASSERT(op->flags != 0);
5430 	KASSERT(op->host_addr != 0);
5431 #endif
5432 	op->dev_bus_addr = 0;
5433 	op->status = GNTST_general_error;
5434 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5435 	if (__predict_false(ret)) {
5436 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5437 		    __func__, ret);
5438 		op->status = GNTST_general_error;
5439 	}
5440 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5441 		kpause("gntmap", false, mstohz(1), NULL);
5442 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5443 		if (__predict_false(ret)) {
5444 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5445 			    __func__, ret);
5446 			op->status = GNTST_general_error;
5447 		}
5448 	}
5449 	if (__predict_false(op->status != GNTST_okay)) {
5450 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5451 		    __func__, op->status);
5452 		if (have_oldpa) { /* XXX did the pte really change if XENPV? */
5453 			ptp->wire_count--;
5454 		}
5455 	} else {
5456 #ifndef XENPV
5457 		npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
5458 		if ((op->flags & GNTMAP_readonly) == 0)
5459 			npte |= PTE_W;
5460 		do {
5461 			opte = *ptep;
5462 		} while (pmap_pte_cas(ptep, opte, npte) != opte);
5463 #endif
5464 		pgnt->pd_gnt_refs++;
5465 		if (!have_oldpa) {
5466 			ptp->wire_count++;
5467 		}
5468 		KASSERT(ptp->wire_count > 1);
5469 		/* Remember minimum VA in PTP. */
5470 		pmap_ptp_range_set(ptp, va);
5471 	}
5472 	if (ptp->wire_count <= 1)
5473 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5474 
5475 	/*
5476 	 * Done with the PTEs: they can now be unmapped.
5477 	 */
5478 	pmap_unmap_ptes(pmap, pmap2);
5479 
5480 	/*
5481 	 * Update statistics and PTP's reference count.
5482 	 */
5483 	pmap_stats_update_bypte(pmap, 0, opte);
5484 
5485 	/*
5486 	 * If old page is pv-tracked, remove pv_entry from its list.
5487 	 */
5488 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5489 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5490 			old_pp = VM_PAGE_TO_PP(old_pg);
5491 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5492 			panic("%s: PTE_PVLIST with pv-untracked page"
5493 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5494 			    __func__, va, oldpa);
5495 		}
5496 
5497 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5498 		    pmap_pte_to_pp_attrs(opte));
5499 	} else {
5500 		KASSERT(old_pve == NULL);
5501 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5502 	}
5503 
5504 	pmap_drain_pv(pmap);
5505 	mutex_exit(&pmap->pm_lock);
5506 	return op->status;
5507 }
5508 
5509 /*
5510  * pmap_remove_gnt: grant mapping removal function.
5511  *
5512  * => caller should not be holding any pmap locks
5513  */
5514 static void
5515 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5516 {
5517 	struct pmap_data_gnt *pgnt;
5518 	pt_entry_t *ptes;
5519 	pd_entry_t pde;
5520 	pd_entry_t * const *pdes;
5521 	struct vm_page *ptp;
5522 	struct pmap *pmap2;
5523 	vaddr_t va;
5524 	int lvl;
5525 	int idx;
5526 	struct gnttab_map_grant_ref *op;
5527 	struct gnttab_unmap_grant_ref unmap_op;
5528 	int ret;
5529 
5530 	KASSERT(pmap != pmap_kernel());
5531 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5532 
5533 	mutex_enter(&pmap->pm_lock);
5534 	for (va = sva; va < eva; va += PAGE_SIZE) {
5535 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5536 		if (pgnt == NULL) {
5537 			pmap_remove_locked(pmap, sva, eva);
5538 			continue;
5539 		}
5540 
5541 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5542 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5543 			panic("pmap_remove_gnt pdes not valid");
5544 		}
5545 
5546 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5547 		op = &pgnt->pd_gnt_ops[idx];
5548 		KASSERT(lvl == 1);
5549 
5550 		/* Get PTP if non-kernel mapping. */
5551 		ptp = pmap_find_ptp(pmap, va, 1);
5552 		KASSERTMSG(ptp != NULL,
5553 		    "%s: unmanaged PTP detected", __func__);
5554 
5555 		if (op->status == GNTST_okay)  {
5556 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5557 #ifdef XENPV
5558 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5559 #else
5560 			unmap_op.host_addr = op->host_addr;
5561 			pmap_pte_testset(&ptes[pl1_i(va)], 0);
5562 #endif
5563 			unmap_op.handle = op->handle;
5564 			unmap_op.dev_bus_addr = 0;
5565 			ret = HYPERVISOR_grant_table_op(
5566 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5567 			if (ret) {
5568 				printf("%s: GNTTABOP_unmap_grant_ref "
5569 				    "failed: %d\n", __func__, ret);
5570 			}
5571 
5572 			ptp->wire_count--;
5573 			pgnt->pd_gnt_refs--;
5574 		}
5575 		if (pgnt->pd_gnt_refs == 0) {
5576 			pmap_free_gnt(pmap, pgnt);
5577 		}
5578 		/*
5579 		 * if mapping removed and the PTP is no longer
5580 		 * being used, free it!
5581 		 */
5582 
5583 		if (ptp->wire_count <= 1)
5584 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5585 		pmap_unmap_ptes(pmap, pmap2);
5586 	}
5587 	mutex_exit(&pmap->pm_lock);
5588 }
5589 #endif /* XEN && DOM0OPS */
5590 
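/*
 * pmap_get_physpage: allocate and zero a physical page for use as a
 * page table page when growing the kernel pmap.
 *
 * => before uvm_page_init() has run, the page is taken straight from
 *    uvm_page_physget() and zeroed by hand; afterwards a regular
 *    zeroed uvm page is used.
 */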
5591 paddr_t
5592 pmap_get_physpage(void)
5593 {
5594 	struct vm_page *ptp;
5595 	struct pmap *kpm = pmap_kernel();
5596 	paddr_t pa;
5597 
5598 	if (!uvm.page_init_done) {
5599 		/*
5600 		 * We're growing the kernel pmap early (from
5601 		 * uvm_pageboot_alloc()). This case must be
5602 		 * handled a little differently.
5603 		 */
5604 
5605 		if (!uvm_page_physget(&pa))
5606 			panic("%s: out of memory", __func__);
5607 #if defined(__HAVE_DIRECT_MAP)
5608 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5609 #else
5610 #if defined(XENPV)
5611 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5612 			xen_pagezero(pa);
5613 			return pa;
5614 		}
5615 #endif
5616 		kpreempt_disable();
5617 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5618 		    PTE_W | pmap_pg_nx);
5619 		pmap_pte_flush();
5620 		pmap_update_pg((vaddr_t)early_zerop);
5621 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5622 #if defined(DIAGNOSTIC) || defined(XENPV)
5623 		pmap_pte_set(early_zero_pte, 0);
5624 		pmap_pte_flush();
5625 #endif /* defined(DIAGNOSTIC) */
5626 		kpreempt_enable();
5627 #endif /* defined(__HAVE_DIRECT_MAP) */
5628 	} else {
5629 		/* XXX */
5630 		ptp = uvm_pagealloc(NULL, 0, NULL,
5631 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5632 		if (ptp == NULL)
5633 			panic("%s: out of memory", __func__);
5634 		ptp->flags &= ~PG_BUSY;
5635 		ptp->wire_count = 1;
5636 		pa = VM_PAGE_TO_PHYS(ptp);
5637 	}
5638 	pmap_stats_update(kpm, 1, 0);
5639 
5640 	return pa;
5641 }
5642 
5643 /*
5644  * Expand the page tree with the specified amount of PTPs, mapping virtual
5645  * addresses starting at kva. We populate all the levels but the last one
5646  * (L1). The nodes of the tree are created as RW, but the pages covered
5647  * will be kentered in L1, with proper permissions.
5648  *
5649  * Used only by pmap_growkernel.
5650  */
5651 static void
5652 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5653 {
5654 	unsigned long i;
5655 	paddr_t pa;
5656 	unsigned long index, endindex;
5657 	int level;
5658 	pd_entry_t *pdep;
5659 #ifdef XENPV
5660 	int s = splvm(); /* protect xpq_* */
5661 #endif
5662 
5663 	for (level = PTP_LEVELS; level > 1; level--) {
5664 		if (level == PTP_LEVELS)
5665 			pdep = cpm->pm_pdir;
5666 		else
5667 			pdep = normal_pdes[level - 2];
5668 		index = pl_i_roundup(kva, level);
5669 		endindex = index + needed_ptps[level - 1] - 1;
5670 
5671 		for (i = index; i <= endindex; i++) {
5672 			pt_entry_t pte;
5673 
5674 			KASSERT(!pmap_valid_entry(pdep[i]));
5675 			pa = pmap_get_physpage();
5676 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5677 #ifdef __x86_64__
5678 			pte |= pmap_pg_nx;
5679 #endif
5680 			pmap_pte_set(&pdep[i], pte);
5681 
5682 #ifdef XENPV
5683 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5684 				if (__predict_true(
5685 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5686 					/* update per-cpu PMDs on all cpus */
5687 					xen_kpm_sync(pmap_kernel(), i);
5688 				} else {
5689 					/*
5690 					 * too early; update primary CPU
5691 					 * PMD only (without locks)
5692 					 */
5693 #ifdef __x86_64__
5694 					pd_entry_t *cpu_pdep =
5695 						&cpu_info_primary.ci_kpm_pdir[i];
5696 #else
5697 					pd_entry_t *cpu_pdep =
5698 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5699 #endif
5700 					pmap_pte_set(cpu_pdep, pte);
5701 				}
5702 			}
5703 #endif
5704 
5705 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5706 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5707 			nkptp[level - 1]++;
5708 		}
5709 		pmap_pte_flush();
5710 	}
5711 #ifdef XENPV
5712 	splx(s);
5713 #endif
5714 }
5715 
5716 /*
5717  * pmap_growkernel: increase usage of KVM space.
5718  *
5719  * => we allocate new PTPs for the kernel and install them in all
5720  *    the pmaps on the system.
5721  */
5722 vaddr_t
5723 pmap_growkernel(vaddr_t maxkvaddr)
5724 {
5725 	struct pmap *kpm = pmap_kernel();
5726 	struct pmap *cpm;
5727 #if !defined(XENPV) || !defined(__x86_64__)
5728 	struct pmap *pm;
5729 	long old;
5730 #endif
5731 	int s, i;
5732 	long needed_kptp[PTP_LEVELS], target_nptp;
5733 	bool invalidate = false;
5734 
5735 	s = splvm();	/* to be safe */
5736 	mutex_enter(&kpm->pm_lock);
5737 
5738 	if (maxkvaddr <= pmap_maxkvaddr) {
5739 		mutex_exit(&kpm->pm_lock);
5740 		splx(s);
5741 		return pmap_maxkvaddr;
5742 	}
5743 
5744 	maxkvaddr = x86_round_pdr(maxkvaddr);
5745 #if !defined(XENPV) || !defined(__x86_64__)
5746 	old = nkptp[PTP_LEVELS - 1];
5747 #endif
5748 
5749 	/* Initialize needed_kptp. */
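	/*
	 * For each level, the number of new PTPs needed is the number of
	 * level-(i+1) entries spanning VM_MIN_KERNEL_ADDRESS..maxkvaddr,
	 * minus those already allocated (nkptp[i]).
	 */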
5750 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5751 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5752 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5753 
5754 		if (target_nptp > nkptpmax[i])
5755 			panic("out of KVA space");
5756 		KASSERT(target_nptp >= nkptp[i]);
5757 		needed_kptp[i] = target_nptp - nkptp[i];
5758 	}
5759 
5760 #ifdef XENPV
5761 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5762 	cpm = kpm;
5763 #else
5764 	/* Get the current pmap */
5765 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5766 		cpm = curcpu()->ci_pmap;
5767 	} else {
5768 		cpm = kpm;
5769 	}
5770 #endif
5771 
5772 	kasan_shadow_map((void *)pmap_maxkvaddr,
5773 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5774 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5775 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5776 
5777 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5778 
5779 	/*
5780 	 * If the number of top level entries changed, update all pmaps.
5781 	 */
5782 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5783 #ifdef XENPV
5784 #ifdef __x86_64__
5785 		/* nothing, kernel entries are never entered in user pmap */
5786 #else
5787 		int pdkidx;
5788 
5789 		mutex_enter(&pmaps_lock);
5790 		LIST_FOREACH(pm, &pmaps, pm_list) {
5791 			for (pdkidx = PDIR_SLOT_KERN + old;
5792 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5793 			    pdkidx++) {
5794 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5795 				    kpm->pm_pdir[pdkidx]);
5796 			}
5797 			pmap_pte_flush();
5798 		}
5799 		mutex_exit(&pmaps_lock);
5800 #endif /* __x86_64__ */
5801 #else /* XENPV */
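		/*
		 * Pull the new kernel PDEs into pmap_kernel() if they were
		 * created in the per-CPU pmap, then copy them into every
		 * native pmap on the system.
		 */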
5802 		size_t newpdes;
5803 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5804 		if (cpm != kpm) {
5805 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5806 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5807 			    newpdes * sizeof(pd_entry_t));
5808 		}
5809 
5810 		mutex_enter(&pmaps_lock);
5811 		LIST_FOREACH(pm, &pmaps, pm_list) {
5812 			if (__predict_false(pm->pm_enter != NULL)) {
5813 				/*
5814 				 * Not a native pmap, the kernel is not mapped,
5815 				 * so nothing to synchronize.
5816 				 */
5817 				continue;
5818 			}
5819 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5820 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5821 			    newpdes * sizeof(pd_entry_t));
5822 		}
5823 		mutex_exit(&pmaps_lock);
5824 #endif
5825 		invalidate = true;
5826 	}
5827 	pmap_maxkvaddr = maxkvaddr;
5828 	mutex_exit(&kpm->pm_lock);
5829 	splx(s);
5830 
5831 	if (invalidate && pmap_initialized) {
5832 		/* Invalidate the pmap cache. */
5833 		pool_cache_invalidate(&pmap_cache);
5834 	}
5835 
5836 	return maxkvaddr;
5837 }
5838 
5839 #ifdef DEBUG
5840 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5841 
5842 /*
5843  * pmap_dump: dump all the mappings from a pmap
5844  *
5845  * => caller should not be holding any pmap locks
5846  */
5847 void
5848 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5849 {
5850 	pt_entry_t *ptes, *pte;
5851 	pd_entry_t * const *pdes;
5852 	struct pmap *pmap2;
5853 	vaddr_t blkendva;
5854 	int lvl;
5855 
5856 	/*
5857 	 * if end is out of range, truncate.
5858 	 * if end <= start, update to max.
5859 	 */
5860 
5861 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5862 		eva = VM_MAXUSER_ADDRESS;
5863 
5864 	mutex_enter(&pmap->pm_lock);
5865 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5866 
5867 	/*
5868 	 * dumping a range of pages: we dump in PTP-sized blocks
5869 	 */
5870 
5871 	for (/* null */ ; sva < eva ; sva = blkendva) {
5872 
5873 		/* determine range of block */
5874 		blkendva = x86_round_pdr(sva+1);
5875 		if (blkendva > eva)
5876 			blkendva = eva;
5877 
5878 		/* valid block? */
5879 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5880 			continue;
5881 		KASSERT(lvl == 1);
5882 
5883 		pte = &ptes[pl1_i(sva)];
5884 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5885 			if (!pmap_valid_entry(*pte))
5886 				continue;
5887 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5888 			    " (pte=%#" PRIxPADDR ")\n",
5889 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5890 		}
5891 	}
5892 	pmap_unmap_ptes(pmap, pmap2);
5893 	mutex_exit(&pmap->pm_lock);
5894 }
5895 #endif
5896 
5897 /*
5898  * pmap_update: process deferred invalidations and frees.
5899  */
5900 void
5901 pmap_update(struct pmap *pmap)
5902 {
5903 	struct pmap_page *pp;
5904 	struct vm_page *ptp;
5905 
5906 	/*
5907 	 * Initiate any pending TLB shootdowns.  Wait for them to
5908 	 * complete before returning control to the caller.
5909 	 */
5910 	kpreempt_disable();
5911 	pmap_tlb_shootnow();
5912 	kpreempt_enable();
5913 
5914 	/*
5915 	 * Now that shootdowns are complete, process deferred frees.  This
5916 	 * is an unlocked check, but is safe as we're only interested in
5917 	 * work done in this LWP - we won't get a false negative.
5918 	 */
5919 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5920 		return;
5921 	}
5922 
5923 	mutex_enter(&pmap->pm_lock);
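	/*
	 * Return each deferred PTP's pmap_page to a pristine state and
	 * hand the page back to UVM.
	 */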
5924 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5925 		KASSERT(ptp->wire_count == 0);
5926 		KASSERT(ptp->uanon == NULL);
5927 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5928 		pp = VM_PAGE_TO_PP(ptp);
5929 		LIST_INIT(&pp->pp_pvlist);
5930 		pp->pp_attrs = 0;
5931 		pp->pp_pte.pte_ptp = NULL;
5932 		pp->pp_pte.pte_va = 0;
5933 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5934 
5935 		/*
5936 		 * XXX Hack to avoid extra locking, and lock
5937 		 * assertions in uvm_pagefree().  Despite uobject
5938 		 * being set, this isn't a managed page.
5939 		 */
5940 		PMAP_DUMMY_LOCK(pmap);
5941 		uvm_pagerealloc(ptp, NULL, 0);
5942 		PMAP_DUMMY_UNLOCK(pmap);
5943 		uvm_pagefree(ptp);
5944 	}
5945 	mutex_exit(&pmap->pm_lock);
5946 }
5947 
5948 #if PTP_LEVELS > 4
5949 #error "Unsupported number of page table mappings"
5950 #endif
5951 
5952 paddr_t
5953 pmap_init_tmp_pgtbl(paddr_t pg)
5954 {
5955 	static bool maps_loaded;
5956 	static const paddr_t x86_tmp_pml_paddr[] = {
5957 	    4 * PAGE_SIZE,	/* L1 */
5958 	    5 * PAGE_SIZE,	/* L2 */
5959 	    6 * PAGE_SIZE,	/* L3 */
5960 	    7 * PAGE_SIZE	/* L4 */
5961 	};
5962 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5963 
5964 	pd_entry_t *tmp_pml, *kernel_pml;
5965 
5966 	int level;
5967 
5968 	if (!maps_loaded) {
5969 		for (level = 0; level < PTP_LEVELS; ++level) {
5970 			x86_tmp_pml_vaddr[level] =
5971 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5972 			    UVM_KMF_VAONLY);
5973 
5974 			if (x86_tmp_pml_vaddr[level] == 0)
5975 				panic("mapping of real mode PML failed\n");
5976 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5977 			    x86_tmp_pml_paddr[level],
5978 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5979 		}
5980 		pmap_update(pmap_kernel());
5981 		maps_loaded = true;
5982 	}
5983 
5984 	/* Zero levels 1-3 */
5985 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5986 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5987 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
5988 	}
5989 
5990 	/* Copy PML4 */
5991 	kernel_pml = pmap_kernel()->pm_pdir;
5992 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5993 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
5994 
5995 #ifdef PAE
5996 	/*
5997 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5998 	 * last entries are unlikely to be used for temporary mappings.
5999 	 * 508: maps 0->1GB (userland)
6000 	 * 509: unused
6001 	 * 510: unused
6002 	 * 511: maps 3->4GB (kernel)
6003 	 */
6004 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6005 	tmp_pml[509] = 0;
6006 	tmp_pml[510] = 0;
6007 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6008 #endif
6009 
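	/*
	 * Link each level of the temporary tree to the one below it, so
	 * that a walk for 'pg' reaches the temporary L1 page.
	 */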
6010 	for (level = PTP_LEVELS - 1; level > 0; --level) {
6011 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6012 
6013 		tmp_pml[pl_i(pg, level + 1)] =
6014 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6015 	}
6016 
6017 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6018 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6019 
6020 #ifdef PAE
6021 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
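	/* With 8-byte PDEs, entry 508 is at byte offset 508 * 8 = 4064. */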
6022 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6023 #endif
6024 
6025 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6026 }
6027 
6028 u_int
6029 x86_mmap_flags(paddr_t mdpgno)
6030 {
6031 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6032 	u_int pflag = 0;
6033 
6034 	if (nflag & X86_MMAP_FLAG_PREFETCH)
6035 		pflag |= PMAP_WRITE_COMBINE;
6036 
6037 	return pflag;
6038 }
6039 
6040 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6041 
6042 /*
6043  * -----------------------------------------------------------------------------
6044  * *****************************************************************************
6045  * *****************************************************************************
6046  * *****************************************************************************
6047  * *****************************************************************************
6048  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6049  * *****************************************************************************
6050  * *****************************************************************************
6051  * *****************************************************************************
6052  * *****************************************************************************
6053  * -----------------------------------------------------------------------------
6054  *
6055  * These functions are invoked as callbacks from the code above. Contrary to
6056  * native, EPT does not have a recursive slot; therefore, it is not possible
6057  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
6058  * tree manually.
6059  *
6060  * Apart from that, the logic is mostly the same as native. Once a pmap has
6061  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6062  * After that we're good, and the callbacks will handle the translations
6063  * for us.
6064  *
6065  * -----------------------------------------------------------------------------
6066  */
6067 
6068 /* Hardware bits. */
6069 #define EPT_R		__BIT(0)	/* read */
6070 #define EPT_W		__BIT(1)	/* write */
6071 #define EPT_X		__BIT(2)	/* execute */
6072 #define EPT_T		__BITS(5,3)	/* type */
6073 #define		TYPE_UC	0
6074 #define		TYPE_WC	1
6075 #define		TYPE_WT	4
6076 #define		TYPE_WP	5
6077 #define		TYPE_WB	6
6078 #define EPT_NOPAT	__BIT(6)
6079 #define EPT_L		__BIT(7)	/* large */
6080 #define EPT_A		__BIT(8)	/* accessed */
6081 #define EPT_D		__BIT(9)	/* dirty */
6082 /* Software bits. */
6083 #define EPT_PVLIST	__BIT(60)
6084 #define EPT_WIRED	__BIT(61)
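/* The software bits are kept in high bits that the EPT hardware ignores. */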
6085 
6086 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
6087 
6088 bool pmap_ept_has_ad __read_mostly;
6089 
6090 static inline void
6091 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6092 {
6093 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6094 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6095 
6096 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6097 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6098 
6099 	pmap_stats_update(pmap, resid_diff, wired_diff);
6100 }
6101 
6102 static pt_entry_t
6103 pmap_ept_type(u_int flags)
6104 {
6105 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
6106 	pt_entry_t ret;
6107 
6108 	switch (cacheflags) {
6109 	case PMAP_NOCACHE:
6110 	case PMAP_NOCACHE_OVR:
6111 		ret = __SHIFTIN(TYPE_UC, EPT_T);
6112 		break;
6113 	case PMAP_WRITE_COMBINE:
6114 		ret = __SHIFTIN(TYPE_WC, EPT_T);
6115 		break;
6116 	case PMAP_WRITE_BACK:
6117 	default:
6118 		ret = __SHIFTIN(TYPE_WB, EPT_T);
6119 		break;
6120 	}
6121 
6122 	ret |= EPT_NOPAT;
6123 	return ret;
6124 }
6125 
6126 static inline pt_entry_t
6127 pmap_ept_prot(vm_prot_t prot)
6128 {
6129 	pt_entry_t res = 0;
6130 
6131 	if (prot & VM_PROT_READ)
6132 		res |= EPT_R;
6133 	if (prot & VM_PROT_WRITE)
6134 		res |= EPT_W;
6135 	if (prot & VM_PROT_EXECUTE)
6136 		res |= EPT_X;
6137 
6138 	return res;
6139 }
6140 
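/*
 * Convert EPT accessed/dirty/write state to the MI pp_attrs form.
 * Without hardware EPT A/D bits, conservatively report every mapping
 * as both accessed and dirty.
 */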
6141 static inline uint8_t
6142 pmap_ept_to_pp_attrs(pt_entry_t ept)
6143 {
6144 	uint8_t ret = 0;
6145 	if (pmap_ept_has_ad) {
6146 		if (ept & EPT_D)
6147 			ret |= PP_ATTRS_D;
6148 		if (ept & EPT_A)
6149 			ret |= PP_ATTRS_A;
6150 	} else {
6151 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
6152 	}
6153 	if (ept & EPT_W)
6154 		ret |= PP_ATTRS_W;
6155 	return ret;
6156 }
6157 
6158 static inline pt_entry_t
6159 pmap_pp_attrs_to_ept(uint8_t attrs)
6160 {
6161 	pt_entry_t ept = 0;
6162 	if (attrs & PP_ATTRS_D)
6163 		ept |= EPT_D;
6164 	if (attrs & PP_ATTRS_A)
6165 		ept |= EPT_A;
6166 	if (attrs & PP_ATTRS_W)
6167 		ept |= EPT_W;
6168 	return ept;
6169 }
6170 
6171 /*
6172  * Helper for pmap_ept_free_ptp.
6173  * tree[0] = &L2[L2idx]
6174  * tree[1] = &L3[L3idx]
6175  * tree[2] = &L4[L4idx]
6176  */
6177 static void
6178 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6179 {
6180 	pt_entry_t *pteva;
6181 	paddr_t ptepa;
6182 	int i, index;
6183 
6184 	ptepa = pmap->pm_pdirpa[0];
6185 	for (i = PTP_LEVELS; i > 1; i--) {
6186 		index = pl_pi(va, i);
6187 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6188 		KASSERT(pmap_ept_valid_entry(pteva[index]));
6189 		tree[i - 2] = &pteva[index];
6190 		ptepa = pmap_pte2pa(pteva[index]);
6191 	}
6192 }
6193 
6194 static void
6195 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6196 {
6197 	pd_entry_t *tree[3];
6198 	int level;
6199 
6200 	KASSERT(pmap != pmap_kernel());
6201 	KASSERT(mutex_owned(&pmap->pm_lock));
6202 	KASSERT(kpreempt_disabled());
6203 
6204 	pmap_ept_get_tree(pmap, va, tree);
6205 
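	/* Free this PTP, then walk upward freeing parents that become unused. */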
6206 	level = 1;
6207 	do {
6208 		(void)pmap_pte_testset(tree[level - 1], 0);
6209 
6210 		pmap_freepage(pmap, ptp, level);
6211 		if (level < PTP_LEVELS - 1) {
6212 			ptp = pmap_find_ptp(pmap, va, level + 1);
6213 			ptp->wire_count--;
6214 			if (ptp->wire_count > 1)
6215 				break;
6216 		}
6217 	} while (++level < PTP_LEVELS);
6218 	pmap_pte_flush();
6219 }
6220 
6221 /* Install the L4->L3->L2 PTPs allocated in 'pt', as needed for va. */
6222 static void
6223 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6224 {
6225 	struct vm_page *ptp;
6226 	unsigned long index;
6227 	pd_entry_t *pteva;
6228 	paddr_t ptepa;
6229 	int i;
6230 
6231 	KASSERT(pmap != pmap_kernel());
6232 	KASSERT(mutex_owned(&pmap->pm_lock));
6233 	KASSERT(kpreempt_disabled());
6234 
6235 	/*
6236 	 * Now that we have all the pages looked up or allocated,
6237 	 * loop through again installing any new ones into the tree.
6238 	 */
6239 	ptepa = pmap->pm_pdirpa[0];
6240 	for (i = PTP_LEVELS; i > 1; i--) {
6241 		index = pl_pi(va, i);
6242 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6243 
6244 		if (pmap_ept_valid_entry(pteva[index])) {
6245 			KASSERT(!pt->alloced[i]);
6246 			ptepa = pmap_pte2pa(pteva[index]);
6247 			continue;
6248 		}
6249 
6250 		ptp = pt->pg[i];
6251 		ptp->flags &= ~PG_BUSY; /* never busy */
6252 		ptp->wire_count = 1;
6253 		pmap->pm_ptphint[i - 2] = ptp;
6254 		ptepa = VM_PAGE_TO_PHYS(ptp);
6255 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6256 
6257 		pmap_pte_flush();
6258 		pmap_stats_update(pmap, 1, 0);
6259 
6260 		/*
6261 		 * If we're not in the top level, increase the
6262 		 * wire count of the parent page.
6263 		 */
6264 		if (i < PTP_LEVELS) {
6265 			pt->pg[i + 1]->wire_count++;
6266 		}
6267 	}
6268 }
6269 
6270 static int
6271 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6272     u_int flags)
6273 {
6274 	pt_entry_t *ptes, opte, npte;
6275 	pt_entry_t *ptep;
6276 	struct vm_page *ptp;
6277 	struct vm_page *new_pg, *old_pg;
6278 	struct pmap_page *new_pp, *old_pp;
6279 	struct pv_entry *old_pve, *new_pve;
6280 	bool wired = (flags & PMAP_WIRED) != 0;
6281 	bool accessed;
6282 	struct pmap_ptparray pt;
6283 	int error;
6284 	bool getptp, samepage, new_embedded;
6285 	rb_tree_t *tree;
6286 
6287 	KASSERT(pmap_initialized);
6288 	KASSERT(va < VM_MAXUSER_ADDRESS);
6289 
6290 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6291 
6292 	if (wired)
6293 		npte |= EPT_WIRED;
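	/*
	 * Pre-set the accessed/dirty bits implied by the access type
	 * carried in 'flags'.
	 */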
6294 	if (flags & VM_PROT_ALL) {
6295 		npte |= EPT_A;
6296 		if (flags & VM_PROT_WRITE) {
6297 			KASSERT((npte & EPT_W) != 0);
6298 			npte |= EPT_D;
6299 		}
6300 	}
6301 
6302 	new_pg = PHYS_TO_VM_PAGE(pa);
6303 	if (new_pg != NULL) {
6304 		/* This is a managed page */
6305 		npte |= EPT_PVLIST;
6306 		new_pp = VM_PAGE_TO_PP(new_pg);
6307 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6308 		/* This is an unmanaged pv-tracked page */
6309 		npte |= EPT_PVLIST;
6310 	} else {
6311 		new_pp = NULL;
6312 	}
6313 
6314 	/* Begin by locking the pmap. */
6315 	mutex_enter(&pmap->pm_lock);
6316 
6317 	/* Look up the PTP.  Allocate if none present. */
6318 	ptp = NULL;
6319 	getptp = false;
6320 	if (pmap != pmap_kernel()) {
6321 		ptp = pmap_find_ptp(pmap, va, 1);
6322 		if (ptp == NULL) {
6323 			getptp = true;
6324 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6325 			if (error != 0) {
6326 				if (flags & PMAP_CANFAIL) {
6327 					mutex_exit(&pmap->pm_lock);
6328 					return error;
6329 				}
6330 				panic("%s: get ptp failed, error=%d", __func__,
6331 				    error);
6332 			}
6333 		}
6334 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6335 	} else {
6336 		/* Embedded PV entries rely on this. */
6337 		KASSERT(va != 0);
6338 		tree = &pmap_kernel_rb;
6339 	}
6340 
6341 	/*
6342 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6343 	 * entry if required for the new mapping.  Temporarily track the old
6344 	 * and new mappings concurrently.  Only after the old mapping is
6345 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6346 	 * our picture of modified/accessed state for either page could get
6347 	 * out of sync (we need any P->V operation for either page to stall
6348 	 * on pmap->pm_lock until done here).
6349 	 */
6350 	new_pve = NULL;
6351 	old_pve = NULL;
6352 	samepage = false;
6353 	new_embedded = false;
6354 
6355 	if (new_pp != NULL) {
6356 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6357 		    &old_pve, &samepage, &new_embedded, tree);
6358 
6359 		/*
6360 		 * If a new pv_entry was needed and none was available, we
6361 		 * can go no further.
6362 		 */
6363 		if (error != 0) {
6364 			if (flags & PMAP_CANFAIL) {
6365 				if (getptp) {
6366 					pmap_unget_ptp(pmap, &pt);
6367 				}
6368 				mutex_exit(&pmap->pm_lock);
6369 				return error;
6370 			}
6371 			panic("%s: alloc pve failed", __func__);
6372 		}
6373 	} else {
6374 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6375 	}
6376 
6377 	/* Map PTEs into address space. */
6378 	kpreempt_disable();
6379 
6380 	/* Install any newly allocated PTPs. */
6381 	if (getptp) {
6382 		pmap_ept_install_ptp(pmap, &pt, va);
6383 	}
6384 
6385 	/* Check if there is an existing mapping. */
6386 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6387 	ptep = &ptes[pl1_pi(va)];
6388 	opte = *ptep;
6389 	bool have_oldpa = pmap_ept_valid_entry(opte);
6390 	paddr_t oldpa = pmap_pte2pa(opte);
6391 
6392 	/*
6393 	 * Update the pte.  Loop until the CAS succeeds, since the entry
	 * may change under us; if the frame is unchanged, inherit the old
	 * EPT_A and EPT_D bits.
6394 	 */
6395 	do {
6396 		opte = *ptep;
6397 
6398 		/*
6399 		 * if the same page, inherit EPT_A and EPT_D.
6400 		 */
6401 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6402 			npte |= opte & (EPT_A | EPT_D);
6403 		}
6404 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6405 
6406 	/*
6407 	 * Done with the PTEs: they can now be unmapped.
6408 	 */
6409 	kpreempt_enable();
6410 
6411 	/*
6412 	 * Update statistics and PTP's reference count.
6413 	 */
6414 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6415 	if (ptp != NULL) {
6416 		if (!have_oldpa) {
6417 			ptp->wire_count++;
6418 		}
6419 		/* Remember minimum VA in PTP. */
6420 		pmap_ptp_range_set(ptp, va);
6421 	}
6422 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6423 
6424 	/*
6425 	 * If the same page, we can skip pv_entry handling.
6426 	 */
6427 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6428 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6429 		if ((npte & EPT_PVLIST) != 0) {
6430 			KASSERT(samepage);
6431 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6432 		}
6433 		goto same_pa;
6434 	} else if ((npte & EPT_PVLIST) != 0) {
6435 		KASSERT(!samepage);
6436 	}
6437 
6438 	/*
6439 	 * If old page is pv-tracked, remove pv_entry from its list.
6440 	 */
6441 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6442 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6443 			old_pp = VM_PAGE_TO_PP(old_pg);
6444 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6445 			panic("%s: EPT_PVLIST with pv-untracked page"
6446 			    " va = %#"PRIxVADDR
6447 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6448 			    __func__, va, oldpa, atop(pa));
6449 		}
6450 
6451 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6452 		    pmap_ept_to_pp_attrs(opte));
6453 	} else {
6454 		KASSERT(old_pve == NULL);
6455 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6456 	}
6457 
6458 	/*
6459 	 * If new page is dynamically PV tracked, insert to tree.
6460 	 */
6461 	if (new_pve != NULL) {
6462 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6463 		old_pve = rb_tree_insert_node(tree, new_pve);
6464 		KASSERT(old_pve == new_pve);
6465 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6466 	}
6467 
6468 same_pa:
6469 	/*
6470 	 * shootdown tlb if necessary.
6471 	 */
6472 
6473 	if (pmap_ept_has_ad) {
6474 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6475 	} else {
6476 		accessed = (opte & EPT_R) != 0;
6477 	}
6478 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6479 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6480 	}
6481 	pmap_drain_pv(pmap);
6482 	mutex_exit(&pmap->pm_lock);
6483 	return 0;
6484 }
6485 
6486 /*
 * Pay close attention: returns 0 with the L2 entry in *lastpde on
 * success, or the level of the first invalid entry otherwise.
 */
6487 static int
6488 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6489 {
6490 	pt_entry_t *pteva;
6491 	paddr_t ptepa;
6492 	int i, index;
6493 
6494 	KASSERT(mutex_owned(&pmap->pm_lock));
6495 
6496 	ptepa = pmap->pm_pdirpa[0];
6497 	for (i = PTP_LEVELS; i > 1; i--) {
6498 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6499 		index = pl_pi(va, i);
6500 		if (!pmap_ept_valid_entry(pteva[index]))
6501 			return i;
6502 		ptepa = pmap_pte2pa(pteva[index]);
6503 	}
6504 	if (lastpde != NULL) {
6505 		*lastpde = pteva[index];
6506 	}
6507 
6508 	return 0;
6509 }
6510 
6511 static bool
6512 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6513 {
6514 	pt_entry_t *ptes, pte;
6515 	pd_entry_t pde;
6516 	paddr_t ptppa, pa;
6517 	bool rv;
6518 
6519 #ifdef __HAVE_DIRECT_MAP
6520 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6521 		if (pap != NULL) {
6522 			*pap = PMAP_DIRECT_UNMAP(va);
6523 		}
6524 		return true;
6525 	}
6526 #endif
6527 
6528 	rv = false;
6529 	pa = 0;
6530 
6531 	mutex_enter(&pmap->pm_lock);
6532 	kpreempt_disable();
6533 
6534 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6535 		ptppa = pmap_pte2pa(pde);
6536 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6537 		pte = ptes[pl1_pi(va)];
6538 		if (__predict_true((pte & EPT_R) != 0)) {
6539 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6540 			rv = true;
6541 		}
6542 	}
6543 
6544 	kpreempt_enable();
6545 	mutex_exit(&pmap->pm_lock);
6546 
6547 	if (pap != NULL) {
6548 		*pap = pa;
6549 	}
6550 	return rv;
6551 }
6552 
6553 static bool
6554 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6555     vaddr_t va)
6556 {
6557 	struct pv_entry *pve;
6558 	struct vm_page *pg;
6559 	struct pmap_page *pp;
6560 	pt_entry_t opte;
6561 	bool accessed;
6562 
6563 	KASSERT(pmap != pmap_kernel());
6564 	KASSERT(mutex_owned(&pmap->pm_lock));
6565 	KASSERT(kpreempt_disabled());
6566 
6567 	if (!pmap_ept_valid_entry(*pte)) {
6568 		/* VA not mapped. */
6569 		return false;
6570 	}
6571 
6572 	/* Atomically save the old PTE and zap it. */
6573 	opte = pmap_pte_testset(pte, 0);
6574 	if (!pmap_ept_valid_entry(opte)) {
6575 		return false;
6576 	}
6577 
6578 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6579 
6580 	if (ptp) {
6581 		/*
6582 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6583 		 */
6584 		ptp->wire_count--;
6585 		if (ptp->wire_count <= 1) {
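			/*
			 * Force EPT_A so the shootdown below is not
			 * skipped now that this PTP is about to go away.
			 */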
6586 			opte |= EPT_A;
6587 		}
6588 	}
6589 
6590 	if (pmap_ept_has_ad) {
6591 		accessed = (opte & EPT_A) != 0;
6592 	} else {
6593 		accessed = true;
6594 	}
6595 	if (accessed) {
6596 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6597 	}
6598 
6599 	/*
6600 	 * If we are not on a pv list - we are done.
6601 	 */
6602 	if ((opte & EPT_PVLIST) == 0) {
6603 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6604 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6605 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6606 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6607 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6608 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6609 		return true;
6610 	}
6611 
6612 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6613 		pp = VM_PAGE_TO_PP(pg);
6614 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6615 		paddr_t pa = pmap_pte2pa(opte);
6616 		panic("%s: EPT_PVLIST with pv-untracked page"
6617 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6618 		    __func__, va, pa, atop(pa));
6619 	}
6620 
6621 	/* Sync R/M bits. */
6622 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6623 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6624 	return true;
6625 }
6626 
6627 static void
6628 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6629     vaddr_t startva, vaddr_t endva)
6630 {
6631 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6632 
6633 	KASSERT(pmap != pmap_kernel());
6634 	KASSERT(mutex_owned(&pmap->pm_lock));
6635 	KASSERT(kpreempt_disabled());
6636 
6637 	/*
6638 	 * mappings are very often sparse, so clip the given range to the
6639 	 * range of PTEs that are known present in the PTP.
6640 	 */
6641 	pmap_ptp_range_clip(ptp, &startva, &pte);
6642 
6643 	/*
6644 	 * note that ptpva points to the PTE that maps startva.   this may
6645 	 * or may not be the first PTE in the PTP.
6646 	 *
6647 	 * we loop through the PTP while there are still PTEs to look at
6648 	 * and the wire_count is greater than 1 (because we use the wire_count
6649 	 * to keep track of the number of real PTEs in the PTP).
6650 	 */
6651 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6652 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6653 		startva += PAGE_SIZE;
6654 		pte++;
6655 	}
6656 }
6657 
6658 static void
6659 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6660 {
6661 	pt_entry_t *ptes;
6662 	pd_entry_t pde;
6663 	paddr_t ptppa;
6664 	vaddr_t blkendva, va = sva;
6665 	struct vm_page *ptp;
6666 
6667 	mutex_enter(&pmap->pm_lock);
6668 	kpreempt_disable();
6669 
6670 	for (/* null */ ; va < eva ; va = blkendva) {
6671 		int lvl;
6672 
6673 		/* determine range of block */
6674 		blkendva = x86_round_pdr(va+1);
6675 		if (blkendva > eva)
6676 			blkendva = eva;
6677 
6678 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6679 		if (lvl != 0) {
6680 			/* Skip a range corresponding to an invalid pde. */
6681 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6682 			continue;
6683 		}
6684 
6685 		/* PA of the PTP */
6686 		ptppa = pmap_pte2pa(pde);
6687 
6688 		ptp = pmap_find_ptp(pmap, va, 1);
6689 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6690 		    __func__);
6691 
6692 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6693 
6694 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6695 		    blkendva);
6696 
6697 		/* If PTP is no longer being used, free it. */
6698 		if (ptp && ptp->wire_count <= 1) {
6699 			pmap_ept_free_ptp(pmap, ptp, va);
6700 		}
6701 	}
6702 
6703 	kpreempt_enable();
6704 	pmap_drain_pv(pmap);
6705 	mutex_exit(&pmap->pm_lock);
6706 }
6707 
6708 static int
6709 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6710     uint8_t *oattrs, pt_entry_t *optep)
6711 {
6712 	struct pmap *pmap;
6713 	pt_entry_t *ptep;
6714 	pt_entry_t opte;
6715 	pt_entry_t npte;
6716 	pt_entry_t expect;
6717 	bool need_shootdown;
6718 
6719 	expect = pmap_pa2pte(pa) | EPT_R;
6720 	pmap = ptp_to_pmap(ptp);
6721 
6722 	if (clearbits != ~0) {
6723 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6724 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6725 	}
6726 
6727 	ptep = pmap_map_pte(pmap, ptp, va);
6728 	do {
6729 		opte = *ptep;
6730 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6731 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6732 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6733 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6734 			/*
6735 			 * We lost a race with a V->P operation like
6736 			 * pmap_remove().  Wait for the competitor
6737 			 * reflecting pte bits into mp_attrs.
6738 			 */
6739 			pmap_unmap_pte();
6740 			return EAGAIN;
6741 		}
6742 
6743 		/*
6744 		 * Check if there's anything to do on this PTE.
6745 		 */
6746 		if ((opte & clearbits) == 0) {
6747 			need_shootdown = false;
6748 			break;
6749 		}
6750 
6751 		/*
6752 		 * We need a shootdown if the PTE is cached (EPT_A) ...
6753 		 * ... Unless we are clearing only the EPT_W bit and
6754 		 * it isn't cached as RW (EPT_D).
6755 		 */
6756 		if (pmap_ept_has_ad) {
6757 			need_shootdown = (opte & EPT_A) != 0 &&
6758 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6759 		} else {
6760 			need_shootdown = true;
6761 		}
6762 
6763 		npte = opte & ~clearbits;
6764 
6765 		/*
6766 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6767 		 */
6768 		if (need_shootdown) {
6769 			npte &= ~(EPT_A | EPT_D);
6770 		}
6771 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6772 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6773 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6774 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6775 
6776 	if (need_shootdown) {
6777 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6778 	}
6779 	pmap_unmap_pte();
6780 
6781 	*oattrs = pmap_ept_to_pp_attrs(opte);
6782 	if (optep != NULL)
6783 		*optep = opte;
6784 	return 0;
6785 }
6786 
6787 static void
6788 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6789     vaddr_t va)
6790 {
6791 
6792 	KASSERT(mutex_owned(&pmap->pm_lock));
6793 
6794 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6795 	ptp->wire_count--;
6796 	if (ptp->wire_count <= 1) {
6797 		pmap_ept_free_ptp(pmap, ptp, va);
6798 	}
6799 }
6800 
6801 static void
6802 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6803 {
6804 	pt_entry_t bit_rem;
6805 	pt_entry_t *ptes, *spte;
6806 	pt_entry_t opte, npte;
6807 	pd_entry_t pde;
6808 	paddr_t ptppa;
6809 	vaddr_t va;
6810 	bool modified;
6811 
6812 	bit_rem = 0;
6813 	if (!(prot & VM_PROT_WRITE))
6814 		bit_rem = EPT_W;
6815 
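	/* Round both ends of the range down to page boundaries. */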
6816 	sva &= PTE_FRAME;
6817 	eva &= PTE_FRAME;
6818 
6819 	/* Acquire pmap. */
6820 	mutex_enter(&pmap->pm_lock);
6821 	kpreempt_disable();
6822 
6823 	for (va = sva; va < eva; va += PAGE_SIZE) {
6824 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6825 			continue;
6826 		}
6827 
6828 		ptppa = pmap_pte2pa(pde);
6829 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6830 		spte = &ptes[pl1_pi(va)];
6831 
6832 		do {
6833 			opte = *spte;
6834 			if (!pmap_ept_valid_entry(opte)) {
6835 				goto next;
6836 			}
6837 			npte = (opte & ~bit_rem);
6838 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6839 
6840 		if (pmap_ept_has_ad) {
6841 			modified = (opte & EPT_D) != 0;
6842 		} else {
6843 			modified = true;
6844 		}
6845 		if (modified) {
6846 			vaddr_t tva = x86_ptob(spte - ptes);
6847 			pmap_tlb_shootdown(pmap, tva, 0,
6848 			    TLBSHOOT_WRITE_PROTECT);
6849 		}
6850 next:;
6851 	}
6852 
6853 	kpreempt_enable();
6854 	mutex_exit(&pmap->pm_lock);
6855 }
6856 
6857 static void
6858 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6859 {
6860 	pt_entry_t *ptes, *ptep, opte;
6861 	pd_entry_t pde;
6862 	paddr_t ptppa;
6863 
6864 	/* Acquire pmap. */
6865 	mutex_enter(&pmap->pm_lock);
6866 	kpreempt_disable();
6867 
6868 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6869 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6870 	}
6871 
6872 	ptppa = pmap_pte2pa(pde);
6873 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6874 	ptep = &ptes[pl1_pi(va)];
6875 	opte = *ptep;
6876 	KASSERT(pmap_ept_valid_entry(opte));
6877 
6878 	if (opte & EPT_WIRED) {
6879 		pt_entry_t npte = opte & ~EPT_WIRED;
6880 
6881 		opte = pmap_pte_testset(ptep, npte);
6882 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6883 	} else {
6884 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6885 		    "did not change!\n", __func__, pmap, va);
6886 	}
6887 
6888 	/* Release pmap. */
6889 	kpreempt_enable();
6890 	mutex_exit(&pmap->pm_lock);
6891 }
6892 
6893 /* -------------------------------------------------------------------------- */
6894 
6895 void
6896 pmap_ept_transform(struct pmap *pmap)
6897 {
6898 	pmap->pm_enter = pmap_ept_enter;
6899 	pmap->pm_extract = pmap_ept_extract;
6900 	pmap->pm_remove = pmap_ept_remove;
6901 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6902 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6903 	pmap->pm_write_protect = pmap_ept_write_protect;
6904 	pmap->pm_unwire = pmap_ept_unwire;
6905 
6906 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6907 }
6908 
6909 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6910