1 /*	$NetBSD: pmap.c,v 1.409 2021/02/06 21:24:19 jdolecek Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.409 2021/02/06 21:24:19 jdolecek Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 
142 #define	__MUTEX_PRIVATE	/* for assertions */
143 
144 #include <sys/param.h>
145 #include <sys/systm.h>
146 #include <sys/proc.h>
147 #include <sys/pool.h>
148 #include <sys/kernel.h>
149 #include <sys/atomic.h>
150 #include <sys/cpu.h>
151 #include <sys/intr.h>
152 #include <sys/xcall.h>
153 #include <sys/kcore.h>
154 #include <sys/kmem.h>
155 #include <sys/asan.h>
156 #include <sys/msan.h>
157 #include <sys/entropy.h>
158 
159 #include <uvm/uvm.h>
160 #include <uvm/pmap/pmap_pvt.h>
161 
162 #include <dev/isa/isareg.h>
163 
164 #include <machine/specialreg.h>
165 #include <machine/gdt.h>
166 #include <machine/isa_machdep.h>
167 #include <machine/cpuvar.h>
168 #include <machine/cputypes.h>
169 
170 #include <x86/pmap_pv.h>
171 
172 #include <x86/i82489reg.h>
173 #include <x86/i82489var.h>
174 
175 #ifdef XEN
176 #include <xen/include/public/xen.h>
177 #include <xen/hypervisor.h>
178 #include <xen/xenpmap.h>
179 #endif
180 
181 /*
182  * general info:
183  *
184  *  - for an explanation of how the x86 MMU hardware works see
185  *    the comments in <machine/pte.h>.
186  *
187  *  - for an explanation of the general memory structure used by
188  *    this pmap (including the recursive mapping), see the comments
189  *    in <machine/pmap.h>.
190  *
191  * this file contains the code for the "pmap module."   the module's
192  * job is to manage the hardware's virtual to physical address mappings.
193  * note that there are two levels of mapping in the VM system:
194  *
195  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
196  *      to map ranges of virtual address space to objects/files.  for
197  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
198  *      to the file /bin/ls starting at offset zero."   note that
199  *      the upper layer mapping is not concerned with how individual
200  *      vm_pages are mapped.
201  *
202  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
203  *      from virtual addresses to physical pages.   it is concerned with
204  *      which vm_page is mapped where.   for example, when you run /bin/ls
205  *      and start at page 0x1000, the fault routine may look up the correct
206  *      page of the /bin/ls file and then ask the pmap layer to establish
207  *      a mapping for it.
208  *
209  * note that information in the lower layer of the VM system can be
210  * thrown away since it can easily be reconstructed from the info
211  * in the upper layer.
212  *
213  * data structures we use include:
214  *
215  *  - struct pmap: describes the address space of one thread
216  *  - struct pmap_page: describes one pv-tracked page, without
217  *    necessarily a corresponding vm_page
218  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
219  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
220  *    physical memory.   the pp_pvlist points to a list of pv_entry
221  *    structures which describe all the <PMAP,VA> pairs that this
222  *    page is mapped in.    this is critical for page based operations
223  *    such as pmap_page_protect() [change protection on _all_ mappings
224  *    of a page]
225  */
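/*
 * Editorial sketch (not compiled): a page-based operation such as
 * pmap_page_protect() conceptually visits every <PMAP,VA> pair of a
 * pv-tracked page through the pv_pte_first()/pv_pte_next() iterators
 * defined further below, roughly like this:
 *
 *	struct pv_pte *pvpte;
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		// pvpte->pte_ptp and pvpte->pte_va identify one mapping
 *	}
 *	mutex_spin_exit(&pp->pp_lock);
 *
 * The real code is more involved (it has to drop pp_lock to take pmap
 * locks); this only shows how the per-page list ties a physical page to
 * all of its mappings.
 */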
226 
227 /*
228  * Locking
229  *
230  * We have the following locks that we must deal with, listed in the order
231  * that they are acquired:
232  *
233  * pg->uobject->vmobjlock, pg->uanon->an_lock
234  *
235  * 	For managed pages, these per-object locks are taken by the VM system
236  *	before calling into the pmap module - either a read or write hold.
237  *	The lock hold prevents pages from changing identity while the pmap is
238  *	operating on them.  For example, the same lock is held across a call
239  *	to pmap_remove() and the following call to pmap_update(), so that a
240  *	page does not gain a new identity while its TLB visibility is stale.
241  *
242  * pmap->pm_lock
243  *
244  *	This lock protects the fields in the pmap structure including the
245  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
246  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
247  *	kernel PDEs are never freed, and the kernel is expected to be self
248  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
249  *	because they can be modified from interrupt context).
250  *
251  * pmaps_lock
252  *
253  *	This lock protects the list of active pmaps (headed by "pmaps").
254  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
255  *
256  * pp_lock
257  *
258  *	This per-page lock protects PV entry lists and the embedded PV entry
259  *	in each vm_page, allowing for concurrent operation on pages by
260  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
261  *	points it is taken context switching is usually not tolerable, and
262  *	spin mutexes must block out interrupts that could take kernel_lock.
263  */
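/*
 * Editorial sketch of the resulting acquisition order (not compiled); a
 * typical managed-page operation nests the locks listed above like so:
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);	// taken by UVM
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_spin_enter(&pp->pp_lock);
 *	...
 *	mutex_spin_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 *
 * pmaps_lock falls between pm_lock and pp_lock in the documented order and
 * is only needed when the set of pmaps or the kernel PDEs change.
 */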
264 
265 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
266 #ifdef DIAGNOSTIC
267 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
268 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
269 #else
270 #define	PMAP_DUMMY_LOCK(pm)
271 #define	PMAP_DUMMY_UNLOCK(pm)
272 #endif
273 
274 static const struct uvm_pagerops pmap_pager = {
275 	/* nothing */
276 };
277 
278 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
279 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
280 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
281 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
282 const long nbpd[] = NBPD_INITIALIZER;
283 #ifdef i386
284 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
285 #else
286 pd_entry_t *normal_pdes[3];
287 #endif
288 
289 long nkptp[] = NKPTP_INITIALIZER;
290 
291 struct pmap_head pmaps;
292 kmutex_t pmaps_lock __cacheline_aligned;
293 
294 struct pcpu_area *pcpuarea __read_mostly;
295 
296 static vaddr_t pmap_maxkvaddr;
297 
298 /*
299  * Misc. event counters.
300  */
301 struct evcnt pmap_iobmp_evcnt;
302 struct evcnt pmap_ldt_evcnt;
303 
304 /*
305  * PAT
306  */
307 static bool cpu_pat_enabled __read_mostly = false;
308 
309 /*
310  * Global data structures
311  */
312 
313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
315 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
316 
317 struct bootspace bootspace __read_mostly;
318 struct slotspace slotspace __read_mostly;
319 
320 /* Set to PTE_NX if supported. */
321 pd_entry_t pmap_pg_nx __read_mostly = 0;
322 
323 /* Set to PTE_G if supported. */
324 pd_entry_t pmap_pg_g __read_mostly = 0;
325 
326 /* Set to true if large pages are supported. */
327 int pmap_largepages __read_mostly = 0;
328 
329 paddr_t lowmem_rsvd __read_mostly;
330 paddr_t avail_start __read_mostly; /* PA of first available physical page */
331 paddr_t avail_end __read_mostly; /* PA of last available physical page */
332 
333 #ifdef XENPV
334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
335 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
336 #endif
337 
338 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
339 #define	PMAP_CHECK_PP(pp) \
340     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
341 
342 #define PAGE_ALIGNED(pp)	\
343 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
344 
345 /*
346  * Other data structures
347  */
348 
349 static pt_entry_t protection_codes[8] __read_mostly;
350 
351 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
352 
353 /*
354  * The following two vaddr_t's are used during system startup to keep track of
355  * how much of the kernel's VM space we have used. Once the system is started,
356  * the management of the remaining kernel VM space is turned over to the
357  * kernel_map vm_map.
358  */
359 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
360 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
361 
362 #ifndef XENPV
363 /*
364  * LAPIC virtual address, and fake physical address.
365  */
366 volatile vaddr_t local_apic_va __read_mostly;
367 paddr_t local_apic_pa __read_mostly;
368 #endif
369 
370 /*
371  * pool that pmap structures are allocated from
372  */
373 struct pool_cache pmap_cache;
374 static int  pmap_ctor(void *, void *, int);
375 static void pmap_dtor(void *, void *);
376 
377 /*
378  * pv_page cache
379  */
380 static struct pool_cache pmap_pvp_cache;
381 
382 #ifdef __HAVE_DIRECT_MAP
383 vaddr_t pmap_direct_base __read_mostly;
384 vaddr_t pmap_direct_end __read_mostly;
385 #endif
386 
387 #ifndef __HAVE_DIRECT_MAP
388 /*
389  * Special VAs and the PTEs that map them
390  */
391 static pt_entry_t *early_zero_pte;
392 static void pmap_vpage_cpualloc(struct cpu_info *);
393 #ifdef XENPV
394 char *early_zerop; /* also referenced from xen_locore() */
395 #else
396 static char *early_zerop;
397 #endif
398 #endif
399 
400 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
401 
402 /* PDP pool and its callbacks */
403 static struct pool pmap_pdp_pool;
404 static void pmap_pdp_init(pd_entry_t *);
405 static void pmap_pdp_fini(pd_entry_t *);
406 
407 #ifdef PAE
408 /* need to allocate items of 4 pages */
409 static void *pmap_pdp_alloc(struct pool *, int);
410 static void pmap_pdp_free(struct pool *, void *);
411 static struct pool_allocator pmap_pdp_allocator = {
412 	.pa_alloc = pmap_pdp_alloc,
413 	.pa_free = pmap_pdp_free,
414 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
415 };
416 #endif
417 
418 extern vaddr_t idt_vaddr;
419 extern paddr_t idt_paddr;
420 extern vaddr_t gdt_vaddr;
421 extern paddr_t gdt_paddr;
422 extern vaddr_t ldt_vaddr;
423 extern paddr_t ldt_paddr;
424 
425 #ifdef i386
426 /* stuff to fix the pentium f00f bug */
427 extern vaddr_t pentium_idt_vaddr;
428 #endif
429 
430 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
431 struct pmap_ptparray {
432 	struct vm_page *pg[PTP_LEVELS + 1];
433 	bool alloced[PTP_LEVELS + 1];
434 };
435 
436 /*
437  * PV entries are allocated in page-sized chunks and cached per-pmap to
438  * avoid intense pressure on memory allocators.
439  */
440 
441 struct pv_page {
442 	LIST_HEAD(, pv_entry)	pvp_pves;
443 	LIST_ENTRY(pv_page)	pvp_list;
444 	long			pvp_nfree;
445 	struct pmap		*pvp_pmap;
446 };
447 
448 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
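/*
 * Worked example (editorial): with 4 KiB pages and a hypothetical
 * sizeof(struct pv_entry) of 64 bytes, PAGE_SIZE / sizeof(struct pv_entry)
 * is 64, so PVE_PER_PVP would be 63; the slot given up presumably pays for
 * the struct pv_page header at the start of the chunk.  The actual entry
 * size depends on the architecture and kernel options.
 */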
449 
450 /*
451  * PV tree prototypes
452  */
453 
454 static int	pmap_compare_key(void *, const void *, const void *);
455 static int	pmap_compare_nodes(void *, const void *, const void *);
456 
457 /* Red-black tree */
458 static const rb_tree_ops_t pmap_rbtree_ops = {
459 	.rbto_compare_nodes = pmap_compare_nodes,
460 	.rbto_compare_key = pmap_compare_key,
461 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
462 	.rbto_context = NULL
463 };
464 
465 /*
466  * Local prototypes
467  */
468 
469 #ifdef __HAVE_PCPU_AREA
470 static void pmap_init_pcpu(void);
471 #endif
472 #ifdef __HAVE_DIRECT_MAP
473 static void pmap_init_directmap(struct pmap *);
474 #endif
475 #if !defined(XENPV)
476 static void pmap_remap_global(void);
477 #endif
478 #ifndef XENPV
479 static void pmap_init_lapic(void);
480 static void pmap_remap_largepages(void);
481 #endif
482 
483 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
484     struct vm_page **);
485 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
486 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
487     pd_entry_t * const *);
488 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
489 static void pmap_freepage(struct pmap *, struct vm_page *, int);
490 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
491     pt_entry_t *, pd_entry_t * const *);
492 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
493     vaddr_t);
494 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
495     vaddr_t);
496 static int pmap_pvp_ctor(void *, void *, int);
497 static void pmap_pvp_dtor(void *, void *);
498 static struct pv_entry *pmap_alloc_pv(struct pmap *);
499 static void pmap_free_pv(struct pmap *, struct pv_entry *);
500 static void pmap_drain_pv(struct pmap *);
501 
502 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
503 
504 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
505 static void pmap_reactivate(struct pmap *);
506 
507 /*
508  * p m a p   h e l p e r   f u n c t i o n s
509  */
510 
511 static inline void
512 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
513 {
514 
515 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
516 	pmap->pm_stats.resident_count += resid_diff;
517 	pmap->pm_stats.wired_count += wired_diff;
518 }
519 
520 static inline void
521 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
522 {
523 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
524 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
525 
526 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
527 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
528 
529 	pmap_stats_update(pmap, resid_diff, wired_diff);
530 }
531 
532 /*
533  * ptp_to_pmap: lookup pmap by ptp
534  */
535 static inline struct pmap *
536 ptp_to_pmap(struct vm_page *ptp)
537 {
538 	struct pmap *pmap;
539 
540 	if (ptp == NULL) {
541 		return pmap_kernel();
542 	}
543 	pmap = (struct pmap *)ptp->uobject;
544 	KASSERT(pmap != NULL);
545 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
546 	return pmap;
547 }
548 
549 static inline struct pv_pte *
550 pve_to_pvpte(struct pv_entry *pve)
551 {
552 
553 	if (pve == NULL)
554 		return NULL;
555 	KASSERT((void *)&pve->pve_pte == (void *)pve);
556 	return &pve->pve_pte;
557 }
558 
559 static inline struct pv_entry *
560 pvpte_to_pve(struct pv_pte *pvpte)
561 {
562 	struct pv_entry *pve = (void *)pvpte;
563 
564 	KASSERT(pve_to_pvpte(pve) == pvpte);
565 	return pve;
566 }
567 
568 /*
569  * Return true if the pmap page has an embedded PV entry.
570  */
571 static inline bool
572 pv_pte_embedded(struct pmap_page *pp)
573 {
574 
575 	KASSERT(mutex_owned(&pp->pp_lock));
576 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
577 }
578 
579 /*
580  * pv_pte_first, pv_pte_next: PV list iterator.
581  */
582 static inline struct pv_pte *
583 pv_pte_first(struct pmap_page *pp)
584 {
585 
586 	KASSERT(mutex_owned(&pp->pp_lock));
587 	if (pv_pte_embedded(pp)) {
588 		return &pp->pp_pte;
589 	}
590 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
591 }
592 
593 static inline struct pv_pte *
594 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
595 {
596 
597 	KASSERT(mutex_owned(&pp->pp_lock));
598 	KASSERT(pvpte != NULL);
599 	if (pvpte == &pp->pp_pte) {
600 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
601 	}
602 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
603 }
604 
605 static inline uint8_t
606 pmap_pte_to_pp_attrs(pt_entry_t pte)
607 {
608 	uint8_t ret = 0;
609 	if (pte & PTE_D)
610 		ret |= PP_ATTRS_D;
611 	if (pte & PTE_A)
612 		ret |= PP_ATTRS_A;
613 	if (pte & PTE_W)
614 		ret |= PP_ATTRS_W;
615 	return ret;
616 }
617 
618 static inline pt_entry_t
619 pmap_pp_attrs_to_pte(uint8_t attrs)
620 {
621 	pt_entry_t pte = 0;
622 	if (attrs & PP_ATTRS_D)
623 		pte |= PTE_D;
624 	if (attrs & PP_ATTRS_A)
625 		pte |= PTE_A;
626 	if (attrs & PP_ATTRS_W)
627 		pte |= PTE_W;
628 	return pte;
629 }
630 
631 /*
632  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
633  * of course the kernel is always loaded
634  */
635 bool
636 pmap_is_curpmap(struct pmap *pmap)
637 {
638 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
639 }
640 
641 inline void
642 pmap_reference(struct pmap *pmap)
643 {
644 
645 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
646 }
647 
648 /*
649  * rbtree: compare two nodes.
650  */
651 static int
652 pmap_compare_nodes(void *context, const void *n1, const void *n2)
653 {
654 	const struct pv_entry *pve1 = n1;
655 	const struct pv_entry *pve2 = n2;
656 
657 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
658 
659 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
660 		return -1;
661 	}
662 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
663 		return 1;
664 	}
665 	return 0;
666 }
667 
668 /*
669  * rbtree: compare a node and a key.
670  */
671 static int
672 pmap_compare_key(void *context, const void *n, const void *k)
673 {
674 	const struct pv_entry *pve = n;
675 	const vaddr_t key = (vaddr_t)k;
676 
677 	if (pve->pve_pte.pte_va < key) {
678 		return -1;
679 	}
680 	if (pve->pve_pte.pte_va > key) {
681 		return 1;
682 	}
683 	return 0;
684 }
685 
686 /*
687  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
688  */
689 static inline void
690 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
691 {
692 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
693 
694 	if (va < *min) {
695 		*min = va;
696 	}
697 }
698 
699 /*
700  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
701  */
702 static inline void
703 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
704 {
705 	vaddr_t sclip;
706 
707 	if (ptp == NULL) {
708 		return;
709 	}
710 
711 	sclip = (vaddr_t)ptp->uanon;
712 	sclip = (*startva < sclip ? sclip : *startva);
713 	*pte += (sclip - *startva) / PAGE_SIZE;
714 	*startva = sclip;
715 }
716 
717 /*
718  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
719  *
720  * there are several pmaps involved.  some or all of them might be the same.
721  *
722  *	- the pmap given by the first argument
723  *		our caller wants to access this pmap's PTEs.
724  *
725  *	- pmap_kernel()
726  *		the kernel pmap.  note that it only contains the kernel part
727  *		of the address space which is shared by any pmap.  ie. any
728  *		pmap can be used instead of pmap_kernel() for our purpose.
729  *
730  *	- ci->ci_pmap
731  *		pmap currently loaded on the cpu.
732  *
733  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
734  *		current process' pmap.
735  *
736  * => caller must lock pmap first (if not the kernel pmap)
737  * => must be undone with pmap_unmap_ptes before returning
738  * => disables kernel preemption
739  */
740 void
741 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
742     pd_entry_t * const **pdeppp)
743 {
744 	struct pmap *curpmap;
745 	struct cpu_info *ci;
746 	lwp_t *l;
747 
748 	kpreempt_disable();
749 
750 	/* The kernel's pmap is always accessible. */
751 	if (pmap == pmap_kernel()) {
752 		*pmap2 = NULL;
753 		*ptepp = PTE_BASE;
754 		*pdeppp = normal_pdes;
755 		return;
756 	}
757 
758 	KASSERT(mutex_owned(&pmap->pm_lock));
759 
760 	l = curlwp;
761 	ci = l->l_cpu;
762 	curpmap = ci->ci_pmap;
763 	if (pmap == curpmap) {
764 		/*
765 		 * Already on the CPU: make it valid.  This is very
766 		 * often the case during exit(), when we have switched
767 		 * to the kernel pmap in order to destroy a user pmap.
768 		 */
769 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
770 			pmap_reactivate(pmap);
771 		}
772 		*pmap2 = NULL;
773 	} else {
774 		/*
775 		 * Toss current pmap from CPU and install new pmap, but keep
776 		 * a reference to the old one.  Dropping the reference can
777 		 * block as it needs to take locks, so defer that to
778 		 * pmap_unmap_ptes().
779 		 */
780 		pmap_reference(pmap);
781 		pmap_load1(l, pmap, curpmap);
782 		*pmap2 = curpmap;
783 	}
784 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
785 #ifdef DIAGNOSTIC
786 	pmap->pm_ncsw = lwp_pctr();
787 #endif
788 	*ptepp = PTE_BASE;
789 
790 #if defined(XENPV) && defined(__x86_64__)
791 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
792 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
793 	*pdeppp = ci->ci_normal_pdes;
794 #else
795 	*pdeppp = normal_pdes;
796 #endif
797 }
798 
799 /*
800  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
801  *
802  * => we cannot tolerate context switches while mapped in: assert this.
803  * => reenables kernel preemption.
804  * => does not unlock pmap.
805  */
806 void
807 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
808 {
809 	struct cpu_info *ci;
810 	struct pmap *mypmap;
811 	struct lwp *l;
812 
813 	KASSERT(kpreempt_disabled());
814 
815 	/* The kernel's pmap is always accessible. */
816 	if (pmap == pmap_kernel()) {
817 		kpreempt_enable();
818 		return;
819 	}
820 
821 	l = curlwp;
822 	ci = l->l_cpu;
823 
824 	KASSERT(mutex_owned(&pmap->pm_lock));
825 	KASSERT(pmap->pm_ncsw == lwp_pctr());
826 
827 #if defined(XENPV) && defined(__x86_64__)
828 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
829 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
830 #endif
831 
832 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
833 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
834 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
835 	if (ci->ci_pmap == mypmap) {
836 		ci->ci_want_pmapload = 0;
837 	} else {
838 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
839 		ci->ci_tlbstate = TLBSTATE_LAZY;
840 	}
841 
842 	/* Now safe to re-enable preemption. */
843 	kpreempt_enable();
844 
845 	/* Toss reference to other pmap taken earlier. */
846 	if (pmap2 != NULL) {
847 		pmap_destroy(pmap2);
848 	}
849 }
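/*
 * Editorial usage sketch (not compiled): pmap_map_ptes() and
 * pmap_unmap_ptes() bracket direct PTE access, with the pmap lock held
 * across the whole sequence:
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... inspect or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 *
 * Kernel preemption stays disabled between the two calls, and
 * pmap_unmap_ptes() does not drop the pmap lock itself.
 */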
850 
851 inline static void
852 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
853 {
854 
855 #if !defined(__x86_64__)
856 	if (curproc == NULL || curproc->p_vmspace == NULL ||
857 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
858 		return;
859 
860 	if ((opte ^ npte) & PTE_X)
861 		pmap_update_pg(va);
862 
863 	/*
864 	 * Executability was removed on the last executable change.
865 	 * Reset the code segment to something conservative and let the trap
866 	 * handler deal with setting the right limit; we can't compute the
867 	 * right limit here because of locking constraints on the vm map.
868 	 */
869 
870 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
871 		struct trapframe *tf = curlwp->l_md.md_regs;
872 
873 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
874 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
875 	}
876 #endif /* !defined(__x86_64__) */
877 }
878 
879 #if !defined(__x86_64__)
880 /*
881  * Fixup the code segment to cover all potential executable mappings.
882  * returns 0 if no changes to the code segment were made.
883  */
884 int
885 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
886 {
887 	struct vm_map_entry *ent;
888 	struct pmap *pm = vm_map_pmap(map);
889 	vaddr_t va = 0;
890 
891 	vm_map_lock_read(map);
892 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
893 		/*
894 		 * This entry has greater va than the entries before.
895 		 * We need to make it point to the last page, not past it.
896 		 */
897 		if (ent->protection & VM_PROT_EXECUTE)
898 			va = trunc_page(ent->end) - PAGE_SIZE;
899 	}
900 	vm_map_unlock_read(map);
901 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
902 		return 0;
903 
904 	pm->pm_hiexec = va;
905 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
906 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
907 	} else {
908 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
909 		return 0;
910 	}
911 	return 1;
912 }
913 #endif /* !defined(__x86_64__) */
914 
915 void
916 pat_init(struct cpu_info *ci)
917 {
918 	uint64_t pat;
919 
920 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
921 		return;
922 
923 	/* We change WT to WC. Leave all other entries at their default values. */
924 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
925 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
926 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
927 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
928 
929 	wrmsr(MSR_CR_PAT, pat);
930 	cpu_pat_enabled = true;
931 }
932 
933 static pt_entry_t
934 pmap_pat_flags(u_int flags)
935 {
936 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
937 
938 	if (!cpu_pat_enabled) {
939 		switch (cacheflags) {
940 		case PMAP_NOCACHE:
941 		case PMAP_NOCACHE_OVR:
942 			/* results in PGC_UCMINUS on cpus which have
943 			 * the cpuid PAT but PAT "disabled"
944 			 */
945 			return PTE_PCD;
946 		default:
947 			return 0;
948 		}
949 	}
950 
951 	switch (cacheflags) {
952 	case PMAP_NOCACHE:
953 		return PGC_UC;
954 	case PMAP_WRITE_COMBINE:
955 		return PGC_WC;
956 	case PMAP_WRITE_BACK:
957 		return PGC_WB;
958 	case PMAP_NOCACHE_OVR:
959 		return PGC_UCMINUS;
960 	}
961 
962 	return 0;
963 }
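/*
 * Editorial example (not compiled): a caller wanting a write-combining
 * kernel mapping, e.g. for a framebuffer, passes one of the PMAP_* cache
 * flags to pmap_kenter_pa(); pmap_pat_flags() then translates it into the
 * PAT/PCD bits placed in the PTE (va and fb_pa are hypothetical here):
 *
 *	pmap_kenter_pa(va, fb_pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *
 * If PAT support was not enabled by pat_init(), the write-combine request
 * silently falls back to the default (write-back) caching mode.
 */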
964 
965 /*
966  * p m a p   k e n t e r   f u n c t i o n s
967  *
968  * functions to quickly enter/remove pages from the kernel address
969  * space.   pmap_kremove is exported to MI kernel.  we make use of
970  * the recursive PTE mappings.
971  */
972 
973 /*
974  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
975  *
976  * => no need to lock anything, assume va is already allocated
977  * => should be faster than normal pmap enter function
978  */
979 void
980 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
981 {
982 	pt_entry_t *pte, opte, npte;
983 
984 	KASSERT(!(prot & ~VM_PROT_ALL));
985 
986 	if (va < VM_MIN_KERNEL_ADDRESS)
987 		pte = vtopte(va);
988 	else
989 		pte = kvtopte(va);
990 #if defined(XENPV) && defined(DOM0OPS)
991 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
992 #ifdef DEBUG
993 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
994 		    " outside range\n", __func__, pa, va);
995 #endif /* DEBUG */
996 		npte = pa;
997 	} else
998 #endif /* XENPV && DOM0OPS */
999 		npte = pmap_pa2pte(pa);
1000 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1001 	npte |= pmap_pat_flags(flags);
1002 	opte = pmap_pte_testset(pte, npte); /* zap! */
1003 
1004 	/*
1005 	 * XXX: make sure we are not dealing with a large page, since the only
1006 	 * large pages created are for the kernel image, and they should never
1007 	 * be kentered.
1008 	 */
1009 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1010 
1011 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1012 		/* This should not happen. */
1013 		printf_nolog("%s: mapping already present\n", __func__);
1014 		kpreempt_disable();
1015 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1016 		kpreempt_enable();
1017 	}
1018 }
1019 
1020 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1021 
1022 #if defined(__x86_64__)
1023 /*
1024  * Change protection for a virtual address. Local for a CPU only, don't
1025  * care about TLB shootdowns.
1026  *
1027  * => must be called with preemption disabled
1028  */
1029 void
1030 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1031 {
1032 	pt_entry_t *pte, opte, npte;
1033 
1034 	KASSERT(kpreempt_disabled());
1035 
1036 	if (va < VM_MIN_KERNEL_ADDRESS)
1037 		pte = vtopte(va);
1038 	else
1039 		pte = kvtopte(va);
1040 
1041 	npte = opte = *pte;
1042 
1043 	if ((prot & VM_PROT_WRITE) != 0)
1044 		npte |= PTE_W;
1045 	else
1046 		npte &= ~(PTE_W|PTE_D);
1047 
1048 	if (opte != npte) {
1049 		pmap_pte_set(pte, npte);
1050 		pmap_pte_flush();
1051 		invlpg(va);
1052 	}
1053 }
1054 #endif /* defined(__x86_64__) */
1055 
1056 /*
1057  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1058  *
1059  * => no need to lock anything
1060  * => caller must dispose of any vm_page mapped in the va range
1061  * => note: not an inline function
1062  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1063  * => we assume kernel only unmaps valid addresses and thus don't bother
1064  *    checking the valid bit before doing TLB flushing
1065  * => must be followed by call to pmap_update() before reuse of page
1066  */
1067 static void
1068 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1069 {
1070 	pt_entry_t *pte, opte;
1071 	vaddr_t va, eva;
1072 
1073 	eva = sva + len;
1074 
1075 	kpreempt_disable();
1076 	for (va = sva; va < eva; va += PAGE_SIZE) {
1077 		pte = kvtopte(va);
1078 		opte = pmap_pte_testset(pte, 0); /* zap! */
1079 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1080 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1081 			    TLBSHOOT_KREMOVE);
1082 		}
1083 		KASSERTMSG((opte & PTE_PS) == 0,
1084 		    "va %#" PRIxVADDR " is a large page", va);
1085 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1086 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1087 	}
1088 	if (localonly) {
1089 		tlbflushg();
1090 	}
1091 	kpreempt_enable();
1092 }
1093 
1094 void
1095 pmap_kremove(vaddr_t sva, vsize_t len)
1096 {
1097 
1098 	pmap_kremove1(sva, len, false);
1099 }
1100 
1101 /*
1102  * pmap_kremove_local: like pmap_kremove(), but only worry about
1103  * TLB invalidations on the current CPU.  this is only intended
1104  * for use while writing kernel crash dumps, either after panic
1105  * or via reboot -d.
1106  */
1107 void
1108 pmap_kremove_local(vaddr_t sva, vsize_t len)
1109 {
1110 
1111 	pmap_kremove1(sva, len, true);
1112 }
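/*
 * Editorial usage sketch (not compiled): the kenter/kremove pair is used
 * for unmanaged kernel mappings, with pmap_update() required before the
 * unmapped VA or the underlying page is reused:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * No pv_entry tracking is done for these mappings, so the caller must
 * dispose of any vm_page mapped in the range itself.
 */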
1113 
1114 /*
1115  * p m a p   i n i t   f u n c t i o n s
1116  *
1117  * pmap_bootstrap and pmap_init are called during system startup
1118  * to init the pmap module.   pmap_bootstrap() does a low level
1119  * init just to get things rolling.   pmap_init() finishes the job.
1120  */
1121 
1122 /*
1123  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1124  * This function is to be used before any VM system has been set up.
1125  *
1126  * The va is taken from virtual_avail.
1127  */
1128 static vaddr_t
1129 pmap_bootstrap_valloc(size_t npages)
1130 {
1131 	vaddr_t va = virtual_avail;
1132 	virtual_avail += npages * PAGE_SIZE;
1133 	return va;
1134 }
1135 
1136 /*
1137  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1138  * This function is to be used before any VM system has been set up.
1139  *
1140  * The pa is taken from avail_start.
1141  */
1142 static paddr_t
1143 pmap_bootstrap_palloc(size_t npages)
1144 {
1145 	paddr_t pa = avail_start;
1146 	avail_start += npages * PAGE_SIZE;
1147 	return pa;
1148 }
1149 
1150 /*
1151  * pmap_bootstrap: get the system in a state where it can run with VM properly
1152  * enabled (called before main()). The VM system is fully init'd later.
1153  *
1154  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1155  *    kernel, and nkpde PTP's for the kernel.
1156  * => kva_start is the first free virtual address in kernel space.
1157  */
1158 void
1159 pmap_bootstrap(vaddr_t kva_start)
1160 {
1161 	struct pmap *kpm;
1162 	int i;
1163 	vaddr_t kva;
1164 
1165 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1166 
1167 	/*
1168 	 * Set up our local static global vars that keep track of the usage of
1169 	 * KVM before kernel_map is set up.
1170 	 */
1171 	virtual_avail = kva_start;		/* first free KVA */
1172 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1173 
1174 	/*
1175 	 * Set up protection_codes: we need to be able to convert from a MI
1176 	 * protection code (some combo of VM_PROT...) to something we can jam
1177 	 * into a x86 PTE.
1178 	 */
1179 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1180 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1181 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1182 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1183 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1184 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1185 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1186 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1187 
1188 	/*
1189 	 * Now we init the kernel's pmap.
1190 	 *
1191 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1192 	 * the pm_obj contains the list of active PTPs.
1193 	 */
1194 	kpm = pmap_kernel();
1195 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1196 	rw_init(&kpm->pm_dummy_lock);
1197 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1198 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1199 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1200 		kpm->pm_ptphint[i] = NULL;
1201 	}
1202 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1203 
1204 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1205 	for (i = 0; i < PDP_SIZE; i++)
1206 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1207 
1208 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1209 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1210 
1211 	kcpuset_create(&kpm->pm_cpus, true);
1212 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1213 
1214 	kpm->pm_ldt = NULL;
1215 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1216 
1217 	/*
1218 	 * the above is just a rough estimate and not critical to the proper
1219 	 * operation of the system.
1220 	 */
1221 
1222 #if !defined(XENPV)
1223 	/*
1224 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1225 	 * attribute to already mapped kernel pages. Do that only if SVS is
1226 	 * disabled.
1227 	 *
1228 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1229 	 * happens later in cpu_init().
1230 	 */
1231 #ifdef SVS
1232 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1233 #else
1234 	if (cpu_feature[0] & CPUID_PGE) {
1235 #endif
1236 		pmap_pg_g = PTE_G;
1237 		pmap_remap_global();
1238 	}
1239 #endif
1240 
1241 #ifndef XENPV
1242 	/*
1243 	 * Enable large pages if they are supported.
1244 	 */
1245 	if (cpu_feature[0] & CPUID_PSE) {
1246 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1247 		pmap_largepages = 1;	/* enable software */
1248 
1249 		/*
1250 		 * The TLB must be flushed after enabling large pages on Pentium
1251 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1252 		 * Software Developer's Manual, Volume 3: System Programming".
1253 		 */
1254 		tlbflushg();
1255 
1256 		/* Remap the kernel. */
1257 		pmap_remap_largepages();
1258 	}
1259 	pmap_init_lapic();
1260 #endif /* !XENPV */
1261 
1262 #ifdef __HAVE_PCPU_AREA
1263 	pmap_init_pcpu();
1264 #endif
1265 
1266 #ifdef __HAVE_DIRECT_MAP
1267 	pmap_init_directmap(kpm);
1268 #else
1269 	pmap_vpage_cpualloc(&cpu_info_primary);
1270 
1271 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1272 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1273 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1274 	} else { /* amd64 */
1275 		/*
1276 		 * zero_pte is stuck at the end of mapped space for the kernel
1277 		 * image (disjunct from kva space). This is done so that it
1278 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1279 		 * when it's called for the first time.
1280 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1281 		 */
1282 #ifdef XENPV
1283 		/* early_zerop initialized in xen_locore() */
1284 #else
1285 		early_zerop = (void *)bootspace.spareva;
1286 #endif
1287 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1288 	}
1289 #endif
1290 
1291 #if defined(XENPV) && defined(__x86_64__)
1292 	extern vaddr_t xen_dummy_page;
1293 	paddr_t xen_dummy_user_pgd;
1294 
1295 	/*
1296 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1297 	 * Xen will still consider it active. So we set the user PGD to this
1298 	 * one, to lift all protection on the now-inactive set of page tables.
1299 	 */
1300 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1301 
1302 	/* Zero-fill it; the fewer checks Xen has to make, the better. */
1303 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1304 	/* Mark read-only */
1305 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1306 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1307 	    UVMF_INVLPG);
1308 	/* Pin as L4 */
1309 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1310 #endif
1311 
1312 	/*
1313 	 * Allocate space for the IDT, GDT and LDT.
1314 	 */
1315 	idt_vaddr = pmap_bootstrap_valloc(1);
1316 	idt_paddr = pmap_bootstrap_palloc(1);
1317 
1318 	gdt_vaddr = pmap_bootstrap_valloc(1);
1319 	gdt_paddr = pmap_bootstrap_palloc(1);
1320 
1321 #ifdef __HAVE_PCPU_AREA
1322 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1323 #else
1324 	ldt_vaddr = pmap_bootstrap_valloc(1);
1325 #endif
1326 	ldt_paddr = pmap_bootstrap_palloc(1);
1327 
1328 #if !defined(__x86_64__)
1329 	/* pentium f00f bug stuff */
1330 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1331 #endif
1332 
1333 #if defined(XENPVHVM)
1334 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1335 	extern paddr_t HYPERVISOR_shared_info_pa;
1336 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1337 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1338 
1339 	if (vm_guest != VM_GUEST_XENPVH) {
1340 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1341 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1342 	}
1343 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1344 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1345 #endif
1346 	/*
1347 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1348 	 */
1349 	virtual_avail = reserve_dumppages(virtual_avail);
1350 
1351 	/*
1352 	 * Init the global lock and global list.
1353 	 */
1354 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1355 	LIST_INIT(&pmaps);
1356 
1357 	/*
1358 	 * Ensure the TLB is sync'd with reality by flushing it...
1359 	 */
1360 	tlbflushg();
1361 
1362 	/*
1363 	 * Calculate pmap_maxkvaddr from nkptp[].
1364 	 */
1365 	kva = VM_MIN_KERNEL_ADDRESS;
1366 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1367 		kva += nkptp[i] * nbpd[i];
1368 	}
1369 	pmap_maxkvaddr = kva;
1370 }
1371 
1372 #ifndef XENPV
1373 static void
1374 pmap_init_lapic(void)
1375 {
1376 	/*
1377 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1378 	 * x86 implementation relies a lot on this address to be valid; so just
1379 	 * allocate a fake physical page that will be kentered into
1380 	 * local_apic_va by machdep.
1381 	 *
1382 	 * If the LAPIC is present, the va will be remapped somewhere else
1383 	 * later in lapic_map.
1384 	 */
1385 	local_apic_va = pmap_bootstrap_valloc(1);
1386 	local_apic_pa = pmap_bootstrap_palloc(1);
1387 }
1388 #endif
1389 
1390 #ifdef __x86_64__
1391 static size_t
1392 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1393 {
1394 	size_t npages;
1395 	npages = (roundup(endva, pgsz) / pgsz) -
1396 	    (rounddown(startva, pgsz) / pgsz);
1397 	return npages;
1398 }
1399 #endif
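/*
 * Worked example (editorial): pmap_pagetree_nentries_range() counts how
 * many page-tree entries of size pgsz a VA range touches.  With pgsz =
 * 0x1000, startva = 0x1800 and endva = 0x3800:
 *
 *	roundup(0x3800, 0x1000) / 0x1000   = 4
 *	rounddown(0x1800, 0x1000) / 0x1000 = 1
 *
 * so the range spans 3 entries.  The callers below use it with NBPD_L4,
 * NBPD_L3 and NBPD_L2 to size the L4/L3/L2 portions of a mapping.
 */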
1400 
1401 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1402 static inline void
1403 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1404 {
1405 	size_t sslot = slotspace.area[type].sslot;
1406 	size_t nslot = slotspace.area[type].nslot;
1407 
1408 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1409 }
1410 #endif
1411 
1412 #ifdef __x86_64__
1413 /*
1414  * Randomize the location of an area. We count the holes in the VM space. We
1415  * randomly select one hole, and then randomly select an area within that hole.
1416  * Finally we update the associated entry in the slotspace structure.
1417  */
1418 vaddr_t
1419 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1420     vaddr_t randva)
1421 {
1422 	struct {
1423 		int start;
1424 		int end;
1425 	} holes[SLSPACE_NAREAS+1];
1426 	size_t i, nholes, hole;
1427 	size_t startsl, endsl, nslots, winsize;
1428 	vaddr_t startva, va;
1429 
1430 	sz = roundup(sz, align);
1431 
1432 	/*
1433 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1434 	 * an area that crosses slots:
1435 	 *     +------+------+------+
1436 	 *     | Slot | Slot | Slot |
1437 	 *     +------+------+------+
1438 	 *        [Chosen Area]
1439 	 * And in that case we must take into account the additional slot
1440 	 * consumed.
1441 	 */
1442 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1443 
1444 	/* Get the holes. */
1445 	nholes = 0;
1446 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1447 	while (1) {
1448 		/*
1449 		 * Find the first occupied slot after the current one.
1450 		 * The area between the two is a hole.
1451 		 */
1452 		size_t minsslot = 512;
1453 		size_t minnslot = 0;
1454 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1455 			if (!slotspace.area[i].active)
1456 				continue;
1457 			if (slotspace.area[i].sslot >= curslot &&
1458 			    slotspace.area[i].sslot < minsslot) {
1459 				minsslot = slotspace.area[i].sslot;
1460 				minnslot = slotspace.area[i].nslot;
1461 			}
1462 		}
1463 
1464 		/* No hole anymore, stop here. */
1465 		if (minsslot == 512) {
1466 			break;
1467 		}
1468 
1469 		/* Register the hole. */
1470 		if (minsslot - curslot >= nslots) {
1471 			holes[nholes].start = curslot;
1472 			holes[nholes].end = minsslot;
1473 			nholes++;
1474 		}
1475 
1476 		/* Skip that hole, and iterate again. */
1477 		curslot = minsslot + minnslot;
1478 	}
1479 
1480 	if (nholes == 0) {
1481 		panic("%s: impossible", __func__);
1482 	}
1483 
1484 	/* Select a hole. */
1485 	hole = randhole;
1486 #ifdef NO_X86_ASLR
1487 	hole = 0;
1488 #endif
1489 	hole %= nholes;
1490 	startsl = holes[hole].start;
1491 	endsl = holes[hole].end;
1492 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1493 
1494 	/* Select an area within the hole. */
1495 	va = randva;
1496 #ifdef NO_X86_ASLR
1497 	va = 0;
1498 #endif
1499 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1500 	va %= winsize;
1501 	va = rounddown(va, align);
1502 	va += startva;
1503 
1504 	/* Update the entry. */
1505 	slotspace.area[type].sslot = pl4_i(va);
1506 	slotspace.area[type].nslot =
1507 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1508 	slotspace.area[type].active = true;
1509 
1510 	return va;
1511 }
1512 #endif
1513 
1514 #ifdef __HAVE_PCPU_AREA
1515 static void
1516 pmap_init_pcpu(void)
1517 {
1518 	const vaddr_t startva = PMAP_PCPU_BASE;
1519 	size_t nL4e, nL3e, nL2e, nL1e;
1520 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1521 	paddr_t pa;
1522 	vaddr_t endva;
1523 	vaddr_t tmpva;
1524 	pt_entry_t *pte;
1525 	size_t size;
1526 	int i;
1527 
1528 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1529 
1530 	size = sizeof(struct pcpu_area);
1531 
1532 	endva = startva + size;
1533 
1534 	/* We will use this temporary va. */
1535 	tmpva = bootspace.spareva;
1536 	pte = PTE_BASE + pl1_i(tmpva);
1537 
1538 	/* Build L4 */
1539 	L4e_idx = pl4_i(startva);
1540 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1541 	KASSERT(nL4e  == 1);
1542 	for (i = 0; i < nL4e; i++) {
1543 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1544 
1545 		pa = pmap_bootstrap_palloc(1);
1546 		*pte = (pa & PTE_FRAME) | pteflags;
1547 		pmap_update_pg(tmpva);
1548 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1549 
1550 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1551 	}
1552 
1553 	/* Build L3 */
1554 	L3e_idx = pl3_i(startva);
1555 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1556 	for (i = 0; i < nL3e; i++) {
1557 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1558 
1559 		pa = pmap_bootstrap_palloc(1);
1560 		*pte = (pa & PTE_FRAME) | pteflags;
1561 		pmap_update_pg(tmpva);
1562 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1563 
1564 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1565 	}
1566 
1567 	/* Build L2 */
1568 	L2e_idx = pl2_i(startva);
1569 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1570 	for (i = 0; i < nL2e; i++) {
1571 
1572 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1573 
1574 		pa = pmap_bootstrap_palloc(1);
1575 		*pte = (pa & PTE_FRAME) | pteflags;
1576 		pmap_update_pg(tmpva);
1577 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1578 
1579 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1580 	}
1581 
1582 	/* Build L1 */
1583 	L1e_idx = pl1_i(startva);
1584 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1585 	for (i = 0; i < nL1e; i++) {
1586 		/*
1587 		 * Nothing to do, the PTEs will be entered via
1588 		 * pmap_kenter_pa.
1589 		 */
1590 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1591 	}
1592 
1593 	*pte = 0;
1594 	pmap_update_pg(tmpva);
1595 
1596 	pcpuarea = (struct pcpu_area *)startva;
1597 
1598 	tlbflush();
1599 }
1600 #endif
1601 
1602 #ifdef __HAVE_DIRECT_MAP
1603 /*
1604  * Create the amd64 direct map. Called only once at boot time. We map all of
1605  * the physical memory contiguously using 2MB large pages, with RW permissions.
1606  * However there is a hole: the kernel is mapped with RO permissions.
1607  */
1608 static void
1609 pmap_init_directmap(struct pmap *kpm)
1610 {
1611 	extern phys_ram_seg_t mem_clusters[];
1612 	extern int mem_cluster_cnt;
1613 
1614 	vaddr_t startva;
1615 	size_t nL4e, nL3e, nL2e;
1616 	size_t L4e_idx, L3e_idx, L2e_idx;
1617 	size_t spahole, epahole;
1618 	paddr_t lastpa, pa;
1619 	vaddr_t endva;
1620 	vaddr_t tmpva;
1621 	pt_entry_t *pte;
1622 	phys_ram_seg_t *mc;
1623 	int i;
1624 	size_t randhole;
1625 	vaddr_t randva;
1626 
1627 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1628 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1629 
1630 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1631 
1632 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1633 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1634 
1635 	/* Get the last physical address available */
1636 	lastpa = 0;
1637 	for (i = 0; i < mem_cluster_cnt; i++) {
1638 		mc = &mem_clusters[i];
1639 		lastpa = MAX(lastpa, mc->start + mc->size);
1640 	}
1641 
1642 	/*
1643 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1644 	 */
1645 	if (lastpa > MAXPHYSMEM) {
1646 		panic("pmap_init_directmap: lastpa incorrect");
1647 	}
1648 
1649 	entropy_extract(&randhole, sizeof randhole, 0);
1650 	entropy_extract(&randva, sizeof randva, 0);
1651 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1652 	    randhole, randva);
1653 	endva = startva + lastpa;
1654 
1655 	/* We will use this temporary va. */
1656 	tmpva = bootspace.spareva;
1657 	pte = PTE_BASE + pl1_i(tmpva);
1658 
1659 	/* Build L4 */
1660 	L4e_idx = pl4_i(startva);
1661 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1662 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1663 	for (i = 0; i < nL4e; i++) {
1664 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1665 
1666 		pa = pmap_bootstrap_palloc(1);
1667 		*pte = (pa & PTE_FRAME) | pteflags;
1668 		pmap_update_pg(tmpva);
1669 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1670 
1671 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1672 	}
1673 
1674 	/* Build L3 */
1675 	L3e_idx = pl3_i(startva);
1676 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1677 	for (i = 0; i < nL3e; i++) {
1678 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1679 
1680 		pa = pmap_bootstrap_palloc(1);
1681 		*pte = (pa & PTE_FRAME) | pteflags;
1682 		pmap_update_pg(tmpva);
1683 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1684 
1685 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1686 	}
1687 
1688 	/* Build L2 */
1689 	L2e_idx = pl2_i(startva);
1690 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1691 	for (i = 0; i < nL2e; i++) {
1692 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1693 
1694 		pa = (paddr_t)(i * NBPD_L2);
1695 
1696 		if (spahole <= pa && pa < epahole) {
1697 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1698 			    PTE_PS | pmap_pg_g;
1699 		} else {
1700 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1701 			    PTE_PS | pmap_pg_g;
1702 		}
1703 	}
1704 
1705 	*pte = 0;
1706 	pmap_update_pg(tmpva);
1707 
1708 	pmap_direct_base = startva;
1709 	pmap_direct_end = endva;
1710 
1711 	tlbflush();
1712 }
1713 #endif /* __HAVE_DIRECT_MAP */
1714 
1715 #if !defined(XENPV)
1716 /*
1717  * Remap all of the virtual pages created so far with the PTE_G bit.
1718  */
1719 static void
1720 pmap_remap_global(void)
1721 {
1722 	vaddr_t kva, kva_end;
1723 	unsigned long p1i;
1724 	size_t i;
1725 
1726 	/* head */
1727 	kva = bootspace.head.va;
1728 	kva_end = kva + bootspace.head.sz;
1729 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1730 		p1i = pl1_i(kva);
1731 		if (pmap_valid_entry(PTE_BASE[p1i]))
1732 			PTE_BASE[p1i] |= pmap_pg_g;
1733 	}
1734 
1735 	/* kernel segments */
1736 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1737 		if (bootspace.segs[i].type == BTSEG_NONE) {
1738 			continue;
1739 		}
1740 		kva = bootspace.segs[i].va;
1741 		kva_end = kva + bootspace.segs[i].sz;
1742 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1743 			p1i = pl1_i(kva);
1744 			if (pmap_valid_entry(PTE_BASE[p1i]))
1745 				PTE_BASE[p1i] |= pmap_pg_g;
1746 		}
1747 	}
1748 
1749 	/* boot space */
1750 	kva = bootspace.boot.va;
1751 	kva_end = kva + bootspace.boot.sz;
1752 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1753 		p1i = pl1_i(kva);
1754 		if (pmap_valid_entry(PTE_BASE[p1i]))
1755 			PTE_BASE[p1i] |= pmap_pg_g;
1756 	}
1757 }
1758 #endif
1759 
1760 #ifndef XENPV
1761 /*
1762  * Remap several kernel segments with large pages. We cover as many pages as we
1763  * can. Called only once at boot time, if the CPU supports large pages.
1764  */
1765 static void
1766 pmap_remap_largepages(void)
1767 {
1768 	pd_entry_t *pde;
1769 	vaddr_t kva, kva_end;
1770 	paddr_t pa;
1771 	size_t i;
1772 
1773 	/* Remap the kernel text using large pages. */
1774 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1775 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1776 			continue;
1777 		}
1778 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1779 		if (kva < bootspace.segs[i].va) {
1780 			continue;
1781 		}
1782 		kva_end = rounddown(bootspace.segs[i].va +
1783 			bootspace.segs[i].sz, NBPD_L2);
1784 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1785 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1786 			pde = &L2_BASE[pl2_i(kva)];
1787 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1788 			tlbflushg();
1789 		}
1790 	}
1791 
1792 	/* Remap the kernel rodata using large pages. */
1793 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1794 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1795 			continue;
1796 		}
1797 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1798 		if (kva < bootspace.segs[i].va) {
1799 			continue;
1800 		}
1801 		kva_end = rounddown(bootspace.segs[i].va +
1802 			bootspace.segs[i].sz, NBPD_L2);
1803 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1804 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1805 			pde = &L2_BASE[pl2_i(kva)];
1806 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1807 			tlbflushg();
1808 		}
1809 	}
1810 
1811 	/* Remap the kernel data+bss using large pages. */
1812 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1813 		if (bootspace.segs[i].type != BTSEG_DATA) {
1814 			continue;
1815 		}
1816 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1817 		if (kva < bootspace.segs[i].va) {
1818 			continue;
1819 		}
1820 		kva_end = rounddown(bootspace.segs[i].va +
1821 			bootspace.segs[i].sz, NBPD_L2);
1822 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1823 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1824 			pde = &L2_BASE[pl2_i(kva)];
1825 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1826 			tlbflushg();
1827 		}
1828 	}
1829 }
1830 #endif /* !XENPV */
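
/*
 * The remapping above only uses NBPD_L2 entries for the portion of each
 * segment whose start and end fall on large-page boundaries; roundup()
 * and rounddown() trim the unaligned head and tail.  A minimal userland
 * sketch of just that boundary computation, kept out of the build, with
 * an illustrative 2MB constant and made-up segment values standing in
 * for NBPD_L2 and bootspace.segs[]:
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define LARGE_PAGE	(2UL * 1024 * 1024)	/* illustrative stand-in for NBPD_L2 */

static uintptr_t lp_roundup(uintptr_t x)   { return (x + LARGE_PAGE - 1) & ~(LARGE_PAGE - 1); }
static uintptr_t lp_rounddown(uintptr_t x) { return x & ~(LARGE_PAGE - 1); }

int
main(void)
{
	uintptr_t seg_va = 0x40201000UL;	/* hypothetical segment start */
	uintptr_t seg_sz = 0x00800000UL;	/* hypothetical segment size (8MB) */

	uintptr_t kva = lp_roundup(seg_va);		   /* first boundary inside the segment */
	uintptr_t kva_end = lp_rounddown(seg_va + seg_sz); /* last boundary inside the segment */

	/* Only [kva, kva_end) can be covered by large pages. */
	if (kva < kva_end)
		printf("large pages cover %#lx-%#lx (%zu MB)\n",
		    (unsigned long)kva, (unsigned long)kva_end,
		    (size_t)((kva_end - kva) >> 20));
	return 0;
}
#endif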
1831 
1832 /*
1833  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1834  * to manage mappings.
1835  */
1836 void
1837 pmap_init(void)
1838 {
1839 	int flags;
1840 
1841 	/*
1842 	 * initialize caches.
1843 	 */
1844 
1845 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1846 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1847 
1848 #ifdef XENPV
1849 	/*
1850 	 * pool_cache(9) should not touch cached objects, since they
1851 	 * are pinned on xen and R/O for the domU
1852 	 */
1853 	flags = PR_NOTOUCH;
1854 #else
1855 	flags = 0;
1856 #endif
1857 
1858 #ifdef PAE
1859 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1860 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1861 #else
1862 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1863 	    "pdppl", NULL, IPL_NONE);
1864 #endif
1865 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1866 	     0, 0, "pvpage", &pool_allocator_kmem,
1867 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1868 
1869 	pmap_tlb_init();
1870 
1871 	/* XXX: Done here because cpu_hatch() only runs on secondary CPUs. */
1872 	pmap_tlb_cpu_init(curcpu());
1873 
1874 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1875 	    NULL, "x86", "io bitmap copy");
1876 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1877 	    NULL, "x86", "ldt sync");
1878 
1879 	/*
1880 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1881 	 * to hang a tree of pv_entry records.  Dynamically allocated
1882 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1883 	 * usual case is embedded), so cop out and use a single RB tree
1884 	 * to cover them.
1885 	 */
1886 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1887 
1888 	/*
1889 	 * done: pmap module is up (and ready for business)
1890 	 */
1891 
1892 	pmap_initialized = true;
1893 }
1894 
1895 #ifndef XENPV
1896 /*
1897  * pmap_cpu_init_late: perform late per-CPU initialization.
1898  */
1899 void
1900 pmap_cpu_init_late(struct cpu_info *ci)
1901 {
1902 	/*
1903 	 * The BP already has its own PD page allocated during early
1904 	 * MD startup.
1905 	 */
1906 	if (ci == &cpu_info_primary)
1907 		return;
1908 #ifdef PAE
1909 	cpu_alloc_l3_page(ci);
1910 #endif
1911 }
1912 #endif
1913 
1914 #ifndef __HAVE_DIRECT_MAP
1915 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1916 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1917 
1918 static void
1919 pmap_vpage_cpualloc(struct cpu_info *ci)
1920 {
1921 	bool primary = (ci == &cpu_info_primary);
1922 	size_t i, npages;
1923 	vaddr_t vabase;
1924 	vsize_t vrange;
1925 
1926 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1927 	KASSERT(npages >= VPAGE_MAX);
1928 	vrange = npages * PAGE_SIZE;
1929 
1930 	if (primary) {
1931 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1932 			/* Waste some pages to align properly */
1933 		}
1934 		/* The base is aligned, allocate the rest (contiguous) */
1935 		pmap_bootstrap_valloc(npages - 1);
1936 	} else {
1937 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1938 		    UVM_KMF_VAONLY);
1939 		if (vabase == 0) {
1940 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1941 			    __func__, cpu_index(ci));
1942 		}
1943 	}
1944 
1945 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1946 
1947 	for (i = 0; i < VPAGE_MAX; i++) {
1948 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1949 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1950 	}
1951 }
1952 
1953 void
1954 pmap_vpage_cpu_init(struct cpu_info *ci)
1955 {
1956 	if (ci == &cpu_info_primary) {
1957 		/* cpu0 already taken care of in pmap_bootstrap */
1958 		return;
1959 	}
1960 
1961 	pmap_vpage_cpualloc(ci);
1962 }
1963 #endif
1964 
1965 /*
1966  * p v _ e n t r y   f u n c t i o n s
1967  */
1968 
1969 /*
1970  * pmap_pvp_ctor: pool_cache constructor for PV pages.
1971  */
1972 static int
1973 pmap_pvp_ctor(void *arg, void *obj, int flags)
1974 {
1975 	struct pv_page *pvp = (struct pv_page *)obj;
1976 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
1977 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
1978 
1979 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
1980 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
1981 
1982 	LIST_INIT(&pvp->pvp_pves);
1983 	pvp->pvp_nfree = PVE_PER_PVP;
1984 	pvp->pvp_pmap = NULL;
1985 
1986 	for (; pve < maxpve; pve++) {
1987 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
1988 	}
1989 
1990 	return 0;
1991 }
1992 
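/*
 * pmap_pvp_ctor() above treats each PAGE_SIZE object as a small pv_page
 * header followed by PVE_PER_PVP pv_entry slots, all threaded onto the
 * page's free list.  A minimal userland sketch of that carving, kept out
 * of the build, with made-up structures standing in for pv_page/pv_entry:
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

#define SK_PAGE_SIZE	4096		/* stand-in for PAGE_SIZE */

struct sk_entry {			/* stand-in for struct pv_entry */
	struct sk_entry *next;
	void *payload;
};

struct sk_page {			/* stand-in for struct pv_page */
	struct sk_entry *free;		/* free-list head */
	int nfree;
};

/* Carve one page into a header plus as many entries as fit after it. */
static struct sk_page *
sk_page_ctor(void *obj)
{
	struct sk_page *pg = obj;
	struct sk_entry *e = (struct sk_entry *)obj + 1;	/* skip the header slot */
	int n = SK_PAGE_SIZE / sizeof(struct sk_entry) - 1;

	pg->free = NULL;
	pg->nfree = n;
	while (n-- > 0) {
		e->next = pg->free;
		pg->free = e;
		e++;
	}
	return pg;
}

int
main(void)
{
	void *obj = aligned_alloc(SK_PAGE_SIZE, SK_PAGE_SIZE);

	if (obj == NULL)
		return 1;
	printf("%d entries carved from one page\n", sk_page_ctor(obj)->nfree);
	free(obj);
	return 0;
}
#endif
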
1993 /*
1994  * pmap_pvp_dtor: pool_cache destructor for PV pages.
1995  */
1996 static void
1997 pmap_pvp_dtor(void *arg, void *obj)
1998 {
1999 	struct pv_page *pvp __diagused = obj;
2000 
2001 	KASSERT(pvp->pvp_pmap == NULL);
2002 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2003 }
2004 
2005 /*
2006  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2007  */
2008 static struct pv_entry *
2009 pmap_alloc_pv(struct pmap *pmap)
2010 {
2011 	struct pv_entry *pve;
2012 	struct pv_page *pvp;
2013 
2014 	KASSERT(mutex_owned(&pmap->pm_lock));
2015 
2016 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2017 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2018 			LIST_REMOVE(pvp, pvp_list);
2019 		} else {
2020 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2021 		}
2022 		if (__predict_false(pvp == NULL)) {
2023 			return NULL;
2024 		}
2025 		/* full -> part */
2026 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2027 		pvp->pvp_pmap = pmap;
2028 	}
2029 
2030 	KASSERT(pvp->pvp_pmap == pmap);
2031 	KASSERT(pvp->pvp_nfree > 0);
2032 
2033 	pve = LIST_FIRST(&pvp->pvp_pves);
2034 	LIST_REMOVE(pve, pve_list);
2035 	pvp->pvp_nfree--;
2036 
2037 	if (__predict_false(pvp->pvp_nfree == 0)) {
2038 		/* part -> empty */
2039 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2040 		LIST_REMOVE(pvp, pvp_list);
2041 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2042 	} else {
2043 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2044 	}
2045 
2046 	return pve;
2047 }
2048 
2049 /*
2050  * pmap_free_pv: delayed free of a PV entry.
2051  */
2052 static void
2053 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2054 {
2055 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2056 
2057 	KASSERT(mutex_owned(&pmap->pm_lock));
2058 	KASSERT(pvp->pvp_pmap == pmap);
2059 	KASSERT(pvp->pvp_nfree >= 0);
2060 
2061 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2062 	pvp->pvp_nfree++;
2063 
2064 	if (__predict_false(pvp->pvp_nfree == 1)) {
2065 		/* empty -> part */
2066 		LIST_REMOVE(pvp, pvp_list);
2067 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2068 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2069 		/* part -> full */
2070 		LIST_REMOVE(pvp, pvp_list);
2071 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2072 	}
2073 }
2074 
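/*
 * pmap_alloc_pv() and pmap_free_pv() above keep each PV page on one of
 * three lists keyed by its free count: "empty" (no free entries), "part",
 * and "full" (every entry free).  A page only changes lists when the count
 * crosses 0, 1 or PVE_PER_PVP.  A small standalone sketch of just that
 * bucket bookkeeping, kept out of the build, with a hypothetical capacity:
 */
#if 0
#include <assert.h>
#include <stdio.h>

#define CAPACITY 255			/* stand-in for PVE_PER_PVP */

enum bucket { EMPTY, PART, FULL };	/* mirrors pm_pvp_empty/_part/_full */

static enum bucket
bucket_of(int nfree)
{
	if (nfree == 0)
		return EMPTY;
	if (nfree == CAPACITY)
		return FULL;
	return PART;
}

int
main(void)
{
	int nfree = CAPACITY;		/* a freshly constructed page: all free */

	assert(bucket_of(nfree) == FULL);
	nfree--;			/* allocate one entry: full -> part */
	assert(bucket_of(nfree) == PART);

	while (nfree > 0)		/* allocate the rest: part -> empty */
		nfree--;
	assert(bucket_of(nfree) == EMPTY);

	nfree++;			/* free one entry: empty -> part */
	assert(bucket_of(nfree) == PART);

	printf("transitions match pmap_alloc_pv()/pmap_free_pv()\n");
	return 0;
}
#endif
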
2075 /*
2076  * pmap_drain_pv: free full PV pages.
2077  */
2078 static void
2079 pmap_drain_pv(struct pmap *pmap)
2080 {
2081 	struct pv_page *pvp;
2082 
2083 	KASSERT(mutex_owned(&pmap->pm_lock));
2084 
2085 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2086 		LIST_REMOVE(pvp, pvp_list);
2087 		KASSERT(pvp->pvp_pmap == pmap);
2088 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2089 		pvp->pvp_pmap = NULL;
2090 		pool_cache_put(&pmap_pvp_cache, pvp);
2091 	}
2092 }
2093 
2094 /*
2095  * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page
2096  * pmap_check_pv: verify that a {PTP, VA} pair is tracked or untracked by the page, as expected
2097 static void
2098 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2099     vaddr_t va, bool tracked)
2100 {
2101 #ifdef DEBUG
2102 	struct pv_pte *pvpte;
2103 
2104 	PMAP_CHECK_PP(pp);
2105 
2106 	mutex_spin_enter(&pp->pp_lock);
2107 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2108 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2109 			break;
2110 		}
2111 	}
2112 	mutex_spin_exit(&pp->pp_lock);
2113 
2114 	if (pvpte && !tracked) {
2115 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2116 	} else if (!pvpte && tracked) {
2117 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2118 	}
2119 #endif
2120 }
2121 
2122 /*
2123  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2124  *
2125  * => pmap must be locked
2126  */
2127 static struct pv_entry *
2128 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2129     const rb_tree_t *tree, const vaddr_t va)
2130 {
2131 	struct pv_entry *pve;
2132 	rb_node_t *node;
2133 
2134 	/*
2135 	 * Inlined lookup tailored to exactly what's needed here; it is
2136 	 * quite a bit faster than using rb_tree_find_node().
2137 	 */
2138 	for (node = tree->rbt_root;;) {
2139 		if (__predict_false(RB_SENTINEL_P(node))) {
2140 			return NULL;
2141 		}
2142 		pve = (struct pv_entry *)
2143 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2144 		if (pve->pve_pte.pte_va == va) {
2145 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2146 			return pve;
2147 		}
2148 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2149 	}
2150 }
2151 
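/*
 * The cast in pmap_treelookup_pv() above recovers the pv_entry from a
 * pointer to its embedded rb node by subtracting offsetof(struct pv_entry,
 * pve_rb) -- the usual "container of" trick.  A minimal standalone
 * illustration, kept out of the build, using a made-up structure:
 */
#if 0
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct node {				/* stand-in for rb_node_t */
	struct node *left, *right;
};

struct entry {				/* stand-in for struct pv_entry */
	int key;
	struct node link;		/* embedded, like pve_rb */
};

/* Recover the containing entry from a pointer to its embedded node. */
static struct entry *
entry_of(struct node *n)
{
	return (struct entry *)((uintptr_t)n - offsetof(struct entry, link));
}

int
main(void)
{
	struct entry e = { .key = 42 };

	assert(entry_of(&e.link) == &e);
	assert(entry_of(&e.link)->key == 42);
	return 0;
}
#endif
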
2152 /*
2153  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2154  *
2155  * => a PV entry must be known present (doesn't check for existence)
2156  * => pmap must be locked
2157  */
2158 static struct pv_entry *
2159 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2160     const struct pmap_page * const old_pp, const vaddr_t va)
2161 {
2162 	struct pv_entry *pve;
2163 	const rb_tree_t *tree;
2164 
2165 	KASSERT(mutex_owned(&pmap->pm_lock));
2166 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2167 
2168 	/*
2169 	 * [This mostly deals with the case of process-private pages, i.e.
2170 	 * anonymous memory allocations or COW.]
2171 	 *
2172 	 * If the page is tracked with an embedded entry then the tree
2173 	 * lookup can be avoided.  It's safe to check for this specific
2174 	 * set of values without pp_lock because both will only ever be
2175 	 * set together for this pmap.
2176 	 *
2177 	 */
2178 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2179 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2180 		return NULL;
2181 	}
2182 
2183 	/*
2184 	 * [This mostly deals with shared mappings, for example shared libs
2185 	 * and executables.]
2186 	 *
2187 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2188 	 * look at the lowest numbered node in the tree first.  The tree is
2189 	 * known non-empty because of the check above.  For short lived
2190 	 * processes where pmap_remove() isn't used much this gets close to
2191 	 * a 100% hit rate.
2192 	 */
2193 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2194 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2195 	pve = (struct pv_entry *)
2196 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2197 	    offsetof(struct pv_entry, pve_rb));
2198 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2199 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2200 		return pve;
2201 	}
2202 
2203 	/* Search the RB tree for the key (uncommon). */
2204 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2205 }
2206 
2207 /*
2208  * pmap_enter_pv: enter a mapping onto a pmap_page list
2209  *
2210  * => pmap must be locked
2211  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2212  */
2213 static int
2214 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2215     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2216     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2217 {
2218 	struct pv_entry *pve;
2219 	int error;
2220 
2221 	KASSERT(mutex_owned(&pmap->pm_lock));
2222 	KASSERT(ptp_to_pmap(ptp) == pmap);
2223 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2224 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2225 	PMAP_CHECK_PP(pp);
2226 
2227 	/*
2228 	 * If entering the same page and it's already tracked with an
2229 	 * embedded entry, we can avoid the expense below.  It's safe
2230 	 * to check for this very specific set of values without a lock
2231 	 * because both will only ever be set together for this pmap.
2232 	 */
2233 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2234 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2235 		*samepage = true;
2236 		pmap_check_pv(pmap, ptp, pp, va, true);
2237 		return 0;
2238 	}
2239 
2240 	/*
2241 	 * Check for an existing dynamic mapping at this address.  If it's
2242 	 * for the same page, then it will be reused and nothing needs to be
2243 	 * changed.
2244 	 */
2245 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2246 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2247 		*samepage = true;
2248 		pmap_check_pv(pmap, ptp, pp, va, true);
2249 		return 0;
2250 	}
2251 
2252 	/*
2253 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2254 	 * case it's needed; won't know for sure until the lock is taken.
2255 	 */
2256 	if (pmap->pm_pve == NULL) {
2257 		pmap->pm_pve = pmap_alloc_pv(pmap);
2258 	}
2259 
2260 	error = 0;
2261 	pmap_check_pv(pmap, ptp, pp, va, false);
2262 	mutex_spin_enter(&pp->pp_lock);
2263 	if (!pv_pte_embedded(pp)) {
2264 		/*
2265 		 * Embedded PV tracking available - easy.
2266 		 */
2267 		pp->pp_pte.pte_ptp = ptp;
2268 		pp->pp_pte.pte_va = va;
2269 		*new_embedded = true;
2270 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2271 		/*
2272 		 * No memory.
2273 		 */
2274 		error = ENOMEM;
2275 	} else {
2276 		/*
2277 		 * Install new pv_entry on the page.
2278 		 */
2279 		pve = pmap->pm_pve;
2280 		pmap->pm_pve = NULL;
2281 		*new_pve = pve;
2282 		pve->pve_pte.pte_ptp = ptp;
2283 		pve->pve_pte.pte_va = va;
2284 		pve->pve_pp = pp;
2285 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2286 	}
2287 	mutex_spin_exit(&pp->pp_lock);
2288 	if (error == 0) {
2289 		pmap_check_pv(pmap, ptp, pp, va, true);
2290 	}
2291 
2292 	return error;
2293 }
2294 
2295 /*
2296  * pmap_remove_pv: try to remove a mapping from a pv_list
2297  *
2298  * => pmap must be locked
2299  * => removes dynamic entries from tree and frees them
2300  * => caller should adjust ptp's wire_count and free PTP if needed
2301  */
2302 static void
2303 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2304     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2305 {
2306 	rb_tree_t *tree = (ptp != NULL ?
2307 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2308 
2309 	KASSERT(mutex_owned(&pmap->pm_lock));
2310 	KASSERT(ptp_to_pmap(ptp) == pmap);
2311 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2312 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2313 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2314 
2315 	pmap_check_pv(pmap, ptp, pp, va, true);
2316 
2317 	if (pve == NULL) {
2318 		mutex_spin_enter(&pp->pp_lock);
2319 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2320 		KASSERT(pp->pp_pte.pte_va == va);
2321 		pp->pp_attrs |= oattrs;
2322 		pp->pp_pte.pte_ptp = NULL;
2323 		pp->pp_pte.pte_va = 0;
2324 		mutex_spin_exit(&pp->pp_lock);
2325 	} else {
2326 		mutex_spin_enter(&pp->pp_lock);
2327 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2328 		    pp->pp_pte.pte_va != va);
2329 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2330 		KASSERT(pve->pve_pte.pte_va == va);
2331 		KASSERT(pve->pve_pp == pp);
2332 		pp->pp_attrs |= oattrs;
2333 		LIST_REMOVE(pve, pve_list);
2334 		mutex_spin_exit(&pp->pp_lock);
2335 
2336 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2337 		rb_tree_remove_node(tree, pve);
2338 #ifdef DIAGNOSTIC
2339 		memset(pve, 0, sizeof(*pve));
2340 #endif
2341 		pmap_free_pv(pmap, pve);
2342 	}
2343 
2344 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2345 	pmap_check_pv(pmap, ptp, pp, va, false);
2346 }
2347 
2348 /*
2349  * p t p   f u n c t i o n s
2350  */
2351 
2352 static struct vm_page *
2353 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2354 {
2355 	int lidx = level - 1;
2356 	off_t off = ptp_va2o(va, level);
2357 	struct vm_page *pg;
2358 
2359 	KASSERT(mutex_owned(&pmap->pm_lock));
2360 
2361 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2362 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2363 		pg = pmap->pm_ptphint[lidx];
2364 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2365 		return pg;
2366 	}
2367 	PMAP_DUMMY_LOCK(pmap);
2368 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2369 	PMAP_DUMMY_UNLOCK(pmap);
2370 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2371 		/* This page is queued to be freed - ignore. */
2372 		pg = NULL;
2373 	}
2374 	if (pg != NULL) {
2375 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2376 	}
2377 	pmap->pm_ptphint[lidx] = pg;
2378 	return pg;
2379 }
2380 
2381 static inline void
2382 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2383 {
2384 	int lidx;
2385 
2386 	KASSERT(ptp->wire_count <= 1);
2387 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2388 
2389 	lidx = level - 1;
2390 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2391 	if (pmap->pm_ptphint[lidx] == ptp)
2392 		pmap->pm_ptphint[lidx] = NULL;
2393 	ptp->wire_count = 0;
2394 	ptp->uanon = NULL;
2395 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2396 
2397 	/*
2398 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2399 	 * the page from the uvm_object, as that can take further locks
2400 	 * (intolerable right now because the PTEs are likely mapped in).
2401 	 * Instead mark the PTP as free and if we bump into it again, we'll
2402 	 * either ignore or reuse (depending on what's useful at the time).
2403 	 */
2404 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2405 }
2406 
2407 static void
2408 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2409 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2410 {
2411 	unsigned long index;
2412 	int level;
2413 	vaddr_t invaladdr;
2414 	pd_entry_t opde;
2415 
2416 	KASSERT(pmap != pmap_kernel());
2417 	KASSERT(mutex_owned(&pmap->pm_lock));
2418 	KASSERT(kpreempt_disabled());
2419 
2420 	level = 1;
2421 	do {
2422 		index = pl_i(va, level + 1);
2423 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2424 
2425 		/*
2426 		 * On Xen-amd64 or SVS, we need to sync the top level page
2427 		 * directory on each CPU.
2428 		 */
2429 #if defined(XENPV) && defined(__x86_64__)
2430 		if (level == PTP_LEVELS - 1) {
2431 			xen_kpm_sync(pmap, index);
2432 		}
2433 #elif defined(SVS)
2434 		if (svs_enabled && level == PTP_LEVELS - 1) {
2435 			svs_pmap_sync(pmap, index);
2436 		}
2437 #endif
2438 
2439 		invaladdr = level == 1 ? (vaddr_t)ptes :
2440 		    (vaddr_t)pdes[level - 2];
2441 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2442 		    opde, TLBSHOOT_FREE_PTP);
2443 
2444 #if defined(XENPV)
2445 		pmap_tlb_shootnow();
2446 #endif
2447 
2448 		pmap_freepage(pmap, ptp, level);
2449 		if (level < PTP_LEVELS - 1) {
2450 			ptp = pmap_find_ptp(pmap, va, level + 1);
2451 			ptp->wire_count--;
2452 			if (ptp->wire_count > 1)
2453 				break;
2454 		}
2455 	} while (++level < PTP_LEVELS);
2456 	pmap_pte_flush();
2457 }
2458 
2459 /*
2460  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2461  *
2462  * => pmap should NOT be pmap_kernel()
2463  * => pmap should be locked
2464  * => we are not touching any PTEs yet, so they need not be mapped in
2465  */
2466 static int
2467 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2468     int flags, struct vm_page **resultp)
2469 {
2470 	struct vm_page *ptp;
2471 	int i, aflags;
2472 	struct uvm_object *obj;
2473 	voff_t off;
2474 
2475 	KASSERT(pmap != pmap_kernel());
2476 	KASSERT(mutex_owned(&pmap->pm_lock));
2477 
2478 	/*
2479 	 * Loop through all page table levels allocating a page
2480 	 * for any level where we don't already have one.
2481 	 */
2482 	memset(pt, 0, sizeof(*pt));
2483 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2484 		UVM_PGA_ZERO;
2485 	for (i = PTP_LEVELS; i > 1; i--) {
2486 		obj = &pmap->pm_obj[i - 2];
2487 		off = ptp_va2o(va, i - 1);
2488 
2489 		PMAP_DUMMY_LOCK(pmap);
2490 		pt->pg[i] = uvm_pagelookup(obj, off);
2491 
2492 		if (pt->pg[i] == NULL) {
2493 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2494 			pt->alloced[i] = (pt->pg[i] != NULL);
2495 		} else if (pt->pg[i]->wire_count == 0) {
2496 			/* This page was queued to be freed; dequeue it. */
2497 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2498 			pt->alloced[i] = true;
2499 		}
2500 		PMAP_DUMMY_UNLOCK(pmap);
2501 		if (pt->pg[i] == NULL) {
2502 			pmap_unget_ptp(pmap, pt);
2503 			return ENOMEM;
2504 		} else if (pt->alloced[i]) {
2505 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2506 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2507 			    &pmap_rbtree_ops);
2508 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2509 		}
2510 	}
2511 	ptp = pt->pg[2];
2512 	KASSERT(ptp != NULL);
2513 	*resultp = ptp;
2514 	pmap->pm_ptphint[0] = ptp;
2515 	return 0;
2516 }
2517 
2518 /*
2519  * pmap_install_ptp: install any freshly allocated PTPs
2520  *
2521  * => pmap should NOT be pmap_kernel()
2522  * => pmap should be locked
2523  * => PTEs must be mapped
2524  * => preemption must be disabled
2525  */
2526 static void
2527 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2528     pd_entry_t * const *pdes)
2529 {
2530 	struct vm_page *ptp;
2531 	unsigned long index;
2532 	pd_entry_t *pva;
2533 	paddr_t pa;
2534 	int i;
2535 
2536 	KASSERT(pmap != pmap_kernel());
2537 	KASSERT(mutex_owned(&pmap->pm_lock));
2538 	KASSERT(kpreempt_disabled());
2539 
2540 	/*
2541 	 * Now that we have all the pages looked up or allocated,
2542 	 * loop through again installing any new ones into the tree.
2543 	 */
2544 	for (i = PTP_LEVELS; i > 1; i--) {
2545 		index = pl_i(va, i);
2546 		pva = pdes[i - 2];
2547 
2548 		if (pmap_valid_entry(pva[index])) {
2549 			KASSERT(!pt->alloced[i]);
2550 			continue;
2551 		}
2552 
2553 		ptp = pt->pg[i];
2554 		ptp->flags &= ~PG_BUSY; /* never busy */
2555 		ptp->wire_count = 1;
2556 		pmap->pm_ptphint[i - 2] = ptp;
2557 		pa = VM_PAGE_TO_PHYS(ptp);
2558 		pmap_pte_set(&pva[index], (pd_entry_t)
2559 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2560 
2561 		/*
2562 		 * On Xen-amd64 or SVS, we need to sync the top level page
2563 		 * directory on each CPU.
2564 		 */
2565 #if defined(XENPV) && defined(__x86_64__)
2566 		if (i == PTP_LEVELS) {
2567 			xen_kpm_sync(pmap, index);
2568 		}
2569 #elif defined(SVS)
2570 		if (svs_enabled && i == PTP_LEVELS) {
2571 			svs_pmap_sync(pmap, index);
2572 		}
2573 #endif
2574 
2575 		pmap_pte_flush();
2576 		pmap_stats_update(pmap, 1, 0);
2577 
2578 		/*
2579 		 * If we're not in the top level, increase the
2580 		 * wire count of the parent page.
2581 		 */
2582 		if (i < PTP_LEVELS) {
2583 			pt->pg[i + 1]->wire_count++;
2584 		}
2585 	}
2586 }
2587 
2588 /*
2589  * pmap_unget_ptp: free unused PTPs
2590  *
2591  * => pmap should NOT be pmap_kernel()
2592  * => pmap should be locked
2593  */
2594 static void
2595 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2596 {
2597 	int i;
2598 
2599 	KASSERT(pmap != pmap_kernel());
2600 	KASSERT(mutex_owned(&pmap->pm_lock));
2601 
2602 	for (i = PTP_LEVELS; i > 1; i--) {
2603 		if (!pt->alloced[i]) {
2604 			continue;
2605 		}
2606 		KASSERT(pt->pg[i]->wire_count == 0);
2607 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2608 		pmap_freepage(pmap, pt->pg[i], i - 1);
2609 	}
2610 }
2611 
2612 /*
2613  * p m a p   l i f e c y c l e   f u n c t i o n s
2614  */
2615 
2616 /*
2617  * pmap_pdp_init: constructor for a new PDP.
2618  */
2619 static void
2620 pmap_pdp_init(pd_entry_t *pdir)
2621 {
2622 	paddr_t pdirpa = 0;
2623 	vaddr_t object;
2624 	int i;
2625 
2626 #if !defined(XENPV) || !defined(__x86_64__)
2627 	int npde;
2628 #endif
2629 #ifdef XENPV
2630 	int s;
2631 #endif
2632 
2633 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2634 
2635 	/*
2636 	 * NOTE: This is all done unlocked, but we will check afterwards
2637 	 * if we have raced with pmap_growkernel().
2638 	 */
2639 
2640 #if defined(XENPV) && defined(__x86_64__)
2641 	/* Fetch the physical address of the page directory */
2642 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2643 
2644 	/*
2645 	 * This pdir will NEVER be active in kernel mode, so mark
2646 	 * recursive entry invalid.
2647 	 */
2648 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2649 
2650 	/*
2651 	 * PDP constructed this way won't be for the kernel, hence we
2652 	 * don't put kernel mappings on Xen.
2653 	 *
2654 	 * But we need to make pmap_create() happy, so put a dummy
2655 	 * (without PTE_P) value at the right place.
2656 	 */
2657 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2658 	     (pd_entry_t)-1 & PTE_FRAME;
2659 #else /* XENPV && __x86_64__*/
2660 	object = (vaddr_t)pdir;
2661 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2662 		/* Fetch the physical address of the page directory */
2663 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2664 
2665 		/* Put in recursive PDE to map the PTEs */
2666 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2667 		    pmap_pg_nx;
2668 #ifndef XENPV
2669 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2670 #endif
2671 	}
2672 
2673 	/* Copy the kernel's top level PDE */
2674 	npde = nkptp[PTP_LEVELS - 1];
2675 
2676 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2677 	    npde * sizeof(pd_entry_t));
2678 
2679 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2680 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2681 		pdir[idx] = PDP_BASE[idx];
2682 	}
2683 
2684 #ifdef __HAVE_PCPU_AREA
2685 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2686 #endif
2687 #ifdef __HAVE_DIRECT_MAP
2688 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2689 #endif
2690 #ifdef KASAN
2691 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2692 #endif
2693 #ifdef KMSAN
2694 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2695 #endif
2696 #endif /* XENPV  && __x86_64__*/
2697 
2698 #ifdef XENPV
2699 	s = splvm();
2700 	object = (vaddr_t)pdir;
2701 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2702 	    VM_PROT_READ);
2703 	pmap_update(pmap_kernel());
2704 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2705 		/*
2706 		 * Pin as an L2/L4 page; we have to do the page with the
2707 		 * PDIR_SLOT_PTE entries last.
2708 		 */
2709 #ifdef PAE
2710 		if (i == l2tol3(PDIR_SLOT_PTE))
2711 			continue;
2712 #endif
2713 
2714 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2715 #ifdef __x86_64__
2716 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2717 #else
2718 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2719 #endif
2720 	}
2721 #ifdef PAE
2722 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2723 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2724 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2725 #endif
2726 	splx(s);
2727 #endif /* XENPV */
2728 }
2729 
2730 /*
2731  * pmap_pdp_fini: destructor for the PDPs.
2732  */
2733 static void
2734 pmap_pdp_fini(pd_entry_t *pdir)
2735 {
2736 #ifdef XENPV
2737 	paddr_t pdirpa = 0;	/* XXX: GCC */
2738 	vaddr_t object = (vaddr_t)pdir;
2739 	int i;
2740 	int s = splvm();
2741 	pt_entry_t *pte;
2742 
2743 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2744 		/* fetch the physical address of the page directory. */
2745 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2746 		/* unpin page table */
2747 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2748 	}
2749 	object = (vaddr_t)pdir;
2750 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2751 		/* Set page RW again */
2752 		pte = kvtopte(object);
2753 		pmap_pte_set(pte, *pte | PTE_W);
2754 		xen_bcast_invlpg((vaddr_t)object);
2755 	}
2756 	splx(s);
2757 #endif  /* XENPV */
2758 }
2759 
2760 #ifdef PAE
2761 static void *
2762 pmap_pdp_alloc(struct pool *pp, int flags)
2763 {
2764 	return (void *)uvm_km_alloc(kernel_map,
2765 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2766 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2767 	    UVM_KMF_WIRED);
2768 }
2769 
2770 static void
2771 pmap_pdp_free(struct pool *pp, void *v)
2772 {
2773 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2774 	    UVM_KMF_WIRED);
2775 }
2776 #endif /* PAE */
2777 
2778 /*
2779  * pmap_ctor: constructor for the pmap cache.
2780  */
2781 static int
2782 pmap_ctor(void *arg, void *obj, int flags)
2783 {
2784 	struct pmap *pmap = obj;
2785 	pt_entry_t p;
2786 	int i;
2787 
2788 	KASSERT((flags & PR_WAITOK) != 0);
2789 
2790 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2791 	rw_init(&pmap->pm_dummy_lock);
2792 	kcpuset_create(&pmap->pm_cpus, true);
2793 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2794 #ifdef XENPV
2795 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2796 #endif
2797 	LIST_INIT(&pmap->pm_gc_ptp);
2798 	pmap->pm_pve = NULL;
2799 	LIST_INIT(&pmap->pm_pvp_full);
2800 	LIST_INIT(&pmap->pm_pvp_part);
2801 	LIST_INIT(&pmap->pm_pvp_empty);
2802 
2803 	/* allocate and init PDP */
2804 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2805 
2806 	for (;;) {
2807 		pmap_pdp_init(pmap->pm_pdir);
2808 		mutex_enter(&pmaps_lock);
2809 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2810 		if (__predict_true(p != 0)) {
2811 			break;
2812 		}
2813 		mutex_exit(&pmaps_lock);
2814 	}
2815 
2816 	for (i = 0; i < PDP_SIZE; i++)
2817 		pmap->pm_pdirpa[i] =
2818 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2819 
2820 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2821 	mutex_exit(&pmaps_lock);
2822 
2823 	return 0;
2824 }
2825 
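/*
 * The for (;;) loop in pmap_ctor() above initializes the PDP without
 * holding pmaps_lock, then checks under the lock that the last kernel
 * slot is non-zero; a zero slot means pmap_growkernel() grew the kernel
 * area in the meantime and the copied PDEs are stale, so the whole
 * initialization is redone.  A minimal sketch of that optimistic
 * init-then-recheck pattern, kept out of the build, with made-up state
 * and a plain pthread mutex standing in for pmaps_lock:
 */
#if 0
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define NSLOTS 512				/* stand-in for the kernel PDE slots */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long kern_pdes[NSLOTS];		/* stand-in for PDP_BASE[PDIR_SLOT_KERN..] */
static int nkern = 4;				/* stand-in for nkptp[PTP_LEVELS - 1] */

/* Copy the current kernel slots without the lock, as pmap_pdp_init() does. */
static void
pdp_init(unsigned long *pdir)
{
	memset(pdir, 0, NSLOTS * sizeof(pdir[0]));
	memcpy(pdir, kern_pdes, nkern * sizeof(pdir[0]));
}

int
main(void)
{
	unsigned long pdir[NSLOTS];
	int i;

	for (i = 0; i < NSLOTS; i++)
		kern_pdes[i] = 0x1000UL * (i + 1);	/* pretend these are valid PDEs */

	for (;;) {
		pdp_init(pdir);
		pthread_mutex_lock(&lock);
		if (pdir[nkern - 1] != 0)
			break;		/* copy still covers all kernel slots */
		/* Raced with a concurrent "growkernel": redo the copy. */
		pthread_mutex_unlock(&lock);
	}
	printf("PDP copy covers all %d kernel slots\n", nkern);
	pthread_mutex_unlock(&lock);
	return 0;
}
#endif
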
2826 /*
2827  * pmap_dtor: destructor for the pmap cache.
2828  */
2829 static void
2830 pmap_dtor(void *arg, void *obj)
2831 {
2832 	struct pmap *pmap = obj;
2833 
2834 	mutex_enter(&pmaps_lock);
2835 	LIST_REMOVE(pmap, pm_list);
2836 	mutex_exit(&pmaps_lock);
2837 
2838 	pmap_pdp_fini(pmap->pm_pdir);
2839 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2840 	mutex_destroy(&pmap->pm_lock);
2841 	rw_destroy(&pmap->pm_dummy_lock);
2842 	kcpuset_destroy(pmap->pm_cpus);
2843 	kcpuset_destroy(pmap->pm_kernel_cpus);
2844 #ifdef XENPV
2845 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2846 #endif
2847 }
2848 
2849 /*
2850  * pmap_create: create a pmap object.
2851  */
2852 struct pmap *
2853 pmap_create(void)
2854 {
2855 	struct pmap *pmap;
2856 	int i;
2857 
2858 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2859 
2860 	/* init uvm_object */
2861 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2862 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2863 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2864 		pmap->pm_ptphint[i] = NULL;
2865 	}
2866 	pmap->pm_stats.wired_count = 0;
2867 	/* count the PDP allocd below */
2868 	pmap->pm_stats.resident_count = PDP_SIZE;
2869 #if !defined(__x86_64__)
2870 	pmap->pm_hiexec = 0;
2871 #endif
2872 
2873 	/* Used by NVMM and Xen */
2874 	pmap->pm_enter = NULL;
2875 	pmap->pm_extract = NULL;
2876 	pmap->pm_remove = NULL;
2877 	pmap->pm_sync_pv = NULL;
2878 	pmap->pm_pp_remove_ent = NULL;
2879 	pmap->pm_write_protect = NULL;
2880 	pmap->pm_unwire = NULL;
2881 	pmap->pm_tlb_flush = NULL;
2882 	pmap->pm_data = NULL;
2883 
2884 	/* init the LDT */
2885 	pmap->pm_ldt = NULL;
2886 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2887 
2888 	return (pmap);
2889 }
2890 
2891 /*
2892  * pmap_check_ptps: verify that none of the pmap's page table objects
2893  * have any pages allocated to them.
2894  */
2895 static void
2896 pmap_check_ptps(struct pmap *pmap)
2897 {
2898 	int i;
2899 
2900 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2901 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2902 		    "pmap %p level %d still has %d pages",
2903 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2904 	}
2905 }
2906 
2907 static void
2908 pmap_check_inuse(struct pmap *pmap)
2909 {
2910 #ifdef DEBUG
2911 	CPU_INFO_ITERATOR cii;
2912 	struct cpu_info *ci;
2913 
2914 	for (CPU_INFO_FOREACH(cii, ci)) {
2915 		if (ci->ci_pmap == pmap)
2916 			panic("destroying pmap being used");
2917 #if defined(XENPV) && defined(__x86_64__)
2918 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2919 			if (pmap->pm_pdir[i] != 0 &&
2920 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2921 				printf("pmap_destroy(%p) pmap_kernel %p "
2922 				    "curcpu %d cpu %d ci_pmap %p "
2923 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2924 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2925 				    pmap, pmap_kernel(), curcpu()->ci_index,
2926 				    ci->ci_index, ci->ci_pmap,
2927 				    i, ci->ci_kpm_pdir[i],
2928 				    i, pmap->pm_pdir[i]);
2929 				panic("%s: used pmap", __func__);
2930 			}
2931 		}
2932 #endif
2933 	}
2934 #endif /* DEBUG */
2935 }
2936 
2937 /*
2938  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
2939  * count goes to zero.
2940  *
2941  * => we can be called from pmap_unmap_ptes() with a different, unrelated
2942  *    pmap's lock held.  be careful!
2943  */
2944 void
2945 pmap_destroy(struct pmap *pmap)
2946 {
2947 	int i;
2948 
2949 	/*
2950 	 * drop reference count and verify not in use.
2951 	 */
2952 
2953 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2954 		return;
2955 	}
2956 	pmap_check_inuse(pmap);
2957 
2958 	/*
2959 	 * handle any deferred frees.
2960 	 */
2961 
2962 	mutex_enter(&pmap->pm_lock);
2963 	if (pmap->pm_pve != NULL) {
2964 		pmap_free_pv(pmap, pmap->pm_pve);
2965 		pmap->pm_pve = NULL;
2966 	}
2967 	pmap_drain_pv(pmap);
2968 	mutex_exit(&pmap->pm_lock);
2969 	pmap_update(pmap);
2970 
2971 	/*
2972 	 * Reference count is zero, free pmap resources and then free pmap.
2973 	 */
2974 
2975 	pmap_check_ptps(pmap);
2976 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
2977 
2978 #ifdef USER_LDT
2979 	if (pmap->pm_ldt != NULL) {
2980 		/*
2981 		 * No need to switch the LDT; this address space is gone,
2982 		 * nothing is using it.
2983 		 *
2984 		 * No need to lock the pmap for ldt_free (or anything else),
2985 		 * we're the last one to use it.
2986 		 */
2987 		/* XXXAD can't take cpu_lock here - fix soon. */
2988 		mutex_enter(&cpu_lock);
2989 		ldt_free(pmap->pm_ldt_sel);
2990 		mutex_exit(&cpu_lock);
2991 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2992 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
2993 	}
2994 #endif
2995 
2996 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2997 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2998 	}
2999 	kcpuset_zero(pmap->pm_cpus);
3000 	kcpuset_zero(pmap->pm_kernel_cpus);
3001 #ifdef XENPV
3002 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3003 #endif
3004 
3005 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3006 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3007 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3008 
3009 	pmap_check_ptps(pmap);
3010 	if (__predict_false(pmap->pm_enter != NULL)) {
3011 		/* XXX make this a different cache */
3012 		pool_cache_destruct_object(&pmap_cache, pmap);
3013 	} else {
3014 		pool_cache_put(&pmap_cache, pmap);
3015 	}
3016 }
3017 
3018 /*
3019  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3020  *
3021  * => caller must hold pmap's lock
3022  * => PTP must be mapped into KVA
3023  * => must be called with kernel preemption disabled
3024  * => does as little work as possible
3025  */
3026 static void
3027 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3028     vaddr_t startva, vaddr_t blkendva)
3029 {
3030 #ifndef XENPV
3031 	struct pv_entry *pve;
3032 	struct vm_page *pg;
3033 	struct pmap_page *pp;
3034 	pt_entry_t opte;
3035 	rb_tree_t *tree;
3036 	vaddr_t va;
3037 	int wired;
3038 	uint8_t oattrs;
3039 	u_int cnt;
3040 
3041 	KASSERT(mutex_owned(&pmap->pm_lock));
3042 	KASSERT(kpreempt_disabled());
3043 	KASSERT(pmap != pmap_kernel());
3044 	KASSERT(ptp->wire_count > 1);
3045 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3046 
3047 	/*
3048 	 * Start at the lowest entered VA, and scan until there are no more
3049 	 * PTEs in the PTP.
3050 	 */
3051 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3052 	pve = RB_TREE_MIN(tree);
3053 	wired = 0;
3054 	va = (vaddr_t)ptp->uanon;
3055 	pte += ((va - startva) >> PAGE_SHIFT);
3056 
3057 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3058 		/*
3059 		 * No need for an atomic to clear the PTE.  Nothing else can
3060 		 * see the address space any more and speculative access (if
3061 		 * possible) won't modify.  Therefore there's no need to
3062 		 * track the accessed/dirty bits.
3063 		 */
3064 		opte = *pte;
3065 		if (!pmap_valid_entry(opte)) {
3066 			continue;
3067 		}
3068 
3069 		/*
3070 		 * Count the PTE.  If it's not for a managed mapping
3071 		 * there's nothing more to do.
3072 		 */
3073 		cnt--;
3074 		wired -= (opte & PTE_WIRED);
3075 		if ((opte & PTE_PVLIST) == 0) {
3076 #ifndef DOM0OPS
3077 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3078 			    "managed page without PTE_PVLIST for %#"
3079 			    PRIxVADDR, va);
3080 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3081 			    "pv-tracked page without PTE_PVLIST for %#"
3082 			    PRIxVADDR, va);
3083 #endif
3084 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3085 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3086 			    va) == NULL);
3087 			continue;
3088 		}
3089 
3090 		/*
3091 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3092 		 * in the PTP.  If it's for this VA, take advantage of it to
3093 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3094 		 * tree by skipping to the next VA in the tree whenever
3095 		 * there is a match here.  The tree will be cleared out in
3096 		 * one pass before return to pmap_remove_all().
3097 		 */
3098 		oattrs = pmap_pte_to_pp_attrs(opte);
3099 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3100 			pp = pve->pve_pp;
3101 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3102 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3103 			    pp->pp_pte.pte_va != va);
3104 			mutex_spin_enter(&pp->pp_lock);
3105 			pp->pp_attrs |= oattrs;
3106 			LIST_REMOVE(pve, pve_list);
3107 			mutex_spin_exit(&pp->pp_lock);
3108 
3109 			/*
3110 			 * pve won't be touched again until pmap_drain_pv(),
3111 			 * so it's still safe to traverse the tree.
3112 			 */
3113 			pmap_free_pv(pmap, pve);
3114 			pve = RB_TREE_NEXT(tree, pve);
3115 			continue;
3116 		}
3117 
3118 		/*
3119 		 * No entry in the tree so it must be embedded.  Look up the
3120 		 * page and cancel the embedded entry.
3121 		 */
3122 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3123 			pp = VM_PAGE_TO_PP(pg);
3124 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3125 			paddr_t pa = pmap_pte2pa(opte);
3126 			panic("%s: PTE_PVLIST with pv-untracked page"
3127 			    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR
3128 			    " (%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3129 		}
3130 		mutex_spin_enter(&pp->pp_lock);
3131 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3132 		KASSERT(pp->pp_pte.pte_va == va);
3133 		pp->pp_attrs |= oattrs;
3134 		pp->pp_pte.pte_ptp = NULL;
3135 		pp->pp_pte.pte_va = 0;
3136 		mutex_spin_exit(&pp->pp_lock);
3137 	}
3138 
3139 	/* PTP now empty - adjust the tree & stats to match. */
3140 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3141 	ptp->wire_count = 1;
3142 #ifdef DIAGNOSTIC
3143 	rb_tree_init(tree, &pmap_rbtree_ops);
3144 #endif
3145 #else	/* !XENPV */
3146 	/*
3147 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3148 	 * I guess the hypervisor keeps track of PTEs too.
3149 	 */
3150 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3151 #endif	/* !XENPV */
3152 }
3153 
3154 /*
3155  * pmap_remove_all: remove all mappings from pmap in bulk.
3156  *
3157  * Ordinarily when removing mappings it's important to hold the UVM object's
3158  * lock, so that pages do not gain a new identity while retaining stale TLB
3159  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3160  * Here it's known that the address space is no longer visible to any user
3161  * process, so we don't need to worry about that.
3162  */
3163 bool
3164 pmap_remove_all(struct pmap *pmap)
3165 {
3166 	struct vm_page *ptps[32];
3167 	vaddr_t va, blkendva;
3168 	struct pmap *pmap2;
3169 	pt_entry_t *ptes;
3170 	pd_entry_t pde __diagused;
3171 	pd_entry_t * const *pdes;
3172 	int lvl __diagused, i, n;
3173 
3174 	/* XXX Can't handle EPT just yet. */
3175 	if (pmap->pm_remove != NULL) {
3176 		return false;
3177 	}
3178 
3179 	for (;;) {
3180 		/* Fetch a block of PTPs from tree. */
3181 		mutex_enter(&pmap->pm_lock);
3182 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3183 		    (void **)ptps, __arraycount(ptps), false);
3184 		if (n == 0) {
3185 			mutex_exit(&pmap->pm_lock);
3186 			break;
3187 		}
3188 
3189 		/* Remove all mappings in the set of PTPs. */
3190 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3191 		for (i = 0; i < n; i++) {
3192 			if (ptps[i]->wire_count == 0) {
3193 				/* It's dead: pmap_update() will expunge. */
3194 				continue;
3195 			}
3196 
3197 			/* Determine range of block. */
3198 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3199 			blkendva = x86_round_pdr(va + 1);
3200 
3201 			/* Make sure everything squares up... */
3202 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3203 			KASSERT(lvl == 1);
3204 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3205 
3206 			/* Zap! */
3207 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3208 			    blkendva);
3209 
3210 			/* PTP should now be unused - free it. */
3211 			KASSERT(ptps[i]->wire_count == 1);
3212 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3213 		}
3214 		pmap_unmap_ptes(pmap, pmap2);
3215 		pmap_drain_pv(pmap);
3216 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3217 		mutex_exit(&pmap->pm_lock);
3218 
3219 		/* Process deferred frees. */
3220 		pmap_update(pmap);
3221 
3222 		/* A breathing point. */
3223 		preempt_point();
3224 	}
3225 
3226 	/* Verify that the pmap is now completely empty. */
3227 	pmap_check_ptps(pmap);
3228 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3229 	    "pmap %p not empty", pmap);
3230 
3231 	return true;
3232 }
3233 
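/*
 * pmap_remove_all() above drains the pmap in bounded batches: look up at
 * most 32 PTPs with the lock held, zap and free them, then drop the lock
 * and run pmap_update()/preempt_point() before fetching the next batch,
 * so the lock is never held across the whole address space.  A minimal
 * sketch of that fetch-a-batch-then-yield loop, kept out of the build,
 * with a made-up gang_lookup() standing in for
 * radix_tree_gang_lookup_node():
 */
#if 0
#include <stdio.h>

#define BATCH 32			/* matches the ptps[32] array above */

/* Hand out up to "max" of the remaining items; return how many. */
static int
gang_lookup(int *remaining, int *out, int max)
{
	int n = (*remaining < max) ? *remaining : max;
	int i;

	for (i = 0; i < n; i++)
		out[i] = --(*remaining);
	return n;
}

int
main(void)
{
	int items = 100;		/* pretend the pmap holds 100 PTPs */
	int batch[BATCH];
	int i, n, total = 0;

	for (;;) {
		/* lock */
		n = gang_lookup(&items, batch, BATCH);
		if (n == 0)
			break;		/* unlock and stop: nothing left */
		for (i = 0; i < n; i++)
			total++;	/* "zap" and "free" one PTP */
		/* unlock, then the equivalent of pmap_update()/preempt_point() */
	}
	printf("processed %d PTPs in batches of up to %d\n", total, BATCH);
	return 0;
}
#endif
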
3234 #if defined(PMAP_FORK)
3235 /*
3236  * pmap_fork: perform any necessary data structure manipulation when
3237  * a VM space is forked.
3238  */
3239 void
3240 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3241 {
3242 #ifdef USER_LDT
3243 	union descriptor *new_ldt;
3244 	int sel;
3245 
3246 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3247 		return;
3248 	}
3249 
3250 	/*
3251 	 * Copy the LDT into the new process.
3252 	 *
3253 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3254 	 * we'll retry. This will starve if there's a stream of LDT changes
3255 	 * in another thread but that should not happen.
3256 	 */
3257 
3258 retry:
3259 	if (pmap1->pm_ldt != NULL) {
3260 		/* Allocate space for the new process's LDT */
3261 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3262 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3263 		if (new_ldt == NULL) {
3264 			printf("WARNING: %s: unable to allocate LDT space\n",
3265 			    __func__);
3266 			return;
3267 		}
3268 		mutex_enter(&cpu_lock);
3269 		/* Get a GDT slot for it */
3270 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3271 		if (sel == -1) {
3272 			mutex_exit(&cpu_lock);
3273 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3274 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3275 			printf("WARNING: %s: unable to allocate LDT selector\n",
3276 			    __func__);
3277 			return;
3278 		}
3279 	} else {
3280 		/* Wasn't anything there after all. */
3281 		new_ldt = NULL;
3282 		sel = -1;
3283 		mutex_enter(&cpu_lock);
3284 	}
3285 
3286  	/*
3287 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3288 	 */
3289  	if (pmap1->pm_ldt != NULL) {
3290 		if (new_ldt == NULL) {
3291 			/* A wild LDT just appeared. */
3292 			mutex_exit(&cpu_lock);
3293 			goto retry;
3294 		}
3295 
3296 		/* Copy the LDT data and install it in pmap2 */
3297 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3298 		pmap2->pm_ldt = new_ldt;
3299 		pmap2->pm_ldt_sel = sel;
3300 		mutex_exit(&cpu_lock);
3301 	} else {
3302 		if (new_ldt != NULL) {
3303 			/* The LDT disappeared, drop what we did. */
3304 			ldt_free(sel);
3305 			mutex_exit(&cpu_lock);
3306 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3307 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3308 			return;
3309 		}
3310 
3311 		/* We're good, just leave. */
3312 		mutex_exit(&cpu_lock);
3313 	}
3314 #endif /* USER_LDT */
3315 }
3316 #endif /* PMAP_FORK */
3317 
3318 #ifdef USER_LDT
3319 
3320 /*
3321  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3322  * is active, reload LDTR.
3323  */
3324 static void
3325 pmap_ldt_xcall(void *arg1, void *arg2)
3326 {
3327 	struct pmap *pm;
3328 
3329 	kpreempt_disable();
3330 	pm = arg1;
3331 	if (curcpu()->ci_pmap == pm) {
3332 #if defined(SVS)
3333 		if (svs_enabled) {
3334 			svs_ldt_sync(pm);
3335 		} else
3336 #endif
3337 		lldt(pm->pm_ldt_sel);
3338 	}
3339 	kpreempt_enable();
3340 }
3341 
3342 /*
3343  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3344  * in the new selector on all CPUs.
3345  */
3346 void
3347 pmap_ldt_sync(struct pmap *pm)
3348 {
3349 	uint64_t where;
3350 
3351 	KASSERT(mutex_owned(&cpu_lock));
3352 
3353 	pmap_ldt_evcnt.ev_count++;
3354 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3355 	xc_wait(where);
3356 }
3357 
3358 /*
3359  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3360  * restore the default.
3361  */
3362 void
3363 pmap_ldt_cleanup(struct lwp *l)
3364 {
3365 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3366 	union descriptor *ldt;
3367 	int sel;
3368 
3369 	if (__predict_true(pmap->pm_ldt == NULL)) {
3370 		return;
3371 	}
3372 
3373 	mutex_enter(&cpu_lock);
3374 	if (pmap->pm_ldt != NULL) {
3375 		sel = pmap->pm_ldt_sel;
3376 		ldt = pmap->pm_ldt;
3377 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3378 		pmap->pm_ldt = NULL;
3379 		pmap_ldt_sync(pmap);
3380 		ldt_free(sel);
3381 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3382 		    UVM_KMF_WIRED);
3383 	}
3384 	mutex_exit(&cpu_lock);
3385 }
3386 #endif /* USER_LDT */
3387 
3388 /*
3389  * pmap_activate: activate a process' pmap
3390  *
3391  * => must be called with kernel preemption disabled
3392  * => if lwp is the curlwp, then set ci_want_pmapload so that
3393  *    actual MMU context switch will be done by pmap_load() later
3394  */
3395 void
3396 pmap_activate(struct lwp *l)
3397 {
3398 	struct cpu_info *ci;
3399 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3400 
3401 	KASSERT(kpreempt_disabled());
3402 
3403 	ci = curcpu();
3404 
3405 	if (l != ci->ci_curlwp)
3406 		return;
3407 
3408 	KASSERT(ci->ci_want_pmapload == 0);
3409 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3410 
3411 	/*
3412 	 * no need to switch to kernel vmspace because
3413 	 * it's a subset of any vmspace.
3414 	 */
3415 
3416 	if (pmap == pmap_kernel()) {
3417 		ci->ci_want_pmapload = 0;
3418 		return;
3419 	}
3420 
3421 	ci->ci_want_pmapload = 1;
3422 }
3423 
3424 #if defined(XENPV) && defined(__x86_64__)
3425 #define	KASSERT_PDIRPA(pmap) \
3426 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3427 	    pmap == pmap_kernel())
3428 #elif defined(PAE)
3429 #define	KASSERT_PDIRPA(pmap) \
3430 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3431 #elif !defined(XENPV)
3432 #define	KASSERT_PDIRPA(pmap) \
3433 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3434 #else
3435 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
3436 #endif
3437 
3438 /*
3439  * pmap_reactivate: try to regain reference to the pmap.
3440  *
3441  * => Must be called with kernel preemption disabled.
3442  */
3443 static void
3444 pmap_reactivate(struct pmap *pmap)
3445 {
3446 	struct cpu_info * const ci = curcpu();
3447 	const cpuid_t cid = cpu_index(ci);
3448 
3449 	KASSERT(kpreempt_disabled());
3450 	KASSERT_PDIRPA(pmap);
3451 
3452 	/*
3453 	 * If we still have a lazy reference to this pmap, we can assume
3454 	 * that there was no TLB shootdown for this pmap in the meantime.
3455 	 *
3456 	 * The order of events here is important as we must synchronize
3457 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3458 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3459 	 * change only when the state is TLBSTATE_LAZY.
3460 	 */
3461 
3462 	ci->ci_tlbstate = TLBSTATE_VALID;
3463 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3464 
3465 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3466 		/* We have the reference, state is valid. */
3467 	} else {
3468 		/*
3469 		 * Must reload the TLB: the pmap was changed while we were
3470 		 * deactivated.
3471 		 */
3472 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3473 
3474 		tlbflush();
3475 	}
3476 }
3477 
3478 /*
3479  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3480  * and relevant LDT info.
3481  *
3482  * Ensures that the current process' pmap is loaded on the current CPU's
3483  * MMU and that there are no stale TLB entries.
3484  *
3485  * => The caller should disable kernel preemption or do check-and-retry
3486  *    to prevent a preemption from undoing our efforts.
3487  * => This function may block.
3488  */
3489 void
3490 pmap_load(void)
3491 {
3492 	struct cpu_info *ci;
3493 	struct pmap *pmap, *oldpmap;
3494 	struct lwp *l;
3495 	uint64_t ncsw;
3496 
3497 	kpreempt_disable();
3498  retry:
3499 	ci = curcpu();
3500 	if (!ci->ci_want_pmapload) {
3501 		kpreempt_enable();
3502 		return;
3503 	}
3504 	l = ci->ci_curlwp;
3505 	ncsw = l->l_ncsw;
3506 	__insn_barrier();
3507 
3508 	/* should be able to take ipis. */
3509 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3510 #ifdef XENPV
3511 	/* Check to see if interrupts are enabled (i.e., no events are masked) */
3512 	KASSERT(x86_read_psl() == 0);
3513 #else
3514 	KASSERT((x86_read_psl() & PSL_I) != 0);
3515 #endif
3516 
3517 	KASSERT(l != NULL);
3518 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3519 	KASSERT(pmap != pmap_kernel());
3520 	oldpmap = ci->ci_pmap;
3521 
3522 	if (pmap == oldpmap) {
3523 		pmap_reactivate(pmap);
3524 		ci->ci_want_pmapload = 0;
3525 		kpreempt_enable();
3526 		return;
3527 	}
3528 
3529 	/*
3530 	 * Acquire a reference to the new pmap and perform the switch.
3531 	 */
3532 
3533 	pmap_reference(pmap);
3534 	pmap_load1(l, pmap, oldpmap);
3535 	ci->ci_want_pmapload = 0;
3536 
3537 	/*
3538 	 * we're now running with the new pmap.  drop the reference
3539 	 * to the old pmap.  if we block, we need to go around again.
3540 	 */
3541 
3542 	pmap_destroy(oldpmap);
3543 	__insn_barrier();
3544 	if (l->l_ncsw != ncsw) {
3545 		goto retry;
3546 	}
3547 
3548 	kpreempt_enable();
3549 }
3550 
3551 /*
3552  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3553  * pmap_load().  It's critically important that this function does not
3554  * block.
3555  */
3556 static void
3557 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3558 {
3559 	struct cpu_info *ci;
3560 	struct pcb *pcb;
3561 	cpuid_t cid;
3562 
3563 	KASSERT(kpreempt_disabled());
3564 
3565 	pcb = lwp_getpcb(l);
3566 	ci = l->l_cpu;
3567 	cid = cpu_index(ci);
3568 
3569 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3570 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3571 
3572 	KASSERT_PDIRPA(oldpmap);
3573 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3574 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3575 
3576 	/*
3577 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3578 	 * with TLB shootdown interrupts, so set the state VALID first,
3579 	 * then register us for shootdown events on this pmap.
3580 	 */
3581 	ci->ci_tlbstate = TLBSTATE_VALID;
3582 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3583 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3584 	ci->ci_pmap = pmap;
3585 
3586 	/*
3587 	 * update tss.  now that we have registered for invalidations
3588 	 * from other CPUs, we're good to load the page tables.
3589 	 */
3590 #ifdef PAE
3591 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3592 #else
3593 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3594 #endif
3595 
3596 #ifdef i386
3597 #ifndef XENPV
3598 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3599 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3600 #endif
3601 #endif
3602 
3603 #if defined(SVS) && defined(USER_LDT)
3604 	if (svs_enabled) {
3605 		svs_ldt_sync(pmap);
3606 	} else
3607 #endif
3608 	lldt(pmap->pm_ldt_sel);
3609 
3610 	cpu_load_pmap(pmap, oldpmap);
3611 }
3612 
3613 /*
3614  * pmap_deactivate: deactivate a process' pmap.
3615  *
3616  * => Must be called with kernel preemption disabled (high IPL is enough).
3617  */
3618 void
3619 pmap_deactivate(struct lwp *l)
3620 {
3621 	struct pmap *pmap;
3622 	struct cpu_info *ci;
3623 
3624 	KASSERT(kpreempt_disabled());
3625 
3626 	if (l != curlwp) {
3627 		return;
3628 	}
3629 
3630 	/*
3631 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3632 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3633 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3634 	 * to kernel preemption or blocking routine in between.
3635 	 */
3636 	pmap_tlb_shootnow();
3637 
3638 	ci = curcpu();
3639 
3640 	if (ci->ci_want_pmapload) {
3641 		/*
3642 		 * ci_want_pmapload means that our pmap is not loaded on
3643 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3644 		 * is always considered loaded.
3645 		 */
3646 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3647 		    != pmap_kernel());
3648 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3649 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3650 
3651 		/*
3652 		 * userspace has not been touched.
3653 		 * nothing to do here.
3654 		 */
3655 
3656 		ci->ci_want_pmapload = 0;
3657 		return;
3658 	}
3659 
3660 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3661 
3662 	if (pmap == pmap_kernel()) {
3663 		return;
3664 	}
3665 
3666 	KASSERT_PDIRPA(pmap);
3667 	KASSERT(ci->ci_pmap == pmap);
3668 
3669 	/*
3670 	 * we aren't interested in TLB invalidations for this pmap,
3671 	 * at least for the time being.
3672 	 */
3673 
3674 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3675 	ci->ci_tlbstate = TLBSTATE_LAZY;
3676 }
3677 
3678 /*
3679  * some misc. functions
3680  */
3681 
3682 bool
3683 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3684     int *lastlvl)
3685 {
3686 	unsigned long index;
3687 	pd_entry_t pde;
3688 	int i;
3689 
3690 	for (i = PTP_LEVELS; i > 1; i--) {
3691 		index = pl_i(va, i);
3692 		pde = pdes[i - 2][index];
3693 		if ((pde & PTE_P) == 0) {
3694 			*lastlvl = i;
3695 			return false;
3696 		}
3697 		if (pde & PTE_PS)
3698 			break;
3699 	}
3700 	if (lastpde != NULL)
3701 		*lastpde = pde;
3702 	*lastlvl = i;
3703 	return true;
3704 }
3705 
3706 /*
3707  * pmap_extract: extract a PA for the given VA
3708  */
3709 bool
3710 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3711 {
3712 	pt_entry_t *ptes, pte;
3713 	pd_entry_t pde;
3714 	pd_entry_t * const *pdes;
3715 	struct pmap *pmap2;
3716 	paddr_t pa;
3717 	bool rv;
3718 	int lvl;
3719 
3720 	if (__predict_false(pmap->pm_extract != NULL)) {
3721 		return (*pmap->pm_extract)(pmap, va, pap);
3722 	}
3723 
3724 #ifdef __HAVE_DIRECT_MAP
3725 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3726 		if (pap != NULL) {
3727 			*pap = PMAP_DIRECT_UNMAP(va);
3728 		}
3729 		return true;
3730 	}
3731 #endif
3732 
3733 	rv = false;
3734 	pa = 0;
3735 
3736 	if (pmap != pmap_kernel()) {
3737 		mutex_enter(&pmap->pm_lock);
3738 	}
3739 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3740 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3741 		if (lvl == 2) {
3742 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3743 			rv = true;
3744 		} else {
3745 			KASSERT(lvl == 1);
3746 			pte = ptes[pl1_i(va)];
3747 			if (__predict_true((pte & PTE_P) != 0)) {
3748 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3749 				rv = true;
3750 			}
3751 		}
3752 	}
3753 	pmap_unmap_ptes(pmap, pmap2);
3754 	if (pmap != pmap_kernel()) {
3755 		mutex_exit(&pmap->pm_lock);
3756 	}
3757 	if (pap != NULL) {
3758 		*pap = pa;
3759 	}
3760 
3761 	return rv;
3762 }
3763 
3764 /*
3765  * vtophys: virtual address to physical address.  For use by
3766  * machine-dependent code only.
3767  */
3768 paddr_t
3769 vtophys(vaddr_t va)
3770 {
3771 	paddr_t pa;
3772 
3773 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3774 		return pa;
3775 	return 0;
3776 }
3777 
3778 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3779 
3780 #ifdef XENPV
3781 /*
3782  * vtomach: virtual address to machine address.  For use by
3783  * machine-dependent code only.
3784  */
3785 paddr_t
3786 vtomach(vaddr_t va)
3787 {
3788 	paddr_t pa;
3789 
3790 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3791 		return pa;
3792 	return 0;
3793 }
3794 #endif
3795 
3796 /*
3797  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3798  * determine the bounds of the kernel virtual address space.
3799  */
3800 void
3801 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3802 {
3803 	*startp = virtual_avail;
3804 	*endp = virtual_end;
3805 }
3806 
3807 void
3808 pmap_zero_page(paddr_t pa)
3809 {
3810 #if defined(__HAVE_DIRECT_MAP)
3811 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
3812 #else
3813 #if defined(XENPV)
3814 	if (XEN_VERSION_SUPPORTED(3, 4))
3815 		xen_pagezero(pa);
3816 #endif
3817 	struct cpu_info *ci;
3818 	pt_entry_t *zpte;
3819 	vaddr_t zerova;
3820 
3821 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3822 
3823 	kpreempt_disable();
3824 
3825 	ci = curcpu();
3826 	zerova = ci->vpage[VPAGE_ZER];
3827 	zpte = ci->vpage_pte[VPAGE_ZER];
3828 
3829 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3830 
3831 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3832 	pmap_pte_flush();
3833 	pmap_update_pg(zerova);		/* flush TLB */
3834 
3835 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
3836 
3837 #if defined(DIAGNOSTIC) || defined(XENPV)
3838 	pmap_pte_set(zpte, 0);				/* zap ! */
3839 	pmap_pte_flush();
3840 #endif
3841 
3842 	kpreempt_enable();
3843 #endif /* defined(__HAVE_DIRECT_MAP) */
3844 }
3845 
3846 void
3847 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3848 {
3849 #if defined(__HAVE_DIRECT_MAP)
3850 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3851 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3852 
3853 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3854 #else
3855 #if defined(XENPV)
3856 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3857 		xen_copy_page(srcpa, dstpa);
3858 		return;
3859 	}
3860 #endif
3861 	struct cpu_info *ci;
3862 	pt_entry_t *srcpte, *dstpte;
3863 	vaddr_t srcva, dstva;
3864 
3865 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
3866 
3867 	kpreempt_disable();
3868 
3869 	ci = curcpu();
3870 	srcva = ci->vpage[VPAGE_SRC];
3871 	dstva = ci->vpage[VPAGE_DST];
3872 	srcpte = ci->vpage_pte[VPAGE_SRC];
3873 	dstpte = ci->vpage_pte[VPAGE_DST];
3874 
3875 	KASSERT(*srcpte == 0 && *dstpte == 0);
3876 
3877 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3878 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
3879 	pmap_pte_flush();
3880 	pmap_update_pg(srcva);
3881 	pmap_update_pg(dstva);
3882 
3883 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3884 
3885 #if defined(DIAGNOSTIC) || defined(XENPV)
3886 	pmap_pte_set(srcpte, 0);
3887 	pmap_pte_set(dstpte, 0);
3888 	pmap_pte_flush();
3889 #endif
3890 
3891 	kpreempt_enable();
3892 #endif /* defined(__HAVE_DIRECT_MAP) */
3893 }
3894 
3895 static pt_entry_t *
3896 pmap_map_ptp(struct vm_page *ptp)
3897 {
3898 #ifdef __HAVE_DIRECT_MAP
3899 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3900 #else
3901 	struct cpu_info *ci;
3902 	pt_entry_t *ptppte;
3903 	vaddr_t ptpva;
3904 
3905 	KASSERT(kpreempt_disabled());
3906 
3907 #ifndef XENPV
3908 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
3909 #else
3910 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
3911 #endif
3912 
3913 	ci = curcpu();
3914 	ptpva = ci->vpage[VPAGE_PTP];
3915 	ptppte = ci->vpage_pte[VPAGE_PTP];
3916 
3917 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3918 
3919 	pmap_pte_flush();
3920 	pmap_update_pg(ptpva);
3921 
3922 	return (pt_entry_t *)ptpva;
3923 #endif
3924 }
3925 
3926 static void
3927 pmap_unmap_ptp(void)
3928 {
3929 #ifndef __HAVE_DIRECT_MAP
3930 #if defined(DIAGNOSTIC) || defined(XENPV)
3931 	struct cpu_info *ci;
3932 	pt_entry_t *pte;
3933 
3934 	KASSERT(kpreempt_disabled());
3935 
3936 	ci = curcpu();
3937 	pte = ci->vpage_pte[VPAGE_PTP];
3938 
3939 	if (*pte != 0) {
3940 		pmap_pte_set(pte, 0);
3941 		pmap_pte_flush();
3942 	}
3943 #endif
3944 #endif
3945 }
3946 
3947 static pt_entry_t *
3948 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3949 {
3950 
3951 	KASSERT(kpreempt_disabled());
3952 	if (pmap_is_curpmap(pmap)) {
3953 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3954 	}
3955 	KASSERT(ptp != NULL);
3956 	return pmap_map_ptp(ptp) + pl1_pi(va);
3957 }
3958 
3959 static void
3960 pmap_unmap_pte(void)
3961 {
3962 
3963 	KASSERT(kpreempt_disabled());
3964 
3965 	pmap_unmap_ptp();
3966 }
3967 
3968 /*
3969  * p m a p   r e m o v e   f u n c t i o n s
3970  *
3971  * functions that remove mappings
3972  */
3973 
3974 /*
3975  * pmap_remove_ptes: remove PTEs from a PTP
3976  *
3977  * => caller must hold pmap's lock
3978  * => PTP must be mapped into KVA
3979  * => PTP should be null if pmap == pmap_kernel()
3980  * => must be called with kernel preemption disabled
3981  * => returns composite pte if at least one page should be shot down
3982  */
3983 static void
3984 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3985     vaddr_t startva, vaddr_t endva)
3986 {
3987 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3988 
3989 	KASSERT(mutex_owned(&pmap->pm_lock));
3990 	KASSERT(kpreempt_disabled());
3991 
3992 	/*
3993 	 * mappings are very often sparse, so clip the given range to the
3994 	 * range of PTEs that are known present in the PTP.
3995 	 */
3996 	pmap_ptp_range_clip(ptp, &startva, &pte);
3997 
3998 	/*
3999 	 * note that ptpva points to the PTE that maps startva.   this may
4000 	 * or may not be the first PTE in the PTP.
4001 	 *
4002 	 * we loop through the PTP while there are still PTEs to look at
4003 	 * and the wire_count is greater than 1 (because we use the wire_count
4004 	 * to keep track of the number of real PTEs in the PTP).
4005 	 */
4006 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4007 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4008 		startva += PAGE_SIZE;
4009 		pte++;
4010 	}
4011 }
4012 
4013 /*
4014  * pmap_remove_pte: remove a single PTE from a PTP.
4015  *
4016  * => caller must hold pmap's lock
4017  * => PTP must be mapped into KVA
4018  * => PTP should be null if pmap == pmap_kernel()
4019  * => returns true if we removed a mapping
4020  * => must be called with kernel preemption disabled
4021  */
4022 static bool
4023 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4024     vaddr_t va)
4025 {
4026 	struct pv_entry *pve;
4027 	struct vm_page *pg;
4028 	struct pmap_page *pp;
4029 	pt_entry_t opte;
4030 
4031 	KASSERT(mutex_owned(&pmap->pm_lock));
4032 	KASSERT(kpreempt_disabled());
4033 
4034 	if (!pmap_valid_entry(*pte)) {
4035 		/* VA not mapped. */
4036 		return false;
4037 	}
4038 
4039 	/* Atomically save the old PTE and zap it. */
4040 	opte = pmap_pte_testset(pte, 0);
4041 	if (!pmap_valid_entry(opte)) {
4042 		return false;
4043 	}
4044 
4045 	pmap_exec_account(pmap, va, opte, 0);
4046 	pmap_stats_update_bypte(pmap, 0, opte);
4047 
4048 	if (ptp) {
4049 		/*
4050 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4051 		 */
4052 		ptp->wire_count--;
4053 		if (ptp->wire_count <= 1) {
4054 			opte |= PTE_A;
4055 		}
4056 	}
4057 
4058 	if ((opte & PTE_A) != 0) {
4059 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4060 	}
4061 
4062 	/*
4063 	 * If we are not on a pv list - we are done.
4064 	 */
4065 	if ((opte & PTE_PVLIST) == 0) {
4066 #ifndef DOM0OPS
4067 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4068 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4069 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4070 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4071 #endif
4072 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4073 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4074 		return true;
4075 	}
4076 
4077 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4078 		pp = VM_PAGE_TO_PP(pg);
4079 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4080 		paddr_t pa = pmap_pte2pa(opte);
4081 		panic("%s: PTE_PVLIST with pv-untracked page"
4082 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4083 		    __func__, va, pa, atop(pa));
4084 	}
4085 
4086 	/* Sync R/M bits. */
4087 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4088 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4089 	return true;
4090 }
4091 
4092 static void
4093 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4094 {
4095 	pt_entry_t *ptes;
4096 	pd_entry_t pde;
4097 	pd_entry_t * const *pdes;
4098 	bool result;
4099 	vaddr_t blkendva, va = sva;
4100 	struct vm_page *ptp;
4101 	struct pmap *pmap2;
4102 	int lvl;
4103 
4104 	KASSERT(mutex_owned(&pmap->pm_lock));
4105 
4106 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4107 
4108 	/*
4109 	 * removing one page?  take shortcut function.
4110 	 */
4111 
4112 	if (va + PAGE_SIZE == eva) {
4113 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4114 			KASSERT(lvl == 1);
4115 
4116 			/* Get PTP if non-kernel mapping. */
4117 			if (pmap != pmap_kernel()) {
4118 				ptp = pmap_find_ptp(pmap, va, 1);
4119 				KASSERTMSG(ptp != NULL,
4120 				    "%s: unmanaged PTP detected", __func__);
4121 			} else {
4122 				/* Never free kernel PTPs. */
4123 				ptp = NULL;
4124 			}
4125 
4126 			result = pmap_remove_pte(pmap, ptp,
4127 			    &ptes[pl1_i(va)], va);
4128 
4129 			/*
4130 			 * if mapping removed and the PTP is no longer
4131 			 * being used, free it!
4132 			 */
4133 
4134 			if (result && ptp && ptp->wire_count <= 1)
4135 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4136 		}
4137 	} else for (/* null */ ; va < eva ; va = blkendva) {
4138 		/* determine range of block */
4139 		blkendva = x86_round_pdr(va+1);
4140 		if (blkendva > eva)
4141 			blkendva = eva;
4142 
4143 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4144 			/* Skip a range corresponding to an invalid pde. */
4145 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4146 			continue;
4147 		}
4148 		KASSERT(lvl == 1);
4149 
4150 		/* Get PTP if non-kernel mapping. */
4151 		if (pmap != pmap_kernel()) {
4152 			ptp = pmap_find_ptp(pmap, va, 1);
4153 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4154 			    __func__);
4155 		} else {
4156 			/* Never free kernel PTPs. */
4157 			ptp = NULL;
4158 		}
4159 
4160 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4161 		    blkendva);
4162 
4163 		/* If PTP is no longer being used, free it. */
4164 		if (ptp && ptp->wire_count <= 1) {
4165 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4166 		}
4167 	}
4168 	pmap_unmap_ptes(pmap, pmap2);
4169 	pmap_drain_pv(pmap);
4170 }
4171 
4172 /*
4173  * pmap_remove: mapping removal function.
4174  *
4175  * => caller should not be holding any pmap locks
4176  */
4177 void
4178 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4179 {
4180 	if (__predict_false(pmap->pm_remove != NULL)) {
4181 		(*pmap->pm_remove)(pmap, sva, eva);
4182 		return;
4183 	}
4184 
4185 	mutex_enter(&pmap->pm_lock);
4186 	pmap_remove_locked(pmap, sva, eva);
4187 	mutex_exit(&pmap->pm_lock);
4188 }
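
/*
 * Illustrative sketch, not part of the pmap implementation: a typical
 * machine-independent caller tears down a VA range with pmap_remove()
 * and then calls pmap_update() to process the deferred TLB shootdowns
 * and PTP frees.  The helper and its guard below are hypothetical and
 * only document the expected calling pattern.
 */
#ifdef PMAP_EXAMPLE_SKETCHES	/* hypothetical guard, never defined */
static void
example_unmap_range(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{

	/* The caller must not hold any pmap locks here (see above). */
	pmap_remove(pmap, sva, eva);

	/* Flush deferred invalidations before the pages are reused. */
	pmap_update(pmap);
}
#endif	/* PMAP_EXAMPLE_SKETCHES */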
4189 
4190 /*
4191  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4192  *
4193  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4194  * => Caller should disable kernel preemption.
4195  * => issues tlb shootdowns if necessary.
4196  */
4197 static int
4198 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4199     pt_entry_t *optep)
4200 {
4201 	struct pmap *pmap;
4202 	struct vm_page *ptp;
4203 	vaddr_t va;
4204 	pt_entry_t *ptep;
4205 	pt_entry_t opte;
4206 	pt_entry_t npte;
4207 	pt_entry_t expect;
4208 	bool need_shootdown;
4209 
4210 	ptp = pvpte->pte_ptp;
4211 	va = pvpte->pte_va;
4212 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4213 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4214 	pmap = ptp_to_pmap(ptp);
4215 	KASSERT(kpreempt_disabled());
4216 
4217 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4218 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4219 		    optep);
4220 	}
4221 
4222 	expect = pmap_pa2pte(pa) | PTE_P;
4223 
4224 	if (clearbits != ~0) {
4225 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4226 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4227 	}
4228 
4229 	ptep = pmap_map_pte(pmap, ptp, va);
4230 	do {
4231 		opte = *ptep;
4232 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4233 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4234 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4235 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4236 			/*
4237 			 * We lost a race with a V->P operation like
4238 			 * pmap_remove().  Wait for the competitor to finish
4239 			 * reflecting the pte bits into pp_attrs.
4240 			 */
4241 			pmap_unmap_pte();
4242 			return EAGAIN;
4243 		}
4244 
4245 		/*
4246 		 * Check if there's anything to do on this PTE.
4247 		 */
4248 		if ((opte & clearbits) == 0) {
4249 			need_shootdown = false;
4250 			break;
4251 		}
4252 
4253 		/*
4254 		 * We need a shootdown if the PTE is cached (PTE_A) ...
4255 		 * ... Unless we are clearing only the PTE_W bit and
4256 		 * it isn't cached as RW (PTE_D).
4257 		 */
4258 		need_shootdown = (opte & PTE_A) != 0 &&
4259 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4260 
4261 		npte = opte & ~clearbits;
4262 
4263 		/*
4264 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4265 		 */
4266 		if (need_shootdown) {
4267 			npte &= ~(PTE_A | PTE_D);
4268 		}
4269 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4270 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4271 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4272 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4273 
4274 	if (need_shootdown) {
4275 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4276 	}
4277 	pmap_unmap_pte();
4278 
4279 	*oattrs = pmap_pte_to_pp_attrs(opte);
4280 	if (optep != NULL)
4281 		*optep = opte;
4282 	return 0;
4283 }
4284 
4285 static void
4286 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4287     vaddr_t va)
4288 {
4289 	struct pmap *pmap2;
4290 	pt_entry_t *ptes;
4291 	pd_entry_t * const *pdes;
4292 
4293 	KASSERT(mutex_owned(&pmap->pm_lock));
4294 
4295 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4296 	pmap_stats_update_bypte(pmap, 0, opte);
4297 	ptp->wire_count--;
4298 	if (ptp->wire_count <= 1) {
4299 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4300 	}
4301 	pmap_unmap_ptes(pmap, pmap2);
4302 }
4303 
4304 static void
4305 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4306 {
4307 	struct pv_pte *pvpte;
4308 	struct vm_page *ptp;
4309 	uintptr_t sum;
4310 	uint8_t oattrs;
4311 	bool locked;
4312 
4313 	/*
4314 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4315 	 * pmap_remove_all() was called before amap_wipeout() for a process-
4316 	 * private amap - common.  The page being removed must be on the way
4317 	 * out, so we don't have to worry about concurrent attempts to enter
4318 	 * it (otherwise the caller either doesn't care or has screwed up).
4319 	 */
4320 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4321 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4322 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4323 	if (sum == 0) {
4324 		return;
4325 	}
4326 
4327 	kpreempt_disable();
4328 	for (;;) {
4329 		struct pmap *pmap;
4330 		struct pv_entry *pve;
4331 		pt_entry_t opte;
4332 		vaddr_t va;
4333 
4334 		mutex_spin_enter(&pp->pp_lock);
4335 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4336 			mutex_spin_exit(&pp->pp_lock);
4337 			break;
4338 		}
4339 
4340 		/*
4341 		 * Add a reference to the pmap before clearing the pte.
4342 		 * Otherwise the pmap can disappear behind us.
4343 		 */
4344 		ptp = pvpte->pte_ptp;
4345 		pmap = ptp_to_pmap(ptp);
4346 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4347 		if (ptp != NULL) {
4348 			pmap_reference(pmap);
4349 		}
4350 
4351 		/*
4352 		 * Now try to lock it.  We need a direct handoff between
4353 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4354 		 * and kept associated with this pmap.  If that can't be
4355 		 * had, wait for the pmap's lock to become free and then
4356 		 * retry.
4357 		 */
4358 		locked = mutex_tryenter(&pmap->pm_lock);
4359 		mutex_spin_exit(&pp->pp_lock);
4360 		if (!locked) {
4361 			mutex_enter(&pmap->pm_lock);
4362 			/* nothing, just wait for it */
4363 			mutex_exit(&pmap->pm_lock);
4364 			if (ptp != NULL) {
4365 				pmap_destroy(pmap);
4366 			}
4367 			continue;
4368 		}
4369 		va = pvpte->pte_va;
4370 
4371 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4372 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4373 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4374 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4375 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4376 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4377 
4378 #ifdef DEBUG
4379 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4380 		rb_tree_t *tree = (ptp != NULL ?
4381 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4382 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4383 		if (pve == NULL) {
4384 			KASSERTMSG(&pp->pp_pte == pvpte,
4385 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4386 			    va, pmap, ptp, pvpte, pve);
4387 		} else {
4388 			KASSERTMSG(&pve->pve_pte == pvpte,
4389 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4390 			    va, pmap, ptp, pvpte, pve);
4391 		}
4392 #endif
4393 
4394 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4395 			panic("pmap_pp_remove: mapping not present");
4396 		}
4397 
4398 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4399 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4400 
4401 		/* Update the PTP reference count. Free if last reference. */
4402 		if (ptp != NULL) {
4403 			KASSERT(pmap != pmap_kernel());
4404 			pmap_tlb_shootnow();
4405 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4406 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4407 			} else {
4408 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4409 			}
4410 		} else {
4411 			KASSERT(pmap == pmap_kernel());
4412 			pmap_stats_update_bypte(pmap, 0, opte);
4413 		}
4414 		pmap_tlb_shootnow();
4415 		pmap_drain_pv(pmap);
4416 		mutex_exit(&pmap->pm_lock);
4417 		if (ptp != NULL) {
4418 			pmap_destroy(pmap);
4419 		}
4420 	}
4421 	kpreempt_enable();
4422 }
4423 
4424 /*
4425  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4426  *
4427  * => R/M bits are sync'd back to attrs
4428  */
4429 void
4430 pmap_page_remove(struct vm_page *pg)
4431 {
4432 	struct pmap_page *pp;
4433 	paddr_t pa;
4434 
4435 	pp = VM_PAGE_TO_PP(pg);
4436 	pa = VM_PAGE_TO_PHYS(pg);
4437 	pmap_pp_remove(pp, pa);
4438 }
4439 
4440 /*
4441  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4442  * that map it
4443  */
4444 void
4445 pmap_pv_remove(paddr_t pa)
4446 {
4447 	struct pmap_page *pp;
4448 
4449 	pp = pmap_pv_tracked(pa);
4450 	if (pp == NULL)
4451 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4452 	pmap_pp_remove(pp, pa);
4453 }
4454 
4455 /*
4456  * p m a p   a t t r i b u t e  f u n c t i o n s
4457  * functions that test/change managed page's attributes
4458  * since a page can be mapped multiple times we must check each PTE that
4459  * maps it by going down the pv lists.
4460  */
4461 
4462 /*
4463  * pmap_test_attrs: test a page's attributes
4464  */
4465 bool
4466 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4467 {
4468 	struct pmap_page *pp;
4469 	struct pv_pte *pvpte;
4470 	struct pmap *pmap;
4471 	uint8_t oattrs;
4472 	u_int result;
4473 	paddr_t pa;
4474 
4475 	pp = VM_PAGE_TO_PP(pg);
4476 	if ((pp->pp_attrs & testbits) != 0) {
4477 		return true;
4478 	}
4479 	pa = VM_PAGE_TO_PHYS(pg);
4480  startover:
4481 	mutex_spin_enter(&pp->pp_lock);
4482 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4483 		if ((pp->pp_attrs & testbits) != 0) {
4484 			break;
4485 		}
4486 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4487 			/*
4488 			 * raced with a V->P operation.  wait for the other
4489 			 * side to finish by acquiring pmap's lock.  without the
4490 			 * wait, updates to pp_attrs by the other side may
4491 			 * go unseen.
4492 			 */
4493 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4494 			pmap_reference(pmap);
4495 			mutex_spin_exit(&pp->pp_lock);
4496 			mutex_enter(&pmap->pm_lock);
4497 			/* nothing. */
4498 			mutex_exit(&pmap->pm_lock);
4499 			pmap_destroy(pmap);
4500 			goto startover;
4501 		}
4502 		pp->pp_attrs |= oattrs;
4503 	}
4504 	result = pp->pp_attrs & testbits;
4505 	mutex_spin_exit(&pp->pp_lock);
4506 
4507 	/*
4508 	 * note that we exit the for loop early (with pvpte non-NULL) once
4509 	 * we have found the bits we are testing for.
4510 	 */
4511 
4512 	return result != 0;
4513 }
4514 
4515 static bool
4516 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4517 {
4518 	struct pv_pte *pvpte;
4519 	struct pmap *pmap;
4520 	uint8_t oattrs;
4521 	u_int result;
4522 
4523 startover:
4524 	mutex_spin_enter(&pp->pp_lock);
4525 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4526 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4527 			/*
4528 			 * raced with a V->P operation.  wait for the other
4529 			 * side to finish by acquiring pmap's lock.  it is
4530 			 * probably unmapping the page, and it will be gone
4531 			 * when the loop is restarted.
4532 			 */
4533 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4534 			pmap_reference(pmap);
4535 			mutex_spin_exit(&pp->pp_lock);
4536 			mutex_enter(&pmap->pm_lock);
4537 			/* nothing. */
4538 			mutex_exit(&pmap->pm_lock);
4539 			pmap_destroy(pmap);
4540 			goto startover;
4541 		}
4542 		pp->pp_attrs |= oattrs;
4543 	}
4544 	result = pp->pp_attrs & clearbits;
4545 	pp->pp_attrs &= ~clearbits;
4546 	pmap_tlb_shootnow();
4547 	mutex_spin_exit(&pp->pp_lock);
4548 
4549 	return result != 0;
4550 }
4551 
4552 /*
4553  * pmap_clear_attrs: clear the specified attribute for a page.
4554  *
4555  * => we return true if we cleared one of the bits we were asked to
4556  */
4557 bool
4558 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4559 {
4560 	struct pmap_page *pp;
4561 	paddr_t pa;
4562 
4563 	pp = VM_PAGE_TO_PP(pg);
4564 	pa = VM_PAGE_TO_PHYS(pg);
4565 
4566 	/*
4567 	 * If this is a new page, assert it has no mappings and simply zap
4568 	 * the stored attributes without taking any locks.
4569 	 */
4570 	if ((pg->flags & PG_FAKE) != 0) {
4571 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4572 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4573 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4574 		atomic_store_relaxed(&pp->pp_attrs, 0);
4575 		return false;
4576 	} else {
4577 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4578 	}
4579 }
4580 
4581 /*
4582  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4583  * pv-tracked page.
4584  */
4585 bool
4586 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4587 {
4588 	struct pmap_page *pp;
4589 
4590 	pp = pmap_pv_tracked(pa);
4591 	if (pp == NULL)
4592 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4593 
4594 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4595 }
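
/*
 * Illustrative sketch, not part of the pmap implementation: the MI
 * reference/modification interface (pmap_is_modified() and friends,
 * assumed to be thin wrappers in pmap.h) reduces to the attribute
 * helpers above.  The helper and guard below are hypothetical.
 */
#ifdef PMAP_EXAMPLE_SKETCHES	/* hypothetical guard, never defined */
static bool
example_page_was_dirtied(struct vm_page *pg)
{

	/* Sync the PTE_D-derived attribute from every mapping... */
	if (!pmap_test_attrs(pg, PP_ATTRS_D))
		return false;

	/* ...and clear it so that future writes are detected again. */
	(void)pmap_clear_attrs(pg, PP_ATTRS_D);
	return true;
}
#endif	/* PMAP_EXAMPLE_SKETCHES */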
4596 
4597 /*
4598  * p m a p   p r o t e c t i o n   f u n c t i o n s
4599  */
4600 
4601 /*
4602  * pmap_page_protect: change the protection of all recorded mappings
4603  * of a managed page
4604  *
4605  * => NOTE: this is an inline function in pmap.h
4606  */
4607 
4608 /* see pmap.h */
4609 
4610 /*
4611  * pmap_pv_protect: change the protection of all recorded mappings
4612  * of an unmanaged pv-tracked page
4613  *
4614  * => NOTE: this is an inline function in pmap.h
4615  */
4616 
4617 /* see pmap.h */
4618 
4619 /*
4620  * pmap_protect: set the protection of the pages in a pmap
4621  *
4622  * => NOTE: this is an inline function in pmap.h
4623  */
4624 
4625 /* see pmap.h */
4626 
4627 /*
4628  * pmap_write_protect: write-protect pages in a pmap.
4629  *
4630  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4631  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4632  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4633  * present the page will still be considered as a kernel page, and the privilege
4634  * separation will be enforced correctly.
4635  */
4636 void
4637 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4638 {
4639 	pt_entry_t bit_rem, bit_put;
4640 	pt_entry_t *ptes;
4641 	pt_entry_t * const *pdes;
4642 	struct pmap *pmap2;
4643 	vaddr_t blockend, va;
4644 	int lvl, i;
4645 
4646 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4647 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4648 		return;
4649 	}
4650 
4651 	bit_rem = 0;
4652 	if (!(prot & VM_PROT_WRITE))
4653 		bit_rem = PTE_W;
4654 
4655 	bit_put = 0;
4656 	if (!(prot & VM_PROT_EXECUTE))
4657 		bit_put = pmap_pg_nx;
4658 
4659 	sva &= ~PAGE_MASK;
4660 	eva &= ~PAGE_MASK;
4661 
4662 	/*
4663 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4664 	 * be touching PV entries nor stats and kernel PDEs aren't
4665 	 * freed.
4666 	 */
4667 	if (pmap != pmap_kernel()) {
4668 		mutex_enter(&pmap->pm_lock);
4669 	}
4670 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4671 
4672 	for (va = sva ; va < eva; va = blockend) {
4673 		pt_entry_t *spte, *epte;
4674 
4675 		blockend = x86_round_pdr(va + 1);
4676 		if (blockend > eva)
4677 			blockend = eva;
4678 
4679 		/* Is it a valid block? */
4680 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4681 			continue;
4682 		}
4683 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4684 		KASSERT(lvl == 1);
4685 
4686 		spte = &ptes[pl1_i(va)];
4687 		epte = &ptes[pl1_i(blockend)];
4688 
4689 		for (i = 0; spte < epte; spte++, i++) {
4690 			pt_entry_t opte, npte;
4691 
4692 			do {
4693 				opte = *spte;
4694 				if (!pmap_valid_entry(opte)) {
4695 					goto next;
4696 				}
4697 				npte = (opte & ~bit_rem) | bit_put;
4698 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4699 
4700 			if ((opte & PTE_D) != 0) {
4701 				vaddr_t tva = va + x86_ptob(i);
4702 				pmap_tlb_shootdown(pmap, tva, opte,
4703 				    TLBSHOOT_WRITE_PROTECT);
4704 			}
4705 next:;
4706 		}
4707 	}
4708 
4709 	/* Release pmap. */
4710 	pmap_unmap_ptes(pmap, pmap2);
4711 	if (pmap != pmap_kernel()) {
4712 		mutex_exit(&pmap->pm_lock);
4713 	}
4714 }
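
/*
 * Illustrative sketch, not part of the pmap implementation:
 * write-protecting a page-aligned kernel range (for example after
 * read-only data has been finalized) and flushing the shootdowns
 * queued above.  The helper and guard below are hypothetical.
 */
#ifdef PMAP_EXAMPLE_SKETCHES	/* hypothetical guard, never defined */
static void
example_protect_rodata(vaddr_t start, vaddr_t end)
{

	/* Strip PTE_W from every present PTE in [start, end). */
	pmap_write_protect(pmap_kernel(), start, end,
	    VM_PROT_READ | VM_PROT_EXECUTE);

	/* Make the change visible on all CPUs. */
	pmap_update(pmap_kernel());
}
#endif	/* PMAP_EXAMPLE_SKETCHES */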
4715 
4716 /*
4717  * pmap_unwire: clear the wired bit in the PTE.
4718  *
4719  * => Mapping should already be present.
4720  */
4721 void
4722 pmap_unwire(struct pmap *pmap, vaddr_t va)
4723 {
4724 	pt_entry_t *ptes, *ptep, opte;
4725 	pd_entry_t * const *pdes;
4726 	struct pmap *pmap2;
4727 	int lvl;
4728 
4729 	if (__predict_false(pmap->pm_unwire != NULL)) {
4730 		(*pmap->pm_unwire)(pmap, va);
4731 		return;
4732 	}
4733 
4734 	/*
4735 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4736 	 * statistics.
4737 	 */
4738 	mutex_enter(&pmap->pm_lock);
4739 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4740 
4741 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4742 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4743 	}
4744 	KASSERT(lvl == 1);
4745 
4746 	ptep = &ptes[pl1_i(va)];
4747 	opte = *ptep;
4748 	KASSERT(pmap_valid_entry(opte));
4749 
4750 	if (opte & PTE_WIRED) {
4751 		pt_entry_t npte = opte & ~PTE_WIRED;
4752 
4753 		opte = pmap_pte_testset(ptep, npte);
4754 		pmap_stats_update_bypte(pmap, npte, opte);
4755 	} else {
4756 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4757 		    " did not change!\n", __func__, pmap, va);
4758 	}
4759 
4760 	/* Release pmap. */
4761 	pmap_unmap_ptes(pmap, pmap2);
4762 	mutex_exit(&pmap->pm_lock);
4763 }
4764 
4765 /*
4766  * pmap_copy: copy mappings from one pmap to another
4767  *
4768  * => optional function
4769  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4770  */
4771 
4772 /*
4773  * defined as macro in pmap.h
4774  */
4775 
4776 __strict_weak_alias(pmap_enter, pmap_enter_default);
4777 
4778 int
4779 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4780     u_int flags)
4781 {
4782 	if (__predict_false(pmap->pm_enter != NULL)) {
4783 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4784 	}
4785 
4786 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4787 }
4788 
4789 /*
4790  * pmap_enter: enter a mapping into a pmap
4791  *
4792  * => must be done "now" ... no lazy-evaluation
4793  */
4794 int
4795 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4796 	   vm_prot_t prot, u_int flags, int domid)
4797 {
4798 	pt_entry_t *ptes, opte, npte;
4799 	pt_entry_t *ptep;
4800 	pd_entry_t * const *pdes;
4801 	struct vm_page *ptp;
4802 	struct vm_page *new_pg, *old_pg;
4803 	struct pmap_page *new_pp, *old_pp;
4804 	struct pv_entry *old_pve, *new_pve;
4805 	bool wired = (flags & PMAP_WIRED) != 0;
4806 	struct pmap *pmap2;
4807 	struct pmap_ptparray pt;
4808 	int error;
4809 	bool getptp, samepage, new_embedded;
4810 	rb_tree_t *tree;
4811 
4812 	KASSERT(pmap_initialized);
4813 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4814 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4815 	    PRIxVADDR " over PDP!", __func__, va);
4816 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4817 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4818 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4819 
4820 #ifdef XENPV
4821 	KASSERT(domid == DOMID_SELF || pa == 0);
4822 #endif
4823 
4824 	npte = ma | protection_codes[prot] | PTE_P;
4825 	npte |= pmap_pat_flags(flags);
4826 	if (wired)
4827 	        npte |= PTE_WIRED;
4828 	if (va < VM_MAXUSER_ADDRESS)
4829 		npte |= PTE_U;
4830 
4831 	if (pmap == pmap_kernel())
4832 		npte |= pmap_pg_g;
4833 	if (flags & VM_PROT_ALL) {
4834 		npte |= PTE_A;
4835 		if (flags & VM_PROT_WRITE) {
4836 			KASSERT((npte & PTE_W) != 0);
4837 			npte |= PTE_D;
4838 		}
4839 	}
4840 
4841 #ifdef XENPV
4842 	if (domid != DOMID_SELF)
4843 		new_pg = NULL;
4844 	else
4845 #endif
4846 		new_pg = PHYS_TO_VM_PAGE(pa);
4847 
4848 	if (new_pg != NULL) {
4849 		/* This is a managed page */
4850 		npte |= PTE_PVLIST;
4851 		new_pp = VM_PAGE_TO_PP(new_pg);
4852 		PMAP_CHECK_PP(new_pp);
4853 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4854 		/* This is an unmanaged pv-tracked page */
4855 		npte |= PTE_PVLIST;
4856 		PMAP_CHECK_PP(new_pp);
4857 	} else {
4858 		new_pp = NULL;
4859 	}
4860 
4861 	/* Begin by locking the pmap. */
4862 	mutex_enter(&pmap->pm_lock);
4863 
4864 	/* Look up the PTP.  Allocate if none present. */
4865 	ptp = NULL;
4866 	getptp = false;
4867 	if (pmap != pmap_kernel()) {
4868 		ptp = pmap_find_ptp(pmap, va, 1);
4869 		if (ptp == NULL) {
4870 			getptp = true;
4871 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
4872 			if (error != 0) {
4873 				if (flags & PMAP_CANFAIL) {
4874 					mutex_exit(&pmap->pm_lock);
4875 					return error;
4876 				}
4877 				panic("%s: get ptp failed, error=%d", __func__,
4878 				    error);
4879 			}
4880 		}
4881 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
4882 	} else {
4883 		/* Embedded PV entries rely on this. */
4884 		KASSERT(va != 0);
4885 		tree = &pmap_kernel_rb;
4886 	}
4887 
4888 	/*
4889 	 * Look up the old PV entry at this VA (if any), and insert a new PV
4890 	 * entry if required for the new mapping.  Temporarily track the old
4891 	 * and new mappings concurrently.  Only after the old mapping is
4892 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
4893 	 * our picture of modified/accessed state for either page could get
4894 	 * out of sync (we need any P->V operation for either page to stall
4895 	 * on pmap->pm_lock until done here).
4896 	 */
4897 	new_pve = NULL;
4898 	old_pve = NULL;
4899 	samepage = false;
4900 	new_embedded = false;
4901 
4902 	if (new_pp != NULL) {
4903 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
4904 		    &old_pve, &samepage, &new_embedded, tree);
4905 
4906 		/*
4907 		 * If a new pv_entry was needed and none was available, we
4908 		 * can go no further.
4909 		 */
4910 		if (error != 0) {
4911 			if (flags & PMAP_CANFAIL) {
4912 				if (getptp) {
4913 					pmap_unget_ptp(pmap, &pt);
4914 				}
4915 				mutex_exit(&pmap->pm_lock);
4916 				return error;
4917 			}
4918 			panic("%s: alloc pve failed", __func__);
4919 		}
4920 	} else {
4921 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4922 	}
4923 
4924 	/* Map PTEs into address space. */
4925 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4926 
4927 	/* Install any newly allocated PTPs. */
4928 	if (getptp) {
4929 		pmap_install_ptp(pmap, &pt, va, pdes);
4930 	}
4931 
4932 	/* Check if there is an existing mapping. */
4933 	ptep = &ptes[pl1_i(va)];
4934 	opte = *ptep;
4935 	bool have_oldpa = pmap_valid_entry(opte);
4936 	paddr_t oldpa = pmap_pte2pa(opte);
4937 
4938 	/*
4939 	 * Update the pte.
4940 	 */
4941 	do {
4942 		opte = *ptep;
4943 
4944 		/*
4945 		 * if the same page, inherit PTE_A and PTE_D.
4946 		 */
4947 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4948 			npte |= opte & (PTE_A | PTE_D);
4949 		}
4950 #if defined(XENPV)
4951 		if (domid != DOMID_SELF) {
4952 			/* pmap_pte_cas with error handling */
4953 			int s = splvm();
4954 			if (opte != *ptep) {
4955 				splx(s);
4956 				continue;
4957 			}
4958 			error = xpq_update_foreign(
4959 			    vtomach((vaddr_t)ptep), npte, domid, flags);
4960 			splx(s);
4961 			if (error) {
4962 				/* Undo pv_entry tracking - oof. */
4963 				if (new_pp != NULL) {
4964 					mutex_spin_enter(&new_pp->pp_lock);
4965 					if (new_pve != NULL) {
4966 						LIST_REMOVE(new_pve, pve_list);
4967 						KASSERT(pmap->pm_pve == NULL);
4968 						pmap->pm_pve = new_pve;
4969 					} else if (new_embedded) {
4970 						new_pp->pp_pte.pte_ptp = NULL;
4971 						new_pp->pp_pte.pte_va = 0;
4972 					}
4973 					mutex_spin_exit(&new_pp->pp_lock);
4974 				}
4975 				pmap_unmap_ptes(pmap, pmap2);
4976 				/* Free new PTP. */
4977 				if (ptp != NULL && ptp->wire_count <= 1) {
4978 					pmap_free_ptp(pmap, ptp, va, ptes,
4979 					    pdes);
4980 				}
4981 				mutex_exit(&pmap->pm_lock);
4982 				return error;
4983 			}
4984 			break;
4985 		}
4986 #endif /* defined(XENPV) */
4987 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4988 
4989 	/*
4990 	 * Done with the PTEs: they can now be unmapped.
4991 	 */
4992 	pmap_unmap_ptes(pmap, pmap2);
4993 
4994 	/*
4995 	 * Update statistics and PTP's reference count.
4996 	 */
4997 	pmap_stats_update_bypte(pmap, npte, opte);
4998 	if (ptp != NULL) {
4999 		if (!have_oldpa) {
5000 			ptp->wire_count++;
5001 		}
5002 		/* Remember minimum VA in PTP. */
5003 		pmap_ptp_range_set(ptp, va);
5004 	}
5005 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5006 
5007 	/*
5008 	 * If the same page, we can skip pv_entry handling.
5009 	 */
5010 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5011 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5012 		if ((npte & PTE_PVLIST) != 0) {
5013 			KASSERT(samepage);
5014 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5015 		}
5016 		goto same_pa;
5017 	} else if ((npte & PTE_PVLIST) != 0) {
5018 		KASSERT(!samepage);
5019 	}
5020 
5021 	/*
5022 	 * If old page is pv-tracked, remove pv_entry from its list.
5023 	 */
5024 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5025 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5026 			old_pp = VM_PAGE_TO_PP(old_pg);
5027 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5028 			panic("%s: PTE_PVLIST with pv-untracked page"
5029 			    " va = %#"PRIxVADDR
5030 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5031 			    __func__, va, oldpa, atop(pa));
5032 		}
5033 
5034 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5035 		    pmap_pte_to_pp_attrs(opte));
5036 	} else {
5037 		KASSERT(old_pve == NULL);
5038 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5039 	}
5040 
5041 	/*
5042 	 * If new page is dynamically PV tracked, insert to tree.
5043 	 */
5044 	if (new_pve != NULL) {
5045 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5046 		old_pve = rb_tree_insert_node(tree, new_pve);
5047 		KASSERT(old_pve == new_pve);
5048 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5049 	}
5050 
5051 same_pa:
5052 	/*
5053 	 * shootdown tlb if necessary.
5054 	 */
5055 
5056 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5057 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5058 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5059 	}
5060 	pmap_drain_pv(pmap);
5061 	mutex_exit(&pmap->pm_lock);
5062 	return 0;
5063 }
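
/*
 * Illustrative sketch, not part of the pmap implementation: how a
 * machine-independent caller is expected to honour the PMAP_CANFAIL
 * contract implemented above -- on resource shortage it waits for the
 * pagedaemon and retries instead of panicking.  The helper, its guard
 * and the exact flag usage below are hypothetical.
 */
#ifdef PMAP_EXAMPLE_SKETCHES	/* hypothetical guard, never defined */
static void
example_enter_canfail(struct pmap *pmap, vaddr_t va, paddr_t pa,
    vm_prot_t prot)
{

	while (pmap_enter(pmap, va, pa, prot, prot | PMAP_CANFAIL) != 0) {
		/* Out of KVA or pv entries: wait for memory and retry. */
		uvm_wait("pmapent");
	}
}
#endif	/* PMAP_EXAMPLE_SKETCHES */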
5064 
5065 #if defined(XEN) && defined(DOM0OPS)
5066 
5067 struct pmap_data_gnt {
5068 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5069 	vaddr_t pd_gnt_sva;
5070 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5071 	int pd_gnt_refs; /* ref counter */
5072 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5073 };
5074 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5075 
5076 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5077 
5078 static struct pmap_data_gnt *
5079 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5080 {
5081 	struct pmap_data_gnt_head *headp;
5082 	struct pmap_data_gnt *pgnt;
5083 
5084 	KASSERT(mutex_owned(&pmap->pm_lock));
5085 	headp = pmap->pm_data;
5086 	KASSERT(headp != NULL);
5087 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5088 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5089 			return pgnt;
5090 		/* check that we're not overlapping part of a region */
5091 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5092 	}
5093 	return NULL;
5094 }
5095 
5096 static void
5097 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5098     const struct gnttab_map_grant_ref *ops)
5099 {
5100 	struct pmap_data_gnt_head *headp;
5101 	struct pmap_data_gnt *pgnt;
5102 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5103 	KASSERT(mutex_owned(&pmap->pm_lock));
5104 	KASSERT(nentries >= 1);
5105 	if (pmap->pm_remove == NULL) {
5106 		pmap->pm_remove = pmap_remove_gnt;
5107 		KASSERT(pmap->pm_data == NULL);
5108 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5109 		SLIST_INIT(headp);
5110 		pmap->pm_data = headp;
5111 	} else {
5112 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5113 		KASSERT(pmap->pm_data != NULL);
5114 		headp = pmap->pm_data;
5115 	}
5116 
5117 	pgnt = pmap_find_gnt(pmap, sva, eva);
5118 	if (pgnt != NULL) {
5119 		KASSERT(pgnt->pd_gnt_sva == sva);
5120 		KASSERT(pgnt->pd_gnt_eva == eva);
5121 		return;
5122 	}
5123 
5124 	/* new entry */
5125 	pgnt = kmem_alloc(sizeof(*pgnt) +
5126 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5127 	pgnt->pd_gnt_sva = sva;
5128 	pgnt->pd_gnt_eva = eva;
5129 	pgnt->pd_gnt_refs = 0;
5130 	memcpy(pgnt->pd_gnt_ops, ops,
5131 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5132 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5133 }
5134 
5135 static void
5136 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5137 {
5138 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5139 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5140 	KASSERT(nentries >= 1);
5141 	KASSERT(mutex_owned(&pmap->pm_lock));
5142 	KASSERT(pgnt->pd_gnt_refs == 0);
5143 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5144 	kmem_free(pgnt, sizeof(*pgnt) +
5145 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5146 	if (SLIST_EMPTY(headp)) {
5147 		kmem_free(headp, sizeof(*headp));
5148 		pmap->pm_data = NULL;
5149 		pmap->pm_remove = NULL;
5150 	}
5151 }
5152 
5153 /*
5154  * pmap_enter_gnt: enter a grant entry into a pmap
5155  *
5156  * => must be done "now" ... no lazy-evaluation
5157  */
5158 int
5159 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5160     const struct gnttab_map_grant_ref *oops)
5161 {
5162 	struct pmap_data_gnt *pgnt;
5163 	pt_entry_t *ptes, opte;
5164 	pt_entry_t *ptep;
5165 	pd_entry_t * const *pdes;
5166 	struct vm_page *ptp;
5167 	struct vm_page *old_pg;
5168 	struct pmap_page *old_pp;
5169 	struct pv_entry *old_pve;
5170 	struct pmap *pmap2;
5171 	struct pmap_ptparray pt;
5172 	int error;
5173 	bool getptp;
5174 	rb_tree_t *tree;
5175 	struct gnttab_map_grant_ref *op;
5176 	int ret;
5177 	int idx;
5178 
5179 	KASSERT(pmap_initialized);
5180 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5181 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5182 	    PRIxVADDR " over PDP!", __func__, va);
5183 	KASSERT(pmap != pmap_kernel());
5184 
5185 	/* Begin by locking the pmap. */
5186 	mutex_enter(&pmap->pm_lock);
5187 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5188 
5189 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5190 	KASSERT(pgnt != NULL);
5191 
5192 	/* Look up the PTP.  Allocate if none present. */
5193 	ptp = NULL;
5194 	getptp = false;
5195 	ptp = pmap_find_ptp(pmap, va, 1);
5196 	if (ptp == NULL) {
5197 		getptp = true;
5198 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5199 		if (error != 0) {
5200 			mutex_exit(&pmap->pm_lock);
5201 			return error;
5202 		}
5203 	}
5204 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5205 
5206 	/*
5207 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5208 	 * entry if required for the new mapping.  Temporarily track the old
5209 	 * and new mappings concurrently.  Only after the old mapping is
5210 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5211 	 * our picture of modified/accessed state for either page could get
5212 	 * out of sync (we need any P->V operation for either page to stall
5213 	 * on pmap->pm_lock until done here).
5214 	 */
5215 	old_pve = NULL;
5216 
5217 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5218 
5219 	/* Map PTEs into address space. */
5220 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5221 
5222 	/* Install any newly allocated PTPs. */
5223 	if (getptp) {
5224 		pmap_install_ptp(pmap, &pt, va, pdes);
5225 	}
5226 
5227 	/* Check if there is an existing mapping. */
5228 	ptep = &ptes[pl1_i(va)];
5229 	opte = *ptep;
5230 	bool have_oldpa = pmap_valid_entry(opte);
5231 	paddr_t oldpa = pmap_pte2pa(opte);
5232 
5233 	/*
5234 	 * Update the pte.
5235 	 */
5236 
5237 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5238 	op = &pgnt->pd_gnt_ops[idx];
5239 
5240 #ifdef XENPV /* XXX */
5241 	op->host_addr = xpmap_ptetomach(ptep);
5242 #endif
5243 	op->dev_bus_addr = 0;
5244 	op->status = GNTST_general_error;
5245 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5246 	if (__predict_false(ret)) {
5247 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5248 		    __func__, ret);
5249 		op->status = GNTST_general_error;
5250 	}
5251 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5252 		kpause("gntmap", false, mstohz(1), NULL);
5253 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5254 		if (__predict_false(ret)) {
5255 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5256 			    __func__, ret);
5257 			op->status = GNTST_general_error;
5258 		}
5259 	}
5260 	if (__predict_false(op->status != GNTST_okay)) {
5261 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5262 		    __func__, op->status);
5263 		if (have_oldpa) {
5264 			ptp->wire_count--;
5265 		}
5266 	} else {
5267 		pgnt->pd_gnt_refs++;
5268 		if (!have_oldpa) {
5269 			ptp->wire_count++;
5270 		}
5271 		KASSERT(ptp->wire_count > 1);
5272 		/* Remember minimum VA in PTP. */
5273 		pmap_ptp_range_set(ptp, va);
5274 	}
5275 	if (ptp->wire_count <= 1)
5276 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5277 
5278 	/*
5279 	 * Done with the PTEs: they can now be unmapped.
5280 	 */
5281 	pmap_unmap_ptes(pmap, pmap2);
5282 
5283 	/*
5284 	 * Update statistics and PTP's reference count.
5285 	 */
5286 	pmap_stats_update_bypte(pmap, 0, opte);
5287 
5288 	/*
5289 	 * If old page is pv-tracked, remove pv_entry from its list.
5290 	 */
5291 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5292 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5293 			old_pp = VM_PAGE_TO_PP(old_pg);
5294 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5295 			panic("%s: PTE_PVLIST with pv-untracked page"
5296 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5297 			    __func__, va, oldpa);
5298 		}
5299 
5300 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5301 		    pmap_pte_to_pp_attrs(opte));
5302 	} else {
5303 		KASSERT(old_pve == NULL);
5304 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5305 	}
5306 
5307 	pmap_drain_pv(pmap);
5308 	mutex_exit(&pmap->pm_lock);
5309 	return op->status;
5310 }
5311 
5312 /*
5313  * pmap_remove_gnt: grant mapping removal function.
5314  *
5315  * => caller should not be holding any pmap locks
5316  */
5317 static void
5318 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5319 {
5320 	struct pmap_data_gnt *pgnt;
5321 	pt_entry_t *ptes;
5322 	pd_entry_t pde;
5323 	pd_entry_t * const *pdes;
5324 	struct vm_page *ptp;
5325 	struct pmap *pmap2;
5326 	vaddr_t va;
5327 	int lvl;
5328 	int idx;
5329 	struct gnttab_map_grant_ref *op;
5330 	struct gnttab_unmap_grant_ref unmap_op;
5331 	int ret;
5332 
5333 	KASSERT(pmap != pmap_kernel());
5334 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5335 
5336 	mutex_enter(&pmap->pm_lock);
5337 	for (va = sva; va < eva; va += PAGE_SIZE) {
5338 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5339 		if (pgnt == NULL) {
5340 			pmap_remove_locked(pmap, sva, eva);
5341 			continue;
5342 		}
5343 
5344 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5345 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5346 			panic("pmap_remove_gnt pdes not valid");
5347 		}
5348 
5349 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5350 		op = &pgnt->pd_gnt_ops[idx];
5351 		KASSERT(lvl == 1);
5352 		KASSERT(op->status == GNTST_okay);
5353 
5354 		/* Get PTP if non-kernel mapping. */
5355 		ptp = pmap_find_ptp(pmap, va, 1);
5356 		KASSERTMSG(ptp != NULL,
5357 		    "%s: unmanaged PTP detected", __func__);
5358 
5359 		if (op->status == GNTST_okay)  {
5360 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5361 			unmap_op.handle = op->handle;
5362 			unmap_op.dev_bus_addr = 0;
5363 #ifdef XENPV /* XXX */
5364 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5365 #endif
5366 			ret = HYPERVISOR_grant_table_op(
5367 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5368 			if (ret) {
5369 				printf("%s: GNTTABOP_unmap_grant_ref "
5370 				    "failed: %d\n", __func__, ret);
5371 			}
5372 
5373 			ptp->wire_count--;
5374 			pgnt->pd_gnt_refs--;
5375 			if (pgnt->pd_gnt_refs == 0) {
5376 				pmap_free_gnt(pmap, pgnt);
5377 			}
5378 		}
5379 		/*
5380 		 * if mapping removed and the PTP is no longer
5381 		 * being used, free it!
5382 		 */
5383 
5384 		if (ptp->wire_count <= 1)
5385 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5386 		pmap_unmap_ptes(pmap, pmap2);
5387 	}
5388 	mutex_exit(&pmap->pm_lock);
5389 }
5390 #endif /* XEN && DOM0OPS */
5391 
5392 paddr_t
5393 pmap_get_physpage(void)
5394 {
5395 	struct vm_page *ptp;
5396 	struct pmap *kpm = pmap_kernel();
5397 	paddr_t pa;
5398 
5399 	if (!uvm.page_init_done) {
5400 		/*
5401 		 * We're growing the kernel pmap early (from
5402 		 * uvm_pageboot_alloc()). This case must be
5403 		 * handled a little differently.
5404 		 */
5405 
5406 		if (!uvm_page_physget(&pa))
5407 			panic("%s: out of memory", __func__);
5408 #if defined(__HAVE_DIRECT_MAP)
5409 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5410 #else
5411 #if defined(XENPV)
5412 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5413 			xen_pagezero(pa);
5414 			return pa;
5415 		}
5416 #endif
5417 		kpreempt_disable();
5418 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5419 		    PTE_W | pmap_pg_nx);
5420 		pmap_pte_flush();
5421 		pmap_update_pg((vaddr_t)early_zerop);
5422 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5423 #if defined(DIAGNOSTIC) || defined(XENPV)
5424 		pmap_pte_set(early_zero_pte, 0);
5425 		pmap_pte_flush();
5426 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5427 		kpreempt_enable();
5428 #endif /* defined(__HAVE_DIRECT_MAP) */
5429 	} else {
5430 		/* XXX */
5431 		ptp = uvm_pagealloc(NULL, 0, NULL,
5432 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5433 		if (ptp == NULL)
5434 			panic("%s: out of memory", __func__);
5435 		ptp->flags &= ~PG_BUSY;
5436 		ptp->wire_count = 1;
5437 		pa = VM_PAGE_TO_PHYS(ptp);
5438 	}
5439 	pmap_stats_update(kpm, 1, 0);
5440 
5441 	return pa;
5442 }
5443 
5444 /*
5445  * Expand the page tree with the specified amount of PTPs, mapping virtual
5446  * addresses starting at kva. We populate all the levels but the last one
5447  * (L1). The nodes of the tree are created as RW, but the pages covered
5448  * will be kentered in L1, with proper permissions.
5449  *
5450  * Used only by pmap_growkernel.
5451  */
5452 static void
5453 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5454 {
5455 	unsigned long i;
5456 	paddr_t pa;
5457 	unsigned long index, endindex;
5458 	int level;
5459 	pd_entry_t *pdep;
5460 #ifdef XENPV
5461 	int s = splvm(); /* protect xpq_* */
5462 #endif
5463 
5464 	for (level = PTP_LEVELS; level > 1; level--) {
5465 		if (level == PTP_LEVELS)
5466 			pdep = cpm->pm_pdir;
5467 		else
5468 			pdep = normal_pdes[level - 2];
5469 		index = pl_i_roundup(kva, level);
5470 		endindex = index + needed_ptps[level - 1] - 1;
5471 
5472 		for (i = index; i <= endindex; i++) {
5473 			pt_entry_t pte;
5474 
5475 			KASSERT(!pmap_valid_entry(pdep[i]));
5476 			pa = pmap_get_physpage();
5477 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5478 #ifdef __x86_64__
5479 			pte |= pmap_pg_nx;
5480 #endif
5481 			pmap_pte_set(&pdep[i], pte);
5482 
5483 #ifdef XENPV
5484 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5485 				if (__predict_true(
5486 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5487 					/* update per-cpu PMDs on all cpus */
5488 					xen_kpm_sync(pmap_kernel(), i);
5489 				} else {
5490 					/*
5491 					 * too early; update primary CPU
5492 					 * PMD only (without locks)
5493 					 */
5494 #ifdef __x86_64__
5495 					pd_entry_t *cpu_pdep =
5496 						&cpu_info_primary.ci_kpm_pdir[i];
5497 #else
5498 					pd_entry_t *cpu_pdep =
5499 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5500 #endif
5501 					pmap_pte_set(cpu_pdep, pte);
5502 				}
5503 			}
5504 #endif
5505 
5506 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5507 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5508 			nkptp[level - 1]++;
5509 		}
5510 		pmap_pte_flush();
5511 	}
5512 #ifdef XENPV
5513 	splx(s);
5514 #endif
5515 }
5516 
5517 /*
5518  * pmap_growkernel: increase usage of KVM space.
5519  *
5520  * => we allocate new PTPs for the kernel and install them in all
5521  *    the pmaps on the system.
5522  */
5523 vaddr_t
5524 pmap_growkernel(vaddr_t maxkvaddr)
5525 {
5526 	struct pmap *kpm = pmap_kernel();
5527 	struct pmap *cpm;
5528 #if !defined(XENPV) || !defined(__x86_64__)
5529 	struct pmap *pm;
5530 	long old;
5531 #endif
5532 	int s, i;
5533 	long needed_kptp[PTP_LEVELS], target_nptp;
5534 	bool invalidate = false;
5535 
5536 	s = splvm();	/* to be safe */
5537 	mutex_enter(&kpm->pm_lock);
5538 
5539 	if (maxkvaddr <= pmap_maxkvaddr) {
5540 		mutex_exit(&kpm->pm_lock);
5541 		splx(s);
5542 		return pmap_maxkvaddr;
5543 	}
5544 
5545 	maxkvaddr = x86_round_pdr(maxkvaddr);
5546 #if !defined(XENPV) || !defined(__x86_64__)
5547 	old = nkptp[PTP_LEVELS - 1];
5548 #endif
5549 
5550 	/* Initialize needed_kptp. */
5551 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5552 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5553 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5554 
5555 		if (target_nptp > nkptpmax[i])
5556 			panic("out of KVA space");
5557 		KASSERT(target_nptp >= nkptp[i]);
5558 		needed_kptp[i] = target_nptp - nkptp[i];
5559 	}
5560 
5561 #ifdef XENPV
5562 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5563 	cpm = kpm;
5564 #else
5565 	/* Get the current pmap */
5566 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5567 		cpm = curcpu()->ci_pmap;
5568 	} else {
5569 		cpm = kpm;
5570 	}
5571 #endif
5572 
5573 	kasan_shadow_map((void *)pmap_maxkvaddr,
5574 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5575 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5576 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5577 
5578 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5579 
5580 	/*
5581 	 * If the number of top level entries changed, update all pmaps.
5582 	 */
5583 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5584 #ifdef XENPV
5585 #ifdef __x86_64__
5586 		/* nothing, kernel entries are never entered in user pmap */
5587 #else
5588 		int pdkidx;
5589 
5590 		mutex_enter(&pmaps_lock);
5591 		LIST_FOREACH(pm, &pmaps, pm_list) {
5592 			for (pdkidx = PDIR_SLOT_KERN + old;
5593 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5594 			    pdkidx++) {
5595 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5596 				    kpm->pm_pdir[pdkidx]);
5597 			}
5598 			pmap_pte_flush();
5599 		}
5600 		mutex_exit(&pmaps_lock);
5601 #endif /* __x86_64__ */
5602 #else /* XENPV */
5603 		size_t newpdes;
5604 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5605 		if (cpm != kpm) {
5606 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5607 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5608 			    newpdes * sizeof(pd_entry_t));
5609 		}
5610 
5611 		mutex_enter(&pmaps_lock);
5612 		LIST_FOREACH(pm, &pmaps, pm_list) {
5613 			if (__predict_false(pm->pm_enter != NULL)) {
5614 				/*
5615 				 * Not a native pmap, the kernel is not mapped,
5616 				 * so nothing to synchronize.
5617 				 */
5618 				continue;
5619 			}
5620 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5621 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5622 			    newpdes * sizeof(pd_entry_t));
5623 		}
5624 		mutex_exit(&pmaps_lock);
5625 #endif
5626 		invalidate = true;
5627 	}
5628 	pmap_maxkvaddr = maxkvaddr;
5629 	mutex_exit(&kpm->pm_lock);
5630 	splx(s);
5631 
5632 	if (invalidate && pmap_initialized) {
5633 		/* Invalidate the pmap cache. */
5634 		pool_cache_invalidate(&pmap_cache);
5635 	}
5636 
5637 	return maxkvaddr;
5638 }
5639 
5640 #ifdef DEBUG
5641 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5642 
5643 /*
5644  * pmap_dump: dump all the mappings from a pmap
5645  *
5646  * => caller should not be holding any pmap locks
5647  */
5648 void
5649 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5650 {
5651 	pt_entry_t *ptes, *pte;
5652 	pd_entry_t * const *pdes;
5653 	struct pmap *pmap2;
5654 	vaddr_t blkendva;
5655 	int lvl;
5656 
5657 	/*
5658 	 * if end is out of range truncate.
5659 	 * if (end == start) update to max.
5660 	 */
5661 
5662 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5663 		eva = VM_MAXUSER_ADDRESS;
5664 
5665 	mutex_enter(&pmap->pm_lock);
5666 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5667 
5668 	/*
5669 	 * dumping a range of pages: we dump in PTP-sized blocks (2MB or 4MB)
5670 	 */
5671 
5672 	for (/* null */ ; sva < eva ; sva = blkendva) {
5673 
5674 		/* determine range of block */
5675 		blkendva = x86_round_pdr(sva+1);
5676 		if (blkendva > eva)
5677 			blkendva = eva;
5678 
5679 		/* valid block? */
5680 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5681 			continue;
5682 		KASSERT(lvl == 1);
5683 
5684 		pte = &ptes[pl1_i(sva)];
5685 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5686 			if (!pmap_valid_entry(*pte))
5687 				continue;
5688 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5689 			    " (pte=%#" PRIxPADDR ")\n",
5690 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5691 		}
5692 	}
5693 	pmap_unmap_ptes(pmap, pmap2);
5694 	mutex_exit(&pmap->pm_lock);
5695 }
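
/*
 * Usage sketch (illustrative, DEBUG kernels only): dump every user
 * mapping of the current process.  Passing eva <= sva selects the whole
 * user address space, as handled at the top of pmap_dump():
 *
 *	pmap_dump(vm_map_pmap(&curproc->p_vmspace->vm_map), 0, 0);
 *
 * The caller must not hold any pmap locks.
 */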
5696 #endif
5697 
5698 /*
5699  * pmap_update: process deferred invalidations and frees.
5700  */
5701 void
5702 pmap_update(struct pmap *pmap)
5703 {
5704 	struct pmap_page *pp;
5705 	struct vm_page *ptp;
5706 
5707 	/*
5708 	 * Initiate any pending TLB shootdowns.  Wait for them to
5709 	 * complete before returning control to the caller.
5710 	 */
5711 	kpreempt_disable();
5712 	pmap_tlb_shootnow();
5713 	kpreempt_enable();
5714 
5715 	/*
5716 	 * Now that shootdowns are complete, process deferred frees.  This
5717 	 * is an unlocked check, but is safe as we're only interested in
5718 	 * work done in this LWP - we won't get a false negative.
5719 	 */
5720 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5721 		return;
5722 	}
5723 
5724 	mutex_enter(&pmap->pm_lock);
5725 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5726 		KASSERT(ptp->wire_count == 0);
5727 		KASSERT(ptp->uanon == NULL);
5728 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5729 		pp = VM_PAGE_TO_PP(ptp);
5730 		LIST_INIT(&pp->pp_pvlist);
5731 		pp->pp_attrs = 0;
5732 		pp->pp_pte.pte_ptp = NULL;
5733 		pp->pp_pte.pte_va = 0;
5734 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5735 
5736 		/*
5737 		 * XXX Hack to avoid extra locking, and lock
5738 		 * assertions in uvm_pagefree().  Despite uobject
5739 		 * being set, this isn't a managed page.
5740 		 */
5741 		PMAP_DUMMY_LOCK(pmap);
5742 		uvm_pagerealloc(ptp, NULL, 0);
5743 		PMAP_DUMMY_UNLOCK(pmap);
5744 		uvm_pagefree(ptp);
5745 	}
5746 	mutex_exit(&pmap->pm_lock);
5747 }
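
/*
 * Usage sketch: pmap_update() is called once a batch of pmap operations
 * is complete, to process deferred TLB shootdowns and page frees.  The
 * temporary page table setup below follows exactly this pattern:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	...
 *	pmap_update(pmap_kernel());
 */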
5748 
5749 #if PTP_LEVELS > 4
5750 #error "Unsupported number of page table levels"
5751 #endif
5752 
5753 paddr_t
5754 pmap_init_tmp_pgtbl(paddr_t pg)
5755 {
5756 	static bool maps_loaded;
5757 	static const paddr_t x86_tmp_pml_paddr[] = {
5758 	    4 * PAGE_SIZE,	/* L1 */
5759 	    5 * PAGE_SIZE,	/* L2 */
5760 	    6 * PAGE_SIZE,	/* L3 */
5761 	    7 * PAGE_SIZE	/* L4 */
5762 	};
5763 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5764 
5765 	pd_entry_t *tmp_pml, *kernel_pml;
5766 
5767 	int level;
5768 
5769 	if (!maps_loaded) {
5770 		for (level = 0; level < PTP_LEVELS; ++level) {
5771 			x86_tmp_pml_vaddr[level] =
5772 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5773 			    UVM_KMF_VAONLY);
5774 
5775 			if (x86_tmp_pml_vaddr[level] == 0)
5776 				panic("mapping of real mode PML failed");
5777 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5778 			    x86_tmp_pml_paddr[level],
5779 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5780 		}
5781 		pmap_update(pmap_kernel());
5782 		maps_loaded = true;
5783 	}
5784 
5785 	/* Zero levels 1-3 */
5786 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5787 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5788 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
5789 	}
5790 
5791 	/* Copy PML4 */
5792 	kernel_pml = pmap_kernel()->pm_pdir;
5793 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5794 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
5795 
5796 #ifdef PAE
5797 	/*
5798 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5799 	 * last entries are unlikely to be used for temporary mappings.
5800 	 * 508: maps 0->1GB (userland)
5801 	 * 509: unused
5802 	 * 510: unused
5803 	 * 511: maps 3->4GB (kernel)
5804 	 */
5805 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
5806 	tmp_pml[509] = 0;
5807 	tmp_pml[510] = 0;
5808 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
5809 #endif
5810 
5811 	for (level = PTP_LEVELS - 1; level > 0; --level) {
5812 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5813 
5814 		tmp_pml[pl_i(pg, level + 1)] =
5815 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
5816 	}
5817 
5818 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
5819 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
5820 
5821 #ifdef PAE
5822 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
5823 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
5824 #endif
5825 
5826 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
5827 }
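
/*
 * Result sketch (illustrative, shown for 4 levels): the pages at physical
 * addresses 4..7 * PAGE_SIZE form L1..L4 of a minimal tree.  The top-level
 * page is seeded with a copy of the kernel's top-level entries, and "pg"
 * ends up identity-mapped read/write:
 *
 *	L4[pl_i(pg, 4)] -> L3 -> L2 -> L1[pl_i(pg, 1)] = pg | PTE_W | PTE_P
 *
 * The return value is the physical root of this temporary tree (for PAE,
 * the address of the L3 entries embedded at slot 508 of the top page).
 */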
5828 
5829 u_int
5830 x86_mmap_flags(paddr_t mdpgno)
5831 {
5832 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
5833 	u_int pflag = 0;
5834 
5835 	if (nflag & X86_MMAP_FLAG_PREFETCH)
5836 		pflag |= PMAP_WRITE_COMBINE;
5837 
5838 	return pflag;
5839 }
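
/*
 * Encoding sketch (hypothetical, driver side): a device mmap() entry
 * point can request write-combining by folding X86_MMAP_FLAG_PREFETCH
 * into the upper bits of the page number it returns, which this routine
 * then translates back into PMAP_WRITE_COMBINE:
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */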
5840 
5841 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
5842 
5843 /*
5844  * -----------------------------------------------------------------------------
5845  * *****************************************************************************
5846  * *****************************************************************************
5847  * *****************************************************************************
5848  * *****************************************************************************
5849  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
5850  * *****************************************************************************
5851  * *****************************************************************************
5852  * *****************************************************************************
5853  * *****************************************************************************
5854  * -----------------------------------------------------------------------------
5855  *
5856  * These functions are invoked as callbacks from the code above. Unlike the
5857  * native pmap, EPT does not have a recursive slot; therefore, it is not
5858  * possible to call pmap_map_ptes(). Instead, we use the direct map and
5859  * walk down the tree manually.
5860  *
5861  * Apart from that, the logic is mostly the same as native. Once a pmap has
5862  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
5863  * After that, these callbacks transparently handle the translations for
5864  * that pmap.
5865  *
5866  * -----------------------------------------------------------------------------
5867  */
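
/*
 * Tree-walk sketch: with no recursive slot, every level is reached
 * through the direct map, as pmap_ept_get_tree() and
 * pmap_ept_pdes_invalid() below do:
 *
 *	ptepa = pmap->pm_pdirpa[0];
 *	for (i = PTP_LEVELS; i > 1; i--) {
 *		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
 *		ptepa = pmap_pte2pa(pteva[pl_pi(va, i)]);
 *	}
 */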
5868 
5869 /* Hardware bits. */
5870 #define EPT_R		__BIT(0)	/* read */
5871 #define EPT_W		__BIT(1)	/* write */
5872 #define EPT_X		__BIT(2)	/* execute */
5873 #define EPT_T		__BITS(5,3)	/* type */
5874 #define		TYPE_UC	0
5875 #define		TYPE_WC	1
5876 #define		TYPE_WT	4
5877 #define		TYPE_WP	5
5878 #define		TYPE_WB	6
5879 #define EPT_NOPAT	__BIT(6)
5880 #define EPT_L		__BIT(7)	/* large */
5881 #define EPT_A		__BIT(8)	/* accessed */
5882 #define EPT_D		__BIT(9)	/* dirty */
5883 /* Software bits. */
5884 #define EPT_PVLIST	__BIT(60)
5885 #define EPT_WIRED	__BIT(61)
5886 
5887 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
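
/*
 * Composition sketch (illustrative): a typical leaf PTE built by
 * pmap_ept_enter() below looks like
 *
 *	npte = pa | EPT_R | EPT_W | EPT_X | __SHIFTIN(TYPE_WB, EPT_T) |
 *	    EPT_NOPAT | EPT_PVLIST;
 *
 * i.e. hardware permission/type bits in the low bits and software
 * bookkeeping bits (EPT_PVLIST, EPT_WIRED) up at bits 60/61.
 */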
5888 
5889 bool pmap_ept_has_ad __read_mostly;
5890 
5891 static inline void
5892 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
5893 {
5894 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
5895 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
5896 
5897 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5898 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5899 
5900 	pmap_stats_update(pmap, resid_diff, wired_diff);
5901 }
5902 
5903 static pt_entry_t
5904 pmap_ept_type(u_int flags)
5905 {
5906 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
5907 	pt_entry_t ret;
5908 
5909 	switch (cacheflags) {
5910 	case PMAP_NOCACHE:
5911 	case PMAP_NOCACHE_OVR:
5912 		ret = __SHIFTIN(TYPE_UC, EPT_T);
5913 		break;
5914 	case PMAP_WRITE_COMBINE:
5915 		ret = __SHIFTIN(TYPE_WC, EPT_T);
5916 		break;
5917 	case PMAP_WRITE_BACK:
5918 	default:
5919 		ret = __SHIFTIN(TYPE_WB, EPT_T);
5920 		break;
5921 	}
5922 
5923 	ret |= EPT_NOPAT;
5924 	return ret;
5925 }
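
/*
 * Worked example: EPT_T is __BITS(5,3), so __SHIFTIN(TYPE_WB, EPT_T)
 * places the value 6 in bits 5:3, i.e. 0x30, selecting write-back
 * caching; EPT_NOPAT additionally tells the hardware to ignore the
 * guest PAT and use this memory type.
 */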
5926 
5927 static inline pt_entry_t
5928 pmap_ept_prot(vm_prot_t prot)
5929 {
5930 	pt_entry_t res = 0;
5931 
5932 	if (prot & VM_PROT_READ)
5933 		res |= EPT_R;
5934 	if (prot & VM_PROT_WRITE)
5935 		res |= EPT_W;
5936 	if (prot & VM_PROT_EXECUTE)
5937 		res |= EPT_X;
5938 
5939 	return res;
5940 }
5941 
5942 static inline uint8_t
5943 pmap_ept_to_pp_attrs(pt_entry_t ept)
5944 {
5945 	uint8_t ret = 0;
5946 	if (pmap_ept_has_ad) {
5947 		if (ept & EPT_D)
5948 			ret |= PP_ATTRS_D;
5949 		if (ept & EPT_A)
5950 			ret |= PP_ATTRS_A;
5951 	} else {
5952 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
5953 	}
5954 	if (ept & EPT_W)
5955 		ret |= PP_ATTRS_W;
5956 	return ret;
5957 }
5958 
5959 static inline pt_entry_t
5960 pmap_pp_attrs_to_ept(uint8_t attrs)
5961 {
5962 	pt_entry_t ept = 0;
5963 	if (attrs & PP_ATTRS_D)
5964 		ept |= EPT_D;
5965 	if (attrs & PP_ATTRS_A)
5966 		ept |= EPT_A;
5967 	if (attrs & PP_ATTRS_W)
5968 		ept |= EPT_W;
5969 	return ept;
5970 }
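
/*
 * Round-trip sketch: with hardware A/D support,
 * pmap_ept_to_pp_attrs(EPT_W | EPT_A | EPT_D) yields
 * PP_ATTRS_W | PP_ATTRS_A | PP_ATTRS_D.  Without A/D support the page is
 * conservatively reported as both accessed and dirty, since the hardware
 * cannot tell us otherwise.
 */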
5971 
5972 /*
5973  * Helper for pmap_ept_free_ptp.
5974  * tree[0] = &L2[L2idx]
5975  * tree[1] = &L3[L3idx]
5976  * tree[2] = &L4[L4idx]
5977  */
5978 static void
5979 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
5980 {
5981 	pt_entry_t *pteva;
5982 	paddr_t ptepa;
5983 	int i, index;
5984 
5985 	ptepa = pmap->pm_pdirpa[0];
5986 	for (i = PTP_LEVELS; i > 1; i--) {
5987 		index = pl_pi(va, i);
5988 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5989 		KASSERT(pmap_ept_valid_entry(pteva[index]));
5990 		tree[i - 2] = &pteva[index];
5991 		ptepa = pmap_pte2pa(pteva[index]);
5992 	}
5993 }
5994 
5995 static void
5996 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
5997 {
5998 	pd_entry_t *tree[3];
5999 	int level;
6000 
6001 	KASSERT(pmap != pmap_kernel());
6002 	KASSERT(mutex_owned(&pmap->pm_lock));
6003 	KASSERT(kpreempt_disabled());
6004 
6005 	pmap_ept_get_tree(pmap, va, tree);
6006 
6007 	level = 1;
6008 	do {
6009 		(void)pmap_pte_testset(tree[level - 1], 0);
6010 
6011 		pmap_freepage(pmap, ptp, level);
6012 		if (level < PTP_LEVELS - 1) {
6013 			ptp = pmap_find_ptp(pmap, va, level + 1);
6014 			ptp->wire_count--;
6015 			if (ptp->wire_count > 1)
6016 				break;
6017 		}
6018 	} while (++level < PTP_LEVELS);
6019 	pmap_pte_flush();
6020 }
6021 
6022 /* Install the freshly allocated PTPs (L4->L3->L2) for va into the tree. */
6023 static void
6024 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6025 {
6026 	struct vm_page *ptp;
6027 	unsigned long index;
6028 	pd_entry_t *pteva;
6029 	paddr_t ptepa;
6030 	int i;
6031 
6032 	KASSERT(pmap != pmap_kernel());
6033 	KASSERT(mutex_owned(&pmap->pm_lock));
6034 	KASSERT(kpreempt_disabled());
6035 
6036 	/*
6037 	 * Now that we have all the pages looked up or allocated,
6038 	 * loop through again installing any new ones into the tree.
6039 	 */
6040 	ptepa = pmap->pm_pdirpa[0];
6041 	for (i = PTP_LEVELS; i > 1; i--) {
6042 		index = pl_pi(va, i);
6043 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6044 
6045 		if (pmap_ept_valid_entry(pteva[index])) {
6046 			KASSERT(!pt->alloced[i]);
6047 			ptepa = pmap_pte2pa(pteva[index]);
6048 			continue;
6049 		}
6050 
6051 		ptp = pt->pg[i];
6052 		ptp->flags &= ~PG_BUSY; /* never busy */
6053 		ptp->wire_count = 1;
6054 		pmap->pm_ptphint[i - 2] = ptp;
6055 		ptepa = VM_PAGE_TO_PHYS(ptp);
6056 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6057 
6058 		pmap_pte_flush();
6059 		pmap_stats_update(pmap, 1, 0);
6060 
6061 		/*
6062 		 * If we're not in the top level, increase the
6063 		 * wire count of the parent page.
6064 		 */
6065 		if (i < PTP_LEVELS) {
6066 			pt->pg[i + 1]->wire_count++;
6067 		}
6068 	}
6069 }
6070 
6071 static int
6072 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6073     u_int flags)
6074 {
6075 	pt_entry_t *ptes, opte, npte;
6076 	pt_entry_t *ptep;
6077 	struct vm_page *ptp;
6078 	struct vm_page *new_pg, *old_pg;
6079 	struct pmap_page *new_pp, *old_pp;
6080 	struct pv_entry *old_pve, *new_pve;
6081 	bool wired = (flags & PMAP_WIRED) != 0;
6082 	bool accessed;
6083 	struct pmap_ptparray pt;
6084 	int error;
6085 	bool getptp, samepage, new_embedded;
6086 	rb_tree_t *tree;
6087 
6088 	KASSERT(pmap_initialized);
6089 	KASSERT(va < VM_MAXUSER_ADDRESS);
6090 
6091 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6092 
6093 	if (wired)
6094 		npte |= EPT_WIRED;
6095 	if (flags & VM_PROT_ALL) {
6096 		npte |= EPT_A;
6097 		if (flags & VM_PROT_WRITE) {
6098 			KASSERT((npte & EPT_W) != 0);
6099 			npte |= EPT_D;
6100 		}
6101 	}
6102 
6103 	new_pg = PHYS_TO_VM_PAGE(pa);
6104 	if (new_pg != NULL) {
6105 		/* This is a managed page */
6106 		npte |= EPT_PVLIST;
6107 		new_pp = VM_PAGE_TO_PP(new_pg);
6108 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6109 		/* This is an unmanaged pv-tracked page */
6110 		npte |= EPT_PVLIST;
6111 	} else {
6112 		new_pp = NULL;
6113 	}
6114 
6115 	/* Begin by locking the pmap. */
6116 	mutex_enter(&pmap->pm_lock);
6117 
6118 	/* Look up the PTP.  Allocate if none present. */
6119 	ptp = NULL;
6120 	getptp = false;
6121 	if (pmap != pmap_kernel()) {
6122 		ptp = pmap_find_ptp(pmap, va, 1);
6123 		if (ptp == NULL) {
6124 			getptp = true;
6125 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6126 			if (error != 0) {
6127 				if (flags & PMAP_CANFAIL) {
6128 					mutex_exit(&pmap->pm_lock);
6129 					return error;
6130 				}
6131 				panic("%s: get ptp failed, error=%d", __func__,
6132 				    error);
6133 			}
6134 		}
6135 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6136 	} else {
6137 		/* Embedded PV entries rely on this. */
6138 		KASSERT(va != 0);
6139 		tree = &pmap_kernel_rb;
6140 	}
6141 
6142 	/*
6143 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6144 	 * entry if required for the new mapping.  Temporarily track the old
6145 	 * and new mappings concurrently.  Only after the old mapping is
6146 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6147 	 * our picture of modified/accessed state for either page could get
6148 	 * out of sync (we need any P->V operation for either page to stall
6149 	 * on pmap->pm_lock until done here).
6150 	 */
6151 	new_pve = NULL;
6152 	old_pve = NULL;
6153 	samepage = false;
6154 	new_embedded = false;
6155 
6156 	if (new_pp != NULL) {
6157 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6158 		    &old_pve, &samepage, &new_embedded, tree);
6159 
6160 		/*
6161 		 * If a new pv_entry was needed and none was available, we
6162 		 * can go no further.
6163 		 */
6164 		if (error != 0) {
6165 			if (flags & PMAP_CANFAIL) {
6166 				if (getptp) {
6167 					pmap_unget_ptp(pmap, &pt);
6168 				}
6169 				mutex_exit(&pmap->pm_lock);
6170 				return error;
6171 			}
6172 			panic("%s: alloc pve failed", __func__);
6173 		}
6174 	} else {
6175 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6176 	}
6177 
6178 	/* Map PTEs into address space. */
6179 	kpreempt_disable();
6180 
6181 	/* Install any newly allocated PTPs. */
6182 	if (getptp) {
6183 		pmap_ept_install_ptp(pmap, &pt, va);
6184 	}
6185 
6186 	/* Check if there is an existing mapping. */
6187 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6188 	ptep = &ptes[pl1_pi(va)];
6189 	opte = *ptep;
6190 	bool have_oldpa = pmap_ept_valid_entry(opte);
6191 	paddr_t oldpa = pmap_pte2pa(opte);
6192 
6193 	/*
6194 	 * Update the pte.
6195 	 */
6196 	do {
6197 		opte = *ptep;
6198 
6199 		/*
6200 		 * if the same page, inherit EPT_A and EPT_D.
6201 		 */
6202 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6203 			npte |= opte & (EPT_A | EPT_D);
6204 		}
6205 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6206 
6207 	/*
6208 	 * Done with the PTEs: they can now be unmapped.
6209 	 */
6210 	kpreempt_enable();
6211 
6212 	/*
6213 	 * Update statistics and PTP's reference count.
6214 	 */
6215 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6216 	if (ptp != NULL) {
6217 		if (!have_oldpa) {
6218 			ptp->wire_count++;
6219 		}
6220 		/* Remember minimum VA in PTP. */
6221 		pmap_ptp_range_set(ptp, va);
6222 	}
6223 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6224 
6225 	/*
6226 	 * If the same page, we can skip pv_entry handling.
6227 	 */
6228 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6229 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6230 		if ((npte & EPT_PVLIST) != 0) {
6231 			KASSERT(samepage);
6232 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6233 		}
6234 		goto same_pa;
6235 	} else if ((npte & EPT_PVLIST) != 0) {
6236 		KASSERT(!samepage);
6237 	}
6238 
6239 	/*
6240 	 * If old page is pv-tracked, remove pv_entry from its list.
6241 	 */
6242 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6243 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6244 			old_pp = VM_PAGE_TO_PP(old_pg);
6245 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6246 			panic("%s: EPT_PVLIST with pv-untracked page"
6247 			    " va = %#"PRIxVADDR
6248 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6249 			    __func__, va, oldpa, atop(pa));
6250 		}
6251 
6252 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6253 		    pmap_ept_to_pp_attrs(opte));
6254 	} else {
6255 		KASSERT(old_pve == NULL);
6256 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6257 	}
6258 
6259 	/*
6260 	 * If new page is dynamically PV tracked, insert to tree.
6261 	 */
6262 	if (new_pve != NULL) {
6263 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6264 		old_pve = rb_tree_insert_node(tree, new_pve);
6265 		KASSERT(old_pve == new_pve);
6266 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6267 	}
6268 
6269 same_pa:
6270 	/*
6271 	 * shootdown tlb if necessary.
6272 	 */
6273 
6274 	if (pmap_ept_has_ad) {
6275 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6276 	} else {
6277 		accessed = (opte & EPT_R) != 0;
6278 	}
6279 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6280 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6281 	}
6282 	pmap_drain_pv(pmap);
6283 	mutex_exit(&pmap->pm_lock);
6284 	return 0;
6285 }
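
/*
 * Call-path sketch (hedged): an EPT pmap is entered through the regular
 * MI interface; pmap_enter() sees pm_enter != NULL and dispatches here.
 * NVMM mapping a guest-physical page therefore looks roughly like
 *
 *	error = pmap_enter(pmap, gpa, hpa,
 *	    VM_PROT_READ | VM_PROT_WRITE, PMAP_CANFAIL);
 *
 * where "gpa"/"hpa" are hypothetical names for the guest-physical
 * address (used as the VA here) and the backing host-physical address.
 */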
6286 
6287 /* Return 0 and the L2 entry via *lastpde, or the level of the first invalid PDE. */
6288 static int
6289 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6290 {
6291 	pt_entry_t *pteva;
6292 	paddr_t ptepa;
6293 	int i, index;
6294 
6295 	KASSERT(mutex_owned(&pmap->pm_lock));
6296 
6297 	ptepa = pmap->pm_pdirpa[0];
6298 	for (i = PTP_LEVELS; i > 1; i--) {
6299 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6300 		index = pl_pi(va, i);
6301 		if (!pmap_ept_valid_entry(pteva[index]))
6302 			return i;
6303 		ptepa = pmap_pte2pa(pteva[index]);
6304 	}
6305 	if (lastpde != NULL) {
6306 		*lastpde = pteva[index];
6307 	}
6308 
6309 	return 0;
6310 }
6311 
6312 static bool
6313 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6314 {
6315 	pt_entry_t *ptes, pte;
6316 	pd_entry_t pde;
6317 	paddr_t ptppa, pa;
6318 	bool rv;
6319 
6320 #ifdef __HAVE_DIRECT_MAP
6321 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6322 		if (pap != NULL) {
6323 			*pap = PMAP_DIRECT_UNMAP(va);
6324 		}
6325 		return true;
6326 	}
6327 #endif
6328 
6329 	rv = false;
6330 	pa = 0;
6331 
6332 	mutex_enter(&pmap->pm_lock);
6333 	kpreempt_disable();
6334 
6335 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6336 		ptppa = pmap_pte2pa(pde);
6337 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6338 		pte = ptes[pl1_pi(va)];
6339 		if (__predict_true((pte & EPT_R) != 0)) {
6340 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6341 			rv = true;
6342 		}
6343 	}
6344 
6345 	kpreempt_enable();
6346 	mutex_exit(&pmap->pm_lock);
6347 
6348 	if (pap != NULL) {
6349 		*pap = pa;
6350 	}
6351 	return rv;
6352 }
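
/*
 * Lookup sketch (illustrative): translating a guest-physical address to
 * its backing host-physical address goes through the regular MI hook,
 * which dispatches to pm_extract for a transformed pmap:
 *
 *	paddr_t hpa;
 *	if (pmap_extract(pmap, gpa, &hpa))
 *		...use hpa...
 *
 * "gpa" and "hpa" are hypothetical names used for illustration.
 */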
6353 
6354 static bool
6355 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6356     vaddr_t va)
6357 {
6358 	struct pv_entry *pve;
6359 	struct vm_page *pg;
6360 	struct pmap_page *pp;
6361 	pt_entry_t opte;
6362 	bool accessed;
6363 
6364 	KASSERT(pmap != pmap_kernel());
6365 	KASSERT(mutex_owned(&pmap->pm_lock));
6366 	KASSERT(kpreempt_disabled());
6367 
6368 	if (!pmap_ept_valid_entry(*pte)) {
6369 		/* VA not mapped. */
6370 		return false;
6371 	}
6372 
6373 	/* Atomically save the old PTE and zap it. */
6374 	opte = pmap_pte_testset(pte, 0);
6375 	if (!pmap_ept_valid_entry(opte)) {
6376 		return false;
6377 	}
6378 
6379 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6380 
6381 	if (ptp) {
6382 		/*
6383 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6384 		 */
6385 		ptp->wire_count--;
6386 		if (ptp->wire_count <= 1) {
6387 			opte |= EPT_A;
6388 		}
6389 	}
6390 
6391 	if (pmap_ept_has_ad) {
6392 		accessed = (opte & EPT_A) != 0;
6393 	} else {
6394 		accessed = true;
6395 	}
6396 	if (accessed) {
6397 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6398 	}
6399 
6400 	/*
6401 	 * If we are not on a pv list - we are done.
6402 	 */
6403 	if ((opte & EPT_PVLIST) == 0) {
6404 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6405 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6406 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6407 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6408 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6409 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6410 		return true;
6411 	}
6412 
6413 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6414 		pp = VM_PAGE_TO_PP(pg);
6415 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6416 		paddr_t pa = pmap_pte2pa(opte);
6417 		panic("%s: EPT_PVLIST with pv-untracked page"
6418 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6419 		    __func__, va, pa, atop(pa));
6420 	}
6421 
6422 	/* Sync R/M bits. */
6423 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6424 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6425 	return true;
6426 }
6427 
6428 static void
6429 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6430     vaddr_t startva, vaddr_t endva)
6431 {
6432 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6433 
6434 	KASSERT(pmap != pmap_kernel());
6435 	KASSERT(mutex_owned(&pmap->pm_lock));
6436 	KASSERT(kpreempt_disabled());
6437 
6438 	/*
6439 	 * mappings are very often sparse, so clip the given range to the
6440 	 * range of PTEs that are known present in the PTP.
6441 	 */
6442 	pmap_ptp_range_clip(ptp, &startva, &pte);
6443 
6444 	/*
6445 	 * note that ptpva points to the PTE that maps startva.   this may
6446 	 * or may not be the first PTE in the PTP.
6447 	 *
6448 	 * we loop through the PTP while there are still PTEs to look at
6449 	 * and the wire_count is greater than 1 (because we use the wire_count
6450 	 * to keep track of the number of real PTEs in the PTP).
6451 	 */
6452 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6453 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6454 		startva += PAGE_SIZE;
6455 		pte++;
6456 	}
6457 }
6458 
6459 static void
6460 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6461 {
6462 	pt_entry_t *ptes;
6463 	pd_entry_t pde;
6464 	paddr_t ptppa;
6465 	vaddr_t blkendva, va = sva;
6466 	struct vm_page *ptp;
6467 
6468 	mutex_enter(&pmap->pm_lock);
6469 	kpreempt_disable();
6470 
6471 	for (/* null */ ; va < eva ; va = blkendva) {
6472 		int lvl;
6473 
6474 		/* determine range of block */
6475 		blkendva = x86_round_pdr(va+1);
6476 		if (blkendva > eva)
6477 			blkendva = eva;
6478 
6479 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6480 		if (lvl != 0) {
6481 			/* Skip a range corresponding to an invalid pde. */
6482 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6483 			continue;
6484 		}
6485 
6486 		/* PA of the PTP */
6487 		ptppa = pmap_pte2pa(pde);
6488 
6489 		ptp = pmap_find_ptp(pmap, va, 1);
6490 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6491 		    __func__);
6492 
6493 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6494 
6495 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6496 		    blkendva);
6497 
6498 		/* If PTP is no longer being used, free it. */
6499 		if (ptp && ptp->wire_count <= 1) {
6500 			pmap_ept_free_ptp(pmap, ptp, va);
6501 		}
6502 	}
6503 
6504 	kpreempt_enable();
6505 	pmap_drain_pv(pmap);
6506 	mutex_exit(&pmap->pm_lock);
6507 }
6508 
6509 static int
6510 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6511     uint8_t *oattrs, pt_entry_t *optep)
6512 {
6513 	struct pmap *pmap;
6514 	pt_entry_t *ptep;
6515 	pt_entry_t opte;
6516 	pt_entry_t npte;
6517 	pt_entry_t expect;
6518 	bool need_shootdown;
6519 
6520 	expect = pmap_pa2pte(pa) | EPT_R;
6521 	pmap = ptp_to_pmap(ptp);
6522 
6523 	if (clearbits != ~0) {
6524 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6525 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6526 	}
6527 
6528 	ptep = pmap_map_pte(pmap, ptp, va);
6529 	do {
6530 		opte = *ptep;
6531 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6532 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6533 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6534 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6535 			/*
6536 			 * We lost a race with a V->P operation like
6537 			 * pmap_remove().  Return EAGAIN and let the
6538 			 * competitor finish reflecting the PTE bits into
6539 			 * mp_attrs.
6539 			 */
6540 			pmap_unmap_pte();
6541 			return EAGAIN;
6542 		}
6543 
6544 		/*
6545 		 * Check if there's anything to do on this PTE.
6546 		 */
6547 		if ((opte & clearbits) == 0) {
6548 			need_shootdown = false;
6549 			break;
6550 		}
6551 
6552 		/*
6553 		 * We need a shootdown if the PTE is cached (EPT_A) ...
6554 		 * ... Unless we are clearing only the EPT_W bit and
6555 		 * it isn't cached as RW (EPT_D).
6556 		 */
6557 		if (pmap_ept_has_ad) {
6558 			need_shootdown = (opte & EPT_A) != 0 &&
6559 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6560 		} else {
6561 			need_shootdown = true;
6562 		}
6563 
6564 		npte = opte & ~clearbits;
6565 
6566 		/*
6567 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6568 		 */
6569 		if (need_shootdown) {
6570 			npte &= ~(EPT_A | EPT_D);
6571 		}
6572 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6573 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6574 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6575 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6576 
6577 	if (need_shootdown) {
6578 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6579 	}
6580 	pmap_unmap_pte();
6581 
6582 	*oattrs = pmap_ept_to_pp_attrs(opte);
6583 	if (optep != NULL)
6584 		*optep = opte;
6585 	return 0;
6586 }
6587 
6588 static void
6589 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6590     vaddr_t va)
6591 {
6592 
6593 	KASSERT(mutex_owned(&pmap->pm_lock));
6594 
6595 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6596 	ptp->wire_count--;
6597 	if (ptp->wire_count <= 1) {
6598 		pmap_ept_free_ptp(pmap, ptp, va);
6599 	}
6600 }
6601 
6602 static void
6603 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6604 {
6605 	pt_entry_t bit_rem;
6606 	pt_entry_t *ptes, *spte;
6607 	pt_entry_t opte, npte;
6608 	pd_entry_t pde;
6609 	paddr_t ptppa;
6610 	vaddr_t va;
6611 	bool modified;
6612 
6613 	bit_rem = 0;
6614 	if (!(prot & VM_PROT_WRITE))
6615 		bit_rem = EPT_W;
6616 
6617 	sva &= PTE_FRAME;
6618 	eva &= PTE_FRAME;
6619 
6620 	/* Acquire pmap. */
6621 	mutex_enter(&pmap->pm_lock);
6622 	kpreempt_disable();
6623 
6624 	for (va = sva; va < eva; va += PAGE_SIZE) {
6625 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6626 			continue;
6627 		}
6628 
6629 		ptppa = pmap_pte2pa(pde);
6630 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6631 		spte = &ptes[pl1_pi(va)];
6632 
6633 		do {
6634 			opte = *spte;
6635 			if (!pmap_ept_valid_entry(opte)) {
6636 				goto next;
6637 			}
6638 			npte = (opte & ~bit_rem);
6639 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6640 
6641 		if (pmap_ept_has_ad) {
6642 			modified = (opte & EPT_D) != 0;
6643 		} else {
6644 			modified = true;
6645 		}
6646 		if (modified) {
6647 			vaddr_t tva = x86_ptob(spte - ptes);
6648 			pmap_tlb_shootdown(pmap, tva, 0,
6649 			    TLBSHOOT_WRITE_PROTECT);
6650 		}
6651 next:;
6652 	}
6653 
6654 	kpreempt_enable();
6655 	mutex_exit(&pmap->pm_lock);
6656 }
6657 
6658 static void
6659 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6660 {
6661 	pt_entry_t *ptes, *ptep, opte;
6662 	pd_entry_t pde;
6663 	paddr_t ptppa;
6664 
6665 	/* Acquire pmap. */
6666 	mutex_enter(&pmap->pm_lock);
6667 	kpreempt_disable();
6668 
6669 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6670 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6671 	}
6672 
6673 	ptppa = pmap_pte2pa(pde);
6674 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6675 	ptep = &ptes[pl1_pi(va)];
6676 	opte = *ptep;
6677 	KASSERT(pmap_ept_valid_entry(opte));
6678 
6679 	if (opte & EPT_WIRED) {
6680 		pt_entry_t npte = opte & ~EPT_WIRED;
6681 
6682 		opte = pmap_pte_testset(ptep, npte);
6683 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6684 	} else {
6685 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6686 		    " did not change!\n", __func__, pmap, va);
6687 	}
6688 
6689 	/* Release pmap. */
6690 	kpreempt_enable();
6691 	mutex_exit(&pmap->pm_lock);
6692 }
6693 
6694 /* -------------------------------------------------------------------------- */
6695 
6696 void
6697 pmap_ept_transform(struct pmap *pmap)
6698 {
6699 	pmap->pm_enter = pmap_ept_enter;
6700 	pmap->pm_extract = pmap_ept_extract;
6701 	pmap->pm_remove = pmap_ept_remove;
6702 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6703 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6704 	pmap->pm_write_protect = pmap_ept_write_protect;
6705 	pmap->pm_unwire = pmap_ept_unwire;
6706 
6707 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6708 }
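
/*
 * Usage sketch (hedged): NVMM creates an ordinary pmap for the guest and
 * then converts it, roughly:
 *
 *	pmap = pmap_create();
 *	pmap_ept_transform(pmap);
 *
 * From then on, MI pmap calls on that pmap are redirected to the
 * pmap_ept_*() callbacks above, starting from an empty page directory.
 */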
6709 
6710 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6711