xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 8ecbf5f02b752fcb7debe1a8fab1dc82602bc760)
1 /*	$NetBSD: pmap.c,v 1.406 2020/09/02 17:37:57 bouyer Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.406 2020/09/02 17:37:57 bouyer Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 
142 #define	__MUTEX_PRIVATE	/* for assertions */
143 
144 #include <sys/param.h>
145 #include <sys/systm.h>
146 #include <sys/proc.h>
147 #include <sys/pool.h>
148 #include <sys/kernel.h>
149 #include <sys/atomic.h>
150 #include <sys/cpu.h>
151 #include <sys/intr.h>
152 #include <sys/xcall.h>
153 #include <sys/kcore.h>
154 #include <sys/kmem.h>
155 #include <sys/asan.h>
156 #include <sys/msan.h>
157 #include <sys/entropy.h>
158 
159 #include <uvm/uvm.h>
160 #include <uvm/pmap/pmap_pvt.h>
161 
162 #include <dev/isa/isareg.h>
163 
164 #include <machine/specialreg.h>
165 #include <machine/gdt.h>
166 #include <machine/isa_machdep.h>
167 #include <machine/cpuvar.h>
168 #include <machine/cputypes.h>
169 
170 #include <x86/pmap.h>
171 #include <x86/pmap_pv.h>
172 
173 #include <x86/i82489reg.h>
174 #include <x86/i82489var.h>
175 
176 #ifdef XEN
177 #include <xen/include/public/xen.h>
178 #include <xen/hypervisor.h>
179 #endif
180 
181 /*
182  * general info:
183  *
184  *  - for an explanation of how the x86 MMU hardware works see
185  *    the comments in <machine/pte.h>.
186  *
187  *  - for an explanation of the general memory structure used by
188  *    this pmap (including the recursive mapping), see the comments
189  *    in <machine/pmap.h>.
190  *
191  * this file contains the code for the "pmap module."   the module's
192  * job is to manage the hardware's virtual to physical address mappings.
193  * note that there are two levels of mapping in the VM system:
194  *
195  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
196  *      to map ranges of virtual address space to objects/files.  for
197  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
198  *      to the file /bin/ls starting at offset zero."   note that
199  *      the upper layer mapping is not concerned with how individual
200  *      vm_pages are mapped.
201  *
202  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
203  *      from virtual addresses.   it is concerned with which vm_page is
204  *      mapped where.   for example, when you run /bin/ls and start
205  *      at page 0x1000 the fault routine may lookup the correct page
206  *      of the /bin/ls file and then ask the pmap layer to establish
207  *      a mapping for it.
208  *
209  * note that information in the lower layer of the VM system can be
210  * thrown away since it can easily be reconstructed from the info
211  * in the upper layer.
212  *
213  * data structures we use include:
214  *
215  *  - struct pmap: describes the address space of one thread
216  *  - struct pmap_page: describes one pv-tracked page, without
217  *    necessarily a corresponding vm_page
218  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
219  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
220  *    physical memory.   the pp_pvlist points to a list of pv_entry
221  *    structures which describe all the <PMAP,VA> pairs that this
222  *    page is mapped in.    this is critical for page based operations
223  *    such as pmap_page_protect() [change protection on _all_ mappings
224  *    of a page]
225  */
226 
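/*
 * To illustrate the pv tracking described above (a sketch only, not a
 * function in this file): every <PMAP,VA> mapping of a pv-tracked page
 * can be visited by holding the page's pp_lock and walking the embedded
 * pv entry plus pp_pvlist, using the pv_pte_first()/pv_pte_next()
 * iterators defined later in this file.  The visit() callback here is
 * hypothetical.
 *
 *	struct pv_pte *pvpte;
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		pmap_t pm = ptp_to_pmap(pvpte->pte_ptp);
 *		visit(pm, pvpte->pte_va);
 *	}
 *	mutex_spin_exit(&pp->pp_lock);
 */
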
227 /*
228  * Locking
229  *
230  * We have the following locks that we must deal with, listed in the order
231  * that they are acquired:
232  *
233  * pg->uobject->vmobjlock, pg->uanon->an_lock
234  *
235  * 	For managed pages, these per-object locks are taken by the VM system
236  *	before calling into the pmap module - either a read or write hold.
237  *	The lock hold prevents pages from changing identity while the pmap is
238  *	operating on them.  For example, the same lock is held across a call
239  *	to pmap_remove() and the following call to pmap_update(), so that a
240  *	page does not gain a new identity while its TLB visibility is stale.
241  *
242  * pmap->pm_lock
243  *
244  *	This lock protects the fields in the pmap structure including the
245  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
246  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
247  *	kernel PDEs are never freed, and the kernel is expected to be self
248  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
249  *	because they can be modified from interrupt context).
250  *
251  * pmaps_lock
252  *
253  *	This lock protects the list of active pmaps (headed by "pmaps").
254  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
255  *
256  * pp_lock
257  *
258  *	This per-page lock protects PV entry lists and the embedded PV entry
259  *	in each vm_page, allowing for concurrent operation on pages by
260  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
261  *	points it is taken context switching is usually not tolerable, and
262  *	spin mutexes must block out interrupts that could take kernel_lock.
263  */
264 
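/*
 * A minimal sketch of the resulting lock order for an operation on a
 * managed page (illustrative only; in practice the per-object lock is
 * taken by the VM system before the pmap module is called, as noted
 * above).  For an anonymous page, pg->uanon->an_lock takes the place of
 * the object lock.
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_spin_enter(&VM_PAGE_TO_PP(pg)->pp_lock);
 *	... update the PTE and the page's PV state ...
 *	mutex_spin_exit(&VM_PAGE_TO_PP(pg)->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 */
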
265 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
266 #ifdef DIAGNOSTIC
267 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
268 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
269 #else
270 #define	PMAP_DUMMY_LOCK(pm)
271 #define	PMAP_DUMMY_UNLOCK(pm)
272 #endif
273 
274 static const struct uvm_pagerops pmap_pager = {
275 	/* nothing */
276 };
277 
278 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
279 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
280 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
281 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
282 const long nbpd[] = NBPD_INITIALIZER;
283 #ifdef i386
284 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
285 #else
286 pd_entry_t *normal_pdes[3];
287 #endif
288 
289 long nkptp[] = NKPTP_INITIALIZER;
290 
291 struct pmap_head pmaps;
292 kmutex_t pmaps_lock __cacheline_aligned;
293 
294 struct pcpu_area *pcpuarea __read_mostly;
295 
296 static vaddr_t pmap_maxkvaddr;
297 
298 /*
299  * Misc. event counters.
300  */
301 struct evcnt pmap_iobmp_evcnt;
302 struct evcnt pmap_ldt_evcnt;
303 
304 /*
305  * PAT
306  */
307 static bool cpu_pat_enabled __read_mostly = false;
308 
309 /*
310  * Global data structures
311  */
312 
313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
315 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
316 
317 struct bootspace bootspace __read_mostly;
318 struct slotspace slotspace __read_mostly;
319 
320 /* Set to PTE_NX if supported. */
321 pd_entry_t pmap_pg_nx __read_mostly = 0;
322 
323 /* Set to PTE_G if supported. */
324 pd_entry_t pmap_pg_g __read_mostly = 0;
325 
326 /* Set to true if large pages are supported. */
327 int pmap_largepages __read_mostly = 0;
328 
329 paddr_t lowmem_rsvd __read_mostly;
330 paddr_t avail_start __read_mostly; /* PA of first available physical page */
331 paddr_t avail_end __read_mostly; /* PA of last available physical page */
332 
333 #ifdef XENPV
334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
335 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
336 #endif
337 
338 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
339 #define	PMAP_CHECK_PP(pp) \
340     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
341 
342 /*
343  * Other data structures
344  */
345 
346 static pt_entry_t protection_codes[8] __read_mostly;
347 
348 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
349 
350 /*
351  * The following two vaddr_t's are used during system startup to keep track of
352  * how much of the kernel's VM space we have used. Once the system is started,
353  * the management of the remaining kernel VM space is turned over to the
354  * kernel_map vm_map.
355  */
356 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
357 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
358 
359 #ifndef XENPV
360 /*
361  * LAPIC virtual address, and fake physical address.
362  */
363 volatile vaddr_t local_apic_va __read_mostly;
364 paddr_t local_apic_pa __read_mostly;
365 #endif
366 
367 /*
368  * pool that pmap structures are allocated from
369  */
370 struct pool_cache pmap_cache;
371 static int  pmap_ctor(void *, void *, int);
372 static void pmap_dtor(void *, void *);
373 
374 /*
375  * pv_page cache
376  */
377 static struct pool_cache pmap_pvp_cache;
378 
379 #ifdef __HAVE_DIRECT_MAP
380 vaddr_t pmap_direct_base __read_mostly;
381 vaddr_t pmap_direct_end __read_mostly;
382 #endif
383 
384 #ifndef __HAVE_DIRECT_MAP
385 /*
386  * Special VAs and the PTEs that map them
387  */
388 static pt_entry_t *early_zero_pte;
389 static void pmap_vpage_cpualloc(struct cpu_info *);
390 #ifdef XENPV
391 char *early_zerop; /* also referenced from xen_locore() */
392 #else
393 static char *early_zerop;
394 #endif
395 #endif
396 
397 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
398 
399 /* PDP pool and its callbacks */
400 static struct pool pmap_pdp_pool;
401 static void pmap_pdp_init(pd_entry_t *);
402 static void pmap_pdp_fini(pd_entry_t *);
403 
404 #ifdef PAE
405 /* need to allocate items of 4 pages */
406 static void *pmap_pdp_alloc(struct pool *, int);
407 static void pmap_pdp_free(struct pool *, void *);
408 static struct pool_allocator pmap_pdp_allocator = {
409 	.pa_alloc = pmap_pdp_alloc,
410 	.pa_free = pmap_pdp_free,
411 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
412 };
413 #endif
414 
415 extern vaddr_t idt_vaddr;
416 extern paddr_t idt_paddr;
417 extern vaddr_t gdt_vaddr;
418 extern paddr_t gdt_paddr;
419 extern vaddr_t ldt_vaddr;
420 extern paddr_t ldt_paddr;
421 
422 #ifdef i386
423 /* stuff to fix the pentium f00f bug */
424 extern vaddr_t pentium_idt_vaddr;
425 #endif
426 
427 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
428 struct pmap_ptparray {
429 	struct vm_page *pg[PTP_LEVELS + 1];
430 	bool alloced[PTP_LEVELS + 1];
431 };
432 
433 /*
434  * PV entries are allocated in page-sized chunks and cached per-pmap to
435  * avoid intense pressure on memory allocators.
436  */
437 
438 struct pv_page {
439 	LIST_HEAD(, pv_entry)	pvp_pves;
440 	LIST_ENTRY(pv_page)	pvp_list;
441 	long			pvp_nfree;
442 	struct pmap		*pvp_pmap;
443 };
444 
445 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
446 
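/*
 * Worked example (illustrative numbers only): with a 4096 byte
 * PAGE_SIZE and a pv_entry of, say, 64 bytes,
 *
 *	PVE_PER_PVP = 4096 / 64 - 1 = 63
 *
 * i.e. one entry-sized slot per page is left over, which is where the
 * struct pv_page header is assumed to live; this works as long as
 * sizeof(struct pv_page) <= sizeof(struct pv_entry).
 */
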
447 /*
448  * PV tree prototypes
449  */
450 
451 static int	pmap_compare_key(void *, const void *, const void *);
452 static int	pmap_compare_nodes(void *, const void *, const void *);
453 
454 /* Red-black tree */
455 static const rb_tree_ops_t pmap_rbtree_ops = {
456 	.rbto_compare_nodes = pmap_compare_nodes,
457 	.rbto_compare_key = pmap_compare_key,
458 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
459 	.rbto_context = NULL
460 };
461 
462 /*
463  * Local prototypes
464  */
465 
466 #ifdef __HAVE_PCPU_AREA
467 static void pmap_init_pcpu(void);
468 #endif
469 #ifdef __HAVE_DIRECT_MAP
470 static void pmap_init_directmap(struct pmap *);
471 #endif
472 #if !defined(XENPV)
473 static void pmap_remap_global(void);
474 #endif
475 #ifndef XENPV
476 static void pmap_init_lapic(void);
477 static void pmap_remap_largepages(void);
478 #endif
479 
480 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
481     struct vm_page **);
482 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
483 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
484     pd_entry_t * const *);
485 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
486 static void pmap_freepage(struct pmap *, struct vm_page *, int);
487 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
488     pt_entry_t *, pd_entry_t * const *);
489 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
490     vaddr_t);
491 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
492     vaddr_t);
493 static int pmap_pvp_ctor(void *, void *, int);
494 static void pmap_pvp_dtor(void *, void *);
495 static struct pv_entry *pmap_alloc_pv(struct pmap *);
496 static void pmap_free_pv(struct pmap *, struct pv_entry *);
497 static void pmap_drain_pv(struct pmap *);
498 
499 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
500 
501 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
502 static void pmap_reactivate(struct pmap *);
503 
504 /*
505  * p m a p   h e l p e r   f u n c t i o n s
506  */
507 
508 static inline void
509 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
510 {
511 
512 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
513 	pmap->pm_stats.resident_count += resid_diff;
514 	pmap->pm_stats.wired_count += wired_diff;
515 }
516 
517 static inline void
518 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
519 {
520 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
521 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
522 
523 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
524 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
525 
526 	pmap_stats_update(pmap, resid_diff, wired_diff);
527 }
528 
529 /*
530  * ptp_to_pmap: lookup pmap by ptp
531  */
532 static inline struct pmap *
533 ptp_to_pmap(struct vm_page *ptp)
534 {
535 	struct pmap *pmap;
536 
537 	if (ptp == NULL) {
538 		return pmap_kernel();
539 	}
540 	pmap = (struct pmap *)ptp->uobject;
541 	KASSERT(pmap != NULL);
542 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
543 	return pmap;
544 }
545 
546 static inline struct pv_pte *
547 pve_to_pvpte(struct pv_entry *pve)
548 {
549 
550 	if (pve == NULL)
551 		return NULL;
552 	KASSERT((void *)&pve->pve_pte == (void *)pve);
553 	return &pve->pve_pte;
554 }
555 
556 static inline struct pv_entry *
557 pvpte_to_pve(struct pv_pte *pvpte)
558 {
559 	struct pv_entry *pve = (void *)pvpte;
560 
561 	KASSERT(pve_to_pvpte(pve) == pvpte);
562 	return pve;
563 }
564 
565 /*
566  * Return true if the pmap page has an embedded PV entry.
567  */
568 static inline bool
569 pv_pte_embedded(struct pmap_page *pp)
570 {
571 
572 	KASSERT(mutex_owned(&pp->pp_lock));
573 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
574 }
575 
576 /*
577  * pv_pte_first, pv_pte_next: PV list iterator.
578  */
579 static inline struct pv_pte *
580 pv_pte_first(struct pmap_page *pp)
581 {
582 
583 	KASSERT(mutex_owned(&pp->pp_lock));
584 	if (pv_pte_embedded(pp)) {
585 		return &pp->pp_pte;
586 	}
587 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
588 }
589 
590 static inline struct pv_pte *
591 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
592 {
593 
594 	KASSERT(mutex_owned(&pp->pp_lock));
595 	KASSERT(pvpte != NULL);
596 	if (pvpte == &pp->pp_pte) {
597 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
598 	}
599 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
600 }
601 
602 static inline uint8_t
603 pmap_pte_to_pp_attrs(pt_entry_t pte)
604 {
605 	uint8_t ret = 0;
606 	if (pte & PTE_D)
607 		ret |= PP_ATTRS_D;
608 	if (pte & PTE_A)
609 		ret |= PP_ATTRS_A;
610 	if (pte & PTE_W)
611 		ret |= PP_ATTRS_W;
612 	return ret;
613 }
614 
615 static inline pt_entry_t
616 pmap_pp_attrs_to_pte(uint8_t attrs)
617 {
618 	pt_entry_t pte = 0;
619 	if (attrs & PP_ATTRS_D)
620 		pte |= PTE_D;
621 	if (attrs & PP_ATTRS_A)
622 		pte |= PTE_A;
623 	if (attrs & PP_ATTRS_W)
624 		pte |= PTE_W;
625 	return pte;
626 }
627 
628 /*
629  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
630  * of course the kernel is always loaded
631  */
632 bool
633 pmap_is_curpmap(struct pmap *pmap)
634 {
635 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
636 }
637 
638 inline void
639 pmap_reference(struct pmap *pmap)
640 {
641 
642 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
643 }
644 
645 /*
646  * rbtree: compare two nodes.
647  */
648 static int
649 pmap_compare_nodes(void *context, const void *n1, const void *n2)
650 {
651 	const struct pv_entry *pve1 = n1;
652 	const struct pv_entry *pve2 = n2;
653 
654 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
655 
656 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
657 		return -1;
658 	}
659 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
660 		return 1;
661 	}
662 	return 0;
663 }
664 
665 /*
666  * rbtree: compare a node and a key.
667  */
668 static int
669 pmap_compare_key(void *context, const void *n, const void *k)
670 {
671 	const struct pv_entry *pve = n;
672 	const vaddr_t key = (vaddr_t)k;
673 
674 	if (pve->pve_pte.pte_va < key) {
675 		return -1;
676 	}
677 	if (pve->pve_pte.pte_va > key) {
678 		return 1;
679 	}
680 	return 0;
681 }
682 
683 /*
684  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
685  */
686 static inline void
687 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
688 {
689 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
690 
691 	if (va < *min) {
692 		*min = va;
693 	}
694 }
695 
696 /*
697  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
698  */
699 static inline void
700 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
701 {
702 	vaddr_t sclip;
703 
704 	if (ptp == NULL) {
705 		return;
706 	}
707 
708 	sclip = (vaddr_t)ptp->uanon;
709 	sclip = (*startva < sclip ? sclip : *startva);
710 	*pte += (sclip - *startva) / PAGE_SIZE;
711 	*startva = sclip;
712 }
713 
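/*
 * Worked example of the clip above (illustrative numbers, assuming a
 * 4096 byte PAGE_SIZE): if the lowest VA ever entered into this PTP was
 * 0x5000 but the caller starts removing at *startva == 0x2000, then
 * sclip == 0x5000, *pte is advanced by (0x5000 - 0x2000) / PAGE_SIZE
 * == 3 entries, and *startva becomes 0x5000, so PTEs below the recorded
 * minimum are never scanned.
 */
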
714 /*
715  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
716  *
717  * there are several pmaps involved.  some or all of them might be the same.
718  *
719  *	- the pmap given by the first argument
720  *		our caller wants to access this pmap's PTEs.
721  *
722  *	- pmap_kernel()
723  *		the kernel pmap.  note that it only contains the kernel part
724  *		of the address space which is shared by any pmap.  ie. any
725  *		pmap can be used instead of pmap_kernel() for our purpose.
726  *
727  *	- ci->ci_pmap
728  *		pmap currently loaded on the cpu.
729  *
730  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
731  *		current process' pmap.
732  *
733  * => caller must lock pmap first (if not the kernel pmap)
734  * => must be undone with pmap_unmap_ptes before returning
735  * => disables kernel preemption
736  */
737 void
738 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
739     pd_entry_t * const **pdeppp)
740 {
741 	struct pmap *curpmap;
742 	struct cpu_info *ci;
743 	lwp_t *l;
744 
745 	kpreempt_disable();
746 
747 	/* The kernel's pmap is always accessible. */
748 	if (pmap == pmap_kernel()) {
749 		*pmap2 = NULL;
750 		*ptepp = PTE_BASE;
751 		*pdeppp = normal_pdes;
752 		return;
753 	}
754 
755 	KASSERT(mutex_owned(&pmap->pm_lock));
756 
757 	l = curlwp;
758 	ci = l->l_cpu;
759 	curpmap = ci->ci_pmap;
760 	if (pmap == curpmap) {
761 		/*
762 		 * Already on the CPU: make it valid.  This is very
763 		 * often the case during exit(), when we have switched
764 		 * to the kernel pmap in order to destroy a user pmap.
765 		 */
766 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
767 			pmap_reactivate(pmap);
768 		}
769 		*pmap2 = NULL;
770 	} else {
771 		/*
772 		 * Toss current pmap from CPU and install new pmap, but keep
773 		 * a reference to the old one.  Dropping the reference can
774 		 * block as it needs to take locks, so defer that to
775 		 * pmap_unmap_ptes().
776 		 */
777 		pmap_reference(pmap);
778 		pmap_load1(l, pmap, curpmap);
779 		*pmap2 = curpmap;
780 	}
781 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
782 #ifdef DIAGNOSTIC
783 	pmap->pm_ncsw = lwp_pctr();
784 #endif
785 	*ptepp = PTE_BASE;
786 
787 #if defined(XENPV) && defined(__x86_64__)
788 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
789 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
790 	*pdeppp = ci->ci_normal_pdes;
791 #else
792 	*pdeppp = normal_pdes;
793 #endif
794 }
795 
796 /*
797  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
798  *
799  * => we cannot tolerate context switches while mapped in: assert this.
800  * => reenables kernel preemption.
801  * => does not unlock pmap.
802  */
803 void
804 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
805 {
806 	struct cpu_info *ci;
807 	struct pmap *mypmap;
808 	struct lwp *l;
809 
810 	KASSERT(kpreempt_disabled());
811 
812 	/* The kernel's pmap is always accessible. */
813 	if (pmap == pmap_kernel()) {
814 		kpreempt_enable();
815 		return;
816 	}
817 
818 	l = curlwp;
819 	ci = l->l_cpu;
820 
821 	KASSERT(mutex_owned(&pmap->pm_lock));
822 	KASSERT(pmap->pm_ncsw == lwp_pctr());
823 
824 #if defined(XENPV) && defined(__x86_64__)
825 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
826 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
827 #endif
828 
829 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
830 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
831 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
832 	if (ci->ci_pmap == mypmap) {
833 		ci->ci_want_pmapload = 0;
834 	} else {
835 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
836 		ci->ci_tlbstate = TLBSTATE_LAZY;
837 	}
838 
839 	/* Now safe to re-enable preemption. */
840 	kpreempt_enable();
841 
842 	/* Toss reference to other pmap taken earlier. */
843 	if (pmap2 != NULL) {
844 		pmap_destroy(pmap2);
845 	}
846 }
847 
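/*
 * Sketch of the caller protocol for pmap_map_ptes()/pmap_unmap_ptes()
 * described above (illustrative only; callers elsewhere in this file,
 * e.g. pmap_remove(), follow this shape).  The pm_lock is not taken
 * when the target is pmap_kernel().
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... the PTE for va is ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */
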
848 inline static void
849 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
850 {
851 
852 #if !defined(__x86_64__)
853 	if (curproc == NULL || curproc->p_vmspace == NULL ||
854 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
855 		return;
856 
857 	if ((opte ^ npte) & PTE_X)
858 		pmap_update_pg(va);
859 
860 	/*
861 	 * Executability was removed on the last executable change.
862 	 * Reset the code segment to something conservative and
863 	 * let the trap handler deal with setting the right limit.
864 	 * We can't do that because of locking constraints on the vm map.
865 	 */
866 
867 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
868 		struct trapframe *tf = curlwp->l_md.md_regs;
869 
870 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
871 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
872 	}
873 #endif /* !defined(__x86_64__) */
874 }
875 
876 #if !defined(__x86_64__)
877 /*
878  * Fixup the code segment to cover all potential executable mappings.
879  * returns 0 if no changes to the code segment were made.
880  */
881 int
882 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
883 {
884 	struct vm_map_entry *ent;
885 	struct pmap *pm = vm_map_pmap(map);
886 	vaddr_t va = 0;
887 
888 	vm_map_lock_read(map);
889 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
890 		/*
891 		 * This entry has greater va than the entries before.
892 		 * We need to make it point to the last page, not past it.
893 		 */
894 		if (ent->protection & VM_PROT_EXECUTE)
895 			va = trunc_page(ent->end) - PAGE_SIZE;
896 	}
897 	vm_map_unlock_read(map);
898 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
899 		return 0;
900 
901 	pm->pm_hiexec = va;
902 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
903 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
904 	} else {
905 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
906 		return 0;
907 	}
908 	return 1;
909 }
910 #endif /* !defined(__x86_64__) */
911 
912 void
913 pat_init(struct cpu_info *ci)
914 {
915 	uint64_t pat;
916 
917 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
918 		return;
919 
920 	/* We change WT to WC. Leave all other entries the default values. */
921 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
922 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
923 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
924 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
925 
926 	wrmsr(MSR_CR_PAT, pat);
927 	cpu_pat_enabled = true;
928 }
929 
930 static pt_entry_t
931 pmap_pat_flags(u_int flags)
932 {
933 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
934 
935 	if (!cpu_pat_enabled) {
936 		switch (cacheflags) {
937 		case PMAP_NOCACHE:
938 		case PMAP_NOCACHE_OVR:
939 			/* results in PGC_UCMINUS on cpus which have
940 			 * the cpuid PAT but PAT "disabled"
941 			 */
942 			return PTE_PCD;
943 		default:
944 			return 0;
945 		}
946 	}
947 
948 	switch (cacheflags) {
949 	case PMAP_NOCACHE:
950 		return PGC_UC;
951 	case PMAP_WRITE_COMBINE:
952 		return PGC_WC;
953 	case PMAP_WRITE_BACK:
954 		return PGC_WB;
955 	case PMAP_NOCACHE_OVR:
956 		return PGC_UCMINUS;
957 	}
958 
959 	return 0;
960 }
961 
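/*
 * With PAT enabled, callers can request a caching mode through the pmap
 * flags.  An illustrative use (not code from this file): a driver
 * mapping framebuffer memory might ask for write-combining,
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *
 * which pmap_pat_flags() above turns into PGC_WC; without PAT the flag
 * falls through to the default and the mapping stays plain write-back.
 */
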
962 /*
963  * p m a p   k e n t e r   f u n c t i o n s
964  *
965  * functions to quickly enter/remove pages from the kernel address
966  * space.   pmap_kremove is exported to MI kernel.  we make use of
967  * the recursive PTE mappings.
968  */
969 
970 /*
971  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
972  *
973  * => no need to lock anything, assume va is already allocated
974  * => should be faster than normal pmap enter function
975  */
976 void
977 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
978 {
979 	pt_entry_t *pte, opte, npte;
980 
981 	KASSERT(!(prot & ~VM_PROT_ALL));
982 
983 	if (va < VM_MIN_KERNEL_ADDRESS)
984 		pte = vtopte(va);
985 	else
986 		pte = kvtopte(va);
987 #if defined(XENPV) && defined(DOM0OPS)
988 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
989 #ifdef DEBUG
990 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
991 		    " outside range\n", __func__, pa, va);
992 #endif /* DEBUG */
993 		npte = pa;
994 	} else
995 #endif /* XENPV && DOM0OPS */
996 		npte = pmap_pa2pte(pa);
997 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
998 	npte |= pmap_pat_flags(flags);
999 	opte = pmap_pte_testset(pte, npte); /* zap! */
1000 
1001 	/*
1002 	 * XXX: make sure we are not dealing with a large page, since the only
1003 	 * large pages created are for the kernel image, and they should never
1004 	 * be kentered.
1005 	 */
1006 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1007 
1008 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1009 		/* This should not happen. */
1010 		printf_nolog("%s: mapping already present\n", __func__);
1011 		kpreempt_disable();
1012 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1013 		kpreempt_enable();
1014 	}
1015 }
1016 
1017 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1018 
1019 #if defined(__x86_64__)
1020 /*
1021  * Change protection for a virtual address. Local for a CPU only, don't
1022  * care about TLB shootdowns.
1023  *
1024  * => must be called with preemption disabled
1025  */
1026 void
1027 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1028 {
1029 	pt_entry_t *pte, opte, npte;
1030 
1031 	KASSERT(kpreempt_disabled());
1032 
1033 	if (va < VM_MIN_KERNEL_ADDRESS)
1034 		pte = vtopte(va);
1035 	else
1036 		pte = kvtopte(va);
1037 
1038 	npte = opte = *pte;
1039 
1040 	if ((prot & VM_PROT_WRITE) != 0)
1041 		npte |= PTE_W;
1042 	else
1043 		npte &= ~(PTE_W|PTE_D);
1044 
1045 	if (opte != npte) {
1046 		pmap_pte_set(pte, npte);
1047 		pmap_pte_flush();
1048 		invlpg(va);
1049 	}
1050 }
1051 #endif /* defined(__x86_64__) */
1052 
1053 /*
1054  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1055  *
1056  * => no need to lock anything
1057  * => caller must dispose of any vm_page mapped in the va range
1058  * => note: not an inline function
1059  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1060  * => we assume kernel only unmaps valid addresses and thus don't bother
1061  *    checking the valid bit before doing TLB flushing
1062  * => must be followed by call to pmap_update() before reuse of page
1063  */
1064 static void
1065 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1066 {
1067 	pt_entry_t *pte, opte;
1068 	vaddr_t va, eva;
1069 
1070 	eva = sva + len;
1071 
1072 	kpreempt_disable();
1073 	for (va = sva; va < eva; va += PAGE_SIZE) {
1074 		pte = kvtopte(va);
1075 		opte = pmap_pte_testset(pte, 0); /* zap! */
1076 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1077 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1078 			    TLBSHOOT_KREMOVE);
1079 		}
1080 		KASSERTMSG((opte & PTE_PS) == 0,
1081 		    "va %#" PRIxVADDR " is a large page", va);
1082 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1083 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1084 	}
1085 	if (localonly) {
1086 		tlbflushg();
1087 	}
1088 	kpreempt_enable();
1089 }
1090 
1091 void
1092 pmap_kremove(vaddr_t sva, vsize_t len)
1093 {
1094 
1095 	pmap_kremove1(sva, len, false);
1096 }
1097 
1098 /*
1099  * pmap_kremove_local: like pmap_kremove(), but only worry about
1100  * TLB invalidations on the current CPU.  this is only intended
1101  * for use while writing kernel crash dumps, either after panic
1102  * or via reboot -d.
1103  */
1104 void
1105 pmap_kremove_local(vaddr_t sva, vsize_t len)
1106 {
1107 
1108 	pmap_kremove1(sva, len, true);
1109 }
1110 
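/*
 * Typical use of the kenter/kremove pair (an illustrative sketch, not
 * code from this file): map a known physical page at a kernel VA the
 * caller already owns, use it, then tear the mapping down and flush
 * before the VA or the page is reused.
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... access the memory through (void *)va ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */
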
1111 /*
1112  * p m a p   i n i t   f u n c t i o n s
1113  *
1114  * pmap_bootstrap and pmap_init are called during system startup
1115  * to init the pmap module.   pmap_bootstrap() does a low level
1116  * init just to get things rolling.   pmap_init() finishes the job.
1117  */
1118 
1119 /*
1120  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1121  * This function is to be used before any VM system has been set up.
1122  *
1123  * The va is taken from virtual_avail.
1124  */
1125 static vaddr_t
1126 pmap_bootstrap_valloc(size_t npages)
1127 {
1128 	vaddr_t va = virtual_avail;
1129 	virtual_avail += npages * PAGE_SIZE;
1130 	return va;
1131 }
1132 
1133 /*
1134  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1135  * This function is to be used before any VM system has been set up.
1136  *
1137  * The pa is taken from avail_start.
1138  */
1139 static paddr_t
1140 pmap_bootstrap_palloc(size_t npages)
1141 {
1142 	paddr_t pa = avail_start;
1143 	avail_start += npages * PAGE_SIZE;
1144 	return pa;
1145 }
1146 
1147 /*
1148  * pmap_bootstrap: get the system in a state where it can run with VM properly
1149  * enabled (called before main()). The VM system is fully init'd later.
1150  *
1151  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1152  *    kernel, and nkpde PTP's for the kernel.
1153  * => kva_start is the first free virtual address in kernel space.
1154  */
1155 void
1156 pmap_bootstrap(vaddr_t kva_start)
1157 {
1158 	struct pmap *kpm;
1159 	int i;
1160 	vaddr_t kva;
1161 
1162 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1163 
1164 	/*
1165 	 * Set up our local static global vars that keep track of the usage of
1166 	 * KVM before kernel_map is set up.
1167 	 */
1168 	virtual_avail = kva_start;		/* first free KVA */
1169 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1170 
1171 	/*
1172 	 * Set up protection_codes: we need to be able to convert from a MI
1173 	 * protection code (some combo of VM_PROT...) to something we can jam
1174 	 * into a x86 PTE.
1175 	 */
1176 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1177 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1178 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1179 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1180 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1181 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1182 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1183 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1184 
1185 	/*
1186 	 * Now we init the kernel's pmap.
1187 	 *
1188 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1189 	 * the pm_obj contains the list of active PTPs.
1190 	 */
1191 	kpm = pmap_kernel();
1192 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1193 	rw_init(&kpm->pm_dummy_lock);
1194 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1195 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1196 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1197 		kpm->pm_ptphint[i] = NULL;
1198 	}
1199 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1200 
1201 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1202 	for (i = 0; i < PDP_SIZE; i++)
1203 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1204 
1205 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1206 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1207 
1208 	kcpuset_create(&kpm->pm_cpus, true);
1209 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1210 
1211 	kpm->pm_ldt = NULL;
1212 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1213 
1214 	/*
1215 	 * the above is just a rough estimate and not critical to the proper
1216 	 * operation of the system.
1217 	 */
1218 
1219 #if !defined(XENPV)
1220 	/*
1221 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1222 	 * attribute to already mapped kernel pages. Do that only if SVS is
1223 	 * disabled.
1224 	 *
1225 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1226 	 * happens later in cpu_init().
1227 	 */
1228 #ifdef SVS
1229 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1230 #else
1231 	if (cpu_feature[0] & CPUID_PGE) {
1232 #endif
1233 		pmap_pg_g = PTE_G;
1234 		pmap_remap_global();
1235 	}
1236 #endif
1237 
1238 #ifndef XENPV
1239 	/*
1240 	 * Enable large pages if they are supported.
1241 	 */
1242 	if (cpu_feature[0] & CPUID_PSE) {
1243 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1244 		pmap_largepages = 1;	/* enable software */
1245 
1246 		/*
1247 		 * The TLB must be flushed after enabling large pages on Pentium
1248 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1249 		 * Software Developer's Manual, Volume 3: System Programming".
1250 		 */
1251 		tlbflushg();
1252 
1253 		/* Remap the kernel. */
1254 		pmap_remap_largepages();
1255 	}
1256 	pmap_init_lapic();
1257 #endif /* !XENPV */
1258 
1259 #ifdef __HAVE_PCPU_AREA
1260 	pmap_init_pcpu();
1261 #endif
1262 
1263 #ifdef __HAVE_DIRECT_MAP
1264 	pmap_init_directmap(kpm);
1265 #else
1266 	pmap_vpage_cpualloc(&cpu_info_primary);
1267 
1268 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1269 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1270 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1271 	} else { /* amd64 */
1272 		/*
1273 		 * zero_pte is stuck at the end of mapped space for the kernel
1274 		 * image (disjunct from kva space). This is done so that it
1275 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1276 		 * when it's called for the first time.
1277 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1278 		 */
1279 #ifdef XENPV
1280 		/* early_zerop initialized in xen_locore() */
1281 #else
1282 		early_zerop = (void *)bootspace.spareva;
1283 #endif
1284 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1285 	}
1286 #endif
1287 
1288 #if defined(XENPV) && defined(__x86_64__)
1289 	extern vaddr_t xen_dummy_page;
1290 	paddr_t xen_dummy_user_pgd;
1291 
1292 	/*
1293 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1294 	 * Xen will still consider it active. So we set user PGD to this one
1295 	 * to lift all protection on the now inactive page tables set.
1296 	 */
1297 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1298 
1299 	/* Zero fill it; the fewer checks Xen has to make, the better */
1300 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1301 	/* Mark read-only */
1302 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1303 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1304 	    UVMF_INVLPG);
1305 	/* Pin as L4 */
1306 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1307 #endif
1308 
1309 	/*
1310 	 * Allocate space for the IDT, GDT and LDT.
1311 	 */
1312 	idt_vaddr = pmap_bootstrap_valloc(1);
1313 	idt_paddr = pmap_bootstrap_palloc(1);
1314 
1315 	gdt_vaddr = pmap_bootstrap_valloc(1);
1316 	gdt_paddr = pmap_bootstrap_palloc(1);
1317 
1318 #ifdef __HAVE_PCPU_AREA
1319 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1320 #else
1321 	ldt_vaddr = pmap_bootstrap_valloc(1);
1322 #endif
1323 	ldt_paddr = pmap_bootstrap_palloc(1);
1324 
1325 #if !defined(__x86_64__)
1326 	/* pentium f00f bug stuff */
1327 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1328 #endif
1329 
1330 #if defined(XENPVHVM)
1331 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1332 	extern paddr_t HYPERVISOR_shared_info_pa;
1333 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1334 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1335 
1336 	if (vm_guest != VM_GUEST_XENPVH) {
1337 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1338 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1339 	}
1340 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1341 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1342 #endif
1343 	/*
1344 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1345 	 */
1346 	virtual_avail = reserve_dumppages(virtual_avail);
1347 
1348 	/*
1349 	 * Init the global lock and global list.
1350 	 */
1351 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1352 	LIST_INIT(&pmaps);
1353 
1354 	/*
1355 	 * Ensure the TLB is sync'd with reality by flushing it...
1356 	 */
1357 	tlbflushg();
1358 
1359 	/*
1360 	 * Calculate pmap_maxkvaddr from nkptp[].
1361 	 */
1362 	kva = VM_MIN_KERNEL_ADDRESS;
1363 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1364 		kva += nkptp[i] * nbpd[i];
1365 	}
1366 	pmap_maxkvaddr = kva;
1367 }
1368 
1369 #ifndef XENPV
1370 static void
1371 pmap_init_lapic(void)
1372 {
1373 	/*
1374 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1375 	 * x86 implementation relies a lot on this address to be valid; so just
1376 	 * allocate a fake physical page that will be kentered into
1377 	 * local_apic_va by machdep.
1378 	 *
1379 	 * If the LAPIC is present, the va will be remapped somewhere else
1380 	 * later in lapic_map.
1381 	 */
1382 	local_apic_va = pmap_bootstrap_valloc(1);
1383 	local_apic_pa = pmap_bootstrap_palloc(1);
1384 }
1385 #endif
1386 
1387 #ifdef __x86_64__
1388 static size_t
1389 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1390 {
1391 	size_t npages;
1392 	npages = (roundup(endva, pgsz) / pgsz) -
1393 	    (rounddown(startva, pgsz) / pgsz);
1394 	return npages;
1395 }
1396 #endif
1397 
1398 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1399 static inline void
1400 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1401 {
1402 	size_t sslot = slotspace.area[type].sslot;
1403 	size_t nslot = slotspace.area[type].nslot;
1404 
1405 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1406 }
1407 #endif
1408 
1409 #ifdef __x86_64__
1410 /*
1411  * Randomize the location of an area. We count the holes in the VM space. We
1412  * randomly select one hole, and then randomly select an area within that hole.
1413  * Finally we update the associated entry in the slotspace structure.
1414  */
1415 vaddr_t
1416 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1417     vaddr_t randva)
1418 {
1419 	struct {
1420 		int start;
1421 		int end;
1422 	} holes[SLSPACE_NAREAS+1];
1423 	size_t i, nholes, hole;
1424 	size_t startsl, endsl, nslots, winsize;
1425 	vaddr_t startva, va;
1426 
1427 	sz = roundup(sz, align);
1428 
1429 	/*
1430 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1431 	 * an area that crosses slots:
1432 	 *     +------+------+------+
1433 	 *     | Slot | Slot | Slot |
1434 	 *     +------+------+------+
1435 	 *        [Chosen Area]
1436 	 * And in that case we must take into account the additional slot
1437 	 * consumed.
1438 	 */
1439 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1440 
1441 	/* Get the holes. */
1442 	nholes = 0;
1443 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1444 	while (1) {
1445 		/*
1446 		 * Find the first occupied slot after the current one.
1447 		 * The area between the two is a hole.
1448 		 */
1449 		size_t minsslot = 512;
1450 		size_t minnslot = 0;
1451 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1452 			if (!slotspace.area[i].active)
1453 				continue;
1454 			if (slotspace.area[i].sslot >= curslot &&
1455 			    slotspace.area[i].sslot < minsslot) {
1456 				minsslot = slotspace.area[i].sslot;
1457 				minnslot = slotspace.area[i].nslot;
1458 			}
1459 		}
1460 
1461 		/* No hole anymore, stop here. */
1462 		if (minsslot == 512) {
1463 			break;
1464 		}
1465 
1466 		/* Register the hole. */
1467 		if (minsslot - curslot >= nslots) {
1468 			holes[nholes].start = curslot;
1469 			holes[nholes].end = minsslot;
1470 			nholes++;
1471 		}
1472 
1473 		/* Skip that hole, and iterate again. */
1474 		curslot = minsslot + minnslot;
1475 	}
1476 
1477 	if (nholes == 0) {
1478 		panic("%s: impossible", __func__);
1479 	}
1480 
1481 	/* Select a hole. */
1482 	hole = randhole;
1483 #ifdef NO_X86_ASLR
1484 	hole = 0;
1485 #endif
1486 	hole %= nholes;
1487 	startsl = holes[hole].start;
1488 	endsl = holes[hole].end;
1489 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1490 
1491 	/* Select an area within the hole. */
1492 	va = randva;
1493 #ifdef NO_X86_ASLR
1494 	va = 0;
1495 #endif
1496 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1497 	va %= winsize;
1498 	va = rounddown(va, align);
1499 	va += startva;
1500 
1501 	/* Update the entry. */
1502 	slotspace.area[type].sslot = pl4_i(va);
1503 	slotspace.area[type].nslot =
1504 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1505 	slotspace.area[type].active = true;
1506 
1507 	return va;
1508 }
1509 #endif
1510 
1511 #ifdef __HAVE_PCPU_AREA
1512 static void
1513 pmap_init_pcpu(void)
1514 {
1515 	const vaddr_t startva = PMAP_PCPU_BASE;
1516 	size_t nL4e, nL3e, nL2e, nL1e;
1517 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1518 	paddr_t pa;
1519 	vaddr_t endva;
1520 	vaddr_t tmpva;
1521 	pt_entry_t *pte;
1522 	size_t size;
1523 	int i;
1524 
1525 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1526 
1527 	size = sizeof(struct pcpu_area);
1528 
1529 	endva = startva + size;
1530 
1531 	/* We will use this temporary va. */
1532 	tmpva = bootspace.spareva;
1533 	pte = PTE_BASE + pl1_i(tmpva);
1534 
1535 	/* Build L4 */
1536 	L4e_idx = pl4_i(startva);
1537 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1538 	KASSERT(nL4e == 1);
1539 	for (i = 0; i < nL4e; i++) {
1540 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1541 
1542 		pa = pmap_bootstrap_palloc(1);
1543 		*pte = (pa & PTE_FRAME) | pteflags;
1544 		pmap_update_pg(tmpva);
1545 		memset((void *)tmpva, 0, PAGE_SIZE);
1546 
1547 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1548 	}
1549 
1550 	/* Build L3 */
1551 	L3e_idx = pl3_i(startva);
1552 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1553 	for (i = 0; i < nL3e; i++) {
1554 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1555 
1556 		pa = pmap_bootstrap_palloc(1);
1557 		*pte = (pa & PTE_FRAME) | pteflags;
1558 		pmap_update_pg(tmpva);
1559 		memset((void *)tmpva, 0, PAGE_SIZE);
1560 
1561 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1562 	}
1563 
1564 	/* Build L2 */
1565 	L2e_idx = pl2_i(startva);
1566 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1567 	for (i = 0; i < nL2e; i++) {
1568 
1569 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1570 
1571 		pa = pmap_bootstrap_palloc(1);
1572 		*pte = (pa & PTE_FRAME) | pteflags;
1573 		pmap_update_pg(tmpva);
1574 		memset((void *)tmpva, 0, PAGE_SIZE);
1575 
1576 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1577 	}
1578 
1579 	/* Build L1 */
1580 	L1e_idx = pl1_i(startva);
1581 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1582 	for (i = 0; i < nL1e; i++) {
1583 		/*
1584 		 * Nothing to do, the PTEs will be entered via
1585 		 * pmap_kenter_pa.
1586 		 */
1587 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1588 	}
1589 
1590 	*pte = 0;
1591 	pmap_update_pg(tmpva);
1592 
1593 	pcpuarea = (struct pcpu_area *)startva;
1594 
1595 	tlbflush();
1596 }
1597 #endif
1598 
1599 #ifdef __HAVE_DIRECT_MAP
1600 /*
1601  * Create the amd64 direct map. Called only once at boot time. We map all of
1602  * the physical memory contiguously using 2MB large pages, with RW permissions.
1603  * However there is a hole: the kernel is mapped with RO permissions.
1604  */
1605 static void
1606 pmap_init_directmap(struct pmap *kpm)
1607 {
1608 	extern phys_ram_seg_t mem_clusters[];
1609 	extern int mem_cluster_cnt;
1610 
1611 	vaddr_t startva;
1612 	size_t nL4e, nL3e, nL2e;
1613 	size_t L4e_idx, L3e_idx, L2e_idx;
1614 	size_t spahole, epahole;
1615 	paddr_t lastpa, pa;
1616 	vaddr_t endva;
1617 	vaddr_t tmpva;
1618 	pt_entry_t *pte;
1619 	phys_ram_seg_t *mc;
1620 	int i;
1621 	size_t randhole;
1622 	vaddr_t randva;
1623 
1624 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1625 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1626 
1627 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1628 
1629 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1630 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1631 
1632 	/* Get the last physical address available */
1633 	lastpa = 0;
1634 	for (i = 0; i < mem_cluster_cnt; i++) {
1635 		mc = &mem_clusters[i];
1636 		lastpa = MAX(lastpa, mc->start + mc->size);
1637 	}
1638 
1639 	/*
1640 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1641 	 */
1642 	if (lastpa > MAXPHYSMEM) {
1643 		panic("pmap_init_directmap: lastpa incorrect");
1644 	}
1645 
1646 	entropy_extract(&randhole, sizeof randhole, 0);
1647 	entropy_extract(&randva, sizeof randva, 0);
1648 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1649 	    randhole, randva);
1650 	endva = startva + lastpa;
1651 
1652 	/* We will use this temporary va. */
1653 	tmpva = bootspace.spareva;
1654 	pte = PTE_BASE + pl1_i(tmpva);
1655 
1656 	/* Build L4 */
1657 	L4e_idx = pl4_i(startva);
1658 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1659 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1660 	for (i = 0; i < nL4e; i++) {
1661 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1662 
1663 		pa = pmap_bootstrap_palloc(1);
1664 		*pte = (pa & PTE_FRAME) | pteflags;
1665 		pmap_update_pg(tmpva);
1666 		memset((void *)tmpva, 0, PAGE_SIZE);
1667 
1668 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1669 	}
1670 
1671 	/* Build L3 */
1672 	L3e_idx = pl3_i(startva);
1673 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1674 	for (i = 0; i < nL3e; i++) {
1675 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1676 
1677 		pa = pmap_bootstrap_palloc(1);
1678 		*pte = (pa & PTE_FRAME) | pteflags;
1679 		pmap_update_pg(tmpva);
1680 		memset((void *)tmpva, 0, PAGE_SIZE);
1681 
1682 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1683 	}
1684 
1685 	/* Build L2 */
1686 	L2e_idx = pl2_i(startva);
1687 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1688 	for (i = 0; i < nL2e; i++) {
1689 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1690 
1691 		pa = (paddr_t)(i * NBPD_L2);
1692 
1693 		if (spahole <= pa && pa < epahole) {
1694 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1695 			    PTE_PS | pmap_pg_g;
1696 		} else {
1697 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1698 			    PTE_PS | pmap_pg_g;
1699 		}
1700 	}
1701 
1702 	*pte = 0;
1703 	pmap_update_pg(tmpva);
1704 
1705 	pmap_direct_base = startva;
1706 	pmap_direct_end = endva;
1707 
1708 	tlbflush();
1709 }
1710 #endif /* __HAVE_DIRECT_MAP */
1711 
1712 #if !defined(XENPV)
1713 /*
1714  * Remap all of the virtual pages created so far with the PTE_G bit.
1715  */
1716 static void
1717 pmap_remap_global(void)
1718 {
1719 	vaddr_t kva, kva_end;
1720 	unsigned long p1i;
1721 	size_t i;
1722 
1723 	/* head */
1724 	kva = bootspace.head.va;
1725 	kva_end = kva + bootspace.head.sz;
1726 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1727 		p1i = pl1_i(kva);
1728 		if (pmap_valid_entry(PTE_BASE[p1i]))
1729 			PTE_BASE[p1i] |= pmap_pg_g;
1730 	}
1731 
1732 	/* kernel segments */
1733 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1734 		if (bootspace.segs[i].type == BTSEG_NONE) {
1735 			continue;
1736 		}
1737 		kva = bootspace.segs[i].va;
1738 		kva_end = kva + bootspace.segs[i].sz;
1739 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1740 			p1i = pl1_i(kva);
1741 			if (pmap_valid_entry(PTE_BASE[p1i]))
1742 				PTE_BASE[p1i] |= pmap_pg_g;
1743 		}
1744 	}
1745 
1746 	/* boot space */
1747 	kva = bootspace.boot.va;
1748 	kva_end = kva + bootspace.boot.sz;
1749 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1750 		p1i = pl1_i(kva);
1751 		if (pmap_valid_entry(PTE_BASE[p1i]))
1752 			PTE_BASE[p1i] |= pmap_pg_g;
1753 	}
1754 }
1755 #endif
1756 
1757 #ifndef XENPV
1758 /*
1759  * Remap several kernel segments with large pages. We cover as many pages as we
1760  * can. Called only once at boot time, if the CPU supports large pages.
1761  */
1762 static void
1763 pmap_remap_largepages(void)
1764 {
1765 	pd_entry_t *pde;
1766 	vaddr_t kva, kva_end;
1767 	paddr_t pa;
1768 	size_t i;
1769 
1770 	/* Remap the kernel text using large pages. */
1771 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1772 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1773 			continue;
1774 		}
1775 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1776 		if (kva < bootspace.segs[i].va) {
1777 			continue;
1778 		}
1779 		kva_end = rounddown(bootspace.segs[i].va +
1780 			bootspace.segs[i].sz, NBPD_L2);
1781 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1782 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1783 			pde = &L2_BASE[pl2_i(kva)];
1784 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1785 			tlbflushg();
1786 		}
1787 	}
1788 
1789 	/* Remap the kernel rodata using large pages. */
1790 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1791 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1792 			continue;
1793 		}
1794 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1795 		if (kva < bootspace.segs[i].va) {
1796 			continue;
1797 		}
1798 		kva_end = rounddown(bootspace.segs[i].va +
1799 			bootspace.segs[i].sz, NBPD_L2);
1800 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1801 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1802 			pde = &L2_BASE[pl2_i(kva)];
1803 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1804 			tlbflushg();
1805 		}
1806 	}
1807 
1808 	/* Remap the kernel data+bss using large pages. */
1809 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1810 		if (bootspace.segs[i].type != BTSEG_DATA) {
1811 			continue;
1812 		}
1813 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1814 		if (kva < bootspace.segs[i].va) {
1815 			continue;
1816 		}
1817 		kva_end = rounddown(bootspace.segs[i].va +
1818 			bootspace.segs[i].sz, NBPD_L2);
1819 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1820 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1821 			pde = &L2_BASE[pl2_i(kva)];
1822 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1823 			tlbflushg();
1824 		}
1825 	}
1826 }
1827 #endif /* !XENPV */
1828 
1829 /*
1830  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1831  * to manage mappings.
1832  */
1833 void
1834 pmap_init(void)
1835 {
1836 	int flags;
1837 
1838 	/*
1839 	 * initialize caches.
1840 	 */
1841 
1842 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1843 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1844 
1845 #ifdef XENPV
1846 	/*
1847 	 * pool_cache(9) should not touch cached objects, since they
1848 	 * are pinned on xen and R/O for the domU
1849 	 */
1850 	flags = PR_NOTOUCH;
1851 #else
1852 	flags = 0;
1853 #endif
1854 
1855 #ifdef PAE
1856 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1857 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1858 #else
1859 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1860 	    "pdppl", NULL, IPL_NONE);
1861 #endif
1862 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1863 	     0, 0, "pvpage", &pool_allocator_kmem,
1864 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1865 
1866 	pmap_tlb_init();
1867 
1868 	/* XXX: Done here because cpu_hatch() runs only for secondary CPUs. */
1869 	pmap_tlb_cpu_init(curcpu());
1870 
1871 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1872 	    NULL, "x86", "io bitmap copy");
1873 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1874 	    NULL, "x86", "ldt sync");
1875 
1876 	/*
1877 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1878 	 * to hang a tree of pv_entry records.  Dynamically allocated
1879 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1880 	 * usual case is embedded), so cop out and use a single RB tree
1881 	 * to cover them.
1882 	 */
1883 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1884 
1885 	/*
1886 	 * done: pmap module is up (and ready for business)
1887 	 */
1888 
1889 	pmap_initialized = true;
1890 }
1891 
1892 #ifndef XENPV
1893 /*
1894  * pmap_cpu_init_late: perform late per-CPU initialization.
1895  */
1896 void
1897 pmap_cpu_init_late(struct cpu_info *ci)
1898 {
1899 	/*
1900 	 * The BP already has its own PD page allocated during early
1901 	 * MD startup.
1902 	 */
1903 	if (ci == &cpu_info_primary)
1904 		return;
1905 #ifdef PAE
1906 	cpu_alloc_l3_page(ci);
1907 #endif
1908 }
1909 #endif
1910 
1911 #ifndef __HAVE_DIRECT_MAP
1912 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1913 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1914 
1915 static void
1916 pmap_vpage_cpualloc(struct cpu_info *ci)
1917 {
1918 	bool primary = (ci == &cpu_info_primary);
1919 	size_t i, npages;
1920 	vaddr_t vabase;
1921 	vsize_t vrange;
1922 
1923 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1924 	KASSERT(npages >= VPAGE_MAX);
1925 	vrange = npages * PAGE_SIZE;
1926 
1927 	if (primary) {
1928 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1929 			/* Waste some pages to align properly */
1930 		}
1931 		/* The base is aligned, allocate the rest (contiguous) */
1932 		pmap_bootstrap_valloc(npages - 1);
1933 	} else {
1934 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1935 		    UVM_KMF_VAONLY);
1936 		if (vabase == 0) {
1937 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1938 			    __func__, cpu_index(ci));
1939 		}
1940 	}
1941 
1942 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1943 
1944 	for (i = 0; i < VPAGE_MAX; i++) {
1945 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1946 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1947 	}
1948 }
1949 
1950 void
1951 pmap_vpage_cpu_init(struct cpu_info *ci)
1952 {
1953 	if (ci == &cpu_info_primary) {
1954 		/* cpu0 already taken care of in pmap_bootstrap */
1955 		return;
1956 	}
1957 
1958 	pmap_vpage_cpualloc(ci);
1959 }
1960 #endif
1961 
1962 /*
1963  * p v _ e n t r y   f u n c t i o n s
1964  */
1965 
1966 /*
1967  * pmap_pvp_ctor: pool_cache constructor for PV pages.
1968  */
1969 static int
1970 pmap_pvp_ctor(void *arg, void *obj, int flags)
1971 {
1972 	struct pv_page *pvp = (struct pv_page *)obj;
1973 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
1974 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
1975 
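	/*
	 * The pv_page header lives in the first pv_entry-sized slot of the
	 * page; the remaining PVE_PER_PVP slots become free pv_entries.
	 */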
1976 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
1977 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
1978 
1979 	LIST_INIT(&pvp->pvp_pves);
1980 	pvp->pvp_nfree = PVE_PER_PVP;
1981 	pvp->pvp_pmap = NULL;
1982 
1983 	for (; pve < maxpve; pve++) {
1984 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
1985 	}
1986 
1987 	return 0;
1988 }
1989 
1990 /*
1991  * pmap_pvp_dtor: pool_cache destructor for PV pages.
1992  */
1993 static void
1994 pmap_pvp_dtor(void *arg, void *obj)
1995 {
1996 	struct pv_page *pvp __diagused = obj;
1997 
1998 	KASSERT(pvp->pvp_pmap == NULL);
1999 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2000 }
2001 
2002 /*
2003  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
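 *
 * => PV pages sit on three per-pmap lists: pm_pvp_part (some entries
 *    free), pm_pvp_empty (no entries free) and pm_pvp_full (all entries
 *    free, releasable by pmap_drain_pv())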
2004  */
2005 static struct pv_entry *
2006 pmap_alloc_pv(struct pmap *pmap)
2007 {
2008 	struct pv_entry *pve;
2009 	struct pv_page *pvp;
2010 
2011 	KASSERT(mutex_owned(&pmap->pm_lock));
2012 
2013 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2014 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2015 			LIST_REMOVE(pvp, pvp_list);
2016 		} else {
2017 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2018 		}
2019 		if (__predict_false(pvp == NULL)) {
2020 			return NULL;
2021 		}
2022 		/* full -> part */
2023 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2024 		pvp->pvp_pmap = pmap;
2025 	}
2026 
2027 	KASSERT(pvp->pvp_pmap == pmap);
2028 	KASSERT(pvp->pvp_nfree > 0);
2029 
2030 	pve = LIST_FIRST(&pvp->pvp_pves);
2031 	LIST_REMOVE(pve, pve_list);
2032 	pvp->pvp_nfree--;
2033 
2034 	if (__predict_false(pvp->pvp_nfree == 0)) {
2035 		/* part -> empty */
2036 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2037 		LIST_REMOVE(pvp, pvp_list);
2038 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2039 	} else {
2040 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2041 	}
2042 
2043 	return pve;
2044 }
2045 
2046 /*
2047  * pmap_free_pv: delayed free of a PV entry.
2048  */
2049 static void
2050 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2051 {
2052 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2053 
2054 	KASSERT(mutex_owned(&pmap->pm_lock));
2055 	KASSERT(pvp->pvp_pmap == pmap);
2056 	KASSERT(pvp->pvp_nfree >= 0);
2057 
2058 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2059 	pvp->pvp_nfree++;
2060 
2061 	if (__predict_false(pvp->pvp_nfree == 1)) {
2062 		/* empty -> part */
2063 		LIST_REMOVE(pvp, pvp_list);
2064 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2065 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2066 		/* part -> full */
2067 		LIST_REMOVE(pvp, pvp_list);
2068 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2069 	}
2070 }
2071 
2072 /*
2073  * pmap_drain_pv: free full PV pages.
2074  */
2075 static void
2076 pmap_drain_pv(struct pmap *pmap)
2077 {
2078 	struct pv_page *pvp;
2079 
2080 	KASSERT(mutex_owned(&pmap->pm_lock));
2081 
2082 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2083 		LIST_REMOVE(pvp, pvp_list);
2084 		KASSERT(pvp->pvp_pmap == pmap);
2085 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2086 		pvp->pvp_pmap = NULL;
2087 		pool_cache_put(&pmap_pvp_cache, pvp);
2088 	}
2089 }
2090 
2091 /*
2092  * pmap_check_pv: verify that the {PTP, VA} pair is tracked/untracked by the page, as expected
2093  */
2094 static void
2095 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2096     vaddr_t va, bool tracked)
2097 {
2098 #ifdef DEBUG
2099 	struct pv_pte *pvpte;
2100 
2101 	PMAP_CHECK_PP(pp);
2102 
2103 	mutex_spin_enter(&pp->pp_lock);
2104 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2105 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2106 			break;
2107 		}
2108 	}
2109 	mutex_spin_exit(&pp->pp_lock);
2110 
2111 	if (pvpte && !tracked) {
2112 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2113 	} else if (!pvpte && tracked) {
2114 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2115 	}
2116 #endif
2117 }
2118 
2119 /*
2120  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2121  *
2122  * => pmap must be locked
2123  */
2124 static struct pv_entry *
2125 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2126     const rb_tree_t *tree, const vaddr_t va)
2127 {
2128 	struct pv_entry *pve;
2129 	rb_node_t *node;
2130 
2131 	/*
2132 	 * Inlined lookup tailored to exactly what's needed here; it is
2133 	 * quite a bit faster than using rb_tree_find_node().
2134 	 */
2135 	for (node = tree->rbt_root;;) {
2136 		if (__predict_false(RB_SENTINEL_P(node))) {
2137 			return NULL;
2138 		}
2139 		pve = (struct pv_entry *)
2140 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2141 		if (pve->pve_pte.pte_va == va) {
2142 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2143 			return pve;
2144 		}
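		/* The comparison picks the left (0) or right (1) child. */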
2145 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2146 	}
2147 }
2148 
2149 /*
2150  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2151  *
2152  * => a PV entry must be known present (doesn't check for existence)
2153  * => pmap must be locked
2154  */
2155 static struct pv_entry *
2156 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2157     const struct pmap_page * const old_pp, const vaddr_t va)
2158 {
2159 	struct pv_entry *pve;
2160 	const rb_tree_t *tree;
2161 
2162 	KASSERT(mutex_owned(&pmap->pm_lock));
2163 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2164 
2165 	/*
2166 	 * [This mostly deals with the case of process-private pages, i.e.
2167 	 * anonymous memory allocations or COW.]
2168 	 *
2169 	 * If the page is tracked with an embedded entry then the tree
2170 	 * lookup can be avoided.  It's safe to check for this specific
2171 	 * set of values without pp_lock because both will only ever be
2172 	 * set together for this pmap.
2174 	 */
2175 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2176 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2177 		return NULL;
2178 	}
2179 
2180 	/*
2181 	 * [This mostly deals with shared mappings, for example shared libs
2182 	 * and executables.]
2183 	 *
2184 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2185 	 * look at the lowest numbered node in the tree first.  The tree is
2186 	 * known non-empty because of the check above.  For short lived
2187 	 * processes where pmap_remove() isn't used much this gets close to
2188 	 * a 100% hit rate.
2189 	 */
2190 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2191 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2192 	pve = (struct pv_entry *)
2193 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2194 	    offsetof(struct pv_entry, pve_rb));
2195 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2196 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2197 		return pve;
2198 	}
2199 
2200 	/* Search the RB tree for the key (uncommon). */
2201 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2202 }
2203 
2204 /*
2205  * pmap_enter_pv: enter a mapping onto a pmap_page list
2206  *
2207  * => pmap must be locked
2208  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2209  */
2210 static int
2211 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2212     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2213     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2214 {
2215 	struct pv_entry *pve;
2216 	int error;
2217 
2218 	KASSERT(mutex_owned(&pmap->pm_lock));
2219 	KASSERT(ptp_to_pmap(ptp) == pmap);
2220 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2221 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2222 	PMAP_CHECK_PP(pp);
2223 
2224 	/*
2225 	 * If entering the same page and it's already tracked with an
2226 	 * embedded entry, we can avoid the expense below.  It's safe
2227 	 * to check for this very specific set of values without a lock
2228 	 * because both will only ever be set together for this pmap.
2229 	 */
2230 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2231 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2232 		*samepage = true;
2233 		pmap_check_pv(pmap, ptp, pp, va, true);
2234 		return 0;
2235 	}
2236 
2237 	/*
2238 	 * Check for an existing dynamic mapping at this address.  If it's
2239 	 * for the same page, then it will be reused and nothing needs to be
2240 	 * changed.
2241 	 */
2242 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2243 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2244 		*samepage = true;
2245 		pmap_check_pv(pmap, ptp, pp, va, true);
2246 		return 0;
2247 	}
2248 
2249 	/*
2250 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2251 	 * case it's needed; won't know for sure until the lock is taken.
2252 	 */
2253 	if (pmap->pm_pve == NULL) {
2254 		pmap->pm_pve = pmap_alloc_pv(pmap);
2255 	}
2256 
2257 	error = 0;
2258 	pmap_check_pv(pmap, ptp, pp, va, false);
2259 	mutex_spin_enter(&pp->pp_lock);
2260 	if (!pv_pte_embedded(pp)) {
2261 		/*
2262 		 * Embedded PV tracking available - easy.
2263 		 */
2264 		pp->pp_pte.pte_ptp = ptp;
2265 		pp->pp_pte.pte_va = va;
2266 		*new_embedded = true;
2267 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2268 		/*
2269 		 * No memory.
2270 		 */
2271 		error = ENOMEM;
2272 	} else {
2273 		/*
2274 		 * Install new pv_entry on the page.
2275 		 */
2276 		pve = pmap->pm_pve;
2277 		pmap->pm_pve = NULL;
2278 		*new_pve = pve;
2279 		pve->pve_pte.pte_ptp = ptp;
2280 		pve->pve_pte.pte_va = va;
2281 		pve->pve_pp = pp;
2282 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2283 	}
2284 	mutex_spin_exit(&pp->pp_lock);
2285 	if (error == 0) {
2286 		pmap_check_pv(pmap, ptp, pp, va, true);
2287 	}
2288 
2289 	return error;
2290 }
2291 
2292 /*
2293  * pmap_remove_pv: try to remove a mapping from a pv_list
2294  *
2295  * => pmap must be locked
2296  * => removes dynamic entries from tree and frees them
2297  * => caller should adjust ptp's wire_count and free PTP if needed
2298  */
2299 static void
2300 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2301     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2302 {
2303 	rb_tree_t *tree = (ptp != NULL ?
2304 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2305 
2306 	KASSERT(mutex_owned(&pmap->pm_lock));
2307 	KASSERT(ptp_to_pmap(ptp) == pmap);
2308 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2309 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2310 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2311 
2312 	pmap_check_pv(pmap, ptp, pp, va, true);
2313 
2314 	if (pve == NULL) {
2315 		mutex_spin_enter(&pp->pp_lock);
2316 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2317 		KASSERT(pp->pp_pte.pte_va == va);
2318 		pp->pp_attrs |= oattrs;
2319 		pp->pp_pte.pte_ptp = NULL;
2320 		pp->pp_pte.pte_va = 0;
2321 		mutex_spin_exit(&pp->pp_lock);
2322 	} else {
2323 		mutex_spin_enter(&pp->pp_lock);
2324 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2325 		    pp->pp_pte.pte_va != va);
2326 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2327 		KASSERT(pve->pve_pte.pte_va == va);
2328 		KASSERT(pve->pve_pp == pp);
2329 		pp->pp_attrs |= oattrs;
2330 		LIST_REMOVE(pve, pve_list);
2331 		mutex_spin_exit(&pp->pp_lock);
2332 
2333 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2334 		rb_tree_remove_node(tree, pve);
2335 #ifdef DIAGNOSTIC
2336 		memset(pve, 0, sizeof(*pve));
2337 #endif
2338 		pmap_free_pv(pmap, pve);
2339 	}
2340 
2341 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2342 	pmap_check_pv(pmap, ptp, pp, va, false);
2343 }
2344 
2345 /*
2346  * p t p   f u n c t i o n s
2347  */
2348 
2349 static struct vm_page *
2350 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2351 {
2352 	int lidx = level - 1;
2353 	off_t off = ptp_va2o(va, level);
2354 	struct vm_page *pg;
2355 
2356 	KASSERT(mutex_owned(&pmap->pm_lock));
2357 
2358 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2359 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2360 		pg = pmap->pm_ptphint[lidx];
2361 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2362 		return pg;
2363 	}
2364 	PMAP_DUMMY_LOCK(pmap);
2365 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2366 	PMAP_DUMMY_UNLOCK(pmap);
2367 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2368 		/* This page is queued to be freed - ignore. */
2369 		pg = NULL;
2370 	}
2371 	if (pg != NULL) {
2372 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2373 	}
2374 	pmap->pm_ptphint[lidx] = pg;
2375 	return pg;
2376 }
2377 
2378 static inline void
2379 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2380 {
2381 	int lidx;
2382 
2383 	KASSERT(ptp->wire_count <= 1);
2384 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2385 
2386 	lidx = level - 1;
2387 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2388 	if (pmap->pm_ptphint[lidx] == ptp)
2389 		pmap->pm_ptphint[lidx] = NULL;
2390 	ptp->wire_count = 0;
2391 	ptp->uanon = NULL;
2392 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2393 
2394 	/*
2395 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2396 	 * the page from the uvm_object, as that can take further locks
2397 	 * (intolerable right now because the PTEs are likely mapped in).
2398 	 * Instead mark the PTP as free and if we bump into it again, we'll
2399 	 * either ignore or reuse (depending on what's useful at the time).
2400 	 */
2401 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2402 }
2403 
2404 static void
2405 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2406 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2407 {
2408 	unsigned long index;
2409 	int level;
2410 	vaddr_t invaladdr;
2411 	pd_entry_t opde;
2412 
2413 	KASSERT(pmap != pmap_kernel());
2414 	KASSERT(mutex_owned(&pmap->pm_lock));
2415 	KASSERT(kpreempt_disabled());
2416 
2417 	level = 1;
2418 	do {
2419 		index = pl_i(va, level + 1);
2420 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2421 
2422 		/*
2423 		 * On Xen-amd64 or SVS, we need to sync the top level page
2424 		 * directory on each CPU.
2425 		 */
2426 #if defined(XENPV) && defined(__x86_64__)
2427 		if (level == PTP_LEVELS - 1) {
2428 			xen_kpm_sync(pmap, index);
2429 		}
2430 #elif defined(SVS)
2431 		if (svs_enabled && level == PTP_LEVELS - 1) {
2432 			svs_pmap_sync(pmap, index);
2433 		}
2434 #endif
2435 
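		/*
		 * Invalidate the recursive-slot mapping of the PTP we just
		 * unhooked, on all CPUs using this pmap.
		 */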
2436 		invaladdr = level == 1 ? (vaddr_t)ptes :
2437 		    (vaddr_t)pdes[level - 2];
2438 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2439 		    opde, TLBSHOOT_FREE_PTP);
2440 
2441 #if defined(XENPV)
2442 		pmap_tlb_shootnow();
2443 #endif
2444 
2445 		pmap_freepage(pmap, ptp, level);
2446 		if (level < PTP_LEVELS - 1) {
2447 			ptp = pmap_find_ptp(pmap, va, level + 1);
2448 			ptp->wire_count--;
2449 			if (ptp->wire_count > 1)
2450 				break;
2451 		}
2452 	} while (++level < PTP_LEVELS);
2453 	pmap_pte_flush();
2454 }
2455 
2456 /*
2457  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2458  *
2459  * => pmap should NOT be pmap_kernel()
2460  * => pmap should be locked
2461  * => we are not touching any PTEs yet, so they need not be mapped in
2462  */
2463 static int
2464 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2465     int flags, struct vm_page **resultp)
2466 {
2467 	struct vm_page *ptp;
2468 	int i, aflags;
2469 	struct uvm_object *obj;
2470 	voff_t off;
2471 
2472 	KASSERT(pmap != pmap_kernel());
2473 	KASSERT(mutex_owned(&pmap->pm_lock));
2474 
2475 	/*
2476 	 * Loop through all page table levels allocating a page
2477 	 * for any level where we don't already have one.
2478 	 */
2479 	memset(pt, 0, sizeof(*pt));
2480 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2481 		UVM_PGA_ZERO;
2482 	for (i = PTP_LEVELS; i > 1; i--) {
2483 		obj = &pmap->pm_obj[i - 2];
2484 		off = ptp_va2o(va, i - 1);
2485 
2486 		PMAP_DUMMY_LOCK(pmap);
2487 		pt->pg[i] = uvm_pagelookup(obj, off);
2488 
2489 		if (pt->pg[i] == NULL) {
2490 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2491 			pt->alloced[i] = (pt->pg[i] != NULL);
2492 		} else if (pt->pg[i]->wire_count == 0) {
2493 			/* This page was queued to be freed; dequeue it. */
2494 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2495 			pt->alloced[i] = true;
2496 		}
2497 		PMAP_DUMMY_UNLOCK(pmap);
2498 		if (pt->pg[i] == NULL) {
2499 			pmap_unget_ptp(pmap, pt);
2500 			return ENOMEM;
2501 		} else if (pt->alloced[i]) {
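			/* Reuse uanon to track the lowest entered VA; start at the max. */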
2502 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2503 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2504 			    &pmap_rbtree_ops);
2505 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2506 		}
2507 	}
2508 	ptp = pt->pg[2];
2509 	KASSERT(ptp != NULL);
2510 	*resultp = ptp;
2511 	pmap->pm_ptphint[0] = ptp;
2512 	return 0;
2513 }
2514 
2515 /*
2516  * pmap_install_ptp: install any freshly allocated PTPs
2517  *
2518  * => pmap should NOT be pmap_kernel()
2519  * => pmap should be locked
2520  * => PTEs must be mapped
2521  * => preemption must be disabled
2522  */
2523 static void
2524 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2525     pd_entry_t * const *pdes)
2526 {
2527 	struct vm_page *ptp;
2528 	unsigned long index;
2529 	pd_entry_t *pva;
2530 	paddr_t pa;
2531 	int i;
2532 
2533 	KASSERT(pmap != pmap_kernel());
2534 	KASSERT(mutex_owned(&pmap->pm_lock));
2535 	KASSERT(kpreempt_disabled());
2536 
2537 	/*
2538 	 * Now that we have all the pages looked up or allocated,
2539 	 * loop through again installing any new ones into the tree.
2540 	 */
2541 	for (i = PTP_LEVELS; i > 1; i--) {
2542 		index = pl_i(va, i);
2543 		pva = pdes[i - 2];
2544 
2545 		if (pmap_valid_entry(pva[index])) {
2546 			KASSERT(!pt->alloced[i]);
2547 			continue;
2548 		}
2549 
2550 		ptp = pt->pg[i];
2551 		ptp->flags &= ~PG_BUSY; /* never busy */
2552 		ptp->wire_count = 1;
2553 		pmap->pm_ptphint[i - 2] = ptp;
2554 		pa = VM_PAGE_TO_PHYS(ptp);
2555 		pmap_pte_set(&pva[index], (pd_entry_t)
2556 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2557 
2558 		/*
2559 		 * On Xen-amd64 or SVS, we need to sync the top level page
2560 		 * directory on each CPU.
2561 		 */
2562 #if defined(XENPV) && defined(__x86_64__)
2563 		if (i == PTP_LEVELS) {
2564 			xen_kpm_sync(pmap, index);
2565 		}
2566 #elif defined(SVS)
2567 		if (svs_enabled && i == PTP_LEVELS) {
2568 			svs_pmap_sync(pmap, index);
2569 		}
2570 #endif
2571 
2572 		pmap_pte_flush();
2573 		pmap_stats_update(pmap, 1, 0);
2574 
2575 		/*
2576 		 * If we're not in the top level, increase the
2577 		 * wire count of the parent page.
2578 		 */
2579 		if (i < PTP_LEVELS) {
2580 			pt->pg[i + 1]->wire_count++;
2581 		}
2582 	}
2583 }
2584 
2585 /*
2586  * pmap_unget_ptp: free unused PTPs
2587  *
2588  * => pmap should NOT be pmap_kernel()
2589  * => pmap should be locked
2590  */
2591 static void
2592 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2593 {
2594 	int i;
2595 
2596 	KASSERT(pmap != pmap_kernel());
2597 	KASSERT(mutex_owned(&pmap->pm_lock));
2598 
2599 	for (i = PTP_LEVELS; i > 1; i--) {
2600 		if (!pt->alloced[i]) {
2601 			continue;
2602 		}
2603 		KASSERT(pt->pg[i]->wire_count == 0);
2604 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2605 		pmap_freepage(pmap, pt->pg[i], i - 1);
2606 	}
2607 }
2608 
2609 /*
2610  * p m a p   l i f e c y c l e   f u n c t i o n s
2611  */
2612 
2613 /*
2614  * pmap_pdp_init: constructor for a new PDP.
2615  */
2616 static void
2617 pmap_pdp_init(pd_entry_t *pdir)
2618 {
2619 	paddr_t pdirpa = 0;
2620 	vaddr_t object;
2621 	int i;
2622 
2623 #if !defined(XENPV) || !defined(__x86_64__)
2624 	int npde;
2625 #endif
2626 #ifdef XENPV
2627 	int s;
2628 #endif
2629 
2630 	memset(pdir, 0, PDP_SIZE * PAGE_SIZE);
2631 
2632 	/*
2633 	 * NOTE: This is all done unlocked, but we will check afterwards
2634 	 * if we have raced with pmap_growkernel().
2635 	 */
2636 
2637 #if defined(XENPV) && defined(__x86_64__)
2638 	/* Fetch the physical address of the page directory */
2639 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2640 
2641 	/*
2642 	 * This pdir will NEVER be active in kernel mode, so mark
2643 	 * recursive entry invalid.
2644 	 */
2645 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2646 
2647 	/*
2648 	 * A PDP constructed this way will never be used for the kernel,
2649 	 * hence we don't put the kernel mappings in it on Xen.
2650 	 *
2651 	 * But we need to make pmap_create() happy, so put a dummy
2652 	 * (without PTE_P) value at the right place.
2653 	 */
2654 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2655 	     (pd_entry_t)-1 & PTE_FRAME;
2656 #else /* XENPV && __x86_64__*/
2657 	object = (vaddr_t)pdir;
2658 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2659 		/* Fetch the physical address of the page directory */
2660 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2661 
2662 		/* Put in recursive PDE to map the PTEs */
2663 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2664 		    pmap_pg_nx;
2665 #ifndef XENPV
2666 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2667 #endif
2668 	}
2669 
2670 	/* Copy the kernel's top level PDE */
2671 	npde = nkptp[PTP_LEVELS - 1];
2672 
2673 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2674 	    npde * sizeof(pd_entry_t));
2675 
2676 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2677 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2678 		pdir[idx] = PDP_BASE[idx];
2679 	}
2680 
2681 #ifdef __HAVE_PCPU_AREA
2682 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2683 #endif
2684 #ifdef __HAVE_DIRECT_MAP
2685 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2686 #endif
2687 #ifdef KASAN
2688 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2689 #endif
2690 #ifdef KMSAN
2691 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2692 #endif
2693 #endif /* XENPV  && __x86_64__*/
2694 
2695 #ifdef XENPV
2696 	s = splvm();
2697 	object = (vaddr_t)pdir;
2698 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2699 	    VM_PROT_READ);
2700 	pmap_update(pmap_kernel());
2701 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2702 		/*
2703 		 * Pin as an L2/L4 page; we have to do the page with the
2704 		 * PDIR_SLOT_PTE entries last.
2705 		 */
2706 #ifdef PAE
2707 		if (i == l2tol3(PDIR_SLOT_PTE))
2708 			continue;
2709 #endif
2710 
2711 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2712 #ifdef __x86_64__
2713 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2714 #else
2715 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2716 #endif
2717 	}
2718 #ifdef PAE
2719 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2720 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2721 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2722 #endif
2723 	splx(s);
2724 #endif /* XENPV */
2725 }
2726 
2727 /*
2728  * pmap_pdp_fini: destructor for the PDPs.
2729  */
2730 static void
2731 pmap_pdp_fini(pd_entry_t *pdir)
2732 {
2733 #ifdef XENPV
2734 	paddr_t pdirpa = 0;	/* XXX: GCC */
2735 	vaddr_t object = (vaddr_t)pdir;
2736 	int i;
2737 	int s = splvm();
2738 	pt_entry_t *pte;
2739 
2740 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2741 		/* fetch the physical address of the page directory. */
2742 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2743 		/* unpin page table */
2744 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2745 	}
2746 	object = (vaddr_t)pdir;
2747 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2748 		/* Set page RW again */
2749 		pte = kvtopte(object);
2750 		pmap_pte_set(pte, *pte | PTE_W);
2751 		xen_bcast_invlpg((vaddr_t)object);
2752 	}
2753 	splx(s);
2754 #endif  /* XENPV */
2755 }
2756 
2757 #ifdef PAE
2758 static void *
2759 pmap_pdp_alloc(struct pool *pp, int flags)
2760 {
2761 	return (void *)uvm_km_alloc(kernel_map,
2762 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2763 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2764 	    UVM_KMF_WIRED);
2765 }
2766 
2767 static void
2768 pmap_pdp_free(struct pool *pp, void *v)
2769 {
2770 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2771 	    UVM_KMF_WIRED);
2772 }
2773 #endif /* PAE */
2774 
2775 /*
2776  * pmap_ctor: constructor for the pmap cache.
2777  */
2778 static int
2779 pmap_ctor(void *arg, void *obj, int flags)
2780 {
2781 	struct pmap *pmap = obj;
2782 	pt_entry_t p;
2783 	int i;
2784 
2785 	KASSERT((flags & PR_WAITOK) != 0);
2786 
2787 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2788 	rw_init(&pmap->pm_dummy_lock);
2789 	kcpuset_create(&pmap->pm_cpus, true);
2790 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2791 #ifdef XENPV
2792 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2793 #endif
2794 	LIST_INIT(&pmap->pm_gc_ptp);
2795 	pmap->pm_pve = NULL;
2796 	LIST_INIT(&pmap->pm_pvp_full);
2797 	LIST_INIT(&pmap->pm_pvp_part);
2798 	LIST_INIT(&pmap->pm_pvp_empty);
2799 
2800 	/* allocate and init PDP */
2801 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2802 
2803 	for (;;) {
2804 		pmap_pdp_init(pmap->pm_pdir);
2805 		mutex_enter(&pmaps_lock);
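		/*
		 * If pmap_growkernel() grew the kernel PD while we were
		 * initializing, the last required kernel slot reads as zero
		 * and we must redo the init to pick up the new entries.
		 */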
2806 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2807 		if (__predict_true(p != 0)) {
2808 			break;
2809 		}
2810 		mutex_exit(&pmaps_lock);
2811 	}
2812 
2813 	for (i = 0; i < PDP_SIZE; i++)
2814 		pmap->pm_pdirpa[i] =
2815 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2816 
2817 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2818 	mutex_exit(&pmaps_lock);
2819 
2820 	return 0;
2821 }
2822 
2823 /*
2824  * pmap_dtor: destructor for the pmap cache.
2825  */
2826 static void
2827 pmap_dtor(void *arg, void *obj)
2828 {
2829 	struct pmap *pmap = obj;
2830 
2831 	mutex_enter(&pmaps_lock);
2832 	LIST_REMOVE(pmap, pm_list);
2833 	mutex_exit(&pmaps_lock);
2834 
2835 	pmap_pdp_fini(pmap->pm_pdir);
2836 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2837 	mutex_destroy(&pmap->pm_lock);
2838 	rw_destroy(&pmap->pm_dummy_lock);
2839 	kcpuset_destroy(pmap->pm_cpus);
2840 	kcpuset_destroy(pmap->pm_kernel_cpus);
2841 #ifdef XENPV
2842 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2843 #endif
2844 }
2845 
2846 /*
2847  * pmap_create: create a pmap object.
2848  */
2849 struct pmap *
2850 pmap_create(void)
2851 {
2852 	struct pmap *pmap;
2853 	int i;
2854 
2855 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2856 
2857 	/* init uvm_object */
2858 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2859 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2860 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2861 		pmap->pm_ptphint[i] = NULL;
2862 	}
2863 	pmap->pm_stats.wired_count = 0;
2864 	/* count the PDP allocd below */
2865 	pmap->pm_stats.resident_count = PDP_SIZE;
2866 #if !defined(__x86_64__)
2867 	pmap->pm_hiexec = 0;
2868 #endif
2869 
2870 	/* Used by NVMM and Xen */
2871 	pmap->pm_enter = NULL;
2872 	pmap->pm_extract = NULL;
2873 	pmap->pm_remove = NULL;
2874 	pmap->pm_sync_pv = NULL;
2875 	pmap->pm_pp_remove_ent = NULL;
2876 	pmap->pm_write_protect = NULL;
2877 	pmap->pm_unwire = NULL;
2878 	pmap->pm_tlb_flush = NULL;
2879 	pmap->pm_data = NULL;
2880 
2881 	/* init the LDT */
2882 	pmap->pm_ldt = NULL;
2883 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2884 
2885 	return (pmap);
2886 }
2887 
2888 /*
2889  * pmap_check_ptps: verify that none of the pmap's page table objects
2890  * have any pages allocated to them.
2891  */
2892 static void
2893 pmap_check_ptps(struct pmap *pmap)
2894 {
2895 	int i;
2896 
2897 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2898 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2899 		    "pmap %p level %d still has %d pages",
2900 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2901 	}
2902 }
2903 
2904 static void
2905 pmap_check_inuse(struct pmap *pmap)
2906 {
2907 #ifdef DEBUG
2908 	CPU_INFO_ITERATOR cii;
2909 	struct cpu_info *ci;
2910 
2911 	for (CPU_INFO_FOREACH(cii, ci)) {
2912 		if (ci->ci_pmap == pmap)
2913 			panic("destroying pmap being used");
2914 #if defined(XENPV) && defined(__x86_64__)
2915 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2916 			if (pmap->pm_pdir[i] != 0 &&
2917 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2918 				printf("pmap_destroy(%p) pmap_kernel %p "
2919 				    "curcpu %d cpu %d ci_pmap %p "
2920 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2921 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2922 				    pmap, pmap_kernel(), curcpu()->ci_index,
2923 				    ci->ci_index, ci->ci_pmap,
2924 				    i, ci->ci_kpm_pdir[i],
2925 				    i, pmap->pm_pdir[i]);
2926 				panic("%s: used pmap", __func__);
2927 			}
2928 		}
2929 #endif
2930 	}
2931 #endif /* DEBUG */
2932 }
2933 
2934 /*
2935  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
2936  * count goes to zero.
2937  *
2938  * => we can be called from pmap_unmap_ptes() with a different, unrelated
2939  *    pmap's lock held.  be careful!
2940  */
2941 void
2942 pmap_destroy(struct pmap *pmap)
2943 {
2944 	int i;
2945 
2946 	/*
2947 	 * drop reference count and verify not in use.
2948 	 */
2949 
2950 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2951 		return;
2952 	}
2953 	pmap_check_inuse(pmap);
2954 
2955 	/*
2956 	 * handle any deferred frees.
2957 	 */
2958 
2959 	mutex_enter(&pmap->pm_lock);
2960 	if (pmap->pm_pve != NULL) {
2961 		pmap_free_pv(pmap, pmap->pm_pve);
2962 		pmap->pm_pve = NULL;
2963 	}
2964 	pmap_drain_pv(pmap);
2965 	mutex_exit(&pmap->pm_lock);
2966 	pmap_update(pmap);
2967 
2968 	/*
2969 	 * Reference count is zero, free pmap resources and then free pmap.
2970 	 */
2971 
2972 	pmap_check_ptps(pmap);
2973 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
2974 
2975 #ifdef USER_LDT
2976 	if (pmap->pm_ldt != NULL) {
2977 		/*
2978 		 * No need to switch the LDT; this address space is gone,
2979 		 * nothing is using it.
2980 		 *
2981 		 * No need to lock the pmap for ldt_free (or anything else),
2982 		 * we're the last one to use it.
2983 		 */
2984 		/* XXXAD can't take cpu_lock here - fix soon. */
2985 		mutex_enter(&cpu_lock);
2986 		ldt_free(pmap->pm_ldt_sel);
2987 		mutex_exit(&cpu_lock);
2988 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2989 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
2990 	}
2991 #endif
2992 
2993 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2994 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2995 	}
2996 	kcpuset_zero(pmap->pm_cpus);
2997 	kcpuset_zero(pmap->pm_kernel_cpus);
2998 #ifdef XENPV
2999 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3000 #endif
3001 
3002 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3003 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3004 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3005 
3006 	pmap_check_ptps(pmap);
3007 	if (__predict_false(pmap->pm_enter != NULL)) {
3008 		/* XXX make this a different cache */
3009 		pool_cache_destruct_object(&pmap_cache, pmap);
3010 	} else {
3011 		pool_cache_put(&pmap_cache, pmap);
3012 	}
3013 }
3014 
3015 /*
3016  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3017  *
3018  * => caller must hold pmap's lock
3019  * => PTP must be mapped into KVA
3020  * => must be called with kernel preemption disabled
3021  * => does as little work as possible
3022  */
3023 static void
3024 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3025     vaddr_t startva, vaddr_t blkendva)
3026 {
3027 #ifndef XENPV
3028 	struct pv_entry *pve;
3029 	struct vm_page *pg;
3030 	struct pmap_page *pp;
3031 	pt_entry_t opte;
3032 	rb_tree_t *tree;
3033 	vaddr_t va;
3034 	int wired;
3035 	uint8_t oattrs;
3036 	u_int cnt;
3037 
3038 	KASSERT(mutex_owned(&pmap->pm_lock));
3039 	KASSERT(kpreempt_disabled());
3040 	KASSERT(pmap != pmap_kernel());
3041 	KASSERT(ptp->wire_count > 1);
3042 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3043 
3044 	/*
3045 	 * Start at the lowest entered VA, and scan until there are no more
3046 	 * PTEs in the PTPs.
3047 	 */
3048 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3049 	pve = RB_TREE_MIN(tree);
3050 	wired = 0;
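	/* The PTP's uanon field is reused to track the lowest entered VA. */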
3051 	va = (vaddr_t)ptp->uanon;
3052 	pte += ((va - startva) >> PAGE_SHIFT);
3053 
3054 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3055 		/*
3056 		 * No need for an atomic to clear the PTE.  Nothing else can
3057 		 * see the address space any more and speculative access (if
3058 		 * possible) won't modify.  Therefore there's no need to
3059 		 * track the accessed/dirty bits.
3060 		 */
3061 		opte = *pte;
3062 		if (!pmap_valid_entry(opte)) {
3063 			continue;
3064 		}
3065 
3066 		/*
3067 		 * Count the PTE.  If it's not for a managed mapping
3068 		 * there's nothing more to do.
3069 		 */
3070 		cnt--;
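		/*
		 * "wired" accumulates -PTE_WIRED per wired mapping and is
		 * converted back to a count for pmap_stats_update() below.
		 */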
3071 		wired -= (opte & PTE_WIRED);
3072 		if ((opte & PTE_PVLIST) == 0) {
3073 #ifndef DOM0OPS
3074 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3075 			    "managed page without PTE_PVLIST for %#"
3076 			    PRIxVADDR, va);
3077 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3078 			    "pv-tracked page without PTE_PVLIST for %#"
3079 			    PRIxVADDR, va);
3080 #endif
3081 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3082 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3083 			    va) == NULL);
3084 			continue;
3085 		}
3086 
3087 		/*
3088 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3089 		 * in the PTP.  If it's for this VA, take advantage of it to
3090 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3091 		 * tree by skipping to the next VA in the tree whenever
3092 		 * there is a match here.  The tree will be cleared out in
3093 		 * one pass before return to pmap_remove_all().
3094 		 */
3095 		oattrs = pmap_pte_to_pp_attrs(opte);
3096 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3097 			pp = pve->pve_pp;
3098 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3099 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3100 			    pp->pp_pte.pte_va != va);
3101 			mutex_spin_enter(&pp->pp_lock);
3102 			pp->pp_attrs |= oattrs;
3103 			LIST_REMOVE(pve, pve_list);
3104 			mutex_spin_exit(&pp->pp_lock);
3105 
3106 			/*
3107 			 * pve won't be touched again until pmap_drain_pv(),
3108 			 * so it's still safe to traverse the tree.
3109 			 */
3110 			pmap_free_pv(pmap, pve);
3111 			pve = RB_TREE_NEXT(tree, pve);
3112 			continue;
3113 		}
3114 
3115 		/*
3116 		 * No entry in the tree so it must be embedded.  Look up the
3117 		 * page and cancel the embedded entry.
3118 		 */
3119 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3120 			pp = VM_PAGE_TO_PP(pg);
3121 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3122 			paddr_t pa = pmap_pte2pa(opte);
3123 			panic("%s: PTE_PVLIST with pv-untracked page"
3124 			    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR
3125 			    " (%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3126 		}
3127 		mutex_spin_enter(&pp->pp_lock);
3128 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3129 		KASSERT(pp->pp_pte.pte_va == va);
3130 		pp->pp_attrs |= oattrs;
3131 		pp->pp_pte.pte_ptp = NULL;
3132 		pp->pp_pte.pte_va = 0;
3133 		mutex_spin_exit(&pp->pp_lock);
3134 	}
3135 
3136 	/* PTP now empty - adjust the tree & stats to match. */
3137 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3138 	ptp->wire_count = 1;
3139 #ifdef DIAGNOSTIC
3140 	rb_tree_init(tree, &pmap_rbtree_ops);
3141 #endif
3142 #else	/* !XENPV */
3143 	/*
3144 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3145 	 * I guess the hypervisor keeps track of PTEs too.
3146 	 */
3147 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3148 #endif	/* !XENPV */
3149 }
3150 
3151 /*
3152  * pmap_remove_all: remove all mappings from pmap in bulk.
3153  *
3154  * Ordinarily when removing mappings it's important to hold the UVM object's
3155  * lock, so that pages do not gain a new identity while retaining stale TLB
3156  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3157  * Here it's known that the address space is no longer visible to any user
3158  * process, so we don't need to worry about that.
3159  */
3160 bool
3161 pmap_remove_all(struct pmap *pmap)
3162 {
3163 	struct vm_page *ptps[32];
3164 	vaddr_t va, blkendva;
3165 	struct pmap *pmap2;
3166 	pt_entry_t *ptes;
3167 	pd_entry_t pde __diagused;
3168 	pd_entry_t * const *pdes;
3169 	int lvl __diagused, i, n;
3170 
3171 	/* XXX Can't handle EPT just yet. */
3172 	if (pmap->pm_remove != NULL) {
3173 		return false;
3174 	}
3175 
3176 	for (;;) {
3177 		/* Fetch a block of PTPs from tree. */
3178 		mutex_enter(&pmap->pm_lock);
3179 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3180 		    (void **)ptps, __arraycount(ptps), false);
3181 		if (n == 0) {
3182 			mutex_exit(&pmap->pm_lock);
3183 			break;
3184 		}
3185 
3186 		/* Remove all mappings in the set of PTPs. */
3187 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3188 		for (i = 0; i < n; i++) {
3189 			if (ptps[i]->wire_count == 0) {
3190 				/* It's dead: pmap_update() will expunge. */
3191 				continue;
3192 			}
3193 
3194 			/* Determine range of block. */
3195 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3196 			blkendva = x86_round_pdr(va + 1);
3197 
3198 			/* Make sure everything squares up... */
3199 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3200 			KASSERT(lvl == 1);
3201 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3202 
3203 			/* Zap! */
3204 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3205 			    blkendva);
3206 
3207 			/* PTP should now be unused - free it. */
3208 			KASSERT(ptps[i]->wire_count == 1);
3209 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3210 		}
3211 		pmap_unmap_ptes(pmap, pmap2);
3212 		pmap_drain_pv(pmap);
3213 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3214 		mutex_exit(&pmap->pm_lock);
3215 
3216 		/* Process deferred frees. */
3217 		pmap_update(pmap);
3218 
3219 		/* A breathing point. */
3220 		preempt_point();
3221 	}
3222 
3223 	/* Verify that the pmap is now completely empty. */
3224 	pmap_check_ptps(pmap);
3225 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3226 	    "pmap %p not empty", pmap);
3227 
3228 	return true;
3229 }
3230 
3231 #if defined(PMAP_FORK)
3232 /*
3233  * pmap_fork: perform any necessary data structure manipulation when
3234  * a VM space is forked.
3235  */
3236 void
3237 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3238 {
3239 #ifdef USER_LDT
3240 	union descriptor *new_ldt;
3241 	int sel;
3242 
3243 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3244 		return;
3245 	}
3246 
3247 	/*
3248 	 * Copy the LDT into the new process.
3249 	 *
3250 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3251 	 * we'll retry. This will starve if there's a stream of LDT changes
3252 	 * in another thread but that should not happen.
3253 	 */
3254 
3255 retry:
3256 	if (pmap1->pm_ldt != NULL) {
3257 		/* Allocate space for the new process's LDT */
3258 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3259 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3260 		if (new_ldt == NULL) {
3261 			printf("WARNING: %s: unable to allocate LDT space\n",
3262 			    __func__);
3263 			return;
3264 		}
3265 		mutex_enter(&cpu_lock);
3266 		/* Get a GDT slot for it */
3267 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3268 		if (sel == -1) {
3269 			mutex_exit(&cpu_lock);
3270 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3271 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3272 			printf("WARNING: %s: unable to allocate LDT selector\n",
3273 			    __func__);
3274 			return;
3275 		}
3276 	} else {
3277 		/* Wasn't anything there after all. */
3278 		new_ldt = NULL;
3279 		sel = -1;
3280 		mutex_enter(&cpu_lock);
3281 	}
3282 
3283 	/*
3284 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3285 	 */
3286 	if (pmap1->pm_ldt != NULL) {
3287 		if (new_ldt == NULL) {
3288 			/* A wild LDT just appeared. */
3289 			mutex_exit(&cpu_lock);
3290 			goto retry;
3291 		}
3292 
3293 		/* Copy the LDT data and install it in pmap2 */
3294 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3295 		pmap2->pm_ldt = new_ldt;
3296 		pmap2->pm_ldt_sel = sel;
3297 		mutex_exit(&cpu_lock);
3298 	} else {
3299 		if (new_ldt != NULL) {
3300 			/* The LDT disappeared, drop what we did. */
3301 			ldt_free(sel);
3302 			mutex_exit(&cpu_lock);
3303 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3304 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3305 			return;
3306 		}
3307 
3308 		/* We're good, just leave. */
3309 		mutex_exit(&cpu_lock);
3310 	}
3311 #endif /* USER_LDT */
3312 }
3313 #endif /* PMAP_FORK */
3314 
3315 #ifdef USER_LDT
3316 
3317 /*
3318  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3319  * is active, reload LDTR.
3320  */
3321 static void
3322 pmap_ldt_xcall(void *arg1, void *arg2)
3323 {
3324 	struct pmap *pm;
3325 
3326 	kpreempt_disable();
3327 	pm = arg1;
3328 	if (curcpu()->ci_pmap == pm) {
3329 #if defined(SVS)
3330 		if (svs_enabled) {
3331 			svs_ldt_sync(pm);
3332 		} else
3333 #endif
3334 		lldt(pm->pm_ldt_sel);
3335 	}
3336 	kpreempt_enable();
3337 }
3338 
3339 /*
3340  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3341  * in the new selector on all CPUs.
3342  */
3343 void
3344 pmap_ldt_sync(struct pmap *pm)
3345 {
3346 	uint64_t where;
3347 
3348 	KASSERT(mutex_owned(&cpu_lock));
3349 
3350 	pmap_ldt_evcnt.ev_count++;
3351 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3352 	xc_wait(where);
3353 }
3354 
3355 /*
3356  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3357  * restore the default.
3358  */
3359 void
3360 pmap_ldt_cleanup(struct lwp *l)
3361 {
3362 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3363 	union descriptor *ldt;
3364 	int sel;
3365 
3366 	if (__predict_true(pmap->pm_ldt == NULL)) {
3367 		return;
3368 	}
3369 
3370 	mutex_enter(&cpu_lock);
3371 	if (pmap->pm_ldt != NULL) {
3372 		sel = pmap->pm_ldt_sel;
3373 		ldt = pmap->pm_ldt;
3374 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3375 		pmap->pm_ldt = NULL;
3376 		pmap_ldt_sync(pmap);
3377 		ldt_free(sel);
3378 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3379 		    UVM_KMF_WIRED);
3380 	}
3381 	mutex_exit(&cpu_lock);
3382 }
3383 #endif /* USER_LDT */
3384 
3385 /*
3386  * pmap_activate: activate a process' pmap
3387  *
3388  * => must be called with kernel preemption disabled
3389  * => if lwp is the curlwp, then set ci_want_pmapload so that
3390  *    actual MMU context switch will be done by pmap_load() later
3391  */
3392 void
3393 pmap_activate(struct lwp *l)
3394 {
3395 	struct cpu_info *ci;
3396 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3397 
3398 	KASSERT(kpreempt_disabled());
3399 
3400 	ci = curcpu();
3401 
3402 	if (l != ci->ci_curlwp)
3403 		return;
3404 
3405 	KASSERT(ci->ci_want_pmapload == 0);
3406 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3407 
3408 	/*
3409 	 * no need to switch to kernel vmspace because
3410 	 * it's a subset of any vmspace.
3411 	 */
3412 
3413 	if (pmap == pmap_kernel()) {
3414 		ci->ci_want_pmapload = 0;
3415 		return;
3416 	}
3417 
3418 	ci->ci_want_pmapload = 1;
3419 }
3420 
3421 #if defined(XENPV) && defined(__x86_64__)
3422 #define	KASSERT_PDIRPA(pmap) \
3423 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3424 	    pmap == pmap_kernel())
3425 #elif defined(PAE)
3426 #define	KASSERT_PDIRPA(pmap) \
3427 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3428 #elif !defined(XENPV)
3429 #define	KASSERT_PDIRPA(pmap) \
3430 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3431 #else
3432 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
3433 #endif
3434 
3435 /*
3436  * pmap_reactivate: try to regain reference to the pmap.
3437  *
3438  * => Must be called with kernel preemption disabled.
3439  */
3440 static void
3441 pmap_reactivate(struct pmap *pmap)
3442 {
3443 	struct cpu_info * const ci = curcpu();
3444 	const cpuid_t cid = cpu_index(ci);
3445 
3446 	KASSERT(kpreempt_disabled());
3447 	KASSERT_PDIRPA(pmap);
3448 
3449 	/*
3450 	 * If we still have a lazy reference to this pmap, we can assume
3451 	 * that there was no TLB shootdown for this pmap in the meantime.
3452 	 *
3453 	 * The order of events here is important as we must synchronize
3454 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3455 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3456 	 * change only when the state is TLBSTATE_LAZY.
3457 	 */
3458 
3459 	ci->ci_tlbstate = TLBSTATE_VALID;
3460 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3461 
3462 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3463 		/* We have the reference, state is valid. */
3464 	} else {
3465 		/*
3466 		 * Must reload the TLB: the pmap has been changed while
3467 		 * it was deactivated.
3468 		 */
3469 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3470 
3471 		tlbflush();
3472 	}
3473 }
3474 
3475 /*
3476  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3477  * and relevant LDT info.
3478  *
3479  * Ensures that the current process' pmap is loaded on the current CPU's
3480  * MMU and that there are no stale TLB entries.
3481  *
3482  * => The caller should disable kernel preemption or do check-and-retry
3483  *    to prevent a preemption from undoing our efforts.
3484  * => This function may block.
3485  */
3486 void
3487 pmap_load(void)
3488 {
3489 	struct cpu_info *ci;
3490 	struct pmap *pmap, *oldpmap;
3491 	struct lwp *l;
3492 	uint64_t ncsw;
3493 
3494 	kpreempt_disable();
3495  retry:
3496 	ci = curcpu();
3497 	if (!ci->ci_want_pmapload) {
3498 		kpreempt_enable();
3499 		return;
3500 	}
3501 	l = ci->ci_curlwp;
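	/* Snapshot the context switch count so we can detect blocking below. */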
3502 	ncsw = l->l_ncsw;
3503 	__insn_barrier();
3504 
3505 	/* should be able to take ipis. */
3506 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3507 #ifdef XENPV
3508 	/* Check that interrupts are enabled (i.e. no events are masked) */
3509 	KASSERT(x86_read_psl() == 0);
3510 #else
3511 	KASSERT((x86_read_psl() & PSL_I) != 0);
3512 #endif
3513 
3514 	KASSERT(l != NULL);
3515 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3516 	KASSERT(pmap != pmap_kernel());
3517 	oldpmap = ci->ci_pmap;
3518 
3519 	if (pmap == oldpmap) {
3520 		pmap_reactivate(pmap);
3521 		ci->ci_want_pmapload = 0;
3522 		kpreempt_enable();
3523 		return;
3524 	}
3525 
3526 	/*
3527 	 * Acquire a reference to the new pmap and perform the switch.
3528 	 */
3529 
3530 	pmap_reference(pmap);
3531 	pmap_load1(l, pmap, oldpmap);
3532 	ci->ci_want_pmapload = 0;
3533 
3534 	/*
3535 	 * we're now running with the new pmap.  drop the reference
3536 	 * to the old pmap.  if we block, we need to go around again.
3537 	 */
3538 
3539 	pmap_destroy(oldpmap);
3540 	__insn_barrier();
3541 	if (l->l_ncsw != ncsw) {
3542 		goto retry;
3543 	}
3544 
3545 	kpreempt_enable();
3546 }
3547 
3548 /*
3549  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3550  * pmap_load().  It's critically important that this function does not
3551  * block.
3552  */
3553 static void
3554 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3555 {
3556 	struct cpu_info *ci;
3557 	struct pcb *pcb;
3558 	cpuid_t cid;
3559 
3560 	KASSERT(kpreempt_disabled());
3561 
3562 	pcb = lwp_getpcb(l);
3563 	ci = l->l_cpu;
3564 	cid = cpu_index(ci);
3565 
3566 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3567 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3568 
3569 	KASSERT_PDIRPA(oldpmap);
3570 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3571 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3572 
3573 	/*
3574 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3575 	 * with TLB shootdown interrupts, so set the state VALID first,
3576 	 * then register us for shootdown events on this pmap.
3577 	 */
3578 	ci->ci_tlbstate = TLBSTATE_VALID;
3579 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3580 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3581 	ci->ci_pmap = pmap;
3582 
3583 	/*
3584 	 * update tss.  now that we have registered for invalidations
3585 	 * from other CPUs, we're good to load the page tables.
3586 	 */
3587 #ifdef PAE
3588 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3589 #else
3590 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3591 #endif
3592 
3593 #ifdef i386
3594 #ifndef XENPV
3595 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3596 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3597 #endif
3598 #endif
3599 
3600 #if defined(SVS) && defined(USER_LDT)
3601 	if (svs_enabled) {
3602 		svs_ldt_sync(pmap);
3603 	} else
3604 #endif
3605 	lldt(pmap->pm_ldt_sel);
3606 
3607 	cpu_load_pmap(pmap, oldpmap);
3608 }
3609 
3610 /*
3611  * pmap_deactivate: deactivate a process' pmap.
3612  *
3613  * => Must be called with kernel preemption disabled (high IPL is enough).
3614  */
3615 void
3616 pmap_deactivate(struct lwp *l)
3617 {
3618 	struct pmap *pmap;
3619 	struct cpu_info *ci;
3620 
3621 	KASSERT(kpreempt_disabled());
3622 
3623 	if (l != curlwp) {
3624 		return;
3625 	}
3626 
3627 	/*
3628 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3629 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3630 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3631 	 * to kernel preemption or blocking routine in between.
3632 	 */
3633 	pmap_tlb_shootnow();
3634 
3635 	ci = curcpu();
3636 
3637 	if (ci->ci_want_pmapload) {
3638 		/*
3639 		 * ci_want_pmapload means that our pmap is not loaded on
3640 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3641 		 * is always considered loaded.
3642 		 */
3643 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3644 		    != pmap_kernel());
3645 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3646 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3647 
3648 		/*
3649 		 * userspace has not been touched.
3650 		 * nothing to do here.
3651 		 */
3652 
3653 		ci->ci_want_pmapload = 0;
3654 		return;
3655 	}
3656 
3657 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3658 
3659 	if (pmap == pmap_kernel()) {
3660 		return;
3661 	}
3662 
3663 	KASSERT_PDIRPA(pmap);
3664 	KASSERT(ci->ci_pmap == pmap);
3665 
3666 	/*
3667 	 * we aren't interested in TLB invalidations for this pmap,
3668 	 * at least for the time being.
3669 	 */
3670 
3671 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3672 	ci->ci_tlbstate = TLBSTATE_LAZY;
3673 }
3674 
3675 /*
3676  * some misc. functions
3677  */
3678 
3679 bool
3680 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3681     int *lastlvl)
3682 {
3683 	unsigned long index;
3684 	pd_entry_t pde;
3685 	int i;
3686 
3687 	for (i = PTP_LEVELS; i > 1; i--) {
3688 		index = pl_i(va, i);
3689 		pde = pdes[i - 2][index];
3690 		if ((pde & PTE_P) == 0) {
3691 			*lastlvl = i;
3692 			return false;
3693 		}
3694 		if (pde & PTE_PS)
3695 			break;
3696 	}
3697 	if (lastpde != NULL)
3698 		*lastpde = pde;
3699 	*lastlvl = i;
3700 	return true;
3701 }
3702 
3703 /*
3704  * pmap_extract: extract a PA for the given VA
3705  */
3706 bool
3707 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3708 {
3709 	pt_entry_t *ptes, pte;
3710 	pd_entry_t pde;
3711 	pd_entry_t * const *pdes;
3712 	struct pmap *pmap2;
3713 	paddr_t pa;
3714 	bool rv;
3715 	int lvl;
3716 
3717 	if (__predict_false(pmap->pm_extract != NULL)) {
3718 		return (*pmap->pm_extract)(pmap, va, pap);
3719 	}
3720 
3721 #ifdef __HAVE_DIRECT_MAP
3722 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3723 		if (pap != NULL) {
3724 			*pap = PMAP_DIRECT_UNMAP(va);
3725 		}
3726 		return true;
3727 	}
3728 #endif
3729 
3730 	rv = false;
3731 	pa = 0;
3732 
3733 	if (pmap != pmap_kernel()) {
3734 		mutex_enter(&pmap->pm_lock);
3735 	}
3736 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3737 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3738 		if (lvl == 2) {
3739 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3740 			rv = true;
3741 		} else {
3742 			KASSERT(lvl == 1);
3743 			pte = ptes[pl1_i(va)];
3744 			if (__predict_true((pte & PTE_P) != 0)) {
3745 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3746 				rv = true;
3747 			}
3748 		}
3749 	}
3750 	pmap_unmap_ptes(pmap, pmap2);
3751 	if (pmap != pmap_kernel()) {
3752 		mutex_exit(&pmap->pm_lock);
3753 	}
3754 	if (pap != NULL) {
3755 		*pap = pa;
3756 	}
3757 
3758 	return rv;
3759 }
3760 
3761 /*
3762  * vtophys: virtual address to physical address.  For use by
3763  * machine-dependent code only.
3764  */
3765 paddr_t
3766 vtophys(vaddr_t va)
3767 {
3768 	paddr_t pa;
3769 
3770 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3771 		return pa;
3772 	return 0;
3773 }
3774 
3775 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3776 
3777 #ifdef XENPV
3778 /*
3779  * vtomach: virtual address to machine address.  For use by
3780  * machine-dependent code only.
3781  */
3782 paddr_t
3783 vtomach(vaddr_t va)
3784 {
3785 	paddr_t pa;
3786 
3787 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3788 		return pa;
3789 	return 0;
3790 }
3791 #endif
3792 
3793 /*
3794  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3795  * determine the bounds of the kernel virtual address space.
3796  */
3797 void
3798 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3799 {
3800 	*startp = virtual_avail;
3801 	*endp = virtual_end;
3802 }
3803 
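/*
 * pmap_zero_page: zero the page at the given physical address.  Uses the
 * direct map when available; otherwise the page is entered temporarily at
 * this CPU's VPAGE_ZER window (or zeroed by the hypervisor on Xen >= 3.4).
 */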
3804 void
3805 pmap_zero_page(paddr_t pa)
3806 {
3807 #if defined(__HAVE_DIRECT_MAP)
3808 	memset((void *)PMAP_DIRECT_MAP(pa), 0, PAGE_SIZE);
3809 #else
3810 #if defined(XENPV)
3811 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3812 		xen_pagezero(pa);
		return;
	}
3813 #endif
3814 	struct cpu_info *ci;
3815 	pt_entry_t *zpte;
3816 	vaddr_t zerova;
3817 
3818 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3819 
3820 	kpreempt_disable();
3821 
3822 	ci = curcpu();
3823 	zerova = ci->vpage[VPAGE_ZER];
3824 	zpte = ci->vpage_pte[VPAGE_ZER];
3825 
3826 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3827 
3828 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3829 	pmap_pte_flush();
3830 	pmap_update_pg(zerova);		/* flush TLB */
3831 
3832 	memset((void *)zerova, 0, PAGE_SIZE);
3833 
3834 #if defined(DIAGNOSTIC) || defined(XENPV)
3835 	pmap_pte_set(zpte, 0);				/* zap ! */
3836 	pmap_pte_flush();
3837 #endif
3838 
3839 	kpreempt_enable();
3840 #endif /* defined(__HAVE_DIRECT_MAP) */
3841 }
3842 
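/*
 * pmap_copy_page: copy the contents of one physical page to another.  Uses
 * the direct map when available; otherwise the pages are entered temporarily
 * at this CPU's VPAGE_SRC/VPAGE_DST windows (or copied by the hypervisor on
 * Xen >= 3.4).
 */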
3843 void
3844 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3845 {
3846 #if defined(__HAVE_DIRECT_MAP)
3847 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3848 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3849 
3850 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3851 #else
3852 #if defined(XENPV)
3853 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3854 		xen_copy_page(srcpa, dstpa);
3855 		return;
3856 	}
3857 #endif
3858 	struct cpu_info *ci;
3859 	pt_entry_t *srcpte, *dstpte;
3860 	vaddr_t srcva, dstva;
3861 
3862 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
3863 
3864 	kpreempt_disable();
3865 
3866 	ci = curcpu();
3867 	srcva = ci->vpage[VPAGE_SRC];
3868 	dstva = ci->vpage[VPAGE_DST];
3869 	srcpte = ci->vpage_pte[VPAGE_SRC];
3870 	dstpte = ci->vpage_pte[VPAGE_DST];
3871 
3872 	KASSERT(*srcpte == 0 && *dstpte == 0);
3873 
3874 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3875 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
3876 	pmap_pte_flush();
3877 	pmap_update_pg(srcva);
3878 	pmap_update_pg(dstva);
3879 
3880 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3881 
3882 #if defined(DIAGNOSTIC) || defined(XENPV)
3883 	pmap_pte_set(srcpte, 0);
3884 	pmap_pte_set(dstpte, 0);
3885 	pmap_pte_flush();
3886 #endif
3887 
3888 	kpreempt_enable();
3889 #endif /* defined(__HAVE_DIRECT_MAP) */
3890 }
3891 
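/*
 * pmap_map_ptp: map the given PTP so that its PTEs can be inspected or
 * modified.  With a direct map the page is addressed directly; otherwise it
 * is entered at this CPU's VPAGE_PTP window, so kernel preemption must be
 * disabled.
 */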
3892 static pt_entry_t *
3893 pmap_map_ptp(struct vm_page *ptp)
3894 {
3895 #ifdef __HAVE_DIRECT_MAP
3896 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3897 #else
3898 	struct cpu_info *ci;
3899 	pt_entry_t *ptppte;
3900 	vaddr_t ptpva;
3901 
3902 	KASSERT(kpreempt_disabled());
3903 
3904 #ifndef XENPV
3905 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
3906 #else
3907 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
3908 #endif
3909 
3910 	ci = curcpu();
3911 	ptpva = ci->vpage[VPAGE_PTP];
3912 	ptppte = ci->vpage_pte[VPAGE_PTP];
3913 
3914 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3915 
3916 	pmap_pte_flush();
3917 	pmap_update_pg(ptpva);
3918 
3919 	return (pt_entry_t *)ptpva;
3920 #endif
3921 }
3922 
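/*
 * pmap_unmap_ptp: tear down the per-CPU mapping set up by pmap_map_ptp.
 * A no-op with the direct map; the PTE is only explicitly zapped on
 * DIAGNOSTIC or XENPV kernels.
 */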
3923 static void
3924 pmap_unmap_ptp(void)
3925 {
3926 #ifndef __HAVE_DIRECT_MAP
3927 #if defined(DIAGNOSTIC) || defined(XENPV)
3928 	struct cpu_info *ci;
3929 	pt_entry_t *pte;
3930 
3931 	KASSERT(kpreempt_disabled());
3932 
3933 	ci = curcpu();
3934 	pte = ci->vpage_pte[VPAGE_PTP];
3935 
3936 	if (*pte != 0) {
3937 		pmap_pte_set(pte, 0);
3938 		pmap_pte_flush();
3939 	}
3940 #endif
3941 #endif
3942 }
3943 
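/*
 * pmap_map_pte: return a pointer to the PTE mapping 'va'.  For the current
 * pmap the recursive PTE window is used directly; otherwise the PTP is
 * temporarily mapped via pmap_map_ptp.
 */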
3944 static pt_entry_t *
3945 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3946 {
3947 
3948 	KASSERT(kpreempt_disabled());
3949 	if (pmap_is_curpmap(pmap)) {
3950 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3951 	}
3952 	KASSERT(ptp != NULL);
3953 	return pmap_map_ptp(ptp) + pl1_pi(va);
3954 }
3955 
3956 static void
3957 pmap_unmap_pte(void)
3958 {
3959 
3960 	KASSERT(kpreempt_disabled());
3961 
3962 	pmap_unmap_ptp();
3963 }
3964 
3965 /*
3966  * p m a p   r e m o v e   f u n c t i o n s
3967  *
3968  * functions that remove mappings
3969  */
3970 
3971 /*
3972  * pmap_remove_ptes: remove PTEs from a PTP
3973  *
3974  * => caller must hold pmap's lock
3975  * => PTP must be mapped into KVA
3976  * => PTP should be null if pmap == pmap_kernel()
3977  * => must be called with kernel preemption disabled
3978  * => TLB shootdowns are issued as needed for the PTEs removed
3979  */
3980 static void
3981 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3982     vaddr_t startva, vaddr_t endva)
3983 {
3984 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3985 
3986 	KASSERT(mutex_owned(&pmap->pm_lock));
3987 	KASSERT(kpreempt_disabled());
3988 
3989 	/*
3990 	 * mappings are very often sparse, so clip the given range to the
3991 	 * range of PTEs that are known present in the PTP.
3992 	 */
3993 	pmap_ptp_range_clip(ptp, &startva, &pte);
3994 
3995 	/*
3996 	 * note that ptpva points to the PTE that maps startva.   this may
3997 	 * or may not be the first PTE in the PTP.
3998 	 *
3999 	 * we loop through the PTP while there are still PTEs to look at
4000 	 * and the wire_count is greater than 1 (because we use the wire_count
4001 	 * to keep track of the number of real PTEs in the PTP).
4002 	 */
4003 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4004 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4005 		startva += PAGE_SIZE;
4006 		pte++;
4007 	}
4008 }
4009 
4010 /*
4011  * pmap_remove_pte: remove a single PTE from a PTP.
4012  *
4013  * => caller must hold pmap's lock
4014  * => PTP must be mapped into KVA
4015  * => PTP should be null if pmap == pmap_kernel()
4016  * => returns true if we removed a mapping
4017  * => must be called with kernel preemption disabled
4018  */
4019 static bool
4020 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4021     vaddr_t va)
4022 {
4023 	struct pv_entry *pve;
4024 	struct vm_page *pg;
4025 	struct pmap_page *pp;
4026 	pt_entry_t opte;
4027 
4028 	KASSERT(mutex_owned(&pmap->pm_lock));
4029 	KASSERT(kpreempt_disabled());
4030 
4031 	if (!pmap_valid_entry(*pte)) {
4032 		/* VA not mapped. */
4033 		return false;
4034 	}
4035 
4036 	/* Atomically save the old PTE and zap it. */
4037 	opte = pmap_pte_testset(pte, 0);
4038 	if (!pmap_valid_entry(opte)) {
4039 		return false;
4040 	}
4041 
4042 	pmap_exec_account(pmap, va, opte, 0);
4043 	pmap_stats_update_bypte(pmap, 0, opte);
4044 
4045 	if (ptp) {
4046 		/*
4047 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4048 		 */
4049 		ptp->wire_count--;
4050 		if (ptp->wire_count <= 1) {
4051 			opte |= PTE_A;
4052 		}
4053 	}
4054 
4055 	if ((opte & PTE_A) != 0) {
4056 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4057 	}
4058 
4059 	/*
4060 	 * If we are not on a pv list - we are done.
4061 	 */
4062 	if ((opte & PTE_PVLIST) == 0) {
4063 #ifndef DOM0OPS
4064 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4065 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4066 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4067 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4068 #endif
4069 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4070 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4071 		return true;
4072 	}
4073 
4074 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4075 		pp = VM_PAGE_TO_PP(pg);
4076 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4077 		paddr_t pa = pmap_pte2pa(opte);
4078 		panic("%s: PTE_PVLIST with pv-untracked page"
4079 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4080 		    __func__, va, pa, atop(pa));
4081 	}
4082 
4083 	/* Sync R/M bits. */
4084 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4085 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4086 	return true;
4087 }
4088 
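/*
 * pmap_remove_locked: remove mappings in the range [sva, eva), with the
 * pmap's lock already held by the caller.  Backend for pmap_remove() and
 * pmap_remove_gnt().
 */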
4089 static void
4090 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4091 {
4092 	pt_entry_t *ptes;
4093 	pd_entry_t pde;
4094 	pd_entry_t * const *pdes;
4095 	bool result;
4096 	vaddr_t blkendva, va = sva;
4097 	struct vm_page *ptp;
4098 	struct pmap *pmap2;
4099 	int lvl;
4100 
4101 	KASSERT(mutex_owned(&pmap->pm_lock));
4102 
4103 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4104 
4105 	/*
4106 	 * removing one page?  take shortcut function.
4107 	 */
4108 
4109 	if (va + PAGE_SIZE == eva) {
4110 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4111 			KASSERT(lvl == 1);
4112 
4113 			/* Get PTP if non-kernel mapping. */
4114 			if (pmap != pmap_kernel()) {
4115 				ptp = pmap_find_ptp(pmap, va, 1);
4116 				KASSERTMSG(ptp != NULL,
4117 				    "%s: unmanaged PTP detected", __func__);
4118 			} else {
4119 				/* Never free kernel PTPs. */
4120 				ptp = NULL;
4121 			}
4122 
4123 			result = pmap_remove_pte(pmap, ptp,
4124 			    &ptes[pl1_i(va)], va);
4125 
4126 			/*
4127 			 * if mapping removed and the PTP is no longer
4128 			 * being used, free it!
4129 			 */
4130 
4131 			if (result && ptp && ptp->wire_count <= 1)
4132 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4133 		}
4134 	} else for (/* null */ ; va < eva ; va = blkendva) {
4135 		/* determine range of block */
4136 		blkendva = x86_round_pdr(va+1);
4137 		if (blkendva > eva)
4138 			blkendva = eva;
4139 
4140 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4141 			/* Skip a range corresponding to an invalid pde. */
4142 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4143 			continue;
4144 		}
4145 		KASSERT(lvl == 1);
4146 
4147 		/* Get PTP if non-kernel mapping. */
4148 		if (pmap != pmap_kernel()) {
4149 			ptp = pmap_find_ptp(pmap, va, 1);
4150 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4151 			    __func__);
4152 		} else {
4153 			/* Never free kernel PTPs. */
4154 			ptp = NULL;
4155 		}
4156 
4157 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4158 		    blkendva);
4159 
4160 		/* If PTP is no longer being used, free it. */
4161 		if (ptp && ptp->wire_count <= 1) {
4162 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4163 		}
4164 	}
4165 	pmap_unmap_ptes(pmap, pmap2);
4166 	pmap_drain_pv(pmap);
4167 }
4168 
4169 /*
4170  * pmap_remove: mapping removal function.
4171  *
4172  * => caller should not be holding any pmap locks
4173  */
4174 void
4175 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4176 {
4177 	if (__predict_false(pmap->pm_remove != NULL)) {
4178 		(*pmap->pm_remove)(pmap, sva, eva);
4179 		return;
4180 	}
4181 
4182 	mutex_enter(&pmap->pm_lock);
4183 	pmap_remove_locked(pmap, sva, eva);
4184 	mutex_exit(&pmap->pm_lock);
4185 }
4186 
4187 /*
4188  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4189  *
4190  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4191  * => Caller should disable kernel preemption.
4192  * => Issues TLB shootdowns if necessary.
4193  */
4194 static int
4195 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4196     pt_entry_t *optep)
4197 {
4198 	struct pmap *pmap;
4199 	struct vm_page *ptp;
4200 	vaddr_t va;
4201 	pt_entry_t *ptep;
4202 	pt_entry_t opte;
4203 	pt_entry_t npte;
4204 	pt_entry_t expect;
4205 	bool need_shootdown;
4206 
4207 	ptp = pvpte->pte_ptp;
4208 	va = pvpte->pte_va;
4209 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4210 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4211 	pmap = ptp_to_pmap(ptp);
4212 	KASSERT(kpreempt_disabled());
4213 
4214 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4215 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4216 		    optep);
4217 	}
4218 
4219 	expect = pmap_pa2pte(pa) | PTE_P;
4220 
4221 	if (clearbits != ~0) {
4222 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4223 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4224 	}
4225 
4226 	ptep = pmap_map_pte(pmap, ptp, va);
4227 	do {
4228 		opte = *ptep;
4229 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4230 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4231 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4232 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4233 			/*
4234 			 * We lost a race with a V->P operation like
4235 			 * pmap_remove().  Wait for the competitor to finish
4236 			 * reflecting the PTE bits into pp_attrs.
4237 			 */
4238 			pmap_unmap_pte();
4239 			return EAGAIN;
4240 		}
4241 
4242 		/*
4243 		 * Check if there's anything to do on this PTE.
4244 		 */
4245 		if ((opte & clearbits) == 0) {
4246 			need_shootdown = false;
4247 			break;
4248 		}
4249 
4250 		/*
4251 		 * We need a shootdown if the PTE is cached (PTE_A) ...
4252 		 * ... Unless we are clearing only the PTE_W bit and
4253 		 * it isn't cached as RW (PTE_D).
4254 		 */
4255 		need_shootdown = (opte & PTE_A) != 0 &&
4256 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4257 
4258 		npte = opte & ~clearbits;
4259 
4260 		/*
4261 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4262 		 */
4263 		if (need_shootdown) {
4264 			npte &= ~(PTE_A | PTE_D);
4265 		}
4266 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4267 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4268 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4269 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4270 
4271 	if (need_shootdown) {
4272 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4273 	}
4274 	pmap_unmap_pte();
4275 
4276 	*oattrs = pmap_pte_to_pp_attrs(opte);
4277 	if (optep != NULL)
4278 		*optep = opte;
4279 	return 0;
4280 }
4281 
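/*
 * pmap_pp_remove_ent: finish removing a mapping whose PTE has already been
 * zapped: update the stats and drop the PTP's wire count, freeing the PTP
 * if this was its last entry.  Called with the pmap locked.
 */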
4282 static void
4283 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4284     vaddr_t va)
4285 {
4286 	struct pmap *pmap2;
4287 	pt_entry_t *ptes;
4288 	pd_entry_t * const *pdes;
4289 
4290 	KASSERT(mutex_owned(&pmap->pm_lock));
4291 
4292 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4293 	pmap_stats_update_bypte(pmap, 0, opte);
4294 	ptp->wire_count--;
4295 	if (ptp->wire_count <= 1) {
4296 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4297 	}
4298 	pmap_unmap_ptes(pmap, pmap2);
4299 }
4300 
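/*
 * pmap_pp_remove: remove all mappings of the page described by 'pp'/'pa'
 * from every pmap that maps it, syncing the referenced/modified bits into
 * pp_attrs as each mapping is torn down.  Backend for pmap_page_remove()
 * and pmap_pv_remove().
 */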
4301 static void
4302 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4303 {
4304 	struct pv_pte *pvpte;
4305 	struct vm_page *ptp;
4306 	uintptr_t sum;
4307 	uint8_t oattrs;
4308 	bool locked;
4309 
4310 	/*
4311 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4312 	 * pmap_remove_all() was called before amap_wipeout() for a process
4313 	 * private amap (a common case).  The page being removed must be on the way
4314 	 * out, so we don't have to worry about concurrent attempts to enter
4315 	 * it (otherwise the caller either doesn't care or has screwed up).
4316 	 */
4317 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4318 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4319 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4320 	if (sum == 0) {
4321 		return;
4322 	}
4323 
4324 	kpreempt_disable();
4325 	for (;;) {
4326 		struct pmap *pmap;
4327 		struct pv_entry *pve;
4328 		pt_entry_t opte;
4329 		vaddr_t va;
4330 
4331 		mutex_spin_enter(&pp->pp_lock);
4332 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4333 			mutex_spin_exit(&pp->pp_lock);
4334 			break;
4335 		}
4336 
4337 		/*
4338 		 * Add a reference to the pmap before clearing the pte.
4339 		 * Otherwise the pmap can disappear behind us.
4340 		 */
4341 		ptp = pvpte->pte_ptp;
4342 		pmap = ptp_to_pmap(ptp);
4343 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4344 		if (ptp != NULL) {
4345 			pmap_reference(pmap);
4346 		}
4347 
4348 		/*
4349 		 * Now try to lock it.  We need a direct handoff between
4350 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4351 		 * and kept associated with this pmap.  If that can't be
4352 		 * had, wait for the pmap's lock to become free and then
4353 		 * retry.
4354 		 */
4355 		locked = mutex_tryenter(&pmap->pm_lock);
4356 		mutex_spin_exit(&pp->pp_lock);
4357 		if (!locked) {
4358 			mutex_enter(&pmap->pm_lock);
4359 			/* nothing, just wait for it */
4360 			mutex_exit(&pmap->pm_lock);
4361 			if (ptp != NULL) {
4362 				pmap_destroy(pmap);
4363 			}
4364 			continue;
4365 		}
4366 		va = pvpte->pte_va;
4367 
4368 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4369 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4370 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4371 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4372 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4373 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4374 
4375 #ifdef DEBUG
4376 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4377 		rb_tree_t *tree = (ptp != NULL ?
4378 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4379 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4380 		if (pve == NULL) {
4381 			KASSERTMSG(&pp->pp_pte == pvpte,
4382 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4383 			    va, pmap, ptp, pvpte, pve);
4384 		} else {
4385 			KASSERTMSG(&pve->pve_pte == pvpte,
4386 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4387 			    va, pmap, ptp, pvpte, pve);
4388 		}
4389 #endif
4390 
4391 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4392 			panic("pmap_pp_remove: mapping not present");
4393 		}
4394 
4395 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4396 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4397 
4398 		/* Update the PTP reference count. Free if last reference. */
4399 		if (ptp != NULL) {
4400 			KASSERT(pmap != pmap_kernel());
4401 			pmap_tlb_shootnow();
4402 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4403 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4404 			} else {
4405 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4406 			}
4407 		} else {
4408 			KASSERT(pmap == pmap_kernel());
4409 			pmap_stats_update_bypte(pmap, 0, opte);
4410 		}
4411 		pmap_tlb_shootnow();
4412 		pmap_drain_pv(pmap);
4413 		mutex_exit(&pmap->pm_lock);
4414 		if (ptp != NULL) {
4415 			pmap_destroy(pmap);
4416 		}
4417 	}
4418 	kpreempt_enable();
4419 }
4420 
4421 /*
4422  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4423  *
4424  * => R/M bits are sync'd back to attrs
4425  */
4426 void
4427 pmap_page_remove(struct vm_page *pg)
4428 {
4429 	struct pmap_page *pp;
4430 	paddr_t pa;
4431 
4432 	pp = VM_PAGE_TO_PP(pg);
4433 	pa = VM_PAGE_TO_PHYS(pg);
4434 	pmap_pp_remove(pp, pa);
4435 }
4436 
4437 /*
4438  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4439  * that map it
4440  */
4441 void
4442 pmap_pv_remove(paddr_t pa)
4443 {
4444 	struct pmap_page *pp;
4445 
4446 	pp = pmap_pv_tracked(pa);
4447 	if (pp == NULL)
4448 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4449 	pmap_pp_remove(pp, pa);
4450 }
4451 
4452 /*
4453  * p m a p   a t t r i b u t e  f u n c t i o n s
4454  * functions that test/change a managed page's attributes.
4455  * since a page can be mapped multiple times, we must check each PTE that
4456  * maps it by going down the pv lists.
4457  */
4458 
4459 /*
4460  * pmap_test_attrs: test a page's attributes
4461  */
4462 bool
4463 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4464 {
4465 	struct pmap_page *pp;
4466 	struct pv_pte *pvpte;
4467 	struct pmap *pmap;
4468 	uint8_t oattrs;
4469 	u_int result;
4470 	paddr_t pa;
4471 
4472 	pp = VM_PAGE_TO_PP(pg);
4473 	if ((pp->pp_attrs & testbits) != 0) {
4474 		return true;
4475 	}
4476 	pa = VM_PAGE_TO_PHYS(pg);
4477  startover:
4478 	mutex_spin_enter(&pp->pp_lock);
4479 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4480 		if ((pp->pp_attrs & testbits) != 0) {
4481 			break;
4482 		}
4483 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4484 			/*
4485 			 * Raced with a V->P operation.  Wait for the other
4486 			 * side to finish by acquiring the pmap's lock.  If we
4487 			 * don't wait, updates to pp_attrs by the other side
4488 			 * may go unseen.
4489 			 */
4490 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4491 			pmap_reference(pmap);
4492 			mutex_spin_exit(&pp->pp_lock);
4493 			mutex_enter(&pmap->pm_lock);
4494 			/* nothing. */
4495 			mutex_exit(&pmap->pm_lock);
4496 			pmap_destroy(pmap);
4497 			goto startover;
4498 		}
4499 		pp->pp_attrs |= oattrs;
4500 	}
4501 	result = pp->pp_attrs & testbits;
4502 	mutex_spin_exit(&pp->pp_lock);
4503 
4504 	/*
4505 	 * Note that we exit the for loop early, with pvpte still non-NULL,
4506 	 * once the bits we are testing for have been found in pp_attrs.
4507 	 */
4508 
4509 	return result != 0;
4510 }
4511 
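/*
 * pmap_pp_clear_attrs: clear the given attribute bits for the page described
 * by 'pp'/'pa', both in the PTEs of all its mappings and in pp_attrs.
 * Returns true if any of the bits were previously set.
 */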
4512 static bool
4513 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4514 {
4515 	struct pv_pte *pvpte;
4516 	struct pmap *pmap;
4517 	uint8_t oattrs;
4518 	u_int result;
4519 
4520 startover:
4521 	mutex_spin_enter(&pp->pp_lock);
4522 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4523 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4524 			/*
4525 			 * Raced with a V->P operation.  Wait for the other
4526 			 * side to finish by acquiring the pmap's lock.  It is
4527 			 * probably unmapping the page, and it will be gone
4528 			 * when the loop is restarted.
4529 			 */
4530 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4531 			pmap_reference(pmap);
4532 			mutex_spin_exit(&pp->pp_lock);
4533 			mutex_enter(&pmap->pm_lock);
4534 			/* nothing. */
4535 			mutex_exit(&pmap->pm_lock);
4536 			pmap_destroy(pmap);
4537 			goto startover;
4538 		}
4539 		pp->pp_attrs |= oattrs;
4540 	}
4541 	result = pp->pp_attrs & clearbits;
4542 	pp->pp_attrs &= ~clearbits;
4543 	pmap_tlb_shootnow();
4544 	mutex_spin_exit(&pp->pp_lock);
4545 
4546 	return result != 0;
4547 }
4548 
4549 /*
4550  * pmap_clear_attrs: clear the specified attribute for a page.
4551  *
4552  * => we return true if we cleared one of the bits we were asked to
4553  */
4554 bool
4555 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4556 {
4557 	struct pmap_page *pp;
4558 	paddr_t pa;
4559 
4560 	pp = VM_PAGE_TO_PP(pg);
4561 	pa = VM_PAGE_TO_PHYS(pg);
4562 
4563 	/*
4564 	 * If this is a new page, assert it has no mappings and simply zap
4565 	 * the stored attributes without taking any locks.
4566 	 */
4567 	if ((pg->flags & PG_FAKE) != 0) {
4568 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4569 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4570 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4571 		atomic_store_relaxed(&pp->pp_attrs, 0);
4572 		return false;
4573 	} else {
4574 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4575 	}
4576 }
4577 
4578 /*
4579  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4580  * pv-tracked page.
4581  */
4582 bool
4583 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4584 {
4585 	struct pmap_page *pp;
4586 
4587 	pp = pmap_pv_tracked(pa);
4588 	if (pp == NULL)
4589 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4590 
4591 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4592 }
4593 
4594 /*
4595  * p m a p   p r o t e c t i o n   f u n c t i o n s
4596  */
4597 
4598 /*
4599  * pmap_page_protect: change the protection of all recorded mappings
4600  * of a managed page
4601  *
4602  * => NOTE: this is an inline function in pmap.h
4603  */
4604 
4605 /* see pmap.h */
4606 
4607 /*
4608  * pmap_pv_protect: change the protection of all recorded mappings
4609  * of an unmanaged pv-tracked page
4610  *
4611  * => NOTE: this is an inline function in pmap.h
4612  */
4613 
4614 /* see pmap.h */
4615 
4616 /*
4617  * pmap_protect: set the protection of the pages in a pmap
4618  *
4619  * => NOTE: this is an inline function in pmap.h
4620  */
4621 
4622 /* see pmap.h */
4623 
4624 /*
4625  * pmap_write_protect: write-protect pages in a pmap.
4626  *
4627  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4628  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4629  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4630  * present the page will still be considered as a kernel page, and the privilege
4631  * separation will be enforced correctly.
4632  */
4633 void
4634 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4635 {
4636 	pt_entry_t bit_rem, bit_put;
4637 	pt_entry_t *ptes;
4638 	pd_entry_t * const *pdes;
4639 	struct pmap *pmap2;
4640 	vaddr_t blockend, va;
4641 	int lvl, i;
4642 
4643 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4644 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4645 		return;
4646 	}
4647 
4648 	bit_rem = 0;
4649 	if (!(prot & VM_PROT_WRITE))
4650 		bit_rem = PTE_W;
4651 
4652 	bit_put = 0;
4653 	if (!(prot & VM_PROT_EXECUTE))
4654 		bit_put = pmap_pg_nx;
4655 
4656 	sva &= ~PAGE_MASK;
4657 	eva &= ~PAGE_MASK;
4658 
4659 	/*
4660 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4661 	 * be touching PV entries nor stats and kernel PDEs aren't
4662 	 * freed.
4663 	 */
4664 	if (pmap != pmap_kernel()) {
4665 		mutex_enter(&pmap->pm_lock);
4666 	}
4667 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4668 
4669 	for (va = sva ; va < eva; va = blockend) {
4670 		pt_entry_t *spte, *epte;
4671 
4672 		blockend = x86_round_pdr(va + 1);
4673 		if (blockend > eva)
4674 			blockend = eva;
4675 
4676 		/* Is it a valid block? */
4677 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4678 			continue;
4679 		}
4680 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4681 		KASSERT(lvl == 1);
4682 
4683 		spte = &ptes[pl1_i(va)];
4684 		epte = &ptes[pl1_i(blockend)];
4685 
4686 		for (i = 0; spte < epte; spte++, i++) {
4687 			pt_entry_t opte, npte;
4688 
4689 			do {
4690 				opte = *spte;
4691 				if (!pmap_valid_entry(opte)) {
4692 					goto next;
4693 				}
4694 				npte = (opte & ~bit_rem) | bit_put;
4695 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4696 
4697 			if ((opte & PTE_D) != 0) {
4698 				vaddr_t tva = va + x86_ptob(i);
4699 				pmap_tlb_shootdown(pmap, tva, opte,
4700 				    TLBSHOOT_WRITE_PROTECT);
4701 			}
4702 next:;
4703 		}
4704 	}
4705 
4706 	/* Release pmap. */
4707 	pmap_unmap_ptes(pmap, pmap2);
4708 	if (pmap != pmap_kernel()) {
4709 		mutex_exit(&pmap->pm_lock);
4710 	}
4711 }
4712 
4713 /*
4714  * pmap_unwire: clear the wired bit in the PTE.
4715  *
4716  * => Mapping should already be present.
4717  */
4718 void
4719 pmap_unwire(struct pmap *pmap, vaddr_t va)
4720 {
4721 	pt_entry_t *ptes, *ptep, opte;
4722 	pd_entry_t * const *pdes;
4723 	struct pmap *pmap2;
4724 	int lvl;
4725 
4726 	if (__predict_false(pmap->pm_unwire != NULL)) {
4727 		(*pmap->pm_unwire)(pmap, va);
4728 		return;
4729 	}
4730 
4731 	/*
4732 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4733 	 * statistics.
4734 	 */
4735 	mutex_enter(&pmap->pm_lock);
4736 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4737 
4738 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4739 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4740 	}
4741 	KASSERT(lvl == 1);
4742 
4743 	ptep = &ptes[pl1_i(va)];
4744 	opte = *ptep;
4745 	KASSERT(pmap_valid_entry(opte));
4746 
4747 	if (opte & PTE_WIRED) {
4748 		pt_entry_t npte = opte & ~PTE_WIRED;
4749 
4750 		opte = pmap_pte_testset(ptep, npte);
4751 		pmap_stats_update_bypte(pmap, npte, opte);
4752 	} else {
4753 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4754 		    " did not change!\n", __func__, pmap, va);
4755 	}
4756 
4757 	/* Release pmap. */
4758 	pmap_unmap_ptes(pmap, pmap2);
4759 	mutex_exit(&pmap->pm_lock);
4760 }
4761 
4762 /*
4763  * pmap_copy: copy mappings from one pmap to another
4764  *
4765  * => optional function
4766  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4767  */
4768 
4769 /*
4770  * defined as macro in pmap.h
4771  */
4772 
4773 __strict_weak_alias(pmap_enter, pmap_enter_default);
4774 
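/*
 * pmap_enter_default: the default pmap_enter() (see the weak alias above).
 * Dispatches to a pmap-specific pm_enter hook if one is installed, otherwise
 * enters the mapping via pmap_enter_ma() with ma == pa.
 */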
4775 int
4776 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4777     u_int flags)
4778 {
4779 	if (__predict_false(pmap->pm_enter != NULL)) {
4780 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4781 	}
4782 
4783 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4784 }
4785 
4786 /*
4787  * pmap_enter: enter a mapping into a pmap
4788  *
4789  * => must be done "now" ... no lazy-evaluation
4790  */
4791 int
4792 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4793 	   vm_prot_t prot, u_int flags, int domid)
4794 {
4795 	pt_entry_t *ptes, opte, npte;
4796 	pt_entry_t *ptep;
4797 	pd_entry_t * const *pdes;
4798 	struct vm_page *ptp;
4799 	struct vm_page *new_pg, *old_pg;
4800 	struct pmap_page *new_pp, *old_pp;
4801 	struct pv_entry *old_pve, *new_pve;
4802 	bool wired = (flags & PMAP_WIRED) != 0;
4803 	struct pmap *pmap2;
4804 	struct pmap_ptparray pt;
4805 	int error;
4806 	bool getptp, samepage, new_embedded;
4807 	rb_tree_t *tree;
4808 
4809 	KASSERT(pmap_initialized);
4810 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4811 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4812 	    PRIxVADDR " over PDP!", __func__, va);
4813 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4814 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4815 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4816 
4817 #ifdef XENPV
4818 	KASSERT(domid == DOMID_SELF || pa == 0);
4819 #endif
4820 
4821 	npte = ma | protection_codes[prot] | PTE_P;
4822 	npte |= pmap_pat_flags(flags);
4823 	if (wired)
4824 	        npte |= PTE_WIRED;
4825 	if (va < VM_MAXUSER_ADDRESS)
4826 		npte |= PTE_U;
4827 
4828 	if (pmap == pmap_kernel())
4829 		npte |= pmap_pg_g;
4830 	if (flags & VM_PROT_ALL) {
4831 		npte |= PTE_A;
4832 		if (flags & VM_PROT_WRITE) {
4833 			KASSERT((npte & PTE_W) != 0);
4834 			npte |= PTE_D;
4835 		}
4836 	}
4837 
4838 #ifdef XENPV
4839 	if (domid != DOMID_SELF)
4840 		new_pg = NULL;
4841 	else
4842 #endif
4843 		new_pg = PHYS_TO_VM_PAGE(pa);
4844 
4845 	if (new_pg != NULL) {
4846 		/* This is a managed page */
4847 		npte |= PTE_PVLIST;
4848 		new_pp = VM_PAGE_TO_PP(new_pg);
4849 		PMAP_CHECK_PP(new_pp);
4850 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4851 		/* This is an unmanaged pv-tracked page */
4852 		npte |= PTE_PVLIST;
4853 		PMAP_CHECK_PP(new_pp);
4854 	} else {
4855 		new_pp = NULL;
4856 	}
4857 
4858 	/* Begin by locking the pmap. */
4859 	mutex_enter(&pmap->pm_lock);
4860 
4861 	/* Look up the PTP.  Allocate if none present. */
4862 	ptp = NULL;
4863 	getptp = false;
4864 	if (pmap != pmap_kernel()) {
4865 		ptp = pmap_find_ptp(pmap, va, 1);
4866 		if (ptp == NULL) {
4867 			getptp = true;
4868 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
4869 			if (error != 0) {
4870 				if (flags & PMAP_CANFAIL) {
4871 					mutex_exit(&pmap->pm_lock);
4872 					return error;
4873 				}
4874 				panic("%s: get ptp failed, error=%d", __func__,
4875 				    error);
4876 			}
4877 		}
4878 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
4879 	} else {
4880 		/* Embedded PV entries rely on this. */
4881 		KASSERT(va != 0);
4882 		tree = &pmap_kernel_rb;
4883 	}
4884 
4885 	/*
4886 	 * Look up the old PV entry at this VA (if any), and insert a new PV
4887 	 * entry if required for the new mapping.  Temporarily track the old
4888 	 * and new mappings concurrently.  Only after the old mapping is
4889 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
4890 	 * our picture of modified/accessed state for either page could get
4891 	 * out of sync (we need any P->V operation for either page to stall
4892 	 * on pmap->pm_lock until done here).
4893 	 */
4894 	new_pve = NULL;
4895 	old_pve = NULL;
4896 	samepage = false;
4897 	new_embedded = false;
4898 
4899 	if (new_pp != NULL) {
4900 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
4901 		    &old_pve, &samepage, &new_embedded, tree);
4902 
4903 		/*
4904 		 * If a new pv_entry was needed and none was available, we
4905 		 * can go no further.
4906 		 */
4907 		if (error != 0) {
4908 			if (flags & PMAP_CANFAIL) {
4909 				if (getptp) {
4910 					pmap_unget_ptp(pmap, &pt);
4911 				}
4912 				mutex_exit(&pmap->pm_lock);
4913 				return error;
4914 			}
4915 			panic("%s: alloc pve failed", __func__);
4916 		}
4917 	} else {
4918 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4919 	}
4920 
4921 	/* Map PTEs into address space. */
4922 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4923 
4924 	/* Install any newly allocated PTPs. */
4925 	if (getptp) {
4926 		pmap_install_ptp(pmap, &pt, va, pdes);
4927 	}
4928 
4929 	/* Check if there is an existing mapping. */
4930 	ptep = &ptes[pl1_i(va)];
4931 	opte = *ptep;
4932 	bool have_oldpa = pmap_valid_entry(opte);
4933 	paddr_t oldpa = pmap_pte2pa(opte);
4934 
4935 	/*
4936 	 * Update the pte.
4937 	 */
4938 	do {
4939 		opte = *ptep;
4940 
4941 		/*
4942 		 * if the same page, inherit PTE_A and PTE_D.
4943 		 */
4944 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4945 			npte |= opte & (PTE_A | PTE_D);
4946 		}
4947 #if defined(XENPV)
4948 		if (domid != DOMID_SELF) {
4949 			/* pmap_pte_cas with error handling */
4950 			int s = splvm();
4951 			if (opte != *ptep) {
4952 				splx(s);
4953 				continue;
4954 			}
4955 			error = xpq_update_foreign(
4956 			    vtomach((vaddr_t)ptep), npte, domid, flags);
4957 			splx(s);
4958 			if (error) {
4959 				/* Undo pv_entry tracking - oof. */
4960 				if (new_pp != NULL) {
4961 					mutex_spin_enter(&new_pp->pp_lock);
4962 					if (new_pve != NULL) {
4963 						LIST_REMOVE(new_pve, pve_list);
4964 						KASSERT(pmap->pm_pve == NULL);
4965 						pmap->pm_pve = new_pve;
4966 					} else if (new_embedded) {
4967 						new_pp->pp_pte.pte_ptp = NULL;
4968 						new_pp->pp_pte.pte_va = 0;
4969 					}
4970 					mutex_spin_exit(&new_pp->pp_lock);
4971 				}
4972 				pmap_unmap_ptes(pmap, pmap2);
4973 				/* Free new PTP. */
4974 				if (ptp != NULL && ptp->wire_count <= 1) {
4975 					pmap_free_ptp(pmap, ptp, va, ptes,
4976 					    pdes);
4977 				}
4978 				mutex_exit(&pmap->pm_lock);
4979 				return error;
4980 			}
4981 			break;
4982 		}
4983 #endif /* defined(XENPV) */
4984 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4985 
4986 	/*
4987 	 * Done with the PTEs: they can now be unmapped.
4988 	 */
4989 	pmap_unmap_ptes(pmap, pmap2);
4990 
4991 	/*
4992 	 * Update statistics and PTP's reference count.
4993 	 */
4994 	pmap_stats_update_bypte(pmap, npte, opte);
4995 	if (ptp != NULL) {
4996 		if (!have_oldpa) {
4997 			ptp->wire_count++;
4998 		}
4999 		/* Remember minimum VA in PTP. */
5000 		pmap_ptp_range_set(ptp, va);
5001 	}
5002 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5003 
5004 	/*
5005 	 * If the same page, we can skip pv_entry handling.
5006 	 */
5007 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5008 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5009 		if ((npte & PTE_PVLIST) != 0) {
5010 			KASSERT(samepage);
5011 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5012 		}
5013 		goto same_pa;
5014 	} else if ((npte & PTE_PVLIST) != 0) {
5015 		KASSERT(!samepage);
5016 	}
5017 
5018 	/*
5019 	 * If old page is pv-tracked, remove pv_entry from its list.
5020 	 */
5021 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5022 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5023 			old_pp = VM_PAGE_TO_PP(old_pg);
5024 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5025 			panic("%s: PTE_PVLIST with pv-untracked page"
5026 			    " va = %#"PRIxVADDR
5027 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5028 			    __func__, va, oldpa, atop(pa));
5029 		}
5030 
5031 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5032 		    pmap_pte_to_pp_attrs(opte));
5033 	} else {
5034 		KASSERT(old_pve == NULL);
5035 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5036 	}
5037 
5038 	/*
5039 	 * If new page is dynamically PV tracked, insert to tree.
5040 	 */
5041 	if (new_pve != NULL) {
5042 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5043 		old_pve = rb_tree_insert_node(tree, new_pve);
5044 		KASSERT(old_pve == new_pve);
5045 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5046 	}
5047 
5048 same_pa:
5049 	/*
5050 	 * shootdown tlb if necessary.
5051 	 */
5052 
5053 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5054 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5055 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5056 	}
5057 	pmap_drain_pv(pmap);
5058 	mutex_exit(&pmap->pm_lock);
5059 	return 0;
5060 }
5061 
5062 #if defined(XEN) && defined(DOM0OPS)
5063 
5064 struct pmap_data_gnt {
5065 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5066 	vaddr_t pd_gnt_sva;
5067 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5068 	int pd_gnt_refs; /* ref counter */
5069 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5070 };
5071 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5072 
5073 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5074 
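/*
 * pmap_find_gnt: find the grant-table descriptor covering [sva, eva) in
 * this pmap, or NULL if no descriptor covers the range.  The pmap must be
 * locked.
 */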
5075 static struct pmap_data_gnt *
5076 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5077 {
5078 	struct pmap_data_gnt_head *headp;
5079 	struct pmap_data_gnt *pgnt;
5080 
5081 	KASSERT(mutex_owned(&pmap->pm_lock));
5082 	headp = pmap->pm_data;
5083 	KASSERT(headp != NULL);
5084 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5085 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5086 			return pgnt;
5087 		/* check that we're not overlapping part of a region */
5088 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5089 	}
5090 	return NULL;
5091 }
5092 
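/*
 * pmap_alloc_gnt: record the grant-table map operations for 'nentries'
 * pages starting at 'sva', allocating the per-pmap list head and a new
 * descriptor as needed.  The pmap must be locked.
 */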
5093 static void
5094 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5095     const struct gnttab_map_grant_ref *ops)
5096 {
5097 	struct pmap_data_gnt_head *headp;
5098 	struct pmap_data_gnt *pgnt;
5099 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5100 	KASSERT(mutex_owned(&pmap->pm_lock));
5101 	KASSERT(nentries >= 1);
5102 	if (pmap->pm_remove == NULL) {
5103 		pmap->pm_remove = pmap_remove_gnt;
5104 		KASSERT(pmap->pm_data == NULL);
5105 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5106 		SLIST_INIT(headp);
5107 		pmap->pm_data = headp;
5108 	} else {
5109 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5110 		KASSERT(pmap->pm_data != NULL);
5111 		headp = pmap->pm_data;
5112 	}
5113 
5114 	pgnt = pmap_find_gnt(pmap, sva, eva);
5115 	if (pgnt != NULL) {
5116 		KASSERT(pgnt->pd_gnt_sva == sva);
5117 		KASSERT(pgnt->pd_gnt_eva == eva);
5118 		return;
5119 	}
5120 
5121 	/* new entry */
5122 	pgnt = kmem_alloc(sizeof(*pgnt) +
5123 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5124 	pgnt->pd_gnt_sva = sva;
5125 	pgnt->pd_gnt_eva = eva;
5126 	pgnt->pd_gnt_refs = 0;
5127 	memcpy(pgnt->pd_gnt_ops, ops,
5128 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5129 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5130 }
5131 
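/*
 * pmap_free_gnt: free a grant-table descriptor once its reference count has
 * dropped to zero; the list head and the pm_remove hook are released when
 * the last descriptor goes away.
 */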
5132 static void
5133 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5134 {
5135 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5136 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5137 	KASSERT(nentries >= 1);
5138 	KASSERT(mutex_owned(&pmap->pm_lock));
5139 	KASSERT(pgnt->pd_gnt_refs == 0);
5140 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5141 	kmem_free(pgnt, sizeof(*pgnt) +
5142 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5143 	if (SLIST_EMPTY(headp)) {
5144 		kmem_free(headp, sizeof(*headp));
5145 		pmap->pm_data = NULL;
5146 		pmap->pm_remove = NULL;
5147 	}
5148 }
5149 
5150 /*
5151  * pmap_enter_gnt: enter a grant entry into a pmap
5152  *
5153  * => must be done "now" ... no lazy-evaluation
5154  */
5155 int
5156 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5157     const struct gnttab_map_grant_ref *oops)
5158 {
5159 	struct pmap_data_gnt *pgnt;
5160 	pt_entry_t *ptes, opte;
5161 	pt_entry_t *ptep;
5162 	pd_entry_t * const *pdes;
5163 	struct vm_page *ptp;
5164 	struct vm_page *old_pg;
5165 	struct pmap_page *old_pp;
5166 	struct pv_entry *old_pve;
5167 	struct pmap *pmap2;
5168 	struct pmap_ptparray pt;
5169 	int error;
5170 	bool getptp;
5171 	rb_tree_t *tree;
5172 	struct gnttab_map_grant_ref *op;
5173 	int ret;
5174 	int idx;
5175 
5176 	KASSERT(pmap_initialized);
5177 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5178 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5179 	    PRIxVADDR " over PDP!", __func__, va);
5180 	KASSERT(pmap != pmap_kernel());
5181 
5182 	/* Begin by locking the pmap. */
5183 	mutex_enter(&pmap->pm_lock);
5184 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5185 
5186 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5187 	KASSERT(pgnt != NULL);
5188 
5189 	/* Look up the PTP.  Allocate if none present. */
5190 	ptp = NULL;
5191 	getptp = false;
5192 	ptp = pmap_find_ptp(pmap, va, 1);
5193 	if (ptp == NULL) {
5194 		getptp = true;
5195 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5196 		if (error != 0) {
5197 			mutex_exit(&pmap->pm_lock);
5198 			return error;
5199 		}
5200 	}
5201 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5202 
5203 	/*
5204 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5205 	 * entry if required for the new mapping.  Temporarily track the old
5206 	 * and new mappings concurrently.  Only after the old mapping is
5207 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5208 	 * our picture of modified/accessed state for either page could get
5209 	 * out of sync (we need any P->V operation for either page to stall
5210 	 * on pmap->pm_lock until done here).
5211 	 */
5212 	old_pve = NULL;
5213 
5214 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5215 
5216 	/* Map PTEs into address space. */
5217 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5218 
5219 	/* Install any newly allocated PTPs. */
5220 	if (getptp) {
5221 		pmap_install_ptp(pmap, &pt, va, pdes);
5222 	}
5223 
5224 	/* Check if there is an existing mapping. */
5225 	ptep = &ptes[pl1_i(va)];
5226 	opte = *ptep;
5227 	bool have_oldpa = pmap_valid_entry(opte);
5228 	paddr_t oldpa = pmap_pte2pa(opte);
5229 
5230 	/*
5231 	 * Update the pte.
5232 	 */
5233 
5234 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5235 	op = &pgnt->pd_gnt_ops[idx];
5236 
5237 	op->host_addr = xpmap_ptetomach(ptep);
5238 	op->dev_bus_addr = 0;
5239 	op->status = GNTST_general_error;
5240 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5241 	if (__predict_false(ret)) {
5242 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5243 		    __func__, ret);
5244 		op->status = GNTST_general_error;
5245 	}
5246 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5247 		kpause("gntmap", false, mstohz(1), NULL);
5248 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5249 		if (__predict_false(ret)) {
5250 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5251 			    __func__, ret);
5252 			op->status = GNTST_general_error;
5253 		}
5254 	}
5255 	if (__predict_false(op->status != GNTST_okay)) {
5256 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5257 		    __func__, op->status);
5258 		if (have_oldpa) {
5259 			ptp->wire_count--;
5260 		}
5261 	} else {
5262 		pgnt->pd_gnt_refs++;
5263 		if (!have_oldpa) {
5264 			ptp->wire_count++;
5265 		}
5266 		KASSERT(ptp->wire_count > 1);
5267 		/* Remember minimum VA in PTP. */
5268 		pmap_ptp_range_set(ptp, va);
5269 	}
5270 	if (ptp->wire_count <= 1)
5271 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5272 
5273 	/*
5274 	 * Done with the PTEs: they can now be unmapped.
5275 	 */
5276 	pmap_unmap_ptes(pmap, pmap2);
5277 
5278 	/*
5279 	 * Update statistics and PTP's reference count.
5280 	 */
5281 	pmap_stats_update_bypte(pmap, 0, opte);
5282 
5283 	/*
5284 	 * If old page is pv-tracked, remove pv_entry from its list.
5285 	 */
5286 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5287 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5288 			old_pp = VM_PAGE_TO_PP(old_pg);
5289 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5290 			panic("%s: PTE_PVLIST with pv-untracked page"
5291 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5292 			    __func__, va, oldpa);
5293 		}
5294 
5295 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5296 		    pmap_pte_to_pp_attrs(opte));
5297 	} else {
5298 		KASSERT(old_pve == NULL);
5299 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5300 	}
5301 
5302 	pmap_drain_pv(pmap);
5303 	mutex_exit(&pmap->pm_lock);
5304 	return op->status;
5305 }
5306 
5307 /*
5308  * pmap_remove_gnt: grant mapping removal function.
5309  *
5310  * => caller should not be holding any pmap locks
5311  */
5312 static void
5313 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5314 {
5315 	struct pmap_data_gnt *pgnt;
5316 	pt_entry_t *ptes;
5317 	pd_entry_t pde;
5318 	pd_entry_t * const *pdes;
5319 	struct vm_page *ptp;
5320 	struct pmap *pmap2;
5321 	vaddr_t va;
5322 	int lvl;
5323 	int idx;
5324 	struct gnttab_map_grant_ref *op;
5325 	struct gnttab_unmap_grant_ref unmap_op;
5326 	int ret;
5327 
5328 	KASSERT(pmap != pmap_kernel());
5329 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5330 
5331 	mutex_enter(&pmap->pm_lock);
5332 	for (va = sva; va < eva; va += PAGE_SIZE) {
5333 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5334 		if (pgnt == NULL) {
5335 			pmap_remove_locked(pmap, sva, eva);
5336 			continue;
5337 		}
5338 
5339 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5340 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5341 			panic("pmap_remove_gnt pdes not valid");
5342 		}
5343 
5344 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5345 		op = &pgnt->pd_gnt_ops[idx];
5346 		KASSERT(lvl == 1);
5347 		KASSERT(op->status == GNTST_okay);
5348 
5349 		/* Get PTP if non-kernel mapping. */
5350 		ptp = pmap_find_ptp(pmap, va, 1);
5351 		KASSERTMSG(ptp != NULL,
5352 		    "%s: unmanaged PTP detected", __func__);
5353 
5354 		if (op->status == GNTST_okay)  {
5355 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5356 			unmap_op.handle = op->handle;
5357 			unmap_op.dev_bus_addr = 0;
5358 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5359 			ret = HYPERVISOR_grant_table_op(
5360 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5361 			if (ret) {
5362 				printf("%s: GNTTABOP_unmap_grant_ref "
5363 				    "failed: %d\n", __func__, ret);
5364 			}
5365 
5366 			ptp->wire_count--;
5367 			pgnt->pd_gnt_refs--;
5368 			if (pgnt->pd_gnt_refs == 0) {
5369 				pmap_free_gnt(pmap, pgnt);
5370 			}
5371 		}
5372 		/*
5373 		 * if mapping removed and the PTP is no longer
5374 		 * being used, free it!
5375 		 */
5376 
5377 		if (ptp->wire_count <= 1)
5378 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5379 		pmap_unmap_ptes(pmap, pmap2);
5380 	}
5381 	mutex_exit(&pmap->pm_lock);
5382 }
5383 #endif /* XEN && DOM0OPS */
5384 
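/*
 * pmap_get_physpage: allocate and zero a physical page for use as a PTP
 * while growing the kernel pmap; works both before and after UVM page
 * initialization is complete.
 */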
5385 paddr_t
5386 pmap_get_physpage(void)
5387 {
5388 	struct vm_page *ptp;
5389 	struct pmap *kpm = pmap_kernel();
5390 	paddr_t pa;
5391 
5392 	if (!uvm.page_init_done) {
5393 		/*
5394 		 * We're growing the kernel pmap early (from
5395 		 * uvm_pageboot_alloc()). This case must be
5396 		 * handled a little differently.
5397 		 */
5398 
5399 		if (!uvm_page_physget(&pa))
5400 			panic("%s: out of memory", __func__);
5401 #if defined(__HAVE_DIRECT_MAP)
5402 		memset((void *)PMAP_DIRECT_MAP(pa), 0, PAGE_SIZE);
5403 #else
5404 #if defined(XENPV)
5405 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5406 			xen_pagezero(pa);
5407 			return pa;
5408 		}
5409 #endif
5410 		kpreempt_disable();
5411 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5412 		    PTE_W | pmap_pg_nx);
5413 		pmap_pte_flush();
5414 		pmap_update_pg((vaddr_t)early_zerop);
5415 		memset(early_zerop, 0, PAGE_SIZE);
5416 #if defined(DIAGNOSTIC) || defined(XENPV)
5417 		pmap_pte_set(early_zero_pte, 0);
5418 		pmap_pte_flush();
5419 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5420 		kpreempt_enable();
5421 #endif /* defined(__HAVE_DIRECT_MAP) */
5422 	} else {
5423 		/* XXX */
5424 		ptp = uvm_pagealloc(NULL, 0, NULL,
5425 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5426 		if (ptp == NULL)
5427 			panic("%s: out of memory", __func__);
5428 		ptp->flags &= ~PG_BUSY;
5429 		ptp->wire_count = 1;
5430 		pa = VM_PAGE_TO_PHYS(ptp);
5431 	}
5432 	pmap_stats_update(kpm, 1, 0);
5433 
5434 	return pa;
5435 }
5436 
5437 /*
5438  * Expand the page tree with the specified number of PTPs, mapping virtual
5439  * addresses starting at kva. We populate all the levels but the last one
5440  * (L1). The nodes of the tree are created as RW, but the pages covered
5441  * will be kentered in L1, with proper permissions.
5442  *
5443  * Used only by pmap_growkernel.
5444  */
5445 static void
5446 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5447 {
5448 	unsigned long i;
5449 	paddr_t pa;
5450 	unsigned long index, endindex;
5451 	int level;
5452 	pd_entry_t *pdep;
5453 #ifdef XENPV
5454 	int s = splvm(); /* protect xpq_* */
5455 #endif
5456 
5457 	for (level = PTP_LEVELS; level > 1; level--) {
5458 		if (level == PTP_LEVELS)
5459 			pdep = cpm->pm_pdir;
5460 		else
5461 			pdep = normal_pdes[level - 2];
5462 		index = pl_i_roundup(kva, level);
5463 		endindex = index + needed_ptps[level - 1] - 1;
5464 
5465 		for (i = index; i <= endindex; i++) {
5466 			pt_entry_t pte;
5467 
5468 			KASSERT(!pmap_valid_entry(pdep[i]));
5469 			pa = pmap_get_physpage();
5470 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5471 #ifdef __x86_64__
5472 			pte |= pmap_pg_nx;
5473 #endif
5474 			pmap_pte_set(&pdep[i], pte);
5475 
5476 #ifdef XENPV
5477 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5478 				if (__predict_true(
5479 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5480 					/* update per-cpu PMDs on all cpus */
5481 					xen_kpm_sync(pmap_kernel(), i);
5482 				} else {
5483 					/*
5484 					 * too early; update primary CPU
5485 					 * PMD only (without locks)
5486 					 */
5487 #ifdef __x86_64__
5488 					pd_entry_t *cpu_pdep =
5489 						&cpu_info_primary.ci_kpm_pdir[i];
5490 #else
5491 					pd_entry_t *cpu_pdep =
5492 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5493 #endif
5494 					pmap_pte_set(cpu_pdep, pte);
5495 				}
5496 			}
5497 #endif
5498 
5499 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5500 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5501 			nkptp[level - 1]++;
5502 		}
5503 		pmap_pte_flush();
5504 	}
5505 #ifdef XENPV
5506 	splx(s);
5507 #endif
5508 }
5509 
5510 /*
5511  * pmap_growkernel: increase usage of KVM space.
5512  *
5513  * => we allocate new PTPs for the kernel and install them in all
5514  *    the pmaps on the system.
5515  */
5516 vaddr_t
5517 pmap_growkernel(vaddr_t maxkvaddr)
5518 {
5519 	struct pmap *kpm = pmap_kernel();
5520 	struct pmap *cpm;
5521 #if !defined(XENPV) || !defined(__x86_64__)
5522 	struct pmap *pm;
5523 	long old;
5524 #endif
5525 	int s, i;
5526 	long needed_kptp[PTP_LEVELS], target_nptp;
5527 	bool invalidate = false;
5528 
5529 	s = splvm();	/* to be safe */
5530 	mutex_enter(&kpm->pm_lock);
5531 
5532 	if (maxkvaddr <= pmap_maxkvaddr) {
5533 		mutex_exit(&kpm->pm_lock);
5534 		splx(s);
5535 		return pmap_maxkvaddr;
5536 	}
5537 
5538 	maxkvaddr = x86_round_pdr(maxkvaddr);
5539 #if !defined(XENPV) || !defined(__x86_64__)
5540 	old = nkptp[PTP_LEVELS - 1];
5541 #endif
5542 
5543 	/* Initialize needed_kptp. */
5544 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5545 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5546 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5547 
5548 		if (target_nptp > nkptpmax[i])
5549 			panic("out of KVA space");
5550 		KASSERT(target_nptp >= nkptp[i]);
5551 		needed_kptp[i] = target_nptp - nkptp[i];
5552 	}
5553 
5554 #ifdef XENPV
5555 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5556 	cpm = kpm;
5557 #else
5558 	/* Get the current pmap */
5559 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5560 		cpm = curcpu()->ci_pmap;
5561 	} else {
5562 		cpm = kpm;
5563 	}
5564 #endif
5565 
5566 	kasan_shadow_map((void *)pmap_maxkvaddr,
5567 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5568 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5569 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5570 
5571 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5572 
5573 	/*
5574 	 * If the number of top level entries changed, update all pmaps.
5575 	 */
5576 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5577 #ifdef XENPV
5578 #ifdef __x86_64__
5579 		/* nothing, kernel entries are never entered in user pmap */
5580 #else
5581 		int pdkidx;
5582 
5583 		mutex_enter(&pmaps_lock);
5584 		LIST_FOREACH(pm, &pmaps, pm_list) {
5585 			for (pdkidx = PDIR_SLOT_KERN + old;
5586 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5587 			    pdkidx++) {
5588 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5589 				    kpm->pm_pdir[pdkidx]);
5590 			}
5591 			pmap_pte_flush();
5592 		}
5593 		mutex_exit(&pmaps_lock);
5594 #endif /* __x86_64__ */
5595 #else /* XENPV */
5596 		size_t newpdes;
5597 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5598 		if (cpm != kpm) {
5599 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5600 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5601 			    newpdes * sizeof(pd_entry_t));
5602 		}
5603 
5604 		mutex_enter(&pmaps_lock);
5605 		LIST_FOREACH(pm, &pmaps, pm_list) {
5606 			if (__predict_false(pm->pm_enter != NULL)) {
5607 				/*
5608 				 * Not a native pmap, the kernel is not mapped,
5609 				 * so nothing to synchronize.
5610 				 */
5611 				continue;
5612 			}
5613 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5614 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5615 			    newpdes * sizeof(pd_entry_t));
5616 		}
5617 		mutex_exit(&pmaps_lock);
5618 #endif
5619 		invalidate = true;
5620 	}
5621 	pmap_maxkvaddr = maxkvaddr;
5622 	mutex_exit(&kpm->pm_lock);
5623 	splx(s);
5624 
5625 	if (invalidate && pmap_initialized) {
5626 		/* Invalidate the pmap cache. */
5627 		pool_cache_invalidate(&pmap_cache);
5628 	}
5629 
5630 	return maxkvaddr;
5631 }
5632 
5633 #ifdef DEBUG
5634 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5635 
5636 /*
5637  * pmap_dump: dump all the mappings from a pmap
5638  *
5639  * => caller should not be holding any pmap locks
5640  */
5641 void
5642 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5643 {
5644 	pt_entry_t *ptes, *pte;
5645 	pd_entry_t * const *pdes;
5646 	struct pmap *pmap2;
5647 	vaddr_t blkendva;
5648 	int lvl;
5649 
5650 	/*
5651 	 * if end is out of range truncate.
5652 	 * if (end == start) update to max.
5653 	 */
5654 
5655 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5656 		eva = VM_MAXUSER_ADDRESS;
5657 
5658 	mutex_enter(&pmap->pm_lock);
5659 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5660 
5661 	/*
5662 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
5663 	 */
5664 
5665 	for (/* null */ ; sva < eva ; sva = blkendva) {
5666 
5667 		/* determine range of block */
5668 		blkendva = x86_round_pdr(sva+1);
5669 		if (blkendva > eva)
5670 			blkendva = eva;
5671 
5672 		/* valid block? */
5673 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5674 			continue;
5675 		KASSERT(lvl == 1);
5676 
5677 		pte = &ptes[pl1_i(sva)];
5678 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5679 			if (!pmap_valid_entry(*pte))
5680 				continue;
5681 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5682 			    " (pte=%#" PRIxPADDR ")\n",
5683 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5684 		}
5685 	}
5686 	pmap_unmap_ptes(pmap, pmap2);
5687 	mutex_exit(&pmap->pm_lock);
5688 }
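
/*
 * Example (illustrative sketch, DEBUG kernels only): dumping the user
 * mappings of the current process from kernel code.  The range is
 * clamped to VM_MAXUSER_ADDRESS by pmap_dump() itself:
 *
 *	pmap_dump(vm_map_pmap(&curproc->p_vmspace->vm_map),
 *	    0, VM_MAXUSER_ADDRESS);
 */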
5689 #endif
5690 
5691 /*
5692  * pmap_update: process deferred invalidations and frees.
5693  */
5694 void
5695 pmap_update(struct pmap *pmap)
5696 {
5697 	struct pmap_page *pp;
5698 	struct vm_page *ptp;
5699 
5700 	/*
5701 	 * Initiate any pending TLB shootdowns.  Wait for them to
5702 	 * complete before returning control to the caller.
5703 	 */
5704 	kpreempt_disable();
5705 	pmap_tlb_shootnow();
5706 	kpreempt_enable();
5707 
5708 	/*
5709 	 * Now that shootdowns are complete, process deferred frees.  This
5710 	 * is an unlocked check, but is safe as we're only interested in
5711 	 * work done in this LWP - we won't get a false negative.
5712 	 */
5713 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5714 		return;
5715 	}
5716 
5717 	mutex_enter(&pmap->pm_lock);
5718 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5719 		KASSERT(ptp->wire_count == 0);
5720 		KASSERT(ptp->uanon == NULL);
5721 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5722 		pp = VM_PAGE_TO_PP(ptp);
5723 		LIST_INIT(&pp->pp_pvlist);
5724 		pp->pp_attrs = 0;
5725 		pp->pp_pte.pte_ptp = NULL;
5726 		pp->pp_pte.pte_va = 0;
5727 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5728 
5729 		/*
5730 		 * XXX Hack to avoid extra locking, and lock
5731 		 * assertions in uvm_pagefree().  Despite uobject
5732 		 * being set, this isn't a managed page.
5733 		 */
5734 		PMAP_DUMMY_LOCK(pmap);
5735 		uvm_pagerealloc(ptp, NULL, 0);
5736 		PMAP_DUMMY_UNLOCK(pmap);
5737 		uvm_pagefree(ptp);
5738 	}
5739 	mutex_exit(&pmap->pm_lock);
5740 }
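
/*
 * Example (illustrative sketch): the usual caller-side pattern.  "va",
 * "pa" and "npages" are assumed variables describing a contiguous
 * kernel mapping; pmap_update() is called once after the whole batch so
 * that the deferred shootdowns and frees above are processed in one go:
 *
 *	for (i = 0; i < npages; i++)
 *		pmap_kenter_pa(va + i * PAGE_SIZE, pa + i * PAGE_SIZE,
 *		    VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 */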
5741 
5742 #if PTP_LEVELS > 4
5743 #error "Unsupported number of page table mappings"
5744 #endif
5745 
5746 paddr_t
5747 pmap_init_tmp_pgtbl(paddr_t pg)
5748 {
5749 	static bool maps_loaded;
5750 	static const paddr_t x86_tmp_pml_paddr[] = {
5751 	    4 * PAGE_SIZE,	/* L1 */
5752 	    5 * PAGE_SIZE,	/* L2 */
5753 	    6 * PAGE_SIZE,	/* L3 */
5754 	    7 * PAGE_SIZE	/* L4 */
5755 	};
5756 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5757 
5758 	pd_entry_t *tmp_pml, *kernel_pml;
5760 	int level;
5761 
5762 	if (!maps_loaded) {
5763 		for (level = 0; level < PTP_LEVELS; ++level) {
5764 			x86_tmp_pml_vaddr[level] =
5765 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5766 			    UVM_KMF_VAONLY);
5767 
5768 			if (x86_tmp_pml_vaddr[level] == 0)
5769 				panic("mapping of real mode PML failed");
5770 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5771 			    x86_tmp_pml_paddr[level],
5772 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5773 		}
5774 		pmap_update(pmap_kernel());
5775 		maps_loaded = true;
5776 	}
5777 
5778 	/* Zero levels 1-3 */
5779 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5780 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5781 		memset(tmp_pml, 0, PAGE_SIZE);
5782 	}
5783 
5784 	/* Copy PML4 */
5785 	kernel_pml = pmap_kernel()->pm_pdir;
5786 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5787 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
5788 
5789 #ifdef PAE
5790 	/*
5791 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5792 	 * last entries are unlikely to be used for temporary mappings.
5793 	 * 508: maps 0->1GB (userland)
5794 	 * 509: unused
5795 	 * 510: unused
5796 	 * 511: maps 3->4GB (kernel)
5797 	 */
5798 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
5799 	tmp_pml[509] = 0;
5800 	tmp_pml[510] = 0;
5801 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
5802 #endif
5803 
5804 	for (level = PTP_LEVELS - 1; level > 0; --level) {
5805 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5806 
5807 		tmp_pml[pl_i(pg, level + 1)] =
5808 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
5809 	}
5810 
5811 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
5812 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
5813 
5814 #ifdef PAE
5815 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
5816 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
5817 #endif
5818 
5819 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
5820 }
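
/*
 * Example (illustrative sketch): pmap_init_tmp_pgtbl() is intended for
 * code such as the MP and ACPI-wakeup trampolines, which need a minimal
 * page table rooted at a fixed low physical address.  "trampoline_pa"
 * is a hypothetical name for the physical page the trampoline runs from:
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 *
 * The returned value is handed to the trampoline, which loads it into
 * %cr3 before switching to the kernel's regular page tables.
 */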
5821 
5822 u_int
5823 x86_mmap_flags(paddr_t mdpgno)
5824 {
5825 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
5826 	u_int pflag = 0;
5827 
5828 	if (nflag & X86_MMAP_FLAG_PREFETCH)
5829 		pflag |= PMAP_WRITE_COMBINE;
5830 
5831 	return pflag;
5832 }
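
/*
 * Example (illustrative sketch): the flag bits decoded above are encoded
 * by a device's d_mmap entry point above the physical page number it
 * returns.  "mydev_mmap" and MYDEV_FB_BASE are hypothetical; the encoding
 * simply mirrors the decoding done in x86_mmap_flags():
 *
 *	paddr_t
 *	mydev_mmap(dev_t dev, off_t off, int prot)
 *	{
 *		paddr_t pa = MYDEV_FB_BASE + off;
 *
 *		return x86_btop(pa) |
 *		    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 *	}
 */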
5833 
5834 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
5835 
5836 /*
5837  * -----------------------------------------------------------------------------
5838  * *****************************************************************************
5839  * *****************************************************************************
5840  * *****************************************************************************
5841  * *****************************************************************************
5842  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
5843  * *****************************************************************************
5844  * *****************************************************************************
5845  * *****************************************************************************
5846  * *****************************************************************************
5847  * -----------------------------------------------------------------------------
5848  *
5849  * These functions are invoked as callbacks from the code above. Contrary to
5850  * native, EPT does not have a recursive slot; therefore, it is not possible
5851  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
5852  * tree manually.
5853  *
5854  * Apart from that, the logic is mostly the same as native. Once a pmap has
5855  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
5856  * After that we're good, and the callbacks will handle the translations
5857  * for us.
5858  *
5859  * -----------------------------------------------------------------------------
5860  */
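
/*
 * Illustrative sketch of the manual walk described above (the real walks
 * live in pmap_ept_get_tree() and pmap_ept_pdes_invalid() below): each
 * level's physical address is turned into a pointer through the direct
 * map instead of through a recursive slot.
 *
 *	paddr_t ptepa = pmap->pm_pdirpa[0];
 *	pt_entry_t *pteva;
 *	int i;
 *
 *	for (i = PTP_LEVELS; i > 1; i--) {
 *		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
 *		ptepa = pmap_pte2pa(pteva[pl_pi(va, i)]);
 *	}
 *
 * After the loop, ptepa is the physical address of the L1 page for "va".
 */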
5861 
5862 /* Hardware bits. */
5863 #define EPT_R		__BIT(0)	/* read */
5864 #define EPT_W		__BIT(1)	/* write */
5865 #define EPT_X		__BIT(2)	/* execute */
5866 #define EPT_T		__BITS(5,3)	/* type */
5867 #define		TYPE_UC	0
5868 #define		TYPE_WC	1
5869 #define		TYPE_WT	4
5870 #define		TYPE_WP	5
5871 #define		TYPE_WB	6
5872 #define EPT_NOPAT	__BIT(6)
5873 #define EPT_L		__BIT(7)	/* large */
5874 #define EPT_A		__BIT(8)	/* accessed */
5875 #define EPT_D		__BIT(9)	/* dirty */
5876 /* Software bits. */
5877 #define EPT_PVLIST	__BIT(60)
5878 #define EPT_WIRED	__BIT(61)
5879 
5880 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
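
/*
 * For illustration: a leaf EPT PTE for a writable, write-back cached
 * page at physical address "pa" is composed from the bits above as
 *
 *	pa | EPT_R | EPT_W | __SHIFTIN(TYPE_WB, EPT_T) | EPT_NOPAT
 *
 * which is what pmap_ept_enter() builds below via pmap_ept_prot() and
 * pmap_ept_type().
 */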
5881 
5882 bool pmap_ept_has_ad __read_mostly;
5883 
5884 static inline void
5885 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
5886 {
5887 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
5888 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
5889 
5890 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5891 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5892 
5893 	pmap_stats_update(pmap, resid_diff, wired_diff);
5894 }
5895 
5896 static pt_entry_t
5897 pmap_ept_type(u_int flags)
5898 {
5899 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
5900 	pt_entry_t ret;
5901 
5902 	switch (cacheflags) {
5903 	case PMAP_NOCACHE:
5904 	case PMAP_NOCACHE_OVR:
5905 		ret = __SHIFTIN(TYPE_UC, EPT_T);
5906 		break;
5907 	case PMAP_WRITE_COMBINE:
5908 		ret = __SHIFTIN(TYPE_WC, EPT_T);
5909 		break;
5910 	case PMAP_WRITE_BACK:
5911 	default:
5912 		ret = __SHIFTIN(TYPE_WB, EPT_T);
5913 		break;
5914 	}
5915 
5916 	ret |= EPT_NOPAT;
5917 	return ret;
5918 }
5919 
5920 static inline pt_entry_t
5921 pmap_ept_prot(vm_prot_t prot)
5922 {
5923 	pt_entry_t res = 0;
5924 
5925 	if (prot & VM_PROT_READ)
5926 		res |= EPT_R;
5927 	if (prot & VM_PROT_WRITE)
5928 		res |= EPT_W;
5929 	if (prot & VM_PROT_EXECUTE)
5930 		res |= EPT_X;
5931 
5932 	return res;
5933 }
5934 
5935 static inline uint8_t
5936 pmap_ept_to_pp_attrs(pt_entry_t ept)
5937 {
5938 	uint8_t ret = 0;
5939 	if (pmap_ept_has_ad) {
5940 		if (ept & EPT_D)
5941 			ret |= PP_ATTRS_D;
5942 		if (ept & EPT_A)
5943 			ret |= PP_ATTRS_A;
5944 	} else {
5945 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
5946 	}
5947 	if (ept & EPT_W)
5948 		ret |= PP_ATTRS_W;
5949 	return ret;
5950 }
5951 
5952 static inline pt_entry_t
5953 pmap_pp_attrs_to_ept(uint8_t attrs)
5954 {
5955 	pt_entry_t ept = 0;
5956 	if (attrs & PP_ATTRS_D)
5957 		ept |= EPT_D;
5958 	if (attrs & PP_ATTRS_A)
5959 		ept |= EPT_A;
5960 	if (attrs & PP_ATTRS_W)
5961 		ept |= EPT_W;
5962 	return ept;
5963 }
5964 
5965 /*
5966  * Helper for pmap_ept_free_ptp.
5967  * tree[0] = &L2[L2idx]
5968  * tree[1] = &L3[L3idx]
5969  * tree[2] = &L4[L4idx]
5970  */
5971 static void
5972 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
5973 {
5974 	pt_entry_t *pteva;
5975 	paddr_t ptepa;
5976 	int i, index;
5977 
5978 	ptepa = pmap->pm_pdirpa[0];
5979 	for (i = PTP_LEVELS; i > 1; i--) {
5980 		index = pl_pi(va, i);
5981 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5982 		KASSERT(pmap_ept_valid_entry(pteva[index]));
5983 		tree[i - 2] = &pteva[index];
5984 		ptepa = pmap_pte2pa(pteva[index]);
5985 	}
5986 }
5987 
5988 static void
5989 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
5990 {
5991 	pd_entry_t *tree[3];
5992 	int level;
5993 
5994 	KASSERT(pmap != pmap_kernel());
5995 	KASSERT(mutex_owned(&pmap->pm_lock));
5996 	KASSERT(kpreempt_disabled());
5997 
5998 	pmap_ept_get_tree(pmap, va, tree);
5999 
6000 	level = 1;
6001 	do {
6002 		(void)pmap_pte_testset(tree[level - 1], 0);
6003 
6004 		pmap_freepage(pmap, ptp, level);
6005 		if (level < PTP_LEVELS - 1) {
6006 			ptp = pmap_find_ptp(pmap, va, level + 1);
6007 			ptp->wire_count--;
6008 			if (ptp->wire_count > 1)
6009 				break;
6010 		}
6011 	} while (++level < PTP_LEVELS);
6012 	pmap_pte_flush();
6013 }
6014 
6015 /* Install any newly allocated PTPs into the tree: L4->L3->L2. */
6016 static void
6017 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6018 {
6019 	struct vm_page *ptp;
6020 	unsigned long index;
6021 	pd_entry_t *pteva;
6022 	paddr_t ptepa;
6023 	int i;
6024 
6025 	KASSERT(pmap != pmap_kernel());
6026 	KASSERT(mutex_owned(&pmap->pm_lock));
6027 	KASSERT(kpreempt_disabled());
6028 
6029 	/*
6030 	 * Now that we have all the pages looked up or allocated,
6031 	 * loop through again installing any new ones into the tree.
6032 	 */
6033 	ptepa = pmap->pm_pdirpa[0];
6034 	for (i = PTP_LEVELS; i > 1; i--) {
6035 		index = pl_pi(va, i);
6036 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6037 
6038 		if (pmap_ept_valid_entry(pteva[index])) {
6039 			KASSERT(!pt->alloced[i]);
6040 			ptepa = pmap_pte2pa(pteva[index]);
6041 			continue;
6042 		}
6043 
6044 		ptp = pt->pg[i];
6045 		ptp->flags &= ~PG_BUSY; /* never busy */
6046 		ptp->wire_count = 1;
6047 		pmap->pm_ptphint[i - 2] = ptp;
6048 		ptepa = VM_PAGE_TO_PHYS(ptp);
6049 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6050 
6051 		pmap_pte_flush();
6052 		pmap_stats_update(pmap, 1, 0);
6053 
6054 		/*
6055 		 * If we're not in the top level, increase the
6056 		 * wire count of the parent page.
6057 		 */
6058 		if (i < PTP_LEVELS) {
6059 			pt->pg[i + 1]->wire_count++;
6060 		}
6061 	}
6062 }
6063 
6064 static int
6065 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6066     u_int flags)
6067 {
6068 	pt_entry_t *ptes, opte, npte;
6069 	pt_entry_t *ptep;
6070 	struct vm_page *ptp;
6071 	struct vm_page *new_pg, *old_pg;
6072 	struct pmap_page *new_pp, *old_pp;
6073 	struct pv_entry *old_pve, *new_pve;
6074 	bool wired = (flags & PMAP_WIRED) != 0;
6075 	bool accessed;
6076 	struct pmap_ptparray pt;
6077 	int error;
6078 	bool getptp, samepage, new_embedded;
6079 	rb_tree_t *tree;
6080 
6081 	KASSERT(pmap_initialized);
6082 	KASSERT(va < VM_MAXUSER_ADDRESS);
6083 
6084 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6085 
6086 	if (wired)
6087 		npte |= EPT_WIRED;
6088 	if (flags & VM_PROT_ALL) {
6089 		npte |= EPT_A;
6090 		if (flags & VM_PROT_WRITE) {
6091 			KASSERT((npte & EPT_W) != 0);
6092 			npte |= EPT_D;
6093 		}
6094 	}
6095 
6096 	new_pg = PHYS_TO_VM_PAGE(pa);
6097 	if (new_pg != NULL) {
6098 		/* This is a managed page */
6099 		npte |= EPT_PVLIST;
6100 		new_pp = VM_PAGE_TO_PP(new_pg);
6101 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6102 		/* This is an unmanaged pv-tracked page */
6103 		npte |= EPT_PVLIST;
6104 	} else {
6105 		new_pp = NULL;
6106 	}
6107 
6108 	/* Begin by locking the pmap. */
6109 	mutex_enter(&pmap->pm_lock);
6110 
6111 	/* Look up the PTP.  Allocate if none present. */
6112 	ptp = NULL;
6113 	getptp = false;
6114 	if (pmap != pmap_kernel()) {
6115 		ptp = pmap_find_ptp(pmap, va, 1);
6116 		if (ptp == NULL) {
6117 			getptp = true;
6118 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6119 			if (error != 0) {
6120 				if (flags & PMAP_CANFAIL) {
6121 					mutex_exit(&pmap->pm_lock);
6122 					return error;
6123 				}
6124 				panic("%s: get ptp failed, error=%d", __func__,
6125 				    error);
6126 			}
6127 		}
6128 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6129 	} else {
6130 		/* Embedded PV entries rely on this. */
6131 		KASSERT(va != 0);
6132 		tree = &pmap_kernel_rb;
6133 	}
6134 
6135 	/*
6136 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6137 	 * entry if required for the new mapping.  Temporarily track the old
6138 	 * and new mappings concurrently.  Only after the old mapping is
6139 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6140 	 * our picture of modified/accessed state for either page could get
6141 	 * out of sync (we need any P->V operation for either page to stall
6142 	 * on pmap->pm_lock until done here).
6143 	 */
6144 	new_pve = NULL;
6145 	old_pve = NULL;
6146 	samepage = false;
6147 	new_embedded = false;
6148 
6149 	if (new_pp != NULL) {
6150 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6151 		    &old_pve, &samepage, &new_embedded, tree);
6152 
6153 		/*
6154 		 * If a new pv_entry was needed and none was available, we
6155 		 * can go no further.
6156 		 */
6157 		if (error != 0) {
6158 			if (flags & PMAP_CANFAIL) {
6159 				if (getptp) {
6160 					pmap_unget_ptp(pmap, &pt);
6161 				}
6162 				mutex_exit(&pmap->pm_lock);
6163 				return error;
6164 			}
6165 			panic("%s: alloc pve failed", __func__);
6166 		}
6167 	} else {
6168 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6169 	}
6170 
6171 	/* Map PTEs into address space. */
6172 	kpreempt_disable();
6173 
6174 	/* Install any newly allocated PTPs. */
6175 	if (getptp) {
6176 		pmap_ept_install_ptp(pmap, &pt, va);
6177 	}
6178 
6179 	/* Check if there is an existing mapping. */
6180 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6181 	ptep = &ptes[pl1_pi(va)];
6182 	opte = *ptep;
6183 	bool have_oldpa = pmap_ept_valid_entry(opte);
6184 	paddr_t oldpa = pmap_pte2pa(opte);
6185 
6186 	/*
6187 	 * Update the pte.
6188 	 */
6189 	do {
6190 		opte = *ptep;
6191 
6192 		/*
6193 		 * If the same page, inherit EPT_A and EPT_D.
6194 		 */
6195 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6196 			npte |= opte & (EPT_A | EPT_D);
6197 		}
6198 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6199 
6200 	/*
6201 	 * Done with the PTEs: they can now be unmapped.
6202 	 */
6203 	kpreempt_enable();
6204 
6205 	/*
6206 	 * Update statistics and PTP's reference count.
6207 	 */
6208 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6209 	if (ptp != NULL) {
6210 		if (!have_oldpa) {
6211 			ptp->wire_count++;
6212 		}
6213 		/* Remember minimum VA in PTP. */
6214 		pmap_ptp_range_set(ptp, va);
6215 	}
6216 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6217 
6218 	/*
6219 	 * If the same page, we can skip pv_entry handling.
6220 	 */
6221 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6222 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6223 		if ((npte & EPT_PVLIST) != 0) {
6224 			KASSERT(samepage);
6225 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6226 		}
6227 		goto same_pa;
6228 	} else if ((npte & EPT_PVLIST) != 0) {
6229 		KASSERT(!samepage);
6230 	}
6231 
6232 	/*
6233 	 * If old page is pv-tracked, remove pv_entry from its list.
6234 	 */
6235 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6236 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6237 			old_pp = VM_PAGE_TO_PP(old_pg);
6238 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6239 			panic("%s: EPT_PVLIST with pv-untracked page"
6240 			    " va = %#"PRIxVADDR
6241 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6242 			    __func__, va, oldpa, atop(pa));
6243 		}
6244 
6245 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6246 		    pmap_ept_to_pp_attrs(opte));
6247 	} else {
6248 		KASSERT(old_pve == NULL);
6249 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6250 	}
6251 
6252 	/*
6253 	 * If new page is dynamically PV tracked, insert to tree.
6254 	 */
6255 	if (new_pve != NULL) {
6256 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6257 		old_pve = rb_tree_insert_node(tree, new_pve);
6258 		KASSERT(old_pve == new_pve);
6259 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6260 	}
6261 
6262 same_pa:
6263 	/*
6264 	 * shootdown tlb if necessary.
6265 	 */
6266 
6267 	if (pmap_ept_has_ad) {
6268 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6269 	} else {
6270 		accessed = (opte & EPT_R) != 0;
6271 	}
6272 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6273 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6274 	}
6275 	pmap_drain_pv(pmap);
6276 	mutex_exit(&pmap->pm_lock);
6277 	return 0;
6278 }
6279 
6280 /* Pay close attention: *lastpde gets the L2 entry; returns 0 if valid. */
6281 static int
6282 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6283 {
6284 	pt_entry_t *pteva;
6285 	paddr_t ptepa;
6286 	int i, index;
6287 
6288 	KASSERT(mutex_owned(&pmap->pm_lock));
6289 
6290 	ptepa = pmap->pm_pdirpa[0];
6291 	for (i = PTP_LEVELS; i > 1; i--) {
6292 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6293 		index = pl_pi(va, i);
6294 		if (!pmap_ept_valid_entry(pteva[index]))
6295 			return i;
6296 		ptepa = pmap_pte2pa(pteva[index]);
6297 	}
6298 	if (lastpde != NULL) {
6299 		*lastpde = pteva[index];
6300 	}
6301 
6302 	return 0;
6303 }
6304 
6305 static bool
6306 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6307 {
6308 	pt_entry_t *ptes, pte;
6309 	pd_entry_t pde;
6310 	paddr_t ptppa, pa;
6311 	bool rv;
6312 
6313 #ifdef __HAVE_DIRECT_MAP
6314 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6315 		if (pap != NULL) {
6316 			*pap = PMAP_DIRECT_UNMAP(va);
6317 		}
6318 		return true;
6319 	}
6320 #endif
6321 
6322 	rv = false;
6323 	pa = 0;
6324 
6325 	mutex_enter(&pmap->pm_lock);
6326 	kpreempt_disable();
6327 
6328 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6329 		ptppa = pmap_pte2pa(pde);
6330 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6331 		pte = ptes[pl1_pi(va)];
6332 		if (__predict_true((pte & EPT_R) != 0)) {
6333 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6334 			rv = true;
6335 		}
6336 	}
6337 
6338 	kpreempt_enable();
6339 	mutex_exit(&pmap->pm_lock);
6340 
6341 	if (pap != NULL) {
6342 		*pap = pa;
6343 	}
6344 	return rv;
6345 }
6346 
6347 static bool
6348 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6349     vaddr_t va)
6350 {
6351 	struct pv_entry *pve;
6352 	struct vm_page *pg;
6353 	struct pmap_page *pp;
6354 	pt_entry_t opte;
6355 	bool accessed;
6356 
6357 	KASSERT(pmap != pmap_kernel());
6358 	KASSERT(mutex_owned(&pmap->pm_lock));
6359 	KASSERT(kpreempt_disabled());
6360 
6361 	if (!pmap_ept_valid_entry(*pte)) {
6362 		/* VA not mapped. */
6363 		return false;
6364 	}
6365 
6366 	/* Atomically save the old PTE and zap it. */
6367 	opte = pmap_pte_testset(pte, 0);
6368 	if (!pmap_ept_valid_entry(opte)) {
6369 		return false;
6370 	}
6371 
6372 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6373 
6374 	if (ptp) {
6375 		/*
6376 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6377 		 */
6378 		ptp->wire_count--;
6379 		if (ptp->wire_count <= 1) {
6380 			opte |= EPT_A;
6381 		}
6382 	}
6383 
6384 	if (pmap_ept_has_ad) {
6385 		accessed = (opte & EPT_A) != 0;
6386 	} else {
6387 		accessed = true;
6388 	}
6389 	if (accessed) {
6390 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6391 	}
6392 
6393 	/*
6394 	 * If we are not on a pv list - we are done.
6395 	 */
6396 	if ((opte & EPT_PVLIST) == 0) {
6397 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6398 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6399 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6400 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6401 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6402 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6403 		return true;
6404 	}
6405 
6406 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6407 		pp = VM_PAGE_TO_PP(pg);
6408 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6409 		paddr_t pa = pmap_pte2pa(opte);
6410 		panic("%s: EPT_PVLIST with pv-untracked page"
6411 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6412 		    __func__, va, pa, atop(pa));
6413 	}
6414 
6415 	/* Sync R/M bits. */
6416 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6417 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6418 	return true;
6419 }
6420 
6421 static void
6422 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6423     vaddr_t startva, vaddr_t endva)
6424 {
6425 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6426 
6427 	KASSERT(pmap != pmap_kernel());
6428 	KASSERT(mutex_owned(&pmap->pm_lock));
6429 	KASSERT(kpreempt_disabled());
6430 
6431 	/*
6432 	 * mappings are very often sparse, so clip the given range to the
6433 	 * range of PTEs that are known present in the PTP.
6434 	 */
6435 	pmap_ptp_range_clip(ptp, &startva, &pte);
6436 
6437 	/*
6438 	 * note that ptpva points to the PTE that maps startva.   this may
6439 	 * or may not be the first PTE in the PTP.
6440 	 *
6441 	 * we loop through the PTP while there are still PTEs to look at
6442 	 * and the wire_count is greater than 1 (because we use the wire_count
6443 	 * to keep track of the number of real PTEs in the PTP).
6444 	 */
6445 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6446 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6447 		startva += PAGE_SIZE;
6448 		pte++;
6449 	}
6450 }
6451 
6452 static void
6453 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6454 {
6455 	pt_entry_t *ptes;
6456 	pd_entry_t pde;
6457 	paddr_t ptppa;
6458 	vaddr_t blkendva, va = sva;
6459 	struct vm_page *ptp;
6460 
6461 	mutex_enter(&pmap->pm_lock);
6462 	kpreempt_disable();
6463 
6464 	for (/* null */ ; va < eva ; va = blkendva) {
6465 		int lvl;
6466 
6467 		/* determine range of block */
6468 		blkendva = x86_round_pdr(va+1);
6469 		if (blkendva > eva)
6470 			blkendva = eva;
6471 
6472 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6473 		if (lvl != 0) {
6474 			/* Skip a range corresponding to an invalid pde. */
6475 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6476 			continue;
6477 		}
6478 
6479 		/* PA of the PTP */
6480 		ptppa = pmap_pte2pa(pde);
6481 
6482 		ptp = pmap_find_ptp(pmap, va, 1);
6483 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6484 		    __func__);
6485 
6486 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6487 
6488 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6489 		    blkendva);
6490 
6491 		/* If PTP is no longer being used, free it. */
6492 		if (ptp && ptp->wire_count <= 1) {
6493 			pmap_ept_free_ptp(pmap, ptp, va);
6494 		}
6495 	}
6496 
6497 	kpreempt_enable();
6498 	pmap_drain_pv(pmap);
6499 	mutex_exit(&pmap->pm_lock);
6500 }
6501 
6502 static int
6503 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6504     uint8_t *oattrs, pt_entry_t *optep)
6505 {
6506 	struct pmap *pmap;
6507 	pt_entry_t *ptep;
6508 	pt_entry_t opte;
6509 	pt_entry_t npte;
6510 	pt_entry_t expect;
6511 	bool need_shootdown;
6512 
6513 	expect = pmap_pa2pte(pa) | EPT_R;
6514 	pmap = ptp_to_pmap(ptp);
6515 
6516 	if (clearbits != ~0) {
6517 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6518 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6519 	}
6520 
6521 	ptep = pmap_map_pte(pmap, ptp, va);
6522 	do {
6523 		opte = *ptep;
6524 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6525 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6526 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6527 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6528 			/*
6529 			 * We lost a race with a V->P operation like
6530 			 * pmap_remove().  Wait for the competitor to
6531 			 * finish reflecting the pte bits into mp_attrs.
6532 			 */
6533 			pmap_unmap_pte();
6534 			return EAGAIN;
6535 		}
6536 
6537 		/*
6538 		 * Check if there's anything to do on this PTE.
6539 		 */
6540 		if ((opte & clearbits) == 0) {
6541 			need_shootdown = false;
6542 			break;
6543 		}
6544 
6545 		/*
6546 		 * We need a shootdown if the PTE is cached (EPT_A),
6547 		 * unless we are clearing only the EPT_W bit and
6548 		 * it isn't cached as RW (EPT_D).
6549 		 */
6550 		if (pmap_ept_has_ad) {
6551 			need_shootdown = (opte & EPT_A) != 0 &&
6552 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6553 		} else {
6554 			need_shootdown = true;
6555 		}
6556 
6557 		npte = opte & ~clearbits;
6558 
6559 		/*
6560 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6561 		 */
6562 		if (need_shootdown) {
6563 			npte &= ~(EPT_A | EPT_D);
6564 		}
6565 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6566 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6567 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6568 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6569 
6570 	if (need_shootdown) {
6571 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6572 	}
6573 	pmap_unmap_pte();
6574 
6575 	*oattrs = pmap_ept_to_pp_attrs(opte);
6576 	if (optep != NULL)
6577 		*optep = opte;
6578 	return 0;
6579 }
6580 
6581 static void
6582 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6583     vaddr_t va)
6584 {
6585 
6586 	KASSERT(mutex_owned(&pmap->pm_lock));
6587 
6588 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6589 	ptp->wire_count--;
6590 	if (ptp->wire_count <= 1) {
6591 		pmap_ept_free_ptp(pmap, ptp, va);
6592 	}
6593 }
6594 
6595 static void
6596 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6597 {
6598 	pt_entry_t bit_rem;
6599 	pt_entry_t *ptes, *spte;
6600 	pt_entry_t opte, npte;
6601 	pd_entry_t pde;
6602 	paddr_t ptppa;
6603 	vaddr_t va;
6604 	bool modified;
6605 
6606 	bit_rem = 0;
6607 	if (!(prot & VM_PROT_WRITE))
6608 		bit_rem = EPT_W;
6609 
6610 	sva &= PTE_FRAME;
6611 	eva &= PTE_FRAME;
6612 
6613 	/* Acquire pmap. */
6614 	mutex_enter(&pmap->pm_lock);
6615 	kpreempt_disable();
6616 
6617 	for (va = sva; va < eva; va += PAGE_SIZE) {
6618 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6619 			continue;
6620 		}
6621 
6622 		ptppa = pmap_pte2pa(pde);
6623 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6624 		spte = &ptes[pl1_pi(va)];
6625 
6626 		do {
6627 			opte = *spte;
6628 			if (!pmap_ept_valid_entry(opte)) {
6629 				goto next;
6630 			}
6631 			npte = (opte & ~bit_rem);
6632 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6633 
6634 		if (pmap_ept_has_ad) {
6635 			modified = (opte & EPT_D) != 0;
6636 		} else {
6637 			modified = true;
6638 		}
6639 		if (modified) {
6640 			vaddr_t tva = x86_ptob(spte - ptes);
6641 			pmap_tlb_shootdown(pmap, tva, 0,
6642 			    TLBSHOOT_WRITE_PROTECT);
6643 		}
6644 next:;
6645 	}
6646 
6647 	kpreempt_enable();
6648 	mutex_exit(&pmap->pm_lock);
6649 }
6650 
6651 static void
6652 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6653 {
6654 	pt_entry_t *ptes, *ptep, opte;
6655 	pd_entry_t pde;
6656 	paddr_t ptppa;
6657 
6658 	/* Acquire pmap. */
6659 	mutex_enter(&pmap->pm_lock);
6660 	kpreempt_disable();
6661 
6662 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6663 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6664 	}
6665 
6666 	ptppa = pmap_pte2pa(pde);
6667 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6668 	ptep = &ptes[pl1_pi(va)];
6669 	opte = *ptep;
6670 	KASSERT(pmap_ept_valid_entry(opte));
6671 
6672 	if (opte & EPT_WIRED) {
6673 		pt_entry_t npte = opte & ~EPT_WIRED;
6674 
6675 		opte = pmap_pte_testset(ptep, npte);
6676 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6677 	} else {
6678 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6679 		    " did not change!\n", __func__, pmap, va);
6680 	}
6681 
6682 	/* Release pmap. */
6683 	kpreempt_enable();
6684 	mutex_exit(&pmap->pm_lock);
6685 }
6686 
6687 /* -------------------------------------------------------------------------- */
6688 
6689 void
6690 pmap_ept_transform(struct pmap *pmap)
6691 {
6692 	pmap->pm_enter = pmap_ept_enter;
6693 	pmap->pm_extract = pmap_ept_extract;
6694 	pmap->pm_remove = pmap_ept_remove;
6695 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6696 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6697 	pmap->pm_write_protect = pmap_ept_write_protect;
6698 	pmap->pm_unwire = pmap_ept_unwire;
6699 
6700 	memset(pmap->pm_pdir, 0, PAGE_SIZE);
6701 }
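
/*
 * Example (illustrative sketch): a hypervisor such as NVMM creates the
 * guest physical address space as an ordinary vmspace and then converts
 * its pmap in place; from then on the pm_* callbacks installed above
 * handle the EPT translations.  MAXGPA is a hypothetical limit for guest
 * physical addresses:
 *
 *	struct vmspace *vm = uvmspace_alloc(0, MAXGPA, false);
 *	pmap_ept_transform(vm_map_pmap(&vm->vm_map));
 */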
6702 
6703 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6704