xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /*	$NetBSD: pmap.c,v 1.410 2021/04/17 18:03:21 bouyer Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.410 2021/04/17 18:03:21 bouyer Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 
142 #define	__MUTEX_PRIVATE	/* for assertions */
143 
144 #include <sys/param.h>
145 #include <sys/systm.h>
146 #include <sys/proc.h>
147 #include <sys/pool.h>
148 #include <sys/kernel.h>
149 #include <sys/atomic.h>
150 #include <sys/cpu.h>
151 #include <sys/intr.h>
152 #include <sys/xcall.h>
153 #include <sys/kcore.h>
154 #include <sys/kmem.h>
155 #include <sys/asan.h>
156 #include <sys/msan.h>
157 #include <sys/entropy.h>
158 
159 #include <uvm/uvm.h>
160 #include <uvm/pmap/pmap_pvt.h>
161 
162 #include <dev/isa/isareg.h>
163 
164 #include <machine/specialreg.h>
165 #include <machine/gdt.h>
166 #include <machine/isa_machdep.h>
167 #include <machine/cpuvar.h>
168 #include <machine/cputypes.h>
169 
170 #include <x86/pmap_pv.h>
171 
172 #include <x86/i82489reg.h>
173 #include <x86/i82489var.h>
174 
175 #ifdef XEN
176 #include <xen/include/public/xen.h>
177 #include <xen/hypervisor.h>
178 #include <xen/xenpmap.h>
179 #endif
180 
181 /*
182  * general info:
183  *
184  *  - for an explanation of how the x86 MMU hardware works see
185  *    the comments in <machine/pte.h>.
186  *
187  *  - for an explanation of the general memory structure used by
188  *    this pmap (including the recursive mapping), see the comments
189  *    in <machine/pmap.h>.
190  *
191  * this file contains the code for the "pmap module."   the module's
192  * job is to manage the hardware's virtual to physical address mappings.
193  * note that there are two levels of mapping in the VM system:
194  *
195  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
196  *      to map ranges of virtual address space to objects/files.  for
197  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
198  *      to the file /bin/ls starting at offset zero."   note that
199  *      the upper layer mapping is not concerned with how individual
200  *      vm_pages are mapped.
201  *
202  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
203  *      from virtual addresses.   it is concerned with which vm_page is
204  *      mapped where.   for example, when you run /bin/ls and start
205  *      at page 0x1000 the fault routine may lookup the correct page
206  *      of the /bin/ls file and then ask the pmap layer to establish
207  *      a mapping for it.
208  *
209  * note that information in the lower layer of the VM system can be
210  * thrown away since it can easily be reconstructed from the info
211  * in the upper layer.
212  *
213  * data structures we use include:
214  *
215  *  - struct pmap: describes the address space of one thread
216  *  - struct pmap_page: describes one pv-tracked page, without
217  *    necessarily a corresponding vm_page
218  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
219  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
220  *    physical memory.   the pp_pvlist points to a list of pv_entry
221  *    structures which describe all the <PMAP,VA> pairs that this
222  *    page is mapped in.    this is critical for page based operations
223  *    such as pmap_page_protect() [change protection on _all_ mappings
224  *    of a page]
225  */
226 
227 /*
228  * Locking
229  *
230  * We have the following locks that we must deal with, listed in the order
231  * that they are acquired:
232  *
233  * pg->uobject->vmobjlock, pg->uanon->an_lock
234  *
235  * 	For managed pages, these per-object locks are taken by the VM system
236  *	before calling into the pmap module - either a read or write hold.
237  *	The lock hold prevents pages from changing identity while the pmap is
238  *	operating on them.  For example, the same lock is held across a call
239  *	to pmap_remove() and the following call to pmap_update(), so that a
240  *	page does not gain a new identity while its TLB visibility is stale.
241  *
242  * pmap->pm_lock
243  *
244  *	This lock protects the fields in the pmap structure including the
245  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
246  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
247  *	kernel PDEs are never freed, and the kernel is expected to be self
248  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
249  *	because they can be modified from interrupt context).
250  *
251  * pmaps_lock
252  *
253  *	This lock protects the list of active pmaps (headed by "pmaps").
254  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
255  *
256  * pp_lock
257  *
258  *	This per-page lock protects PV entry lists and the embedded PV entry
259  *	in each vm_page, allowing for concurrent operation on pages by
260  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
261  *	points it is taken context switching is usually not tolerable, and
262  *	spin mutexes must block out interrupts that could take kernel_lock.
263  */
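/*
 * Illustrative sketch (not an actual code path in this file) of the
 * acquisition order described above, for an operation on a managed page
 * mapped in a user pmap:
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);   (or uanon->an_lock)
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_spin_enter(&pp->pp_lock);                (spin mutex at IPL_VM)
 *	... modify the PV list / PTEs ...
 *	mutex_spin_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 *
 * pmaps_lock, when needed, is taken after pm_lock and before pp_lock, per
 * the order listed above.
 */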
264 
265 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
266 #ifdef DIAGNOSTIC
267 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
268 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
269 #else
270 #define	PMAP_DUMMY_LOCK(pm)
271 #define	PMAP_DUMMY_UNLOCK(pm)
272 #endif
273 
274 static const struct uvm_pagerops pmap_pager = {
275 	/* nothing */
276 };
277 
278 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
279 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
280 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
281 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
282 const long nbpd[] = NBPD_INITIALIZER;
283 #ifdef i386
284 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
285 #else
286 pd_entry_t *normal_pdes[3];
287 #endif
288 
289 long nkptp[] = NKPTP_INITIALIZER;
290 
291 struct pmap_head pmaps;
292 kmutex_t pmaps_lock __cacheline_aligned;
293 
294 struct pcpu_area *pcpuarea __read_mostly;
295 
296 static vaddr_t pmap_maxkvaddr;
297 
298 /*
299  * Misc. event counters.
300  */
301 struct evcnt pmap_iobmp_evcnt;
302 struct evcnt pmap_ldt_evcnt;
303 
304 /*
305  * PAT
306  */
307 static bool cpu_pat_enabled __read_mostly = false;
308 
309 /*
310  * Global data structures
311  */
312 
313 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
314 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
315 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
316 
317 struct bootspace bootspace __read_mostly;
318 struct slotspace slotspace __read_mostly;
319 
320 /* Set to PTE_NX if supported. */
321 pd_entry_t pmap_pg_nx __read_mostly = 0;
322 
323 /* Set to PTE_G if supported. */
324 pd_entry_t pmap_pg_g __read_mostly = 0;
325 
326 /* Set to true if large pages are supported. */
327 int pmap_largepages __read_mostly = 0;
328 
329 paddr_t lowmem_rsvd __read_mostly;
330 paddr_t avail_start __read_mostly; /* PA of first available physical page */
331 paddr_t avail_end __read_mostly; /* PA of last available physical page */
332 
333 #ifdef XENPV
334 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
335 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
336 #endif
337 
338 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
339 #define	PMAP_CHECK_PP(pp) \
340     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
341 
342 #define PAGE_ALIGNED(pp)	\
343 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
344 
345 /*
346  * Other data structures
347  */
348 
349 static pt_entry_t protection_codes[8] __read_mostly;
350 
351 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
352 
353 /*
354  * The following two vaddr_t's are used during system startup to keep track of
355  * how much of the kernel's VM space we have used. Once the system is started,
356  * the management of the remaining kernel VM space is turned over to the
357  * kernel_map vm_map.
358  */
359 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
360 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
361 
362 #ifndef XENPV
363 /*
364  * LAPIC virtual address, and fake physical address.
365  */
366 volatile vaddr_t local_apic_va __read_mostly;
367 paddr_t local_apic_pa __read_mostly;
368 #endif
369 
370 /*
371  * pool that pmap structures are allocated from
372  */
373 struct pool_cache pmap_cache;
374 static int  pmap_ctor(void *, void *, int);
375 static void pmap_dtor(void *, void *);
376 
377 /*
378  * pv_page cache
379  */
380 static struct pool_cache pmap_pvp_cache;
381 
382 #ifdef __HAVE_DIRECT_MAP
383 vaddr_t pmap_direct_base __read_mostly;
384 vaddr_t pmap_direct_end __read_mostly;
385 #endif
386 
387 #ifndef __HAVE_DIRECT_MAP
388 /*
389  * Special VAs and the PTEs that map them
390  */
391 static pt_entry_t *early_zero_pte;
392 static void pmap_vpage_cpualloc(struct cpu_info *);
393 #ifdef XENPV
394 char *early_zerop; /* also referenced from xen_locore() */
395 #else
396 static char *early_zerop;
397 #endif
398 #endif
399 
400 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
401 
402 /* PDP pool and its callbacks */
403 static struct pool pmap_pdp_pool;
404 static void pmap_pdp_init(pd_entry_t *);
405 static void pmap_pdp_fini(pd_entry_t *);
406 
407 #ifdef PAE
408 /* need to allocate items of 4 pages */
409 static void *pmap_pdp_alloc(struct pool *, int);
410 static void pmap_pdp_free(struct pool *, void *);
411 static struct pool_allocator pmap_pdp_allocator = {
412 	.pa_alloc = pmap_pdp_alloc,
413 	.pa_free = pmap_pdp_free,
414 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
415 };
416 #endif
417 
418 extern vaddr_t idt_vaddr;
419 extern paddr_t idt_paddr;
420 extern vaddr_t gdt_vaddr;
421 extern paddr_t gdt_paddr;
422 extern vaddr_t ldt_vaddr;
423 extern paddr_t ldt_paddr;
424 
425 #ifdef i386
426 /* stuff to fix the pentium f00f bug */
427 extern vaddr_t pentium_idt_vaddr;
428 #endif
429 
430 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
431 struct pmap_ptparray {
432 	struct vm_page *pg[PTP_LEVELS + 1];
433 	bool alloced[PTP_LEVELS + 1];
434 };
435 
436 /*
437  * PV entries are allocated in page-sized chunks and cached per-pmap to
438  * avoid intense pressure on memory allocators.
439  */
440 
441 struct pv_page {
442 	LIST_HEAD(, pv_entry)	pvp_pves;
443 	LIST_ENTRY(pv_page)	pvp_list;
444 	long			pvp_nfree;
445 	struct pmap		*pvp_pmap;
446 };
447 
448 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
449 
450 /*
451  * PV tree prototypes
452  */
453 
454 static int	pmap_compare_key(void *, const void *, const void *);
455 static int	pmap_compare_nodes(void *, const void *, const void *);
456 
457 /* Red-black tree */
458 static const rb_tree_ops_t pmap_rbtree_ops = {
459 	.rbto_compare_nodes = pmap_compare_nodes,
460 	.rbto_compare_key = pmap_compare_key,
461 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
462 	.rbto_context = NULL
463 };
464 
465 /*
466  * Local prototypes
467  */
468 
469 #ifdef __HAVE_PCPU_AREA
470 static void pmap_init_pcpu(void);
471 #endif
472 #ifdef __HAVE_DIRECT_MAP
473 static void pmap_init_directmap(struct pmap *);
474 #endif
475 #if !defined(XENPV)
476 static void pmap_remap_global(void);
477 #endif
478 #ifndef XENPV
479 static void pmap_init_lapic(void);
480 static void pmap_remap_largepages(void);
481 #endif
482 
483 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
484     struct vm_page **);
485 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
486 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
487     pd_entry_t * const *);
488 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
489 static void pmap_freepage(struct pmap *, struct vm_page *, int);
490 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
491     pt_entry_t *, pd_entry_t * const *);
492 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
493     vaddr_t);
494 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
495     vaddr_t);
496 static int pmap_pvp_ctor(void *, void *, int);
497 static void pmap_pvp_dtor(void *, void *);
498 static struct pv_entry *pmap_alloc_pv(struct pmap *);
499 static void pmap_free_pv(struct pmap *, struct pv_entry *);
500 static void pmap_drain_pv(struct pmap *);
501 
502 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
503 
504 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
505 static void pmap_reactivate(struct pmap *);
506 
507 /*
508  * p m a p   h e l p e r   f u n c t i o n s
509  */
510 
511 static inline void
512 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
513 {
514 
515 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
516 	pmap->pm_stats.resident_count += resid_diff;
517 	pmap->pm_stats.wired_count += wired_diff;
518 }
519 
520 static inline void
521 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
522 {
523 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
524 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
525 
526 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
527 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
528 
529 	pmap_stats_update(pmap, resid_diff, wired_diff);
530 }
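/*
 * For example (illustrative only): replacing a present, unwired PTE with a
 * zero PTE gives resid_diff = -1 and wired_diff = 0, so the resident count
 * drops by one page and the wired count is unchanged.
 */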
531 
532 /*
533  * ptp_to_pmap: lookup pmap by ptp
534  */
535 static inline struct pmap *
536 ptp_to_pmap(struct vm_page *ptp)
537 {
538 	struct pmap *pmap;
539 
540 	if (ptp == NULL) {
541 		return pmap_kernel();
542 	}
543 	pmap = (struct pmap *)ptp->uobject;
544 	KASSERT(pmap != NULL);
545 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
546 	return pmap;
547 }
548 
549 static inline struct pv_pte *
550 pve_to_pvpte(struct pv_entry *pve)
551 {
552 
553 	if (pve == NULL)
554 		return NULL;
555 	KASSERT((void *)&pve->pve_pte == (void *)pve);
556 	return &pve->pve_pte;
557 }
558 
559 static inline struct pv_entry *
560 pvpte_to_pve(struct pv_pte *pvpte)
561 {
562 	struct pv_entry *pve = (void *)pvpte;
563 
564 	KASSERT(pve_to_pvpte(pve) == pvpte);
565 	return pve;
566 }
567 
568 /*
569  * Return true if the pmap page has an embedded PV entry.
570  */
571 static inline bool
572 pv_pte_embedded(struct pmap_page *pp)
573 {
574 
575 	KASSERT(mutex_owned(&pp->pp_lock));
576 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
577 }
578 
579 /*
580  * pv_pte_first, pv_pte_next: PV list iterator.
581  */
582 static inline struct pv_pte *
583 pv_pte_first(struct pmap_page *pp)
584 {
585 
586 	KASSERT(mutex_owned(&pp->pp_lock));
587 	if (pv_pte_embedded(pp)) {
588 		return &pp->pp_pte;
589 	}
590 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
591 }
592 
593 static inline struct pv_pte *
594 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
595 {
596 
597 	KASSERT(mutex_owned(&pp->pp_lock));
598 	KASSERT(pvpte != NULL);
599 	if (pvpte == &pp->pp_pte) {
600 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
601 	}
602 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
603 }
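/*
 * Illustrative sketch (not an actual code path) of how the iterators above
 * are used to walk all mappings of a pv-tracked page, with pp->pp_lock held:
 *
 *	struct pv_pte *pvpte;
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		(pvpte->pte_ptp, pvpte->pte_va) identify one mapping
 *	}
 *	mutex_spin_exit(&pp->pp_lock);
 */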
604 
605 static inline uint8_t
606 pmap_pte_to_pp_attrs(pt_entry_t pte)
607 {
608 	uint8_t ret = 0;
609 	if (pte & PTE_D)
610 		ret |= PP_ATTRS_D;
611 	if (pte & PTE_A)
612 		ret |= PP_ATTRS_A;
613 	if (pte & PTE_W)
614 		ret |= PP_ATTRS_W;
615 	return ret;
616 }
617 
618 static inline pt_entry_t
619 pmap_pp_attrs_to_pte(uint8_t attrs)
620 {
621 	pt_entry_t pte = 0;
622 	if (attrs & PP_ATTRS_D)
623 		pte |= PTE_D;
624 	if (attrs & PP_ATTRS_A)
625 		pte |= PTE_A;
626 	if (attrs & PP_ATTRS_W)
627 		pte |= PTE_W;
628 	return pte;
629 }
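/*
 * For example (illustrative only): a PTE with PTE_D and PTE_A set maps to
 * PP_ATTRS_D | PP_ATTRS_A, and pmap_pp_attrs_to_pte() turns those attributes
 * back into PTE_D | PTE_A; PTE bits other than D/A/W are dropped.
 */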
630 
631 /*
632  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
633  * of course the kernel is always loaded
634  */
635 bool
636 pmap_is_curpmap(struct pmap *pmap)
637 {
638 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
639 }
640 
641 inline void
642 pmap_reference(struct pmap *pmap)
643 {
644 
645 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
646 }
647 
648 /*
649  * rbtree: compare two nodes.
650  */
651 static int
652 pmap_compare_nodes(void *context, const void *n1, const void *n2)
653 {
654 	const struct pv_entry *pve1 = n1;
655 	const struct pv_entry *pve2 = n2;
656 
657 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
658 
659 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
660 		return -1;
661 	}
662 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
663 		return 1;
664 	}
665 	return 0;
666 }
667 
668 /*
669  * rbtree: compare a node and a key.
670  */
671 static int
672 pmap_compare_key(void *context, const void *n, const void *k)
673 {
674 	const struct pv_entry *pve = n;
675 	const vaddr_t key = (vaddr_t)k;
676 
677 	if (pve->pve_pte.pte_va < key) {
678 		return -1;
679 	}
680 	if (pve->pve_pte.pte_va > key) {
681 		return 1;
682 	}
683 	return 0;
684 }
685 
686 /*
687  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
688  */
689 static inline void
690 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
691 {
692 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
693 
694 	if (va < *min) {
695 		*min = va;
696 	}
697 }
698 
699 /*
700  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
701  */
702 static inline void
703 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
704 {
705 	vaddr_t sclip;
706 
707 	if (ptp == NULL) {
708 		return;
709 	}
710 
711 	sclip = (vaddr_t)ptp->uanon;
712 	sclip = (*startva < sclip ? sclip : *startva);
713 	*pte += (sclip - *startva) / PAGE_SIZE;
714 	*startva = sclip;
715 }
716 
717 /*
718  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
719  *
720  * there are several pmaps involved.  some or all of them might be the same.
721  *
722  *	- the pmap given by the first argument
723  *		our caller wants to access this pmap's PTEs.
724  *
725  *	- pmap_kernel()
726  *		the kernel pmap.  note that it only contains the kernel part
727  *		of the address space which is shared by any pmap.  ie. any
728  *		pmap can be used instead of pmap_kernel() for our purpose.
729  *
730  *	- ci->ci_pmap
731  *		pmap currently loaded on the cpu.
732  *
733  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
734  *		current process' pmap.
735  *
736  * => caller must lock pmap first (if not the kernel pmap)
737  * => must be undone with pmap_unmap_ptes before returning
738  * => disables kernel preemption
739  */
740 void
741 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
742     pd_entry_t * const **pdeppp)
743 {
744 	struct pmap *curpmap;
745 	struct cpu_info *ci;
746 	lwp_t *l;
747 
748 	kpreempt_disable();
749 
750 	/* The kernel's pmap is always accessible. */
751 	if (pmap == pmap_kernel()) {
752 		*pmap2 = NULL;
753 		*ptepp = PTE_BASE;
754 		*pdeppp = normal_pdes;
755 		return;
756 	}
757 
758 	KASSERT(mutex_owned(&pmap->pm_lock));
759 
760 	l = curlwp;
761 	ci = l->l_cpu;
762 	curpmap = ci->ci_pmap;
763 	if (pmap == curpmap) {
764 		/*
765 		 * Already on the CPU: make it valid.  This is very
766 		 * often the case during exit(), when we have switched
767 		 * to the kernel pmap in order to destroy a user pmap.
768 		 */
769 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
770 			pmap_reactivate(pmap);
771 		}
772 		*pmap2 = NULL;
773 	} else {
774 		/*
775 		 * Toss current pmap from CPU and install new pmap, but keep
776 		 * a reference to the old one.  Dropping the reference can
777 		 * block as it needs to take locks, so defer that to
778 		 * pmap_unmap_ptes().
779 		 */
780 		pmap_reference(pmap);
781 		pmap_load1(l, pmap, curpmap);
782 		*pmap2 = curpmap;
783 	}
784 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
785 #ifdef DIAGNOSTIC
786 	pmap->pm_ncsw = lwp_pctr();
787 #endif
788 	*ptepp = PTE_BASE;
789 
790 #if defined(XENPV) && defined(__x86_64__)
791 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
792 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
793 	*pdeppp = ci->ci_normal_pdes;
794 #else
795 	*pdeppp = normal_pdes;
796 #endif
797 }
798 
799 /*
800  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
801  *
802  * => we cannot tolerate context switches while mapped in: assert this.
803  * => reenables kernel preemption.
804  * => does not unlock pmap.
805  */
806 void
807 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
808 {
809 	struct cpu_info *ci;
810 	struct pmap *mypmap;
811 	struct lwp *l;
812 
813 	KASSERT(kpreempt_disabled());
814 
815 	/* The kernel's pmap is always accessible. */
816 	if (pmap == pmap_kernel()) {
817 		kpreempt_enable();
818 		return;
819 	}
820 
821 	l = curlwp;
822 	ci = l->l_cpu;
823 
824 	KASSERT(mutex_owned(&pmap->pm_lock));
825 	KASSERT(pmap->pm_ncsw == lwp_pctr());
826 
827 #if defined(XENPV) && defined(__x86_64__)
828 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
829 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
830 #endif
831 
832 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
833 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
834 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
835 	if (ci->ci_pmap == mypmap) {
836 		ci->ci_want_pmapload = 0;
837 	} else {
838 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
839 		ci->ci_tlbstate = TLBSTATE_LAZY;
840 	}
841 
842 	/* Now safe to re-enable preemption. */
843 	kpreempt_enable();
844 
845 	/* Toss reference to other pmap taken earlier. */
846 	if (pmap2 != NULL) {
847 		pmap_destroy(pmap2);
848 	}
849 }
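/*
 * Illustrative sketch (assumed caller-side pattern, mirroring the
 * constraints documented above) of the pmap_map_ptes()/pmap_unmap_ptes()
 * protocol for a user pmap:
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... inspect or modify ptes[pl1_i(va)], walk pdes[] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */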
850 
851 inline static void
852 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
853 {
854 
855 #if !defined(__x86_64__)
856 	if (curproc == NULL || curproc->p_vmspace == NULL ||
857 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
858 		return;
859 
860 	if ((opte ^ npte) & PTE_X)
861 		pmap_update_pg(va);
862 
863 	/*
864 	 * Executability was removed on the last executable change.
865 	 * Reset the code segment to something conservative and
866 	 * let the trap handler deal with setting the right limit.
867 	 * We can't compute the right limit here because of vm_map locking constraints.
868 	 */
869 
870 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
871 		struct trapframe *tf = curlwp->l_md.md_regs;
872 
873 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
874 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
875 	}
876 #endif /* !defined(__x86_64__) */
877 }
878 
879 #if !defined(__x86_64__)
880 /*
881  * Fixup the code segment to cover all potential executable mappings.
882  * returns 0 if no changes to the code segment were made.
883  */
884 int
885 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
886 {
887 	struct vm_map_entry *ent;
888 	struct pmap *pm = vm_map_pmap(map);
889 	vaddr_t va = 0;
890 
891 	vm_map_lock_read(map);
892 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
893 		/*
894 		 * This entry has greater va than the entries before.
895 		 * We need to make it point to the last page, not past it.
896 		 */
897 		if (ent->protection & VM_PROT_EXECUTE)
898 			va = trunc_page(ent->end) - PAGE_SIZE;
899 	}
900 	vm_map_unlock_read(map);
901 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
902 		return 0;
903 
904 	pm->pm_hiexec = va;
905 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
906 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
907 	} else {
908 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
909 		return 0;
910 	}
911 	return 1;
912 }
913 #endif /* !defined(__x86_64__) */
914 
915 void
916 pat_init(struct cpu_info *ci)
917 {
918 #ifndef XENPV
919 	uint64_t pat;
920 
921 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
922 		return;
923 
924 	/* We change WT to WC. Leave all other entries the default values. */
925 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
926 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
927 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
928 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
929 
930 	wrmsr(MSR_CR_PAT, pat);
931 	cpu_pat_enabled = true;
932 #endif
933 }
934 
935 static pt_entry_t
936 pmap_pat_flags(u_int flags)
937 {
938 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
939 
940 	if (!cpu_pat_enabled) {
941 		switch (cacheflags) {
942 		case PMAP_NOCACHE:
943 		case PMAP_NOCACHE_OVR:
944 			/* Results in PGC_UCMINUS on CPUs that have the
945 			 * CPUID PAT feature but have PAT "disabled".
946 			 */
947 			return PTE_PCD;
948 		default:
949 			return 0;
950 		}
951 	}
952 
953 	switch (cacheflags) {
954 	case PMAP_NOCACHE:
955 		return PGC_UC;
956 	case PMAP_WRITE_COMBINE:
957 		return PGC_WC;
958 	case PMAP_WRITE_BACK:
959 		return PGC_WB;
960 	case PMAP_NOCACHE_OVR:
961 		return PGC_UCMINUS;
962 	}
963 
964 	return 0;
965 }
966 
967 /*
968  * p m a p   k e n t e r   f u n c t i o n s
969  *
970  * functions to quickly enter/remove pages from the kernel address
971  * space.   pmap_kremove is exported to MI kernel.  we make use of
972  * the recursive PTE mappings.
973  */
974 
975 /*
976  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
977  *
978  * => no need to lock anything, assume va is already allocated
979  * => should be faster than normal pmap enter function
980  */
981 void
982 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
983 {
984 	pt_entry_t *pte, opte, npte;
985 
986 	KASSERT(!(prot & ~VM_PROT_ALL));
987 
988 	if (va < VM_MIN_KERNEL_ADDRESS)
989 		pte = vtopte(va);
990 	else
991 		pte = kvtopte(va);
992 #if defined(XENPV) && defined(DOM0OPS)
993 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
994 #ifdef DEBUG
995 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
996 		    " outside range\n", __func__, pa, va);
997 #endif /* DEBUG */
998 		npte = pa;
999 	} else
1000 #endif /* XENPV && DOM0OPS */
1001 		npte = pmap_pa2pte(pa);
1002 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1003 	npte |= pmap_pat_flags(flags);
1004 	opte = pmap_pte_testset(pte, npte); /* zap! */
1005 
1006 	/*
1007 	 * XXX: make sure we are not dealing with a large page, since the only
1008 	 * large pages created are for the kernel image, and they should never
1009 	 * be kentered.
1010 	 */
1011 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1012 
1013 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1014 		/* This should not happen. */
1015 		printf_nolog("%s: mapping already present\n", __func__);
1016 		kpreempt_disable();
1017 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1018 		kpreempt_enable();
1019 	}
1020 }
1021 
1022 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
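/*
 * Illustrative example (assumed usage, not a code path in this file): map a
 * device page write-combined at an already allocated kernel va, then flush
 * any deferred TLB work before the mapping is relied upon:
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *	pmap_update(pmap_kernel());
 */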
1023 
1024 #if defined(__x86_64__)
1025 /*
1026  * Change protection for a virtual address. Local for a CPU only, don't
1027  * care about TLB shootdowns.
1028  *
1029  * => must be called with preemption disabled
1030  */
1031 void
1032 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1033 {
1034 	pt_entry_t *pte, opte, npte;
1035 
1036 	KASSERT(kpreempt_disabled());
1037 
1038 	if (va < VM_MIN_KERNEL_ADDRESS)
1039 		pte = vtopte(va);
1040 	else
1041 		pte = kvtopte(va);
1042 
1043 	npte = opte = *pte;
1044 
1045 	if ((prot & VM_PROT_WRITE) != 0)
1046 		npte |= PTE_W;
1047 	else
1048 		npte &= ~(PTE_W|PTE_D);
1049 
1050 	if (opte != npte) {
1051 		pmap_pte_set(pte, npte);
1052 		pmap_pte_flush();
1053 		invlpg(va);
1054 	}
1055 }
1056 #endif /* defined(__x86_64__) */
1057 
1058 /*
1059  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1060  *
1061  * => no need to lock anything
1062  * => caller must dispose of any vm_page mapped in the va range
1063  * => note: not an inline function
1064  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1065  * => we assume kernel only unmaps valid addresses and thus don't bother
1066  *    checking the valid bit before doing TLB flushing
1067  * => must be followed by call to pmap_update() before reuse of page
1068  */
1069 static void
1070 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1071 {
1072 	pt_entry_t *pte, opte;
1073 	vaddr_t va, eva;
1074 
1075 	eva = sva + len;
1076 
1077 	kpreempt_disable();
1078 	for (va = sva; va < eva; va += PAGE_SIZE) {
1079 		pte = kvtopte(va);
1080 		opte = pmap_pte_testset(pte, 0); /* zap! */
1081 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1082 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1083 			    TLBSHOOT_KREMOVE);
1084 		}
1085 		KASSERTMSG((opte & PTE_PS) == 0,
1086 		    "va %#" PRIxVADDR " is a large page", va);
1087 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1088 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1089 	}
1090 	if (localonly) {
1091 		tlbflushg();
1092 	}
1093 	kpreempt_enable();
1094 }
1095 
1096 void
1097 pmap_kremove(vaddr_t sva, vsize_t len)
1098 {
1099 
1100 	pmap_kremove1(sva, len, false);
1101 }
1102 
1103 /*
1104  * pmap_kremove_local: like pmap_kremove(), but only worry about
1105  * TLB invalidations on the current CPU.  this is only intended
1106  * for use while writing kernel crash dumps, either after panic
1107  * or via reboot -d.
1108  */
1109 void
1110 pmap_kremove_local(vaddr_t sva, vsize_t len)
1111 {
1112 
1113 	pmap_kremove1(sva, len, true);
1114 }
1115 
1116 /*
1117  * p m a p   i n i t   f u n c t i o n s
1118  *
1119  * pmap_bootstrap and pmap_init are called during system startup
1120  * to init the pmap module.   pmap_bootstrap() does a low level
1121  * init just to get things rolling.   pmap_init() finishes the job.
1122  */
1123 
1124 /*
1125  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1126  * This function is to be used before any VM system has been set up.
1127  *
1128  * The va is taken from virtual_avail.
1129  */
1130 static vaddr_t
1131 pmap_bootstrap_valloc(size_t npages)
1132 {
1133 	vaddr_t va = virtual_avail;
1134 	virtual_avail += npages * PAGE_SIZE;
1135 	return va;
1136 }
1137 
1138 /*
1139  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1140  * This function is to be used before any VM system has been set up.
1141  *
1142  * The pa is taken from avail_start.
1143  */
1144 static paddr_t
1145 pmap_bootstrap_palloc(size_t npages)
1146 {
1147 	paddr_t pa = avail_start;
1148 	avail_start += npages * PAGE_SIZE;
1149 	return pa;
1150 }
1151 
1152 /*
1153  * pmap_bootstrap: get the system in a state where it can run with VM properly
1154  * enabled (called before main()). The VM system is fully init'd later.
1155  *
1156  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1157  *    kernel, and nkpde PTP's for the kernel.
1158  * => kva_start is the first free virtual address in kernel space.
1159  */
1160 void
1161 pmap_bootstrap(vaddr_t kva_start)
1162 {
1163 	struct pmap *kpm;
1164 	int i;
1165 	vaddr_t kva;
1166 
1167 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1168 
1169 	/*
1170 	 * Set up our local static global vars that keep track of the usage of
1171 	 * KVM before kernel_map is set up.
1172 	 */
1173 	virtual_avail = kva_start;		/* first free KVA */
1174 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1175 
1176 	/*
1177 	 * Set up protection_codes: we need to be able to convert from a MI
1178 	 * protection code (some combo of VM_PROT...) to something we can jam
1179 	 * into a x86 PTE.
1180 	 */
1181 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1182 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1183 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1184 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1185 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1186 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1187 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1188 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1189 
1190 	/*
1191 	 * Now we init the kernel's pmap.
1192 	 *
1193 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1194 	 * the pm_obj contains the list of active PTPs.
1195 	 */
1196 	kpm = pmap_kernel();
1197 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1198 	rw_init(&kpm->pm_dummy_lock);
1199 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1200 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1201 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1202 		kpm->pm_ptphint[i] = NULL;
1203 	}
1204 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1205 
1206 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1207 	for (i = 0; i < PDP_SIZE; i++)
1208 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1209 
1210 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1211 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1212 
1213 	kcpuset_create(&kpm->pm_cpus, true);
1214 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1215 
1216 	kpm->pm_ldt = NULL;
1217 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1218 
1219 	/*
1220 	 * the above is just a rough estimate and not critical to the proper
1221 	 * operation of the system.
1222 	 */
1223 
1224 #if !defined(XENPV)
1225 	/*
1226 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1227 	 * attribute to already mapped kernel pages. Do that only if SVS is
1228 	 * disabled.
1229 	 *
1230 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1231 	 * happens later in cpu_init().
1232 	 */
1233 #ifdef SVS
1234 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1235 #else
1236 	if (cpu_feature[0] & CPUID_PGE) {
1237 #endif
1238 		pmap_pg_g = PTE_G;
1239 		pmap_remap_global();
1240 	}
1241 #endif
1242 
1243 #ifndef XENPV
1244 	/*
1245 	 * Enable large pages if they are supported.
1246 	 */
1247 	if (cpu_feature[0] & CPUID_PSE) {
1248 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1249 		pmap_largepages = 1;	/* enable software */
1250 
1251 		/*
1252 		 * The TLB must be flushed after enabling large pages on Pentium
1253 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1254 		 * Software Developer's Manual, Volume 3: System Programming".
1255 		 */
1256 		tlbflushg();
1257 
1258 		/* Remap the kernel. */
1259 		pmap_remap_largepages();
1260 	}
1261 	pmap_init_lapic();
1262 #endif /* !XENPV */
1263 
1264 #ifdef __HAVE_PCPU_AREA
1265 	pmap_init_pcpu();
1266 #endif
1267 
1268 #ifdef __HAVE_DIRECT_MAP
1269 	pmap_init_directmap(kpm);
1270 #else
1271 	pmap_vpage_cpualloc(&cpu_info_primary);
1272 
1273 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1274 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1275 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1276 	} else { /* amd64 */
1277 		/*
1278 		 * zero_pte is stuck at the end of mapped space for the kernel
1279 		 * image (disjunct from kva space). This is done so that it
1280 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1281 		 * when it's called for the first time.
1282 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1283 		 */
1284 #ifdef XENPV
1285 		/* early_zerop initialized in xen_locore() */
1286 #else
1287 		early_zerop = (void *)bootspace.spareva;
1288 #endif
1289 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1290 	}
1291 #endif
1292 
1293 #if defined(XENPV) && defined(__x86_64__)
1294 	extern vaddr_t xen_dummy_page;
1295 	paddr_t xen_dummy_user_pgd;
1296 
1297 	/*
1298 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1299 	 * Xen will still consider it active. So we set user PGD to this one
1300 	 * to lift all protection on the now inactive page tables set.
1301 	 */
1302 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1303 
1304 	/* Zero fill it, the less checks in Xen it requires the better */
1305 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1306 	/* Mark read-only */
1307 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1308 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1309 	    UVMF_INVLPG);
1310 	/* Pin as L4 */
1311 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1312 #endif
1313 
1314 	/*
1315 	 * Allocate space for the IDT, GDT and LDT.
1316 	 */
1317 	idt_vaddr = pmap_bootstrap_valloc(1);
1318 	idt_paddr = pmap_bootstrap_palloc(1);
1319 
1320 	gdt_vaddr = pmap_bootstrap_valloc(1);
1321 	gdt_paddr = pmap_bootstrap_palloc(1);
1322 
1323 #ifdef __HAVE_PCPU_AREA
1324 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1325 #else
1326 	ldt_vaddr = pmap_bootstrap_valloc(1);
1327 #endif
1328 	ldt_paddr = pmap_bootstrap_palloc(1);
1329 
1330 #if !defined(__x86_64__)
1331 	/* pentium f00f bug stuff */
1332 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1333 #endif
1334 
1335 #if defined(XENPVHVM)
1336 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1337 	extern paddr_t HYPERVISOR_shared_info_pa;
1338 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1339 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1340 
1341 	if (vm_guest != VM_GUEST_XENPVH) {
1342 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1343 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1344 	}
1345 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1346 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1347 #endif
1348 	/*
1349 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1350 	 */
1351 	virtual_avail = reserve_dumppages(virtual_avail);
1352 
1353 	/*
1354 	 * Init the global lock and global list.
1355 	 */
1356 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1357 	LIST_INIT(&pmaps);
1358 
1359 	/*
1360 	 * Ensure the TLB is sync'd with reality by flushing it...
1361 	 */
1362 	tlbflushg();
1363 
1364 	/*
1365 	 * Calculate pmap_maxkvaddr from nkptp[].
1366 	 */
1367 	kva = VM_MIN_KERNEL_ADDRESS;
1368 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1369 		kva += nkptp[i] * nbpd[i];
1370 	}
1371 	pmap_maxkvaddr = kva;
1372 }
1373 
1374 #ifndef XENPV
1375 static void
1376 pmap_init_lapic(void)
1377 {
1378 	/*
1379 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1380 	 * x86 implementation relies a lot on this address to be valid; so just
1381 	 * allocate a fake physical page that will be kentered into
1382 	 * local_apic_va by machdep.
1383 	 *
1384 	 * If the LAPIC is present, the va will be remapped somewhere else
1385 	 * later in lapic_map.
1386 	 */
1387 	local_apic_va = pmap_bootstrap_valloc(1);
1388 	local_apic_pa = pmap_bootstrap_palloc(1);
1389 }
1390 #endif
1391 
1392 #ifdef __x86_64__
1393 static size_t
1394 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1395 {
1396 	size_t npages;
1397 	npages = (roundup(endva, pgsz) / pgsz) -
1398 	    (rounddown(startva, pgsz) / pgsz);
1399 	return npages;
1400 }
1401 #endif
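/*
 * For example (illustrative only): with pgsz = NBPD_L4, a range that
 * straddles a single L4 boundary counts as two entries even if it is much
 * smaller than NBPD_L4, because the start is rounded down and the end is
 * rounded up.
 */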
1402 
1403 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1404 static inline void
1405 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1406 {
1407 	size_t sslot = slotspace.area[type].sslot;
1408 	size_t nslot = slotspace.area[type].nslot;
1409 
1410 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1411 }
1412 #endif
1413 
1414 #ifdef __x86_64__
1415 /*
1416  * Randomize the location of an area. We count the holes in the VM space. We
1417  * randomly select one hole, and then randomly select an area within that hole.
1418  * Finally we update the associated entry in the slotspace structure.
1419  */
1420 vaddr_t
1421 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1422     vaddr_t randva)
1423 {
1424 	struct {
1425 		int start;
1426 		int end;
1427 	} holes[SLSPACE_NAREAS+1];
1428 	size_t i, nholes, hole;
1429 	size_t startsl, endsl, nslots, winsize;
1430 	vaddr_t startva, va;
1431 
1432 	sz = roundup(sz, align);
1433 
1434 	/*
1435 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1436 	 * an area that crosses slots:
1437 	 *     +------+------+------+
1438 	 *     | Slot | Slot | Slot |
1439 	 *     +------+------+------+
1440 	 *        [Chosen Area]
1441 	 * And in that case we must take into account the additional slot
1442 	 * consumed.
1443 	 */
1444 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1445 
1446 	/* Get the holes. */
1447 	nholes = 0;
1448 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1449 	while (1) {
1450 		/*
1451 		 * Find the first occupied slot after the current one.
1452 		 * The area between the two is a hole.
1453 		 */
1454 		size_t minsslot = 512;
1455 		size_t minnslot = 0;
1456 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1457 			if (!slotspace.area[i].active)
1458 				continue;
1459 			if (slotspace.area[i].sslot >= curslot &&
1460 			    slotspace.area[i].sslot < minsslot) {
1461 				minsslot = slotspace.area[i].sslot;
1462 				minnslot = slotspace.area[i].nslot;
1463 			}
1464 		}
1465 
1466 		/* No hole anymore, stop here. */
1467 		if (minsslot == 512) {
1468 			break;
1469 		}
1470 
1471 		/* Register the hole. */
1472 		if (minsslot - curslot >= nslots) {
1473 			holes[nholes].start = curslot;
1474 			holes[nholes].end = minsslot;
1475 			nholes++;
1476 		}
1477 
1478 		/* Skip that hole, and iterate again. */
1479 		curslot = minsslot + minnslot;
1480 	}
1481 
1482 	if (nholes == 0) {
1483 		panic("%s: impossible", __func__);
1484 	}
1485 
1486 	/* Select a hole. */
1487 	hole = randhole;
1488 #ifdef NO_X86_ASLR
1489 	hole = 0;
1490 #endif
1491 	hole %= nholes;
1492 	startsl = holes[hole].start;
1493 	endsl = holes[hole].end;
1494 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1495 
1496 	/* Select an area within the hole. */
1497 	va = randva;
1498 #ifdef NO_X86_ASLR
1499 	va = 0;
1500 #endif
1501 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1502 	va %= winsize;
1503 	va = rounddown(va, align);
1504 	va += startva;
1505 
1506 	/* Update the entry. */
1507 	slotspace.area[type].sslot = pl4_i(va);
1508 	slotspace.area[type].nslot =
1509 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1510 	slotspace.area[type].active = true;
1511 
1512 	return va;
1513 }
1514 #endif
1515 
1516 #ifdef __HAVE_PCPU_AREA
1517 static void
1518 pmap_init_pcpu(void)
1519 {
1520 	const vaddr_t startva = PMAP_PCPU_BASE;
1521 	size_t nL4e, nL3e, nL2e, nL1e;
1522 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1523 	paddr_t pa;
1524 	vaddr_t endva;
1525 	vaddr_t tmpva;
1526 	pt_entry_t *pte;
1527 	size_t size;
1528 	int i;
1529 
1530 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1531 
1532 	size = sizeof(struct pcpu_area);
1533 
1534 	endva = startva + size;
1535 
1536 	/* We will use this temporary va. */
1537 	tmpva = bootspace.spareva;
1538 	pte = PTE_BASE + pl1_i(tmpva);
1539 
1540 	/* Build L4 */
1541 	L4e_idx = pl4_i(startva);
1542 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1543 	KASSERT(nL4e  == 1);
1544 	for (i = 0; i < nL4e; i++) {
1545 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1546 
1547 		pa = pmap_bootstrap_palloc(1);
1548 		*pte = (pa & PTE_FRAME) | pteflags;
1549 		pmap_update_pg(tmpva);
1550 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1551 
1552 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1553 	}
1554 
1555 	/* Build L3 */
1556 	L3e_idx = pl3_i(startva);
1557 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1558 	for (i = 0; i < nL3e; i++) {
1559 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1560 
1561 		pa = pmap_bootstrap_palloc(1);
1562 		*pte = (pa & PTE_FRAME) | pteflags;
1563 		pmap_update_pg(tmpva);
1564 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1565 
1566 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1567 	}
1568 
1569 	/* Build L2 */
1570 	L2e_idx = pl2_i(startva);
1571 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1572 	for (i = 0; i < nL2e; i++) {
1573 
1574 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1575 
1576 		pa = pmap_bootstrap_palloc(1);
1577 		*pte = (pa & PTE_FRAME) | pteflags;
1578 		pmap_update_pg(tmpva);
1579 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1580 
1581 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1582 	}
1583 
1584 	/* Build L1 */
1585 	L1e_idx = pl1_i(startva);
1586 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1587 	for (i = 0; i < nL1e; i++) {
1588 		/*
1589 		 * Nothing to do, the PTEs will be entered via
1590 		 * pmap_kenter_pa.
1591 		 */
1592 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1593 	}
1594 
1595 	*pte = 0;
1596 	pmap_update_pg(tmpva);
1597 
1598 	pcpuarea = (struct pcpu_area *)startva;
1599 
1600 	tlbflush();
1601 }
1602 #endif
1603 
1604 #ifdef __HAVE_DIRECT_MAP
1605 /*
1606  * Create the amd64 direct map. Called only once at boot time. We map all of
1607  * the physical memory contiguously using 2MB large pages, with RW permissions.
1608  * However there is a hole: the kernel is mapped with RO permissions.
1609  */
1610 static void
1611 pmap_init_directmap(struct pmap *kpm)
1612 {
1613 	extern phys_ram_seg_t mem_clusters[];
1614 	extern int mem_cluster_cnt;
1615 
1616 	vaddr_t startva;
1617 	size_t nL4e, nL3e, nL2e;
1618 	size_t L4e_idx, L3e_idx, L2e_idx;
1619 	size_t spahole, epahole;
1620 	paddr_t lastpa, pa;
1621 	vaddr_t endva;
1622 	vaddr_t tmpva;
1623 	pt_entry_t *pte;
1624 	phys_ram_seg_t *mc;
1625 	int i;
1626 	size_t randhole;
1627 	vaddr_t randva;
1628 
1629 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1630 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1631 
1632 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1633 
1634 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1635 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1636 
1637 	/* Get the last physical address available */
1638 	lastpa = 0;
1639 	for (i = 0; i < mem_cluster_cnt; i++) {
1640 		mc = &mem_clusters[i];
1641 		lastpa = MAX(lastpa, mc->start + mc->size);
1642 	}
1643 
1644 	/*
1645 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1646 	 */
1647 	if (lastpa > MAXPHYSMEM) {
1648 		panic("pmap_init_directmap: lastpa incorrect");
1649 	}
1650 
1651 	entropy_extract(&randhole, sizeof randhole, 0);
1652 	entropy_extract(&randva, sizeof randva, 0);
1653 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1654 	    randhole, randva);
1655 	endva = startva + lastpa;
1656 
1657 	/* We will use this temporary va. */
1658 	tmpva = bootspace.spareva;
1659 	pte = PTE_BASE + pl1_i(tmpva);
1660 
1661 	/* Build L4 */
1662 	L4e_idx = pl4_i(startva);
1663 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1664 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1665 	for (i = 0; i < nL4e; i++) {
1666 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1667 
1668 		pa = pmap_bootstrap_palloc(1);
1669 		*pte = (pa & PTE_FRAME) | pteflags;
1670 		pmap_update_pg(tmpva);
1671 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1672 
1673 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1674 	}
1675 
1676 	/* Build L3 */
1677 	L3e_idx = pl3_i(startva);
1678 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1679 	for (i = 0; i < nL3e; i++) {
1680 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1681 
1682 		pa = pmap_bootstrap_palloc(1);
1683 		*pte = (pa & PTE_FRAME) | pteflags;
1684 		pmap_update_pg(tmpva);
1685 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1686 
1687 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1688 	}
1689 
1690 	/* Build L2 */
1691 	L2e_idx = pl2_i(startva);
1692 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1693 	for (i = 0; i < nL2e; i++) {
1694 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1695 
1696 		pa = (paddr_t)(i * NBPD_L2);
1697 
1698 		if (spahole <= pa && pa < epahole) {
1699 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1700 			    PTE_PS | pmap_pg_g;
1701 		} else {
1702 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1703 			    PTE_PS | pmap_pg_g;
1704 		}
1705 	}
1706 
1707 	*pte = 0;
1708 	pmap_update_pg(tmpva);
1709 
1710 	pmap_direct_base = startva;
1711 	pmap_direct_end = endva;
1712 
1713 	tlbflush();
1714 }
1715 #endif /* __HAVE_DIRECT_MAP */
1716 
1717 #if !defined(XENPV)
1718 /*
1719  * Remap all of the virtual pages created so far with the PTE_G bit.
1720  */
1721 static void
1722 pmap_remap_global(void)
1723 {
1724 	vaddr_t kva, kva_end;
1725 	unsigned long p1i;
1726 	size_t i;
1727 
1728 	/* head */
1729 	kva = bootspace.head.va;
1730 	kva_end = kva + bootspace.head.sz;
1731 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1732 		p1i = pl1_i(kva);
1733 		if (pmap_valid_entry(PTE_BASE[p1i]))
1734 			PTE_BASE[p1i] |= pmap_pg_g;
1735 	}
1736 
1737 	/* kernel segments */
1738 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1739 		if (bootspace.segs[i].type == BTSEG_NONE) {
1740 			continue;
1741 		}
1742 		kva = bootspace.segs[i].va;
1743 		kva_end = kva + bootspace.segs[i].sz;
1744 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1745 			p1i = pl1_i(kva);
1746 			if (pmap_valid_entry(PTE_BASE[p1i]))
1747 				PTE_BASE[p1i] |= pmap_pg_g;
1748 		}
1749 	}
1750 
1751 	/* boot space */
1752 	kva = bootspace.boot.va;
1753 	kva_end = kva + bootspace.boot.sz;
1754 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1755 		p1i = pl1_i(kva);
1756 		if (pmap_valid_entry(PTE_BASE[p1i]))
1757 			PTE_BASE[p1i] |= pmap_pg_g;
1758 	}
1759 }
1760 #endif
1761 
1762 #ifndef XENPV
1763 /*
1764  * Remap several kernel segments with large pages. We cover as many pages as we
1765  * can. Called only once at boot time, if the CPU supports large pages.
1766  */
1767 static void
1768 pmap_remap_largepages(void)
1769 {
1770 	pd_entry_t *pde;
1771 	vaddr_t kva, kva_end;
1772 	paddr_t pa;
1773 	size_t i;
1774 
1775 	/* Remap the kernel text using large pages. */
1776 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1777 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1778 			continue;
1779 		}
1780 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
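		/* Skip if roundup() wrapped past the end of the address space. */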
1781 		if (kva < bootspace.segs[i].va) {
1782 			continue;
1783 		}
1784 		kva_end = rounddown(bootspace.segs[i].va +
1785 			bootspace.segs[i].sz, NBPD_L2);
1786 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1787 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1788 			pde = &L2_BASE[pl2_i(kva)];
1789 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1790 			tlbflushg();
1791 		}
1792 	}
1793 
1794 	/* Remap the kernel rodata using large pages. */
1795 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1796 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1797 			continue;
1798 		}
1799 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1800 		if (kva < bootspace.segs[i].va) {
1801 			continue;
1802 		}
1803 		kva_end = rounddown(bootspace.segs[i].va +
1804 			bootspace.segs[i].sz, NBPD_L2);
1805 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1806 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1807 			pde = &L2_BASE[pl2_i(kva)];
1808 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1809 			tlbflushg();
1810 		}
1811 	}
1812 
1813 	/* Remap the kernel data+bss using large pages. */
1814 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1815 		if (bootspace.segs[i].type != BTSEG_DATA) {
1816 			continue;
1817 		}
1818 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1819 		if (kva < bootspace.segs[i].va) {
1820 			continue;
1821 		}
1822 		kva_end = rounddown(bootspace.segs[i].va +
1823 			bootspace.segs[i].sz, NBPD_L2);
1824 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1825 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1826 			pde = &L2_BASE[pl2_i(kva)];
1827 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1828 			tlbflushg();
1829 		}
1830 	}
1831 }
1832 #endif /* !XENPV */
1833 
1834 /*
1835  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1836  * to manage mappings.
1837  */
1838 void
1839 pmap_init(void)
1840 {
1841 	int flags;
1842 
1843 	/*
1844 	 * initialize caches.
1845 	 */
1846 
1847 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1848 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1849 
1850 #ifdef XENPV
1851 	/*
1852 	 * pool_cache(9) should not touch cached objects, since they
1853 	 * are pinned on xen and R/O for the domU
1854 	 */
1855 	flags = PR_NOTOUCH;
1856 #else
1857 	flags = 0;
1858 #endif
1859 
1860 #ifdef PAE
1861 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1862 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1863 #else
1864 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1865 	    "pdppl", NULL, IPL_NONE);
1866 #endif
1867 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1868 	     0, 0, "pvpage", &pool_allocator_kmem,
1869 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1870 
1871 	pmap_tlb_init();
1872 
1873 	/* XXX: Done here because cpu_hatch() runs only on secondary CPUs. */
1874 	pmap_tlb_cpu_init(curcpu());
1875 
1876 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1877 	    NULL, "x86", "io bitmap copy");
1878 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1879 	    NULL, "x86", "ldt sync");
1880 
1881 	/*
1882 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1883 	 * to hang a tree of pv_entry records.  Dynamically allocated
1884 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1885 	 * usual case is embedded), so cop out and use a single RB tree
1886 	 * to cover them.
1887 	 */
1888 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1889 
1890 	/*
1891 	 * done: pmap module is up (and ready for business)
1892 	 */
1893 
1894 	pmap_initialized = true;
1895 }
1896 
1897 #ifndef XENPV
1898 /*
1899  * pmap_cpu_init_late: perform late per-CPU initialization.
1900  */
1901 void
1902 pmap_cpu_init_late(struct cpu_info *ci)
1903 {
1904 	/*
1905 	 * The BP already has its own PD page, allocated during early
1906 	 * MD startup.
1907 	 */
1908 	if (ci == &cpu_info_primary)
1909 		return;
1910 #ifdef PAE
1911 	cpu_alloc_l3_page(ci);
1912 #endif
1913 }
1914 #endif
1915 
1916 #ifndef __HAVE_DIRECT_MAP
1917 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1918 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1919 
1920 static void
1921 pmap_vpage_cpualloc(struct cpu_info *ci)
1922 {
1923 	bool primary = (ci == &cpu_info_primary);
1924 	size_t i, npages;
1925 	vaddr_t vabase;
1926 	vsize_t vrange;
1927 
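	/*
	 * One cache line's worth of PTEs per CPU, so that each CPU's
	 * temporary-mapping PTEs occupy their own cache line.
	 */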
1928 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1929 	KASSERT(npages >= VPAGE_MAX);
1930 	vrange = npages * PAGE_SIZE;
1931 
1932 	if (primary) {
1933 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1934 			/* Waste some pages to align properly */
1935 		}
1936 		/* The base is aligned, allocate the rest (contiguous) */
1937 		pmap_bootstrap_valloc(npages - 1);
1938 	} else {
1939 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1940 		    UVM_KMF_VAONLY);
1941 		if (vabase == 0) {
1942 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1943 			    __func__, cpu_index(ci));
1944 		}
1945 	}
1946 
1947 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1948 
1949 	for (i = 0; i < VPAGE_MAX; i++) {
1950 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1951 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1952 	}
1953 }
1954 
1955 void
1956 pmap_vpage_cpu_init(struct cpu_info *ci)
1957 {
1958 	if (ci == &cpu_info_primary) {
1959 		/* cpu0 already taken care of in pmap_bootstrap */
1960 		return;
1961 	}
1962 
1963 	pmap_vpage_cpualloc(ci);
1964 }
1965 #endif
1966 
1967 /*
1968  * p v _ e n t r y   f u n c t i o n s
1969  */
1970 
1971 /*
1972  * pmap_pvp_ctor: pool_cache constructor for PV pages.
1973  */
1974 static int
1975 pmap_pvp_ctor(void *arg, void *obj, int flags)
1976 {
1977 	struct pv_page *pvp = (struct pv_page *)obj;
1978 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
1979 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
1980 
1981 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
1982 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
1983 
1984 	LIST_INIT(&pvp->pvp_pves);
1985 	pvp->pvp_nfree = PVE_PER_PVP;
1986 	pvp->pvp_pmap = NULL;
1987 
1988 	for (; pve < maxpve; pve++) {
1989 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
1990 	}
1991 
1992 	return 0;
1993 }
1994 
1995 /*
1996  * pmap_pvp_dtor: pool_cache destructor for PV pages.
1997  */
1998 static void
1999 pmap_pvp_dtor(void *arg, void *obj)
2000 {
2001 	struct pv_page *pvp __diagused = obj;
2002 
2003 	KASSERT(pvp->pvp_pmap == NULL);
2004 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2005 }
2006 
2007 /*
2008  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2009  */
2010 static struct pv_entry *
2011 pmap_alloc_pv(struct pmap *pmap)
2012 {
2013 	struct pv_entry *pve;
2014 	struct pv_page *pvp;
2015 
2016 	KASSERT(mutex_owned(&pmap->pm_lock));
2017 
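	/*
	 * PV pages sit on three per-pmap lists according to how many
	 * free entries remain: pm_pvp_full (all free), pm_pvp_part
	 * (some free), pm_pvp_empty (none free).
	 */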
2018 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2019 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2020 			LIST_REMOVE(pvp, pvp_list);
2021 		} else {
2022 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2023 		}
2024 		if (__predict_false(pvp == NULL)) {
2025 			return NULL;
2026 		}
2027 		/* full -> part */
2028 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2029 		pvp->pvp_pmap = pmap;
2030 	}
2031 
2032 	KASSERT(pvp->pvp_pmap == pmap);
2033 	KASSERT(pvp->pvp_nfree > 0);
2034 
2035 	pve = LIST_FIRST(&pvp->pvp_pves);
2036 	LIST_REMOVE(pve, pve_list);
2037 	pvp->pvp_nfree--;
2038 
2039 	if (__predict_false(pvp->pvp_nfree == 0)) {
2040 		/* part -> empty */
2041 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2042 		LIST_REMOVE(pvp, pvp_list);
2043 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2044 	} else {
2045 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2046 	}
2047 
2048 	return pve;
2049 }
2050 
2051 /*
2052  * pmap_free_pv: delayed free of a PV entry.
2053  */
2054 static void
2055 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2056 {
2057 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2058 
2059 	KASSERT(mutex_owned(&pmap->pm_lock));
2060 	KASSERT(pvp->pvp_pmap == pmap);
2061 	KASSERT(pvp->pvp_nfree >= 0);
2062 
2063 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2064 	pvp->pvp_nfree++;
2065 
2066 	if (__predict_false(pvp->pvp_nfree == 1)) {
2067 		/* empty -> part */
2068 		LIST_REMOVE(pvp, pvp_list);
2069 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2070 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2071 		/* part -> full */
2072 		LIST_REMOVE(pvp, pvp_list);
2073 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2074 	}
2075 }
2076 
2077 /*
2078  * pmap_drain_pv: free full PV pages.
2079  */
2080 static void
2081 pmap_drain_pv(struct pmap *pmap)
2082 {
2083 	struct pv_page *pvp;
2084 
2085 	KASSERT(mutex_owned(&pmap->pm_lock));
2086 
2087 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2088 		LIST_REMOVE(pvp, pvp_list);
2089 		KASSERT(pvp->pvp_pmap == pmap);
2090 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2091 		pvp->pvp_pmap = NULL;
2092 		pool_cache_put(&pmap_pvp_cache, pvp);
2093 	}
2094 }
2095 
2096 /*
2097  * pmap_check_pv: verify whether a {PTP, VA} pair is tracked by the page, as expected
2098  */
2099 static void
2100 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2101     vaddr_t va, bool tracked)
2102 {
2103 #ifdef DEBUG
2104 	struct pv_pte *pvpte;
2105 
2106 	PMAP_CHECK_PP(pp);
2107 
2108 	mutex_spin_enter(&pp->pp_lock);
2109 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2110 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2111 			break;
2112 		}
2113 	}
2114 	mutex_spin_exit(&pp->pp_lock);
2115 
2116 	if (pvpte && !tracked) {
2117 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2118 	} else if (!pvpte && tracked) {
2119 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2120 	}
2121 #endif
2122 }
2123 
2124 /*
2125  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2126  *
2127  * => pmap must be locked
2128  */
2129 static struct pv_entry *
2130 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2131     const rb_tree_t *tree, const vaddr_t va)
2132 {
2133 	struct pv_entry *pve;
2134 	rb_node_t *node;
2135 
2136 	/*
2137 	 * Inlined lookup tailored to exactly what's needed here; it is
2138 	 * quite a bit faster than using rb_tree_find_node().
2139 	 */
2140 	for (node = tree->rbt_root;;) {
2141 		if (__predict_false(RB_SENTINEL_P(node))) {
2142 			return NULL;
2143 		}
2144 		pve = (struct pv_entry *)
2145 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2146 		if (pve->pve_pte.pte_va == va) {
2147 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2148 			return pve;
2149 		}
2150 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2151 	}
2152 }
2153 
2154 /*
2155  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2156  *
2157  * => a PV entry must be known present (doesn't check for existence)
2158  * => pmap must be locked
2159  */
2160 static struct pv_entry *
2161 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2162     const struct pmap_page * const old_pp, const vaddr_t va)
2163 {
2164 	struct pv_entry *pve;
2165 	const rb_tree_t *tree;
2166 
2167 	KASSERT(mutex_owned(&pmap->pm_lock));
2168 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2169 
2170 	/*
2171 	 * [This mostly deals with the case of process-private pages, i.e.
2172 	 * anonymous memory allocations or COW.]
2173 	 *
2174 	 * If the page is tracked with an embedded entry then the tree
2175 	 * lookup can be avoided.  It's safe to check for this specific
2176 	 * set of values without pp_lock because both will only ever be
2177 	 * set together for this pmap.
2178 	 *
2179 	 */
2180 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2181 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2182 		return NULL;
2183 	}
2184 
2185 	/*
2186 	 * [This mostly deals with shared mappings, for example shared libs
2187 	 * and executables.]
2188 	 *
2189 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2190 	 * look at the lowest numbered node in the tree first.  The tree is
2191 	 * known non-empty because of the check above.  For short lived
2192 	 * processes where pmap_remove() isn't used much this gets close to
2193 	 * a 100% hit rate.
2194 	 */
2195 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2196 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2197 	pve = (struct pv_entry *)
2198 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2199 	    offsetof(struct pv_entry, pve_rb));
2200 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2201 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2202 		return pve;
2203 	}
2204 
2205 	/* Search the RB tree for the key (uncommon). */
2206 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2207 }
2208 
2209 /*
2210  * pmap_enter_pv: enter a mapping onto a pmap_page list
2211  *
2212  * => pmap must be locked
2213  * => does NOT insert dynamic entries into the tree (pmap_enter() does that later)
2214  */
2215 static int
2216 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2217     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2218     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2219 {
2220 	struct pv_entry *pve;
2221 	int error;
2222 
2223 	KASSERT(mutex_owned(&pmap->pm_lock));
2224 	KASSERT(ptp_to_pmap(ptp) == pmap);
2225 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2226 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2227 	PMAP_CHECK_PP(pp);
2228 
2229 	/*
2230 	 * If entering the same page and it's already tracked with an
2231 	 * embedded entry, we can avoid the expense below.  It's safe
2232 	 * to check for this very specific set of values without a lock
2233 	 * because both will only ever be set together for this pmap.
2234 	 */
2235 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2236 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2237 		*samepage = true;
2238 		pmap_check_pv(pmap, ptp, pp, va, true);
2239 		return 0;
2240 	}
2241 
2242 	/*
2243 	 * Check for an existing dynamic mapping at this address.  If it's
2244 	 * for the same page, then it will be reused and nothing needs to be
2245 	 * changed.
2246 	 */
2247 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2248 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2249 		*samepage = true;
2250 		pmap_check_pv(pmap, ptp, pp, va, true);
2251 		return 0;
2252 	}
2253 
2254 	/*
2255 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2256 	 * case it's needed; won't know for sure until the lock is taken.
2257 	 */
2258 	if (pmap->pm_pve == NULL) {
2259 		pmap->pm_pve = pmap_alloc_pv(pmap);
2260 	}
2261 
2262 	error = 0;
2263 	pmap_check_pv(pmap, ptp, pp, va, false);
2264 	mutex_spin_enter(&pp->pp_lock);
2265 	if (!pv_pte_embedded(pp)) {
2266 		/*
2267 		 * Embedded PV tracking available - easy.
2268 		 */
2269 		pp->pp_pte.pte_ptp = ptp;
2270 		pp->pp_pte.pte_va = va;
2271 		*new_embedded = true;
2272 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2273 		/*
2274 		 * No memory.
2275 		 */
2276 		error = ENOMEM;
2277 	} else {
2278 		/*
2279 		 * Install new pv_entry on the page.
2280 		 */
2281 		pve = pmap->pm_pve;
2282 		pmap->pm_pve = NULL;
2283 		*new_pve = pve;
2284 		pve->pve_pte.pte_ptp = ptp;
2285 		pve->pve_pte.pte_va = va;
2286 		pve->pve_pp = pp;
2287 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2288 	}
2289 	mutex_spin_exit(&pp->pp_lock);
2290 	if (error == 0) {
2291 		pmap_check_pv(pmap, ptp, pp, va, true);
2292 	}
2293 
2294 	return error;
2295 }
2296 
2297 /*
2298  * pmap_remove_pv: try to remove a mapping from a pv_list
2299  *
2300  * => pmap must be locked
2301  * => removes dynamic entries from tree and frees them
2302  * => caller should adjust ptp's wire_count and free PTP if needed
2303  */
2304 static void
2305 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2306     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2307 {
2308 	rb_tree_t *tree = (ptp != NULL ?
2309 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2310 
2311 	KASSERT(mutex_owned(&pmap->pm_lock));
2312 	KASSERT(ptp_to_pmap(ptp) == pmap);
2313 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2314 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2315 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2316 
2317 	pmap_check_pv(pmap, ptp, pp, va, true);
2318 
2319 	if (pve == NULL) {
2320 		mutex_spin_enter(&pp->pp_lock);
2321 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2322 		KASSERT(pp->pp_pte.pte_va == va);
2323 		pp->pp_attrs |= oattrs;
2324 		pp->pp_pte.pte_ptp = NULL;
2325 		pp->pp_pte.pte_va = 0;
2326 		mutex_spin_exit(&pp->pp_lock);
2327 	} else {
2328 		mutex_spin_enter(&pp->pp_lock);
2329 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2330 		    pp->pp_pte.pte_va != va);
2331 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2332 		KASSERT(pve->pve_pte.pte_va == va);
2333 		KASSERT(pve->pve_pp == pp);
2334 		pp->pp_attrs |= oattrs;
2335 		LIST_REMOVE(pve, pve_list);
2336 		mutex_spin_exit(&pp->pp_lock);
2337 
2338 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2339 		rb_tree_remove_node(tree, pve);
2340 #ifdef DIAGNOSTIC
2341 		memset(pve, 0, sizeof(*pve));
2342 #endif
2343 		pmap_free_pv(pmap, pve);
2344 	}
2345 
2346 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2347 	pmap_check_pv(pmap, ptp, pp, va, false);
2348 }
2349 
2350 /*
2351  * p t p   f u n c t i o n s
2352  */
2353 
2354 static struct vm_page *
2355 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2356 {
2357 	int lidx = level - 1;
2358 	off_t off = ptp_va2o(va, level);
2359 	struct vm_page *pg;
2360 
2361 	KASSERT(mutex_owned(&pmap->pm_lock));
2362 
2363 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2364 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2365 		pg = pmap->pm_ptphint[lidx];
2366 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2367 		return pg;
2368 	}
2369 	PMAP_DUMMY_LOCK(pmap);
2370 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2371 	PMAP_DUMMY_UNLOCK(pmap);
2372 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2373 		/* This page is queued to be freed - ignore. */
2374 		pg = NULL;
2375 	}
2376 	if (pg != NULL) {
2377 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2378 	}
2379 	pmap->pm_ptphint[lidx] = pg;
2380 	return pg;
2381 }
2382 
2383 static inline void
2384 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2385 {
2386 	int lidx;
2387 
2388 	KASSERT(ptp->wire_count <= 1);
2389 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2390 
2391 	lidx = level - 1;
2392 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2393 	if (pmap->pm_ptphint[lidx] == ptp)
2394 		pmap->pm_ptphint[lidx] = NULL;
2395 	ptp->wire_count = 0;
2396 	ptp->uanon = NULL;
2397 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2398 
2399 	/*
2400 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2401 	 * the page from the uvm_object, as that can take further locks
2402 	 * (intolerable right now because the PTEs are likely mapped in).
2403 	 * Instead mark the PTP as free and if we bump into it again, we'll
2404 	 * either ignore or reuse (depending on what's useful at the time).
2405 	 */
2406 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2407 }
2408 
2409 static void
2410 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2411 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2412 {
2413 	unsigned long index;
2414 	int level;
2415 	vaddr_t invaladdr;
2416 	pd_entry_t opde;
2417 
2418 	KASSERT(pmap != pmap_kernel());
2419 	KASSERT(mutex_owned(&pmap->pm_lock));
2420 	KASSERT(kpreempt_disabled());
2421 
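	/*
	 * Free the level 1 PTP, then walk up the tree freeing each
	 * parent whose wire count drops to 1 (nothing left in it).
	 */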
2422 	level = 1;
2423 	do {
2424 		index = pl_i(va, level + 1);
2425 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2426 
2427 		/*
2428 		 * On Xen-amd64 or SVS, we need to sync the top level page
2429 		 * directory on each CPU.
2430 		 */
2431 #if defined(XENPV) && defined(__x86_64__)
2432 		if (level == PTP_LEVELS - 1) {
2433 			xen_kpm_sync(pmap, index);
2434 		}
2435 #elif defined(SVS)
2436 		if (svs_enabled && level == PTP_LEVELS - 1) {
2437 			svs_pmap_sync(pmap, index);
2438 		}
2439 #endif
2440 
2441 		invaladdr = level == 1 ? (vaddr_t)ptes :
2442 		    (vaddr_t)pdes[level - 2];
2443 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2444 		    opde, TLBSHOOT_FREE_PTP);
2445 
2446 #if defined(XENPV)
2447 		pmap_tlb_shootnow();
2448 #endif
2449 
2450 		pmap_freepage(pmap, ptp, level);
2451 		if (level < PTP_LEVELS - 1) {
2452 			ptp = pmap_find_ptp(pmap, va, level + 1);
2453 			ptp->wire_count--;
2454 			if (ptp->wire_count > 1)
2455 				break;
2456 		}
2457 	} while (++level < PTP_LEVELS);
2458 	pmap_pte_flush();
2459 }
2460 
2461 /*
2462  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2463  *
2464  * => pmap should NOT be pmap_kernel()
2465  * => pmap should be locked
2466  * => we are not touching any PTEs yet, so they need not be mapped in
2467  */
2468 static int
2469 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2470     int flags, struct vm_page **resultp)
2471 {
2472 	struct vm_page *ptp;
2473 	int i, aflags;
2474 	struct uvm_object *obj;
2475 	voff_t off;
2476 
2477 	KASSERT(pmap != pmap_kernel());
2478 	KASSERT(mutex_owned(&pmap->pm_lock));
2479 
2480 	/*
2481 	 * Loop through all page table levels allocating a page
2482 	 * for any level where we don't already have one.
2483 	 */
2484 	memset(pt, 0, sizeof(*pt));
2485 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2486 		UVM_PGA_ZERO;
2487 	for (i = PTP_LEVELS; i > 1; i--) {
2488 		obj = &pmap->pm_obj[i - 2];
2489 		off = ptp_va2o(va, i - 1);
2490 
2491 		PMAP_DUMMY_LOCK(pmap);
2492 		pt->pg[i] = uvm_pagelookup(obj, off);
2493 
2494 		if (pt->pg[i] == NULL) {
2495 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2496 			pt->alloced[i] = (pt->pg[i] != NULL);
2497 		} else if (pt->pg[i]->wire_count == 0) {
2498 			/* This page was queued to be freed; dequeue it. */
2499 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2500 			pt->alloced[i] = true;
2501 		}
2502 		PMAP_DUMMY_UNLOCK(pmap);
2503 		if (pt->pg[i] == NULL) {
2504 			pmap_unget_ptp(pmap, pt);
2505 			return ENOMEM;
2506 		} else if (pt->alloced[i]) {
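			/*
			 * uanon records the lowest VA entered via this PTP
			 * (see pmap_zap_ptp()); start with an impossible
			 * value.
			 */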
2507 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2508 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2509 			    &pmap_rbtree_ops);
2510 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2511 		}
2512 	}
2513 	ptp = pt->pg[2];
2514 	KASSERT(ptp != NULL);
2515 	*resultp = ptp;
2516 	pmap->pm_ptphint[0] = ptp;
2517 	return 0;
2518 }
2519 
2520 /*
2521  * pmap_install_ptp: install any freshly allocated PTPs
2522  *
2523  * => pmap should NOT be pmap_kernel()
2524  * => pmap should be locked
2525  * => PTEs must be mapped
2526  * => preemption must be disabled
2527  */
2528 static void
2529 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2530     pd_entry_t * const *pdes)
2531 {
2532 	struct vm_page *ptp;
2533 	unsigned long index;
2534 	pd_entry_t *pva;
2535 	paddr_t pa;
2536 	int i;
2537 
2538 	KASSERT(pmap != pmap_kernel());
2539 	KASSERT(mutex_owned(&pmap->pm_lock));
2540 	KASSERT(kpreempt_disabled());
2541 
2542 	/*
2543 	 * Now that we have all the pages looked up or allocated,
2544 	 * loop through again installing any new ones into the tree.
2545 	 */
2546 	for (i = PTP_LEVELS; i > 1; i--) {
2547 		index = pl_i(va, i);
2548 		pva = pdes[i - 2];
2549 
2550 		if (pmap_valid_entry(pva[index])) {
2551 			KASSERT(!pt->alloced[i]);
2552 			continue;
2553 		}
2554 
2555 		ptp = pt->pg[i];
2556 		ptp->flags &= ~PG_BUSY; /* never busy */
2557 		ptp->wire_count = 1;
2558 		pmap->pm_ptphint[i - 2] = ptp;
2559 		pa = VM_PAGE_TO_PHYS(ptp);
2560 		pmap_pte_set(&pva[index], (pd_entry_t)
2561 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2562 
2563 		/*
2564 		 * On Xen-amd64 or SVS, we need to sync the top level page
2565 		 * directory on each CPU.
2566 		 */
2567 #if defined(XENPV) && defined(__x86_64__)
2568 		if (i == PTP_LEVELS) {
2569 			xen_kpm_sync(pmap, index);
2570 		}
2571 #elif defined(SVS)
2572 		if (svs_enabled && i == PTP_LEVELS) {
2573 			svs_pmap_sync(pmap, index);
2574 		}
2575 #endif
2576 
2577 		pmap_pte_flush();
2578 		pmap_stats_update(pmap, 1, 0);
2579 
2580 		/*
2581 		 * If we're not in the top level, increase the
2582 		 * wire count of the parent page.
2583 		 */
2584 		if (i < PTP_LEVELS) {
2585 			pt->pg[i + 1]->wire_count++;
2586 		}
2587 	}
2588 }
2589 
2590 /*
2591  * pmap_unget_ptp: free unused PTPs
2592  *
2593  * => pmap should NOT be pmap_kernel()
2594  * => pmap should be locked
2595  */
2596 static void
2597 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2598 {
2599 	int i;
2600 
2601 	KASSERT(pmap != pmap_kernel());
2602 	KASSERT(mutex_owned(&pmap->pm_lock));
2603 
2604 	for (i = PTP_LEVELS; i > 1; i--) {
2605 		if (!pt->alloced[i]) {
2606 			continue;
2607 		}
2608 		KASSERT(pt->pg[i]->wire_count == 0);
2609 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2610 		pmap_freepage(pmap, pt->pg[i], i - 1);
2611 	}
2612 }
2613 
2614 /*
2615  * p m a p   l i f e c y c l e   f u n c t i o n s
2616  */
2617 
2618 /*
2619  * pmap_pdp_init: constructor for a new PDP.
2620  */
2621 static void
2622 pmap_pdp_init(pd_entry_t *pdir)
2623 {
2624 	paddr_t pdirpa = 0;
2625 	vaddr_t object;
2626 	int i;
2627 
2628 #if !defined(XENPV) || !defined(__x86_64__)
2629 	int npde;
2630 #endif
2631 #ifdef XENPV
2632 	int s;
2633 #endif
2634 
2635 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2636 
2637 	/*
2638 	 * NOTE: This is all done unlocked, but we will check afterwards
2639 	 * if we have raced with pmap_growkernel().
2640 	 */
2641 
2642 #if defined(XENPV) && defined(__x86_64__)
2643 	/* Fetch the physical address of the page directory */
2644 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2645 
2646 	/*
2647 	 * This pdir will NEVER be active in kernel mode, so mark
2648 	 * recursive entry invalid.
2649 	 */
2650 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2651 
2652 	/*
2653 	 * PDP constructed this way won't be for the kernel, hence we
2654 	 * don't put kernel mappings on Xen.
2655 	 *
2656 	 * But we need to make pmap_create() happy, so put a dummy
2657 	 * (without PTE_P) value at the right place.
2658 	 */
2659 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2660 	     (pd_entry_t)-1 & PTE_FRAME;
2661 #else /* XENPV && __x86_64__*/
2662 	object = (vaddr_t)pdir;
2663 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2664 		/* Fetch the physical address of the page directory */
2665 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2666 
2667 		/* Put in recursive PDE to map the PTEs */
2668 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2669 		    pmap_pg_nx;
2670 #ifndef XENPV
2671 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2672 #endif
2673 	}
2674 
2675 	/* Copy the kernel's top level PDE */
2676 	npde = nkptp[PTP_LEVELS - 1];
2677 
2678 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2679 	    npde * sizeof(pd_entry_t));
2680 
2681 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2682 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2683 		pdir[idx] = PDP_BASE[idx];
2684 	}
2685 
2686 #ifdef __HAVE_PCPU_AREA
2687 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2688 #endif
2689 #ifdef __HAVE_DIRECT_MAP
2690 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2691 #endif
2692 #ifdef KASAN
2693 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2694 #endif
2695 #ifdef KMSAN
2696 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2697 #endif
2698 #endif /* XENPV  && __x86_64__*/
2699 
2700 #ifdef XENPV
2701 	s = splvm();
2702 	object = (vaddr_t)pdir;
2703 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2704 	    VM_PROT_READ);
2705 	pmap_update(pmap_kernel());
2706 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2707 		/*
2708 		 * Pin as an L2/L4 page; the page holding the
2709 		 * PDIR_SLOT_PTE entries has to be done last.
2710 		 */
2711 #ifdef PAE
2712 		if (i == l2tol3(PDIR_SLOT_PTE))
2713 			continue;
2714 #endif
2715 
2716 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2717 #ifdef __x86_64__
2718 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2719 #else
2720 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2721 #endif
2722 	}
2723 #ifdef PAE
2724 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2725 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2726 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2727 #endif
2728 	splx(s);
2729 #endif /* XENPV */
2730 }
2731 
2732 /*
2733  * pmap_pdp_fini: destructor for the PDPs.
2734  */
2735 static void
2736 pmap_pdp_fini(pd_entry_t *pdir)
2737 {
2738 #ifdef XENPV
2739 	paddr_t pdirpa = 0;	/* XXX: GCC */
2740 	vaddr_t object = (vaddr_t)pdir;
2741 	int i;
2742 	int s = splvm();
2743 	pt_entry_t *pte;
2744 
2745 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2746 		/* fetch the physical address of the page directory. */
2747 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2748 		/* unpin page table */
2749 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2750 	}
2751 	object = (vaddr_t)pdir;
2752 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2753 		/* Set page RW again */
2754 		pte = kvtopte(object);
2755 		pmap_pte_set(pte, *pte | PTE_W);
2756 		xen_bcast_invlpg((vaddr_t)object);
2757 	}
2758 	splx(s);
2759 #endif  /* XENPV */
2760 }
2761 
2762 #ifdef PAE
2763 static void *
2764 pmap_pdp_alloc(struct pool *pp, int flags)
2765 {
2766 	return (void *)uvm_km_alloc(kernel_map,
2767 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2768 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2769 	    UVM_KMF_WIRED);
2770 }
2771 
2772 static void
2773 pmap_pdp_free(struct pool *pp, void *v)
2774 {
2775 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2776 	    UVM_KMF_WIRED);
2777 }
2778 #endif /* PAE */
2779 
2780 /*
2781  * pmap_ctor: constructor for the pmap cache.
2782  */
2783 static int
2784 pmap_ctor(void *arg, void *obj, int flags)
2785 {
2786 	struct pmap *pmap = obj;
2787 	pt_entry_t p;
2788 	int i;
2789 
2790 	KASSERT((flags & PR_WAITOK) != 0);
2791 
2792 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2793 	rw_init(&pmap->pm_dummy_lock);
2794 	kcpuset_create(&pmap->pm_cpus, true);
2795 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2796 #ifdef XENPV
2797 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2798 #endif
2799 	LIST_INIT(&pmap->pm_gc_ptp);
2800 	pmap->pm_pve = NULL;
2801 	LIST_INIT(&pmap->pm_pvp_full);
2802 	LIST_INIT(&pmap->pm_pvp_part);
2803 	LIST_INIT(&pmap->pm_pvp_empty);
2804 
2805 	/* allocate and init PDP */
2806 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2807 
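	/*
	 * Initialize the PDP, then check under pmaps_lock whether we
	 * raced with pmap_growkernel(): if the last kernel slot we
	 * copied is still zero, the kernel grew in the meantime, so
	 * redo the initialization.
	 */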
2808 	for (;;) {
2809 		pmap_pdp_init(pmap->pm_pdir);
2810 		mutex_enter(&pmaps_lock);
2811 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2812 		if (__predict_true(p != 0)) {
2813 			break;
2814 		}
2815 		mutex_exit(&pmaps_lock);
2816 	}
2817 
2818 	for (i = 0; i < PDP_SIZE; i++)
2819 		pmap->pm_pdirpa[i] =
2820 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2821 
2822 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2823 	mutex_exit(&pmaps_lock);
2824 
2825 	return 0;
2826 }
2827 
2828 /*
2829  * pmap_dtor: destructor for the pmap cache.
2830  */
2831 static void
2832 pmap_dtor(void *arg, void *obj)
2833 {
2834 	struct pmap *pmap = obj;
2835 
2836 	mutex_enter(&pmaps_lock);
2837 	LIST_REMOVE(pmap, pm_list);
2838 	mutex_exit(&pmaps_lock);
2839 
2840 	pmap_pdp_fini(pmap->pm_pdir);
2841 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2842 	mutex_destroy(&pmap->pm_lock);
2843 	rw_destroy(&pmap->pm_dummy_lock);
2844 	kcpuset_destroy(pmap->pm_cpus);
2845 	kcpuset_destroy(pmap->pm_kernel_cpus);
2846 #ifdef XENPV
2847 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2848 #endif
2849 }
2850 
2851 /*
2852  * pmap_create: create a pmap object.
2853  */
2854 struct pmap *
2855 pmap_create(void)
2856 {
2857 	struct pmap *pmap;
2858 	int i;
2859 
2860 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2861 
2862 	/* init uvm_object */
2863 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2864 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2865 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2866 		pmap->pm_ptphint[i] = NULL;
2867 	}
2868 	pmap->pm_stats.wired_count = 0;
2869 	/* count the PDP allocated by pmap_ctor() */
2870 	pmap->pm_stats.resident_count = PDP_SIZE;
2871 #if !defined(__x86_64__)
2872 	pmap->pm_hiexec = 0;
2873 #endif
2874 
2875 	/* Used by NVMM and Xen */
2876 	pmap->pm_enter = NULL;
2877 	pmap->pm_extract = NULL;
2878 	pmap->pm_remove = NULL;
2879 	pmap->pm_sync_pv = NULL;
2880 	pmap->pm_pp_remove_ent = NULL;
2881 	pmap->pm_write_protect = NULL;
2882 	pmap->pm_unwire = NULL;
2883 	pmap->pm_tlb_flush = NULL;
2884 	pmap->pm_data = NULL;
2885 
2886 	/* init the LDT */
2887 	pmap->pm_ldt = NULL;
2888 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2889 
2890 	return (pmap);
2891 }
2892 
2893 /*
2894  * pmap_check_ptps: verify that none of the pmap's page table objects
2895  * have any pages allocated to them.
2896  */
2897 static void
2898 pmap_check_ptps(struct pmap *pmap)
2899 {
2900 	int i;
2901 
2902 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2903 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2904 		    "pmap %p level %d still has %d pages",
2905 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2906 	}
2907 }
2908 
2909 static void
2910 pmap_check_inuse(struct pmap *pmap)
2911 {
2912 #ifdef DEBUG
2913 	CPU_INFO_ITERATOR cii;
2914 	struct cpu_info *ci;
2915 
2916 	for (CPU_INFO_FOREACH(cii, ci)) {
2917 		if (ci->ci_pmap == pmap)
2918 			panic("destroying pmap being used");
2919 #if defined(XENPV) && defined(__x86_64__)
2920 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2921 			if (pmap->pm_pdir[i] != 0 &&
2922 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2923 				printf("pmap_destroy(%p) pmap_kernel %p "
2924 				    "curcpu %d cpu %d ci_pmap %p "
2925 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2926 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2927 				    pmap, pmap_kernel(), curcpu()->ci_index,
2928 				    ci->ci_index, ci->ci_pmap,
2929 				    i, ci->ci_kpm_pdir[i],
2930 				    i, pmap->pm_pdir[i]);
2931 				panic("%s: used pmap", __func__);
2932 			}
2933 		}
2934 #endif
2935 	}
2936 #endif /* DEBUG */
2937 }
2938 
2939 /*
2940  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
2941  * count goes to zero.
2942  *
2943  * => we can be called from pmap_unmap_ptes() with a different, unrelated
2944  *    pmap's lock held.  be careful!
2945  */
2946 void
2947 pmap_destroy(struct pmap *pmap)
2948 {
2949 	int i;
2950 
2951 	/*
2952 	 * drop reference count and verify not in use.
2953 	 */
2954 
2955 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2956 		return;
2957 	}
2958 	pmap_check_inuse(pmap);
2959 
2960 	/*
2961 	 * handle any deferred frees.
2962 	 */
2963 
2964 	mutex_enter(&pmap->pm_lock);
2965 	if (pmap->pm_pve != NULL) {
2966 		pmap_free_pv(pmap, pmap->pm_pve);
2967 		pmap->pm_pve = NULL;
2968 	}
2969 	pmap_drain_pv(pmap);
2970 	mutex_exit(&pmap->pm_lock);
2971 	pmap_update(pmap);
2972 
2973 	/*
2974 	 * Reference count is zero, free pmap resources and then free pmap.
2975 	 */
2976 
2977 	pmap_check_ptps(pmap);
2978 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
2979 
2980 #ifdef USER_LDT
2981 	if (pmap->pm_ldt != NULL) {
2982 		/*
2983 		 * No need to switch the LDT; this address space is gone,
2984 		 * nothing is using it.
2985 		 *
2986 		 * No need to lock the pmap for ldt_free (or anything else),
2987 		 * we're the last one to use it.
2988 		 */
2989 		/* XXXAD can't take cpu_lock here - fix soon. */
2990 		mutex_enter(&cpu_lock);
2991 		ldt_free(pmap->pm_ldt_sel);
2992 		mutex_exit(&cpu_lock);
2993 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2994 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
2995 	}
2996 #endif
2997 
2998 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2999 		uvm_obj_destroy(&pmap->pm_obj[i], false);
3000 	}
3001 	kcpuset_zero(pmap->pm_cpus);
3002 	kcpuset_zero(pmap->pm_kernel_cpus);
3003 #ifdef XENPV
3004 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3005 #endif
3006 
3007 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3008 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3009 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3010 
3011 	pmap_check_ptps(pmap);
3012 	if (__predict_false(pmap->pm_enter != NULL)) {
3013 		/* XXX make this a different cache */
3014 		pool_cache_destruct_object(&pmap_cache, pmap);
3015 	} else {
3016 		pool_cache_put(&pmap_cache, pmap);
3017 	}
3018 }
3019 
3020 /*
3021  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3022  *
3023  * => caller must hold pmap's lock
3024  * => PTP must be mapped into KVA
3025  * => must be called with kernel preemption disabled
3026  * => does as little work as possible
3027  */
3028 static void
3029 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3030     vaddr_t startva, vaddr_t blkendva)
3031 {
3032 #ifndef XENPV
3033 	struct pv_entry *pve;
3034 	struct vm_page *pg;
3035 	struct pmap_page *pp;
3036 	pt_entry_t opte;
3037 	rb_tree_t *tree;
3038 	vaddr_t va;
3039 	int wired;
3040 	uint8_t oattrs;
3041 	u_int cnt;
3042 
3043 	KASSERT(mutex_owned(&pmap->pm_lock));
3044 	KASSERT(kpreempt_disabled());
3045 	KASSERT(pmap != pmap_kernel());
3046 	KASSERT(ptp->wire_count > 1);
3047 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3048 
3049 	/*
3050 	 * Start at the lowest entered VA, and scan until there are no more
3051 	 * PTEs in the PTP.
3052 	 */
3053 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3054 	pve = RB_TREE_MIN(tree);
3055 	wired = 0;
3056 	va = (vaddr_t)ptp->uanon;
3057 	pte += ((va - startva) >> PAGE_SHIFT);
3058 
3059 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3060 		/*
3061 		 * No need for an atomic to clear the PTE.  Nothing else can
3062 		 * see the address space any more and speculative access (if
3063 		 * possible) won't modify.  Therefore there's no need to
3064 		 * track the accessed/dirty bits.
3065 		 */
3066 		opte = *pte;
3067 		if (!pmap_valid_entry(opte)) {
3068 			continue;
3069 		}
3070 
3071 		/*
3072 		 * Count the PTE.  If it's not for a managed mapping
3073 		 * there's nothing more to do.
3074 		 */
3075 		cnt--;
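		/* Accumulate -PTE_WIRED per wired mapping; converted to a count below. */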
3076 		wired -= (opte & PTE_WIRED);
3077 		if ((opte & PTE_PVLIST) == 0) {
3078 #ifndef DOM0OPS
3079 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3080 			    "managed page without PTE_PVLIST for %#"
3081 			    PRIxVADDR, va);
3082 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3083 			    "pv-tracked page without PTE_PVLIST for %#"
3084 			    PRIxVADDR, va);
3085 #endif
3086 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3087 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3088 			    va) == NULL);
3089 			continue;
3090 		}
3091 
3092 		/*
3093 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3094 		 * in the PTP.  If it's for this VA, take advantage of it to
3095 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3096 		 * tree by skipping to the next VA in the tree whenever
3097 		 * there is a match here.  The tree will be cleared out in
3098 		 * one pass before return to pmap_remove_all().
3099 		 */
3100 		oattrs = pmap_pte_to_pp_attrs(opte);
3101 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3102 			pp = pve->pve_pp;
3103 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3104 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3105 			    pp->pp_pte.pte_va != va);
3106 			mutex_spin_enter(&pp->pp_lock);
3107 			pp->pp_attrs |= oattrs;
3108 			LIST_REMOVE(pve, pve_list);
3109 			mutex_spin_exit(&pp->pp_lock);
3110 
3111 			/*
3112 			 * pve won't be touched again until pmap_drain_pv(),
3113 			 * so it's still safe to traverse the tree.
3114 			 */
3115 			pmap_free_pv(pmap, pve);
3116 			pve = RB_TREE_NEXT(tree, pve);
3117 			continue;
3118 		}
3119 
3120 		/*
3121 		 * No entry in the tree so it must be embedded.  Look up the
3122 		 * page and cancel the embedded entry.
3123 		 */
3124 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3125 			pp = VM_PAGE_TO_PP(pg);
3126 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3127 			paddr_t pa = pmap_pte2pa(opte);
3128 			panic("%s: PTE_PVLIST with pv-untracked page"
3129 			    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR
3130 			    " (%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3131 		}
3132 		mutex_spin_enter(&pp->pp_lock);
3133 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3134 		KASSERT(pp->pp_pte.pte_va == va);
3135 		pp->pp_attrs |= oattrs;
3136 		pp->pp_pte.pte_ptp = NULL;
3137 		pp->pp_pte.pte_va = 0;
3138 		mutex_spin_exit(&pp->pp_lock);
3139 	}
3140 
3141 	/* PTP now empty - adjust the tree & stats to match. */
3142 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3143 	ptp->wire_count = 1;
3144 #ifdef DIAGNOSTIC
3145 	rb_tree_init(tree, &pmap_rbtree_ops);
3146 #endif
3147 #else	/* !XENPV */
3148 	/*
3149 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3150 	 * I guess the hypervisor keeps track of PTEs too.
3151 	 */
3152 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3153 #endif	/* !XENPV */
3154 }
3155 
3156 /*
3157  * pmap_remove_all: remove all mappings from pmap in bulk.
3158  *
3159  * Ordinarily when removing mappings it's important to hold the UVM object's
3160  * lock, so that pages do not gain a new identity while retaining stale TLB
3161  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3162  * Here it's known that the address space is no longer visible to any user
3163  * process, so we don't need to worry about that.
3164  */
3165 bool
3166 pmap_remove_all(struct pmap *pmap)
3167 {
3168 	struct vm_page *ptps[32];
3169 	vaddr_t va, blkendva;
3170 	struct pmap *pmap2;
3171 	pt_entry_t *ptes;
3172 	pd_entry_t pde __diagused;
3173 	pd_entry_t * const *pdes;
3174 	int lvl __diagused, i, n;
3175 
3176 	/* XXX Can't handle EPT just yet. */
3177 	if (pmap->pm_remove != NULL) {
3178 		return false;
3179 	}
3180 
3181 	for (;;) {
3182 		/* Fetch a block of PTPs from tree. */
3183 		mutex_enter(&pmap->pm_lock);
3184 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3185 		    (void **)ptps, __arraycount(ptps), false);
3186 		if (n == 0) {
3187 			mutex_exit(&pmap->pm_lock);
3188 			break;
3189 		}
3190 
3191 		/* Remove all mappings in the set of PTPs. */
3192 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3193 		for (i = 0; i < n; i++) {
3194 			if (ptps[i]->wire_count == 0) {
3195 				/* It's dead: pmap_update() will expunge. */
3196 				continue;
3197 			}
3198 
3199 			/* Determine range of block. */
3200 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3201 			blkendva = x86_round_pdr(va + 1);
3202 
3203 			/* Make sure everything squares up... */
3204 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3205 			KASSERT(lvl == 1);
3206 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3207 
3208 			/* Zap! */
3209 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3210 			    blkendva);
3211 
3212 			/* PTP should now be unused - free it. */
3213 			KASSERT(ptps[i]->wire_count == 1);
3214 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3215 		}
3216 		pmap_unmap_ptes(pmap, pmap2);
3217 		pmap_drain_pv(pmap);
3218 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3219 		mutex_exit(&pmap->pm_lock);
3220 
3221 		/* Process deferred frees. */
3222 		pmap_update(pmap);
3223 
3224 		/* A breathing point. */
3225 		preempt_point();
3226 	}
3227 
3228 	/* Verify that the pmap is now completely empty. */
3229 	pmap_check_ptps(pmap);
3230 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3231 	    "pmap %p not empty", pmap);
3232 
3233 	return true;
3234 }
3235 
3236 #if defined(PMAP_FORK)
3237 /*
3238  * pmap_fork: perform any necessary data structure manipulation when
3239  * a VM space is forked.
3240  */
3241 void
3242 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3243 {
3244 #ifdef USER_LDT
3245 	union descriptor *new_ldt;
3246 	int sel;
3247 
3248 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3249 		return;
3250 	}
3251 
3252 	/*
3253 	 * Copy the LDT into the new process.
3254 	 *
3255 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3256 	 * we'll retry. This will starve if there's a stream of LDT changes
3257 	 * in another thread but that should not happen.
3258 	 */
3259 
3260 retry:
3261 	if (pmap1->pm_ldt != NULL) {
3262 		/* Allocate space for the new process's LDT */
3263 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3264 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3265 		if (new_ldt == NULL) {
3266 			printf("WARNING: %s: unable to allocate LDT space\n",
3267 			    __func__);
3268 			return;
3269 		}
3270 		mutex_enter(&cpu_lock);
3271 		/* Get a GDT slot for it */
3272 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3273 		if (sel == -1) {
3274 			mutex_exit(&cpu_lock);
3275 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3276 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3277 			printf("WARNING: %s: unable to allocate LDT selector\n",
3278 			    __func__);
3279 			return;
3280 		}
3281 	} else {
3282 		/* Wasn't anything there after all. */
3283 		new_ldt = NULL;
3284 		sel = -1;
3285 		mutex_enter(&cpu_lock);
3286 	}
3287 
3288  	/*
3289 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3290 	 */
3291  	if (pmap1->pm_ldt != NULL) {
3292 		if (new_ldt == NULL) {
3293 			/* A wild LDT just appeared. */
3294 			mutex_exit(&cpu_lock);
3295 			goto retry;
3296 		}
3297 
3298 		/* Copy the LDT data and install it in pmap2 */
3299 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3300 		pmap2->pm_ldt = new_ldt;
3301 		pmap2->pm_ldt_sel = sel;
3302 		mutex_exit(&cpu_lock);
3303 	} else {
3304 		if (new_ldt != NULL) {
3305 			/* The LDT disappeared, drop what we did. */
3306 			ldt_free(sel);
3307 			mutex_exit(&cpu_lock);
3308 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3309 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3310 			return;
3311 		}
3312 
3313 		/* We're good, just leave. */
3314 		mutex_exit(&cpu_lock);
3315 	}
3316 #endif /* USER_LDT */
3317 }
3318 #endif /* PMAP_FORK */
3319 
3320 #ifdef USER_LDT
3321 
3322 /*
3323  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3324  * is active, reload LDTR.
3325  */
3326 static void
3327 pmap_ldt_xcall(void *arg1, void *arg2)
3328 {
3329 	struct pmap *pm;
3330 
3331 	kpreempt_disable();
3332 	pm = arg1;
3333 	if (curcpu()->ci_pmap == pm) {
3334 #if defined(SVS)
3335 		if (svs_enabled) {
3336 			svs_ldt_sync(pm);
3337 		} else
3338 #endif
3339 		lldt(pm->pm_ldt_sel);
3340 	}
3341 	kpreempt_enable();
3342 }
3343 
3344 /*
3345  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3346  * in the new selector on all CPUs.
3347  */
3348 void
3349 pmap_ldt_sync(struct pmap *pm)
3350 {
3351 	uint64_t where;
3352 
3353 	KASSERT(mutex_owned(&cpu_lock));
3354 
3355 	pmap_ldt_evcnt.ev_count++;
3356 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3357 	xc_wait(where);
3358 }
3359 
3360 /*
3361  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3362  * restore the default.
3363  */
3364 void
3365 pmap_ldt_cleanup(struct lwp *l)
3366 {
3367 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3368 	union descriptor *ldt;
3369 	int sel;
3370 
3371 	if (__predict_true(pmap->pm_ldt == NULL)) {
3372 		return;
3373 	}
3374 
3375 	mutex_enter(&cpu_lock);
3376 	if (pmap->pm_ldt != NULL) {
3377 		sel = pmap->pm_ldt_sel;
3378 		ldt = pmap->pm_ldt;
3379 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3380 		pmap->pm_ldt = NULL;
3381 		pmap_ldt_sync(pmap);
3382 		ldt_free(sel);
3383 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3384 		    UVM_KMF_WIRED);
3385 	}
3386 	mutex_exit(&cpu_lock);
3387 }
3388 #endif /* USER_LDT */
3389 
3390 /*
3391  * pmap_activate: activate a process' pmap
3392  *
3393  * => must be called with kernel preemption disabled
3394  * => if lwp is the curlwp, then set ci_want_pmapload so that
3395  *    actual MMU context switch will be done by pmap_load() later
3396  */
3397 void
3398 pmap_activate(struct lwp *l)
3399 {
3400 	struct cpu_info *ci;
3401 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3402 
3403 	KASSERT(kpreempt_disabled());
3404 
3405 	ci = curcpu();
3406 
3407 	if (l != ci->ci_curlwp)
3408 		return;
3409 
3410 	KASSERT(ci->ci_want_pmapload == 0);
3411 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3412 
3413 	/*
3414 	 * no need to switch to kernel vmspace because
3415 	 * it's a subset of any vmspace.
3416 	 */
3417 
3418 	if (pmap == pmap_kernel()) {
3419 		ci->ci_want_pmapload = 0;
3420 		return;
3421 	}
3422 
3423 	ci->ci_want_pmapload = 1;
3424 }
3425 
3426 #if defined(XENPV) && defined(__x86_64__)
3427 #define	KASSERT_PDIRPA(pmap) \
3428 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3429 	    pmap == pmap_kernel())
3430 #elif defined(PAE)
3431 #define	KASSERT_PDIRPA(pmap) \
3432 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3433 #elif !defined(XENPV)
3434 #define	KASSERT_PDIRPA(pmap) \
3435 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3436 #else
3437 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
3438 #endif
3439 
3440 /*
3441  * pmap_reactivate: try to regain reference to the pmap.
3442  *
3443  * => Must be called with kernel preemption disabled.
3444  */
3445 static void
3446 pmap_reactivate(struct pmap *pmap)
3447 {
3448 	struct cpu_info * const ci = curcpu();
3449 	const cpuid_t cid = cpu_index(ci);
3450 
3451 	KASSERT(kpreempt_disabled());
3452 	KASSERT_PDIRPA(pmap);
3453 
3454 	/*
3455 	 * If we still have a lazy reference to this pmap, we can assume
3456 	 * that there was no TLB shootdown for this pmap in the meantime.
3457 	 *
3458 	 * The order of events here is important as we must synchronize
3459 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3460 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3461 	 * change only when the state is TLBSTATE_LAZY.
3462 	 */
3463 
3464 	ci->ci_tlbstate = TLBSTATE_VALID;
3465 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3466 
3467 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3468 		/* We have the reference, state is valid. */
3469 	} else {
3470 		/*
3471 		 * Must reload the TLB: the pmap has been changed while
3472 		 * it was deactivated.
3473 		 */
3474 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3475 
3476 		tlbflush();
3477 	}
3478 }
3479 
3480 /*
3481  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3482  * and relevant LDT info.
3483  *
3484  * Ensures that the current process' pmap is loaded on the current CPU's
3485  * MMU and that there are no stale TLB entries.
3486  *
3487  * => The caller should disable kernel preemption or do check-and-retry
3488  *    to prevent a preemption from undoing our efforts.
3489  * => This function may block.
3490  */
3491 void
3492 pmap_load(void)
3493 {
3494 	struct cpu_info *ci;
3495 	struct pmap *pmap, *oldpmap;
3496 	struct lwp *l;
3497 	uint64_t ncsw;
3498 
3499 	kpreempt_disable();
3500  retry:
3501 	ci = curcpu();
3502 	if (!ci->ci_want_pmapload) {
3503 		kpreempt_enable();
3504 		return;
3505 	}
3506 	l = ci->ci_curlwp;
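	/* Record the context switch count so we can tell below whether we blocked. */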
3507 	ncsw = l->l_ncsw;
3508 	__insn_barrier();
3509 
3510 	/* should be able to take ipis. */
3511 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3512 #ifdef XENPV
3513 	/* Check that interrupts are enabled (i.e., no events are masked) */
3514 	KASSERT(x86_read_psl() == 0);
3515 #else
3516 	KASSERT((x86_read_psl() & PSL_I) != 0);
3517 #endif
3518 
3519 	KASSERT(l != NULL);
3520 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3521 	KASSERT(pmap != pmap_kernel());
3522 	oldpmap = ci->ci_pmap;
3523 
3524 	if (pmap == oldpmap) {
3525 		pmap_reactivate(pmap);
3526 		ci->ci_want_pmapload = 0;
3527 		kpreempt_enable();
3528 		return;
3529 	}
3530 
3531 	/*
3532 	 * Acquire a reference to the new pmap and perform the switch.
3533 	 */
3534 
3535 	pmap_reference(pmap);
3536 	pmap_load1(l, pmap, oldpmap);
3537 	ci->ci_want_pmapload = 0;
3538 
3539 	/*
3540 	 * we're now running with the new pmap.  drop the reference
3541 	 * to the old pmap.  if we block, we need to go around again.
3542 	 */
3543 
3544 	pmap_destroy(oldpmap);
3545 	__insn_barrier();
3546 	if (l->l_ncsw != ncsw) {
3547 		goto retry;
3548 	}
3549 
3550 	kpreempt_enable();
3551 }
3552 
3553 /*
3554  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3555  * pmap_load().  It's critically important that this function does not
3556  * block.
3557  */
3558 static void
3559 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3560 {
3561 	struct cpu_info *ci;
3562 	struct pcb *pcb;
3563 	cpuid_t cid;
3564 
3565 	KASSERT(kpreempt_disabled());
3566 
3567 	pcb = lwp_getpcb(l);
3568 	ci = l->l_cpu;
3569 	cid = cpu_index(ci);
3570 
3571 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3572 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3573 
3574 	KASSERT_PDIRPA(oldpmap);
3575 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3576 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3577 
3578 	/*
3579 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3580 	 * with TLB shootdown interrupts, so set the state VALID first,
3581 	 * then register us for shootdown events on this pmap.
3582 	 */
3583 	ci->ci_tlbstate = TLBSTATE_VALID;
3584 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3585 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3586 	ci->ci_pmap = pmap;
3587 
3588 	/*
3589 	 * update tss.  now that we have registered for invalidations
3590 	 * from other CPUs, we're good to load the page tables.
3591 	 */
3592 #ifdef PAE
3593 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3594 #else
3595 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3596 #endif
3597 
3598 #ifdef i386
3599 #ifndef XENPV
3600 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3601 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3602 #endif
3603 #endif
3604 
3605 #if defined(SVS) && defined(USER_LDT)
3606 	if (svs_enabled) {
3607 		svs_ldt_sync(pmap);
3608 	} else
3609 #endif
3610 	lldt(pmap->pm_ldt_sel);
3611 
3612 	cpu_load_pmap(pmap, oldpmap);
3613 }
3614 
3615 /*
3616  * pmap_deactivate: deactivate a process' pmap.
3617  *
3618  * => Must be called with kernel preemption disabled (high IPL is enough).
3619  */
3620 void
3621 pmap_deactivate(struct lwp *l)
3622 {
3623 	struct pmap *pmap;
3624 	struct cpu_info *ci;
3625 
3626 	KASSERT(kpreempt_disabled());
3627 
3628 	if (l != curlwp) {
3629 		return;
3630 	}
3631 
3632 	/*
3633 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3634 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3635 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3636 	 * to kernel preemption or blocking routine in between.
3637 	 */
3638 	pmap_tlb_shootnow();
3639 
3640 	ci = curcpu();
3641 
3642 	if (ci->ci_want_pmapload) {
3643 		/*
3644 		 * ci_want_pmapload means that our pmap is not loaded on
3645 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3646 		 * is always considered loaded.
3647 		 */
3648 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3649 		    != pmap_kernel());
3650 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3651 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3652 
3653 		/*
3654 		 * userspace has not been touched.
3655 		 * nothing to do here.
3656 		 */
3657 
3658 		ci->ci_want_pmapload = 0;
3659 		return;
3660 	}
3661 
3662 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3663 
3664 	if (pmap == pmap_kernel()) {
3665 		return;
3666 	}
3667 
3668 	KASSERT_PDIRPA(pmap);
3669 	KASSERT(ci->ci_pmap == pmap);
3670 
3671 	/*
3672 	 * we aren't interested in TLB invalidations for this pmap,
3673 	 * at least for the time being.
3674 	 */
3675 
3676 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3677 	ci->ci_tlbstate = TLBSTATE_LAZY;
3678 }
3679 
3680 /*
3681  * some misc. functions
3682  */
3683 
3684 bool
3685 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3686     int *lastlvl)
3687 {
3688 	unsigned long index;
3689 	pd_entry_t pde;
3690 	int i;
3691 
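	/* Walk from the top level down; stop early at a large page (PTE_PS). */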
3692 	for (i = PTP_LEVELS; i > 1; i--) {
3693 		index = pl_i(va, i);
3694 		pde = pdes[i - 2][index];
3695 		if ((pde & PTE_P) == 0) {
3696 			*lastlvl = i;
3697 			return false;
3698 		}
3699 		if (pde & PTE_PS)
3700 			break;
3701 	}
3702 	if (lastpde != NULL)
3703 		*lastpde = pde;
3704 	*lastlvl = i;
3705 	return true;
3706 }
3707 
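/*
 * On success *lastlvl is 1 for a normal 4KB leaf, or the level at which
 * a large (PTE_PS) mapping was found (2 for 2MB/4MB pages); on failure
 * it is the level whose PDE was not present.  A minimal illustrative
 * caller (names hypothetical) mirrors pmap_extract() below:
 *
 *	pd_entry_t pde;
 *	int lvl;
 *
 *	if (pmap_pdes_valid(va, pdes, &pde, &lvl) && lvl == 2)
 *		pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
 */
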
3708 /*
3709  * pmap_extract: extract a PA for the given VA
3710  */
3711 bool
3712 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3713 {
3714 	pt_entry_t *ptes, pte;
3715 	pd_entry_t pde;
3716 	pd_entry_t * const *pdes;
3717 	struct pmap *pmap2;
3718 	paddr_t pa;
3719 	bool rv;
3720 	int lvl;
3721 
3722 	if (__predict_false(pmap->pm_extract != NULL)) {
3723 		return (*pmap->pm_extract)(pmap, va, pap);
3724 	}
3725 
3726 #ifdef __HAVE_DIRECT_MAP
3727 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3728 		if (pap != NULL) {
3729 			*pap = PMAP_DIRECT_UNMAP(va);
3730 		}
3731 		return true;
3732 	}
3733 #endif
3734 
3735 	rv = false;
3736 	pa = 0;
3737 
3738 	if (pmap != pmap_kernel()) {
3739 		mutex_enter(&pmap->pm_lock);
3740 	}
3741 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3742 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3743 		if (lvl == 2) {
3744 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3745 			rv = true;
3746 		} else {
3747 			KASSERT(lvl == 1);
3748 			pte = ptes[pl1_i(va)];
3749 			if (__predict_true((pte & PTE_P) != 0)) {
3750 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3751 				rv = true;
3752 			}
3753 		}
3754 	}
3755 	pmap_unmap_ptes(pmap, pmap2);
3756 	if (pmap != pmap_kernel()) {
3757 		mutex_exit(&pmap->pm_lock);
3758 	}
3759 	if (pap != NULL) {
3760 		*pap = pa;
3761 	}
3762 
3763 	return rv;
3764 }
3765 
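/*
 * Illustrative use from MD code (sketch only; 'kva' is a hypothetical
 * kernel virtual address):
 *
 *	paddr_t pa;
 *
 *	if (!pmap_extract(pmap_kernel(), kva, &pa))
 *		panic("%s: kva %#" PRIxVADDR " not mapped", __func__, kva);
 *
 * For user pmaps the lookup may legitimately fail, and since the
 * function takes pmap->pm_lock itself in that case, the caller must
 * not already hold it.
 */
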
3766 /*
3767  * vtophys: virtual address to physical address.  For use by
3768  * machine-dependent code only.
3769  */
3770 paddr_t
3771 vtophys(vaddr_t va)
3772 {
3773 	paddr_t pa;
3774 
3775 	if (pmap_extract(pmap_kernel(), va, &pa))
3776 		return pa;
3777 	return 0;
3778 }
3779 
3780 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3781 
3782 #ifdef XENPV
3783 /*
3784  * vtomach: virtual address to machine address.  For use by
3785  * machine-dependent code only.
3786  */
3787 paddr_t
3788 vtomach(vaddr_t va)
3789 {
3790 	paddr_t pa;
3791 
3792 	if (pmap_extract_ma(pmap_kernel(), va, &pa))
3793 		return pa;
3794 	return 0;
3795 }
3796 #endif
3797 
3798 /*
3799  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3800  * determine the bounds of the kernel virtual addess space.
3801  */
3802 void
3803 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3804 {
3805 	*startp = virtual_avail;
3806 	*endp = virtual_end;
3807 }
3808 
3809 void
3810 pmap_zero_page(paddr_t pa)
3811 {
3812 #if defined(__HAVE_DIRECT_MAP)
3813 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
3814 #else
3815 #if defined(XENPV)
3816 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3817 		xen_pagezero(pa);
		return;
	}
3818 #endif
3819 	struct cpu_info *ci;
3820 	pt_entry_t *zpte;
3821 	vaddr_t zerova;
3822 
3823 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3824 
3825 	kpreempt_disable();
3826 
3827 	ci = curcpu();
3828 	zerova = ci->vpage[VPAGE_ZER];
3829 	zpte = ci->vpage_pte[VPAGE_ZER];
3830 
3831 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3832 
3833 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3834 	pmap_pte_flush();
3835 	pmap_update_pg(zerova);		/* flush TLB */
3836 
3837 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
3838 
3839 #if defined(DIAGNOSTIC) || defined(XENPV)
3840 	pmap_pte_set(zpte, 0);				/* zap ! */
3841 	pmap_pte_flush();
3842 #endif
3843 
3844 	kpreempt_enable();
3845 #endif /* defined(__HAVE_DIRECT_MAP) */
3846 }
3847 
3848 void
3849 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3850 {
3851 #if defined(__HAVE_DIRECT_MAP)
3852 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3853 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3854 
3855 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3856 #else
3857 #if defined(XENPV)
3858 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3859 		xen_copy_page(srcpa, dstpa);
3860 		return;
3861 	}
3862 #endif
3863 	struct cpu_info *ci;
3864 	pt_entry_t *srcpte, *dstpte;
3865 	vaddr_t srcva, dstva;
3866 
3867 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
3868 
3869 	kpreempt_disable();
3870 
3871 	ci = curcpu();
3872 	srcva = ci->vpage[VPAGE_SRC];
3873 	dstva = ci->vpage[VPAGE_DST];
3874 	srcpte = ci->vpage_pte[VPAGE_SRC];
3875 	dstpte = ci->vpage_pte[VPAGE_DST];
3876 
3877 	KASSERT(*srcpte == 0 && *dstpte == 0);
3878 
3879 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3880 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
3881 	pmap_pte_flush();
3882 	pmap_update_pg(srcva);
3883 	pmap_update_pg(dstva);
3884 
3885 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3886 
3887 #if defined(DIAGNOSTIC) || defined(XENPV)
3888 	pmap_pte_set(srcpte, 0);
3889 	pmap_pte_set(dstpte, 0);
3890 	pmap_pte_flush();
3891 #endif
3892 
3893 	kpreempt_enable();
3894 #endif /* defined(__HAVE_DIRECT_MAP) */
3895 }
3896 
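/*
 * Without a direct map, pmap_zero_page() and pmap_copy_page() above
 * fall back on the per-CPU temporary VAs reserved at boot
 * (ci->vpage[VPAGE_ZER/VPAGE_SRC/VPAGE_DST]): install a throwaway PTE,
 * flush that single TLB entry, touch the page, and (on DIAGNOSTIC or
 * XENPV) zap the PTE again.  Preemption stays disabled for the whole
 * sequence so the per-CPU slot cannot be reused underneath us.
 */
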
3897 static pt_entry_t *
3898 pmap_map_ptp(struct vm_page *ptp)
3899 {
3900 #ifdef __HAVE_DIRECT_MAP
3901 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3902 #else
3903 	struct cpu_info *ci;
3904 	pt_entry_t *ptppte;
3905 	vaddr_t ptpva;
3906 
3907 	KASSERT(kpreempt_disabled());
3908 
3909 #ifndef XENPV
3910 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
3911 #else
3912 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
3913 #endif
3914 
3915 	ci = curcpu();
3916 	ptpva = ci->vpage[VPAGE_PTP];
3917 	ptppte = ci->vpage_pte[VPAGE_PTP];
3918 
3919 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3920 
3921 	pmap_pte_flush();
3922 	pmap_update_pg(ptpva);
3923 
3924 	return (pt_entry_t *)ptpva;
3925 #endif
3926 }
3927 
3928 static void
3929 pmap_unmap_ptp(void)
3930 {
3931 #ifndef __HAVE_DIRECT_MAP
3932 #if defined(DIAGNOSTIC) || defined(XENPV)
3933 	struct cpu_info *ci;
3934 	pt_entry_t *pte;
3935 
3936 	KASSERT(kpreempt_disabled());
3937 
3938 	ci = curcpu();
3939 	pte = ci->vpage_pte[VPAGE_PTP];
3940 
3941 	if (*pte != 0) {
3942 		pmap_pte_set(pte, 0);
3943 		pmap_pte_flush();
3944 	}
3945 #endif
3946 #endif
3947 }
3948 
3949 static pt_entry_t *
3950 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3951 {
3952 
3953 	KASSERT(kpreempt_disabled());
3954 	if (pmap_is_curpmap(pmap)) {
3955 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3956 	}
3957 	KASSERT(ptp != NULL);
3958 	return pmap_map_ptp(ptp) + pl1_pi(va);
3959 }
3960 
3961 static void
3962 pmap_unmap_pte(void)
3963 {
3964 
3965 	KASSERT(kpreempt_disabled());
3966 
3967 	pmap_unmap_ptp();
3968 }
3969 
3970 /*
3971  * p m a p   r e m o v e   f u n c t i o n s
3972  *
3973  * functions that remove mappings
3974  */
3975 
3976 /*
3977  * pmap_remove_ptes: remove PTEs from a PTP
3978  *
3979  * => caller must hold pmap's lock
3980  * => PTP must be mapped into KVA
3981  * => PTP should be null if pmap == pmap_kernel()
3982  * => must be called with kernel preemption disabled
3983  * => returns composite pte if at least one page should be shot down
3984  */
3985 static void
3986 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3987     vaddr_t startva, vaddr_t endva)
3988 {
3989 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3990 
3991 	KASSERT(mutex_owned(&pmap->pm_lock));
3992 	KASSERT(kpreempt_disabled());
3993 
3994 	/*
3995 	 * mappings are very often sparse, so clip the given range to the
3996 	 * range of PTEs that are known present in the PTP.
3997 	 */
3998 	pmap_ptp_range_clip(ptp, &startva, &pte);
3999 
4000 	/*
4001 	 * note that ptpva points to the PTE that maps startva.   this may
4002 	 * or may not be the first PTE in the PTP.
4003 	 *
4004 	 * we loop through the PTP while there are still PTEs to look at
4005 	 * and the wire_count is greater than 1 (because we use the wire_count
4006 	 * to keep track of the number of real PTEs in the PTP).
4007 	 */
4008 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4009 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4010 		startva += PAGE_SIZE;
4011 		pte++;
4012 	}
4013 }
4014 
4015 /*
4016  * pmap_remove_pte: remove a single PTE from a PTP.
4017  *
4018  * => caller must hold pmap's lock
4019  * => PTP must be mapped into KVA
4020  * => PTP should be null if pmap == pmap_kernel()
4021  * => returns true if we removed a mapping
4022  * => must be called with kernel preemption disabled
4023  */
4024 static bool
4025 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4026     vaddr_t va)
4027 {
4028 	struct pv_entry *pve;
4029 	struct vm_page *pg;
4030 	struct pmap_page *pp;
4031 	pt_entry_t opte;
4032 
4033 	KASSERT(mutex_owned(&pmap->pm_lock));
4034 	KASSERT(kpreempt_disabled());
4035 
4036 	if (!pmap_valid_entry(*pte)) {
4037 		/* VA not mapped. */
4038 		return false;
4039 	}
4040 
4041 	/* Atomically save the old PTE and zap it. */
4042 	opte = pmap_pte_testset(pte, 0);
4043 	if (!pmap_valid_entry(opte)) {
4044 		return false;
4045 	}
4046 
4047 	pmap_exec_account(pmap, va, opte, 0);
4048 	pmap_stats_update_bypte(pmap, 0, opte);
4049 
4050 	if (ptp) {
4051 		/*
4052 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4053 		 */
4054 		ptp->wire_count--;
4055 		if (ptp->wire_count <= 1) {
4056 			opte |= PTE_A;
4057 		}
4058 	}
4059 
4060 	if ((opte & PTE_A) != 0) {
4061 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4062 	}
4063 
4064 	/*
4065 	 * If we are not on a pv list - we are done.
4066 	 */
4067 	if ((opte & PTE_PVLIST) == 0) {
4068 #ifndef DOM0OPS
4069 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4070 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4071 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4072 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4073 #endif
4074 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4075 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4076 		return true;
4077 	}
4078 
4079 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4080 		pp = VM_PAGE_TO_PP(pg);
4081 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4082 		paddr_t pa = pmap_pte2pa(opte);
4083 		panic("%s: PTE_PVLIST with pv-untracked page"
4084 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4085 		    __func__, va, pa, atop(pa));
4086 	}
4087 
4088 	/* Sync R/M bits. */
4089 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4090 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4091 	return true;
4092 }
4093 
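/*
 * PTP accounting used above, for reference: ptp->wire_count is one for
 * the PTP itself plus one per valid PTE it holds, so "wire_count <= 1"
 * means the PTP is empty and can be freed (kernel PTPs are never freed,
 * hence ptp == NULL for pmap_kernel()).  Forcing PTE_A into opte when
 * the PTP empties ensures a shootdown is queued before the PTP can be
 * recycled.
 */
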
4094 static void
4095 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4096 {
4097 	pt_entry_t *ptes;
4098 	pd_entry_t pde;
4099 	pd_entry_t * const *pdes;
4100 	bool result;
4101 	vaddr_t blkendva, va = sva;
4102 	struct vm_page *ptp;
4103 	struct pmap *pmap2;
4104 	int lvl;
4105 
4106 	KASSERT(mutex_owned(&pmap->pm_lock));
4107 
4108 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4109 
4110 	/*
4111 	 * removing one page?  take shortcut function.
4112 	 */
4113 
4114 	if (va + PAGE_SIZE == eva) {
4115 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4116 			KASSERT(lvl == 1);
4117 
4118 			/* Get PTP if non-kernel mapping. */
4119 			if (pmap != pmap_kernel()) {
4120 				ptp = pmap_find_ptp(pmap, va, 1);
4121 				KASSERTMSG(ptp != NULL,
4122 				    "%s: unmanaged PTP detected", __func__);
4123 			} else {
4124 				/* Never free kernel PTPs. */
4125 				ptp = NULL;
4126 			}
4127 
4128 			result = pmap_remove_pte(pmap, ptp,
4129 			    &ptes[pl1_i(va)], va);
4130 
4131 			/*
4132 			 * if mapping removed and the PTP is no longer
4133 			 * being used, free it!
4134 			 */
4135 
4136 			if (result && ptp && ptp->wire_count <= 1)
4137 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4138 		}
4139 	} else for (/* null */ ; va < eva ; va = blkendva) {
4140 		/* determine range of block */
4141 		blkendva = x86_round_pdr(va+1);
4142 		if (blkendva > eva)
4143 			blkendva = eva;
4144 
4145 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4146 			/* Skip a range corresponding to an invalid pde. */
4147 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4148 			continue;
4149 		}
4150 		KASSERT(lvl == 1);
4151 
4152 		/* Get PTP if non-kernel mapping. */
4153 		if (pmap != pmap_kernel()) {
4154 			ptp = pmap_find_ptp(pmap, va, 1);
4155 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4156 			    __func__);
4157 		} else {
4158 			/* Never free kernel PTPs. */
4159 			ptp = NULL;
4160 		}
4161 
4162 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4163 		    blkendva);
4164 
4165 		/* If PTP is no longer being used, free it. */
4166 		if (ptp && ptp->wire_count <= 1) {
4167 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4168 		}
4169 	}
4170 	pmap_unmap_ptes(pmap, pmap2);
4171 	pmap_drain_pv(pmap);
4172 }
4173 
4174 /*
4175  * pmap_remove: mapping removal function.
4176  *
4177  * => caller should not be holding any pmap locks
4178  */
4179 void
4180 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4181 {
4182 	if (__predict_false(pmap->pm_remove != NULL)) {
4183 		(*pmap->pm_remove)(pmap, sva, eva);
4184 		return;
4185 	}
4186 
4187 	mutex_enter(&pmap->pm_lock);
4188 	pmap_remove_locked(pmap, sva, eva);
4189 	mutex_exit(&pmap->pm_lock);
4190 }
4191 
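/*
 * Typical caller pattern (sketch; 'pm' and 'va' are hypothetical):
 *
 *	pmap_remove(pm, va, va + PAGE_SIZE);
 *	pmap_update(pm);
 *
 * pmap_remove() only queues TLB shootdowns and defers freeing of empty
 * PTPs; the mapping is not guaranteed to be gone from other CPUs' TLBs
 * until the matching pmap_update() call.
 */
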
4192 /*
4193  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4194  *
4195  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4196  * => Caller should disable kernel preemption.
4197  * => issues tlb shootdowns if necessary.
4198  */
4199 static int
4200 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4201     pt_entry_t *optep)
4202 {
4203 	struct pmap *pmap;
4204 	struct vm_page *ptp;
4205 	vaddr_t va;
4206 	pt_entry_t *ptep;
4207 	pt_entry_t opte;
4208 	pt_entry_t npte;
4209 	pt_entry_t expect;
4210 	bool need_shootdown;
4211 
4212 	ptp = pvpte->pte_ptp;
4213 	va = pvpte->pte_va;
4214 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4215 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4216 	pmap = ptp_to_pmap(ptp);
4217 	KASSERT(kpreempt_disabled());
4218 
4219 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4220 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4221 		    optep);
4222 	}
4223 
4224 	expect = pmap_pa2pte(pa) | PTE_P;
4225 
4226 	if (clearbits != ~0) {
4227 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4228 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4229 	}
4230 
4231 	ptep = pmap_map_pte(pmap, ptp, va);
4232 	do {
4233 		opte = *ptep;
4234 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4235 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4236 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4237 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4238 			/*
4239 			 * We lost a race with a V->P operation like
4240 			 * pmap_remove().  Wait for the competitor
4241 			 * reflecting pte bits into mp_attrs.
4242 			 */
4243 			pmap_unmap_pte();
4244 			return EAGAIN;
4245 		}
4246 
4247 		/*
4248 		 * Check if there's anything to do on this PTE.
4249 		 */
4250 		if ((opte & clearbits) == 0) {
4251 			need_shootdown = false;
4252 			break;
4253 		}
4254 
4255 		/*
4256 		 * We need a shootdown if the PTE is cached (PTE_A) ...
4257 		 * ... Unless we are clearing only the PTE_W bit and
4258 		 * it isn't cached as RW (PTE_D).
4259 		 */
4260 		need_shootdown = (opte & PTE_A) != 0 &&
4261 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4262 
4263 		npte = opte & ~clearbits;
4264 
4265 		/*
4266 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4267 		 */
4268 		if (need_shootdown) {
4269 			npte &= ~(PTE_A | PTE_D);
4270 		}
4271 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4272 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4273 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4274 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4275 
4276 	if (need_shootdown) {
4277 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4278 	}
4279 	pmap_unmap_pte();
4280 
4281 	*oattrs = pmap_pte_to_pp_attrs(opte);
4282 	if (optep != NULL)
4283 		*optep = opte;
4284 	return 0;
4285 }
4286 
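/*
 * The CAS loop above is the core of the P->V side: read the PTE, clear
 * the requested bits in a local copy, and only install the result if
 * the PTE did not change in the meantime.  An EAGAIN return means a
 * V->P operation (pmap_remove() and friends) won the race; callers
 * such as pmap_pp_clear_attrs() react by taking and releasing the
 * owning pmap's lock and then restarting, which is enough to observe
 * the competitor's update to pp_attrs.
 */
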
4287 static void
4288 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4289     vaddr_t va)
4290 {
4291 	struct pmap *pmap2;
4292 	pt_entry_t *ptes;
4293 	pd_entry_t * const *pdes;
4294 
4295 	KASSERT(mutex_owned(&pmap->pm_lock));
4296 
4297 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4298 	pmap_stats_update_bypte(pmap, 0, opte);
4299 	ptp->wire_count--;
4300 	if (ptp->wire_count <= 1) {
4301 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4302 	}
4303 	pmap_unmap_ptes(pmap, pmap2);
4304 }
4305 
4306 static void
4307 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4308 {
4309 	struct pv_pte *pvpte;
4310 	struct vm_page *ptp;
4311 	uintptr_t sum;
4312 	uint8_t oattrs;
4313 	bool locked;
4314 
4315 	/*
4316 	 * Do an unlocked check to see if the page has no mappings, eg when
4317 	 * pmap_remove_all() was called before amap_wipeout() for a process
4318 	 * private amap - common.  The page being removed must be on the way
4319 	 * out, so we don't have to worry about concurrent attempts to enter
4320 	 * it (otherwise the caller either doesn't care or has screwed up).
4321 	 */
4322 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4323 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4324 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4325 	if (sum == 0) {
4326 		return;
4327 	}
4328 
4329 	kpreempt_disable();
4330 	for (;;) {
4331 		struct pmap *pmap;
4332 		struct pv_entry *pve;
4333 		pt_entry_t opte;
4334 		vaddr_t va;
4335 
4336 		mutex_spin_enter(&pp->pp_lock);
4337 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4338 			mutex_spin_exit(&pp->pp_lock);
4339 			break;
4340 		}
4341 
4342 		/*
4343 		 * Add a reference to the pmap before clearing the pte.
4344 		 * Otherwise the pmap can disappear behind us.
4345 		 */
4346 		ptp = pvpte->pte_ptp;
4347 		pmap = ptp_to_pmap(ptp);
4348 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4349 		if (ptp != NULL) {
4350 			pmap_reference(pmap);
4351 		}
4352 
4353 		/*
4354 		 * Now try to lock it.  We need a direct handoff between
4355 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4356 		 * and kept associated with this pmap.  If that can't be
4357 		 * had, wait for the pmap's lock to become free and then
4358 		 * retry.
4359 		 */
4360 		locked = mutex_tryenter(&pmap->pm_lock);
4361 		mutex_spin_exit(&pp->pp_lock);
4362 		if (!locked) {
4363 			mutex_enter(&pmap->pm_lock);
4364 			/* nothing, just wait for it */
4365 			mutex_exit(&pmap->pm_lock);
4366 			if (ptp != NULL) {
4367 				pmap_destroy(pmap);
4368 			}
4369 			continue;
4370 		}
4371 		va = pvpte->pte_va;
4372 
4373 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4374 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4375 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4376 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4377 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4378 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4379 
4380 #ifdef DEBUG
4381 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4382 		rb_tree_t *tree = (ptp != NULL ?
4383 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4384 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4385 		if (pve == NULL) {
4386 			KASSERTMSG(&pp->pp_pte == pvpte,
4387 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4388 			    va, pmap, ptp, pvpte, pve);
4389 		} else {
4390 			KASSERTMSG(&pve->pve_pte == pvpte,
4391 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4392 			    va, pmap, ptp, pvpte, pve);
4393 		}
4394 #endif
4395 
4396 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4397 			panic("pmap_pp_remove: mapping not present");
4398 		}
4399 
4400 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4401 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4402 
4403 		/* Update the PTP reference count. Free if last reference. */
4404 		if (ptp != NULL) {
4405 			KASSERT(pmap != pmap_kernel());
4406 			pmap_tlb_shootnow();
4407 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4408 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4409 			} else {
4410 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4411 			}
4412 		} else {
4413 			KASSERT(pmap == pmap_kernel());
4414 			pmap_stats_update_bypte(pmap, 0, opte);
4415 		}
4416 		pmap_tlb_shootnow();
4417 		pmap_drain_pv(pmap);
4418 		mutex_exit(&pmap->pm_lock);
4419 		if (ptp != NULL) {
4420 			pmap_destroy(pmap);
4421 		}
4422 	}
4423 	kpreempt_enable();
4424 }
4425 
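/*
 * Locking recap for the loop above: pp_lock is a spin mutex, so the
 * adaptive pm_lock can only be taken opportunistically (mutex_tryenter)
 * while it is held.  When that fails, the code drops pp_lock, waits by
 * taking and releasing pm_lock, drops the pmap reference it took, and
 * restarts from the head of the PV list.  The pmap_reference() taken
 * under pp_lock (for user pmaps) is what keeps the pmap alive across
 * the retry.
 */
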
4426 /*
4427  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4428  *
4429  * => R/M bits are sync'd back to attrs
4430  */
4431 void
4432 pmap_page_remove(struct vm_page *pg)
4433 {
4434 	struct pmap_page *pp;
4435 	paddr_t pa;
4436 
4437 	pp = VM_PAGE_TO_PP(pg);
4438 	pa = VM_PAGE_TO_PHYS(pg);
4439 	pmap_pp_remove(pp, pa);
4440 }
4441 
4442 /*
4443  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4444  * that map it
4445  */
4446 void
4447 pmap_pv_remove(paddr_t pa)
4448 {
4449 	struct pmap_page *pp;
4450 
4451 	pp = pmap_pv_tracked(pa);
4452 	if (pp == NULL)
4453 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4454 	pmap_pp_remove(pp, pa);
4455 }
4456 
4457 /*
4458  * p m a p   a t t r i b u t e  f u n c t i o n s
4459  * functions that test/change managed page's attributes
4460  * since a page can be mapped multiple times we must check each PTE that
4461  * maps it by going down the pv lists.
4462  */
4463 
4464 /*
4465  * pmap_test_attrs: test a page's attributes
4466  */
4467 bool
4468 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4469 {
4470 	struct pmap_page *pp;
4471 	struct pv_pte *pvpte;
4472 	struct pmap *pmap;
4473 	uint8_t oattrs;
4474 	u_int result;
4475 	paddr_t pa;
4476 
4477 	pp = VM_PAGE_TO_PP(pg);
4478 	if ((pp->pp_attrs & testbits) != 0) {
4479 		return true;
4480 	}
4481 	pa = VM_PAGE_TO_PHYS(pg);
4482  startover:
4483 	mutex_spin_enter(&pp->pp_lock);
4484 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4485 		if ((pp->pp_attrs & testbits) != 0) {
4486 			break;
4487 		}
4488 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4489 			/*
4490 			 * raced with a V->P operation.  wait for the other
4491 		 * side to finish by acquiring pmap's lock.  if no
4492 			 * wait, updates to pp_attrs by the other side may
4493 			 * go unseen.
4494 			 */
4495 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4496 			pmap_reference(pmap);
4497 			mutex_spin_exit(&pp->pp_lock);
4498 			mutex_enter(&pmap->pm_lock);
4499 			/* nothing. */
4500 			mutex_exit(&pmap->pm_lock);
4501 			pmap_destroy(pmap);
4502 			goto startover;
4503 		}
4504 		pp->pp_attrs |= oattrs;
4505 	}
4506 	result = pp->pp_attrs & testbits;
4507 	mutex_spin_exit(&pp->pp_lock);
4508 
4509 	/*
4510 	 * note that we break out of the for loop as soon as the bits
4511 	 * being tested for show up in pp_attrs.
4512 	 */
4513 
4514 	return result != 0;
4515 }
4516 
4517 static bool
4518 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4519 {
4520 	struct pv_pte *pvpte;
4521 	struct pmap *pmap;
4522 	uint8_t oattrs;
4523 	u_int result;
4524 
4525 startover:
4526 	mutex_spin_enter(&pp->pp_lock);
4527 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4528 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4529 			/*
4530 			 * raced with a V->P operation.  wait for the other
4531 		 * side to finish by acquiring pmap's lock.  it is
4532 			 * probably unmapping the page, and it will be gone
4533 			 * when the loop is restarted.
4534 			 */
4535 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4536 			pmap_reference(pmap);
4537 			mutex_spin_exit(&pp->pp_lock);
4538 			mutex_enter(&pmap->pm_lock);
4539 			/* nothing. */
4540 			mutex_exit(&pmap->pm_lock);
4541 			pmap_destroy(pmap);
4542 			goto startover;
4543 		}
4544 		pp->pp_attrs |= oattrs;
4545 	}
4546 	result = pp->pp_attrs & clearbits;
4547 	pp->pp_attrs &= ~clearbits;
4548 	pmap_tlb_shootnow();
4549 	mutex_spin_exit(&pp->pp_lock);
4550 
4551 	return result != 0;
4552 }
4553 
4554 /*
4555  * pmap_clear_attrs: clear the specified attribute for a page.
4556  *
4557  * => we return true if we cleared one of the bits we were asked to
4558  */
4559 bool
4560 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4561 {
4562 	struct pmap_page *pp;
4563 	paddr_t pa;
4564 
4565 	pp = VM_PAGE_TO_PP(pg);
4566 	pa = VM_PAGE_TO_PHYS(pg);
4567 
4568 	/*
4569 	 * If this is a new page, assert it has no mappings and simply zap
4570 	 * the stored attributes without taking any locks.
4571 	 */
4572 	if ((pg->flags & PG_FAKE) != 0) {
4573 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4574 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4575 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4576 		atomic_store_relaxed(&pp->pp_attrs, 0);
4577 		return false;
4578 	} else {
4579 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4580 	}
4581 }
4582 
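/*
 * The MI interface reaches this through thin wrappers; as a sketch
 * (the exact macros live in pmap.h), clearing the modified or
 * referenced state of a page looks roughly like:
 *
 *	(void)pmap_clear_attrs(pg, PP_ATTRS_D);	   (pmap_clear_modify)
 *	(void)pmap_clear_attrs(pg, PP_ATTRS_A);	   (pmap_clear_reference)
 */
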
4583 /*
4584  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4585  * pv-tracked page.
4586  */
4587 bool
4588 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4589 {
4590 	struct pmap_page *pp;
4591 
4592 	pp = pmap_pv_tracked(pa);
4593 	if (pp == NULL)
4594 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4595 
4596 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4597 }
4598 
4599 /*
4600  * p m a p   p r o t e c t i o n   f u n c t i o n s
4601  */
4602 
4603 /*
4604  * pmap_page_protect: change the protection of all recorded mappings
4605  * of a managed page
4606  *
4607  * => NOTE: this is an inline function in pmap.h
4608  */
4609 
4610 /* see pmap.h */
4611 
4612 /*
4613  * pmap_pv_protect: change the protection of all recorded mappings
4614  * of an unmanaged pv-tracked page
4615  *
4616  * => NOTE: this is an inline function in pmap.h
4617  */
4618 
4619 /* see pmap.h */
4620 
4621 /*
4622  * pmap_protect: set the protection of the pages in a pmap
4623  *
4624  * => NOTE: this is an inline function in pmap.h
4625  */
4626 
4627 /* see pmap.h */
4628 
4629 /*
4630  * pmap_write_protect: write-protect pages in a pmap.
4631  *
4632  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4633  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4634  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4635  * present the page will still be considered as a kernel page, and the privilege
4636  * separation will be enforced correctly.
4637  */
4638 void
4639 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4640 {
4641 	pt_entry_t bit_rem, bit_put;
4642 	pt_entry_t *ptes;
4643 	pd_entry_t * const *pdes;
4644 	struct pmap *pmap2;
4645 	vaddr_t blockend, va;
4646 	int lvl, i;
4647 
4648 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4649 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4650 		return;
4651 	}
4652 
4653 	bit_rem = 0;
4654 	if (!(prot & VM_PROT_WRITE))
4655 		bit_rem = PTE_W;
4656 
4657 	bit_put = 0;
4658 	if (!(prot & VM_PROT_EXECUTE))
4659 		bit_put = pmap_pg_nx;
4660 
4661 	sva &= ~PAGE_MASK;
4662 	eva &= ~PAGE_MASK;
4663 
4664 	/*
4665 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4666 	 * be touching PV entries nor stats and kernel PDEs aren't
4667 	 * freed.
4668 	 */
4669 	if (pmap != pmap_kernel()) {
4670 		mutex_enter(&pmap->pm_lock);
4671 	}
4672 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4673 
4674 	for (va = sva ; va < eva; va = blockend) {
4675 		pt_entry_t *spte, *epte;
4676 
4677 		blockend = x86_round_pdr(va + 1);
4678 		if (blockend > eva)
4679 			blockend = eva;
4680 
4681 		/* Is it a valid block? */
4682 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4683 			continue;
4684 		}
4685 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4686 		KASSERT(lvl == 1);
4687 
4688 		spte = &ptes[pl1_i(va)];
4689 		epte = &ptes[pl1_i(blockend)];
4690 
4691 		for (i = 0; spte < epte; spte++, i++) {
4692 			pt_entry_t opte, npte;
4693 
4694 			do {
4695 				opte = *spte;
4696 				if (!pmap_valid_entry(opte)) {
4697 					goto next;
4698 				}
4699 				npte = (opte & ~bit_rem) | bit_put;
4700 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4701 
4702 			if ((opte & PTE_D) != 0) {
4703 				vaddr_t tva = va + x86_ptob(i);
4704 				pmap_tlb_shootdown(pmap, tva, opte,
4705 				    TLBSHOOT_WRITE_PROTECT);
4706 			}
4707 next:;
4708 		}
4709 	}
4710 
4711 	/* Release pmap. */
4712 	pmap_unmap_ptes(pmap, pmap2);
4713 	if (pmap != pmap_kernel()) {
4714 		mutex_exit(&pmap->pm_lock);
4715 	}
4716 }
4717 
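/*
 * Illustrative use (sketch; 'sva'/'eva' hypothetical, page aligned):
 * revoking write access to a range of kernel memory would look roughly
 * like
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);
 *	pmap_update(pmap_kernel());
 *
 * Only PTE_W and the NX bit are touched; PV lists and statistics are
 * left alone, which is why no lock is needed for the kernel pmap here.
 */
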
4718 /*
4719  * pmap_unwire: clear the wired bit in the PTE.
4720  *
4721  * => Mapping should already be present.
4722  */
4723 void
4724 pmap_unwire(struct pmap *pmap, vaddr_t va)
4725 {
4726 	pt_entry_t *ptes, *ptep, opte;
4727 	pd_entry_t * const *pdes;
4728 	struct pmap *pmap2;
4729 	int lvl;
4730 
4731 	if (__predict_false(pmap->pm_unwire != NULL)) {
4732 		(*pmap->pm_unwire)(pmap, va);
4733 		return;
4734 	}
4735 
4736 	/*
4737 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4738 	 * statistics.
4739 	 */
4740 	mutex_enter(&pmap->pm_lock);
4741 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4742 
4743 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4744 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4745 	}
4746 	KASSERT(lvl == 1);
4747 
4748 	ptep = &ptes[pl1_i(va)];
4749 	opte = *ptep;
4750 	KASSERT(pmap_valid_entry(opte));
4751 
4752 	if (opte & PTE_WIRED) {
4753 		pt_entry_t npte = opte & ~PTE_WIRED;
4754 
4755 		opte = pmap_pte_testset(ptep, npte);
4756 		pmap_stats_update_bypte(pmap, npte, opte);
4757 	} else {
4758 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4759 		    " did not change!\n", __func__, pmap, va);
4760 	}
4761 
4762 	/* Release pmap. */
4763 	pmap_unmap_ptes(pmap, pmap2);
4764 	mutex_exit(&pmap->pm_lock);
4765 }
4766 
4767 /*
4768  * pmap_copy: copy mappings from one pmap to another
4769  *
4770  * => optional function
4771  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4772  */
4773 
4774 /*
4775  * defined as macro in pmap.h
4776  */
4777 
4778 __strict_weak_alias(pmap_enter, pmap_enter_default);
4779 
4780 int
4781 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4782     u_int flags)
4783 {
4784 	if (__predict_false(pmap->pm_enter != NULL)) {
4785 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4786 	}
4787 
4788 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4789 }
4790 
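/*
 * Illustrative caller pattern for pmap_enter() (sketch only; 'pm',
 * 'va' and 'pa' are hypothetical).  The flags argument carries the
 * access type plus modifiers such as PMAP_WIRED and PMAP_CANFAIL:
 *
 *	error = pmap_enter(pm, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_CANFAIL | VM_PROT_READ | VM_PROT_WRITE);
 *	if (error != 0)
 *		return error;		hand the shortage back to the caller
 *
 * Without PMAP_CANFAIL, a PTP or pv_entry shortage ends in panic, as
 * can be seen in pmap_enter_ma() below.
 */
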
4791 /*
4792  * pmap_enter: enter a mapping into a pmap
4793  *
4794  * => must be done "now" ... no lazy-evaluation
4795  */
4796 int
4797 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4798 	   vm_prot_t prot, u_int flags, int domid)
4799 {
4800 	pt_entry_t *ptes, opte, npte;
4801 	pt_entry_t *ptep;
4802 	pd_entry_t * const *pdes;
4803 	struct vm_page *ptp;
4804 	struct vm_page *new_pg, *old_pg;
4805 	struct pmap_page *new_pp, *old_pp;
4806 	struct pv_entry *old_pve, *new_pve;
4807 	bool wired = (flags & PMAP_WIRED) != 0;
4808 	struct pmap *pmap2;
4809 	struct pmap_ptparray pt;
4810 	int error;
4811 	bool getptp, samepage, new_embedded;
4812 	rb_tree_t *tree;
4813 
4814 	KASSERT(pmap_initialized);
4815 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4816 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4817 	    PRIxVADDR " over PDP!", __func__, va);
4818 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4819 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4820 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4821 
4822 #ifdef XENPV
4823 	KASSERT(domid == DOMID_SELF || pa == 0);
4824 #endif
4825 
4826 	npte = ma | protection_codes[prot] | PTE_P;
4827 	npte |= pmap_pat_flags(flags);
4828 	if (wired)
4829 	        npte |= PTE_WIRED;
4830 	if (va < VM_MAXUSER_ADDRESS)
4831 		npte |= PTE_U;
4832 
4833 	if (pmap == pmap_kernel())
4834 		npte |= pmap_pg_g;
4835 	if (flags & VM_PROT_ALL) {
4836 		npte |= PTE_A;
4837 		if (flags & VM_PROT_WRITE) {
4838 			KASSERT((npte & PTE_W) != 0);
4839 			npte |= PTE_D;
4840 		}
4841 	}
4842 
4843 #ifdef XENPV
4844 	if (domid != DOMID_SELF)
4845 		new_pg = NULL;
4846 	else
4847 #endif
4848 		new_pg = PHYS_TO_VM_PAGE(pa);
4849 
4850 	if (new_pg != NULL) {
4851 		/* This is a managed page */
4852 		npte |= PTE_PVLIST;
4853 		new_pp = VM_PAGE_TO_PP(new_pg);
4854 		PMAP_CHECK_PP(new_pp);
4855 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4856 		/* This is an unmanaged pv-tracked page */
4857 		npte |= PTE_PVLIST;
4858 		PMAP_CHECK_PP(new_pp);
4859 	} else {
4860 		new_pp = NULL;
4861 	}
4862 
4863 	/* Begin by locking the pmap. */
4864 	mutex_enter(&pmap->pm_lock);
4865 
4866 	/* Look up the PTP.  Allocate if none present. */
4867 	ptp = NULL;
4868 	getptp = false;
4869 	if (pmap != pmap_kernel()) {
4870 		ptp = pmap_find_ptp(pmap, va, 1);
4871 		if (ptp == NULL) {
4872 			getptp = true;
4873 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
4874 			if (error != 0) {
4875 				if (flags & PMAP_CANFAIL) {
4876 					mutex_exit(&pmap->pm_lock);
4877 					return error;
4878 				}
4879 				panic("%s: get ptp failed, error=%d", __func__,
4880 				    error);
4881 			}
4882 		}
4883 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
4884 	} else {
4885 		/* Embedded PV entries rely on this. */
4886 		KASSERT(va != 0);
4887 		tree = &pmap_kernel_rb;
4888 	}
4889 
4890 	/*
4891 	 * Look up the old PV entry at this VA (if any), and insert a new PV
4892 	 * entry if required for the new mapping.  Temporarily track the old
4893 	 * and new mappings concurrently.  Only after the old mapping is
4894 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
4895 	 * our picture of modified/accessed state for either page could get
4896 	 * out of sync (we need any P->V operation for either page to stall
4897 	 * on pmap->pm_lock until done here).
4898 	 */
4899 	new_pve = NULL;
4900 	old_pve = NULL;
4901 	samepage = false;
4902 	new_embedded = false;
4903 
4904 	if (new_pp != NULL) {
4905 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
4906 		    &old_pve, &samepage, &new_embedded, tree);
4907 
4908 		/*
4909 		 * If a new pv_entry was needed and none was available, we
4910 		 * can go no further.
4911 		 */
4912 		if (error != 0) {
4913 			if (flags & PMAP_CANFAIL) {
4914 				if (getptp) {
4915 					pmap_unget_ptp(pmap, &pt);
4916 				}
4917 				mutex_exit(&pmap->pm_lock);
4918 				return error;
4919 			}
4920 			panic("%s: alloc pve failed", __func__);
4921 		}
4922 	} else {
4923 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4924 	}
4925 
4926 	/* Map PTEs into address space. */
4927 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4928 
4929 	/* Install any newly allocated PTPs. */
4930 	if (getptp) {
4931 		pmap_install_ptp(pmap, &pt, va, pdes);
4932 	}
4933 
4934 	/* Check if there is an existing mapping. */
4935 	ptep = &ptes[pl1_i(va)];
4936 	opte = *ptep;
4937 	bool have_oldpa = pmap_valid_entry(opte);
4938 	paddr_t oldpa = pmap_pte2pa(opte);
4939 
4940 	/*
4941 	 * Update the pte.
4942 	 */
4943 	do {
4944 		opte = *ptep;
4945 
4946 		/*
4947 		 * if the same page, inherit PTE_A and PTE_D.
4948 		 */
4949 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4950 			npte |= opte & (PTE_A | PTE_D);
4951 		}
4952 #if defined(XENPV)
4953 		if (domid != DOMID_SELF) {
4954 			/* pmap_pte_cas with error handling */
4955 			int s = splvm();
4956 			if (opte != *ptep) {
4957 				splx(s);
4958 				continue;
4959 			}
4960 			error = xpq_update_foreign(
4961 			    vtomach((vaddr_t)ptep), npte, domid, flags);
4962 			splx(s);
4963 			if (error) {
4964 				/* Undo pv_entry tracking - oof. */
4965 				if (new_pp != NULL) {
4966 					mutex_spin_enter(&new_pp->pp_lock);
4967 					if (new_pve != NULL) {
4968 						LIST_REMOVE(new_pve, pve_list);
4969 						KASSERT(pmap->pm_pve == NULL);
4970 						pmap->pm_pve = new_pve;
4971 					} else if (new_embedded) {
4972 						new_pp->pp_pte.pte_ptp = NULL;
4973 						new_pp->pp_pte.pte_va = 0;
4974 					}
4975 					mutex_spin_exit(&new_pp->pp_lock);
4976 				}
4977 				pmap_unmap_ptes(pmap, pmap2);
4978 				/* Free new PTP. */
4979 				if (ptp != NULL && ptp->wire_count <= 1) {
4980 					pmap_free_ptp(pmap, ptp, va, ptes,
4981 					    pdes);
4982 				}
4983 				mutex_exit(&pmap->pm_lock);
4984 				return error;
4985 			}
4986 			break;
4987 		}
4988 #endif /* defined(XENPV) */
4989 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4990 
4991 	/*
4992 	 * Done with the PTEs: they can now be unmapped.
4993 	 */
4994 	pmap_unmap_ptes(pmap, pmap2);
4995 
4996 	/*
4997 	 * Update statistics and PTP's reference count.
4998 	 */
4999 	pmap_stats_update_bypte(pmap, npte, opte);
5000 	if (ptp != NULL) {
5001 		if (!have_oldpa) {
5002 			ptp->wire_count++;
5003 		}
5004 		/* Remember minimum VA in PTP. */
5005 		pmap_ptp_range_set(ptp, va);
5006 	}
5007 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5008 
5009 	/*
5010 	 * If the same page, we can skip pv_entry handling.
5011 	 */
5012 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5013 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5014 		if ((npte & PTE_PVLIST) != 0) {
5015 			KASSERT(samepage);
5016 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5017 		}
5018 		goto same_pa;
5019 	} else if ((npte & PTE_PVLIST) != 0) {
5020 		KASSERT(!samepage);
5021 	}
5022 
5023 	/*
5024 	 * If old page is pv-tracked, remove pv_entry from its list.
5025 	 */
5026 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5027 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5028 			old_pp = VM_PAGE_TO_PP(old_pg);
5029 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5030 			panic("%s: PTE_PVLIST with pv-untracked page"
5031 			    " va = %#"PRIxVADDR
5032 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5033 			    __func__, va, oldpa, atop(oldpa));
5034 		}
5035 
5036 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5037 		    pmap_pte_to_pp_attrs(opte));
5038 	} else {
5039 		KASSERT(old_pve == NULL);
5040 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5041 	}
5042 
5043 	/*
5044 	 * If new page is dynamically PV tracked, insert to tree.
5045 	 */
5046 	if (new_pve != NULL) {
5047 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5048 		old_pve = rb_tree_insert_node(tree, new_pve);
5049 		KASSERT(old_pve == new_pve);
5050 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5051 	}
5052 
5053 same_pa:
5054 	/*
5055 	 * shootdown tlb if necessary.
5056 	 */
5057 
5058 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5059 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5060 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5061 	}
5062 	pmap_drain_pv(pmap);
5063 	mutex_exit(&pmap->pm_lock);
5064 	return 0;
5065 }
5066 
5067 #if defined(XEN) && defined(DOM0OPS)
5068 
5069 struct pmap_data_gnt {
5070 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5071 	vaddr_t pd_gnt_sva;
5072 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5073 	int pd_gnt_refs; /* ref counter */
5074 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5075 };
5076 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5077 
5078 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5079 
5080 static struct pmap_data_gnt *
5081 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5082 {
5083 	struct pmap_data_gnt_head *headp;
5084 	struct pmap_data_gnt *pgnt;
5085 
5086 	KASSERT(mutex_owned(&pmap->pm_lock));
5087 	headp = pmap->pm_data;
5088 	KASSERT(headp != NULL);
5089 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5090 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5091 			return pgnt;
5092 		/* check that we're not overlapping part of a region */
5093 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5094 	}
5095 	return NULL;
5096 }
5097 
5098 static void
5099 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5100     const struct gnttab_map_grant_ref *ops)
5101 {
5102 	struct pmap_data_gnt_head *headp;
5103 	struct pmap_data_gnt *pgnt;
5104 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5105 	KASSERT(mutex_owned(&pmap->pm_lock));
5106 	KASSERT(nentries >= 1);
5107 	if (pmap->pm_remove == NULL) {
5108 		pmap->pm_remove = pmap_remove_gnt;
5109 		KASSERT(pmap->pm_data == NULL);
5110 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5111 		SLIST_INIT(headp);
5112 		pmap->pm_data = headp;
5113 	} else {
5114 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5115 		KASSERT(pmap->pm_data != NULL);
5116 		headp = pmap->pm_data;
5117 	}
5118 
5119 	pgnt = pmap_find_gnt(pmap, sva, eva);
5120 	if (pgnt != NULL) {
5121 		KASSERT(pgnt->pd_gnt_sva == sva);
5122 		KASSERT(pgnt->pd_gnt_eva == eva);
5123 		return;
5124 	}
5125 
5126 	/* new entry */
5127 	pgnt = kmem_alloc(sizeof(*pgnt) +
5128 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5129 	pgnt->pd_gnt_sva = sva;
5130 	pgnt->pd_gnt_eva = eva;
5131 	pgnt->pd_gnt_refs = 0;
5132 	memcpy(pgnt->pd_gnt_ops, ops,
5133 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5134 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5135 }
5136 
5137 static void
5138 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5139 {
5140 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5141 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5142 	KASSERT(nentries >= 1);
5143 	KASSERT(mutex_owned(&pmap->pm_lock));
5144 	KASSERT(pgnt->pd_gnt_refs == 0);
5145 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5146 	kmem_free(pgnt, sizeof(*pgnt) +
5147 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5148 	if (SLIST_EMPTY(headp)) {
5149 		kmem_free(headp, sizeof(*headp));
5150 		pmap->pm_data = NULL;
5151 		pmap->pm_remove = NULL;
5152 	}
5153 }
5154 
5155 /*
5156  * pmap_enter_gnt: enter a grant entry into a pmap
5157  *
5158  * => must be done "now" ... no lazy-evaluation
5159  */
5160 int
5161 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5162     const struct gnttab_map_grant_ref *oops)
5163 {
5164 	struct pmap_data_gnt *pgnt;
5165 	pt_entry_t *ptes, opte;
5166 	pt_entry_t *ptep;
5167 	pd_entry_t * const *pdes;
5168 	struct vm_page *ptp;
5169 	struct vm_page *old_pg;
5170 	struct pmap_page *old_pp;
5171 	struct pv_entry *old_pve;
5172 	struct pmap *pmap2;
5173 	struct pmap_ptparray pt;
5174 	int error;
5175 	bool getptp;
5176 	rb_tree_t *tree;
5177 	struct gnttab_map_grant_ref *op;
5178 	int ret;
5179 	int idx;
5180 
5181 	KASSERT(pmap_initialized);
5182 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5183 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5184 	    PRIxVADDR " over PDP!", __func__, va);
5185 	KASSERT(pmap != pmap_kernel());
5186 
5187 	/* Begin by locking the pmap. */
5188 	mutex_enter(&pmap->pm_lock);
5189 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5190 
5191 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5192 	KASSERT(pgnt != NULL);
5193 
5194 	/* Look up the PTP.  Allocate if none present. */
5195 	ptp = NULL;
5196 	getptp = false;
5197 	ptp = pmap_find_ptp(pmap, va, 1);
5198 	if (ptp == NULL) {
5199 		getptp = true;
5200 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5201 		if (error != 0) {
5202 			mutex_exit(&pmap->pm_lock);
5203 			return error;
5204 		}
5205 	}
5206 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5207 
5208 	/*
5209 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5210 	 * entry if required for the new mapping.  Temporarily track the old
5211 	 * and new mappings concurrently.  Only after the old mapping is
5212 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5213 	 * our picture of modified/accessed state for either page could get
5214 	 * out of sync (we need any P->V operation for either page to stall
5215 	 * on pmap->pm_lock until done here).
5216 	 */
5217 	old_pve = NULL;
5218 
5219 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5220 
5221 	/* Map PTEs into address space. */
5222 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5223 
5224 	/* Install any newly allocated PTPs. */
5225 	if (getptp) {
5226 		pmap_install_ptp(pmap, &pt, va, pdes);
5227 	}
5228 
5229 	/* Check if there is an existing mapping. */
5230 	ptep = &ptes[pl1_i(va)];
5231 	opte = *ptep;
5232 	bool have_oldpa = pmap_valid_entry(opte);
5233 	paddr_t oldpa = pmap_pte2pa(opte);
5234 
5235 	/*
5236 	 * Update the pte.
5237 	 */
5238 
5239 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5240 	op = &pgnt->pd_gnt_ops[idx];
5241 
5242 #ifdef XENPV /* XXX */
5243 	op->host_addr = xpmap_ptetomach(ptep);
5244 #endif
5245 	op->dev_bus_addr = 0;
5246 	op->status = GNTST_general_error;
5247 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5248 	if (__predict_false(ret)) {
5249 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5250 		    __func__, ret);
5251 		op->status = GNTST_general_error;
5252 	}
5253 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5254 		kpause("gntmap", false, mstohz(1), NULL);
5255 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5256 		if (__predict_false(ret)) {
5257 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5258 			    __func__, ret);
5259 			op->status = GNTST_general_error;
5260 		}
5261 	}
5262 	if (__predict_false(op->status != GNTST_okay)) {
5263 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5264 		    __func__, op->status);
5265 		if (have_oldpa) {
5266 			ptp->wire_count--;
5267 		}
5268 	} else {
5269 		pgnt->pd_gnt_refs++;
5270 		if (!have_oldpa) {
5271 			ptp->wire_count++;
5272 		}
5273 		KASSERT(ptp->wire_count > 1);
5274 		/* Remember minimum VA in PTP. */
5275 		pmap_ptp_range_set(ptp, va);
5276 	}
5277 	if (ptp->wire_count <= 1)
5278 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5279 
5280 	/*
5281 	 * Done with the PTEs: they can now be unmapped.
5282 	 */
5283 	pmap_unmap_ptes(pmap, pmap2);
5284 
5285 	/*
5286 	 * Update statistics and PTP's reference count.
5287 	 */
5288 	pmap_stats_update_bypte(pmap, 0, opte);
5289 
5290 	/*
5291 	 * If old page is pv-tracked, remove pv_entry from its list.
5292 	 */
5293 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5294 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5295 			old_pp = VM_PAGE_TO_PP(old_pg);
5296 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5297 			panic("%s: PTE_PVLIST with pv-untracked page"
5298 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5299 			    __func__, va, oldpa);
5300 		}
5301 
5302 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5303 		    pmap_pte_to_pp_attrs(opte));
5304 	} else {
5305 		KASSERT(old_pve == NULL);
5306 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5307 	}
5308 
5309 	pmap_drain_pv(pmap);
5310 	mutex_exit(&pmap->pm_lock);
5311 	return op->status;
5312 }
5313 
5314 /*
5315  * pmap_remove_gnt: grant mapping removal function.
5316  *
5317  * => caller should not be holding any pmap locks
5318  */
5319 static void
5320 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5321 {
5322 	struct pmap_data_gnt *pgnt;
5323 	pt_entry_t *ptes;
5324 	pd_entry_t pde;
5325 	pd_entry_t * const *pdes;
5326 	struct vm_page *ptp;
5327 	struct pmap *pmap2;
5328 	vaddr_t va;
5329 	int lvl;
5330 	int idx;
5331 	struct gnttab_map_grant_ref *op;
5332 	struct gnttab_unmap_grant_ref unmap_op;
5333 	int ret;
5334 
5335 	KASSERT(pmap != pmap_kernel());
5336 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5337 
5338 	mutex_enter(&pmap->pm_lock);
5339 	for (va = sva; va < eva; va += PAGE_SIZE) {
5340 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5341 		if (pgnt == NULL) {
5342 			pmap_remove_locked(pmap, sva, eva);
5343 			continue;
5344 		}
5345 
5346 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5347 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5348 			panic("pmap_remove_gnt pdes not valid");
5349 		}
5350 
5351 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5352 		op = &pgnt->pd_gnt_ops[idx];
5353 		KASSERT(lvl == 1);
5354 		KASSERT(op->status == GNTST_okay);
5355 
5356 		/* Get PTP if non-kernel mapping. */
5357 		ptp = pmap_find_ptp(pmap, va, 1);
5358 		KASSERTMSG(ptp != NULL,
5359 		    "%s: unmanaged PTP detected", __func__);
5360 
5361 		if (op->status == GNTST_okay)  {
5362 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5363 			unmap_op.handle = op->handle;
5364 			unmap_op.dev_bus_addr = 0;
5365 #ifdef XENPV /* XXX */
5366 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5367 #endif
5368 			ret = HYPERVISOR_grant_table_op(
5369 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5370 			if (ret) {
5371 				printf("%s: GNTTABOP_unmap_grant_ref "
5372 				    "failed: %d\n", __func__, ret);
5373 			}
5374 
5375 			ptp->wire_count--;
5376 			pgnt->pd_gnt_refs--;
5377 			if (pgnt->pd_gnt_refs == 0) {
5378 				pmap_free_gnt(pmap, pgnt);
5379 			}
5380 		}
5381 		/*
5382 		 * if mapping removed and the PTP is no longer
5383 		 * being used, free it!
5384 		 */
5385 
5386 		if (ptp->wire_count <= 1)
5387 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5388 		pmap_unmap_ptes(pmap, pmap2);
5389 	}
5390 	mutex_exit(&pmap->pm_lock);
5391 }
5392 #endif /* XEN && DOM0OPS */
5393 
5394 paddr_t
5395 pmap_get_physpage(void)
5396 {
5397 	struct vm_page *ptp;
5398 	struct pmap *kpm = pmap_kernel();
5399 	paddr_t pa;
5400 
5401 	if (!uvm.page_init_done) {
5402 		/*
5403 		 * We're growing the kernel pmap early (from
5404 		 * uvm_pageboot_alloc()). This case must be
5405 		 * handled a little differently.
5406 		 */
5407 
5408 		if (!uvm_page_physget(&pa))
5409 			panic("%s: out of memory", __func__);
5410 #if defined(__HAVE_DIRECT_MAP)
5411 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5412 #else
5413 #if defined(XENPV)
5414 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5415 			xen_pagezero(pa);
5416 			return pa;
5417 		}
5418 #endif
5419 		kpreempt_disable();
5420 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5421 		    PTE_W | pmap_pg_nx);
5422 		pmap_pte_flush();
5423 		pmap_update_pg((vaddr_t)early_zerop);
5424 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5425 #if defined(DIAGNOSTIC) || defined(XENPV)
5426 		pmap_pte_set(early_zero_pte, 0);
5427 		pmap_pte_flush();
5428 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5429 		kpreempt_enable();
5430 #endif /* defined(__HAVE_DIRECT_MAP) */
5431 	} else {
5432 		/* XXX */
5433 		ptp = uvm_pagealloc(NULL, 0, NULL,
5434 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5435 		if (ptp == NULL)
5436 			panic("%s: out of memory", __func__);
5437 		ptp->flags &= ~PG_BUSY;
5438 		ptp->wire_count = 1;
5439 		pa = VM_PAGE_TO_PHYS(ptp);
5440 	}
5441 	pmap_stats_update(kpm, 1, 0);
5442 
5443 	return pa;
5444 }
5445 
5446 /*
5447  * Expand the page tree with the specified amount of PTPs, mapping virtual
5448  * addresses starting at kva. We populate all the levels but the last one
5449  * (L1). The nodes of the tree are created as RW, but the pages covered
5450  * will be kentered in L1, with proper permissions.
5451  *
5452  * Used only by pmap_growkernel.
5453  */
5454 static void
5455 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5456 {
5457 	unsigned long i;
5458 	paddr_t pa;
5459 	unsigned long index, endindex;
5460 	int level;
5461 	pd_entry_t *pdep;
5462 #ifdef XENPV
5463 	int s = splvm(); /* protect xpq_* */
5464 #endif
5465 
5466 	for (level = PTP_LEVELS; level > 1; level--) {
5467 		if (level == PTP_LEVELS)
5468 			pdep = cpm->pm_pdir;
5469 		else
5470 			pdep = normal_pdes[level - 2];
5471 		index = pl_i_roundup(kva, level);
5472 		endindex = index + needed_ptps[level - 1] - 1;
5473 
5474 		for (i = index; i <= endindex; i++) {
5475 			pt_entry_t pte;
5476 
5477 			KASSERT(!pmap_valid_entry(pdep[i]));
5478 			pa = pmap_get_physpage();
5479 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5480 #ifdef __x86_64__
5481 			pte |= pmap_pg_nx;
5482 #endif
5483 			pmap_pte_set(&pdep[i], pte);
5484 
5485 #ifdef XENPV
5486 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5487 				if (__predict_true(
5488 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5489 					/* update per-cpu PMDs on all cpus */
5490 					xen_kpm_sync(pmap_kernel(), i);
5491 				} else {
5492 					/*
5493 					 * too early; update primary CPU
5494 					 * PMD only (without locks)
5495 					 */
5496 #ifdef __x86_64__
5497 					pd_entry_t *cpu_pdep =
5498 						&cpu_info_primary.ci_kpm_pdir[i];
5499 #else
5500 					pd_entry_t *cpu_pdep =
5501 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5502 #endif
5503 					pmap_pte_set(cpu_pdep, pte);
5504 				}
5505 			}
5506 #endif
5507 
5508 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5509 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5510 			nkptp[level - 1]++;
5511 		}
5512 		pmap_pte_flush();
5513 	}
5514 #ifdef XENPV
5515 	splx(s);
5516 #endif
5517 }
5518 
5519 /*
5520  * pmap_growkernel: increase usage of KVM space.
5521  *
5522  * => we allocate new PTPs for the kernel and install them in all
5523  *    the pmaps on the system.
5524  */
5525 vaddr_t
5526 pmap_growkernel(vaddr_t maxkvaddr)
5527 {
5528 	struct pmap *kpm = pmap_kernel();
5529 	struct pmap *cpm;
5530 #if !defined(XENPV) || !defined(__x86_64__)
5531 	struct pmap *pm;
5532 	long old;
5533 #endif
5534 	int s, i;
5535 	long needed_kptp[PTP_LEVELS], target_nptp;
5536 	bool invalidate = false;
5537 
5538 	s = splvm();	/* to be safe */
5539 	mutex_enter(&kpm->pm_lock);
5540 
5541 	if (maxkvaddr <= pmap_maxkvaddr) {
5542 		mutex_exit(&kpm->pm_lock);
5543 		splx(s);
5544 		return pmap_maxkvaddr;
5545 	}
5546 
5547 	maxkvaddr = x86_round_pdr(maxkvaddr);
5548 #if !defined(XENPV) || !defined(__x86_64__)
5549 	old = nkptp[PTP_LEVELS - 1];
5550 #endif
5551 
5552 	/* Initialize needed_kptp. */
5553 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5554 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5555 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5556 
5557 		if (target_nptp > nkptpmax[i])
5558 			panic("out of KVA space");
5559 		KASSERT(target_nptp >= nkptp[i]);
5560 		needed_kptp[i] = target_nptp - nkptp[i];
5561 	}
5562 
5563 #ifdef XENPV
5564 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5565 	cpm = kpm;
5566 #else
5567 	/* Get the current pmap */
5568 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5569 		cpm = curcpu()->ci_pmap;
5570 	} else {
5571 		cpm = kpm;
5572 	}
5573 #endif
5574 
5575 	kasan_shadow_map((void *)pmap_maxkvaddr,
5576 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5577 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5578 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5579 
5580 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5581 
5582 	/*
5583 	 * If the number of top level entries changed, update all pmaps.
5584 	 */
5585 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5586 #ifdef XENPV
5587 #ifdef __x86_64__
5588 		/* nothing, kernel entries are never entered in user pmap */
5589 #else
5590 		int pdkidx;
5591 
5592 		mutex_enter(&pmaps_lock);
5593 		LIST_FOREACH(pm, &pmaps, pm_list) {
5594 			for (pdkidx = PDIR_SLOT_KERN + old;
5595 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5596 			    pdkidx++) {
5597 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5598 				    kpm->pm_pdir[pdkidx]);
5599 			}
5600 			pmap_pte_flush();
5601 		}
5602 		mutex_exit(&pmaps_lock);
5603 #endif /* __x86_64__ */
5604 #else /* XENPV */
5605 		size_t newpdes;
5606 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5607 		if (cpm != kpm) {
5608 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5609 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5610 			    newpdes * sizeof(pd_entry_t));
5611 		}
5612 
5613 		mutex_enter(&pmaps_lock);
5614 		LIST_FOREACH(pm, &pmaps, pm_list) {
5615 			if (__predict_false(pm->pm_enter != NULL)) {
5616 				/*
5617 				 * Not a native pmap, the kernel is not mapped,
5618 				 * so nothing to synchronize.
5619 				 */
5620 				continue;
5621 			}
5622 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5623 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5624 			    newpdes * sizeof(pd_entry_t));
5625 		}
5626 		mutex_exit(&pmaps_lock);
5627 #endif
5628 		invalidate = true;
5629 	}
5630 	pmap_maxkvaddr = maxkvaddr;
5631 	mutex_exit(&kpm->pm_lock);
5632 	splx(s);
5633 
5634 	if (invalidate && pmap_initialized) {
5635 		/* Invalidate the pmap cache. */
5636 		pool_cache_invalidate(&pmap_cache);
5637 	}
5638 
5639 	return maxkvaddr;
5640 }
5641 
5642 #ifdef DEBUG
5643 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5644 
5645 /*
5646  * pmap_dump: dump all the mappings from a pmap
5647  *
5648  * => caller should not be holding any pmap locks
5649  */
5650 void
5651 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5652 {
5653 	pt_entry_t *ptes, *pte;
5654 	pd_entry_t * const *pdes;
5655 	struct pmap *pmap2;
5656 	vaddr_t blkendva;
5657 	int lvl;
5658 
5659 	/*
5660 	 * if end is out of range truncate.
5661 	 * if (end == start) update to max.
5662 	 */
5663 
5664 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5665 		eva = VM_MAXUSER_ADDRESS;
5666 
5667 	mutex_enter(&pmap->pm_lock);
5668 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5669 
5670 	/*
5671 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
5672 	 * dumping a range of pages: we dump in PTP-sized blocks
5673 
5674 	for (/* null */ ; sva < eva ; sva = blkendva) {
5675 
5676 		/* determine range of block */
5677 		blkendva = x86_round_pdr(sva+1);
5678 		if (blkendva > eva)
5679 			blkendva = eva;
5680 
5681 		/* valid block? */
5682 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5683 			continue;
5684 		KASSERT(lvl == 1);
5685 
5686 		pte = &ptes[pl1_i(sva)];
5687 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5688 			if (!pmap_valid_entry(*pte))
5689 				continue;
5690 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5691 			    " (pte=%#" PRIxPADDR ")\n",
5692 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5693 		}
5694 	}
5695 	pmap_unmap_ptes(pmap, pmap2);
5696 	mutex_exit(&pmap->pm_lock);
5697 }
5698 #endif
5699 
5700 /*
5701  * pmap_update: process deferred invalidations and frees.
5702  */
5703 void
5704 pmap_update(struct pmap *pmap)
5705 {
5706 	struct pmap_page *pp;
5707 	struct vm_page *ptp;
5708 
5709 	/*
5710 	 * Initiate any pending TLB shootdowns.  Wait for them to
5711 	 * complete before returning control to the caller.
5712 	 */
5713 	kpreempt_disable();
5714 	pmap_tlb_shootnow();
5715 	kpreempt_enable();
5716 
5717 	/*
5718 	 * Now that shootdowns are complete, process deferred frees.  This
5719 	 * is an unlocked check, but is safe as we're only interested in
5720 	 * work done in this LWP - we won't get a false negative.
5721 	 */
5722 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5723 		return;
5724 	}
5725 
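	/* Reset each deferred PTP's pmap_page state and return the page to UVM. */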
5726 	mutex_enter(&pmap->pm_lock);
5727 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5728 		KASSERT(ptp->wire_count == 0);
5729 		KASSERT(ptp->uanon == NULL);
5730 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5731 		pp = VM_PAGE_TO_PP(ptp);
5732 		LIST_INIT(&pp->pp_pvlist);
5733 		pp->pp_attrs = 0;
5734 		pp->pp_pte.pte_ptp = NULL;
5735 		pp->pp_pte.pte_va = 0;
5736 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5737 
5738 		/*
5739 		 * XXX Hack to avoid extra locking, and lock
5740 		 * assertions in uvm_pagefree().  Despite uobject
5741 		 * being set, this isn't a managed page.
5742 		 */
5743 		PMAP_DUMMY_LOCK(pmap);
5744 		uvm_pagerealloc(ptp, NULL, 0);
5745 		PMAP_DUMMY_UNLOCK(pmap);
5746 		uvm_pagefree(ptp);
5747 	}
5748 	mutex_exit(&pmap->pm_lock);
5749 }
5750 
5751 #if PTP_LEVELS > 4
5752 #error "Unsupported number of page table mappings"
5753 #endif
5754 
5755 paddr_t
5756 pmap_init_tmp_pgtbl(paddr_t pg)
5757 {
5758 	static bool maps_loaded;
5759 	static const paddr_t x86_tmp_pml_paddr[] = {
5760 	    4 * PAGE_SIZE,	/* L1 */
5761 	    5 * PAGE_SIZE,	/* L2 */
5762 	    6 * PAGE_SIZE,	/* L3 */
5763 	    7 * PAGE_SIZE	/* L4 */
5764 	};
5765 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5766 
5767 	pd_entry_t *tmp_pml, *kernel_pml;
5768 
5769 	int level;
5770 
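	/*
	 * On first use, map the fixed low-physical pages that back the
	 * temporary page tables into kernel VA.
	 */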
5771 	if (!maps_loaded) {
5772 		for (level = 0; level < PTP_LEVELS; ++level) {
5773 			x86_tmp_pml_vaddr[level] =
5774 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5775 			    UVM_KMF_VAONLY);
5776 
5777 			if (x86_tmp_pml_vaddr[level] == 0)
5778 				panic("mapping of real mode PML failed\n");
5779 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5780 			    x86_tmp_pml_paddr[level],
5781 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5782 		}
5783 		pmap_update(pmap_kernel());
5784 		maps_loaded = true;
5785 	}
5786 
5787 	/* Zero levels 1-3 */
5788 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5789 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5790 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
5791 	}
5792 
5793 	/* Copy PML4 */
5794 	kernel_pml = pmap_kernel()->pm_pdir;
5795 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5796 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
5797 
5798 #ifdef PAE
5799 	/*
5800 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5801 	 * last entries are unlikely to be used for temporary mappings.
5802 	 * 508: maps 0->1GB (userland)
5803 	 * 509: unused
5804 	 * 510: unused
5805 	 * 511: maps 3->4GB (kernel)
5806 	 */
5807 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
5808 	tmp_pml[509] = 0;
5809 	tmp_pml[510] = 0;
5810 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
5811 #endif
5812 
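	/* Chain the temporary levels together, each pointing at the one below. */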
5813 	for (level = PTP_LEVELS - 1; level > 0; --level) {
5814 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5815 
5816 		tmp_pml[pl_i(pg, level + 1)] =
5817 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
5818 	}
5819 
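	/* Finally, identity-map the target page in the temporary L1. */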
5820 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
5821 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
5822 
5823 #ifdef PAE
5824 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
5825 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
5826 #endif
5827 
5828 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
5829 }
5830 
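/*
 * x86_mmap_flags: translate the MD flag bits encoded in a device mmap
 * page number into pmap flags (currently only write-combining).
 */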
5831 u_int
5832 x86_mmap_flags(paddr_t mdpgno)
5833 {
5834 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
5835 	u_int pflag = 0;
5836 
5837 	if (nflag & X86_MMAP_FLAG_PREFETCH)
5838 		pflag |= PMAP_WRITE_COMBINE;
5839 
5840 	return pflag;
5841 }
5842 
5843 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
5844 
5845 /*
5846  * -----------------------------------------------------------------------------
5847  * *****************************************************************************
5848  * *****************************************************************************
5849  * *****************************************************************************
5850  * *****************************************************************************
5851  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
5852  * *****************************************************************************
5853  * *****************************************************************************
5854  * *****************************************************************************
5855  * *****************************************************************************
5856  * -----------------------------------------------------------------------------
5857  *
5858  * These functions are invoked as callbacks from the code above. Unlike the
5859  * native pmap, EPT does not have a recursive slot; therefore, it is not
5860  * possible to call pmap_map_ptes(). Instead, we use the direct map and walk
5861  * down the tree manually.
5862  *
5863  * Apart from that, the logic is mostly the same as native. Once a pmap has
5864  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
5865  * After that we're good, and the callbacks will handle the translations
5866  * for us.
5867  *
5868  * -----------------------------------------------------------------------------
5869  */
5870 
5871 /* Hardware bits. */
5872 #define EPT_R		__BIT(0)	/* read */
5873 #define EPT_W		__BIT(1)	/* write */
5874 #define EPT_X		__BIT(2)	/* execute */
5875 #define EPT_T		__BITS(5,3)	/* type */
5876 #define		TYPE_UC	0
5877 #define		TYPE_WC	1
5878 #define		TYPE_WT	4
5879 #define		TYPE_WP	5
5880 #define		TYPE_WB	6
5881 #define EPT_NOPAT	__BIT(6)
5882 #define EPT_L		__BIT(7)	/* large */
5883 #define EPT_A		__BIT(8)	/* accessed */
5884 #define EPT_D		__BIT(9)	/* dirty */
5885 /* Software bits. */
5886 #define EPT_PVLIST	__BIT(60)
5887 #define EPT_WIRED	__BIT(61)
5888 
5889 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
5890 
5891 bool pmap_ept_has_ad __read_mostly;
5892 
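/* Update the pmap's resident and wired counts given an old and new PTE. */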
5893 static inline void
5894 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
5895 {
5896 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
5897 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
5898 
5899 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5900 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5901 
5902 	pmap_stats_update(pmap, resid_diff, wired_diff);
5903 }
5904 
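/* Translate PMAP_* cache flags into the EPT memory-type field. */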
5905 static pt_entry_t
5906 pmap_ept_type(u_int flags)
5907 {
5908 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
5909 	pt_entry_t ret;
5910 
5911 	switch (cacheflags) {
5912 	case PMAP_NOCACHE:
5913 	case PMAP_NOCACHE_OVR:
5914 		ret = __SHIFTIN(TYPE_UC, EPT_T);
5915 		break;
5916 	case PMAP_WRITE_COMBINE:
5917 		ret = __SHIFTIN(TYPE_WC, EPT_T);
5918 		break;
5919 	case PMAP_WRITE_BACK:
5920 	default:
5921 		ret = __SHIFTIN(TYPE_WB, EPT_T);
5922 		break;
5923 	}
5924 
5925 	ret |= EPT_NOPAT;
5926 	return ret;
5927 }
5928 
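/* Translate VM_PROT_* into EPT read/write/execute bits. */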
5929 static inline pt_entry_t
5930 pmap_ept_prot(vm_prot_t prot)
5931 {
5932 	pt_entry_t res = 0;
5933 
5934 	if (prot & VM_PROT_READ)
5935 		res |= EPT_R;
5936 	if (prot & VM_PROT_WRITE)
5937 		res |= EPT_W;
5938 	if (prot & VM_PROT_EXECUTE)
5939 		res |= EPT_X;
5940 
5941 	return res;
5942 }
5943 
5944 static inline uint8_t
5945 pmap_ept_to_pp_attrs(pt_entry_t ept)
5946 {
5947 	uint8_t ret = 0;
5948 	if (pmap_ept_has_ad) {
5949 		if (ept & EPT_D)
5950 			ret |= PP_ATTRS_D;
5951 		if (ept & EPT_A)
5952 			ret |= PP_ATTRS_A;
5953 	} else {
5954 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
5955 	}
5956 	if (ept & EPT_W)
5957 		ret |= PP_ATTRS_W;
5958 	return ret;
5959 }
5960 
5961 static inline pt_entry_t
5962 pmap_pp_attrs_to_ept(uint8_t attrs)
5963 {
5964 	pt_entry_t ept = 0;
5965 	if (attrs & PP_ATTRS_D)
5966 		ept |= EPT_D;
5967 	if (attrs & PP_ATTRS_A)
5968 		ept |= EPT_A;
5969 	if (attrs & PP_ATTRS_W)
5970 		ept |= EPT_W;
5971 	return ept;
5972 }
5973 
5974 /*
5975  * Helper for pmap_ept_free_ptp.
5976  * tree[0] = &L2[L2idx]
5977  * tree[1] = &L3[L3idx]
5978  * tree[2] = &L4[L4idx]
5979  */
5980 static void
5981 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
5982 {
5983 	pt_entry_t *pteva;
5984 	paddr_t ptepa;
5985 	int i, index;
5986 
5987 	ptepa = pmap->pm_pdirpa[0];
5988 	for (i = PTP_LEVELS; i > 1; i--) {
5989 		index = pl_pi(va, i);
5990 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5991 		KASSERT(pmap_ept_valid_entry(pteva[index]));
5992 		tree[i - 2] = &pteva[index];
5993 		ptepa = pmap_pte2pa(pteva[index]);
5994 	}
5995 }
5996 
5997 static void
5998 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
5999 {
6000 	pd_entry_t *tree[3];
6001 	int level;
6002 
6003 	KASSERT(pmap != pmap_kernel());
6004 	KASSERT(mutex_owned(&pmap->pm_lock));
6005 	KASSERT(kpreempt_disabled());
6006 
6007 	pmap_ept_get_tree(pmap, va, tree);
6008 
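	/*
	 * Starting at L1: clear the parent's entry for this PTP and free it,
	 * then walk up and repeat for each parent left with no entries of
	 * its own.
	 */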
6009 	level = 1;
6010 	do {
6011 		(void)pmap_pte_testset(tree[level - 1], 0);
6012 
6013 		pmap_freepage(pmap, ptp, level);
6014 		if (level < PTP_LEVELS - 1) {
6015 			ptp = pmap_find_ptp(pmap, va, level + 1);
6016 			ptp->wire_count--;
6017 			if (ptp->wire_count > 1)
6018 				break;
6019 		}
6020 	} while (++level < PTP_LEVELS);
6021 	pmap_pte_flush();
6022 }
6023 
6024 /* Install any newly allocated PTPs into the tree: L4->L3->L2. */
6025 static void
6026 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6027 {
6028 	struct vm_page *ptp;
6029 	unsigned long index;
6030 	pd_entry_t *pteva;
6031 	paddr_t ptepa;
6032 	int i;
6033 
6034 	KASSERT(pmap != pmap_kernel());
6035 	KASSERT(mutex_owned(&pmap->pm_lock));
6036 	KASSERT(kpreempt_disabled());
6037 
6038 	/*
6039 	 * Now that we have all the pages looked up or allocated,
6040 	 * loop through again installing any new ones into the tree.
6041 	 */
6042 	ptepa = pmap->pm_pdirpa[0];
6043 	for (i = PTP_LEVELS; i > 1; i--) {
6044 		index = pl_pi(va, i);
6045 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6046 
6047 		if (pmap_ept_valid_entry(pteva[index])) {
6048 			KASSERT(!pt->alloced[i]);
6049 			ptepa = pmap_pte2pa(pteva[index]);
6050 			continue;
6051 		}
6052 
6053 		ptp = pt->pg[i];
6054 		ptp->flags &= ~PG_BUSY; /* never busy */
6055 		ptp->wire_count = 1;
6056 		pmap->pm_ptphint[i - 2] = ptp;
6057 		ptepa = VM_PAGE_TO_PHYS(ptp);
6058 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6059 
6060 		pmap_pte_flush();
6061 		pmap_stats_update(pmap, 1, 0);
6062 
6063 		/*
6064 		 * If we're not in the top level, increase the
6065 		 * wire count of the parent page.
6066 		 */
6067 		if (i < PTP_LEVELS) {
6068 			pt->pg[i + 1]->wire_count++;
6069 		}
6070 	}
6071 }
6072 
6073 static int
6074 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6075     u_int flags)
6076 {
6077 	pt_entry_t *ptes, opte, npte;
6078 	pt_entry_t *ptep;
6079 	struct vm_page *ptp;
6080 	struct vm_page *new_pg, *old_pg;
6081 	struct pmap_page *new_pp, *old_pp;
6082 	struct pv_entry *old_pve, *new_pve;
6083 	bool wired = (flags & PMAP_WIRED) != 0;
6084 	bool accessed;
6085 	struct pmap_ptparray pt;
6086 	int error;
6087 	bool getptp, samepage, new_embedded;
6088 	rb_tree_t *tree;
6089 
6090 	KASSERT(pmap_initialized);
6091 	KASSERT(va < VM_MAXUSER_ADDRESS);
6092 
6093 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6094 
6095 	if (wired)
6096 		npte |= EPT_WIRED;
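	/*
	 * The low flag bits carry the access type that triggered this
	 * mapping; pre-set EPT_A (and EPT_D for a write) to match.
	 */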
6097 	if (flags & VM_PROT_ALL) {
6098 		npte |= EPT_A;
6099 		if (flags & VM_PROT_WRITE) {
6100 			KASSERT((npte & EPT_W) != 0);
6101 			npte |= EPT_D;
6102 		}
6103 	}
6104 
6105 	new_pg = PHYS_TO_VM_PAGE(pa);
6106 	if (new_pg != NULL) {
6107 		/* This is a managed page */
6108 		npte |= EPT_PVLIST;
6109 		new_pp = VM_PAGE_TO_PP(new_pg);
6110 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6111 		/* This is an unmanaged pv-tracked page */
6112 		npte |= EPT_PVLIST;
6113 	} else {
6114 		new_pp = NULL;
6115 	}
6116 
6117 	/* Begin by locking the pmap. */
6118 	mutex_enter(&pmap->pm_lock);
6119 
6120 	/* Look up the PTP.  Allocate if none present. */
6121 	ptp = NULL;
6122 	getptp = false;
6123 	if (pmap != pmap_kernel()) {
6124 		ptp = pmap_find_ptp(pmap, va, 1);
6125 		if (ptp == NULL) {
6126 			getptp = true;
6127 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6128 			if (error != 0) {
6129 				if (flags & PMAP_CANFAIL) {
6130 					mutex_exit(&pmap->pm_lock);
6131 					return error;
6132 				}
6133 				panic("%s: get ptp failed, error=%d", __func__,
6134 				    error);
6135 			}
6136 		}
6137 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6138 	} else {
6139 		/* Embedded PV entries rely on this. */
6140 		KASSERT(va != 0);
6141 		tree = &pmap_kernel_rb;
6142 	}
6143 
6144 	/*
6145 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6146 	 * entry if required for the new mapping.  Temporarily track the old
6147 	 * and new mappings concurrently.  Only after the old mapping is
6148 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6149 	 * our picture of modified/accessed state for either page could get
6150 	 * out of sync (we need any P->V operation for either page to stall
6151 	 * on pmap->pm_lock until done here).
6152 	 */
6153 	new_pve = NULL;
6154 	old_pve = NULL;
6155 	samepage = false;
6156 	new_embedded = false;
6157 
6158 	if (new_pp != NULL) {
6159 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6160 		    &old_pve, &samepage, &new_embedded, tree);
6161 
6162 		/*
6163 		 * If a new pv_entry was needed and none was available, we
6164 		 * can go no further.
6165 		 */
6166 		if (error != 0) {
6167 			if (flags & PMAP_CANFAIL) {
6168 				if (getptp) {
6169 					pmap_unget_ptp(pmap, &pt);
6170 				}
6171 				mutex_exit(&pmap->pm_lock);
6172 				return error;
6173 			}
6174 			panic("%s: alloc pve failed", __func__);
6175 		}
6176 	} else {
6177 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6178 	}
6179 
6180 	/* Map PTEs into address space. */
6181 	kpreempt_disable();
6182 
6183 	/* Install any newly allocated PTPs. */
6184 	if (getptp) {
6185 		pmap_ept_install_ptp(pmap, &pt, va);
6186 	}
6187 
6188 	/* Check if there is an existing mapping. */
6189 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6190 	ptep = &ptes[pl1_pi(va)];
6191 	opte = *ptep;
6192 	bool have_oldpa = pmap_ept_valid_entry(opte);
6193 	paddr_t oldpa = pmap_pte2pa(opte);
6194 
6195 	/*
6196 	 * Update the pte.
6197 	 */
6198 	do {
6199 		opte = *ptep;
6200 
6201 		/*
6202 		 * if the same page, inherit EPT_A and EPT_D.
6203 		 */
6204 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6205 			npte |= opte & (EPT_A | EPT_D);
6206 		}
6207 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6208 
6209 	/*
6210 	 * Done with the PTEs: they can now be unmapped.
6211 	 */
6212 	kpreempt_enable();
6213 
6214 	/*
6215 	 * Update statistics and PTP's reference count.
6216 	 */
6217 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6218 	if (ptp != NULL) {
6219 		if (!have_oldpa) {
6220 			ptp->wire_count++;
6221 		}
6222 		/* Remember minimum VA in PTP. */
6223 		pmap_ptp_range_set(ptp, va);
6224 	}
6225 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6226 
6227 	/*
6228 	 * If the same page, we can skip pv_entry handling.
6229 	 */
6230 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6231 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6232 		if ((npte & EPT_PVLIST) != 0) {
6233 			KASSERT(samepage);
6234 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6235 		}
6236 		goto same_pa;
6237 	} else if ((npte & EPT_PVLIST) != 0) {
6238 		KASSERT(!samepage);
6239 	}
6240 
6241 	/*
6242 	 * If old page is pv-tracked, remove pv_entry from its list.
6243 	 */
6244 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6245 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6246 			old_pp = VM_PAGE_TO_PP(old_pg);
6247 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6248 			panic("%s: EPT_PVLIST with pv-untracked page"
6249 			    " va = %#"PRIxVADDR
6250 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6251 			    __func__, va, oldpa, atop(oldpa));
6252 		}
6253 
6254 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6255 		    pmap_ept_to_pp_attrs(opte));
6256 	} else {
6257 		KASSERT(old_pve == NULL);
6258 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6259 	}
6260 
6261 	/*
6262 	 * If new page is dynamically PV tracked, insert to tree.
6263 	 */
6264 	if (new_pve != NULL) {
6265 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6266 		old_pve = rb_tree_insert_node(tree, new_pve);
6267 		KASSERT(old_pve == new_pve);
6268 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6269 	}
6270 
6271 same_pa:
6272 	/*
6273 	 * shootdown tlb if necessary.
6274 	 */
6275 
6276 	if (pmap_ept_has_ad) {
6277 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6278 	} else {
6279 		accessed = (opte & EPT_R) != 0;
6280 	}
6281 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6282 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6283 	}
6284 	pmap_drain_pv(pmap);
6285 	mutex_exit(&pmap->pm_lock);
6286 	return 0;
6287 }
6288 
6289 /* Pay close attention, this hands back the L2 entry via *lastpde. */
6290 static int
6291 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6292 {
6293 	pt_entry_t *pteva;
6294 	paddr_t ptepa;
6295 	int i, index;
6296 
6297 	KASSERT(mutex_owned(&pmap->pm_lock));
6298 
6299 	ptepa = pmap->pm_pdirpa[0];
6300 	for (i = PTP_LEVELS; i > 1; i--) {
6301 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6302 		index = pl_pi(va, i);
6303 		if (!pmap_ept_valid_entry(pteva[index]))
6304 			return i;
6305 		ptepa = pmap_pte2pa(pteva[index]);
6306 	}
6307 	if (lastpde != NULL) {
6308 		*lastpde = pteva[index];
6309 	}
6310 
6311 	return 0;
6312 }
6313 
6314 static bool
6315 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6316 {
6317 	pt_entry_t *ptes, pte;
6318 	pd_entry_t pde;
6319 	paddr_t ptppa, pa;
6320 	bool rv;
6321 
6322 #ifdef __HAVE_DIRECT_MAP
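	/* Direct-mapped addresses translate by a constant offset. */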
6323 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6324 		if (pap != NULL) {
6325 			*pap = PMAP_DIRECT_UNMAP(va);
6326 		}
6327 		return true;
6328 	}
6329 #endif
6330 
6331 	rv = false;
6332 	pa = 0;
6333 
6334 	mutex_enter(&pmap->pm_lock);
6335 	kpreempt_disable();
6336 
6337 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6338 		ptppa = pmap_pte2pa(pde);
6339 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6340 		pte = ptes[pl1_pi(va)];
6341 		if (__predict_true((pte & EPT_R) != 0)) {
6342 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6343 			rv = true;
6344 		}
6345 	}
6346 
6347 	kpreempt_enable();
6348 	mutex_exit(&pmap->pm_lock);
6349 
6350 	if (pap != NULL) {
6351 		*pap = pa;
6352 	}
6353 	return rv;
6354 }
6355 
6356 static bool
6357 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6358     vaddr_t va)
6359 {
6360 	struct pv_entry *pve;
6361 	struct vm_page *pg;
6362 	struct pmap_page *pp;
6363 	pt_entry_t opte;
6364 	bool accessed;
6365 
6366 	KASSERT(pmap != pmap_kernel());
6367 	KASSERT(mutex_owned(&pmap->pm_lock));
6368 	KASSERT(kpreempt_disabled());
6369 
6370 	if (!pmap_ept_valid_entry(*pte)) {
6371 		/* VA not mapped. */
6372 		return false;
6373 	}
6374 
6375 	/* Atomically save the old PTE and zap it. */
6376 	opte = pmap_pte_testset(pte, 0);
6377 	if (!pmap_ept_valid_entry(opte)) {
6378 		return false;
6379 	}
6380 
6381 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6382 
6383 	if (ptp) {
6384 		/*
6385 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6386 		 */
6387 		ptp->wire_count--;
6388 		if (ptp->wire_count <= 1) {
6389 			opte |= EPT_A;
6390 		}
6391 	}
6392 
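	/*
	 * Without hardware A/D bits we must assume the mapping was used,
	 * and always shoot the TLB.
	 */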
6393 	if (pmap_ept_has_ad) {
6394 		accessed = (opte & EPT_A) != 0;
6395 	} else {
6396 		accessed = true;
6397 	}
6398 	if (accessed) {
6399 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6400 	}
6401 
6402 	/*
6403 	 * If we are not on a pv list - we are done.
6404 	 */
6405 	if ((opte & EPT_PVLIST) == 0) {
6406 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6407 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6408 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6409 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6410 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6411 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6412 		return true;
6413 	}
6414 
6415 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6416 		pp = VM_PAGE_TO_PP(pg);
6417 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6418 		paddr_t pa = pmap_pte2pa(opte);
6419 		panic("%s: EPT_PVLIST with pv-untracked page"
6420 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6421 		    __func__, va, pa, atop(pa));
6422 	}
6423 
6424 	/* Sync R/M bits. */
6425 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6426 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6427 	return true;
6428 }
6429 
6430 static void
6431 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6432     vaddr_t startva, vaddr_t endva)
6433 {
6434 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6435 
6436 	KASSERT(pmap != pmap_kernel());
6437 	KASSERT(mutex_owned(&pmap->pm_lock));
6438 	KASSERT(kpreempt_disabled());
6439 
6440 	/*
6441 	 * mappings are very often sparse, so clip the given range to the
6442 	 * range of PTEs that are known present in the PTP.
6443 	 */
6444 	pmap_ptp_range_clip(ptp, &startva, &pte);
6445 
6446 	/*
6447 	 * note that ptpva points to the PTE that maps startva.  this may
6448 	 * or may not be the first PTE in the PTP.
6449 	 *
6450 	 * we loop through the PTP while there are still PTEs to look at
6451 	 * and the wire_count is greater than 1 (because we use the wire_count
6452 	 * to keep track of the number of real PTEs in the PTP).
6453 	 */
6454 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6455 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6456 		startva += PAGE_SIZE;
6457 		pte++;
6458 	}
6459 }
6460 
6461 static void
6462 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6463 {
6464 	pt_entry_t *ptes;
6465 	pd_entry_t pde;
6466 	paddr_t ptppa;
6467 	vaddr_t blkendva, va = sva;
6468 	struct vm_page *ptp;
6469 
6470 	mutex_enter(&pmap->pm_lock);
6471 	kpreempt_disable();
6472 
6473 	for (/* null */ ; va < eva ; va = blkendva) {
6474 		int lvl;
6475 
6476 		/* determine range of block */
6477 		blkendva = x86_round_pdr(va+1);
6478 		if (blkendva > eva)
6479 			blkendva = eva;
6480 
6481 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6482 		if (lvl != 0) {
6483 			/* Skip a range corresponding to an invalid pde. */
6484 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6485 			continue;
6486 		}
6487 
6488 		/* PA of the PTP */
6489 		ptppa = pmap_pte2pa(pde);
6490 
6491 		ptp = pmap_find_ptp(pmap, va, 1);
6492 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6493 		    __func__);
6494 
6495 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6496 
6497 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6498 		    blkendva);
6499 
6500 		/* If PTP is no longer being used, free it. */
6501 		if (ptp && ptp->wire_count <= 1) {
6502 			pmap_ept_free_ptp(pmap, ptp, va);
6503 		}
6504 	}
6505 
6506 	kpreempt_enable();
6507 	pmap_drain_pv(pmap);
6508 	mutex_exit(&pmap->pm_lock);
6509 }
6510 
6511 static int
6512 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6513     uint8_t *oattrs, pt_entry_t *optep)
6514 {
6515 	struct pmap *pmap;
6516 	pt_entry_t *ptep;
6517 	pt_entry_t opte;
6518 	pt_entry_t npte;
6519 	pt_entry_t expect;
6520 	bool need_shootdown;
6521 
6522 	expect = pmap_pa2pte(pa) | EPT_R;
6523 	pmap = ptp_to_pmap(ptp);
6524 
6525 	if (clearbits != ~0) {
6526 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6527 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6528 	}
6529 
6530 	ptep = pmap_map_pte(pmap, ptp, va);
6531 	do {
6532 		opte = *ptep;
6533 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6534 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6535 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6536 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6537 			/*
6538 			 * We lost a race with a V->P operation like
6539 			 * pmap_remove().  Wait for the competitor
6540 			 * reflecting pte bits into mp_attrs.
6541 			 */
6542 			pmap_unmap_pte();
6543 			return EAGAIN;
6544 		}
6545 
6546 		/*
6547 		 * Check if there's anything to do on this PTE.
6548 		 */
6549 		if ((opte & clearbits) == 0) {
6550 			need_shootdown = false;
6551 			break;
6552 		}
6553 
6554 		/*
6555 		 * We need a shootdown if the PTE is cached (EPT_A), unless
6556 		 * we are clearing only the EPT_W bit and it isn't cached
6557 		 * as RW (EPT_D).
6558 		 */
6559 		if (pmap_ept_has_ad) {
6560 			need_shootdown = (opte & EPT_A) != 0 &&
6561 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6562 		} else {
6563 			need_shootdown = true;
6564 		}
6565 
6566 		npte = opte & ~clearbits;
6567 
6568 		/*
6569 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6570 		 */
6571 		if (need_shootdown) {
6572 			npte &= ~(EPT_A | EPT_D);
6573 		}
6574 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6575 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6576 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6577 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6578 
6579 	if (need_shootdown) {
6580 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6581 	}
6582 	pmap_unmap_pte();
6583 
6584 	*oattrs = pmap_ept_to_pp_attrs(opte);
6585 	if (optep != NULL)
6586 		*optep = opte;
6587 	return 0;
6588 }
6589 
6590 static void
6591 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6592     vaddr_t va)
6593 {
6594 
6595 	KASSERT(mutex_owned(&pmap->pm_lock));
6596 
6597 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6598 	ptp->wire_count--;
6599 	if (ptp->wire_count <= 1) {
6600 		pmap_ept_free_ptp(pmap, ptp, va);
6601 	}
6602 }
6603 
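/*
 * pmap_ept_write_protect: strip EPT_W from every present mapping in
 * [sva, eva) when the new protection does not include write access.
 */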
6604 static void
6605 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6606 {
6607 	pt_entry_t bit_rem;
6608 	pt_entry_t *ptes, *spte;
6609 	pt_entry_t opte, npte;
6610 	pd_entry_t pde;
6611 	paddr_t ptppa;
6612 	vaddr_t va;
6613 	bool modified;
6614 
6615 	bit_rem = 0;
6616 	if (!(prot & VM_PROT_WRITE))
6617 		bit_rem = EPT_W;
6618 
6619 	sva &= PTE_FRAME;
6620 	eva &= PTE_FRAME;
6621 
6622 	/* Acquire pmap. */
6623 	mutex_enter(&pmap->pm_lock);
6624 	kpreempt_disable();
6625 
6626 	for (va = sva; va < eva; va += PAGE_SIZE) {
6627 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6628 			continue;
6629 		}
6630 
6631 		ptppa = pmap_pte2pa(pde);
6632 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6633 		spte = &ptes[pl1_pi(va)];
6634 
6635 		do {
6636 			opte = *spte;
6637 			if (!pmap_ept_valid_entry(opte)) {
6638 				goto next;
6639 			}
6640 			npte = (opte & ~bit_rem);
6641 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6642 
6643 		if (pmap_ept_has_ad) {
6644 			modified = (opte & EPT_D) != 0;
6645 		} else {
6646 			modified = true;
6647 		}
6648 		if (modified) {
6649 			vaddr_t tva = x86_ptob(spte - ptes);
6650 			pmap_tlb_shootdown(pmap, tva, 0,
6651 			    TLBSHOOT_WRITE_PROTECT);
6652 		}
6653 next:;
6654 	}
6655 
6656 	kpreempt_enable();
6657 	mutex_exit(&pmap->pm_lock);
6658 }
6659 
6660 static void
6661 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6662 {
6663 	pt_entry_t *ptes, *ptep, opte;
6664 	pd_entry_t pde;
6665 	paddr_t ptppa;
6666 
6667 	/* Acquire pmap. */
6668 	mutex_enter(&pmap->pm_lock);
6669 	kpreempt_disable();
6670 
6671 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6672 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6673 	}
6674 
6675 	ptppa = pmap_pte2pa(pde);
6676 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6677 	ptep = &ptes[pl1_pi(va)];
6678 	opte = *ptep;
6679 	KASSERT(pmap_ept_valid_entry(opte));
6680 
6681 	if (opte & EPT_WIRED) {
6682 		pt_entry_t npte = opte & ~EPT_WIRED;
6683 
6684 		opte = pmap_pte_testset(ptep, npte);
6685 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6686 	} else {
6687 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6688 		    " did not change!\n", __func__, pmap, va);
6689 	}
6690 
6691 	/* Release pmap. */
6692 	kpreempt_enable();
6693 	mutex_exit(&pmap->pm_lock);
6694 }
6695 
6696 /* -------------------------------------------------------------------------- */
6697 
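/*
 * pmap_ept_transform: turn a freshly created pmap into an EPT pmap, by
 * installing the EPT callbacks and clearing the top-level page.  Called
 * by NVMM once the pmap has been created.
 */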
6698 void
6699 pmap_ept_transform(struct pmap *pmap)
6700 {
6701 	pmap->pm_enter = pmap_ept_enter;
6702 	pmap->pm_extract = pmap_ept_extract;
6703 	pmap->pm_remove = pmap_ept_remove;
6704 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6705 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6706 	pmap->pm_write_protect = pmap_ept_write_protect;
6707 	pmap->pm_unwire = pmap_ept_unwire;
6708 
6709 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6710 }
6711 
6712 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6713