xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision e6c7e151de239c49d2e38720a061ed9d1fa99309)
1 /*	$NetBSD: pmap.c,v 1.380 2020/03/22 00:16:16 ad Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.380 2020/03/22 00:16:16 ad Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 
142 #define	__MUTEX_PRIVATE	/* for assertions */
143 
144 #include <sys/param.h>
145 #include <sys/systm.h>
146 #include <sys/proc.h>
147 #include <sys/pool.h>
148 #include <sys/kernel.h>
149 #include <sys/atomic.h>
150 #include <sys/cpu.h>
151 #include <sys/intr.h>
152 #include <sys/xcall.h>
153 #include <sys/kcore.h>
154 #include <sys/asan.h>
155 #include <sys/msan.h>
156 
157 #include <uvm/uvm.h>
158 #include <uvm/pmap/pmap_pvt.h>
159 
160 #include <dev/isa/isareg.h>
161 
162 #include <machine/specialreg.h>
163 #include <machine/gdt.h>
164 #include <machine/isa_machdep.h>
165 #include <machine/cpuvar.h>
166 #include <machine/cputypes.h>
167 #include <machine/cpu_rng.h>
168 
169 #include <x86/pmap.h>
170 #include <x86/pmap_pv.h>
171 
172 #include <x86/i82489reg.h>
173 #include <x86/i82489var.h>
174 
175 #ifdef XEN
176 #include <xen/include/public/xen.h>
177 #include <xen/hypervisor.h>
178 #endif
179 
180 /*
181  * general info:
182  *
183  *  - for an explanation of how the x86 MMU hardware works see
184  *    the comments in <machine/pte.h>.
185  *
186  *  - for an explanation of the general memory structure used by
187  *    this pmap (including the recursive mapping), see the comments
188  *    in <machine/pmap.h>.
189  *
190  * this file contains the code for the "pmap module."   the module's
191  * job is to manage the hardware's virtual to physical address mappings.
192  * note that there are two levels of mapping in the VM system:
193  *
194  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
195  *      to map ranges of virtual address space to objects/files.  for
196  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
197  *      to the file /bin/ls starting at offset zero."   note that
198  *      the upper layer mapping is not concerned with how individual
199  *      vm_pages are mapped.
200  *
201  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
202  *      from virtual addresses.   it is concerned with which vm_page is
203  *      mapped where.   for example, when you run /bin/ls and start
204  *      at page 0x1000 the fault routine may lookup the correct page
205  *      of the /bin/ls file and then ask the pmap layer to establish
206  *      a mapping for it.
207  *
208  * note that information in the lower layer of the VM system can be
209  * thrown away since it can easily be reconstructed from the info
210  * in the upper layer.
211  *
212  * data structures we use include:
213  *
214  *  - struct pmap: describes the address space of one thread
215  *  - struct pmap_page: describes one pv-tracked page, without
216  *    necessarily a corresponding vm_page
217  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
218  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
219  *    physical memory.   the pp_pvlist points to a list of pv_entry
220  *    structures which describe all the <PMAP,VA> pairs that this
221  *    page is mapped in.    this is critical for page based operations
222  *    such as pmap_page_protect() [change protection on _all_ mappings
223  *    of a page]
224  */
225 
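/*
 * For illustration only (a sketch, not part of the build): given a managed
 * page, the pv-tracking state described above hangs off its vm_page roughly
 * as follows.  VM_PAGE_TO_PP() and the pp_* fields are defined further down
 * in this file and in <x86/pmap_pv.h>.
 *
 *	struct vm_page *pg = ...;			some managed page
 *	struct pmap_page *pp = VM_PAGE_TO_PP(pg);
 *
 *	pp->pp_pte	the first mapping (pte_ptp, pte_va), kept embedded so
 *			the common single-mapping case allocates no pv_entry
 *	pp->pp_pvlist	any further mappings, as a list of struct pv_entry
 *	pp->pp_lock	spin mutex protecting both (see "Locking" below)
 */
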
226 /*
227  * Locking
228  *
229  * We have the following locks that we must deal with, listed in the order
230  * that they are acquired:
231  *
232  * pg->uobject->vmobjlock, pg->uanon->an_lock
233  *
234  * 	For managed pages, these per-object locks are taken by the VM system
235  *	before calling into the pmap module - either a read or write hold.
236  *	The lock hold prevents pages from changing identity while the pmap is
237  *	operating on them.  For example, the same lock is held across a call
238  *	to pmap_remove() and the following call to pmap_update(), so that a
239  *	page does not gain a new identity while its TLB visibility is stale.
240  *
241  * pmap->pm_lock
242  *
243  *	This lock protects the fields in the pmap structure including the
244  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
245  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
246  *	kernel PDEs are never freed, and the kernel is expected to be self
247  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
248  *	because they can be modified from interrupt context).
249  *
250  * pmaps_lock
251  *
252  *	This lock protects the list of active pmaps (headed by "pmaps").
253  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
254  *
255  * pp_lock
256  *
257  *	This per-page lock protects PV entry lists and the embedded PV entry
258  *	in each vm_page, allowing for concurrent operation on pages by
259  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
260  *	points it is taken context switching is usually not tolerable, and
261  *	spin mutexes must block out interrupts that could take kernel_lock.
262  */
263 
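/*
 * A minimal sketch of that acquisition order for a typical managed-page
 * operation (illustrative only; in practice the object lock is taken by UVM
 * before the pmap module is entered, and pmaps_lock is only needed when the
 * set of pmaps itself changes):
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_spin_enter(&pp->pp_lock);
 *	... modify PTEs / PV entries ...
 *	mutex_spin_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 */
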
264 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
265 #ifdef DIAGNOSTIC
266 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
267 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
268 #else
269 #define	PMAP_DUMMY_LOCK(pm)
270 #define	PMAP_DUMMY_UNLOCK(pm)
271 #endif
272 
273 static const struct uvm_pagerops pmap_pager = {
274 	/* nothing */
275 };
276 
277 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
278 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
279 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
280 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
281 const long nbpd[] = NBPD_INITIALIZER;
282 #ifdef i386
283 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
284 #else
285 pd_entry_t *normal_pdes[3];
286 #endif
287 
288 long nkptp[] = NKPTP_INITIALIZER;
289 
290 struct pmap_head pmaps;
291 kmutex_t pmaps_lock __cacheline_aligned;
292 
293 struct pcpu_area *pcpuarea __read_mostly;
294 
295 static vaddr_t pmap_maxkvaddr;
296 
297 /*
298  * Misc. event counters.
299  */
300 struct evcnt pmap_iobmp_evcnt;
301 struct evcnt pmap_ldt_evcnt;
302 
303 /*
304  * PAT
305  */
306 static bool cpu_pat_enabled __read_mostly = false;
307 
308 /*
309  * Global data structures
310  */
311 
312 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
313 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
314 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
315 
316 struct bootspace bootspace __read_mostly;
317 struct slotspace slotspace __read_mostly;
318 
319 /* Set to PTE_NX if supported. */
320 pd_entry_t pmap_pg_nx __read_mostly = 0;
321 
322 /* Set to PTE_G if supported. */
323 pd_entry_t pmap_pg_g __read_mostly = 0;
324 
325 /* Set to true if large pages are supported. */
326 int pmap_largepages __read_mostly = 0;
327 
328 paddr_t lowmem_rsvd __read_mostly;
329 paddr_t avail_start __read_mostly; /* PA of first available physical page */
330 paddr_t avail_end __read_mostly; /* PA of last available physical page */
331 
332 #ifdef XENPV
333 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
334 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
335 #endif
336 
337 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
338 #define	PMAP_CHECK_PP(pp) \
339     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
340 
341 /*
342  * Other data structures
343  */
344 
345 static pt_entry_t protection_codes[8] __read_mostly;
346 
347 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
348 
349 /*
350  * The following two vaddr_t's are used during system startup to keep track of
351  * how much of the kernel's VM space we have used. Once the system is started,
352  * the management of the remaining kernel VM space is turned over to the
353  * kernel_map vm_map.
354  */
355 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
356 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
357 
358 #ifndef XENPV
359 /*
360  * LAPIC virtual address, and fake physical address.
361  */
362 volatile vaddr_t local_apic_va __read_mostly;
363 paddr_t local_apic_pa __read_mostly;
364 #endif
365 
366 /*
367  * pool that pmap structures are allocated from
368  */
369 struct pool_cache pmap_cache;
370 static int  pmap_ctor(void *, void *, int);
371 static void pmap_dtor(void *, void *);
372 
373 /*
374  * pv_entry cache
375  */
376 static struct pool_cache pmap_pv_cache;
377 
378 #ifdef __HAVE_DIRECT_MAP
379 vaddr_t pmap_direct_base __read_mostly;
380 vaddr_t pmap_direct_end __read_mostly;
381 #endif
382 
383 #ifndef __HAVE_DIRECT_MAP
384 /*
385  * Special VAs and the PTEs that map them
386  */
387 static pt_entry_t *early_zero_pte;
388 static void pmap_vpage_cpualloc(struct cpu_info *);
389 #ifdef XENPV
390 char *early_zerop; /* also referenced from xen_locore() */
391 #else
392 static char *early_zerop;
393 #endif
394 #endif
395 
396 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
397 
398 /* PDP pool and its callbacks */
399 static struct pool pmap_pdp_pool;
400 static void pmap_pdp_init(pd_entry_t *);
401 static void pmap_pdp_fini(pd_entry_t *);
402 
403 #ifdef PAE
404 /* need to allocate items of 4 pages */
405 static void *pmap_pdp_alloc(struct pool *, int);
406 static void pmap_pdp_free(struct pool *, void *);
407 static struct pool_allocator pmap_pdp_allocator = {
408 	.pa_alloc = pmap_pdp_alloc,
409 	.pa_free = pmap_pdp_free,
410 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
411 };
412 #endif
413 
414 extern vaddr_t idt_vaddr;
415 extern paddr_t idt_paddr;
416 extern vaddr_t gdt_vaddr;
417 extern paddr_t gdt_paddr;
418 extern vaddr_t ldt_vaddr;
419 extern paddr_t ldt_paddr;
420 
421 #ifdef i386
422 /* stuff to fix the pentium f00f bug */
423 extern vaddr_t pentium_idt_vaddr;
424 #endif
425 
426 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
427 struct pmap_ptparray {
428 	struct vm_page *pg[PTP_LEVELS + 1];
429 	bool alloced[PTP_LEVELS + 1];
430 };
431 
432 /*
433  * PV tree prototypes
434  */
435 
436 static int	pmap_compare_key(void *, const void *, const void *);
437 static int	pmap_compare_nodes(void *, const void *, const void *);
438 
439 /* Red-black tree */
440 static const rb_tree_ops_t pmap_rbtree_ops = {
441 	.rbto_compare_nodes = pmap_compare_nodes,
442 	.rbto_compare_key = pmap_compare_key,
443 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
444 	.rbto_context = NULL
445 };
446 
447 /*
448  * Local prototypes
449  */
450 
451 #ifdef __HAVE_PCPU_AREA
452 static void pmap_init_pcpu(void);
453 #endif
454 #ifdef __HAVE_DIRECT_MAP
455 static void pmap_init_directmap(struct pmap *);
456 #endif
457 #if !defined(XENPV)
458 static void pmap_remap_global(void);
459 #endif
460 #ifndef XENPV
461 static void pmap_init_lapic(void);
462 static void pmap_remap_largepages(void);
463 #endif
464 
465 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
466     struct vm_page **);
467 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
468 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
469     pd_entry_t * const *);
470 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
471 static void pmap_freepage(struct pmap *, struct vm_page *, int);
472 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
473     pt_entry_t *, pd_entry_t * const *);
474 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
475     vaddr_t, struct pv_entry **);
476 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
477     vaddr_t, struct pv_entry **);
478 
479 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
480 
481 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
482 static void pmap_reactivate(struct pmap *);
483 
484 /*
485  * p m a p   h e l p e r   f u n c t i o n s
486  */
487 
488 static inline void
489 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
490 {
491 
492 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
493 	pmap->pm_stats.resident_count += resid_diff;
494 	pmap->pm_stats.wired_count += wired_diff;
495 }
496 
497 static inline void
498 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
499 {
500 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
501 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
502 
503 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
504 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
505 
506 	pmap_stats_update(pmap, resid_diff, wired_diff);
507 }
508 
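/*
 * Example (illustrative): tearing down a present, wired mapping means
 * opte = PTE_P | PTE_WIRED | ... and npte = 0, so
 *
 *	pmap_stats_update_bypte(pmap, 0, opte);
 *
 * computes resid_diff = -1 and wired_diff = -1, decrementing both
 * pm_stats.resident_count and pm_stats.wired_count.
 */
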
509 /*
510  * ptp_to_pmap: lookup pmap by ptp
511  */
512 static inline struct pmap *
513 ptp_to_pmap(struct vm_page *ptp)
514 {
515 	struct pmap *pmap;
516 
517 	if (ptp == NULL) {
518 		return pmap_kernel();
519 	}
520 	pmap = (struct pmap *)ptp->uobject;
521 	KASSERT(pmap != NULL);
522 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
523 	return pmap;
524 }
525 
526 static inline struct pv_pte *
527 pve_to_pvpte(struct pv_entry *pve)
528 {
529 
530 	if (pve == NULL)
531 		return NULL;
532 	KASSERT((void *)&pve->pve_pte == (void *)pve);
533 	return &pve->pve_pte;
534 }
535 
536 static inline struct pv_entry *
537 pvpte_to_pve(struct pv_pte *pvpte)
538 {
539 	struct pv_entry *pve = (void *)pvpte;
540 
541 	KASSERT(pve_to_pvpte(pve) == pvpte);
542 	return pve;
543 }
544 
545 /*
546  * Return true if the pmap page has an embedded PV entry.
547  */
548 static inline bool
549 pv_pte_embedded(struct pmap_page *pp)
550 {
551 
552 	KASSERT(mutex_owned(&pp->pp_lock));
553 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
554 }
555 
556 /*
557  * pv_pte_first, pv_pte_next: PV list iterator.
558  */
559 static inline struct pv_pte *
560 pv_pte_first(struct pmap_page *pp)
561 {
562 
563 	KASSERT(mutex_owned(&pp->pp_lock));
564 	if (pv_pte_embedded(pp)) {
565 		return &pp->pp_pte;
566 	}
567 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
568 }
569 
570 static inline struct pv_pte *
571 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
572 {
573 
574 	KASSERT(mutex_owned(&pp->pp_lock));
575 	KASSERT(pvpte != NULL);
576 	if (pvpte == &pp->pp_pte) {
577 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
578 	}
579 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
580 }
581 
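/*
 * Sketch of the usual iteration pattern over all mappings of a page (callers
 * elsewhere in this pmap follow this shape, holding pp_lock for the whole
 * walk as the assertions above require):
 *
 *	struct pv_pte *pvpte;
 *
 *	mutex_spin_enter(&pp->pp_lock);
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... pvpte->pte_ptp / pvpte->pte_va name one mapping ...
 *	}
 *	mutex_spin_exit(&pp->pp_lock);
 */
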
582 static inline uint8_t
583 pmap_pte_to_pp_attrs(pt_entry_t pte)
584 {
585 	uint8_t ret = 0;
586 	if (pte & PTE_D)
587 		ret |= PP_ATTRS_D;
588 	if (pte & PTE_A)
589 		ret |= PP_ATTRS_A;
590 	if (pte & PTE_W)
591 		ret |= PP_ATTRS_W;
592 	return ret;
593 }
594 
595 static inline pt_entry_t
596 pmap_pp_attrs_to_pte(uint8_t attrs)
597 {
598 	pt_entry_t pte = 0;
599 	if (attrs & PP_ATTRS_D)
600 		pte |= PTE_D;
601 	if (attrs & PP_ATTRS_A)
602 		pte |= PTE_A;
603 	if (attrs & PP_ATTRS_W)
604 		pte |= PTE_W;
605 	return pte;
606 }
607 
608 /*
609  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
610  * of course the kernel is always loaded
611  */
612 bool
613 pmap_is_curpmap(struct pmap *pmap)
614 {
615 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
616 }
617 
618 inline void
619 pmap_reference(struct pmap *pmap)
620 {
621 
622 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
623 }
624 
625 /*
626  * rbtree: compare two nodes.
627  */
628 static int
629 pmap_compare_nodes(void *context, const void *n1, const void *n2)
630 {
631 	const struct pv_entry *pve1 = n1;
632 	const struct pv_entry *pve2 = n2;
633 
634 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
635 
636 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
637 		return -1;
638 	}
639 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
640 		return 1;
641 	}
642 	return 0;
643 }
644 
645 /*
646  * rbtree: compare a node and a key.
647  */
648 static int
649 pmap_compare_key(void *context, const void *n, const void *k)
650 {
651 	const struct pv_entry *pve = n;
652 	const vaddr_t key = (vaddr_t)k;
653 
654 	if (pve->pve_pte.pte_va < key) {
655 		return -1;
656 	}
657 	if (pve->pve_pte.pte_va > key) {
658 		return 1;
659 	}
660 	return 0;
661 }
662 
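/*
 * With these ops, an existing pv_entry can be looked up by VA with the
 * generic rbtree interface, for example (sketch; per the comment in
 * pmap_init(), dynamically allocated kernel PV entries all live in the
 * single pmap_kernel_rb tree):
 *
 *	struct pv_entry *pve;
 *
 *	pve = rb_tree_find_node(&pmap_kernel_rb, (void *)va);
 */
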
663 /*
664  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
665  */
666 static inline void
667 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
668 {
669 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
670 
671 	if (va < *min) {
672 		*min = va;
673 	}
674 }
675 
676 /*
677  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
678  */
679 static inline void
680 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
681 {
682 	vaddr_t sclip;
683 
684 	if (ptp == NULL) {
685 		return;
686 	}
687 
688 	sclip = (vaddr_t)ptp->uanon;
689 	sclip = (*startva < sclip ? sclip : *startva);
690 	*pte += (sclip - *startva) / PAGE_SIZE;
691 	*startva = sclip;
692 }
693 
694 /*
695  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
696  *
697  * there are several pmaps involved.  some or all of them might be same.
698  *
699  *	- the pmap given by the first argument
700  *		our caller wants to access this pmap's PTEs.
701  *
702  *	- pmap_kernel()
703  *		the kernel pmap.  note that it only contains the kernel part
704  *		of the address space which is shared by any pmap.  ie. any
705  *		pmap can be used instead of pmap_kernel() for our purpose.
706  *
707  *	- ci->ci_pmap
708  *		pmap currently loaded on the cpu.
709  *
710  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
711  *		current process' pmap.
712  *
713  * => caller must lock pmap first (if not the kernel pmap)
714  * => must be undone with pmap_unmap_ptes before returning
715  * => disables kernel preemption
716  */
717 void
718 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
719     pd_entry_t * const **pdeppp)
720 {
721 	struct pmap *curpmap;
722 	struct cpu_info *ci;
723 	lwp_t *l;
724 
725 	kpreempt_disable();
726 
727 	/* The kernel's pmap is always accessible. */
728 	if (pmap == pmap_kernel()) {
729 		*pmap2 = NULL;
730 		*ptepp = PTE_BASE;
731 		*pdeppp = normal_pdes;
732 		return;
733 	}
734 
735 	KASSERT(mutex_owned(&pmap->pm_lock));
736 
737 	l = curlwp;
738 	ci = l->l_cpu;
739 	curpmap = ci->ci_pmap;
740 	if (pmap == curpmap) {
741 		/*
742 		 * Already on the CPU: make it valid.  This is very
743 		 * often the case during exit(), when we have switched
744 		 * to the kernel pmap in order to destroy a user pmap.
745 		 */
746 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
747 			pmap_reactivate(pmap);
748 		}
749 		*pmap2 = NULL;
750 	} else {
751 		/*
752 		 * Toss current pmap from CPU and install new pmap, but keep
753 		 * a reference to the old one.  Dropping the reference can
754 		 * block as it needs to take locks, so defer that to
755 		 * pmap_unmap_ptes().
756 		 */
757 		pmap_reference(pmap);
758 		pmap_load1(l, pmap, curpmap);
759 		*pmap2 = curpmap;
760 	}
761 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
762 #ifdef DIAGNOSTIC
763 	pmap->pm_ncsw = lwp_pctr();
764 #endif
765 	*ptepp = PTE_BASE;
766 
767 #if defined(XENPV) && defined(__x86_64__)
768 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
769 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
770 	*pdeppp = ci->ci_normal_pdes;
771 #else
772 	*pdeppp = normal_pdes;
773 #endif
774 }
775 
776 /*
777  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
778  *
779  * => we cannot tolerate context switches while mapped in: assert this.
780  * => reenables kernel preemption.
781  * => does not unlock pmap.
782  */
783 void
784 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
785 {
786 	struct cpu_info *ci;
787 	struct pmap *mypmap;
788 	struct lwp *l;
789 
790 	KASSERT(kpreempt_disabled());
791 
792 	/* The kernel's pmap is always accessible. */
793 	if (pmap == pmap_kernel()) {
794 		kpreempt_enable();
795 		return;
796 	}
797 
798 	l = curlwp;
799 	ci = l->l_cpu;
800 
801 	KASSERT(mutex_owned(&pmap->pm_lock));
802 	KASSERT(pmap->pm_ncsw == lwp_pctr());
803 
804 #if defined(XENPV) && defined(__x86_64__)
805 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
806 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
807 #endif
808 
809 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
810 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
811 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
812 	if (ci->ci_pmap == mypmap) {
813 		ci->ci_want_pmapload = 0;
814 	} else {
815 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
816 		ci->ci_tlbstate = TLBSTATE_LAZY;
817 	}
818 
819 	/* Now safe to re-enable preemption. */
820 	kpreempt_enable();
821 
822 	/* Toss reference to other pmap taken earlier. */
823 	if (pmap2 != NULL) {
824 		pmap_destroy(pmap2);
825 	}
826 }
827 
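/*
 * The pair above is always used as a bracket around direct PTE/PDE access,
 * roughly like this (sketch; per the comments above, the pmap is locked
 * first and stays locked until after pmap_unmap_ptes()):
 *
 *	struct pmap *pmap2;
 *	pt_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... examine or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */
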
828 inline static void
829 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
830 {
831 
832 #if !defined(__x86_64__)
833 	if (curproc == NULL || curproc->p_vmspace == NULL ||
834 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
835 		return;
836 
837 	if ((opte ^ npte) & PTE_X)
838 		pmap_update_pg(va);
839 
840 	/*
841 	 * Executability was removed on the last executable change.
842 	 * Reset the code segment to something conservative and
843 	 * let the trap handler deal with setting the right limit.
844 	 * We can't do that because of locking constraints on the vm map.
845 	 */
846 
847 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
848 		struct trapframe *tf = curlwp->l_md.md_regs;
849 
850 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
851 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
852 	}
853 #endif /* !defined(__x86_64__) */
854 }
855 
856 #if !defined(__x86_64__)
857 /*
858  * Fixup the code segment to cover all potential executable mappings.
859  * returns 0 if no changes to the code segment were made.
860  */
861 int
862 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
863 {
864 	struct vm_map_entry *ent;
865 	struct pmap *pm = vm_map_pmap(map);
866 	vaddr_t va = 0;
867 
868 	vm_map_lock_read(map);
869 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
870 		/*
871 		 * This entry has greater va than the entries before.
872 		 * We need to make it point to the last page, not past it.
873 		 */
874 		if (ent->protection & VM_PROT_EXECUTE)
875 			va = trunc_page(ent->end) - PAGE_SIZE;
876 	}
877 	vm_map_unlock_read(map);
878 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
879 		return 0;
880 
881 	pm->pm_hiexec = va;
882 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
883 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
884 	} else {
885 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
886 		return 0;
887 	}
888 	return 1;
889 }
890 #endif /* !defined(__x86_64__) */
891 
892 void
893 pat_init(struct cpu_info *ci)
894 {
895 	uint64_t pat;
896 
897 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
898 		return;
899 
900 	/* We change WT to WC. Leave all other entries the default values. */
901 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
902 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
903 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
904 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
905 
906 	wrmsr(MSR_CR_PAT, pat);
907 	cpu_pat_enabled = true;
908 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
909 }
910 
911 static pt_entry_t
912 pmap_pat_flags(u_int flags)
913 {
914 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
915 
916 	if (!cpu_pat_enabled) {
917 		switch (cacheflags) {
918 		case PMAP_NOCACHE:
919 		case PMAP_NOCACHE_OVR:
920 			/* results in PGC_UCMINUS on cpus which have
921 			 * the cpuid PAT but PAT "disabled"
922 			 */
923 			return PTE_PCD;
924 		default:
925 			return 0;
926 		}
927 	}
928 
929 	switch (cacheflags) {
930 	case PMAP_NOCACHE:
931 		return PGC_UC;
932 	case PMAP_WRITE_COMBINE:
933 		return PGC_WC;
934 	case PMAP_WRITE_BACK:
935 		return PGC_WB;
936 	case PMAP_NOCACHE_OVR:
937 		return PGC_UCMINUS;
938 	}
939 
940 	return 0;
941 }
942 
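/*
 * The cache mode is requested through the pmap_enter()/pmap_kenter_pa()
 * flags argument.  For instance, a device memory mapping that wants write
 * combining could be entered as (illustrative sketch):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *
 * which resolves to PGC_WC once the PAT has been programmed by pat_init(),
 * and falls back to the default cacheable mapping otherwise.
 */
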
943 /*
944  * p m a p   k e n t e r   f u n c t i o n s
945  *
946  * functions to quickly enter/remove pages from the kernel address
947  * space.   pmap_kremove is exported to MI kernel.  we make use of
948  * the recursive PTE mappings.
949  */
950 
951 /*
952  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
953  *
954  * => no need to lock anything, assume va is already allocated
955  * => should be faster than normal pmap enter function
956  */
957 void
958 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
959 {
960 	pt_entry_t *pte, opte, npte;
961 
962 	KASSERT(!(prot & ~VM_PROT_ALL));
963 
964 	if (va < VM_MIN_KERNEL_ADDRESS)
965 		pte = vtopte(va);
966 	else
967 		pte = kvtopte(va);
968 #if defined(XENPV) && defined(DOM0OPS)
969 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
970 #ifdef DEBUG
971 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
972 		    " outside range\n", __func__, pa, va);
973 #endif /* DEBUG */
974 		npte = pa;
975 	} else
976 #endif /* XENPV && DOM0OPS */
977 		npte = pmap_pa2pte(pa);
978 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
979 	npte |= pmap_pat_flags(flags);
980 	opte = pmap_pte_testset(pte, npte); /* zap! */
981 
982 	/*
983 	 * XXX: make sure we are not dealing with a large page, since the only
984 	 * large pages created are for the kernel image, and they should never
985 	 * be kentered.
986 	 */
987 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
988 
989 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
990 		/* This should not happen. */
991 		printf_nolog("%s: mapping already present\n", __func__);
992 		kpreempt_disable();
993 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
994 		kpreempt_enable();
995 	}
996 }
997 
998 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
999 
1000 #if defined(__x86_64__)
1001 /*
1002  * Change protection for a virtual address. Local for a CPU only, don't
1003  * care about TLB shootdowns.
1004  *
1005  * => must be called with preemption disabled
1006  */
1007 void
1008 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1009 {
1010 	pt_entry_t *pte, opte, npte;
1011 
1012 	KASSERT(kpreempt_disabled());
1013 
1014 	if (va < VM_MIN_KERNEL_ADDRESS)
1015 		pte = vtopte(va);
1016 	else
1017 		pte = kvtopte(va);
1018 
1019 	npte = opte = *pte;
1020 
1021 	if ((prot & VM_PROT_WRITE) != 0)
1022 		npte |= PTE_W;
1023 	else
1024 		npte &= ~(PTE_W|PTE_D);
1025 
1026 	if (opte != npte) {
1027 		pmap_pte_set(pte, npte);
1028 		pmap_pte_flush();
1029 		invlpg(va);
1030 	}
1031 }
1032 #endif /* defined(__x86_64__) */
1033 
1034 /*
1035  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1036  *
1037  * => no need to lock anything
1038  * => caller must dispose of any vm_page mapped in the va range
1039  * => note: not an inline function
1040  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1041  * => we assume kernel only unmaps valid addresses and thus don't bother
1042  *    checking the valid bit before doing TLB flushing
1043  * => must be followed by call to pmap_update() before reuse of page
1044  */
1045 static void
1046 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1047 {
1048 	pt_entry_t *pte, opte;
1049 	vaddr_t va, eva;
1050 
1051 	eva = sva + len;
1052 
1053 	kpreempt_disable();
1054 	for (va = sva; va < eva; va += PAGE_SIZE) {
1055 		pte = kvtopte(va);
1056 		opte = pmap_pte_testset(pte, 0); /* zap! */
1057 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1058 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1059 			    TLBSHOOT_KREMOVE);
1060 		}
1061 		KASSERTMSG((opte & PTE_PS) == 0,
1062 		    "va %#" PRIxVADDR " is a large page", va);
1063 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1064 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1065 	}
1066 	if (localonly) {
1067 		tlbflushg();
1068 	}
1069 	kpreempt_enable();
1070 }
1071 
1072 void
1073 pmap_kremove(vaddr_t sva, vsize_t len)
1074 {
1075 
1076 	pmap_kremove1(sva, len, false);
1077 }
1078 
1079 /*
1080  * pmap_kremove_local: like pmap_kremove(), but only worry about
1081  * TLB invalidations on the current CPU.  this is only intended
1082  * for use while writing kernel crash dumps, either after panic
1083  * or via reboot -d.
1084  */
1085 void
1086 pmap_kremove_local(vaddr_t sva, vsize_t len)
1087 {
1088 
1089 	pmap_kremove1(sva, len, true);
1090 }
1091 
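/*
 * Putting the kenter/kremove pair together, an unmanaged kernel mapping
 * typically has the following lifecycle (sketch; the VA would normally come
 * from uvm_km_alloc() or similar, and pmap_update() flushes any deferred
 * TLB shootdowns as noted above):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */
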
1092 /*
1093  * p m a p   i n i t   f u n c t i o n s
1094  *
1095  * pmap_bootstrap and pmap_init are called during system startup
1096  * to init the pmap module.   pmap_bootstrap() does a low level
1097  * init just to get things rolling.   pmap_init() finishes the job.
1098  */
1099 
1100 /*
1101  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1102  * This function is to be used before any VM system has been set up.
1103  *
1104  * The va is taken from virtual_avail.
1105  */
1106 static vaddr_t
1107 pmap_bootstrap_valloc(size_t npages)
1108 {
1109 	vaddr_t va = virtual_avail;
1110 	virtual_avail += npages * PAGE_SIZE;
1111 	return va;
1112 }
1113 
1114 /*
1115  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1116  * This function is to be used before any VM system has been set up.
1117  *
1118  * The pa is taken from avail_start.
1119  */
1120 static paddr_t
1121 pmap_bootstrap_palloc(size_t npages)
1122 {
1123 	paddr_t pa = avail_start;
1124 	avail_start += npages * PAGE_SIZE;
1125 	return pa;
1126 }
1127 
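/*
 * The two bootstrap allocators are typically used in pairs further down in
 * pmap_bootstrap(), e.g. reserving one page of KVA and one physical page
 * for the LAPIC (see pmap_init_lapic()):
 *
 *	local_apic_va = pmap_bootstrap_valloc(1);
 *	local_apic_pa = pmap_bootstrap_palloc(1);
 */
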
1128 /*
1129  * pmap_bootstrap: get the system in a state where it can run with VM properly
1130  * enabled (called before main()). The VM system is fully init'd later.
1131  *
1132  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1133  *    kernel, and nkpde PTP's for the kernel.
1134  * => kva_start is the first free virtual address in kernel space.
1135  */
1136 void
1137 pmap_bootstrap(vaddr_t kva_start)
1138 {
1139 	struct pmap *kpm;
1140 	int i;
1141 	vaddr_t kva;
1142 
1143 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1144 
1145 	/*
1146 	 * Set up our local static global vars that keep track of the usage of
1147 	 * KVM before kernel_map is set up.
1148 	 */
1149 	virtual_avail = kva_start;		/* first free KVA */
1150 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1151 
1152 	/*
1153 	 * Set up protection_codes: we need to be able to convert from a MI
1154 	 * protection code (some combo of VM_PROT...) to something we can jam
1155 	 * into a x86 PTE.
1156 	 */
1157 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1158 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1159 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1160 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1161 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1162 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1163 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1164 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1165 
1166 	/*
1167 	 * Now we init the kernel's pmap.
1168 	 *
1169 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1170 	 * the pm_obj contains the list of active PTPs.
1171 	 */
1172 	kpm = pmap_kernel();
1173 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1174 	rw_init(&kpm->pm_dummy_lock);
1175 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1176 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1177 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1178 		kpm->pm_ptphint[i] = NULL;
1179 	}
1180 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1181 
1182 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1183 	for (i = 0; i < PDP_SIZE; i++)
1184 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1185 
1186 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1187 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1188 
1189 	kcpuset_create(&kpm->pm_cpus, true);
1190 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1191 
1192 	kpm->pm_ldt = NULL;
1193 	kpm->pm_ldt_len = 0;
1194 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1195 
1196 	/*
1197 	 * the above is just a rough estimate and not critical to the proper
1198 	 * operation of the system.
1199 	 */
1200 
1201 #if !defined(XENPV)
1202 	/*
1203 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1204 	 * attribute to already mapped kernel pages. Do that only if SVS is
1205 	 * disabled.
1206 	 *
1207 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1208 	 * happens later in cpu_init().
1209 	 */
1210 #ifdef SVS
1211 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1212 #else
1213 	if (cpu_feature[0] & CPUID_PGE) {
1214 #endif
1215 		pmap_pg_g = PTE_G;
1216 		pmap_remap_global();
1217 	}
1218 #endif
1219 
1220 #ifndef XENPV
1221 	/*
1222 	 * Enable large pages if they are supported.
1223 	 */
1224 	if (cpu_feature[0] & CPUID_PSE) {
1225 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1226 		pmap_largepages = 1;	/* enable software */
1227 
1228 		/*
1229 		 * The TLB must be flushed after enabling large pages on Pentium
1230 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1231 		 * Software Developer's Manual, Volume 3: System Programming".
1232 		 */
1233 		tlbflushg();
1234 
1235 		/* Remap the kernel. */
1236 		pmap_remap_largepages();
1237 	}
1238 	pmap_init_lapic();
1239 #endif /* !XENPV */
1240 
1241 #ifdef __HAVE_PCPU_AREA
1242 	pmap_init_pcpu();
1243 #endif
1244 
1245 #ifdef __HAVE_DIRECT_MAP
1246 	pmap_init_directmap(kpm);
1247 #else
1248 	pmap_vpage_cpualloc(&cpu_info_primary);
1249 
1250 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1251 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1252 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1253 	} else { /* amd64 */
1254 		/*
1255 		 * zero_pte is stuck at the end of mapped space for the kernel
1256 		 * image (disjunct from kva space). This is done so that it
1257 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1258 		 * when it's called for the first time.
1259 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1260 		 */
1261 #ifdef XENPV
1262 		/* early_zerop initialized in xen_locore() */
1263 #else
1264 		early_zerop = (void *)bootspace.spareva;
1265 #endif
1266 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1267 	}
1268 #endif
1269 
1270 #if defined(XENPV) && defined(__x86_64__)
1271 	extern vaddr_t xen_dummy_page;
1272 	paddr_t xen_dummy_user_pgd;
1273 
1274 	/*
1275 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1276 	 * Xen will still consider it active. So we set user PGD to this one
1277 	 * to lift all protection on the now inactive page tables set.
1278 	 */
1279 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1280 
1281 	/* Zero fill it, the less checks in Xen it requires the better */
1282 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1283 	/* Mark read-only */
1284 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1285 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1286 	    UVMF_INVLPG);
1287 	/* Pin as L4 */
1288 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1289 #endif
1290 
1291 	/*
1292 	 * Allocate space for the IDT, GDT and LDT.
1293 	 */
1294 #ifdef __HAVE_PCPU_AREA
1295 	idt_vaddr = (vaddr_t)&pcpuarea->idt;
1296 #else
1297 	idt_vaddr = pmap_bootstrap_valloc(1);
1298 #endif
1299 	idt_paddr = pmap_bootstrap_palloc(1);
1300 
1301 	gdt_vaddr = pmap_bootstrap_valloc(1);
1302 	gdt_paddr = pmap_bootstrap_palloc(1);
1303 
1304 #ifdef __HAVE_PCPU_AREA
1305 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1306 #else
1307 	ldt_vaddr = pmap_bootstrap_valloc(1);
1308 #endif
1309 	ldt_paddr = pmap_bootstrap_palloc(1);
1310 
1311 #if !defined(__x86_64__)
1312 	/* pentium f00f bug stuff */
1313 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1314 #endif
1315 
1316 #if defined(XENPVHVM)
1317 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1318 	extern paddr_t HYPERVISOR_shared_info_pa;
1319 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1320 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1321 
1322 	HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1323 	HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1324 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1325 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1326 #endif
1327 	/*
1328 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1329 	 */
1330 	virtual_avail = reserve_dumppages(virtual_avail);
1331 
1332 	/*
1333 	 * Init the global lock and global list.
1334 	 */
1335 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1336 	LIST_INIT(&pmaps);
1337 
1338 	/*
1339 	 * Ensure the TLB is sync'd with reality by flushing it...
1340 	 */
1341 	tlbflushg();
1342 
1343 	/*
1344 	 * Calculate pmap_maxkvaddr from nkptp[].
1345 	 */
1346 	kva = VM_MIN_KERNEL_ADDRESS;
1347 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1348 		kva += nkptp[i] * nbpd[i];
1349 	}
1350 	pmap_maxkvaddr = kva;
1351 }
1352 
1353 #ifndef XENPV
1354 static void
1355 pmap_init_lapic(void)
1356 {
1357 	/*
1358 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1359 	 * x86 implementation relies a lot on this address to be valid; so just
1360 	 * allocate a fake physical page that will be kentered into
1361 	 * local_apic_va by machdep.
1362 	 *
1363 	 * If the LAPIC is present, the va will be remapped somewhere else
1364 	 * later in lapic_map.
1365 	 */
1366 	local_apic_va = pmap_bootstrap_valloc(1);
1367 	local_apic_pa = pmap_bootstrap_palloc(1);
1368 }
1369 #endif
1370 
1371 #ifdef __x86_64__
1372 static size_t
1373 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1374 {
1375 	size_t npages;
1376 	npages = (roundup(endva, pgsz) / pgsz) -
1377 	    (rounddown(startva, pgsz) / pgsz);
1378 	return npages;
1379 }
1380 #endif
1381 
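/*
 * Worked example for pmap_pagetree_nentries_range() (illustrative): a range
 * that straddles one NBPD_L4 boundary occupies two L4 slots, so with
 * "boundary" a multiple of NBPD_L4,
 *
 *	pmap_pagetree_nentries_range(boundary - PAGE_SIZE,
 *	    boundary + PAGE_SIZE, NBPD_L4)
 *
 * returns 2, while a range fully contained in one slot returns 1.
 */
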
1382 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1383 static inline void
1384 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1385 {
1386 	size_t sslot = slotspace.area[type].sslot;
1387 	size_t nslot = slotspace.area[type].nslot;
1388 
1389 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1390 }
1391 #endif
1392 
1393 #ifdef __x86_64__
1394 /*
1395  * Randomize the location of an area. We count the holes in the VM space. We
1396  * randomly select one hole, and then randomly select an area within that hole.
1397  * Finally we update the associated entry in the slotspace structure.
1398  */
1399 vaddr_t __noasan
1400 slotspace_rand(int type, size_t sz, size_t align)
1401 {
1402 	struct {
1403 		int start;
1404 		int end;
1405 	} holes[SLSPACE_NAREAS+1];
1406 	size_t i, nholes, hole;
1407 	size_t startsl, endsl, nslots, winsize;
1408 	vaddr_t startva, va;
1409 
1410 	sz = roundup(sz, align);
1411 
1412 	/*
1413 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1414 	 * an area that crosses slots:
1415 	 *     +------+------+------+
1416 	 *     | Slot | Slot | Slot |
1417 	 *     +------+------+------+
1418 	 *        [Chosen Area]
1419 	 * And in that case we must take into account the additional slot
1420 	 * consumed.
1421 	 */
1422 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1423 
1424 	/* Get the holes. */
1425 	nholes = 0;
1426 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1427 	while (1) {
1428 		/*
1429 		 * Find the first occupied slot after the current one.
1430 		 * The area between the two is a hole.
1431 		 */
1432 		size_t minsslot = 512;
1433 		size_t minnslot = 0;
1434 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1435 			if (!slotspace.area[i].active)
1436 				continue;
1437 			if (slotspace.area[i].sslot >= curslot &&
1438 			    slotspace.area[i].sslot < minsslot) {
1439 				minsslot = slotspace.area[i].sslot;
1440 				minnslot = slotspace.area[i].nslot;
1441 			}
1442 		}
1443 
1444 		/* No hole anymore, stop here. */
1445 		if (minsslot == 512) {
1446 			break;
1447 		}
1448 
1449 		/* Register the hole. */
1450 		if (minsslot - curslot >= nslots) {
1451 			holes[nholes].start = curslot;
1452 			holes[nholes].end = minsslot;
1453 			nholes++;
1454 		}
1455 
1456 		/* Skip that hole, and iterate again. */
1457 		curslot = minsslot + minnslot;
1458 	}
1459 
1460 	if (nholes == 0) {
1461 		panic("%s: impossible", __func__);
1462 	}
1463 
1464 	/* Select a hole. */
1465 	cpu_earlyrng(&hole, sizeof(hole));
1466 #ifdef NO_X86_ASLR
1467 	hole = 0;
1468 #endif
1469 	hole %= nholes;
1470 	startsl = holes[hole].start;
1471 	endsl = holes[hole].end;
1472 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1473 
1474 	/* Select an area within the hole. */
1475 	cpu_earlyrng(&va, sizeof(va));
1476 #ifdef NO_X86_ASLR
1477 	va = 0;
1478 #endif
1479 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1480 	va %= winsize;
1481 	va = rounddown(va, align);
1482 	va += startva;
1483 
1484 	/* Update the entry. */
1485 	slotspace.area[type].sslot = pl4_i(va);
1486 	slotspace.area[type].nslot =
1487 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1488 	slotspace.area[type].active = true;
1489 
1490 	return va;
1491 }
1492 #endif
1493 
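/*
 * Callers pass slotspace_rand() the area type, the size they need and the
 * required alignment; the direct map setup below is the main consumer, e.g.:
 *
 *	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2);
 *
 * (see pmap_init_directmap()).
 */
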
1494 #ifdef __HAVE_PCPU_AREA
1495 static void
1496 pmap_init_pcpu(void)
1497 {
1498 	const vaddr_t startva = PMAP_PCPU_BASE;
1499 	size_t nL4e, nL3e, nL2e, nL1e;
1500 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1501 	paddr_t pa;
1502 	vaddr_t endva;
1503 	vaddr_t tmpva;
1504 	pt_entry_t *pte;
1505 	size_t size;
1506 	int i;
1507 
1508 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1509 
1510 	size = sizeof(struct pcpu_area);
1511 
1512 	endva = startva + size;
1513 
1514 	/* We will use this temporary va. */
1515 	tmpva = bootspace.spareva;
1516 	pte = PTE_BASE + pl1_i(tmpva);
1517 
1518 	/* Build L4 */
1519 	L4e_idx = pl4_i(startva);
1520 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1521 	KASSERT(nL4e  == 1);
1522 	for (i = 0; i < nL4e; i++) {
1523 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1524 
1525 		pa = pmap_bootstrap_palloc(1);
1526 		*pte = (pa & PTE_FRAME) | pteflags;
1527 		pmap_update_pg(tmpva);
1528 		memset((void *)tmpva, 0, PAGE_SIZE);
1529 
1530 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1531 	}
1532 
1533 	/* Build L3 */
1534 	L3e_idx = pl3_i(startva);
1535 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1536 	for (i = 0; i < nL3e; i++) {
1537 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1538 
1539 		pa = pmap_bootstrap_palloc(1);
1540 		*pte = (pa & PTE_FRAME) | pteflags;
1541 		pmap_update_pg(tmpva);
1542 		memset((void *)tmpva, 0, PAGE_SIZE);
1543 
1544 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1545 	}
1546 
1547 	/* Build L2 */
1548 	L2e_idx = pl2_i(startva);
1549 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1550 	for (i = 0; i < nL2e; i++) {
1551 
1552 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1553 
1554 		pa = pmap_bootstrap_palloc(1);
1555 		*pte = (pa & PTE_FRAME) | pteflags;
1556 		pmap_update_pg(tmpva);
1557 		memset((void *)tmpva, 0, PAGE_SIZE);
1558 
1559 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1560 	}
1561 
1562 	/* Build L1 */
1563 	L1e_idx = pl1_i(startva);
1564 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1565 	for (i = 0; i < nL1e; i++) {
1566 		/*
1567 		 * Nothing to do, the PTEs will be entered via
1568 		 * pmap_kenter_pa.
1569 		 */
1570 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1571 	}
1572 
1573 	*pte = 0;
1574 	pmap_update_pg(tmpva);
1575 
1576 	pcpuarea = (struct pcpu_area *)startva;
1577 
1578 	tlbflush();
1579 }
1580 #endif
1581 
1582 #ifdef __HAVE_DIRECT_MAP
1583 /*
1584  * Create the amd64 direct map. Called only once at boot time. We map all of
1585  * the physical memory contiguously using 2MB large pages, with RW permissions.
1586  * However there is a hole: the kernel is mapped with RO permissions.
1587  */
1588 static void
1589 pmap_init_directmap(struct pmap *kpm)
1590 {
1591 	extern phys_ram_seg_t mem_clusters[];
1592 	extern int mem_cluster_cnt;
1593 
1594 	vaddr_t startva;
1595 	size_t nL4e, nL3e, nL2e;
1596 	size_t L4e_idx, L3e_idx, L2e_idx;
1597 	size_t spahole, epahole;
1598 	paddr_t lastpa, pa;
1599 	vaddr_t endva;
1600 	vaddr_t tmpva;
1601 	pt_entry_t *pte;
1602 	phys_ram_seg_t *mc;
1603 	int i;
1604 
1605 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1606 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1607 
1608 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1609 
1610 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1611 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1612 
1613 	/* Get the last physical address available */
1614 	lastpa = 0;
1615 	for (i = 0; i < mem_cluster_cnt; i++) {
1616 		mc = &mem_clusters[i];
1617 		lastpa = MAX(lastpa, mc->start + mc->size);
1618 	}
1619 
1620 	/*
1621 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1622 	 */
1623 	if (lastpa > MAXPHYSMEM) {
1624 		panic("pmap_init_directmap: lastpa incorrect");
1625 	}
1626 
1627 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2);
1628 	endva = startva + lastpa;
1629 
1630 	/* We will use this temporary va. */
1631 	tmpva = bootspace.spareva;
1632 	pte = PTE_BASE + pl1_i(tmpva);
1633 
1634 	/* Build L4 */
1635 	L4e_idx = pl4_i(startva);
1636 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1637 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1638 	for (i = 0; i < nL4e; i++) {
1639 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1640 
1641 		pa = pmap_bootstrap_palloc(1);
1642 		*pte = (pa & PTE_FRAME) | pteflags;
1643 		pmap_update_pg(tmpva);
1644 		memset((void *)tmpva, 0, PAGE_SIZE);
1645 
1646 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1647 	}
1648 
1649 	/* Build L3 */
1650 	L3e_idx = pl3_i(startva);
1651 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1652 	for (i = 0; i < nL3e; i++) {
1653 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1654 
1655 		pa = pmap_bootstrap_palloc(1);
1656 		*pte = (pa & PTE_FRAME) | pteflags;
1657 		pmap_update_pg(tmpva);
1658 		memset((void *)tmpva, 0, PAGE_SIZE);
1659 
1660 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1661 	}
1662 
1663 	/* Build L2 */
1664 	L2e_idx = pl2_i(startva);
1665 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1666 	for (i = 0; i < nL2e; i++) {
1667 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1668 
1669 		pa = (paddr_t)(i * NBPD_L2);
1670 
1671 		if (spahole <= pa && pa < epahole) {
1672 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1673 			    PTE_PS | pmap_pg_g;
1674 		} else {
1675 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1676 			    PTE_PS | pmap_pg_g;
1677 		}
1678 	}
1679 
1680 	*pte = 0;
1681 	pmap_update_pg(tmpva);
1682 
1683 	pmap_direct_base = startva;
1684 	pmap_direct_end = endva;
1685 
1686 	tlbflush();
1687 }
1688 #endif /* __HAVE_DIRECT_MAP */
1689 
1690 #if !defined(XENPV)
1691 /*
1692  * Remap all of the virtual pages created so far with the PTE_G bit.
1693  */
1694 static void
1695 pmap_remap_global(void)
1696 {
1697 	vaddr_t kva, kva_end;
1698 	unsigned long p1i;
1699 	size_t i;
1700 
1701 	/* head */
1702 	kva = bootspace.head.va;
1703 	kva_end = kva + bootspace.head.sz;
1704 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1705 		p1i = pl1_i(kva);
1706 		if (pmap_valid_entry(PTE_BASE[p1i]))
1707 			PTE_BASE[p1i] |= pmap_pg_g;
1708 	}
1709 
1710 	/* kernel segments */
1711 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1712 		if (bootspace.segs[i].type == BTSEG_NONE) {
1713 			continue;
1714 		}
1715 		kva = bootspace.segs[i].va;
1716 		kva_end = kva + bootspace.segs[i].sz;
1717 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1718 			p1i = pl1_i(kva);
1719 			if (pmap_valid_entry(PTE_BASE[p1i]))
1720 				PTE_BASE[p1i] |= pmap_pg_g;
1721 		}
1722 	}
1723 
1724 	/* boot space */
1725 	kva = bootspace.boot.va;
1726 	kva_end = kva + bootspace.boot.sz;
1727 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1728 		p1i = pl1_i(kva);
1729 		if (pmap_valid_entry(PTE_BASE[p1i]))
1730 			PTE_BASE[p1i] |= pmap_pg_g;
1731 	}
1732 }
1733 #endif
1734 
1735 #ifndef XENPV
1736 /*
1737  * Remap several kernel segments with large pages. We cover as many pages as we
1738  * can. Called only once at boot time, if the CPU supports large pages.
1739  */
1740 static void
1741 pmap_remap_largepages(void)
1742 {
1743 	pd_entry_t *pde;
1744 	vaddr_t kva, kva_end;
1745 	paddr_t pa;
1746 	size_t i;
1747 
1748 	/* Remap the kernel text using large pages. */
1749 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1750 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1751 			continue;
1752 		}
1753 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1754 		if (kva < bootspace.segs[i].va) {
1755 			continue;
1756 		}
1757 		kva_end = rounddown(bootspace.segs[i].va +
1758 			bootspace.segs[i].sz, NBPD_L2);
1759 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1760 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1761 			pde = &L2_BASE[pl2_i(kva)];
1762 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1763 			tlbflushg();
1764 		}
1765 	}
1766 
1767 	/* Remap the kernel rodata using large pages. */
1768 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1769 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1770 			continue;
1771 		}
1772 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1773 		if (kva < bootspace.segs[i].va) {
1774 			continue;
1775 		}
1776 		kva_end = rounddown(bootspace.segs[i].va +
1777 			bootspace.segs[i].sz, NBPD_L2);
1778 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1779 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1780 			pde = &L2_BASE[pl2_i(kva)];
1781 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1782 			tlbflushg();
1783 		}
1784 	}
1785 
1786 	/* Remap the kernel data+bss using large pages. */
1787 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1788 		if (bootspace.segs[i].type != BTSEG_DATA) {
1789 			continue;
1790 		}
1791 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1792 		if (kva < bootspace.segs[i].va) {
1793 			continue;
1794 		}
1795 		kva_end = rounddown(bootspace.segs[i].va +
1796 			bootspace.segs[i].sz, NBPD_L2);
1797 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1798 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1799 			pde = &L2_BASE[pl2_i(kva)];
1800 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1801 			tlbflushg();
1802 		}
1803 	}
1804 }
1805 #endif /* !XENPV */
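
/*
 * Worked example of the alignment arithmetic above, with hypothetical
 * numbers and NBPD_L2 == 2MB:
 *
 *	seg.va  = 0xffffffff80201000, seg.sz = 0x00c00000
 *	kva     = roundup(seg.va, NBPD_L2)            = 0xffffffff80400000
 *	kva_end = rounddown(seg.va + seg.sz, NBPD_L2) = 0xffffffff80e00000
 *
 * so (kva_end - kva) / NBPD_L2 = 5 large (2MB) mappings replace the 4KB
 * mappings in [kva, kva_end), while the unaligned head and tail of the
 * segment keep their small pages.  The physical address is rounded up
 * the same way, which presumes the boot segments are laid out with VA
 * and PA congruent modulo NBPD_L2.
 */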
1806 
1807 /*
1808  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1809  * to manage mappings.
1810  */
1811 void
1812 pmap_init(void)
1813 {
1814 	int flags;
1815 
1816 	/*
1817 	 * initialize caches.
1818 	 */
1819 
1820 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1821 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1822 
1823 #ifdef XENPV
1824 	/*
1825 	 * pool_cache(9) should not touch cached objects, since they
1826 	 * are pinned on xen and R/O for the domU
1827 	 */
1828 	flags = PR_NOTOUCH;
1829 #else
1830 	flags = 0;
1831 #endif
1832 
1833 #ifdef PAE
1834 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1835 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1836 #else
1837 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1838 	    "pdppl", NULL, IPL_NONE);
1839 #endif
1840 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry),
1841 #ifdef _LP64
1842 	    coherency_unit,
1843 #else
1844 	    coherency_unit / 2,
1845 #endif
1846 	     0, PR_LARGECACHE, "pvpl", &pool_allocator_kmem,
1847 	    IPL_NONE, NULL, NULL, NULL);
1848 
1849 	pmap_tlb_init();
1850 
1851 	/* XXX: cpu_hatch() runs only on secondary CPUs, so set up the boot CPU here. */
1852 	pmap_tlb_cpu_init(curcpu());
1853 
1854 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1855 	    NULL, "x86", "io bitmap copy");
1856 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1857 	    NULL, "x86", "ldt sync");
1858 
1859 	/*
1860 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1861 	 * to hang a tree of pv_entry records.  Dynamically allocated
1862 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1863 	 * usual case is embedded), so cop out and use a single RB tree
1864 	 * to cover them.
1865 	 */
1866 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1867 
1868 	/*
1869 	 * done: pmap module is up (and ready for business)
1870 	 */
1871 
1872 	pmap_initialized = true;
1873 }
1874 
1875 #ifndef XENPV
1876 /*
1877  * pmap_cpu_init_late: perform late per-CPU initialization.
1878  */
1879 void
1880 pmap_cpu_init_late(struct cpu_info *ci)
1881 {
1882 	/*
1883 	 * The BP already has its own PD page allocated during early
1884 	 * MD startup.
1885 	 */
1886 	if (ci == &cpu_info_primary)
1887 		return;
1888 #ifdef PAE
1889 	cpu_alloc_l3_page(ci);
1890 #endif
1891 }
1892 #endif
1893 
1894 #ifndef __HAVE_DIRECT_MAP
1895 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1896 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1897 
1898 static void
1899 pmap_vpage_cpualloc(struct cpu_info *ci)
1900 {
1901 	bool primary = (ci == &cpu_info_primary);
1902 	size_t i, npages;
1903 	vaddr_t vabase;
1904 	vsize_t vrange;
1905 
1906 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1907 	KASSERT(npages >= VPAGE_MAX);
1908 	vrange = npages * PAGE_SIZE;
1909 
1910 	if (primary) {
1911 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1912 			/* Waste some pages to align properly */
1913 		}
1914 		/* The base is aligned, allocate the rest (contiguous) */
1915 		pmap_bootstrap_valloc(npages - 1);
1916 	} else {
1917 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1918 		    UVM_KMF_VAONLY);
1919 		if (vabase == 0) {
1920 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1921 			    __func__, cpu_index(ci));
1922 		}
1923 	}
1924 
1925 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1926 
1927 	for (i = 0; i < VPAGE_MAX; i++) {
1928 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1929 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1930 	}
1931 }
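
/*
 * A worked example of the sizing above, assuming 64-byte cache lines and
 * 8-byte PTEs (the usual LP64 case; the CTASSERTs only guarantee the
 * general relationship):
 *
 *	npages = 64 / 8 = 8,  so  vrange = 8 * PAGE_SIZE = 32KB
 *
 * Aligning vabase to vrange makes pl1_i(vabase) a multiple of npages, so
 * the eight PTEs backing this CPU's vpages fill exactly one cache line
 * (this is what the KASSERT on &PTE_BASE[pl1_i(vabase)] verifies), which
 * keeps one CPU's temporary-mapping PTEs from false-sharing a line with
 * another CPU's.
 */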
1932 
1933 void
1934 pmap_vpage_cpu_init(struct cpu_info *ci)
1935 {
1936 	if (ci == &cpu_info_primary) {
1937 		/* cpu0 already taken care of in pmap_bootstrap */
1938 		return;
1939 	}
1940 
1941 	pmap_vpage_cpualloc(ci);
1942 }
1943 #endif
1944 
1945 /*
1946  * p v _ e n t r y   f u n c t i o n s
1947  */
1948 
1949 /*
1950  * pmap_free_pvs: free a linked list of pv entries.  the pv entries have
1951  * been removed from their respective pages, but are still entered into the
1952  * map and we must undo that.
1953  *
1954  * => must be called with pmap locked.
1955  */
1956 static void
1957 pmap_free_pvs(struct pmap *pmap, struct pv_entry *pve)
1958 {
1959 	struct pv_entry *next;
1960 
1961 	KASSERT(mutex_owned(&pmap->pm_lock));
1962 
1963 	for ( /* null */ ; pve != NULL ; pve = next) {
1964 		next = pve->pve_next;
1965 		pool_cache_put(&pmap_pv_cache, pve);
1966 	}
1967 }
1968 
1969 /*
1970  * pmap_check_pv: verify that a {VA, PTP} pair is tracked (or not) by the page, as expected
1971  */
1972 static void
1973 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
1974     vaddr_t va, bool tracked)
1975 {
1976 #ifdef DEBUG
1977 	struct pv_pte *pvpte;
1978 
1979 	PMAP_CHECK_PP(pp);
1980 
1981 	mutex_spin_enter(&pp->pp_lock);
1982 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
1983 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
1984 			break;
1985 		}
1986 	}
1987 	mutex_spin_exit(&pp->pp_lock);
1988 
1989 	if (pvpte && !tracked) {
1990 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
1991 	} else if (!pvpte && tracked) {
1992 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
1993 	}
1994 #endif
1995 }
1996 
1997 /*
1998  * pmap_treelookup_pv: search the PV tree for a dynamic entry
1999  *
2000  * => pmap must be locked
2001  */
2002 static struct pv_entry *
2003 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2004     const rb_tree_t *tree, const vaddr_t va)
2005 {
2006 	struct pv_entry *pve;
2007 	rb_node_t *node;
2008 
2009 	/*
2010 	 * Inlined lookup tailored to exactly what's needed here; it is
2011 	 * quite a bit faster than using rb_tree_find_node().
2012 	 */
2013 	for (node = tree->rbt_root;;) {
2014 		if (__predict_false(RB_SENTINEL_P(node))) {
2015 			return NULL;
2016 		}
2017 		pve = (struct pv_entry *)
2018 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2019 		if (pve->pve_pte.pte_va == va) {
2020 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2021 			return pve;
2022 		}
2023 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2024 	}
2025 }
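
/*
 * For reference, a sketch of the generic lookup that the loop above
 * open-codes.  It assumes pmap_rbtree_ops' compare_key operation takes a
 * pointer to the VA as its key (not shown in this part of the file), and
 * treelookup_generic() itself is hypothetical, not used anywhere:
 *
 *	static struct pv_entry *
 *	treelookup_generic(rb_tree_t *tree, vaddr_t va)
 *	{
 *		return rb_tree_find_node(tree, &va);
 *	}
 *
 * The hand-rolled loop avoids the indirect calls through the ops vector
 * on this very hot path.
 */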
2026 
2027 /*
2028  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2029  *
2030  * => a PV entry must be known present (doesn't check for existence)
2031  * => pmap must be locked
2032  */
2033 static struct pv_entry *
2034 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2035     const struct pmap_page * const old_pp, const vaddr_t va)
2036 {
2037 	struct pv_entry *pve;
2038 	const rb_tree_t *tree;
2039 
2040 	KASSERT(mutex_owned(&pmap->pm_lock));
2041 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2042 
2043 	/*
2044 	 * [This mostly deals with the case of process-private pages, i.e.
2045 	 * anonymous memory allocations or COW.]
2046 	 *
2047 	 * If the page is tracked with an embedded entry then the tree
2048 	 * lookup can be avoided.  It's safe to check for this specific
2049 	 * set of values without pp_lock because both will only ever be
2050 	 * set together for this pmap.
2051 	 *
2052 	 */
2053 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2054 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2055 		return NULL;
2056 	}
2057 
2058 	/*
2059 	 * [This mostly deals with shared mappings, for example shared libs
2060 	 * and executables.]
2061 	 *
2062 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2063 	 * look at the lowest numbered node in the tree first.  The tree is
2064 	 * known non-empty because of the check above.  For short lived
2065 	 * processes where pmap_remove() isn't used much this gets close to
2066 	 * a 100% hit rate.
2067 	 */
2068 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2069 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2070 	pve = (struct pv_entry *)
2071 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2072 	    offsetof(struct pv_entry, pve_rb));
2073 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2074 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2075 		return pve;
2076 	}
2077 
2078 	/* Search the RB tree for the key (uncommon). */
2079 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2080 }
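
/*
 * Both lookups above recover the pv_entry from its embedded rb node with
 * the usual container-of arithmetic.  A hypothetical helper (not defined
 * in this file) spelling the idiom out:
 *
 *	#define PVE_FROM_RBNODE(node)					\
 *		((struct pv_entry *)((uintptr_t)(node) -		\
 *		    offsetof(struct pv_entry, pve_rb)))
 *
 * i.e. tree->rbt_root and rbt_minmax[RB_DIR_LEFT] point at pve_rb
 * members, and subtracting the member offset yields the enclosing
 * pv_entry.
 */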
2081 
2082 /*
2083  * pmap_enter_pv: enter a mapping onto a pmap_page list
2084  *
2085  * => pmap must be locked
2086  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2087  */
2088 static int
2089 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2090     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2091     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2092 {
2093 	struct pv_entry *pve;
2094 	int error;
2095 
2096 	KASSERT(mutex_owned(&pmap->pm_lock));
2097 	KASSERT(ptp_to_pmap(ptp) == pmap);
2098 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2099 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2100 	PMAP_CHECK_PP(pp);
2101 
2102 	/*
2103 	 * If entering the same page and it's already tracked with an
2104 	 * embedded entry, we can avoid the expense below.  It's safe
2105 	 * to check for this very specific set of values without a lock
2106 	 * because both will only ever be set together for this pmap.
2107 	 */
2108 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2109 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2110 		*samepage = true;
2111 		pmap_check_pv(pmap, ptp, pp, va, true);
2112 		return 0;
2113 	}
2114 
2115 	/*
2116 	 * Check for an existing dynamic mapping at this address.  If it's
2117 	 * for the same page, then it will be reused and nothing needs to be
2118 	 * changed.
2119 	 */
2120 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2121 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2122 		*samepage = true;
2123 		pmap_check_pv(pmap, ptp, pp, va, true);
2124 		return 0;
2125 	}
2126 
2127 	/*
2128 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2129 	 * case it's needed; won't know for sure until the lock is taken.
2130 	 */
2131 	if (pmap->pm_pve == NULL) {
2132 		pmap->pm_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
2133 	}
2134 
2135 	error = 0;
2136 	pmap_check_pv(pmap, ptp, pp, va, false);
2137 	mutex_spin_enter(&pp->pp_lock);
2138 	if (!pv_pte_embedded(pp)) {
2139 		/*
2140 		 * Embedded PV tracking available - easy.
2141 		 */
2142 		pp->pp_pte.pte_ptp = ptp;
2143 		pp->pp_pte.pte_va = va;
2144 		*new_embedded = true;
2145 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2146 		/*
2147 		 * No memory.
2148 		 */
2149 		error = ENOMEM;
2150 	} else {
2151 		/*
2152 		 * Install new pv_entry on the page.
2153 		 */
2154 		pve = pmap->pm_pve;
2155 		pmap->pm_pve = NULL;
2156 		*new_pve = pve;
2157 		pve->pve_pte.pte_ptp = ptp;
2158 		pve->pve_pte.pte_va = va;
2159 		pve->pve_pp = pp;
2160 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2161 	}
2162 	mutex_spin_exit(&pp->pp_lock);
2163 	if (error == 0) {
2164 		pmap_check_pv(pmap, ptp, pp, va, true);
2165 	}
2166 
2167 	return error;
2168 }
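
/*
 * Rough sketch of how a caller is expected to consume the out
 * parameters; this is not the actual pmap_enter() code, only the
 * contract implied by the comments above (the dynamic entry is inserted
 * into the tree only after the caller has committed the new PTE):
 *
 *	struct pv_entry *new_pve = NULL, *old_pve = NULL;
 *	bool samepage = false, new_embedded = false;
 *
 *	error = pmap_enter_pv(pmap, pp, ptp, va, &new_pve, &old_pve,
 *	    &samepage, &new_embedded, tree);
 *	if (error == 0) {
 *		(... install the PTE ...)
 *		if (new_pve != NULL)
 *			rb_tree_insert_node(tree, new_pve);
 *	}
 */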
2169 
2170 /*
2171  * pmap_remove_pv: try to remove a mapping from a pv_list
2172  *
2173  * => pmap must be locked
2174  * => removes dynamic entries from tree
2175  * => caller should adjust ptp's wire_count and free PTP if needed
2176  */
2177 static void
2178 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2179     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2180 {
2181 	rb_tree_t *tree = (ptp != NULL ?
2182 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2183 
2184 	KASSERT(mutex_owned(&pmap->pm_lock));
2185 	KASSERT(ptp_to_pmap(ptp) == pmap);
2186 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2187 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2188 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2189 
2190 	pmap_check_pv(pmap, ptp, pp, va, true);
2191 
2192 	if (pve == NULL) {
2193 		mutex_spin_enter(&pp->pp_lock);
2194 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2195 		KASSERT(pp->pp_pte.pte_va == va);
2196 		pp->pp_attrs |= oattrs;
2197 		pp->pp_pte.pte_ptp = NULL;
2198 		pp->pp_pte.pte_va = 0;
2199 		mutex_spin_exit(&pp->pp_lock);
2200 	} else {
2201 		mutex_spin_enter(&pp->pp_lock);
2202 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2203 		    pp->pp_pte.pte_va != va);
2204 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2205 		KASSERT(pve->pve_pte.pte_va == va);
2206 		KASSERT(pve->pve_pp == pp);
2207 		pp->pp_attrs |= oattrs;
2208 		LIST_REMOVE(pve, pve_list);
2209 		mutex_spin_exit(&pp->pp_lock);
2210 
2211 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2212 		rb_tree_remove_node(tree, pve);
2213 #ifdef DIAGNOSTIC
2214 		memset(pve, 0, sizeof(*pve));
2215 #endif
2216 	}
2217 
2218 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2219 	pmap_check_pv(pmap, ptp, pp, va, false);
2220 }
2221 
2222 /*
2223  * p t p   f u n c t i o n s
2224  */
2225 
2226 static struct vm_page *
2227 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2228 {
2229 	int lidx = level - 1;
2230 	off_t off = ptp_va2o(va, level);
2231 	struct vm_page *pg;
2232 
2233 	KASSERT(mutex_owned(&pmap->pm_lock));
2234 
2235 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2236 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2237 		pg = pmap->pm_ptphint[lidx];
2238 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2239 		return pg;
2240 	}
2241 	PMAP_DUMMY_LOCK(pmap);
2242 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2243 	PMAP_DUMMY_UNLOCK(pmap);
2244 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2245 		/* This page is queued to be freed - ignore. */
2246 		pg = NULL;
2247 	}
2248 	if (pg != NULL) {
2249 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2250 	}
2251 	pmap->pm_ptphint[lidx] = pg;
2252 	return pg;
2253 }
2254 
2255 static inline void
2256 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2257 {
2258 	int lidx;
2259 
2260 	KASSERT(ptp->wire_count <= 1);
2261 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2262 
2263 	lidx = level - 1;
2264 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2265 	if (pmap->pm_ptphint[lidx] == ptp)
2266 		pmap->pm_ptphint[lidx] = NULL;
2267 	ptp->wire_count = 0;
2268 	ptp->uanon = NULL;
2269 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2270 
2271 	/*
2272 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2273 	 * the page from the uvm_object, as that can take further locks
2274 	 * (intolerable right now because the PTEs are likely mapped in).
2275 	 * Instead mark the PTP as free and if we bump into it again, we'll
2276 	 * either ignore or reuse (depending on what's useful at the time).
2277 	 */
2278 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2279 }
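
/*
 * Very rough sketch of the deferred free described above.  The real work
 * is done by pmap_update() (not shown in this part of the file), which
 * additionally detaches the page from its uvm_object and resets its
 * pmap_page state; do not read this as the actual implementation:
 *
 *	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
 *		KASSERT(ptp->wire_count == 0);
 *		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
 *		(... detach from pmap->pm_obj[] ...)
 *		uvm_pagefree(ptp);
 *	}
 */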
2280 
2281 static void
2282 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2283 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2284 {
2285 	unsigned long index;
2286 	int level;
2287 	vaddr_t invaladdr;
2288 	pd_entry_t opde;
2289 
2290 	KASSERT(pmap != pmap_kernel());
2291 	KASSERT(mutex_owned(&pmap->pm_lock));
2292 	KASSERT(kpreempt_disabled());
2293 
2294 	level = 1;
2295 	do {
2296 		index = pl_i(va, level + 1);
2297 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2298 
2299 		/*
2300 		 * On Xen-amd64 or SVS, we need to sync the top level page
2301 		 * directory on each CPU.
2302 		 */
2303 #if defined(XENPV) && defined(__x86_64__)
2304 		if (level == PTP_LEVELS - 1) {
2305 			xen_kpm_sync(pmap, index);
2306 		}
2307 #elif defined(SVS)
2308 		if (svs_enabled && level == PTP_LEVELS - 1) {
2309 			svs_pmap_sync(pmap, index);
2310 		}
2311 #endif
2312 
2313 		invaladdr = level == 1 ? (vaddr_t)ptes :
2314 		    (vaddr_t)pdes[level - 2];
2315 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2316 		    opde, TLBSHOOT_FREE_PTP);
2317 
2318 #if defined(XENPV)
2319 		pmap_tlb_shootnow();
2320 #endif
2321 
2322 		pmap_freepage(pmap, ptp, level);
2323 		if (level < PTP_LEVELS - 1) {
2324 			ptp = pmap_find_ptp(pmap, va, level + 1);
2325 			ptp->wire_count--;
2326 			if (ptp->wire_count > 1)
2327 				break;
2328 		}
2329 	} while (++level < PTP_LEVELS);
2330 	pmap_pte_flush();
2331 }
2332 
2333 /*
2334  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2335  *
2336  * => pmap should NOT be pmap_kernel()
2337  * => pmap should be locked
2338  * => we are not touching any PTEs yet, so they need not be mapped in
2339  */
2340 static int
2341 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2342     int flags, struct vm_page **resultp)
2343 {
2344 	struct vm_page *ptp;
2345 	int i, aflags;
2346 	struct uvm_object *obj;
2347 	voff_t off;
2348 
2349 	KASSERT(pmap != pmap_kernel());
2350 	KASSERT(mutex_owned(&pmap->pm_lock));
2351 
2352 	/*
2353 	 * Loop through all page table levels allocating a page
2354 	 * for any level where we don't already have one.
2355 	 */
2356 	memset(pt, 0, sizeof(*pt));
2357 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2358 		UVM_PGA_ZERO;
2359 	for (i = PTP_LEVELS; i > 1; i--) {
2360 		obj = &pmap->pm_obj[i - 2];
2361 		off = ptp_va2o(va, i - 1);
2362 
2363 		PMAP_DUMMY_LOCK(pmap);
2364 		pt->pg[i] = uvm_pagelookup(obj, off);
2365 
2366 		if (pt->pg[i] == NULL) {
2367 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2368 			pt->alloced[i] = (pt->pg[i] != NULL);
2369 		} else if (pt->pg[i]->wire_count == 0) {
2370 			/* This page was queued to be freed; dequeue it. */
2371 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2372 			pt->alloced[i] = true;
2373 		}
2374 		PMAP_DUMMY_UNLOCK(pmap);
2375 		if (pt->pg[i] == NULL) {
2376 			pmap_unget_ptp(pmap, pt);
2377 			return ENOMEM;
2378 		} else if (pt->alloced[i]) {
2379 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2380 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2381 			    &pmap_rbtree_ops);
2382 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2383 		}
2384 	}
2385 	ptp = pt->pg[2];
2386 	KASSERT(ptp != NULL);
2387 	*resultp = ptp;
2388 	pmap->pm_ptphint[0] = ptp;
2389 	return 0;
2390 }
2391 
2392 /*
2393  * pmap_install_ptp: install any freshly allocated PTPs
2394  *
2395  * => pmap should NOT be pmap_kernel()
2396  * => pmap should be locked
2397  * => PTEs must be mapped
2398  * => preemption must be disabled
2399  */
2400 static void
2401 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2402     pd_entry_t * const *pdes)
2403 {
2404 	struct vm_page *ptp;
2405 	unsigned long index;
2406 	pd_entry_t *pva;
2407 	paddr_t pa;
2408 	int i;
2409 
2410 	KASSERT(pmap != pmap_kernel());
2411 	KASSERT(mutex_owned(&pmap->pm_lock));
2412 	KASSERT(kpreempt_disabled());
2413 
2414 	/*
2415 	 * Now that we have all the pages looked up or allocated,
2416 	 * loop through again installing any new ones into the tree.
2417 	 */
2418 	for (i = PTP_LEVELS; i > 1; i--) {
2419 		index = pl_i(va, i);
2420 		pva = pdes[i - 2];
2421 
2422 		if (pmap_valid_entry(pva[index])) {
2423 			KASSERT(!pt->alloced[i]);
2424 			continue;
2425 		}
2426 
2427 		ptp = pt->pg[i];
2428 		ptp->flags &= ~PG_BUSY; /* never busy */
2429 		ptp->wire_count = 1;
2430 		pmap->pm_ptphint[i - 2] = ptp;
2431 		pa = VM_PAGE_TO_PHYS(ptp);
2432 		pmap_pte_set(&pva[index], (pd_entry_t)
2433 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2434 
2435 		/*
2436 		 * On Xen-amd64 or SVS, we need to sync the top level page
2437 		 * directory on each CPU.
2438 		 */
2439 #if defined(XENPV) && defined(__x86_64__)
2440 		if (i == PTP_LEVELS) {
2441 			xen_kpm_sync(pmap, index);
2442 		}
2443 #elif defined(SVS)
2444 		if (svs_enabled && i == PTP_LEVELS) {
2445 			svs_pmap_sync(pmap, index);
2446 		}
2447 #endif
2448 
2449 		pmap_pte_flush();
2450 		pmap_stats_update(pmap, 1, 0);
2451 
2452 		/*
2453 		 * If we're not in the top level, increase the
2454 		 * wire count of the parent page.
2455 		 */
2456 		if (i < PTP_LEVELS) {
2457 			pt->pg[i + 1]->wire_count++;
2458 		}
2459 	}
2460 }
2461 
2462 /*
2463  * pmap_unget_ptp: free unused PTPs
2464  *
2465  * => pmap should NOT be pmap_kernel()
2466  * => pmap should be locked
2467  */
2468 static void
2469 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2470 {
2471 	int i;
2472 
2473 	KASSERT(pmap != pmap_kernel());
2474 	KASSERT(mutex_owned(&pmap->pm_lock));
2475 
2476 	for (i = PTP_LEVELS; i > 1; i--) {
2477 		if (!pt->alloced[i]) {
2478 			continue;
2479 		}
2480 		KASSERT(pt->pg[i]->wire_count == 0);
2481 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2482 		pmap_freepage(pmap, pt->pg[i], i - 1);
2483 	}
2484 }
2485 
2486 /*
2487  * p m a p   l i f e c y c l e   f u n c t i o n s
2488  */
2489 
2490 /*
2491  * pmap_pdp_init: constructor for a new PDP.
2492  */
2493 static void
2494 pmap_pdp_init(pd_entry_t *pdir)
2495 {
2496 	paddr_t pdirpa = 0;
2497 	vaddr_t object;
2498 	int i;
2499 
2500 #if !defined(XENPV) || !defined(__x86_64__)
2501 	int npde;
2502 #endif
2503 #ifdef XENPV
2504 	int s;
2505 #endif
2506 
2507 	memset(pdir, 0, PDP_SIZE * PAGE_SIZE);
2508 
2509 	/*
2510 	 * NOTE: This is all done unlocked, but we will check afterwards
2511 	 * if we have raced with pmap_growkernel().
2512 	 */
2513 
2514 #if defined(XENPV) && defined(__x86_64__)
2515 	/* Fetch the physical address of the page directory */
2516 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2517 
2518 	/*
2519 	 * This pdir will NEVER be active in kernel mode, so mark
2520 	 * recursive entry invalid.
2521 	 */
2522 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2523 
2524 	/*
2525 	 * PDP constructed this way won't be for the kernel, hence we
2526 	 * don't put kernel mappings on Xen.
2527 	 *
2528 	 * But we need to make pmap_create() happy, so put a dummy
2529 	 * (without PTE_P) value at the right place.
2530 	 */
2531 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2532 	     (pd_entry_t)-1 & PTE_FRAME;
2533 #else /* XENPV && __x86_64__*/
2534 	object = (vaddr_t)pdir;
2535 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2536 		/* Fetch the physical address of the page directory */
2537 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2538 
2539 		/* Put in recursive PDE to map the PTEs */
2540 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2541 		    pmap_pg_nx;
2542 #ifndef XENPV
2543 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2544 #endif
2545 	}
2546 
2547 	/* Copy the kernel's top level PDE */
2548 	npde = nkptp[PTP_LEVELS - 1];
2549 
2550 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2551 	    npde * sizeof(pd_entry_t));
2552 
2553 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2554 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2555 		pdir[idx] = PDP_BASE[idx];
2556 	}
2557 
2558 #ifdef __HAVE_PCPU_AREA
2559 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2560 #endif
2561 #ifdef __HAVE_DIRECT_MAP
2562 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2563 #endif
2564 #ifdef KASAN
2565 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2566 #endif
2567 #ifdef KMSAN
2568 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2569 #endif
2570 #endif /* XENPV  && __x86_64__*/
2571 
2572 #ifdef XENPV
2573 	s = splvm();
2574 	object = (vaddr_t)pdir;
2575 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2576 	    VM_PROT_READ);
2577 	pmap_update(pmap_kernel());
2578 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2579 		/*
2580 		 * Pin as an L2/L4 page; we have to do the page with the
2581 		 * PDIR_SLOT_PTE entries last.
2582 		 */
2583 #ifdef PAE
2584 		if (i == l2tol3(PDIR_SLOT_PTE))
2585 			continue;
2586 #endif
2587 
2588 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2589 #ifdef __x86_64__
2590 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2591 #else
2592 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2593 #endif
2594 	}
2595 #ifdef PAE
2596 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2597 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2598 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2599 #endif
2600 	splx(s);
2601 #endif /* XENPV */
2602 }
2603 
2604 /*
2605  * pmap_pdp_fini: destructor for the PDPs.
2606  */
2607 static void
2608 pmap_pdp_fini(pd_entry_t *pdir)
2609 {
2610 #ifdef XENPV
2611 	paddr_t pdirpa = 0;	/* XXX: GCC */
2612 	vaddr_t object = (vaddr_t)pdir;
2613 	int i;
2614 	int s = splvm();
2615 	pt_entry_t *pte;
2616 
2617 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2618 		/* fetch the physical address of the page directory. */
2619 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2620 		/* unpin page table */
2621 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2622 	}
2623 	object = (vaddr_t)pdir;
2624 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2625 		/* Set page RW again */
2626 		pte = kvtopte(object);
2627 		pmap_pte_set(pte, *pte | PTE_W);
2628 		xen_bcast_invlpg((vaddr_t)object);
2629 	}
2630 	splx(s);
2631 #endif  /* XENPV */
2632 }
2633 
2634 #ifdef PAE
2635 static void *
2636 pmap_pdp_alloc(struct pool *pp, int flags)
2637 {
2638 	return (void *)uvm_km_alloc(kernel_map,
2639 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2640 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2641 	    UVM_KMF_WIRED);
2642 }
2643 
2644 static void
2645 pmap_pdp_free(struct pool *pp, void *v)
2646 {
2647 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2648 	    UVM_KMF_WIRED);
2649 }
2650 #endif /* PAE */
2651 
2652 /*
2653  * pmap_ctor: constructor for the pmap cache.
2654  */
2655 static int
2656 pmap_ctor(void *arg, void *obj, int flags)
2657 {
2658 	struct pmap *pmap = obj;
2659 	pt_entry_t p;
2660 	int i;
2661 
2662 	KASSERT((flags & PR_WAITOK) != 0);
2663 
2664 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2665 	rw_init(&pmap->pm_dummy_lock);
2666 	kcpuset_create(&pmap->pm_cpus, true);
2667 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2668 #ifdef XENPV
2669 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2670 #endif
2671 	LIST_INIT(&pmap->pm_gc_ptp);
2672 	pmap->pm_pve = NULL;
2673 
2674 	/* allocate and init PDP */
2675 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2676 
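	/*
	 * pmap_pdp_init() copies the kernel's top-level PDEs without any
	 * lock held, so pmap_growkernel() may race with it.  Re-check the
	 * last kernel slot under pmaps_lock and redo the init until the
	 * copy is known to be current.
	 */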
2677 	for (;;) {
2678 		pmap_pdp_init(pmap->pm_pdir);
2679 		mutex_enter(&pmaps_lock);
2680 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2681 		if (__predict_true(p != 0)) {
2682 			break;
2683 		}
2684 		mutex_exit(&pmaps_lock);
2685 	}
2686 
2687 	for (i = 0; i < PDP_SIZE; i++)
2688 		pmap->pm_pdirpa[i] =
2689 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2690 
2691 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2692 	mutex_exit(&pmaps_lock);
2693 
2694 	return 0;
2695 }
2696 
2697 /*
2698  * pmap_dtor: destructor for the pmap cache.
2699  */
2700 static void
2701 pmap_dtor(void *arg, void *obj)
2702 {
2703 	struct pmap *pmap = obj;
2704 
2705 	if (pmap->pm_pve != NULL) {
2706 		pool_cache_put(&pmap_pv_cache, pmap->pm_pve);
2707 	}
2708 
2709 	mutex_enter(&pmaps_lock);
2710 	LIST_REMOVE(pmap, pm_list);
2711 	mutex_exit(&pmaps_lock);
2712 
2713 	pmap_pdp_fini(pmap->pm_pdir);
2714 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2715 	mutex_destroy(&pmap->pm_lock);
2716 	rw_destroy(&pmap->pm_dummy_lock);
2717 	kcpuset_destroy(pmap->pm_cpus);
2718 	kcpuset_destroy(pmap->pm_kernel_cpus);
2719 #ifdef XENPV
2720 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2721 #endif
2722 }
2723 
2724 /*
2725  * pmap_create: create a pmap object.
2726  */
2727 struct pmap *
2728 pmap_create(void)
2729 {
2730 	struct pmap *pmap;
2731 	int i;
2732 
2733 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2734 
2735 	/* init uvm_object */
2736 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2737 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2738 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2739 		pmap->pm_ptphint[i] = NULL;
2740 	}
2741 	pmap->pm_stats.wired_count = 0;
2742 	/* count the PDP allocd below */
2743 	pmap->pm_stats.resident_count = PDP_SIZE;
2744 #if !defined(__x86_64__)
2745 	pmap->pm_hiexec = 0;
2746 #endif
2747 
2748 	/* Used by NVMM. */
2749 	pmap->pm_enter = NULL;
2750 	pmap->pm_extract = NULL;
2751 	pmap->pm_remove = NULL;
2752 	pmap->pm_sync_pv = NULL;
2753 	pmap->pm_pp_remove_ent = NULL;
2754 	pmap->pm_write_protect = NULL;
2755 	pmap->pm_unwire = NULL;
2756 	pmap->pm_tlb_flush = NULL;
2757 	pmap->pm_data = NULL;
2758 
2759 	/* init the LDT */
2760 	pmap->pm_ldt = NULL;
2761 	pmap->pm_ldt_len = 0;
2762 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2763 
2764 	return (pmap);
2765 }
2766 
2767 /*
2768  * pmap_check_ptps: verify that none of the pmap's page table objects
2769  * have any pages allocated to them.
2770  */
2771 static void
2772 pmap_check_ptps(struct pmap *pmap)
2773 {
2774 	int i;
2775 
2776 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2777 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2778 		    "pmap %p level %d still has %d pages",
2779 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2780 	}
2781 }
2782 
2783 static void
2784 pmap_check_inuse(struct pmap *pmap)
2785 {
2786 #ifdef DEBUG
2787 	CPU_INFO_ITERATOR cii;
2788 	struct cpu_info *ci;
2789 
2790 	for (CPU_INFO_FOREACH(cii, ci)) {
2791 		if (ci->ci_pmap == pmap)
2792 			panic("destroying pmap being used");
2793 #if defined(XENPV) && defined(__x86_64__)
2794 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2795 			if (pmap->pm_pdir[i] != 0 &&
2796 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2797 				printf("pmap_destroy(%p) pmap_kernel %p "
2798 				    "curcpu %d cpu %d ci_pmap %p "
2799 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2800 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2801 				    pmap, pmap_kernel(), curcpu()->ci_index,
2802 				    ci->ci_index, ci->ci_pmap,
2803 				    i, ci->ci_kpm_pdir[i],
2804 				    i, pmap->pm_pdir[i]);
2805 				panic("%s: used pmap", __func__);
2806 			}
2807 		}
2808 #endif
2809 	}
2810 #endif /* DEBUG */
2811 }
2812 
2813 /*
2814  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
2815  * count goes to zero.
2816  *
2817  * => we can be called from pmap_unmap_ptes() with a different, unrelated
2818  *    pmap's lock held.  be careful!
2819  */
2820 void
2821 pmap_destroy(struct pmap *pmap)
2822 {
2823 	int i;
2824 
2825 	/*
2826 	 * drop reference count and verify not in use.
2827 	 */
2828 
2829 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2830 		return;
2831 	}
2832 	pmap_check_inuse(pmap);
2833 
2834 	/*
2835 	 * XXX handle deferred PTP page free for EPT.  ordinarily this is
2836 	 * taken care of by pmap_remove_all().  once shared with EPT this
2837 	 * can go away.
2838 	 */
2839 	if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) {
2840 		pmap_update(pmap);
2841 	}
2842 
2843 	/*
2844 	 * Reference count is zero, free pmap resources and then free pmap.
2845 	 */
2846 
2847 	pmap_check_ptps(pmap);
2848 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
2849 
2850 #ifdef USER_LDT
2851 	if (pmap->pm_ldt != NULL) {
2852 		/*
2853 		 * no need to switch the LDT; this address space is gone,
2854 		 * nothing is using it.
2855 		 *
2856 		 * No need to lock the pmap for ldt_free (or anything else),
2857 		 * we're the last one to use it.
2858 		 */
2859 		/* XXXAD can't take cpu_lock here - fix soon. */
2860 		mutex_enter(&cpu_lock);
2861 		ldt_free(pmap->pm_ldt_sel);
2862 		mutex_exit(&cpu_lock);
2863 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2864 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2865 	}
2866 #endif
2867 
2868 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2869 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2870 	}
2871 	kcpuset_zero(pmap->pm_cpus);
2872 	kcpuset_zero(pmap->pm_kernel_cpus);
2873 #ifdef XENPV
2874 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
2875 #endif
2876 
2877 	pmap_check_ptps(pmap);
2878 	if (__predict_false(pmap->pm_enter != NULL)) {
2879 		/* XXX make this a different cache */
2880 		pool_cache_destruct_object(&pmap_cache, pmap);
2881 	} else {
2882 		pool_cache_put(&pmap_cache, pmap);
2883 	}
2884 }
2885 
2886 /*
2887  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
2888  *
2889  * => caller must hold pmap's lock
2890  * => PTP must be mapped into KVA
2891  * => must be called with kernel preemption disabled
2892  * => does as little work as possible
2893  */
2894 static void
2895 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
2896     vaddr_t startva, vaddr_t blkendva, struct pv_entry **pv_tofree)
2897 {
2898 #ifndef XEN
2899 	struct pv_entry *pve;
2900 	struct vm_page *pg;
2901 	struct pmap_page *pp;
2902 	pt_entry_t opte;
2903 	rb_tree_t *tree;
2904 	vaddr_t va;
2905 	int wired;
2906 	uint8_t oattrs;
2907 	u_int cnt;
2908 
2909 	KASSERT(mutex_owned(&pmap->pm_lock));
2910 	KASSERT(kpreempt_disabled());
2911 	KASSERT(pmap != pmap_kernel());
2912 	KASSERT(ptp->wire_count > 1);
2913 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
2914 
2915 	/*
2916 	 * Start at the lowest entered VA, and scan until there are no more
2917 	 * PTEs in the PTP.  The goal is to disconnect PV entries and patch
2918 	 * up the pmap's stats.  No PTEs will be modified.
2919 	 */
2920 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
2921 	pve = RB_TREE_MIN(tree);
2922 	wired = 0;
2923 	va = (vaddr_t)ptp->uanon;
2924 	pte += ((va - startva) >> PAGE_SHIFT);
2925 
2926 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
2927 		opte = *pte;
2928 		if (!pmap_valid_entry(opte)) {
2929 			continue;
2930 		}
2931 
2932 		/*
2933 		 * Count the PTE.  If it's not for a managed mapping
2934 		 * there's nothing more to do.
2935 		 */
2936 		cnt--;
2937 		wired -= (opte & PTE_WIRED);
2938 		if ((opte & PTE_PVLIST) == 0) {
2939 #ifndef DOM0OPS
2940 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
2941 			    "managed page without PTE_PVLIST for %#"
2942 			    PRIxVADDR, va);
2943 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
2944 			    "pv-tracked page without PTE_PVLIST for %#"
2945 			    PRIxVADDR, va);
2946 #endif
2947 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
2948 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
2949 			    va) == NULL);
2950 			continue;
2951 		}
2952 
2953 		/*
2954 		 * "pve" now points to the lowest (by VA) dynamic PV entry
2955 		 * in the PTP.  If it's for this VA, take advantage of it to
2956 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
2957 		 * tree by skipping to the next VA in the tree whenever
2958 		 * there is a match here.  The tree will be cleared out in
2959 	 * one pass before returning to pmap_remove_all().
2960 		 */
2961 		oattrs = pmap_pte_to_pp_attrs(opte);
2962 		if (pve != NULL && pve->pve_pte.pte_va == va) {
2963 			pp = pve->pve_pp;
2964 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2965 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
2966 			    pp->pp_pte.pte_va != va);
2967 			mutex_spin_enter(&pp->pp_lock);
2968 			pp->pp_attrs |= oattrs;
2969 			LIST_REMOVE(pve, pve_list);
2970 			mutex_spin_exit(&pp->pp_lock);
2971 			pve->pve_next = *pv_tofree;
2972 			*pv_tofree = pve;
2973 			pve = RB_TREE_NEXT(tree, pve);
2974 			continue;
2975 		}
2976 
2977 		/*
2978 		 * No entry in the tree so it must be embedded.  Look up the
2979 		 * page and cancel the embedded entry.
2980 		 */
2981 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
2982 			pp = VM_PAGE_TO_PP(pg);
2983 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
2984 			paddr_t pa = pmap_pte2pa(opte);
2985 			panic("%s: PTE_PVLIST with pv-untracked page"
2986 			    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
2987 			    "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
2988 		}
2989 		mutex_spin_enter(&pp->pp_lock);
2990 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2991 		KASSERT(pp->pp_pte.pte_va == va);
2992 		pp->pp_attrs |= oattrs;
2993 		pp->pp_pte.pte_ptp = NULL;
2994 		pp->pp_pte.pte_va = 0;
2995 		mutex_spin_exit(&pp->pp_lock);
2996 	}
2997 
2998 	/* PTP now empty - adjust the tree & stats to match. */
2999 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3000 	ptp->wire_count = 1;
3001 #ifdef DIAGNOSTIC
3002 	rb_tree_init(tree, &pmap_rbtree_ops);
3003 #endif
3004 #else	/* !XEN */
3005 	/*
3006 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3007 	 * I guess the hypervisor keeps track of PTEs too.
3008 	 */
3009 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva,
3010 	    pv_tofree);
3011 #endif	/* !XEN */
3012 }
3013 
3014 /*
3015  * pmap_remove_all: remove all mappings from pmap in bulk.
3016  *
3017  * Ordinarily when removing mappings it's important to hold the UVM object's
3018  * lock, so that pages do not gain a new identity while retaining stale TLB
3019  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3020  * Here it's known that the address space is no longer visible to any user
3021  * process, so we don't need to worry about that.
3022  */
3023 bool
3024 pmap_remove_all(struct pmap *pmap)
3025 {
3026 	struct vm_page *ptps[32];
3027 	vaddr_t va, blkendva;
3028 	struct pmap *pmap2;
3029 	pt_entry_t *ptes;
3030 	pd_entry_t pde __diagused;
3031 	pd_entry_t * const *pdes;
3032 	struct pv_entry *pv_tofree;
3033 	int lvl __diagused, i, n;
3034 
3035 	/* XXX Can't handle EPT just yet. */
3036 	if (pmap->pm_remove != NULL) {
3037 		return false;
3038 	}
3039 
3040 	for (;;) {
3041 		/* Fetch a block of PTPs from tree. */
3042 		mutex_enter(&pmap->pm_lock);
3043 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3044 		    (void **)ptps, __arraycount(ptps), false);
3045 		if (n == 0) {
3046 			mutex_exit(&pmap->pm_lock);
3047 			break;
3048 		}
3049 
3050 		/* Remove all mappings in the set of PTPs. */
3051 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3052 		pv_tofree = NULL;
3053 		for (i = 0; i < n; i++) {
3054 			if (ptps[i]->wire_count == 0) {
3055 				/* It's dead: pmap_update() will expunge. */
3056 				continue;
3057 			}
3058 
3059 			/* Determine range of block. */
3060 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3061 			blkendva = x86_round_pdr(va + 1);
3062 
3063 			/* Make sure everything squares up... */
3064 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3065 			KASSERT(lvl == 1);
3066 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3067 
3068 			/* Zap! */
3069 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3070 			    blkendva, &pv_tofree);
3071 
3072 			/* PTP should now be unused - free it. */
3073 			KASSERT(ptps[i]->wire_count == 1);
3074 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3075 		}
3076 		pmap_unmap_ptes(pmap, pmap2);
3077 		pmap_free_pvs(pmap, pv_tofree);
3078 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3079 		mutex_exit(&pmap->pm_lock);
3080 
3081 		/* Process deferred frees. */
3082 		pmap_update(pmap);
3083 
3084 		/* A breathing point. */
3085 		preempt_point();
3086 	}
3087 
3088 	/* Verify that the pmap is now completely empty. */
3089 	pmap_check_ptps(pmap);
3090 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3091 	    "pmap %p not empty", pmap);
3092 
3093 	return true;
3094 }
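
/*
 * Hypothetical caller sketch (the real call site is in UVM's address
 * space teardown, not in this file); the function is best-effort, so a
 * false return must be handled by removing the caller's ranges the slow
 * way (sva/eva below stand for whatever ranges the caller tracks):
 *
 *	if (!pmap_remove_all(pmap)) {
 *		pmap_remove(pmap, sva, eva);
 *		pmap_update(pmap);
 *	}
 *	pmap_destroy(pmap);
 */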
3095 
3096 #if defined(PMAP_FORK)
3097 /*
3098  * pmap_fork: perform any necessary data structure manipulation when
3099  * a VM space is forked.
3100  */
3101 void
3102 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3103 {
3104 #ifdef USER_LDT
3105 	union descriptor *new_ldt;
3106 	size_t len;
3107 	int sel;
3108 
3109 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3110 		return;
3111 	}
3112 
3113 	/*
3114 	 * Copy the LDT into the new process.
3115 	 *
3116 	 * Read pmap1's ldt pointer and length unlocked; if it changes
3117 	 * behind our back we'll retry. This will starve if there's a
3118 	 * stream of LDT changes in another thread but that should not
3119 	 * happen.
3120 	 */
3121 
3122  retry:
3123 	if (pmap1->pm_ldt != NULL) {
3124 		len = pmap1->pm_ldt_len;
3125 		/* Allocate space for the new process's LDT */
3126 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
3127 		    UVM_KMF_WIRED);
3128 		if (new_ldt == NULL) {
3129 			printf("WARNING: %s: unable to allocate LDT space\n",
3130 			    __func__);
3131 			return;
3132 		}
3133 		mutex_enter(&cpu_lock);
3134 		/* Get a GDT slot for it */
3135 		sel = ldt_alloc(new_ldt, len);
3136 		if (sel == -1) {
3137 			mutex_exit(&cpu_lock);
3138 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
3139 			    UVM_KMF_WIRED);
3140 			printf("WARNING: %s: unable to allocate LDT selector\n",
3141 			    __func__);
3142 			return;
3143 		}
3144 	} else {
3145 		/* Wasn't anything there after all. */
3146 		len = -1;
3147 		new_ldt = NULL;
3148 		sel = -1;
3149 		mutex_enter(&cpu_lock);
3150 	}
3151 
3152 	/* If there's still something there now that we have cpu_lock... */
3153 	if (pmap1->pm_ldt != NULL) {
3154 		if (len != pmap1->pm_ldt_len) {
3155 			/* Oops, it changed. Drop what we did and try again */
3156 			if (len != -1) {
3157 				ldt_free(sel);
3158 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3159 				    len, UVM_KMF_WIRED);
3160 			}
3161 			mutex_exit(&cpu_lock);
3162 			goto retry;
3163 		}
3164 
3165 		/* Copy the LDT data and install it in pmap2 */
3166 		memcpy(new_ldt, pmap1->pm_ldt, len);
3167 		pmap2->pm_ldt = new_ldt;
3168 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
3169 		pmap2->pm_ldt_sel = sel;
3170 		len = -1;
3171 	}
3172 
3173 	if (len != -1) {
3174 		/* Nothing there after all, so mop up what we prepared */
3175 		ldt_free(sel);
3176 		mutex_exit(&cpu_lock);
3177 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
3178 		    UVM_KMF_WIRED);
3179 	} else {
3180 		mutex_exit(&cpu_lock);
3181 	}
3182 #endif /* USER_LDT */
3183 }
3184 #endif /* PMAP_FORK */
3185 
3186 #ifdef USER_LDT
3187 
3188 /*
3189  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3190  * is active, reload LDTR.
3191  */
3192 static void
3193 pmap_ldt_xcall(void *arg1, void *arg2)
3194 {
3195 	struct pmap *pm;
3196 
3197 	kpreempt_disable();
3198 	pm = arg1;
3199 	if (curcpu()->ci_pmap == pm) {
3200 #if defined(SVS) && defined(USER_LDT)
3201 		if (svs_enabled) {
3202 			svs_ldt_sync(pm);
3203 		} else
3204 #endif
3205 		lldt(pm->pm_ldt_sel);
3206 	}
3207 	kpreempt_enable();
3208 }
3209 
3210 /*
3211  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3212  * in the new selector on all CPUs.
3213  */
3214 void
3215 pmap_ldt_sync(struct pmap *pm)
3216 {
3217 	uint64_t where;
3218 
3219 	KASSERT(mutex_owned(&cpu_lock));
3220 
3221 	pmap_ldt_evcnt.ev_count++;
3222 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3223 	xc_wait(where);
3224 }
3225 
3226 /*
3227  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3228  * restore the default.
3229  */
3230 void
3231 pmap_ldt_cleanup(struct lwp *l)
3232 {
3233 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3234 	union descriptor *dp = NULL;
3235 	size_t len = 0;
3236 	int sel = -1;
3237 
3238 	if (__predict_true(pmap->pm_ldt == NULL)) {
3239 		return;
3240 	}
3241 
3242 	mutex_enter(&cpu_lock);
3243 	if (pmap->pm_ldt != NULL) {
3244 		sel = pmap->pm_ldt_sel;
3245 		dp = pmap->pm_ldt;
3246 		len = pmap->pm_ldt_len;
3247 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3248 		pmap->pm_ldt = NULL;
3249 		pmap->pm_ldt_len = 0;
3250 		pmap_ldt_sync(pmap);
3251 		ldt_free(sel);
3252 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
3253 	}
3254 	mutex_exit(&cpu_lock);
3255 }
3256 #endif /* USER_LDT */
3257 
3258 /*
3259  * pmap_activate: activate a process' pmap
3260  *
3261  * => must be called with kernel preemption disabled
3262  * => if lwp is the curlwp, then set ci_want_pmapload so that
3263  *    actual MMU context switch will be done by pmap_load() later
3264  */
3265 void
3266 pmap_activate(struct lwp *l)
3267 {
3268 	struct cpu_info *ci;
3269 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3270 
3271 	KASSERT(kpreempt_disabled());
3272 
3273 	ci = curcpu();
3274 
3275 	if (l != ci->ci_curlwp)
3276 		return;
3277 
3278 	KASSERT(ci->ci_want_pmapload == 0);
3279 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3280 
3281 	/*
3282 	 * no need to switch to kernel vmspace because
3283 	 * it's a subset of any vmspace.
3284 	 */
3285 
3286 	if (pmap == pmap_kernel()) {
3287 		ci->ci_want_pmapload = 0;
3288 		return;
3289 	}
3290 
3291 	ci->ci_want_pmapload = 1;
3292 }
3293 
3294 #if defined(XENPV) && defined(__x86_64__)
3295 #define	KASSERT_PDIRPA(pmap) \
3296 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3297 	    pmap == pmap_kernel())
3298 #elif defined(PAE)
3299 #define	KASSERT_PDIRPA(pmap) \
3300 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3301 #elif !defined(XENPV)
3302 #define	KASSERT_PDIRPA(pmap) \
3303 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3304 #else
3305 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
3306 #endif
3307 
3308 /*
3309  * pmap_reactivate: try to regain reference to the pmap.
3310  *
3311  * => Must be called with kernel preemption disabled.
3312  */
3313 static void
3314 pmap_reactivate(struct pmap *pmap)
3315 {
3316 	struct cpu_info * const ci = curcpu();
3317 	const cpuid_t cid = cpu_index(ci);
3318 
3319 	KASSERT(kpreempt_disabled());
3320 	KASSERT_PDIRPA(pmap);
3321 
3322 	/*
3323 	 * If we still have a lazy reference to this pmap, we can assume
3324 	 * that there was no TLB shootdown for this pmap in the meantime.
3325 	 *
3326 	 * The order of events here is important as we must synchronize
3327 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3328 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3329 	 * change only when the state is TLBSTATE_LAZY.
3330 	 */
3331 
3332 	ci->ci_tlbstate = TLBSTATE_VALID;
3333 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3334 
3335 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3336 		/* We have the reference, state is valid. */
3337 	} else {
3338 		/*
3339 		 * Must reload the TLB, pmap has been changed during
3340 		 * Must reload the TLB: the pmap was changed while it was
3341 		 * deactivated.
3342 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3343 
3344 		tlbflush();
3345 	}
3346 }
3347 
3348 /*
3349  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3350  * and relevant LDT info.
3351  *
3352  * Ensures that the current process' pmap is loaded on the current CPU's
3353  * MMU and that there are no stale TLB entries.
3354  *
3355  * => The caller should disable kernel preemption or do check-and-retry
3356  *    to prevent a preemption from undoing our efforts.
3357  * => This function may block.
3358  */
3359 void
3360 pmap_load(void)
3361 {
3362 	struct cpu_info *ci;
3363 	struct pmap *pmap, *oldpmap;
3364 	struct lwp *l;
3365 	uint64_t ncsw;
3366 
3367 	kpreempt_disable();
3368  retry:
3369 	ci = curcpu();
3370 	if (!ci->ci_want_pmapload) {
3371 		kpreempt_enable();
3372 		return;
3373 	}
3374 	l = ci->ci_curlwp;
3375 	ncsw = l->l_ncsw;
3376 	__insn_barrier();
3377 
3378 	/* should be able to take ipis. */
3379 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3380 #ifdef XENPV
3381 	/* Check to see if interrupts are enabled (ie; no events are masked) */
3382 	/* Check to see if interrupts are enabled (i.e. no events are masked) */
3383 #else
3384 	KASSERT((x86_read_psl() & PSL_I) != 0);
3385 #endif
3386 
3387 	KASSERT(l != NULL);
3388 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3389 	KASSERT(pmap != pmap_kernel());
3390 	oldpmap = ci->ci_pmap;
3391 
3392 	if (pmap == oldpmap) {
3393 		pmap_reactivate(pmap);
3394 		ci->ci_want_pmapload = 0;
3395 		kpreempt_enable();
3396 		return;
3397 	}
3398 
3399 	/*
3400 	 * Acquire a reference to the new pmap and perform the switch.
3401 	 */
3402 
3403 	pmap_reference(pmap);
3404 	pmap_load1(l, pmap, oldpmap);
3405 	ci->ci_want_pmapload = 0;
3406 
3407 	/*
3408 	 * we're now running with the new pmap.  drop the reference
3409 	 * to the old pmap.  if we block, we need to go around again.
3410 	 */
3411 
3412 	pmap_destroy(oldpmap);
3413 	__insn_barrier();
3414 	if (l->l_ncsw != ncsw) {
3415 		goto retry;
3416 	}
3417 
3418 	kpreempt_enable();
3419 }
3420 
3421 /*
3422  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3423  * pmap_load().  It's critically important that this function does not
3424  * block.
3425  */
3426 static void
3427 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3428 {
3429 	struct cpu_info *ci;
3430 	struct pcb *pcb;
3431 	cpuid_t cid;
3432 
3433 	KASSERT(kpreempt_disabled());
3434 
3435 	pcb = lwp_getpcb(l);
3436 	ci = l->l_cpu;
3437 	cid = cpu_index(ci);
3438 
3439 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3440 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3441 
3442 	KASSERT_PDIRPA(oldpmap);
3443 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3444 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3445 
3446 	/*
3447 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3448 	 * with TLB shootdown interrupts, so set the state VALID first,
3449 	 * then register us for shootdown events on this pmap.
3450 	 */
3451 	ci->ci_tlbstate = TLBSTATE_VALID;
3452 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3453 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3454 	ci->ci_pmap = pmap;
3455 
3456 	/*
3457 	 * update tss.  now that we have registered for invalidations
3458 	 * from other CPUs, we're good to load the page tables.
3459 	 */
3460 #ifdef PAE
3461 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3462 #else
3463 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3464 #endif
3465 
3466 #ifdef i386
3467 #ifndef XENPV
3468 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3469 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3470 #endif
3471 #endif
3472 
3473 #if defined(SVS) && defined(USER_LDT)
3474 	if (svs_enabled) {
3475 		svs_ldt_sync(pmap);
3476 	} else
3477 #endif
3478 	lldt(pmap->pm_ldt_sel);
3479 
3480 	cpu_load_pmap(pmap, oldpmap);
3481 }
3482 
3483 /*
3484  * pmap_deactivate: deactivate a process' pmap.
3485  *
3486  * => Must be called with kernel preemption disabled (high IPL is enough).
3487  */
3488 void
3489 pmap_deactivate(struct lwp *l)
3490 {
3491 	struct pmap *pmap;
3492 	struct cpu_info *ci;
3493 
3494 	KASSERT(kpreempt_disabled());
3495 
3496 	if (l != curlwp) {
3497 		return;
3498 	}
3499 
3500 	/*
3501 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3502 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3503 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3504 	 * to kernel preemption or blocking routine in between.
3505 	 */
3506 	pmap_tlb_shootnow();
3507 
3508 	ci = curcpu();
3509 
3510 	if (ci->ci_want_pmapload) {
3511 		/*
3512 		 * ci_want_pmapload means that our pmap is not loaded on
3513 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3514 		 * is always considered loaded.
3515 		 */
3516 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3517 		    != pmap_kernel());
3518 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3519 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3520 
3521 		/*
3522 		 * userspace has not been touched.
3523 		 * nothing to do here.
3524 		 */
3525 
3526 		ci->ci_want_pmapload = 0;
3527 		return;
3528 	}
3529 
3530 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3531 
3532 	if (pmap == pmap_kernel()) {
3533 		return;
3534 	}
3535 
3536 	KASSERT_PDIRPA(pmap);
3537 	KASSERT(ci->ci_pmap == pmap);
3538 
3539 	/*
3540 	 * we aren't interested in TLB invalidations for this pmap,
3541 	 * at least for the time being.
3542 	 */
3543 
3544 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3545 	ci->ci_tlbstate = TLBSTATE_LAZY;
3546 }
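
/*
 * Summary of the per-CPU TLB state handshake used by pmap_deactivate()
 * above and pmap_reactivate()/pmap_load1() earlier (a descriptive
 * sketch, not new mechanism):
 *
 *	TLBSTATE_VALID --pmap_deactivate()--> TLBSTATE_LAZY
 *	    (still in pm_kernel_cpus; remote shootdowns may clear this
 *	     CPU's bit in pm_cpus while it is lazy)
 *	TLBSTATE_LAZY  --pmap_reactivate()--> TLBSTATE_VALID
 *	    (full tlbflush() if the bit in pm_cpus was lost meanwhile)
 */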
3547 
3548 /*
3549  * some misc. functions
3550  */
3551 
3552 bool
3553 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3554     int *lastlvl)
3555 {
3556 	unsigned long index;
3557 	pd_entry_t pde;
3558 	int i;
3559 
3560 	for (i = PTP_LEVELS; i > 1; i--) {
3561 		index = pl_i(va, i);
3562 		pde = pdes[i - 2][index];
3563 		if ((pde & PTE_P) == 0) {
3564 			*lastlvl = i;
3565 			return false;
3566 		}
3567 		if (pde & PTE_PS)
3568 			break;
3569 	}
3570 	if (lastpde != NULL)
3571 		*lastpde = pde;
3572 	*lastlvl = i;
3573 	return true;
3574 }
3575 
3576 /*
3577  * pmap_extract: extract a PA for the given VA
3578  */
3579 bool
3580 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3581 {
3582 	pt_entry_t *ptes, pte;
3583 	pd_entry_t pde;
3584 	pd_entry_t * const *pdes;
3585 	struct pmap *pmap2;
3586 	paddr_t pa;
3587 	bool rv;
3588 	int lvl;
3589 
3590 	if (__predict_false(pmap->pm_extract != NULL)) {
3591 		return (*pmap->pm_extract)(pmap, va, pap);
3592 	}
3593 
3594 #ifdef __HAVE_DIRECT_MAP
3595 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3596 		if (pap != NULL) {
3597 			*pap = PMAP_DIRECT_UNMAP(va);
3598 		}
3599 		return true;
3600 	}
3601 #endif
3602 
3603 	rv = false;
3604 	pa = 0;
3605 
3606 	if (pmap != pmap_kernel()) {
3607 		mutex_enter(&pmap->pm_lock);
3608 	}
3609 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3610 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3611 		if (lvl == 2) {
3612 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3613 			rv = true;
3614 		} else {
3615 			KASSERT(lvl == 1);
3616 			pte = ptes[pl1_i(va)];
3617 			if (__predict_true((pte & PTE_P) != 0)) {
3618 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3619 				rv = true;
3620 			}
3621 		}
3622 	}
3623 	pmap_unmap_ptes(pmap, pmap2);
3624 	if (pmap != pmap_kernel()) {
3625 		mutex_exit(&pmap->pm_lock);
3626 	}
3627 	if (pap != NULL) {
3628 		*pap = pa;
3629 	}
3630 
3631 	return rv;
3632 }
3633 
3634 /*
3635  * vtophys: virtual address to physical address.  For use by
3636  * machine-dependent code only.
3637  */
3638 paddr_t
3639 vtophys(vaddr_t va)
3640 {
3641 	paddr_t pa;
3642 
3643 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3644 		return pa;
3645 	return 0;
3646 }
3647 
3648 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3649 
3650 #ifdef XENPV
3651 /*
3652  * vtomach: virtual address to machine address.  For use by
3653  * machine-dependent code only.
3654  */
3655 paddr_t
3656 vtomach(vaddr_t va)
3657 {
3658 	paddr_t pa;
3659 
3660 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3661 		return pa;
3662 	return 0;
3663 }
3664 #endif
3665 
3666 /*
3667  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3668  * determine the bounds of the kernel virtual address space.
3669  */
3670 void
3671 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3672 {
3673 	*startp = virtual_avail;
3674 	*endp = virtual_end;
3675 }
3676 
3677 void
3678 pmap_zero_page(paddr_t pa)
3679 {
3680 #if defined(__HAVE_DIRECT_MAP)
3681 	pagezero(PMAP_DIRECT_MAP(pa));
3682 #else
3683 #if defined(XENPV)
3684 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3685 		xen_pagezero(pa);
		return;
	}
3686 #endif
3687 	struct cpu_info *ci;
3688 	pt_entry_t *zpte;
3689 	vaddr_t zerova;
3690 
3691 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3692 
3693 	kpreempt_disable();
3694 
3695 	ci = curcpu();
3696 	zerova = ci->vpage[VPAGE_ZER];
3697 	zpte = ci->vpage_pte[VPAGE_ZER];
3698 
3699 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3700 
3701 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3702 	pmap_pte_flush();
3703 	pmap_update_pg(zerova);		/* flush TLB */
3704 
3705 	memset((void *)zerova, 0, PAGE_SIZE);
3706 
3707 #if defined(DIAGNOSTIC) || defined(XENPV)
3708 	pmap_pte_set(zpte, 0);				/* zap ! */
3709 	pmap_pte_flush();
3710 #endif
3711 
3712 	kpreempt_enable();
3713 #endif /* defined(__HAVE_DIRECT_MAP) */
3714 }
3715 
3716 /*
3717  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3718  * Returns true if the page was zero'd, false if we aborted for
3719  * some reason.
3720  */
3721 bool
3722 pmap_pageidlezero(paddr_t pa)
3723 {
3724 #ifdef __HAVE_DIRECT_MAP
3725 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3726 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3727 #else
3728 	struct cpu_info *ci;
3729 	pt_entry_t *zpte;
3730 	vaddr_t zerova;
3731 	bool rv;
3732 
3733 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3734 
3735 	ci = curcpu();
3736 	zerova = ci->vpage[VPAGE_ZER];
3737 	zpte = ci->vpage_pte[VPAGE_ZER];
3738 
3739 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3740 	KASSERT(*zpte == 0);
3741 
3742 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3743 	pmap_pte_flush();
3744 	pmap_update_pg(zerova);		/* flush TLB */
3745 
3746 	rv = sse2_idlezero_page((void *)zerova);
3747 
3748 #if defined(DIAGNOSTIC) || defined(XENPV)
3749 	pmap_pte_set(zpte, 0);				/* zap ! */
3750 	pmap_pte_flush();
3751 #endif
3752 
3753 	return rv;
3754 #endif
3755 }
3756 
3757 void
3758 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3759 {
3760 #if defined(__HAVE_DIRECT_MAP)
3761 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3762 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3763 
3764 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3765 #else
3766 #if defined(XENPV)
3767 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3768 		xen_copy_page(srcpa, dstpa);
3769 		return;
3770 	}
3771 #endif
3772 	struct cpu_info *ci;
3773 	pt_entry_t *srcpte, *dstpte;
3774 	vaddr_t srcva, dstva;
3775 
3776 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
3777 
3778 	kpreempt_disable();
3779 
3780 	ci = curcpu();
3781 	srcva = ci->vpage[VPAGE_SRC];
3782 	dstva = ci->vpage[VPAGE_DST];
3783 	srcpte = ci->vpage_pte[VPAGE_SRC];
3784 	dstpte = ci->vpage_pte[VPAGE_DST];
3785 
3786 	KASSERT(*srcpte == 0 && *dstpte == 0);
3787 
3788 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3789 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
3790 	pmap_pte_flush();
3791 	pmap_update_pg(srcva);
3792 	pmap_update_pg(dstva);
3793 
3794 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3795 
3796 #if defined(DIAGNOSTIC) || defined(XENPV)
3797 	pmap_pte_set(srcpte, 0);
3798 	pmap_pte_set(dstpte, 0);
3799 	pmap_pte_flush();
3800 #endif
3801 
3802 	kpreempt_enable();
3803 #endif /* defined(__HAVE_DIRECT_MAP) */
3804 }
3805 
3806 static pt_entry_t *
3807 pmap_map_ptp(struct vm_page *ptp)
3808 {
3809 #ifdef __HAVE_DIRECT_MAP
3810 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3811 #else
3812 	struct cpu_info *ci;
3813 	pt_entry_t *ptppte;
3814 	vaddr_t ptpva;
3815 
3816 	KASSERT(kpreempt_disabled());
3817 
3818 #ifndef XENPV
3819 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
3820 #else
3821 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
3822 #endif
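	/*
	 * Note: under XENPV the PTP is mapped without PTE_W, since the
	 * hypervisor keeps page table pages read-only once pinned and
	 * updates must go through hypercalls rather than direct stores.
	 */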
3823 
3824 	ci = curcpu();
3825 	ptpva = ci->vpage[VPAGE_PTP];
3826 	ptppte = ci->vpage_pte[VPAGE_PTP];
3827 
3828 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3829 
3830 	pmap_pte_flush();
3831 	pmap_update_pg(ptpva);
3832 
3833 	return (pt_entry_t *)ptpva;
3834 #endif
3835 }
3836 
3837 static void
3838 pmap_unmap_ptp(void)
3839 {
3840 #ifndef __HAVE_DIRECT_MAP
3841 #if defined(DIAGNOSTIC) || defined(XENPV)
3842 	struct cpu_info *ci;
3843 	pt_entry_t *pte;
3844 
3845 	KASSERT(kpreempt_disabled());
3846 
3847 	ci = curcpu();
3848 	pte = ci->vpage_pte[VPAGE_PTP];
3849 
3850 	if (*pte != 0) {
3851 		pmap_pte_set(pte, 0);
3852 		pmap_pte_flush();
3853 	}
3854 #endif
3855 #endif
3856 }
3857 
3858 static pt_entry_t *
3859 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3860 {
3861 
3862 	KASSERT(kpreempt_disabled());
3863 	if (pmap_is_curpmap(pmap)) {
3864 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3865 	}
3866 	KASSERT(ptp != NULL);
3867 	return pmap_map_ptp(ptp) + pl1_pi(va);
3868 }
3869 
3870 static void
3871 pmap_unmap_pte(void)
3872 {
3873 
3874 	KASSERT(kpreempt_disabled());
3875 
3876 	pmap_unmap_ptp();
3877 }
3878 
3879 /*
3880  * p m a p   r e m o v e   f u n c t i o n s
3881  *
3882  * functions that remove mappings
3883  */
3884 
3885 /*
3886  * pmap_remove_ptes: remove PTEs from a PTP
3887  *
3888  * => caller must hold pmap's lock
3889  * => PTP must be mapped into KVA
3890  * => PTP should be null if pmap == pmap_kernel()
3891  * => must be called with kernel preemption disabled
3892  * => TLB shootdowns are issued as needed for the removed mappings
3893  */
3894 static void
3895 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3896     vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3897 {
3898 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3899 
3900 	KASSERT(mutex_owned(&pmap->pm_lock));
3901 	KASSERT(kpreempt_disabled());
3902 
3903 	/*
3904 	 * mappings are very often sparse, so clip the given range to the
3905 	 * range of PTEs that are known present in the PTP.
3906 	 */
3907 	pmap_ptp_range_clip(ptp, &startva, &pte);
3908 
3909 	/*
3910 	 * note that ptpva points to the PTE that maps startva.   this may
3911 	 * or may not be the first PTE in the PTP.
3912 	 *
3913 	 * we loop through the PTP while there are still PTEs to look at
3914 	 * and the wire_count is greater than 1 (because we use the wire_count
3915 	 * to keep track of the number of real PTEs in the PTP).
3916 	 */
3917 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3918 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3919 		startva += PAGE_SIZE;
3920 		pte++;
3921 	}
3922 }
3923 
3924 /*
3925  * pmap_remove_pte: remove a single PTE from a PTP.
3926  *
3927  * => caller must hold pmap's lock
3928  * => PTP must be mapped into KVA
3929  * => PTP should be null if pmap == pmap_kernel()
3930  * => returns true if we removed a mapping
3931  * => must be called with kernel preemption disabled
3932  */
3933 static bool
3934 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3935     vaddr_t va, struct pv_entry **pv_tofree)
3936 {
3937 	struct pv_entry *pve;
3938 	struct vm_page *pg;
3939 	struct pmap_page *pp;
3940 	pt_entry_t opte;
3941 
3942 	KASSERT(mutex_owned(&pmap->pm_lock));
3943 	KASSERT(kpreempt_disabled());
3944 
3945 	if (!pmap_valid_entry(*pte)) {
3946 		/* VA not mapped. */
3947 		return false;
3948 	}
3949 
3950 	/* Atomically save the old PTE and zap it. */
3951 	opte = pmap_pte_testset(pte, 0);
3952 	if (!pmap_valid_entry(opte)) {
3953 		return false;
3954 	}
3955 
3956 	pmap_exec_account(pmap, va, opte, 0);
3957 	pmap_stats_update_bypte(pmap, 0, opte);
3958 
3959 	if (ptp) {
3960 		/*
3961 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3962 		 */
3963 		ptp->wire_count--;
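		/*
		 * If this was the last user mapping in the PTP, force
		 * PTE_A below so the shootdown path runs and no stale
		 * PDE/PTE remains cached before the PTP is freed.
		 */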
3964 		if (ptp->wire_count <= 1) {
3965 			opte |= PTE_A;
3966 		}
3967 	}
3968 
3969 	if ((opte & PTE_A) != 0) {
3970 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3971 	}
3972 
3973 	/*
3974 	 * If we are not on a pv list - we are done.
3975 	 */
3976 	if ((opte & PTE_PVLIST) == 0) {
3977 #ifndef DOM0OPS
3978 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3979 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
3980 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3981 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
3982 #endif
3983 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3984 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
3985 		return true;
3986 	}
3987 
3988 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3989 		pp = VM_PAGE_TO_PP(pg);
3990 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3991 		paddr_t pa = pmap_pte2pa(opte);
3992 		panic("%s: PTE_PVLIST with pv-untracked page"
3993 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
3994 		    __func__, va, pa, atop(pa));
3995 	}
3996 
3997 	/* Sync R/M bits. */
3998 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
3999 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4000 
4001 	if (pve) {
4002 		pve->pve_next = *pv_tofree;
4003 		*pv_tofree = pve;
4004 	}
4005 	return true;
4006 }
4007 
4008 /*
4009  * pmap_remove: mapping removal function.
4010  *
4011  * => caller should not be holding any pmap locks
4012  */
4013 void
4014 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4015 {
4016 	pt_entry_t *ptes;
4017 	pd_entry_t pde;
4018 	pd_entry_t * const *pdes;
4019 	struct pv_entry *pv_tofree = NULL;
4020 	bool result;
4021 	vaddr_t blkendva, va = sva;
4022 	struct vm_page *ptp;
4023 	struct pmap *pmap2;
4024 	int lvl;
4025 
4026 	if (__predict_false(pmap->pm_remove != NULL)) {
4027 		(*pmap->pm_remove)(pmap, sva, eva);
4028 		return;
4029 	}
4030 
4031 	mutex_enter(&pmap->pm_lock);
4032 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4033 
4034 	/*
4035 	 * removing one page?  take shortcut function.
4036 	 */
4037 
4038 	if (va + PAGE_SIZE == eva) {
4039 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4040 			KASSERT(lvl == 1);
4041 
4042 			/* Get PTP if non-kernel mapping. */
4043 			if (pmap != pmap_kernel()) {
4044 				ptp = pmap_find_ptp(pmap, va, 1);
4045 				KASSERTMSG(ptp != NULL,
4046 				    "%s: unmanaged PTP detected", __func__);
4047 			} else {
4048 				/* Never free kernel PTPs. */
4049 				ptp = NULL;
4050 			}
4051 
4052 			result = pmap_remove_pte(pmap, ptp,
4053 			    &ptes[pl1_i(va)], va, &pv_tofree);
4054 
4055 			/*
4056 			 * if mapping removed and the PTP is no longer
4057 			 * being used, free it!
4058 			 */
4059 
4060 			if (result && ptp && ptp->wire_count <= 1)
4061 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4062 		}
4063 	} else for (/* null */ ; va < eva ; va = blkendva) {
4064 		/* determine range of block */
4065 		blkendva = x86_round_pdr(va+1);
4066 		if (blkendva > eva)
4067 			blkendva = eva;
4068 
4069 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4070 			/* Skip a range corresponding to an invalid pde. */
4071 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4072 			continue;
4073 		}
4074 		KASSERT(lvl == 1);
4075 
4076 		/* Get PTP if non-kernel mapping. */
4077 		if (pmap != pmap_kernel()) {
4078 			ptp = pmap_find_ptp(pmap, va, 1);
4079 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4080 			    __func__);
4081 		} else {
4082 			/* Never free kernel PTPs. */
4083 			ptp = NULL;
4084 		}
4085 
4086 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4087 		    blkendva, &pv_tofree);
4088 
4089 		/* If PTP is no longer being used, free it. */
4090 		if (ptp && ptp->wire_count <= 1) {
4091 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4092 		}
4093 	}
4094 	pmap_unmap_ptes(pmap, pmap2);
4095 	/*
4096 	 * Now safe to free, as we no longer have the PTEs mapped and can
4097 	 * block again.
4098 	 */
4099 	if (pv_tofree != NULL) {
4100 		pmap_free_pvs(pmap, pv_tofree);
4101 	}
4102 	mutex_exit(&pmap->pm_lock);
4103 }
4104 
4105 /*
4106  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4107  *
4108  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4109  * => Caller should disable kernel preemption.
4110  * => issues tlb shootdowns if necessary.
4111  */
4112 static int
4113 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4114     pt_entry_t *optep)
4115 {
4116 	struct pmap *pmap;
4117 	struct vm_page *ptp;
4118 	vaddr_t va;
4119 	pt_entry_t *ptep;
4120 	pt_entry_t opte;
4121 	pt_entry_t npte;
4122 	pt_entry_t expect;
4123 	bool need_shootdown;
4124 
4125 	ptp = pvpte->pte_ptp;
4126 	va = pvpte->pte_va;
4127 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4128 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4129 	pmap = ptp_to_pmap(ptp);
4130 	KASSERT(kpreempt_disabled());
4131 
4132 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4133 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4134 		    optep);
4135 	}
4136 
4137 	expect = pmap_pa2pte(pa) | PTE_P;
4138 
4139 	if (clearbits != ~0) {
4140 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4141 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4142 	}
4143 
4144 	ptep = pmap_map_pte(pmap, ptp, va);
4145 	do {
4146 		opte = *ptep;
4147 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4148 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4149 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4150 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4151 			/*
4152 			 * We lost a race with a V->P operation like
4153 			 * pmap_remove().  Wait for the competitor to
4154 			 * finish reflecting the PTE bits into pp_attrs.
4155 			 */
4156 			pmap_unmap_pte();
4157 			return EAGAIN;
4158 		}
4159 
4160 		/*
4161 		 * Check if there's anything to do on this PTE.
4162 		 */
4163 		if ((opte & clearbits) == 0) {
4164 			need_shootdown = false;
4165 			break;
4166 		}
4167 
4168 		/*
4169 		 * We need a shootdown if the PTE is cached (PTE_A set),
4170 		 * unless we are only clearing the PTE_W bit and the page
4171 		 * has not been cached as writable (PTE_D clear).
4172 		 */
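		/*
		 * Rationale: if only PTE_W is cleared and the page is still
		 * clean, the CPU must re-walk the page tables to set PTE_D
		 * on the first write, will observe the cleared PTE_W, and
		 * will fault - so no immediate flush is required.
		 */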
4173 		need_shootdown = (opte & PTE_A) != 0 &&
4174 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4175 
4176 		npte = opte & ~clearbits;
4177 
4178 		/*
4179 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4180 		 */
4181 		if (need_shootdown) {
4182 			npte &= ~(PTE_A | PTE_D);
4183 		}
4184 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4185 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4186 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4187 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4188 
4189 	if (need_shootdown) {
4190 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4191 	}
4192 	pmap_unmap_pte();
4193 
4194 	*oattrs = pmap_pte_to_pp_attrs(opte);
4195 	if (optep != NULL)
4196 		*optep = opte;
4197 	return 0;
4198 }
4199 
4200 static void
4201 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4202     vaddr_t va)
4203 {
4204 	struct pmap *pmap2;
4205 	pt_entry_t *ptes;
4206 	pd_entry_t * const *pdes;
4207 
4208 	KASSERT(mutex_owned(&pmap->pm_lock));
4209 
4210 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4211 	pmap_stats_update_bypte(pmap, 0, opte);
4212 	ptp->wire_count--;
4213 	if (ptp->wire_count <= 1) {
4214 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4215 	}
4216 	pmap_unmap_ptes(pmap, pmap2);
4217 }
4218 
4219 static void
4220 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4221 {
4222 	struct pv_pte *pvpte;
4223 	struct vm_page *ptp;
4224 	uintptr_t sum;
4225 	uint8_t oattrs;
4226 	bool locked;
4227 
4228 	/*
4229 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4230 	 * pmap_remove_all() was called before amap_wipeout() for a process-
4231 	 * private amap - a common case.  The page being removed must be on the way
4232 	 * out, so we don't have to worry about concurrent attempts to enter
4233 	 * it (otherwise the caller either doesn't care or has screwed up).
4234 	 */
4235 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4236 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4237 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4238 	if (sum == 0) {
4239 		return;
4240 	}
4241 
4242 	kpreempt_disable();
4243 	for (;;) {
4244 		struct pmap *pmap;
4245 		struct pv_entry *pve;
4246 		pt_entry_t opte;
4247 		vaddr_t va;
4248 
4249 		mutex_spin_enter(&pp->pp_lock);
4250 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4251 			mutex_spin_exit(&pp->pp_lock);
4252 			break;
4253 		}
4254 
4255 		/*
4256 		 * Add a reference to the pmap before clearing the pte.
4257 		 * Otherwise the pmap can disappear behind us.
4258 		 */
4259 		ptp = pvpte->pte_ptp;
4260 		pmap = ptp_to_pmap(ptp);
4261 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4262 		if (ptp != NULL) {
4263 			pmap_reference(pmap);
4264 		}
4265 
4266 		/*
4267 		 * Now try to lock it.  We need a direct handoff between
4268 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4269 		 * and kept associated with this pmap.  If that can't be
4270 		 * had, wait for the pmap's lock to become free and then
4271 		 * retry.
4272 		 */
4273 		locked = mutex_tryenter(&pmap->pm_lock);
4274 		mutex_spin_exit(&pp->pp_lock);
4275 		if (!locked) {
4276 			mutex_enter(&pmap->pm_lock);
4277 			/* nothing, just wait for it */
4278 			mutex_exit(&pmap->pm_lock);
4279 			if (ptp != NULL) {
4280 				pmap_destroy(pmap);
4281 			}
4282 			continue;
4283 		}
4284 		va = pvpte->pte_va;
4285 
4286 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4287 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4288 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4289 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4290 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4291 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4292 
4293 #ifdef DEBUG
4294 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4295 		rb_tree_t *tree = (ptp != NULL ?
4296 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4297 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4298 		if (pve == NULL) {
4299 			KASSERTMSG(&pp->pp_pte == pvpte,
4300 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4301 			    va, pmap, ptp, pvpte, pve);
4302 		} else {
4303 			KASSERTMSG(&pve->pve_pte == pvpte,
4304 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4305 			    va, pmap, ptp, pvpte, pve);
4306 		}
4307 #endif
4308 
4309 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4310 			panic("pmap_pp_remove: mapping not present");
4311 		}
4312 
4313 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4314 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4315 
4316 		/* Update the PTP reference count. Free if last reference. */
4317 		if (ptp != NULL) {
4318 			KASSERT(pmap != pmap_kernel());
4319 			pmap_tlb_shootnow();
4320 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4321 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4322 			} else {
4323 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4324 			}
4325 		} else {
4326 			KASSERT(pmap == pmap_kernel());
4327 			pmap_stats_update_bypte(pmap, 0, opte);
4328 		}
4329 		if (pve != NULL) {
4330 			pve->pve_next = NULL;
4331 			pmap_free_pvs(pmap, pve);
4332 		}
4333 		pmap_tlb_shootnow();
4334 		mutex_exit(&pmap->pm_lock);
4335 		if (ptp != NULL) {
4336 			pmap_destroy(pmap);
4337 		}
4338 	}
4339 	kpreempt_enable();
4340 }
4341 
4342 /*
4343  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4344  *
4345  * => R/M bits are sync'd back to attrs
4346  */
4347 void
4348 pmap_page_remove(struct vm_page *pg)
4349 {
4350 	struct pmap_page *pp;
4351 	paddr_t pa;
4352 
4353 	pp = VM_PAGE_TO_PP(pg);
4354 	pa = VM_PAGE_TO_PHYS(pg);
4355 	pmap_pp_remove(pp, pa);
4356 }
4357 
4358 /*
4359  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4360  * that map it
4361  */
4362 void
4363 pmap_pv_remove(paddr_t pa)
4364 {
4365 	struct pmap_page *pp;
4366 
4367 	pp = pmap_pv_tracked(pa);
4368 	if (pp == NULL)
4369 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4370 	pmap_pp_remove(pp, pa);
4371 }
4372 
4373 /*
4374  * p m a p   a t t r i b u t e  f u n c t i o n s
4375  * functions that test/change managed page's attributes
4376  * since a page can be mapped multiple times we must check each PTE that
4377  * maps it by going down the pv lists.
4378  */
4379 
4380 /*
4381  * pmap_test_attrs: test a page's attributes
4382  */
4383 bool
4384 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4385 {
4386 	struct pmap_page *pp;
4387 	struct pv_pte *pvpte;
4388 	struct pmap *pmap;
4389 	uint8_t oattrs;
4390 	u_int result;
4391 	paddr_t pa;
4392 
4393 	pp = VM_PAGE_TO_PP(pg);
4394 	if ((pp->pp_attrs & testbits) != 0) {
4395 		return true;
4396 	}
4397 	pa = VM_PAGE_TO_PHYS(pg);
4398  startover:
4399 	mutex_spin_enter(&pp->pp_lock);
4400 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4401 		if ((pp->pp_attrs & testbits) != 0) {
4402 			break;
4403 		}
4404 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4405 			/*
4406 			 * raced with a V->P operation.  wait for the other
4407 			 * side to finish by acquiring pmap's lock.  if no
4408 			 * wait, updates to pp_attrs by the other side may
4409 			 * go unseen.
4410 			 */
4411 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4412 			pmap_reference(pmap);
4413 			mutex_spin_exit(&pp->pp_lock);
4414 			mutex_enter(&pmap->pm_lock);
4415 			/* nothing. */
4416 			mutex_exit(&pmap->pm_lock);
4417 			pmap_destroy(pmap);
4418 			goto startover;
4419 		}
4420 		pp->pp_attrs |= oattrs;
4421 	}
4422 	result = pp->pp_attrs & testbits;
4423 	mutex_spin_exit(&pp->pp_lock);
4424 
4425 	/*
4426 	 * note that we will exit the for loop with a non-NULL pvpte if
4427 	 * we have found the bits we are testing for.
4428 	 */
4429 
4430 	return result != 0;
4431 }
4432 
4433 static bool
4434 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4435 {
4436 	struct pv_pte *pvpte;
4437 	struct pmap *pmap;
4438 	uint8_t oattrs;
4439 	u_int result;
4440 
4441 startover:
4442 	mutex_spin_enter(&pp->pp_lock);
4443 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4444 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4445 			/*
4446 			 * raced with a V->P operation.  wait for the other
4447 			 * side to finish by acquiring pmap's lock.  it is
4448 			 * probably unmapping the page, and it will be gone
4449 			 * when the loop is restarted.
4450 			 */
4451 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4452 			pmap_reference(pmap);
4453 			mutex_spin_exit(&pp->pp_lock);
4454 			mutex_enter(&pmap->pm_lock);
4455 			/* nothing. */
4456 			mutex_exit(&pmap->pm_lock);
4457 			pmap_destroy(pmap);
4458 			goto startover;
4459 		}
4460 		pp->pp_attrs |= oattrs;
4461 	}
4462 	result = pp->pp_attrs & clearbits;
4463 	pp->pp_attrs &= ~clearbits;
4464 	pmap_tlb_shootnow();
4465 	mutex_spin_exit(&pp->pp_lock);
4466 
4467 	return result != 0;
4468 }
4469 
4470 /*
4471  * pmap_clear_attrs: clear the specified attribute for a page.
4472  *
4473  * => we return true if we cleared one of the bits we were asked to
4474  */
4475 bool
4476 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4477 {
4478 	struct pmap_page *pp;
4479 	paddr_t pa;
4480 
4481 	pp = VM_PAGE_TO_PP(pg);
4482 	pa = VM_PAGE_TO_PHYS(pg);
4483 
4484 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4485 }
4486 
4487 /*
4488  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4489  * pv-tracked page.
4490  */
4491 bool
4492 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4493 {
4494 	struct pmap_page *pp;
4495 
4496 	pp = pmap_pv_tracked(pa);
4497 	if (pp == NULL)
4498 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4499 
4500 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4501 }
4502 
4503 /*
4504  * p m a p   p r o t e c t i o n   f u n c t i o n s
4505  */
4506 
4507 /*
4508  * pmap_page_protect: change the protection of all recorded mappings
4509  * of a managed page
4510  *
4511  * => NOTE: this is an inline function in pmap.h
4512  */
4513 
4514 /* see pmap.h */
4515 
4516 /*
4517  * pmap_pv_protect: change the protection of all recorded mappings
4518  * of an unmanaged pv-tracked page
4519  *
4520  * => NOTE: this is an inline function in pmap.h
4521  */
4522 
4523 /* see pmap.h */
4524 
4525 /*
4526  * pmap_protect: set the protection of a range of pages in a pmap
4527  *
4528  * => NOTE: this is an inline function in pmap.h
4529  */
4530 
4531 /* see pmap.h */
4532 
4533 /*
4534  * pmap_write_protect: write-protect pages in a pmap.
4535  *
4536  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4537  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4538  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4539  * present the page will still be considered as a kernel page, and the privilege
4540  * separation will be enforced correctly.
4541  */
4542 void
4543 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4544 {
4545 	pt_entry_t bit_rem, bit_put;
4546 	pt_entry_t *ptes;
4547 	pt_entry_t * const *pdes;
4548 	struct pmap *pmap2;
4549 	vaddr_t blockend, va;
4550 	int lvl, i;
4551 
4552 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4553 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4554 		return;
4555 	}
4556 
4557 	bit_rem = 0;
4558 	if (!(prot & VM_PROT_WRITE))
4559 		bit_rem = PTE_W;
4560 
4561 	bit_put = 0;
4562 	if (!(prot & VM_PROT_EXECUTE))
4563 		bit_put = pmap_pg_nx;
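	/* E.g. prot == VM_PROT_READ strips PTE_W and adds NX (if supported). */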
4564 
4565 	sva &= ~PAGE_MASK;
4566 	eva &= ~PAGE_MASK;
4567 
4568 	/*
4569 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4570 	 * be touching PV entries nor stats and kernel PDEs aren't
4571 	 * freed.
4572 	 */
4573 	if (pmap != pmap_kernel()) {
4574 		mutex_enter(&pmap->pm_lock);
4575 	}
4576 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4577 
4578 	for (va = sva ; va < eva; va = blockend) {
4579 		pt_entry_t *spte, *epte;
4580 
4581 		blockend = x86_round_pdr(va + 1);
4582 		if (blockend > eva)
4583 			blockend = eva;
4584 
4585 		/* Is it a valid block? */
4586 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4587 			continue;
4588 		}
4589 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4590 		KASSERT(lvl == 1);
4591 
4592 		spte = &ptes[pl1_i(va)];
4593 		epte = &ptes[pl1_i(blockend)];
4594 
4595 		for (i = 0; spte < epte; spte++, i++) {
4596 			pt_entry_t opte, npte;
4597 
4598 			do {
4599 				opte = *spte;
4600 				if (!pmap_valid_entry(opte)) {
4601 					goto next;
4602 				}
4603 				npte = (opte & ~bit_rem) | bit_put;
4604 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4605 
4606 			if ((opte & PTE_D) != 0) {
4607 				vaddr_t tva = va + x86_ptob(i);
4608 				pmap_tlb_shootdown(pmap, tva, opte,
4609 				    TLBSHOOT_WRITE_PROTECT);
4610 			}
4611 next:;
4612 		}
4613 	}
4614 
4615 	/* Release pmap. */
4616 	pmap_unmap_ptes(pmap, pmap2);
4617 	if (pmap != pmap_kernel()) {
4618 		mutex_exit(&pmap->pm_lock);
4619 	}
4620 }
4621 
4622 /*
4623  * pmap_unwire: clear the wired bit in the PTE.
4624  *
4625  * => Mapping should already be present.
4626  */
4627 void
4628 pmap_unwire(struct pmap *pmap, vaddr_t va)
4629 {
4630 	pt_entry_t *ptes, *ptep, opte;
4631 	pd_entry_t * const *pdes;
4632 	struct pmap *pmap2;
4633 	int lvl;
4634 
4635 	if (__predict_false(pmap->pm_unwire != NULL)) {
4636 		(*pmap->pm_unwire)(pmap, va);
4637 		return;
4638 	}
4639 
4640 	/*
4641 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4642 	 * statistics.
4643 	 */
4644 	mutex_enter(&pmap->pm_lock);
4645 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4646 
4647 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4648 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4649 	}
4650 	KASSERT(lvl == 1);
4651 
4652 	ptep = &ptes[pl1_i(va)];
4653 	opte = *ptep;
4654 	KASSERT(pmap_valid_entry(opte));
4655 
4656 	if (opte & PTE_WIRED) {
4657 		pt_entry_t npte = opte & ~PTE_WIRED;
4658 
4659 		opte = pmap_pte_testset(ptep, npte);
4660 		pmap_stats_update_bypte(pmap, npte, opte);
4661 	} else {
4662 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4663 		    " did not change!\n", __func__, pmap, va);
4664 	}
4665 
4666 	/* Release pmap. */
4667 	pmap_unmap_ptes(pmap, pmap2);
4668 	mutex_exit(&pmap->pm_lock);
4669 }
4670 
4671 /*
4672  * pmap_copy: copy mappings from one pmap to another
4673  *
4674  * => optional function
4675  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4676  */
4677 
4678 /*
4679  * defined as macro in pmap.h
4680  */
4681 
4682 __strict_weak_alias(pmap_enter, pmap_enter_default);
4683 
4684 int
4685 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4686     u_int flags)
4687 {
4688 	if (__predict_false(pmap->pm_enter != NULL)) {
4689 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4690 	}
4691 
4692 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4693 }
4694 
4695 /*
4696  * pmap_enter: enter a mapping into a pmap
4697  *
4698  * => must be done "now" ... no lazy-evaluation
4699  */
4700 int
4701 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4702 	   vm_prot_t prot, u_int flags, int domid)
4703 {
4704 	pt_entry_t *ptes, opte, npte;
4705 	pt_entry_t *ptep;
4706 	pd_entry_t * const *pdes;
4707 	struct vm_page *ptp;
4708 	struct vm_page *new_pg, *old_pg;
4709 	struct pmap_page *new_pp, *old_pp;
4710 	struct pv_entry *old_pve, *new_pve;
4711 	bool wired = (flags & PMAP_WIRED) != 0;
4712 	struct pmap *pmap2;
4713 	struct pmap_ptparray pt;
4714 	int error;
4715 	bool getptp, samepage, new_embedded;
4716 	rb_tree_t *tree;
4717 
4718 	KASSERT(pmap_initialized);
4719 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4720 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4721 	    PRIxVADDR " over PDP!", __func__, va);
4722 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4723 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4724 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4725 
4726 #ifdef XENPV
4727 	KASSERT(domid == DOMID_SELF || pa == 0);
4728 #endif
4729 
4730 	npte = ma | protection_codes[prot] | PTE_P;
4731 	npte |= pmap_pat_flags(flags);
4732 	if (wired)
4733 	        npte |= PTE_WIRED;
4734 	if (va < VM_MAXUSER_ADDRESS)
4735 		npte |= PTE_U;
4736 
4737 	if (pmap == pmap_kernel())
4738 		npte |= pmap_pg_g;
4739 	if (flags & VM_PROT_ALL) {
4740 		npte |= PTE_A;
4741 		if (flags & VM_PROT_WRITE) {
4742 			KASSERT((npte & PTE_W) != 0);
4743 			npte |= PTE_D;
4744 		}
4745 	}
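	/*
	 * E.g. a write fault pre-sets both PTE_A and PTE_D here so the
	 * hardware does not have to take an extra walk just to mark the
	 * page referenced and modified.
	 */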
4746 
4747 #ifdef XENPV
4748 	if (domid != DOMID_SELF)
4749 		new_pg = NULL;
4750 	else
4751 #endif
4752 		new_pg = PHYS_TO_VM_PAGE(pa);
4753 
4754 	if (new_pg != NULL) {
4755 		/* This is a managed page */
4756 		npte |= PTE_PVLIST;
4757 		new_pp = VM_PAGE_TO_PP(new_pg);
4758 		PMAP_CHECK_PP(new_pp);
4759 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4760 		/* This is an unmanaged pv-tracked page */
4761 		npte |= PTE_PVLIST;
4762 		PMAP_CHECK_PP(new_pp);
4763 	} else {
4764 		new_pp = NULL;
4765 	}
4766 
4767 	/* Begin by locking the pmap. */
4768 	mutex_enter(&pmap->pm_lock);
4769 
4770 	/* Look up the PTP.  Allocate if none present. */
4771 	ptp = NULL;
4772 	getptp = false;
4773 	if (pmap != pmap_kernel()) {
4774 		ptp = pmap_find_ptp(pmap, va, 1);
4775 		if (ptp == NULL) {
4776 			getptp = true;
4777 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
4778 			if (error != 0) {
4779 				if (flags & PMAP_CANFAIL) {
4780 					mutex_exit(&pmap->pm_lock);
4781 					return error;
4782 				}
4783 				panic("%s: get ptp failed, error=%d", __func__,
4784 				    error);
4785 			}
4786 		}
4787 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
4788 	} else {
4789 		/* Embedded PV entries rely on this. */
4790 		KASSERT(va != 0);
4791 		tree = &pmap_kernel_rb;
4792 	}
4793 
4794 	/*
4795 	 * Look up the old PV entry at this VA (if any), and insert a new PV
4796 	 * entry if required for the new mapping.  Temporarily track the old
4797 	 * and new mappings concurrently.  Only after the old mapping is
4798 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
4799 	 * our picture of modified/accessed state for either page could get
4800 	 * out of sync (we need any P->V operation for either page to stall
4801 	 * on pmap->pm_lock until done here).
4802 	 */
4803 	new_pve = NULL;
4804 	old_pve = NULL;
4805 	samepage = false;
4806 	new_embedded = false;
4807 
4808 	if (new_pp != NULL) {
4809 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
4810 		    &old_pve, &samepage, &new_embedded, tree);
4811 
4812 		/*
4813 		 * If a new pv_entry was needed and none was available, we
4814 		 * can go no further.
4815 		 */
4816 		if (error != 0) {
4817 			if (flags & PMAP_CANFAIL) {
4818 				if (getptp) {
4819 					pmap_unget_ptp(pmap, &pt);
4820 				}
4821 				mutex_exit(&pmap->pm_lock);
4822 				return error;
4823 			}
4824 			panic("%s: alloc pve failed", __func__);
4825 		}
4826 	} else {
4827 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4828 	}
4829 
4830 	/* Map PTEs into address space. */
4831 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4832 
4833 	/* Install any newly allocated PTPs. */
4834 	if (getptp) {
4835 		pmap_install_ptp(pmap, &pt, va, pdes);
4836 	}
4837 
4838 	/* Check if there is an existing mapping. */
4839 	ptep = &ptes[pl1_i(va)];
4840 	opte = *ptep;
4841 	bool have_oldpa = pmap_valid_entry(opte);
4842 	paddr_t oldpa = pmap_pte2pa(opte);
4843 
4844 	/*
4845 	 * Update the pte.
4846 	 */
4847 	do {
4848 		opte = *ptep;
4849 
4850 		/*
4851 		 * if the same page, inherit PTE_A and PTE_D.
4852 		 */
4853 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4854 			npte |= opte & (PTE_A | PTE_D);
4855 		}
4856 #if defined(XENPV)
4857 		if (domid != DOMID_SELF) {
4858 			/* pmap_pte_cas with error handling */
4859 			int s = splvm();
4860 			if (opte != *ptep) {
4861 				splx(s);
4862 				continue;
4863 			}
4864 			error = xpq_update_foreign(
4865 			    vtomach((vaddr_t)ptep), npte, domid);
4866 			splx(s);
4867 			if (error) {
4868 				/* Undo pv_entry tracking - oof. */
4869 				if (new_pp != NULL) {
4870 					mutex_spin_enter(&new_pp->pp_lock);
4871 					if (new_pve != NULL) {
4872 						LIST_REMOVE(new_pve, pve_list);
4873 						KASSERT(pmap->pm_pve == NULL);
4874 						pmap->pm_pve = new_pve;
4875 					} else if (new_embedded) {
4876 						new_pp->pp_pte.pte_ptp = NULL;
4877 						new_pp->pp_pte.pte_va = 0;
4878 					}
4879 					mutex_spin_exit(&new_pp->pp_lock);
4880 				}
4881 				pmap_unmap_ptes(pmap, pmap2);
4882 				/* Free new PTP. */
4883 				if (ptp != NULL && ptp->wire_count <= 1) {
4884 					pmap_free_ptp(pmap, ptp, va, ptes,
4885 					    pdes);
4886 				}
4887 				mutex_exit(&pmap->pm_lock);
4888 				return error;
4889 			}
4890 			break;
4891 		}
4892 #endif /* defined(XENPV) */
4893 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4894 
4895 	/*
4896 	 * Done with the PTEs: they can now be unmapped.
4897 	 */
4898 	pmap_unmap_ptes(pmap, pmap2);
4899 
4900 	/*
4901 	 * Update statistics and PTP's reference count.
4902 	 */
4903 	pmap_stats_update_bypte(pmap, npte, opte);
4904 	if (ptp != NULL) {
4905 		if (!have_oldpa) {
4906 			ptp->wire_count++;
4907 		}
4908 		/* Remember minimum VA in PTP. */
4909 		pmap_ptp_range_set(ptp, va);
4910 	}
4911 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4912 
4913 	/*
4914 	 * If the same page, we can skip pv_entry handling.
4915 	 */
4916 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4917 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
4918 		if ((npte & PTE_PVLIST) != 0) {
4919 			KASSERT(samepage);
4920 			pmap_check_pv(pmap, ptp, new_pp, va, true);
4921 		}
4922 		goto same_pa;
4923 	} else if ((npte & PTE_PVLIST) != 0) {
4924 		KASSERT(!samepage);
4925 	}
4926 
4927 	/*
4928 	 * If old page is pv-tracked, remove pv_entry from its list.
4929 	 */
4930 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
4931 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
4932 			old_pp = VM_PAGE_TO_PP(old_pg);
4933 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
4934 			panic("%s: PTE_PVLIST with pv-untracked page"
4935 			    " va = %#"PRIxVADDR
4936 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
4937 			    __func__, va, oldpa, atop(pa));
4938 		}
4939 
4940 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
4941 		    pmap_pte_to_pp_attrs(opte));
4942 		if (old_pve != NULL) {
4943 			if (pmap->pm_pve == NULL) {
4944 				pmap->pm_pve = old_pve;
4945 			} else {
4946 				pool_cache_put(&pmap_pv_cache, old_pve);
4947 			}
4948 		}
4949 	} else {
4950 		KASSERT(old_pve == NULL);
4951 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
4952 	}
4953 
4954 	/*
4955 	 * If new page is dynamically PV tracked, insert to tree.
4956 	 */
4957 	if (new_pve != NULL) {
4958 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
4959 		old_pve = rb_tree_insert_node(tree, new_pve);
4960 		KASSERT(old_pve == new_pve);
4961 		pmap_check_pv(pmap, ptp, new_pp, va, true);
4962 	}
4963 
4964 same_pa:
4965 	/*
4966 	 * shootdown tlb if necessary.
4967 	 */
4968 
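	/*
	 * The old mapping can only be in the TLB if it was both present
	 * and accessed; even then a flush is needed only if the frame or
	 * the write permission actually changed.
	 */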
4969 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
4970 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
4971 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4972 	}
4973 	mutex_exit(&pmap->pm_lock);
4974 	return 0;
4975 }
4976 
4977 paddr_t
4978 pmap_get_physpage(void)
4979 {
4980 	struct vm_page *ptp;
4981 	struct pmap *kpm = pmap_kernel();
4982 	paddr_t pa;
4983 
4984 	if (!uvm.page_init_done) {
4985 		/*
4986 		 * We're growing the kernel pmap early (from
4987 		 * uvm_pageboot_alloc()). This case must be
4988 		 * handled a little differently.
4989 		 */
4990 
4991 		if (!uvm_page_physget(&pa))
4992 			panic("%s: out of memory", __func__);
4993 #if defined(__HAVE_DIRECT_MAP)
4994 		pagezero(PMAP_DIRECT_MAP(pa));
4995 #else
4996 #if defined(XENPV)
4997 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4998 			xen_pagezero(pa);
4999 			return pa;
5000 		}
5001 #endif
5002 		kpreempt_disable();
5003 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5004 		    PTE_W | pmap_pg_nx);
5005 		pmap_pte_flush();
5006 		pmap_update_pg((vaddr_t)early_zerop);
5007 		memset(early_zerop, 0, PAGE_SIZE);
5008 #if defined(DIAGNOSTIC) || defined(XENPV)
5009 		pmap_pte_set(early_zero_pte, 0);
5010 		pmap_pte_flush();
5011 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5012 		kpreempt_enable();
5013 #endif /* defined(__HAVE_DIRECT_MAP) */
5014 	} else {
5015 		/* XXX */
5016 		ptp = uvm_pagealloc(NULL, 0, NULL,
5017 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5018 		if (ptp == NULL)
5019 			panic("%s: out of memory", __func__);
5020 		ptp->flags &= ~PG_BUSY;
5021 		ptp->wire_count = 1;
5022 		pa = VM_PAGE_TO_PHYS(ptp);
5023 	}
5024 	pmap_stats_update(kpm, 1, 0);
5025 
5026 	return pa;
5027 }
5028 
5029 /*
5030  * Expand the page tree with the specified amount of PTPs, mapping virtual
5031  * addresses starting at kva. We populate all the levels but the last one
5032  * (L1). The nodes of the tree are created as RW, but the pages covered
5033  * will be kentered in L1, with proper permissions.
5034  *
5035  * Used only by pmap_growkernel.
5036  */
5037 static void
5038 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5039 {
5040 	unsigned long i;
5041 	paddr_t pa;
5042 	unsigned long index, endindex;
5043 	int level;
5044 	pd_entry_t *pdep;
5045 #ifdef XENPV
5046 	int s = splvm(); /* protect xpq_* */
5047 #endif
5048 
5049 	for (level = PTP_LEVELS; level > 1; level--) {
5050 		if (level == PTP_LEVELS)
5051 			pdep = cpm->pm_pdir;
5052 		else
5053 			pdep = normal_pdes[level - 2];
5054 		index = pl_i_roundup(kva, level);
5055 		endindex = index + needed_ptps[level - 1] - 1;
5056 
5057 		for (i = index; i <= endindex; i++) {
5058 			pt_entry_t pte;
5059 
5060 			KASSERT(!pmap_valid_entry(pdep[i]));
5061 			pa = pmap_get_physpage();
5062 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5063 #ifdef __x86_64__
5064 			pte |= pmap_pg_nx;
5065 #endif
5066 			pmap_pte_set(&pdep[i], pte);
5067 
5068 #ifdef XENPV
5069 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5070 				if (__predict_true(
5071 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5072 					/* update per-cpu PMDs on all cpus */
5073 					xen_kpm_sync(pmap_kernel(), i);
5074 				} else {
5075 					/*
5076 					 * too early; update primary CPU
5077 					 * PMD only (without locks)
5078 					 */
5079 #ifdef __x86_64__
5080 					pd_entry_t *cpu_pdep =
5081 						&cpu_info_primary.ci_kpm_pdir[i];
5082 #else
5083 					pd_entry_t *cpu_pdep =
5084 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5085 #endif
5086 					pmap_pte_set(cpu_pdep, pte);
5087 				}
5088 			}
5089 #endif
5090 
5091 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5092 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5093 			nkptp[level - 1]++;
5094 		}
5095 		pmap_pte_flush();
5096 	}
5097 #ifdef XENPV
5098 	splx(s);
5099 #endif
5100 }
5101 
5102 /*
5103  * pmap_growkernel: increase usage of KVM space.
5104  *
5105  * => we allocate new PTPs for the kernel and install them in all
5106  *    the pmaps on the system.
5107  */
5108 vaddr_t
5109 pmap_growkernel(vaddr_t maxkvaddr)
5110 {
5111 	struct pmap *kpm = pmap_kernel();
5112 	struct pmap *cpm;
5113 #if !defined(XENPV) || !defined(__x86_64__)
5114 	struct pmap *pm;
5115 	long old;
5116 #endif
5117 	int s, i;
5118 	long needed_kptp[PTP_LEVELS], target_nptp;
5119 	bool invalidate = false;
5120 
5121 	s = splvm();	/* to be safe */
5122 	mutex_enter(&kpm->pm_lock);
5123 
5124 	if (maxkvaddr <= pmap_maxkvaddr) {
5125 		mutex_exit(&kpm->pm_lock);
5126 		splx(s);
5127 		return pmap_maxkvaddr;
5128 	}
5129 
5130 	maxkvaddr = x86_round_pdr(maxkvaddr);
5131 #if !defined(XENPV) || !defined(__x86_64__)
5132 	old = nkptp[PTP_LEVELS - 1];
5133 #endif
5134 
5135 	/* Initialize needed_kptp. */
5136 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5137 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5138 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5139 
5140 		if (target_nptp > nkptpmax[i])
5141 			panic("out of KVA space");
5142 		KASSERT(target_nptp >= nkptp[i]);
5143 		needed_kptp[i] = target_nptp - nkptp[i];
5144 	}
5145 
5146 #ifdef XENPV
5147 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5148 	cpm = kpm;
5149 #else
5150 	/* Get the current pmap */
5151 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5152 		cpm = curcpu()->ci_pmap;
5153 	} else {
5154 		cpm = kpm;
5155 	}
5156 #endif
5157 
5158 	kasan_shadow_map((void *)pmap_maxkvaddr,
5159 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5160 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5161 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5162 
5163 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5164 
5165 	/*
5166 	 * If the number of top level entries changed, update all pmaps.
5167 	 */
5168 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5169 #ifdef XENPV
5170 #ifdef __x86_64__
5171 		/* nothing, kernel entries are never entered in user pmap */
5172 #else
5173 		int pdkidx;
5174 
5175 		mutex_enter(&pmaps_lock);
5176 		LIST_FOREACH(pm, &pmaps, pm_list) {
5177 			for (pdkidx = PDIR_SLOT_KERN + old;
5178 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5179 			    pdkidx++) {
5180 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5181 				    kpm->pm_pdir[pdkidx]);
5182 			}
5183 			pmap_pte_flush();
5184 		}
5185 		mutex_exit(&pmaps_lock);
5186 #endif /* __x86_64__ */
5187 #else /* XENPV */
5188 		size_t newpdes;
5189 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5190 		if (cpm != kpm) {
5191 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5192 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5193 			    newpdes * sizeof(pd_entry_t));
5194 		}
5195 
5196 		mutex_enter(&pmaps_lock);
5197 		LIST_FOREACH(pm, &pmaps, pm_list) {
5198 			if (__predict_false(pm->pm_enter != NULL)) {
5199 				/*
5200 				 * Not a native pmap, the kernel is not mapped,
5201 				 * so nothing to synchronize.
5202 				 */
5203 				continue;
5204 			}
5205 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5206 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5207 			    newpdes * sizeof(pd_entry_t));
5208 		}
5209 		mutex_exit(&pmaps_lock);
5210 #endif
5211 		invalidate = true;
5212 	}
5213 	pmap_maxkvaddr = maxkvaddr;
5214 	mutex_exit(&kpm->pm_lock);
5215 	splx(s);
5216 
5217 	if (invalidate && pmap_initialized) {
5218 		/* Invalidate the pmap cache. */
5219 		pool_cache_invalidate(&pmap_cache);
5220 	}
5221 
5222 	return maxkvaddr;
5223 }
5224 
5225 #ifdef DEBUG
5226 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5227 
5228 /*
5229  * pmap_dump: dump all the mappings from a pmap
5230  *
5231  * => caller should not be holding any pmap locks
5232  */
5233 void
5234 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5235 {
5236 	pt_entry_t *ptes, *pte;
5237 	pd_entry_t * const *pdes;
5238 	struct pmap *pmap2;
5239 	vaddr_t blkendva;
5240 	int lvl;
5241 
5242 	/*
5243 	 * if end is out of range truncate.
5244 	 * if (end <= start) update to max.
5245 	 */
5246 
5247 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5248 		eva = VM_MAXUSER_ADDRESS;
5249 
5250 	mutex_enter(&pmap->pm_lock);
5251 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5252 
5253 	/*
5254 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
5255 	 * dumping a range of pages: we dump in PTP-sized blocks
5256 
5257 	for (/* null */ ; sva < eva ; sva = blkendva) {
5258 
5259 		/* determine range of block */
5260 		blkendva = x86_round_pdr(sva+1);
5261 		if (blkendva > eva)
5262 			blkendva = eva;
5263 
5264 		/* valid block? */
5265 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5266 			continue;
5267 		KASSERT(lvl == 1);
5268 
5269 		pte = &ptes[pl1_i(sva)];
5270 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5271 			if (!pmap_valid_entry(*pte))
5272 				continue;
5273 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5274 			    " (pte=%#" PRIxPADDR ")\n",
5275 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5276 		}
5277 	}
5278 	pmap_unmap_ptes(pmap, pmap2);
5279 	mutex_exit(&pmap->pm_lock);
5280 }
5281 #endif
5282 
5283 /*
5284  * pmap_update: process deferred invalidations and frees.
5285  */
5286 void
5287 pmap_update(struct pmap *pmap)
5288 {
5289 	struct pmap_page *pp;
5290 	struct vm_page *ptp;
5291 
5292 	/*
5293 	 * Initiate any pending TLB shootdowns.  Wait for them to
5294 	 * complete before returning control to the caller.
5295 	 */
5296 	kpreempt_disable();
5297 	pmap_tlb_shootnow();
5298 	kpreempt_enable();
5299 
5300 	/*
5301 	 * Now that shootdowns are complete, process deferred frees.  This
5302 	 * is an unlocked check, but is safe as we're only interested in
5303 	 * work done in this LWP - we won't get a false negative.
5304 	 */
5305 	if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) {
5306 		mutex_enter(&pmap->pm_lock);
5307 		while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5308 			KASSERT(ptp->wire_count == 0);
5309 			KASSERT(ptp->uanon == NULL);
5310 			LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5311 			pp = VM_PAGE_TO_PP(ptp);
5312 			LIST_INIT(&pp->pp_pvlist);
5313 			pp->pp_attrs = 0;
5314 			pp->pp_pte.pte_ptp = NULL;
5315 			pp->pp_pte.pte_va = 0;
5316 			PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5317 
5318 			/*
5319 			 * XXX Hack to avoid extra locking, and lock
5320 			 * assertions in uvm_pagefree().  Despite uobject
5321 			 * being set, this isn't a managed page.
5322 			 */
5323 			PMAP_DUMMY_LOCK(pmap);
5324 			uvm_pagerealloc(ptp, NULL, 0);
5325 			PMAP_DUMMY_UNLOCK(pmap);
5326 
5327 			/*
5328 			 * XXX for PTPs freed by pmap_remove_ptes() but not
5329 			 * pmap_zap_ptp(), we could mark them PG_ZERO.
5330 			 */
5331 			uvm_pagefree(ptp);
5332 		}
5333 		mutex_exit(&pmap->pm_lock);
5334 	}
5335 }
5336 
5337 #if PTP_LEVELS > 4
5338 #error "Unsupported number of page table mappings"
5339 #endif
5340 
5341 paddr_t
5342 pmap_init_tmp_pgtbl(paddr_t pg)
5343 {
5344 	static bool maps_loaded;
5345 	static const paddr_t x86_tmp_pml_paddr[] = {
5346 	    4 * PAGE_SIZE,	/* L1 */
5347 	    5 * PAGE_SIZE,	/* L2 */
5348 	    6 * PAGE_SIZE,	/* L3 */
5349 	    7 * PAGE_SIZE	/* L4 */
5350 	};
5351 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5352 
5353 	pd_entry_t *tmp_pml, *kernel_pml;
5354 
5355 	int level;
5356 
5357 	if (!maps_loaded) {
5358 		for (level = 0; level < PTP_LEVELS; ++level) {
5359 			x86_tmp_pml_vaddr[level] =
5360 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5361 			    UVM_KMF_VAONLY);
5362 
5363 			if (x86_tmp_pml_vaddr[level] == 0)
5364 				panic("mapping of real mode PML failed\n");
5365 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5366 			    x86_tmp_pml_paddr[level],
5367 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5368 		}
5369 		pmap_update(pmap_kernel());
5370 		maps_loaded = true;
5371 	}
5372 
5373 	/* Zero levels 1-3 */
5374 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5375 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5376 		memset(tmp_pml, 0, PAGE_SIZE);
5377 	}
5378 
5379 	/* Copy PML4 */
5380 	kernel_pml = pmap_kernel()->pm_pdir;
5381 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5382 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
5383 
5384 #ifdef PAE
5385 	/*
5386 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5387 	 * last entries are unlikely to be used for temporary mappings.
5388 	 * 508: maps 0->1GB (userland)
5389 	 * 509: unused
5390 	 * 510: unused
5391 	 * 511: maps 3->4GB (kernel)
5392 	 */
5393 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
5394 	tmp_pml[509] = 0;
5395 	tmp_pml[510] = 0;
5396 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
5397 #endif
5398 
5399 	for (level = PTP_LEVELS - 1; level > 0; --level) {
5400 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5401 
5402 		tmp_pml[pl_i(pg, level + 1)] =
5403 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
5404 	}
5405 
5406 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
5407 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
5408 
5409 #ifdef PAE
5410 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
5411 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
5412 #endif
5413 
5414 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
5415 }
5416 
5417 u_int
5418 x86_mmap_flags(paddr_t mdpgno)
5419 {
5420 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
5421 	u_int pflag = 0;
5422 
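	/*
	 * Only one flag is currently defined: pages the caller marks
	 * prefetchable are mapped write-combining.
	 */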
5423 	if (nflag & X86_MMAP_FLAG_PREFETCH)
5424 		pflag |= PMAP_WRITE_COMBINE;
5425 
5426 	return pflag;
5427 }
5428 
5429 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XEN)
5430 
5431 /*
5432  * -----------------------------------------------------------------------------
5433  * *****************************************************************************
5434  * *****************************************************************************
5435  * *****************************************************************************
5436  * *****************************************************************************
5437  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
5438  * *****************************************************************************
5439  * *****************************************************************************
5440  * *****************************************************************************
5441  * *****************************************************************************
5442  * -----------------------------------------------------------------------------
5443  *
5444  * These functions are invoked as callbacks from the code above. Contrary to
5445  * native, EPT does not have a recursive slot; therefore, it is not possible
5446  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
5447  * tree manually.
5448  *
5449  * Apart from that, the logic is mostly the same as native. Once a pmap has
5450  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
5451  * After that we're good, and the callbacks will handle the translations
5452  * for us.
5453  *
5454  * -----------------------------------------------------------------------------
5455  */
5456 
5457 /* Hardware bits. */
5458 #define EPT_R		__BIT(0)	/* read */
5459 #define EPT_W		__BIT(1)	/* write */
5460 #define EPT_X		__BIT(2)	/* execute */
5461 #define EPT_T		__BITS(5,3)	/* type */
5462 #define		TYPE_UC	0
5463 #define		TYPE_WC	1
5464 #define		TYPE_WT	4
5465 #define		TYPE_WP	5
5466 #define		TYPE_WB	6
5467 #define EPT_NOPAT	__BIT(6)
5468 #define EPT_L		__BIT(7)	/* large */
5469 #define EPT_A		__BIT(8)	/* accessed */
5470 #define EPT_D		__BIT(9)	/* dirty */
5471 /* Software bits. */
5472 #define EPT_PVLIST	__BIT(60)
5473 #define EPT_WIRED	__BIT(61)
5474 
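/*
 * EPT has no separate "present" bit; this pmap always grants read access
 * on valid mappings, so EPT_R doubles as the presence test below.
 */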
5475 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
5476 
5477 bool pmap_ept_has_ad __read_mostly;
5478 
5479 static inline void
5480 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
5481 {
5482 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
5483 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
5484 
5485 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5486 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5487 
5488 	pmap_stats_update(pmap, resid_diff, wired_diff);
5489 }
5490 
5491 static pt_entry_t
5492 pmap_ept_type(u_int flags)
5493 {
5494 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
5495 	pt_entry_t ret;
5496 
5497 	switch (cacheflags) {
5498 	case PMAP_NOCACHE:
5499 	case PMAP_NOCACHE_OVR:
5500 		ret = __SHIFTIN(TYPE_UC, EPT_T);
5501 		break;
5502 	case PMAP_WRITE_COMBINE:
5503 		ret = __SHIFTIN(TYPE_WC, EPT_T);
5504 		break;
5505 	case PMAP_WRITE_BACK:
5506 	default:
5507 		ret = __SHIFTIN(TYPE_WB, EPT_T);
5508 		break;
5509 	}
5510 
5511 	ret |= EPT_NOPAT;
5512 	return ret;
5513 }
5514 
5515 static inline pt_entry_t
5516 pmap_ept_prot(vm_prot_t prot)
5517 {
5518 	pt_entry_t res = 0;
5519 
5520 	if (prot & VM_PROT_READ)
5521 		res |= EPT_R;
5522 	if (prot & VM_PROT_WRITE)
5523 		res |= EPT_W;
5524 	if (prot & VM_PROT_EXECUTE)
5525 		res |= EPT_X;
5526 
5527 	return res;
5528 }
5529 
5530 static inline uint8_t
5531 pmap_ept_to_pp_attrs(pt_entry_t ept)
5532 {
5533 	uint8_t ret = 0;
5534 	if (pmap_ept_has_ad) {
5535 		if (ept & EPT_D)
5536 			ret |= PP_ATTRS_D;
5537 		if (ept & EPT_A)
5538 			ret |= PP_ATTRS_A;
5539 	} else {
5540 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
5541 	}
5542 	if (ept & EPT_W)
5543 		ret |= PP_ATTRS_W;
5544 	return ret;
5545 }
5546 
5547 static inline pt_entry_t
5548 pmap_pp_attrs_to_ept(uint8_t attrs)
5549 {
5550 	pt_entry_t ept = 0;
5551 	if (attrs & PP_ATTRS_D)
5552 		ept |= EPT_D;
5553 	if (attrs & PP_ATTRS_A)
5554 		ept |= EPT_A;
5555 	if (attrs & PP_ATTRS_W)
5556 		ept |= EPT_W;
5557 	return ept;
5558 }
5559 
5560 /*
5561  * Helper for pmap_ept_free_ptp.
5562  * tree[0] = &L2[L2idx]
5563  * tree[1] = &L3[L3idx]
5564  * tree[2] = &L4[L4idx]
5565  */
5566 static void
5567 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
5568 {
5569 	pt_entry_t *pteva;
5570 	paddr_t ptepa;
5571 	int i, index;
5572 
5573 	ptepa = pmap->pm_pdirpa[0];
5574 	for (i = PTP_LEVELS; i > 1; i--) {
5575 		index = pl_pi(va, i);
5576 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5577 		KASSERT(pmap_ept_valid_entry(pteva[index]));
5578 		tree[i - 2] = &pteva[index];
5579 		ptepa = pmap_pte2pa(pteva[index]);
5580 	}
5581 }
5582 
5583 static void
5584 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
5585 {
5586 	pd_entry_t *tree[3];
5587 	int level;
5588 
5589 	KASSERT(pmap != pmap_kernel());
5590 	KASSERT(mutex_owned(&pmap->pm_lock));
5591 	KASSERT(kpreempt_disabled());
5592 
5593 	pmap_ept_get_tree(pmap, va, tree);
5594 
5595 	level = 1;
5596 	do {
5597 		(void)pmap_pte_testset(tree[level - 1], 0);
5598 
5599 		pmap_freepage(pmap, ptp, level);
5600 		if (level < PTP_LEVELS - 1) {
5601 			ptp = pmap_find_ptp(pmap, va, level + 1);
5602 			ptp->wire_count--;
5603 			if (ptp->wire_count > 1)
5604 				break;
5605 		}
5606 	} while (++level < PTP_LEVELS);
5607 	pmap_pte_flush();
5608 }
5609 
5610 /* Allocate L4->L3->L2. Return L2. */
5611 static void
5612 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
5613 {
5614 	struct vm_page *ptp;
5615 	unsigned long index;
5616 	pd_entry_t *pteva;
5617 	paddr_t ptepa;
5618 	int i;
5619 
5620 	KASSERT(pmap != pmap_kernel());
5621 	KASSERT(mutex_owned(&pmap->pm_lock));
5622 	KASSERT(kpreempt_disabled());
5623 
5624 	/*
5625 	 * Now that we have all the pages looked up or allocated,
5626 	 * loop through again installing any new ones into the tree.
5627 	 */
5628 	ptepa = pmap->pm_pdirpa[0];
5629 	for (i = PTP_LEVELS; i > 1; i--) {
5630 		index = pl_pi(va, i);
5631 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5632 
5633 		if (pmap_ept_valid_entry(pteva[index])) {
5634 			KASSERT(!pt->alloced[i]);
5635 			ptepa = pmap_pte2pa(pteva[index]);
5636 			continue;
5637 		}
5638 
5639 		ptp = pt->pg[i];
5640 		ptp->flags &= ~PG_BUSY; /* never busy */
5641 		ptp->wire_count = 1;
5642 		pmap->pm_ptphint[i - 2] = ptp;
5643 		ptepa = VM_PAGE_TO_PHYS(ptp);
5644 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
5645 
5646 		pmap_pte_flush();
5647 		pmap_stats_update(pmap, 1, 0);
5648 
5649 		/*
5650 		 * If we're not in the top level, increase the
5651 		 * wire count of the parent page.
5652 		 */
5653 		if (i < PTP_LEVELS) {
5654 			pt->pg[i + 1]->wire_count++;
5655 		}
5656 	}
5657 }
5658 
5659 static int
5660 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
5661     u_int flags)
5662 {
5663 	pt_entry_t *ptes, opte, npte;
5664 	pt_entry_t *ptep;
5665 	struct vm_page *ptp;
5666 	struct vm_page *new_pg, *old_pg;
5667 	struct pmap_page *new_pp, *old_pp;
5668 	struct pv_entry *old_pve, *new_pve;
5669 	bool wired = (flags & PMAP_WIRED) != 0;
5670 	bool accessed;
5671 	struct pmap_ptparray pt;
5672 	int error;
5673 	bool getptp, samepage, new_embedded;
5674 	rb_tree_t *tree;
5675 
5676 	KASSERT(pmap_initialized);
5677 	KASSERT(va < VM_MAXUSER_ADDRESS);
5678 
5679 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
5680 
5681 	if (wired)
5682 		npte |= EPT_WIRED;
5683 	if (flags & VM_PROT_ALL) {
5684 		npte |= EPT_A;
5685 		if (flags & VM_PROT_WRITE) {
5686 			KASSERT((npte & EPT_W) != 0);
5687 			npte |= EPT_D;
5688 		}
5689 	}
5690 
5691 	new_pg = PHYS_TO_VM_PAGE(pa);
5692 	if (new_pg != NULL) {
5693 		/* This is a managed page */
5694 		npte |= EPT_PVLIST;
5695 		new_pp = VM_PAGE_TO_PP(new_pg);
5696 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5697 		/* This is an unmanaged pv-tracked page */
5698 		npte |= EPT_PVLIST;
5699 	} else {
5700 		new_pp = NULL;
5701 	}
5702 
5703 	/* Begin by locking the pmap. */
5704 	mutex_enter(&pmap->pm_lock);
5705 
5706 	/* Look up the PTP.  Allocate if none present. */
5707 	ptp = NULL;
5708 	getptp = false;
5709 	if (pmap != pmap_kernel()) {
5710 		ptp = pmap_find_ptp(pmap, va, 1);
5711 		if (ptp == NULL) {
5712 			getptp = true;
5713 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5714 			if (error != 0) {
5715 				if (flags & PMAP_CANFAIL) {
5716 					mutex_exit(&pmap->pm_lock);
5717 					return error;
5718 				}
5719 				panic("%s: get ptp failed, error=%d", __func__,
5720 				    error);
5721 			}
5722 		}
5723 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5724 	} else {
5725 		/* Embedded PV entries rely on this. */
5726 		KASSERT(va != 0);
5727 		tree = &pmap_kernel_rb;
5728 	}
5729 
5730 	/*
5731 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5732 	 * entry if required for the new mapping.  Temporarily track the old
5733 	 * and new mappings concurrently.  Only after the old mapping is
5734 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5735 	 * our picture of modified/accessed state for either page could get
5736 	 * out of sync (we need any P->V operation for either page to stall
5737 	 * on pmap->pm_lock until done here).
5738 	 */
5739 	new_pve = NULL;
5740 	old_pve = NULL;
5741 	samepage = false;
5742 	new_embedded = false;
5743 
5744 	if (new_pp != NULL) {
5745 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5746 		    &old_pve, &samepage, &new_embedded, tree);
5747 
5748 		/*
5749 		 * If a new pv_entry was needed and none was available, we
5750 		 * can go no further.
5751 		 */
5752 		if (error != 0) {
5753 			if (flags & PMAP_CANFAIL) {
5754 				if (getptp) {
5755 					pmap_unget_ptp(pmap, &pt);
5756 				}
5757 				mutex_exit(&pmap->pm_lock);
5758 				return error;
5759 			}
5760 			panic("%s: alloc pve failed", __func__);
5761 		}
5762 	} else {
5763 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5764 	}
5765 
5766 	/* Map PTEs into address space. */
5767 	kpreempt_disable();
5768 
5769 	/* Install any newly allocated PTPs. */
5770 	if (getptp) {
5771 		pmap_ept_install_ptp(pmap, &pt, va);
5772 	}
5773 
5774 	/* Check if there is an existing mapping. */
5775 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
5776 	ptep = &ptes[pl1_pi(va)];
5777 	opte = *ptep;
5778 	bool have_oldpa = pmap_ept_valid_entry(opte);
5779 	paddr_t oldpa = pmap_pte2pa(opte);
5780 
5781 	/*
5782 	 * Update the pte.
5783 	 */
5784 	do {
5785 		opte = *ptep;
5786 
5787 		/*
5788 		 * if the same page, inherit EPT_A and EPT_D.
5789 		 */
5790 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
5791 			npte |= opte & (EPT_A | EPT_D);
5792 		}
5793 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
5794 
5795 	/*
5796 	 * Done with the PTEs: they can now be unmapped.
5797 	 */
5798 	kpreempt_enable();
5799 
5800 	/*
5801 	 * Update statistics and PTP's reference count.
5802 	 */
5803 	pmap_ept_stats_update_bypte(pmap, npte, opte);
5804 	if (ptp != NULL) {
5805 		if (!have_oldpa) {
5806 			ptp->wire_count++;
5807 		}
5808 		/* Remember minimum VA in PTP. */
5809 		pmap_ptp_range_set(ptp, va);
5810 	}
5811 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5812 
5813 	/*
5814 	 * If the same page, we can skip pv_entry handling.
5815 	 */
5816 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
5817 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
5818 		if ((npte & EPT_PVLIST) != 0) {
5819 			KASSERT(samepage);
5820 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5821 		}
5822 		goto same_pa;
5823 	} else if ((npte & EPT_PVLIST) != 0) {
5824 		KASSERT(!samepage);
5825 	}
5826 
5827 	/*
5828 	 * If old page is pv-tracked, remove pv_entry from its list.
5829 	 */
5830 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
5831 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5832 			old_pp = VM_PAGE_TO_PP(old_pg);
5833 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5834 			panic("%s: EPT_PVLIST with pv-untracked page"
5835 			    " va = %#"PRIxVADDR
5836 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5837 			    __func__, va, oldpa, atop(oldpa));
5838 		}
5839 
5840 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5841 		    pmap_ept_to_pp_attrs(opte));
5842 		if (old_pve != NULL) {
5843 			if (pmap->pm_pve == NULL) {
5844 				pmap->pm_pve = old_pve;
5845 			} else {
5846 				pool_cache_put(&pmap_pv_cache, old_pve);
5847 			}
5848 		}
5849 	} else {
5850 		KASSERT(old_pve == NULL);
5851 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5852 	}
5853 
5854 	/*
5855 	 * If new page is dynamically PV tracked, insert to tree.
5856 	 */
5857 	if (new_pve != NULL) {
5858 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5859 		old_pve = rb_tree_insert_node(tree, new_pve);
5860 		KASSERT(old_pve == new_pve);
5861 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5862 	}
5863 
5864 same_pa:
5865 	/*
5866 	 * shootdown tlb if necessary.
5867 	 */
5868 
5869 	if (pmap_ept_has_ad) {
5870 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
5871 	} else {
5872 		accessed = (opte & EPT_R) != 0;
5873 	}
5874 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
5875 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
5876 	}
5877 	mutex_exit(&pmap->pm_lock);
5878 	return 0;
5879 }
5880 
5881 /* Pay close attention: returns 0 and stores the L2 entry (not L1) in *lastpde if all PDEs are valid, otherwise returns the level of the first invalid PDE. */
5882 static int
5883 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
5884 {
5885 	pt_entry_t *pteva;
5886 	paddr_t ptepa;
5887 	int i, index;
5888 
5889 	KASSERT(mutex_owned(&pmap->pm_lock));
5890 
5891 	ptepa = pmap->pm_pdirpa[0];
5892 	for (i = PTP_LEVELS; i > 1; i--) {
5893 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
5894 		index = pl_pi(va, i);
5895 		if (!pmap_ept_valid_entry(pteva[index]))
5896 			return i;
5897 		ptepa = pmap_pte2pa(pteva[index]);
5898 	}
5899 	if (lastpde != NULL) {
5900 		*lastpde = pteva[index];
5901 	}
5902 
5903 	return 0;
5904 }
5905 
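/*
 * pmap_ept_extract: extract the PA backing va, if any.  Returns true and
 * stores the PA in *pap on success.
 */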
5906 static bool
5907 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
5908 {
5909 	pt_entry_t *ptes, pte;
5910 	pd_entry_t pde;
5911 	paddr_t ptppa, pa;
5912 	bool rv;
5913 
5914 #ifdef __HAVE_DIRECT_MAP
5915 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
5916 		if (pap != NULL) {
5917 			*pap = PMAP_DIRECT_UNMAP(va);
5918 		}
5919 		return true;
5920 	}
5921 #endif
5922 
5923 	rv = false;
5924 	pa = 0;
5925 
5926 	mutex_enter(&pmap->pm_lock);
5927 	kpreempt_disable();
5928 
5929 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
5930 		ptppa = pmap_pte2pa(pde);
5931 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
5932 		pte = ptes[pl1_pi(va)];
5933 		if (__predict_true((pte & EPT_R) != 0)) {
5934 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
5935 			rv = true;
5936 		}
5937 	}
5938 
5939 	kpreempt_enable();
5940 	mutex_exit(&pmap->pm_lock);
5941 
5942 	if (pap != NULL) {
5943 		*pap = pa;
5944 	}
5945 	return rv;
5946 }
5947 
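/*
 * pmap_ept_remove_pte: remove a single PTE from a PTP.
 *
 * => caller must hold pmap's lock and have preemption disabled
 * => returns true if a mapping was removed
 */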
5948 static bool
5949 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
5950     vaddr_t va, struct pv_entry **pv_tofree)
5951 {
5952 	struct pv_entry *pve;
5953 	struct vm_page *pg;
5954 	struct pmap_page *pp;
5955 	pt_entry_t opte;
5956 	bool accessed;
5957 
5958 	KASSERT(pmap != pmap_kernel());
5959 	KASSERT(mutex_owned(&pmap->pm_lock));
5960 	KASSERT(kpreempt_disabled());
5961 
5962 	if (!pmap_ept_valid_entry(*pte)) {
5963 		/* VA not mapped. */
5964 		return false;
5965 	}
5966 
5967 	/* Atomically save the old PTE and zap it. */
5968 	opte = pmap_pte_testset(pte, 0);
5969 	if (!pmap_ept_valid_entry(opte)) {
5970 		return false;
5971 	}
5972 
5973 	pmap_ept_stats_update_bypte(pmap, 0, opte);
5974 
5975 	if (ptp) {
5976 		/*
5977 		 * Dropping a PTE.  Make sure that the PDE is flushed.
5978 		 */
5979 		ptp->wire_count--;
5980 		if (ptp->wire_count <= 1) {
5981 			opte |= EPT_A;
5982 		}
5983 	}
5984 
5985 	if (pmap_ept_has_ad) {
5986 		accessed = (opte & EPT_A) != 0;
5987 	} else {
5988 		accessed = true;
5989 	}
5990 	if (accessed) {
5991 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
5992 	}
5993 
5994 	/*
5995 	 * If we are not on a pv list - we are done.
5996 	 */
5997 	if ((opte & EPT_PVLIST) == 0) {
5998 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
5999 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6000 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6001 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6002 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6003 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6004 		return true;
6005 	}
6006 
6007 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6008 		pp = VM_PAGE_TO_PP(pg);
6009 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6010 		paddr_t pa = pmap_pte2pa(opte);
6011 		panic("%s: EPT_PVLIST with pv-untracked page"
6012 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6013 		    __func__, va, pa, atop(pa));
6014 	}
6015 
6016 	/* Sync R/M bits. */
6017 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6018 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6019 
6020 	if (pve) {
6021 		pve->pve_next = *pv_tofree;
6022 		*pv_tofree = pve;
6023 	}
6024 	return true;
6025 }
6026 
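/*
 * pmap_ept_remove_ptes: remove the PTEs covering [startva, endva) from the
 * given PTP.
 *
 * => caller must hold pmap's lock and have preemption disabled
 */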
6027 static void
6028 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6029     vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
6030 {
6031 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6032 
6033 	KASSERT(pmap != pmap_kernel());
6034 	KASSERT(mutex_owned(&pmap->pm_lock));
6035 	KASSERT(kpreempt_disabled());
6036 
6037 	/*
6038 	 * mappings are very often sparse, so clip the given range to the
6039 	 * range of PTEs that are known present in the PTP.
6040 	 */
6041 	pmap_ptp_range_clip(ptp, &startva, &pte);
6042 
6043 	/*
6044 	 * note that ptpva points to the PTE that maps startva.   this may
6045 	 * or may not be the first PTE in the PTP.
6046 	 *
6047 	 * we loop through the PTP while there are still PTEs to look at
6048 	 * and the wire_count is greater than 1 (because we use the wire_count
6049 	 * to keep track of the number of real PTEs in the PTP).
6050 	 */
6051 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6052 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva, pv_tofree);
6053 		startva += PAGE_SIZE;
6054 		pte++;
6055 	}
6056 }
6057 
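/*
 * pmap_ept_remove: remove all mappings in the range [sva, eva).
 */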
6058 static void
6059 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6060 {
6061 	struct pv_entry *pv_tofree = NULL;
6062 	pt_entry_t *ptes;
6063 	pd_entry_t pde;
6064 	paddr_t ptppa;
6065 	vaddr_t blkendva, va = sva;
6066 	struct vm_page *ptp;
6067 
6068 	mutex_enter(&pmap->pm_lock);
6069 	kpreempt_disable();
6070 
6071 	for (/* null */ ; va < eva ; va = blkendva) {
6072 		int lvl;
6073 
6074 		/* determine range of block */
6075 		blkendva = x86_round_pdr(va+1);
6076 		if (blkendva > eva)
6077 			blkendva = eva;
6078 
6079 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6080 		if (lvl != 0) {
6081 			/* Skip a range corresponding to an invalid pde. */
6082 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6083 			continue;
6084 		}
6085 
6086 		/* PA of the PTP */
6087 		ptppa = pmap_pte2pa(pde);
6088 
6089 		ptp = pmap_find_ptp(pmap, va, 1);
6090 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6091 		    __func__);
6092 
6093 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6094 
6095 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6096 		    blkendva, &pv_tofree);
6097 
6098 		/* If PTP is no longer being used, free it. */
6099 		if (ptp && ptp->wire_count <= 1) {
6100 			pmap_ept_free_ptp(pmap, ptp, va);
6101 		}
6102 	}
6103 
6104 	kpreempt_enable();
6105 	if (pv_tofree != NULL) {
6106 		pmap_free_pvs(pmap, pv_tofree);
6107 	}
6108 	mutex_exit(&pmap->pm_lock);
6109 }
6110 
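/*
 * pmap_ept_sync_pv: clear the requested bits in a PTE and return the old
 * attributes via *oattrs (and the old PTE via *optep, if not NULL).
 *
 * => returns EAGAIN if we raced with a V->P operation such as
 *    pmap_remove() and the PTE no longer maps the expected page
 */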
6111 static int
6112 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6113     uint8_t *oattrs, pt_entry_t *optep)
6114 {
6115 	struct pmap *pmap;
6116 	pt_entry_t *ptep;
6117 	pt_entry_t opte;
6118 	pt_entry_t npte;
6119 	pt_entry_t expect;
6120 	bool need_shootdown;
6121 
6122 	expect = pmap_pa2pte(pa) | EPT_R;
6123 	pmap = ptp_to_pmap(ptp);
6124 
6125 	if (clearbits != ~0) {
6126 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6127 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6128 	}
6129 
6130 	ptep = pmap_map_pte(pmap, ptp, va);
6131 	do {
6132 		opte = *ptep;
6133 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6134 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6135 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6136 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6137 			/*
6138 			 * We lost a race with a V->P operation like
6139 			 * pmap_remove().  Wait for the competitor
6140 			 * reflecting pte bits into mp_attrs.
6141 			 */
6142 			pmap_unmap_pte();
6143 			return EAGAIN;
6144 		}
6145 
6146 		/*
6147 		 * Check if there's anything to do on this PTE.
6148 		 */
6149 		if ((opte & clearbits) == 0) {
6150 			need_shootdown = false;
6151 			break;
6152 		}
6153 
6154 		/*
6155 		 * We need a shootdown if the PTE is cached (EPT_A) ...
6156 		 * ... Unless we are clearing only the EPT_W bit and
6157 		 * it isn't cached as RW (EPT_D).
6158 		 */
6159 		if (pmap_ept_has_ad) {
6160 			need_shootdown = (opte & EPT_A) != 0 &&
6161 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6162 		} else {
6163 			need_shootdown = true;
6164 		}
6165 
6166 		npte = opte & ~clearbits;
6167 
6168 		/*
6169 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6170 		 */
6171 		if (need_shootdown) {
6172 			npte &= ~(EPT_A | EPT_D);
6173 		}
6174 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6175 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6176 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6177 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6178 
6179 	if (need_shootdown) {
6180 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6181 	}
6182 	pmap_unmap_pte();
6183 
6184 	*oattrs = pmap_ept_to_pp_attrs(opte);
6185 	if (optep != NULL)
6186 		*optep = opte;
6187 	return 0;
6188 }
6189 
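/*
 * pmap_ept_pp_remove_ent: bookkeeping after a PTE was zapped by a P->V
 * operation: update stats and free the PTP if it has become empty.
 */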
6190 static void
6191 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6192     vaddr_t va)
6193 {
6194 
6195 	KASSERT(mutex_owned(&pmap->pm_lock));
6196 
6197 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6198 	ptp->wire_count--;
6199 	if (ptp->wire_count <= 1) {
6200 		pmap_ept_free_ptp(pmap, ptp, va);
6201 	}
6202 }
6203 
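/*
 * pmap_ept_write_protect: restrict access to the range [sva, eva); in
 * practice this only removes write permission.
 */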
6204 static void
6205 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6206 {
6207 	pt_entry_t bit_rem;
6208 	pt_entry_t *ptes, *spte;
6209 	pt_entry_t opte, npte;
6210 	pd_entry_t pde;
6211 	paddr_t ptppa;
6212 	vaddr_t va;
6213 	bool modified;
6214 
6215 	bit_rem = 0;
6216 	if (!(prot & VM_PROT_WRITE))
6217 		bit_rem = EPT_W;
6218 
6219 	sva &= PTE_FRAME;
6220 	eva &= PTE_FRAME;
6221 
6222 	/* Acquire pmap. */
6223 	mutex_enter(&pmap->pm_lock);
6224 	kpreempt_disable();
6225 
6226 	for (va = sva; va < eva; va += PAGE_SIZE) {
6227 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6228 			continue;
6229 		}
6230 
6231 		ptppa = pmap_pte2pa(pde);
6232 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6233 		spte = &ptes[pl1_pi(va)];
6234 
6235 		do {
6236 			opte = *spte;
6237 			if (!pmap_ept_valid_entry(opte)) {
6238 				goto next;
6239 			}
6240 			npte = (opte & ~bit_rem);
6241 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6242 
6243 		if (pmap_ept_has_ad) {
6244 			modified = (opte & EPT_D) != 0;
6245 		} else {
6246 			modified = true;
6247 		}
6248 		if (modified) {
6250 			pmap_tlb_shootdown(pmap, va, 0,
6251 			    TLBSHOOT_WRITE_PROTECT);
6252 		}
6253 next:;
6254 	}
6255 
6256 	kpreempt_enable();
6257 	mutex_exit(&pmap->pm_lock);
6258 }
6259 
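/*
 * pmap_ept_unwire: clear the wired bit in the PTE.
 *
 * => the mapping must already be present
 */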
6260 static void
6261 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6262 {
6263 	pt_entry_t *ptes, *ptep, opte;
6264 	pd_entry_t pde;
6265 	paddr_t ptppa;
6266 
6267 	/* Acquire pmap. */
6268 	mutex_enter(&pmap->pm_lock);
6269 	kpreempt_disable();
6270 
6271 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6272 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6273 	}
6274 
6275 	ptppa = pmap_pte2pa(pde);
6276 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6277 	ptep = &ptes[pl1_pi(va)];
6278 	opte = *ptep;
6279 	KASSERT(pmap_ept_valid_entry(opte));
6280 
6281 	if (opte & EPT_WIRED) {
6282 		pt_entry_t npte = opte & ~EPT_WIRED;
6283 
6284 		opte = pmap_pte_testset(ptep, npte);
6285 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6286 	} else {
6287 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6288 		    "did not change!\n", __func__, pmap, va);
6289 	}
6290 
6291 	/* Release pmap. */
6292 	kpreempt_enable();
6293 	mutex_exit(&pmap->pm_lock);
6294 }
6295 
6296 /* -------------------------------------------------------------------------- */
6297 
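/*
 * pmap_ept_transform: switch the pmap's methods over to the EPT variants
 * and clear its top-level page directory.
 */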
6298 void
6299 pmap_ept_transform(struct pmap *pmap)
6300 {
6301 	pmap->pm_enter = pmap_ept_enter;
6302 	pmap->pm_extract = pmap_ept_extract;
6303 	pmap->pm_remove = pmap_ept_remove;
6304 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6305 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6306 	pmap->pm_write_protect = pmap_ept_write_protect;
6307 	pmap->pm_unwire = pmap_ept_unwire;
6308 
6309 	memset(pmap->pm_pdir, 0, PAGE_SIZE);
6310 }
6311 
6312 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XEN */
6313