xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision dd3ee07da436799d8de85f3055253118b76bf345)
1 /*	$NetBSD: pmap.c,v 1.415 2022/05/13 09:39:40 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.415 2022/05/13 09:39:40 riastradh Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 
142 #define	__MUTEX_PRIVATE	/* for assertions */
143 
144 #include <sys/param.h>
145 #include <sys/systm.h>
146 #include <sys/proc.h>
147 #include <sys/pool.h>
148 #include <sys/kernel.h>
149 #include <sys/atomic.h>
150 #include <sys/cpu.h>
151 #include <sys/intr.h>
152 #include <sys/xcall.h>
153 #include <sys/kcore.h>
154 #include <sys/kmem.h>
155 #include <sys/asan.h>
156 #include <sys/msan.h>
157 #include <sys/entropy.h>
158 
159 #include <uvm/uvm.h>
160 #include <uvm/pmap/pmap_pvt.h>
161 
162 #include <dev/isa/isareg.h>
163 
164 #include <machine/specialreg.h>
165 #include <machine/gdt.h>
166 #include <machine/isa_machdep.h>
167 #include <machine/cpuvar.h>
168 #include <machine/cputypes.h>
169 
170 #include <x86/pmap_pv.h>
171 
172 #include <x86/i82489reg.h>
173 #include <x86/i82489var.h>
174 
175 #ifdef XEN
176 #include <xen/include/public/xen.h>
177 #include <xen/hypervisor.h>
178 #include <xen/xenpmap.h>
179 #endif
180 
181 #ifdef __HAVE_DIRECT_MAP
182 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
183 #endif
184 
185 /*
186  * general info:
187  *
188  *  - for an explanation of how the x86 MMU hardware works see
189  *    the comments in <machine/pte.h>.
190  *
191  *  - for an explanation of the general memory structure used by
192  *    this pmap (including the recursive mapping), see the comments
193  *    in <machine/pmap.h>.
194  *
195  * this file contains the code for the "pmap module."   the module's
196  * job is to manage the hardware's virtual to physical address mappings.
197  * note that there are two levels of mapping in the VM system:
198  *
199  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
200  *      to map ranges of virtual address space to objects/files.  for
201  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
202  *      to the file /bin/ls starting at offset zero."   note that
203  *      the upper layer mapping is not concerned with how individual
204  *      vm_pages are mapped.
205  *
206  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
207  *      from virtual addresses.   it is concerned with which vm_page is
208  *      mapped where.   for example, when you run /bin/ls and start
209  *      at page 0x1000 the fault routine may lookup the correct page
210  *      of the /bin/ls file and then ask the pmap layer to establish
211  *      a mapping for it.
212  *
213  * note that information in the lower layer of the VM system can be
214  * thrown away since it can easily be reconstructed from the info
215  * in the upper layer.
216  *
217  * data structures we use include:
218  *
219  *  - struct pmap: describes the address space of one thread
220  *  - struct pmap_page: describes one pv-tracked page, without
221  *    necessarily a corresponding vm_page
222  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
223  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
224  *    physical memory.   the pp_pvlist points to a list of pv_entry
225  *    structures which describe all the <PMAP,VA> pairs that this
226  *    page is mapped in.    this is critical for page based operations
227  *    such as pmap_page_protect() [change protection on _all_ mappings
228  *    of a page]
229  */
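
/*
 * as a rough illustration of the layering described above, the upper
 * layer (e.g. the fault handler) drives this module with calls along the
 * lines of the following sketch, where "map", "pg" and "va" are just
 * placeholders for whatever the caller happens to be working on:
 *
 *	error = pmap_enter(vm_map_pmap(map), va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ, PMAP_CANFAIL);
 *	...
 *	pmap_remove(vm_map_pmap(map), va, va + PAGE_SIZE);
 *	pmap_update(vm_map_pmap(map));
 *
 * pmap_update() pushes out any TLB work deferred by the calls before it.
 * note that the pmap only records VA => PA translations: which vm_page
 * backs a VA, and what to do on a fault, is decided in the upper layer.
 */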
230 
231 /*
232  * Locking
233  *
234  * We have the following locks that we must deal with, listed in the order
235  * that they are acquired:
236  *
237  * pg->uobject->vmobjlock, pg->uanon->an_lock
238  *
239  *	For managed pages, these per-object locks are taken by the VM system
240  *	before calling into the pmap module - either a read or write hold.
241  *	The lock hold prevents pages from changing identity while the pmap is
242  *	operating on them.  For example, the same lock is held across a call
243  *	to pmap_remove() and the following call to pmap_update(), so that a
244  *	page does not gain a new identity while its TLB visibility is stale.
245  *
246  * pmap->pm_lock
247  *
248  *	This lock protects the fields in the pmap structure including the
249  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
250  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
251  *	kernel PDEs are never freed, and the kernel is expected to be self
252  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
253  *	because they can be modified from interrupt context).
254  *
255  * pmaps_lock
256  *
257  *	This lock protects the list of active pmaps (headed by "pmaps").
258  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
259  *
260  * pp_lock
261  *
262  *	This per-page lock protects PV entry lists and the embedded PV entry
263  *	in each vm_page, allowing for concurrent operation on pages by
264  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
265  *	points it is taken context switching is usually not tolerable, and
266  *	spin mutexes must block out interrupts that could take kernel_lock.
267  */
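
/*
 * A simplified sketch of how these locks nest in a typical page-based
 * operation, following the order documented above (illustrative only;
 * the real routines below have more details to worry about, and "pp" is
 * the pmap_page for the vm_page being operated on):
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);	(taken by UVM)
 *	mutex_enter(&pmap->pm_lock);
 *	mutex_spin_enter(&pp->pp_lock);
 *	... walk pp->pp_pvlist and update the PTEs ...
 *	mutex_spin_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);		(dropped by UVM)
 */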
268 
269 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
270 #ifdef DIAGNOSTIC
271 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
272 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
273 #else
274 #define	PMAP_DUMMY_LOCK(pm)
275 #define	PMAP_DUMMY_UNLOCK(pm)
276 #endif
277 
278 static const struct uvm_pagerops pmap_pager = {
279 	/* nothing */
280 };
281 
282 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
283 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
284 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
285 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
286 const long nbpd[] = NBPD_INITIALIZER;
287 #ifdef i386
288 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
289 #else
290 pd_entry_t *normal_pdes[3];
291 #endif
292 
293 long nkptp[] = NKPTP_INITIALIZER;
294 
295 struct pmap_head pmaps;
296 kmutex_t pmaps_lock __cacheline_aligned;
297 
298 struct pcpu_area *pcpuarea __read_mostly;
299 
300 static vaddr_t pmap_maxkvaddr;
301 
302 /*
303  * Misc. event counters.
304  */
305 struct evcnt pmap_iobmp_evcnt;
306 struct evcnt pmap_ldt_evcnt;
307 
308 /*
309  * PAT
310  */
311 static bool cpu_pat_enabled __read_mostly = false;
312 
313 /*
314  * Global data structures
315  */
316 
317 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
318 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
319 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
320 
321 struct bootspace bootspace __read_mostly;
322 struct slotspace slotspace __read_mostly;
323 
324 /* Set to PTE_NX if supported. */
325 pd_entry_t pmap_pg_nx __read_mostly = 0;
326 
327 /* Set to PTE_G if supported. */
328 pd_entry_t pmap_pg_g __read_mostly = 0;
329 
330 /* Set to true if large pages are supported. */
331 int pmap_largepages __read_mostly = 0;
332 
333 paddr_t lowmem_rsvd __read_mostly;
334 paddr_t avail_start __read_mostly; /* PA of first available physical page */
335 paddr_t avail_end __read_mostly; /* PA of last available physical page */
336 
337 #ifdef XENPV
338 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
339 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
340 #endif
341 
342 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
343 #define	PMAP_CHECK_PP(pp) \
344     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
345 
346 #define PAGE_ALIGNED(pp)	\
347 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
348 
349 /*
350  * Other data structures
351  */
352 
353 static pt_entry_t protection_codes[8] __read_mostly;
354 
355 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
356 
357 /*
358  * The following two vaddr_t's are used during system startup to keep track of
359  * how much of the kernel's VM space we have used. Once the system is started,
360  * the management of the remaining kernel VM space is turned over to the
361  * kernel_map vm_map.
362  */
363 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
364 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
365 
366 #ifndef XENPV
367 /*
368  * LAPIC virtual address, and fake physical address.
369  */
370 volatile vaddr_t local_apic_va __read_mostly;
371 paddr_t local_apic_pa __read_mostly;
372 #endif
373 
374 /*
375  * pool that pmap structures are allocated from
376  */
377 struct pool_cache pmap_cache;
378 static int  pmap_ctor(void *, void *, int);
379 static void pmap_dtor(void *, void *);
380 
381 /*
382  * pv_page cache
383  */
384 static struct pool_cache pmap_pvp_cache;
385 
386 #ifdef __HAVE_DIRECT_MAP
387 vaddr_t pmap_direct_base __read_mostly;
388 vaddr_t pmap_direct_end __read_mostly;
389 #endif
390 
391 #ifndef __HAVE_DIRECT_MAP
392 /*
393  * Special VAs and the PTEs that map them
394  */
395 static pt_entry_t *early_zero_pte;
396 static void pmap_vpage_cpualloc(struct cpu_info *);
397 #ifdef XENPV
398 char *early_zerop; /* also referenced from xen_locore() */
399 #else
400 static char *early_zerop;
401 #endif
402 #endif
403 
404 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
405 
406 /* PDP pool and its callbacks */
407 static struct pool pmap_pdp_pool;
408 static void pmap_pdp_init(pd_entry_t *);
409 static void pmap_pdp_fini(pd_entry_t *);
410 
411 #ifdef PAE
412 /* need to allocate items of 4 pages */
413 static void *pmap_pdp_alloc(struct pool *, int);
414 static void pmap_pdp_free(struct pool *, void *);
415 static struct pool_allocator pmap_pdp_allocator = {
416 	.pa_alloc = pmap_pdp_alloc,
417 	.pa_free = pmap_pdp_free,
418 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
419 };
420 #endif
421 
422 extern vaddr_t idt_vaddr;
423 extern paddr_t idt_paddr;
424 extern vaddr_t gdt_vaddr;
425 extern paddr_t gdt_paddr;
426 extern vaddr_t ldt_vaddr;
427 extern paddr_t ldt_paddr;
428 
429 #ifdef i386
430 /* stuff to fix the pentium f00f bug */
431 extern vaddr_t pentium_idt_vaddr;
432 #endif
433 
434 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
435 struct pmap_ptparray {
436 	struct vm_page *pg[PTP_LEVELS + 1];
437 	bool alloced[PTP_LEVELS + 1];
438 };
439 
440 /*
441  * PV entries are allocated in page-sized chunks and cached per-pmap to
442  * avoid intense pressure on memory allocators.
443  */
444 
445 struct pv_page {
446 	LIST_HEAD(, pv_entry)	pvp_pves;
447 	LIST_ENTRY(pv_page)	pvp_list;
448 	long			pvp_nfree;
449 	struct pmap		*pvp_pmap;
450 };
451 
452 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
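
/*
 * For example, with 4KB pages and an (illustrative) 64-byte pv_entry,
 * PVE_PER_PVP works out to 4096 / 64 - 1 = 63: one entry's worth of
 * space is given up to the struct pv_page header kept in the same page
 * as the entries it describes.
 */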
453 
454 /*
455  * PV tree prototypes
456  */
457 
458 static int	pmap_compare_key(void *, const void *, const void *);
459 static int	pmap_compare_nodes(void *, const void *, const void *);
460 
461 /* Red-black tree */
462 static const rb_tree_ops_t pmap_rbtree_ops = {
463 	.rbto_compare_nodes = pmap_compare_nodes,
464 	.rbto_compare_key = pmap_compare_key,
465 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
466 	.rbto_context = NULL
467 };
468 
469 /*
470  * Local prototypes
471  */
472 
473 #ifdef __HAVE_PCPU_AREA
474 static void pmap_init_pcpu(void);
475 #endif
476 #ifdef __HAVE_DIRECT_MAP
477 static void pmap_init_directmap(struct pmap *);
478 #endif
479 #if !defined(XENPV)
480 static void pmap_remap_global(void);
481 #endif
482 #ifndef XENPV
483 static void pmap_init_lapic(void);
484 static void pmap_remap_largepages(void);
485 #endif
486 
487 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
488     struct vm_page **);
489 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
490 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
491     pd_entry_t * const *);
492 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
493 static void pmap_freepage(struct pmap *, struct vm_page *, int);
494 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
495     pt_entry_t *, pd_entry_t * const *);
496 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
497     vaddr_t);
498 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
499     vaddr_t);
500 static int pmap_pvp_ctor(void *, void *, int);
501 static void pmap_pvp_dtor(void *, void *);
502 static struct pv_entry *pmap_alloc_pv(struct pmap *);
503 static void pmap_free_pv(struct pmap *, struct pv_entry *);
504 static void pmap_drain_pv(struct pmap *);
505 
506 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
507 
508 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
509 static void pmap_reactivate(struct pmap *);
510 
511 /*
512  * p m a p   h e l p e r   f u n c t i o n s
513  */
514 
515 static inline void
516 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
517 {
518 
519 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
520 	pmap->pm_stats.resident_count += resid_diff;
521 	pmap->pm_stats.wired_count += wired_diff;
522 }
523 
524 static inline void
525 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
526 {
527 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
528 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
529 
530 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
531 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
532 
533 	pmap_stats_update(pmap, resid_diff, wired_diff);
534 }
535 
536 /*
537  * ptp_to_pmap: lookup pmap by ptp
538  */
539 static inline struct pmap *
540 ptp_to_pmap(struct vm_page *ptp)
541 {
542 	struct pmap *pmap;
543 
544 	if (ptp == NULL) {
545 		return pmap_kernel();
546 	}
547 	pmap = (struct pmap *)ptp->uobject;
548 	KASSERT(pmap != NULL);
549 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
550 	return pmap;
551 }
552 
553 static inline struct pv_pte *
554 pve_to_pvpte(struct pv_entry *pve)
555 {
556 
557 	if (pve == NULL)
558 		return NULL;
559 	KASSERT((void *)&pve->pve_pte == (void *)pve);
560 	return &pve->pve_pte;
561 }
562 
563 static inline struct pv_entry *
564 pvpte_to_pve(struct pv_pte *pvpte)
565 {
566 	struct pv_entry *pve = (void *)pvpte;
567 
568 	KASSERT(pve_to_pvpte(pve) == pvpte);
569 	return pve;
570 }
571 
572 /*
573  * Return true if the pmap page has an embedded PV entry.
574  */
575 static inline bool
576 pv_pte_embedded(struct pmap_page *pp)
577 {
578 
579 	KASSERT(mutex_owned(&pp->pp_lock));
580 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
581 }
582 
583 /*
584  * pv_pte_first, pv_pte_next: PV list iterator.
585  */
586 static inline struct pv_pte *
587 pv_pte_first(struct pmap_page *pp)
588 {
589 
590 	KASSERT(mutex_owned(&pp->pp_lock));
591 	if (pv_pte_embedded(pp)) {
592 		return &pp->pp_pte;
593 	}
594 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
595 }
596 
597 static inline struct pv_pte *
598 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
599 {
600 
601 	KASSERT(mutex_owned(&pp->pp_lock));
602 	KASSERT(pvpte != NULL);
603 	if (pvpte == &pp->pp_pte) {
604 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
605 	}
606 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
607 }
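
/*
 * A minimal usage sketch for the iterator above, visiting every
 * <PMAP,VA> pair a page is mapped at (the caller holds pp->pp_lock):
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... look at pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 */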
608 
609 static inline uint8_t
610 pmap_pte_to_pp_attrs(pt_entry_t pte)
611 {
612 	uint8_t ret = 0;
613 	if (pte & PTE_D)
614 		ret |= PP_ATTRS_D;
615 	if (pte & PTE_A)
616 		ret |= PP_ATTRS_A;
617 	if (pte & PTE_W)
618 		ret |= PP_ATTRS_W;
619 	return ret;
620 }
621 
622 static inline pt_entry_t
623 pmap_pp_attrs_to_pte(uint8_t attrs)
624 {
625 	pt_entry_t pte = 0;
626 	if (attrs & PP_ATTRS_D)
627 		pte |= PTE_D;
628 	if (attrs & PP_ATTRS_A)
629 		pte |= PTE_A;
630 	if (attrs & PP_ATTRS_W)
631 		pte |= PTE_W;
632 	return pte;
633 }
634 
635 /*
636  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
637  * of course the kernel is always loaded
638  */
639 bool
640 pmap_is_curpmap(struct pmap *pmap)
641 {
642 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
643 }
644 
645 inline void
646 pmap_reference(struct pmap *pmap)
647 {
648 
649 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
650 }
651 
652 /*
653  * rbtree: compare two nodes.
654  */
655 static int
656 pmap_compare_nodes(void *context, const void *n1, const void *n2)
657 {
658 	const struct pv_entry *pve1 = n1;
659 	const struct pv_entry *pve2 = n2;
660 
661 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
662 
663 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
664 		return -1;
665 	}
666 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
667 		return 1;
668 	}
669 	return 0;
670 }
671 
672 /*
673  * rbtree: compare a node and a key.
674  */
675 static int
676 pmap_compare_key(void *context, const void *n, const void *k)
677 {
678 	const struct pv_entry *pve = n;
679 	const vaddr_t key = (vaddr_t)k;
680 
681 	if (pve->pve_pte.pte_va < key) {
682 		return -1;
683 	}
684 	if (pve->pve_pte.pte_va > key) {
685 		return 1;
686 	}
687 	return 0;
688 }
689 
690 /*
691  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
692  */
693 static inline void
694 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
695 {
696 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
697 
698 	if (va < *min) {
699 		*min = va;
700 	}
701 }
702 
703 /*
704  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
705  */
706 static inline void
707 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
708 {
709 	vaddr_t sclip;
710 
711 	if (ptp == NULL) {
712 		return;
713 	}
714 
715 	sclip = (vaddr_t)ptp->uanon;
716 	sclip = (*startva < sclip ? sclip : *startva);
717 	*pte += (sclip - *startva) / PAGE_SIZE;
718 	*startva = sclip;
719 }
720 
721 /*
722  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
723  *
724  * there are several pmaps involved.  some or all of them might be same.
725  *
726  *	- the pmap given by the first argument
727  *		our caller wants to access this pmap's PTEs.
728  *
729  *	- pmap_kernel()
730  *		the kernel pmap.  note that it only contains the kernel part
731  *		of the address space which is shared by any pmap.  ie. any
732  *		pmap can be used instead of pmap_kernel() for our purpose.
733  *
734  *	- ci->ci_pmap
735  *		pmap currently loaded on the cpu.
736  *
737  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
738  *		current process' pmap.
739  *
740  * => caller must lock pmap first (if not the kernel pmap)
741  * => must be undone with pmap_unmap_ptes before returning
742  * => disables kernel preemption
743  */
744 void
745 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
746     pd_entry_t * const **pdeppp)
747 {
748 	struct pmap *curpmap;
749 	struct cpu_info *ci;
750 	lwp_t *l;
751 
752 	kpreempt_disable();
753 
754 	/* The kernel's pmap is always accessible. */
755 	if (pmap == pmap_kernel()) {
756 		*pmap2 = NULL;
757 		*ptepp = PTE_BASE;
758 		*pdeppp = normal_pdes;
759 		return;
760 	}
761 
762 	KASSERT(mutex_owned(&pmap->pm_lock));
763 
764 	l = curlwp;
765 	ci = l->l_cpu;
766 	curpmap = ci->ci_pmap;
767 	if (pmap == curpmap) {
768 		/*
769 		 * Already on the CPU: make it valid.  This is very
770 		 * often the case during exit(), when we have switched
771 		 * to the kernel pmap in order to destroy a user pmap.
772 		 */
773 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
774 			pmap_reactivate(pmap);
775 		}
776 		*pmap2 = NULL;
777 	} else {
778 		/*
779 		 * Toss current pmap from CPU and install new pmap, but keep
780 		 * a reference to the old one.  Dropping the reference can
781 		 * block as it needs to take locks, so defer that to
782 		 * pmap_unmap_ptes().
783 		 */
784 		pmap_reference(pmap);
785 		pmap_load1(l, pmap, curpmap);
786 		*pmap2 = curpmap;
787 	}
788 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
789 #ifdef DIAGNOSTIC
790 	pmap->pm_ncsw = lwp_pctr();
791 #endif
792 	*ptepp = PTE_BASE;
793 
794 #if defined(XENPV) && defined(__x86_64__)
795 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
796 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
797 	*pdeppp = ci->ci_normal_pdes;
798 #else
799 	*pdeppp = normal_pdes;
800 #endif
801 }
802 
803 /*
804  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
805  *
806  * => we cannot tolerate context switches while mapped in: assert this.
807  * => reenables kernel preemption.
808  * => does not unlock pmap.
809  */
810 void
811 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
812 {
813 	struct cpu_info *ci;
814 	struct pmap *mypmap;
815 	struct lwp *l;
816 
817 	KASSERT(kpreempt_disabled());
818 
819 	/* The kernel's pmap is always accessible. */
820 	if (pmap == pmap_kernel()) {
821 		kpreempt_enable();
822 		return;
823 	}
824 
825 	l = curlwp;
826 	ci = l->l_cpu;
827 
828 	KASSERT(mutex_owned(&pmap->pm_lock));
829 	KASSERT(pmap->pm_ncsw == lwp_pctr());
830 
831 #if defined(XENPV) && defined(__x86_64__)
832 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
833 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
834 #endif
835 
836 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
837 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
838 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
839 	if (ci->ci_pmap == mypmap) {
840 		ci->ci_want_pmapload = 0;
841 	} else {
842 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
843 		ci->ci_tlbstate = TLBSTATE_LAZY;
844 	}
845 
846 	/* Now safe to re-enable preemption. */
847 	kpreempt_enable();
848 
849 	/* Toss reference to other pmap taken earlier. */
850 	if (pmap2 != NULL) {
851 		pmap_destroy(pmap2);
852 	}
853 }
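
/*
 * The expected calling pattern for the pair above, in sketch form.  For a
 * user pmap the caller owns pmap->pm_lock around the whole sequence and
 * must not block while the PTEs are mapped in:
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... access the PTEs, e.g. ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */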
854 
855 inline static void
856 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
857 {
858 
859 #if !defined(__x86_64__)
860 	if (curproc == NULL || curproc->p_vmspace == NULL ||
861 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
862 		return;
863 
864 	if ((opte ^ npte) & PTE_X)
865 		pmap_update_pg(va);
866 
867 	/*
868 	 * Executability was removed on the last executable change.
869 	 * Reset the code segment to something conservative and
870 	 * let the trap handler deal with setting the right limit.
871 	 * We can't do that because of locking constraints on the vm map.
872 	 */
873 
874 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
875 		struct trapframe *tf = curlwp->l_md.md_regs;
876 
877 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
878 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
879 	}
880 #endif /* !defined(__x86_64__) */
881 }
882 
883 #if !defined(__x86_64__)
884 /*
885  * Fixup the code segment to cover all potential executable mappings.
886  * returns 0 if no changes to the code segment were made.
887  */
888 int
889 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
890 {
891 	struct vm_map_entry *ent;
892 	struct pmap *pm = vm_map_pmap(map);
893 	vaddr_t va = 0;
894 
895 	vm_map_lock_read(map);
896 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
897 		/*
898 		 * This entry has a greater va than the entries before it.
899 		 * We need to make it point to the last page, not past it.
900 		 */
901 		if (ent->protection & VM_PROT_EXECUTE)
902 			va = trunc_page(ent->end) - PAGE_SIZE;
903 	}
904 	vm_map_unlock_read(map);
905 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
906 		return 0;
907 
908 	pm->pm_hiexec = va;
909 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
910 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
911 	} else {
912 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 		return 0;
914 	}
915 	return 1;
916 }
917 #endif /* !defined(__x86_64__) */
918 
919 void
920 pat_init(struct cpu_info *ci)
921 {
922 #ifndef XENPV
923 	uint64_t pat;
924 
925 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
926 		return;
927 
928 	/* We change WT to WC. Leave all other entries the default values. */
929 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
930 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
931 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
932 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
933 
934 	wrmsr(MSR_CR_PAT, pat);
935 	cpu_pat_enabled = true;
936 #endif
937 }
938 
939 static pt_entry_t
940 pmap_pat_flags(u_int flags)
941 {
942 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
943 
944 	if (!cpu_pat_enabled) {
945 		switch (cacheflags) {
946 		case PMAP_NOCACHE:
947 		case PMAP_NOCACHE_OVR:
948 			/* results in PGC_UCMINUS on cpus which have
949 			 * the cpuid PAT but PAT "disabled"
950 			 */
951 			return PTE_PCD;
952 		default:
953 			return 0;
954 		}
955 	}
956 
957 	switch (cacheflags) {
958 	case PMAP_NOCACHE:
959 		return PGC_UC;
960 	case PMAP_WRITE_COMBINE:
961 		return PGC_WC;
962 	case PMAP_WRITE_BACK:
963 		return PGC_WB;
964 	case PMAP_NOCACHE_OVR:
965 		return PGC_UCMINUS;
966 	}
967 
968 	return 0;
969 }
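
/*
 * An illustrative example of how a caching mode is requested (fb_va and
 * fb_pa are placeholders): a driver mapping a framebuffer might ask for
 * write-combining via the flags argument of the enter functions, e.g.
 *
 *	pmap_kenter_pa(fb_va, fb_pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_WRITE_COMBINE);
 *
 * if PAT is not enabled, pmap_pat_flags() above quietly falls back to the
 * default cacheable mapping for such a request.
 */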
970 
971 /*
972  * p m a p   k e n t e r   f u n c t i o n s
973  *
974  * functions to quickly enter/remove pages from the kernel address
975  * space.   pmap_kremove is exported to MI kernel.  we make use of
976  * the recursive PTE mappings.
977  */
978 
979 /*
980  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
981  *
982  * => no need to lock anything, assume va is already allocated
983  * => should be faster than normal pmap enter function
984  */
985 void
986 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
987 {
988 	pt_entry_t *pte, opte, npte;
989 
990 	KASSERT(!(prot & ~VM_PROT_ALL));
991 
992 	if (va < VM_MIN_KERNEL_ADDRESS)
993 		pte = vtopte(va);
994 	else
995 		pte = kvtopte(va);
996 #if defined(XENPV) && defined(DOM0OPS)
997 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
998 #ifdef DEBUG
999 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1000 		    " outside range\n", __func__, pa, va);
1001 #endif /* DEBUG */
1002 		npte = pa;
1003 	} else
1004 #endif /* XENPV && DOM0OPS */
1005 		npte = pmap_pa2pte(pa);
1006 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1007 	npte |= pmap_pat_flags(flags);
1008 	opte = pmap_pte_testset(pte, npte); /* zap! */
1009 
1010 	/*
1011 	 * XXX: make sure we are not dealing with a large page, since the only
1012 	 * large pages created are for the kernel image, and they should never
1013 	 * be kentered.
1014 	 */
1015 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1016 
1017 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1018 		/* This should not happen. */
1019 		printf_nolog("%s: mapping already present\n", __func__);
1020 		kpreempt_disable();
1021 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1022 		kpreempt_enable();
1023 	}
1024 }
1025 
1026 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1027 
1028 #if defined(__x86_64__)
1029 /*
1030  * Change protection for a virtual address. Local for a CPU only, don't
1031  * care about TLB shootdowns.
1032  *
1033  * => must be called with preemption disabled
1034  */
1035 void
1036 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1037 {
1038 	pt_entry_t *pte, opte, npte;
1039 
1040 	KASSERT(kpreempt_disabled());
1041 
1042 	if (va < VM_MIN_KERNEL_ADDRESS)
1043 		pte = vtopte(va);
1044 	else
1045 		pte = kvtopte(va);
1046 
1047 	npte = opte = *pte;
1048 
1049 	if ((prot & VM_PROT_WRITE) != 0)
1050 		npte |= PTE_W;
1051 	else
1052 		npte &= ~(PTE_W|PTE_D);
1053 
1054 	if (opte != npte) {
1055 		pmap_pte_set(pte, npte);
1056 		pmap_pte_flush();
1057 		invlpg(va);
1058 	}
1059 }
1060 #endif /* defined(__x86_64__) */
1061 
1062 /*
1063  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1064  *
1065  * => no need to lock anything
1066  * => caller must dispose of any vm_page mapped in the va range
1067  * => note: not an inline function
1068  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1069  * => we assume kernel only unmaps valid addresses and thus don't bother
1070  *    checking the valid bit before doing TLB flushing
1071  * => must be followed by call to pmap_update() before reuse of page
1072  */
1073 static void
1074 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1075 {
1076 	pt_entry_t *pte, opte;
1077 	vaddr_t va, eva;
1078 
1079 	eva = sva + len;
1080 
1081 	kpreempt_disable();
1082 	for (va = sva; va < eva; va += PAGE_SIZE) {
1083 		pte = kvtopte(va);
1084 		opte = pmap_pte_testset(pte, 0); /* zap! */
1085 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1086 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1087 			    TLBSHOOT_KREMOVE);
1088 		}
1089 		KASSERTMSG((opte & PTE_PS) == 0,
1090 		    "va %#" PRIxVADDR " is a large page", va);
1091 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1092 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1093 	}
1094 	if (localonly) {
1095 		tlbflushg();
1096 	}
1097 	kpreempt_enable();
1098 }
1099 
1100 void
1101 pmap_kremove(vaddr_t sva, vsize_t len)
1102 {
1103 
1104 	pmap_kremove1(sva, len, false);
1105 }
1106 
1107 /*
1108  * pmap_kremove_local: like pmap_kremove(), but only worry about
1109  * TLB invalidations on the current CPU.  this is only intended
1110  * for use while writing kernel crash dumps, either after panic
1111  * or via reboot -d.
1112  */
1113 void
1114 pmap_kremove_local(vaddr_t sva, vsize_t len)
1115 {
1116 
1117 	pmap_kremove1(sva, len, true);
1118 }
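
/*
 * Typical use of the kenter/kremove pair, in sketch form ("va" would come
 * from uvm_km_alloc() or a similar KVA allocator):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 *
 * as noted above, the pmap_update() call is required before the VA can
 * safely be reused.
 */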
1119 
1120 /*
1121  * p m a p   i n i t   f u n c t i o n s
1122  *
1123  * pmap_bootstrap and pmap_init are called during system startup
1124  * to init the pmap module.   pmap_bootstrap() does a low level
1125  * init just to get things rolling.   pmap_init() finishes the job.
1126  */
1127 
1128 /*
1129  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1130  * This function is to be used before any VM system has been set up.
1131  *
1132  * The va is taken from virtual_avail.
1133  */
1134 static vaddr_t
1135 pmap_bootstrap_valloc(size_t npages)
1136 {
1137 	vaddr_t va = virtual_avail;
1138 	virtual_avail += npages * PAGE_SIZE;
1139 	return va;
1140 }
1141 
1142 /*
1143  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1144  * This function is to be used before any VM system has been set up.
1145  *
1146  * The pa is taken from avail_start.
1147  */
1148 static paddr_t
1149 pmap_bootstrap_palloc(size_t npages)
1150 {
1151 	paddr_t pa = avail_start;
1152 	avail_start += npages * PAGE_SIZE;
1153 	return pa;
1154 }
1155 
1156 /*
1157  * pmap_bootstrap: get the system in a state where it can run with VM properly
1158  * enabled (called before main()). The VM system is fully init'd later.
1159  *
1160  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1161  *    kernel, and nkpde PTP's for the kernel.
1162  * => kva_start is the first free virtual address in kernel space.
1163  */
1164 void
1165 pmap_bootstrap(vaddr_t kva_start)
1166 {
1167 	struct pmap *kpm;
1168 	int i;
1169 	vaddr_t kva;
1170 
1171 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1172 
1173 	/*
1174 	 * Set up our local static global vars that keep track of the usage of
1175 	 * KVM before kernel_map is set up.
1176 	 */
1177 	virtual_avail = kva_start;		/* first free KVA */
1178 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1179 
1180 	/*
1181 	 * Set up protection_codes: we need to be able to convert from a MI
1182 	 * protection code (some combo of VM_PROT...) to something we can jam
1183 	 * into a x86 PTE.
1184 	 */
1185 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1186 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1187 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1188 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1189 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1190 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1191 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1192 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1193 
1194 	/*
1195 	 * Now we init the kernel's pmap.
1196 	 *
1197 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1198 	 * the pm_obj contains the list of active PTPs.
1199 	 */
1200 	kpm = pmap_kernel();
1201 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1202 	rw_init(&kpm->pm_dummy_lock);
1203 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1204 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1205 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1206 		kpm->pm_ptphint[i] = NULL;
1207 	}
1208 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1209 
1210 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1211 	for (i = 0; i < PDP_SIZE; i++)
1212 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1213 
1214 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1215 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1216 
1217 	kcpuset_create(&kpm->pm_cpus, true);
1218 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1219 
1220 	kpm->pm_ldt = NULL;
1221 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1222 
1223 	/*
1224 	 * the above is just a rough estimate and not critical to the proper
1225 	 * operation of the system.
1226 	 */
1227 
1228 #if !defined(XENPV)
1229 	/*
1230 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1231 	 * attribute to already mapped kernel pages. Do that only if SVS is
1232 	 * disabled.
1233 	 *
1234 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1235 	 * happens later in cpu_init().
1236 	 */
1237 #ifdef SVS
1238 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1239 #else
1240 	if (cpu_feature[0] & CPUID_PGE) {
1241 #endif
1242 		pmap_pg_g = PTE_G;
1243 		pmap_remap_global();
1244 	}
1245 #endif
1246 
1247 #ifndef XENPV
1248 	/*
1249 	 * Enable large pages if they are supported.
1250 	 */
1251 	if (cpu_feature[0] & CPUID_PSE) {
1252 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1253 		pmap_largepages = 1;	/* enable software */
1254 
1255 		/*
1256 		 * The TLB must be flushed after enabling large pages on Pentium
1257 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1258 		 * Software Developer's Manual, Volume 3: System Programming".
1259 		 */
1260 		tlbflushg();
1261 
1262 		/* Remap the kernel. */
1263 		pmap_remap_largepages();
1264 	}
1265 	pmap_init_lapic();
1266 #endif /* !XENPV */
1267 
1268 #ifdef __HAVE_PCPU_AREA
1269 	pmap_init_pcpu();
1270 #endif
1271 
1272 #ifdef __HAVE_DIRECT_MAP
1273 	pmap_init_directmap(kpm);
1274 #else
1275 	pmap_vpage_cpualloc(&cpu_info_primary);
1276 
1277 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1278 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1279 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1280 	} else { /* amd64 */
1281 		/*
1282 		 * zero_pte is stuck at the end of mapped space for the kernel
1283 		 * image (disjunct from kva space). This is done so that it
1284 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1285 		 * when it's called for the first time.
1286 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1287 		 */
1288 #ifdef XENPV
1289 		/* early_zerop initialized in xen_locore() */
1290 #else
1291 		early_zerop = (void *)bootspace.spareva;
1292 #endif
1293 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1294 	}
1295 #endif
1296 
1297 #if defined(XENPV) && defined(__x86_64__)
1298 	extern vaddr_t xen_dummy_page;
1299 	paddr_t xen_dummy_user_pgd;
1300 
1301 	/*
1302 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1303 	 * Xen will still consider it active. So we set user PGD to this one
1304 	 * to lift all protection on the now-inactive set of page tables.
1305 	 */
1306 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1307 
1308 	/* Zero fill it, the less checks in Xen it requires the better */
1309 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1310 	/* Mark read-only */
1311 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1312 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1313 	    UVMF_INVLPG);
1314 	/* Pin as L4 */
1315 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1316 #endif
1317 
1318 	/*
1319 	 * Allocate space for the IDT, GDT and LDT.
1320 	 */
1321 	idt_vaddr = pmap_bootstrap_valloc(1);
1322 	idt_paddr = pmap_bootstrap_palloc(1);
1323 
1324 	gdt_vaddr = pmap_bootstrap_valloc(1);
1325 	gdt_paddr = pmap_bootstrap_palloc(1);
1326 
1327 #ifdef __HAVE_PCPU_AREA
1328 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1329 #else
1330 	ldt_vaddr = pmap_bootstrap_valloc(1);
1331 #endif
1332 	ldt_paddr = pmap_bootstrap_palloc(1);
1333 
1334 #if !defined(__x86_64__)
1335 	/* pentium f00f bug stuff */
1336 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1337 #endif
1338 
1339 #if defined(XENPVHVM)
1340 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1341 	extern paddr_t HYPERVISOR_shared_info_pa;
1342 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1343 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1344 
1345 	if (vm_guest != VM_GUEST_XENPVH) {
1346 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1347 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1348 	}
1349 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1350 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1351 #endif
1352 	/*
1353 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1354 	 */
1355 	virtual_avail = reserve_dumppages(virtual_avail);
1356 
1357 	/*
1358 	 * Init the global lock and global list.
1359 	 */
1360 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1361 	LIST_INIT(&pmaps);
1362 
1363 	/*
1364 	 * Ensure the TLB is sync'd with reality by flushing it...
1365 	 */
1366 	tlbflushg();
1367 
1368 	/*
1369 	 * Calculate pmap_maxkvaddr from nkptp[].
1370 	 */
1371 	kva = VM_MIN_KERNEL_ADDRESS;
1372 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1373 		kva += nkptp[i] * nbpd[i];
1374 	}
1375 	pmap_maxkvaddr = kva;
1376 }
1377 
1378 #ifndef XENPV
1379 static void
1380 pmap_init_lapic(void)
1381 {
1382 	/*
1383 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1384 	 * x86 implementation relies heavily on this address being valid, so just
1385 	 * allocate a fake physical page that will be kentered into
1386 	 * local_apic_va by machdep.
1387 	 *
1388 	 * If the LAPIC is present, the va will be remapped somewhere else
1389 	 * later in lapic_map.
1390 	 */
1391 	local_apic_va = pmap_bootstrap_valloc(1);
1392 	local_apic_pa = pmap_bootstrap_palloc(1);
1393 }
1394 #endif
1395 
1396 #ifdef __x86_64__
1397 static size_t
1398 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1399 {
1400 	size_t npages;
1401 	npages = (roundup(endva, pgsz) / pgsz) -
1402 	    (rounddown(startva, pgsz) / pgsz);
1403 	return npages;
1404 }
1405 #endif
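
/*
 * For pmap_pagetree_nentries_range() above, a worked example with
 * illustrative numbers: the range [0x1ff000, 0x201000) with pgsz = NBPD_L2
 * (2MB) straddles the boundary at 0x200000, so the result is 2: one entry
 * per 2MB region touched.
 */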
1406 
1407 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1408 static inline void
1409 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1410 {
1411 	size_t sslot = slotspace.area[type].sslot;
1412 	size_t nslot = slotspace.area[type].nslot;
1413 
1414 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1415 }
1416 #endif
1417 
1418 #ifdef __x86_64__
1419 /*
1420  * Randomize the location of an area. We count the holes in the VM space. We
1421  * randomly select one hole, and then randomly select an area within that hole.
1422  * Finally we update the associated entry in the slotspace structure.
1423  */
1424 vaddr_t
1425 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1426     vaddr_t randva)
1427 {
1428 	struct {
1429 		int start;
1430 		int end;
1431 	} holes[SLSPACE_NAREAS+1];
1432 	size_t i, nholes, hole;
1433 	size_t startsl, endsl, nslots, winsize;
1434 	vaddr_t startva, va;
1435 
1436 	sz = roundup(sz, align);
1437 
1438 	/*
1439 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1440 	 * an area that crosses slots:
1441 	 *     +------+------+------+
1442 	 *     | Slot | Slot | Slot |
1443 	 *     +------+------+------+
1444 	 *        [Chosen Area]
1445 	 * And in that case we must take into account the additional slot
1446 	 * consumed.
1447 	 */
1448 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1449 
1450 	/* Get the holes. */
1451 	nholes = 0;
1452 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1453 	while (1) {
1454 		/*
1455 		 * Find the first occupied slot after the current one.
1456 		 * The area between the two is a hole.
1457 		 */
1458 		size_t minsslot = 512;
1459 		size_t minnslot = 0;
1460 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1461 			if (!slotspace.area[i].active)
1462 				continue;
1463 			if (slotspace.area[i].sslot >= curslot &&
1464 			    slotspace.area[i].sslot < minsslot) {
1465 				minsslot = slotspace.area[i].sslot;
1466 				minnslot = slotspace.area[i].nslot;
1467 			}
1468 		}
1469 
1470 		/* No hole anymore, stop here. */
1471 		if (minsslot == 512) {
1472 			break;
1473 		}
1474 
1475 		/* Register the hole. */
1476 		if (minsslot - curslot >= nslots) {
1477 			holes[nholes].start = curslot;
1478 			holes[nholes].end = minsslot;
1479 			nholes++;
1480 		}
1481 
1482 		/* Skip that hole, and iterate again. */
1483 		curslot = minsslot + minnslot;
1484 	}
1485 
1486 	if (nholes == 0) {
1487 		panic("%s: impossible", __func__);
1488 	}
1489 
1490 	/* Select a hole. */
1491 	hole = randhole;
1492 #ifdef NO_X86_ASLR
1493 	hole = 0;
1494 #endif
1495 	hole %= nholes;
1496 	startsl = holes[hole].start;
1497 	endsl = holes[hole].end;
1498 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1499 
1500 	/* Select an area within the hole. */
1501 	va = randva;
1502 #ifdef NO_X86_ASLR
1503 	va = 0;
1504 #endif
1505 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1506 	va %= winsize;
1507 	va = rounddown(va, align);
1508 	va += startva;
1509 
1510 	/* Update the entry. */
1511 	slotspace.area[type].sslot = pl4_i(va);
1512 	slotspace.area[type].nslot =
1513 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1514 	slotspace.area[type].active = true;
1515 
1516 	return va;
1517 }
1518 #endif
1519 
1520 #ifdef __HAVE_PCPU_AREA
1521 static void
1522 pmap_init_pcpu(void)
1523 {
1524 	const vaddr_t startva = PMAP_PCPU_BASE;
1525 	size_t nL4e, nL3e, nL2e, nL1e;
1526 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1527 	paddr_t pa;
1528 	vaddr_t endva;
1529 	vaddr_t tmpva;
1530 	pt_entry_t *pte;
1531 	size_t size;
1532 	int i;
1533 
1534 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1535 
1536 	size = sizeof(struct pcpu_area);
1537 
1538 	endva = startva + size;
1539 
1540 	/* We will use this temporary va. */
1541 	tmpva = bootspace.spareva;
1542 	pte = PTE_BASE + pl1_i(tmpva);
1543 
1544 	/* Build L4 */
1545 	L4e_idx = pl4_i(startva);
1546 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1547 	KASSERT(nL4e  == 1);
1548 	for (i = 0; i < nL4e; i++) {
1549 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1550 
1551 		pa = pmap_bootstrap_palloc(1);
1552 		*pte = (pa & PTE_FRAME) | pteflags;
1553 		pmap_update_pg(tmpva);
1554 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1555 
1556 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1557 	}
1558 
1559 	/* Build L3 */
1560 	L3e_idx = pl3_i(startva);
1561 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1562 	for (i = 0; i < nL3e; i++) {
1563 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1564 
1565 		pa = pmap_bootstrap_palloc(1);
1566 		*pte = (pa & PTE_FRAME) | pteflags;
1567 		pmap_update_pg(tmpva);
1568 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1569 
1570 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1571 	}
1572 
1573 	/* Build L2 */
1574 	L2e_idx = pl2_i(startva);
1575 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1576 	for (i = 0; i < nL2e; i++) {
1577 
1578 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1579 
1580 		pa = pmap_bootstrap_palloc(1);
1581 		*pte = (pa & PTE_FRAME) | pteflags;
1582 		pmap_update_pg(tmpva);
1583 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1584 
1585 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1586 	}
1587 
1588 	/* Build L1 */
1589 	L1e_idx = pl1_i(startva);
1590 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1591 	for (i = 0; i < nL1e; i++) {
1592 		/*
1593 		 * Nothing to do, the PTEs will be entered via
1594 		 * pmap_kenter_pa.
1595 		 */
1596 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1597 	}
1598 
1599 	*pte = 0;
1600 	pmap_update_pg(tmpva);
1601 
1602 	pcpuarea = (struct pcpu_area *)startva;
1603 
1604 	tlbflush();
1605 }
1606 #endif
1607 
1608 #ifdef __HAVE_DIRECT_MAP
1609 static void
1610 randomize_hole(size_t *randholep, vaddr_t *randvap)
1611 {
1612 	struct nist_hash_drbg drbg;
1613 	uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1614 	const char p[] = "x86/directmap";
1615 	int error;
1616 
1617 	entropy_extract(seed, sizeof(seed), 0);
1618 
1619 	error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1620 	    /*nonce*/NULL, 0,
1621 	    /*personalization*/p, strlen(p));
1622 	KASSERTMSG(error == 0, "error=%d", error);
1623 
1624 	error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1625 	    /*additional*/NULL, 0);
1626 	KASSERTMSG(error == 0, "error=%d", error);
1627 
1628 	error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1629 	    /*additional*/NULL, 0);
1630 	KASSERTMSG(error == 0, "error=%d", error);
1631 
1632 	explicit_memset(seed, 0, sizeof(seed));
1633 	explicit_memset(&drbg, 0, sizeof(drbg));
1634 }
1635 
1636 /*
1637  * Create the amd64 direct map. Called only once at boot time. We map all of
1638  * the physical memory contiguously using 2MB large pages, with RW permissions.
1639  * However there is a hole: the kernel is mapped with RO permissions.
1640  */
1641 static void
1642 pmap_init_directmap(struct pmap *kpm)
1643 {
1644 	extern phys_ram_seg_t mem_clusters[];
1645 	extern int mem_cluster_cnt;
1646 
1647 	vaddr_t startva;
1648 	size_t nL4e, nL3e, nL2e;
1649 	size_t L4e_idx, L3e_idx, L2e_idx;
1650 	size_t spahole, epahole;
1651 	paddr_t lastpa, pa;
1652 	vaddr_t endva;
1653 	vaddr_t tmpva;
1654 	pt_entry_t *pte;
1655 	phys_ram_seg_t *mc;
1656 	int i;
1657 	size_t randhole;
1658 	vaddr_t randva;
1659 
1660 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1661 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1662 
1663 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1664 
1665 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1666 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1667 
1668 	/* Get the last physical address available */
1669 	lastpa = 0;
1670 	for (i = 0; i < mem_cluster_cnt; i++) {
1671 		mc = &mem_clusters[i];
1672 		lastpa = MAX(lastpa, mc->start + mc->size);
1673 	}
1674 
1675 	/*
1676 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1677 	 */
1678 	if (lastpa > MAXPHYSMEM) {
1679 		panic("pmap_init_directmap: lastpa incorrect");
1680 	}
1681 
1682 	randomize_hole(&randhole, &randva);
1683 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1684 	    randhole, randva);
1685 	endva = startva + lastpa;
1686 
1687 	/* We will use this temporary va. */
1688 	tmpva = bootspace.spareva;
1689 	pte = PTE_BASE + pl1_i(tmpva);
1690 
1691 	/* Build L4 */
1692 	L4e_idx = pl4_i(startva);
1693 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1694 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1695 	for (i = 0; i < nL4e; i++) {
1696 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1697 
1698 		pa = pmap_bootstrap_palloc(1);
1699 		*pte = (pa & PTE_FRAME) | pteflags;
1700 		pmap_update_pg(tmpva);
1701 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1702 
1703 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1704 	}
1705 
1706 	/* Build L3 */
1707 	L3e_idx = pl3_i(startva);
1708 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1709 	for (i = 0; i < nL3e; i++) {
1710 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1711 
1712 		pa = pmap_bootstrap_palloc(1);
1713 		*pte = (pa & PTE_FRAME) | pteflags;
1714 		pmap_update_pg(tmpva);
1715 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1716 
1717 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1718 	}
1719 
1720 	/* Build L2 */
1721 	L2e_idx = pl2_i(startva);
1722 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1723 	for (i = 0; i < nL2e; i++) {
1724 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1725 
1726 		pa = (paddr_t)(i * NBPD_L2);
1727 
1728 		if (spahole <= pa && pa < epahole) {
1729 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1730 			    PTE_PS | pmap_pg_g;
1731 		} else {
1732 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1733 			    PTE_PS | pmap_pg_g;
1734 		}
1735 	}
1736 
1737 	*pte = 0;
1738 	pmap_update_pg(tmpva);
1739 
1740 	pmap_direct_base = startva;
1741 	pmap_direct_end = endva;
1742 
1743 	tlbflush();
1744 }
1745 #endif /* __HAVE_DIRECT_MAP */
1746 
1747 #if !defined(XENPV)
1748 /*
1749  * Remap all of the virtual pages created so far with the PTE_G bit.
1750  */
1751 static void
1752 pmap_remap_global(void)
1753 {
1754 	vaddr_t kva, kva_end;
1755 	unsigned long p1i;
1756 	size_t i;
1757 
1758 	/* head */
1759 	kva = bootspace.head.va;
1760 	kva_end = kva + bootspace.head.sz;
1761 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1762 		p1i = pl1_i(kva);
1763 		if (pmap_valid_entry(PTE_BASE[p1i]))
1764 			PTE_BASE[p1i] |= pmap_pg_g;
1765 	}
1766 
1767 	/* kernel segments */
1768 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1769 		if (bootspace.segs[i].type == BTSEG_NONE) {
1770 			continue;
1771 		}
1772 		kva = bootspace.segs[i].va;
1773 		kva_end = kva + bootspace.segs[i].sz;
1774 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1775 			p1i = pl1_i(kva);
1776 			if (pmap_valid_entry(PTE_BASE[p1i]))
1777 				PTE_BASE[p1i] |= pmap_pg_g;
1778 		}
1779 	}
1780 
1781 	/* boot space */
1782 	kva = bootspace.boot.va;
1783 	kva_end = kva + bootspace.boot.sz;
1784 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1785 		p1i = pl1_i(kva);
1786 		if (pmap_valid_entry(PTE_BASE[p1i]))
1787 			PTE_BASE[p1i] |= pmap_pg_g;
1788 	}
1789 }
1790 #endif
1791 
1792 #ifndef XENPV
1793 /*
1794  * Remap several kernel segments with large pages. We cover as many pages as we
1795  * can. Called only once at boot time, if the CPU supports large pages.
1796  */
1797 static void
1798 pmap_remap_largepages(void)
1799 {
1800 	pd_entry_t *pde;
1801 	vaddr_t kva, kva_end;
1802 	paddr_t pa;
1803 	size_t i;
1804 
1805 	/* Remap the kernel text using large pages. */
1806 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1807 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1808 			continue;
1809 		}
1810 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1811 		if (kva < bootspace.segs[i].va) {
1812 			continue;
1813 		}
1814 		kva_end = rounddown(bootspace.segs[i].va +
1815 			bootspace.segs[i].sz, NBPD_L2);
1816 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1817 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1818 			pde = &L2_BASE[pl2_i(kva)];
1819 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1820 			tlbflushg();
1821 		}
1822 	}
1823 
1824 	/* Remap the kernel rodata using large pages. */
1825 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1826 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1827 			continue;
1828 		}
1829 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1830 		if (kva < bootspace.segs[i].va) {
1831 			continue;
1832 		}
1833 		kva_end = rounddown(bootspace.segs[i].va +
1834 			bootspace.segs[i].sz, NBPD_L2);
1835 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1836 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1837 			pde = &L2_BASE[pl2_i(kva)];
1838 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1839 			tlbflushg();
1840 		}
1841 	}
1842 
1843 	/* Remap the kernel data+bss using large pages. */
1844 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1845 		if (bootspace.segs[i].type != BTSEG_DATA) {
1846 			continue;
1847 		}
1848 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1849 		if (kva < bootspace.segs[i].va) {
1850 			continue;
1851 		}
1852 		kva_end = rounddown(bootspace.segs[i].va +
1853 			bootspace.segs[i].sz, NBPD_L2);
1854 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1855 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1856 			pde = &L2_BASE[pl2_i(kva)];
1857 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1858 			tlbflushg();
1859 		}
1860 	}
1861 }
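/*
 * Summary of the 2MB mappings established by the three loops above:
 *
 *	text	PTE_P | PTE_PS | pmap_pg_g			read-only, executable
 *	rodata	PTE_P | PTE_PS | pmap_pg_g | pmap_pg_nx		read-only, no-exec
 *	data	PTE_P | PTE_PS | pmap_pg_g | pmap_pg_nx | PTE_W	writable, no-exec
 *
 * Only the fully covered, NBPD_L2-aligned middle of each segment is
 * remapped; the unaligned head and tail keep their 4KB mappings.
 */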
1862 #endif /* !XENPV */
1863 
1864 /*
1865  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1866  * to manage mappings.
1867  */
1868 void
1869 pmap_init(void)
1870 {
1871 	int flags;
1872 
1873 	/*
1874 	 * initialize caches.
1875 	 */
1876 
1877 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1878 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1879 
1880 #ifdef XENPV
1881 	/*
1882 	 * pool_cache(9) should not touch cached objects, since they
1883 	 * are pinned on xen and R/O for the domU
1884 	 */
1885 	flags = PR_NOTOUCH;
1886 #else
1887 	flags = 0;
1888 #endif
1889 
1890 #ifdef PAE
1891 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1892 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1893 #else
1894 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1895 	    "pdppl", NULL, IPL_NONE);
1896 #endif
1897 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1898 	     0, 0, "pvpage", &pool_allocator_kmem,
1899 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1900 
1901 	pmap_tlb_init();
1902 
1903 	/* XXX: Needed here because cpu_hatch() is run only for secondary CPUs. */
1904 	pmap_tlb_cpu_init(curcpu());
1905 
1906 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1907 	    NULL, "x86", "io bitmap copy");
1908 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1909 	    NULL, "x86", "ldt sync");
1910 
1911 	/*
1912 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1913 	 * to hang a tree of pv_entry records.  Dynamically allocated
1914 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1915 	 * usual case is embedded), so cop out and use a single RB tree
1916 	 * to cover them.
1917 	 */
1918 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1919 
1920 	/*
1921 	 * done: pmap module is up (and ready for business)
1922 	 */
1923 
1924 	pmap_initialized = true;
1925 }
1926 
1927 #ifndef XENPV
1928 /*
1929  * pmap_cpu_init_late: perform late per-CPU initialization.
1930  */
1931 void
1932 pmap_cpu_init_late(struct cpu_info *ci)
1933 {
1934 	/*
1935 	 * The BP has already its own PD page allocated during early
1936 	 * MD startup.
1937 	 */
1938 	if (ci == &cpu_info_primary)
1939 		return;
1940 #ifdef PAE
1941 	cpu_alloc_l3_page(ci);
1942 #endif
1943 }
1944 #endif
1945 
1946 #ifndef __HAVE_DIRECT_MAP
1947 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1948 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1949 
1950 static void
1951 pmap_vpage_cpualloc(struct cpu_info *ci)
1952 {
1953 	bool primary = (ci == &cpu_info_primary);
1954 	size_t i, npages;
1955 	vaddr_t vabase;
1956 	vsize_t vrange;
1957 
1958 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1959 	KASSERT(npages >= VPAGE_MAX);
1960 	vrange = npages * PAGE_SIZE;
1961 
1962 	if (primary) {
1963 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1964 			/* Waste some pages to align properly */
1965 		}
1966 		/* The base is aligned, allocate the rest (contiguous) */
1967 		pmap_bootstrap_valloc(npages - 1);
1968 	} else {
1969 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1970 		    UVM_KMF_VAONLY);
1971 		if (vabase == 0) {
1972 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1973 			    __func__, cpu_index(ci));
1974 		}
1975 	}
1976 
1977 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1978 
1979 	for (i = 0; i < VPAGE_MAX; i++) {
1980 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1981 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1982 	}
1983 }
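/*
 * The alignment loop above is presumably about false sharing: with
 * vabase aligned to a vrange boundary, the PTEs for one CPU's vpages
 * span exactly one cache line of the page table (this is what the
 * KASSERT on &PTE_BASE[pl1_i(vabase)] checks), so temporary-mapping
 * traffic on one CPU does not bounce the cache line holding another
 * CPU's vpage PTEs.
 */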
1984 
1985 void
1986 pmap_vpage_cpu_init(struct cpu_info *ci)
1987 {
1988 	if (ci == &cpu_info_primary) {
1989 		/* cpu0 already taken care of in pmap_bootstrap */
1990 		return;
1991 	}
1992 
1993 	pmap_vpage_cpualloc(ci);
1994 }
1995 #endif
1996 
1997 /*
1998  * p v _ e n t r y   f u n c t i o n s
1999  */
2000 
2001 /*
2002  * pmap_pvp_ctor: pool_cache constructor for PV pages.
2003  */
2004 static int
2005 pmap_pvp_ctor(void *arg, void *obj, int flags)
2006 {
2007 	struct pv_page *pvp = (struct pv_page *)obj;
2008 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
2009 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
2010 
2011 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2012 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2013 
2014 	LIST_INIT(&pvp->pvp_pves);
2015 	pvp->pvp_nfree = PVE_PER_PVP;
2016 	pvp->pvp_pmap = NULL;
2017 
2018 	for (; pve < maxpve; pve++) {
2019 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2020 	}
2021 
2022 	return 0;
2023 }
2024 
2025 /*
2026  * pmap_pvp_dtor: pool_cache destructor for PV pages.
2027  */
2028 static void
2029 pmap_pvp_dtor(void *arg, void *obj)
2030 {
2031 	struct pv_page *pvp __diagused = obj;
2032 
2033 	KASSERT(pvp->pvp_pmap == NULL);
2034 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2035 }
2036 
2037 /*
2038  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2039  */
2040 static struct pv_entry *
2041 pmap_alloc_pv(struct pmap *pmap)
2042 {
2043 	struct pv_entry *pve;
2044 	struct pv_page *pvp;
2045 
2046 	KASSERT(mutex_owned(&pmap->pm_lock));
2047 
2048 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2049 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2050 			LIST_REMOVE(pvp, pvp_list);
2051 		} else {
2052 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2053 		}
2054 		if (__predict_false(pvp == NULL)) {
2055 			return NULL;
2056 		}
2057 		/* full -> part */
2058 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2059 		pvp->pvp_pmap = pmap;
2060 	}
2061 
2062 	KASSERT(pvp->pvp_pmap == pmap);
2063 	KASSERT(pvp->pvp_nfree > 0);
2064 
2065 	pve = LIST_FIRST(&pvp->pvp_pves);
2066 	LIST_REMOVE(pve, pve_list);
2067 	pvp->pvp_nfree--;
2068 
2069 	if (__predict_false(pvp->pvp_nfree == 0)) {
2070 		/* part -> empty */
2071 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2072 		LIST_REMOVE(pvp, pvp_list);
2073 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2074 	} else {
2075 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2076 	}
2077 
2078 	return pve;
2079 }
2080 
2081 /*
2082  * pmap_free_pv: delayed free of a PV entry.
2083  */
2084 static void
2085 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2086 {
2087 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2088 
2089 	KASSERT(mutex_owned(&pmap->pm_lock));
2090 	KASSERT(pvp->pvp_pmap == pmap);
2091 	KASSERT(pvp->pvp_nfree >= 0);
2092 
2093 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2094 	pvp->pvp_nfree++;
2095 
2096 	if (__predict_false(pvp->pvp_nfree == 1)) {
2097 		/* empty -> part */
2098 		LIST_REMOVE(pvp, pvp_list);
2099 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2100 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2101 		/* part -> full */
2102 		LIST_REMOVE(pvp, pvp_list);
2103 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2104 	}
2105 }
2106 
2107 /*
2108  * pmap_drain_pv: free full PV pages.
2109  */
2110 static void
2111 pmap_drain_pv(struct pmap *pmap)
2112 {
2113 	struct pv_page *pvp;
2114 
2115 	KASSERT(mutex_owned(&pmap->pm_lock));
2116 
2117 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2118 		LIST_REMOVE(pvp, pvp_list);
2119 		KASSERT(pvp->pvp_pmap == pmap);
2120 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2121 		pvp->pvp_pmap = NULL;
2122 		pool_cache_put(&pmap_pvp_cache, pvp);
2123 	}
2124 }
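/*
 * The three pm_pvp_* lists used above behave as a small state machine
 * keyed on pvp_nfree (a sketch of the transitions in pmap_alloc_pv(),
 * pmap_free_pv() and pmap_drain_pv()):
 *
 *	pm_pvp_full	all PVE_PER_PVP entries free
 *	pm_pvp_part	some entries free, some allocated
 *	pm_pvp_empty	no entries free
 *
 * alloc:  take from part (refilling it from full or the pool cache);
 *         part -> empty when the last free entry is handed out.
 * free:   empty -> part on the first entry returned;
 *         part -> full when the last allocated entry comes back.
 * drain:  pages on the full list are handed back to pmap_pvp_cache.
 */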
2125 
2126 /*
2127  * pmap_check_pv: verify that the {PTP, VA} pair is tracked/untracked by the page, as expected
2128  */
2129 static void
2130 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2131     vaddr_t va, bool tracked)
2132 {
2133 #ifdef DEBUG
2134 	struct pv_pte *pvpte;
2135 
2136 	PMAP_CHECK_PP(pp);
2137 
2138 	mutex_spin_enter(&pp->pp_lock);
2139 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2140 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2141 			break;
2142 		}
2143 	}
2144 	mutex_spin_exit(&pp->pp_lock);
2145 
2146 	if (pvpte && !tracked) {
2147 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2148 	} else if (!pvpte && tracked) {
2149 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2150 	}
2151 #endif
2152 }
2153 
2154 /*
2155  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2156  *
2157  * => pmap must be locked
2158  */
2159 static struct pv_entry *
2160 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2161     const rb_tree_t *tree, const vaddr_t va)
2162 {
2163 	struct pv_entry *pve;
2164 	rb_node_t *node;
2165 
2166 	/*
2167 	 * Inlined lookup tailored to exactly what's needed here; it is
2168 	 * quite a bit faster than using rb_tree_find_node().
2169 	 */
2170 	for (node = tree->rbt_root;;) {
2171 		if (__predict_false(RB_SENTINEL_P(node))) {
2172 			return NULL;
2173 		}
2174 		pve = (struct pv_entry *)
2175 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2176 		if (pve->pve_pte.pte_va == va) {
2177 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2178 			return pve;
2179 		}
2180 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2181 	}
2182 }
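/*
 * Note on the descent above: with the usual rbtree(9) layout, rb_nodes[]
 * is indexed by RB_DIR_LEFT (0) / RB_DIR_RIGHT (1), so indexing it with
 * (pve->pve_pte.pte_va < va) goes right while the current key is smaller
 * than va and left while it is larger -- an open-coded binary search on
 * pte_va.
 */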
2183 
2184 /*
2185  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2186  *
2187  * => a PV entry must be known present (doesn't check for existence)
2188  * => pmap must be locked
2189  */
2190 static struct pv_entry *
2191 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2192     const struct pmap_page * const old_pp, const vaddr_t va)
2193 {
2194 	struct pv_entry *pve;
2195 	const rb_tree_t *tree;
2196 
2197 	KASSERT(mutex_owned(&pmap->pm_lock));
2198 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2199 
2200 	/*
2201 	 * [This mostly deals with the case of process-private pages, i.e.
2202 	 * anonymous memory allocations or COW.]
2203 	 *
2204 	 * If the page is tracked with an embedded entry then the tree
2205 	 * lookup can be avoided.  It's safe to check for this specific
2206 	 * set of values without pp_lock because both will only ever be
2207 	 * set together for this pmap.
2209 	 */
2210 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2211 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2212 		return NULL;
2213 	}
2214 
2215 	/*
2216 	 * [This mostly deals with shared mappings, for example shared libs
2217 	 * and executables.]
2218 	 *
2219 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2220 	 * look at the lowest numbered node in the tree first.  The tree is
2221 	 * known non-empty because of the check above.  For short lived
2222 	 * processes where pmap_remove() isn't used much this gets close to
2223 	 * a 100% hit rate.
2224 	 */
2225 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2226 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2227 	pve = (struct pv_entry *)
2228 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2229 	    offsetof(struct pv_entry, pve_rb));
2230 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2231 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2232 		return pve;
2233 	}
2234 
2235 	/* Search the RB tree for the key (uncommon). */
2236 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2237 }
2238 
2239 /*
2240  * pmap_enter_pv: enter a mapping onto a pmap_page list
2241  *
2242  * => pmap must be locked
2243  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2244  */
2245 static int
2246 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2247     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2248     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2249 {
2250 	struct pv_entry *pve;
2251 	int error;
2252 
2253 	KASSERT(mutex_owned(&pmap->pm_lock));
2254 	KASSERT(ptp_to_pmap(ptp) == pmap);
2255 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2256 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2257 	PMAP_CHECK_PP(pp);
2258 
2259 	/*
2260 	 * If entering the same page and it's already tracked with an
2261 	 * embedded entry, we can avoid the expense below.  It's safe
2262 	 * to check for this very specific set of values without a lock
2263 	 * because both will only ever be set together for this pmap.
2264 	 */
2265 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2266 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2267 		*samepage = true;
2268 		pmap_check_pv(pmap, ptp, pp, va, true);
2269 		return 0;
2270 	}
2271 
2272 	/*
2273 	 * Check for an existing dynamic mapping at this address.  If it's
2274 	 * for the same page, then it will be reused and nothing needs to be
2275 	 * changed.
2276 	 */
2277 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2278 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2279 		*samepage = true;
2280 		pmap_check_pv(pmap, ptp, pp, va, true);
2281 		return 0;
2282 	}
2283 
2284 	/*
2285 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2286 	 * case it's needed; won't know for sure until the lock is taken.
2287 	 */
2288 	if (pmap->pm_pve == NULL) {
2289 		pmap->pm_pve = pmap_alloc_pv(pmap);
2290 	}
2291 
2292 	error = 0;
2293 	pmap_check_pv(pmap, ptp, pp, va, false);
2294 	mutex_spin_enter(&pp->pp_lock);
2295 	if (!pv_pte_embedded(pp)) {
2296 		/*
2297 		 * Embedded PV tracking available - easy.
2298 		 */
2299 		pp->pp_pte.pte_ptp = ptp;
2300 		pp->pp_pte.pte_va = va;
2301 		*new_embedded = true;
2302 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2303 		/*
2304 		 * No memory.
2305 		 */
2306 		error = ENOMEM;
2307 	} else {
2308 		/*
2309 		 * Install new pv_entry on the page.
2310 		 */
2311 		pve = pmap->pm_pve;
2312 		pmap->pm_pve = NULL;
2313 		*new_pve = pve;
2314 		pve->pve_pte.pte_ptp = ptp;
2315 		pve->pve_pte.pte_va = va;
2316 		pve->pve_pp = pp;
2317 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2318 	}
2319 	mutex_spin_exit(&pp->pp_lock);
2320 	if (error == 0) {
2321 		pmap_check_pv(pmap, ptp, pp, va, true);
2322 	}
2323 
2324 	return error;
2325 }
2326 
2327 /*
2328  * pmap_remove_pv: try to remove a mapping from a pv_list
2329  *
2330  * => pmap must be locked
2331  * => removes dynamic entries from tree and frees them
2332  * => caller should adjust ptp's wire_count and free PTP if needed
2333  */
2334 static void
2335 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2336     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2337 {
2338 	rb_tree_t *tree = (ptp != NULL ?
2339 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2340 
2341 	KASSERT(mutex_owned(&pmap->pm_lock));
2342 	KASSERT(ptp_to_pmap(ptp) == pmap);
2343 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2344 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2345 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2346 
2347 	pmap_check_pv(pmap, ptp, pp, va, true);
2348 
2349 	if (pve == NULL) {
2350 		mutex_spin_enter(&pp->pp_lock);
2351 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2352 		KASSERT(pp->pp_pte.pte_va == va);
2353 		pp->pp_attrs |= oattrs;
2354 		pp->pp_pte.pte_ptp = NULL;
2355 		pp->pp_pte.pte_va = 0;
2356 		mutex_spin_exit(&pp->pp_lock);
2357 	} else {
2358 		mutex_spin_enter(&pp->pp_lock);
2359 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2360 		    pp->pp_pte.pte_va != va);
2361 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2362 		KASSERT(pve->pve_pte.pte_va == va);
2363 		KASSERT(pve->pve_pp == pp);
2364 		pp->pp_attrs |= oattrs;
2365 		LIST_REMOVE(pve, pve_list);
2366 		mutex_spin_exit(&pp->pp_lock);
2367 
2368 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2369 		rb_tree_remove_node(tree, pve);
2370 #ifdef DIAGNOSTIC
2371 		memset(pve, 0, sizeof(*pve));
2372 #endif
2373 		pmap_free_pv(pmap, pve);
2374 	}
2375 
2376 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2377 	pmap_check_pv(pmap, ptp, pp, va, false);
2378 }
2379 
2380 /*
2381  * p t p   f u n c t i o n s
2382  */
2383 
2384 static struct vm_page *
2385 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2386 {
2387 	int lidx = level - 1;
2388 	off_t off = ptp_va2o(va, level);
2389 	struct vm_page *pg;
2390 
2391 	KASSERT(mutex_owned(&pmap->pm_lock));
2392 
2393 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2394 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2395 		pg = pmap->pm_ptphint[lidx];
2396 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2397 		return pg;
2398 	}
2399 	PMAP_DUMMY_LOCK(pmap);
2400 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2401 	PMAP_DUMMY_UNLOCK(pmap);
2402 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2403 		/* This page is queued to be freed - ignore. */
2404 		pg = NULL;
2405 	}
2406 	if (pg != NULL) {
2407 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2408 	}
2409 	pmap->pm_ptphint[lidx] = pg;
2410 	return pg;
2411 }
2412 
2413 static inline void
2414 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2415 {
2416 	int lidx;
2417 
2418 	KASSERT(ptp->wire_count <= 1);
2419 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2420 
2421 	lidx = level - 1;
2422 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2423 	if (pmap->pm_ptphint[lidx] == ptp)
2424 		pmap->pm_ptphint[lidx] = NULL;
2425 	ptp->wire_count = 0;
2426 	ptp->uanon = NULL;
2427 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2428 
2429 	/*
2430 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2431 	 * the page from the uvm_object, as that can take further locks
2432 	 * (intolerable right now because the PTEs are likely mapped in).
2433 	 * Instead mark the PTP as free and if we bump into it again, we'll
2434 	 * either ignore or reuse (depending on what's useful at the time).
2435 	 */
2436 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2437 }
2438 
2439 static void
2440 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2441 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2442 {
2443 	unsigned long index;
2444 	int level;
2445 	vaddr_t invaladdr;
2446 	pd_entry_t opde;
2447 
2448 	KASSERT(pmap != pmap_kernel());
2449 	KASSERT(mutex_owned(&pmap->pm_lock));
2450 	KASSERT(kpreempt_disabled());
2451 
2452 	level = 1;
2453 	do {
2454 		index = pl_i(va, level + 1);
2455 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2456 
2457 		/*
2458 		 * On Xen-amd64 or SVS, we need to sync the top level page
2459 		 * directory on each CPU.
2460 		 */
2461 #if defined(XENPV) && defined(__x86_64__)
2462 		if (level == PTP_LEVELS - 1) {
2463 			xen_kpm_sync(pmap, index);
2464 		}
2465 #elif defined(SVS)
2466 		if (svs_enabled && level == PTP_LEVELS - 1) {
2467 			svs_pmap_sync(pmap, index);
2468 		}
2469 #endif
2470 
2471 		invaladdr = level == 1 ? (vaddr_t)ptes :
2472 		    (vaddr_t)pdes[level - 2];
2473 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2474 		    opde, TLBSHOOT_FREE_PTP);
2475 
2476 #if defined(XENPV)
2477 		pmap_tlb_shootnow();
2478 #endif
2479 
2480 		pmap_freepage(pmap, ptp, level);
2481 		if (level < PTP_LEVELS - 1) {
2482 			ptp = pmap_find_ptp(pmap, va, level + 1);
2483 			ptp->wire_count--;
2484 			if (ptp->wire_count > 1)
2485 				break;
2486 		}
2487 	} while (++level < PTP_LEVELS);
2488 	pmap_pte_flush();
2489 }
2490 
2491 /*
2492  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2493  *
2494  * => pmap should NOT be pmap_kernel()
2495  * => pmap should be locked
2496  * => we are not touching any PTEs yet, so they need not be mapped in
2497  */
2498 static int
2499 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2500     int flags, struct vm_page **resultp)
2501 {
2502 	struct vm_page *ptp;
2503 	int i, aflags;
2504 	struct uvm_object *obj;
2505 	voff_t off;
2506 
2507 	KASSERT(pmap != pmap_kernel());
2508 	KASSERT(mutex_owned(&pmap->pm_lock));
2509 
2510 	/*
2511 	 * Loop through all page table levels allocating a page
2512 	 * for any level where we don't already have one.
2513 	 */
2514 	memset(pt, 0, sizeof(*pt));
2515 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2516 		UVM_PGA_ZERO;
2517 	for (i = PTP_LEVELS; i > 1; i--) {
2518 		obj = &pmap->pm_obj[i - 2];
2519 		off = ptp_va2o(va, i - 1);
2520 
2521 		PMAP_DUMMY_LOCK(pmap);
2522 		pt->pg[i] = uvm_pagelookup(obj, off);
2523 
2524 		if (pt->pg[i] == NULL) {
2525 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2526 			pt->alloced[i] = (pt->pg[i] != NULL);
2527 		} else if (pt->pg[i]->wire_count == 0) {
2528 			/* This page was queued to be freed; dequeue it. */
2529 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2530 			pt->alloced[i] = true;
2531 		}
2532 		PMAP_DUMMY_UNLOCK(pmap);
2533 		if (pt->pg[i] == NULL) {
2534 			pmap_unget_ptp(pmap, pt);
2535 			return ENOMEM;
2536 		} else if (pt->alloced[i]) {
2537 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2538 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2539 			    &pmap_rbtree_ops);
2540 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2541 		}
2542 	}
2543 	ptp = pt->pg[2];
2544 	KASSERT(ptp != NULL);
2545 	*resultp = ptp;
2546 	pmap->pm_ptphint[0] = ptp;
2547 	return 0;
2548 }
2549 
2550 /*
2551  * pmap_install_ptp: install any freshly allocated PTPs
2552  *
2553  * => pmap should NOT be pmap_kernel()
2554  * => pmap should be locked
2555  * => PTEs must be mapped
2556  * => preemption must be disabled
2557  */
2558 static void
2559 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2560     pd_entry_t * const *pdes)
2561 {
2562 	struct vm_page *ptp;
2563 	unsigned long index;
2564 	pd_entry_t *pva;
2565 	paddr_t pa;
2566 	int i;
2567 
2568 	KASSERT(pmap != pmap_kernel());
2569 	KASSERT(mutex_owned(&pmap->pm_lock));
2570 	KASSERT(kpreempt_disabled());
2571 
2572 	/*
2573 	 * Now that we have all the pages looked up or allocated,
2574 	 * loop through again installing any new ones into the tree.
2575 	 */
2576 	for (i = PTP_LEVELS; i > 1; i--) {
2577 		index = pl_i(va, i);
2578 		pva = pdes[i - 2];
2579 
2580 		if (pmap_valid_entry(pva[index])) {
2581 			KASSERT(!pt->alloced[i]);
2582 			continue;
2583 		}
2584 
2585 		ptp = pt->pg[i];
2586 		ptp->flags &= ~PG_BUSY; /* never busy */
2587 		ptp->wire_count = 1;
2588 		pmap->pm_ptphint[i - 2] = ptp;
2589 		pa = VM_PAGE_TO_PHYS(ptp);
2590 		pmap_pte_set(&pva[index], (pd_entry_t)
2591 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2592 
2593 		/*
2594 		 * On Xen-amd64 or SVS, we need to sync the top level page
2595 		 * directory on each CPU.
2596 		 */
2597 #if defined(XENPV) && defined(__x86_64__)
2598 		if (i == PTP_LEVELS) {
2599 			xen_kpm_sync(pmap, index);
2600 		}
2601 #elif defined(SVS)
2602 		if (svs_enabled && i == PTP_LEVELS) {
2603 			svs_pmap_sync(pmap, index);
2604 		}
2605 #endif
2606 
2607 		pmap_pte_flush();
2608 		pmap_stats_update(pmap, 1, 0);
2609 
2610 		/*
2611 		 * If we're not in the top level, increase the
2612 		 * wire count of the parent page.
2613 		 */
2614 		if (i < PTP_LEVELS) {
2615 			pt->pg[i + 1]->wire_count++;
2616 		}
2617 	}
2618 }
2619 
2620 /*
2621  * pmap_unget_ptp: free unused PTPs
2622  *
2623  * => pmap should NOT be pmap_kernel()
2624  * => pmap should be locked
2625  */
2626 static void
2627 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2628 {
2629 	int i;
2630 
2631 	KASSERT(pmap != pmap_kernel());
2632 	KASSERT(mutex_owned(&pmap->pm_lock));
2633 
2634 	for (i = PTP_LEVELS; i > 1; i--) {
2635 		if (!pt->alloced[i]) {
2636 			continue;
2637 		}
2638 		KASSERT(pt->pg[i]->wire_count == 0);
2639 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2640 		pmap_freepage(pmap, pt->pg[i], i - 1);
2641 	}
2642 }
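/*
 * Taken together, the three routines above form the PTP allocation
 * protocol used when entering new mappings: pmap_get_ptp() looks up or
 * allocates the page at every level without touching any PTEs,
 * pmap_install_ptp() later wires the freshly allocated ones into the
 * paging hierarchy (with the PTEs mapped and preemption disabled), and
 * pmap_unget_ptp() returns anything unused if the operation is
 * abandoned.
 */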
2643 
2644 /*
2645  * p m a p   l i f e c y c l e   f u n c t i o n s
2646  */
2647 
2648 /*
2649  * pmap_pdp_init: construct a new PDP.
2650  */
2651 static void
2652 pmap_pdp_init(pd_entry_t *pdir)
2653 {
2654 	paddr_t pdirpa = 0;
2655 	vaddr_t object;
2656 	int i;
2657 
2658 #if !defined(XENPV) || !defined(__x86_64__)
2659 	int npde;
2660 #endif
2661 #ifdef XENPV
2662 	int s;
2663 #endif
2664 
2665 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2666 
2667 	/*
2668 	 * NOTE: This is all done unlocked, but we will check afterwards
2669 	 * if we have raced with pmap_growkernel().
2670 	 */
2671 
2672 #if defined(XENPV) && defined(__x86_64__)
2673 	/* Fetch the physical address of the page directory */
2674 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2675 
2676 	/*
2677 	 * This pdir will NEVER be active in kernel mode, so mark
2678 	 * recursive entry invalid.
2679 	 */
2680 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2681 
2682 	/*
2683 	 * A PDP constructed this way will never be used by the kernel, hence
2684 	 * we don't put the kernel mappings in it on Xen.
2685 	 *
2686 	 * But we need to make pmap_create() happy, so put a dummy
2687 	 * (without PTE_P) value at the right place.
2688 	 */
2689 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2690 	     (pd_entry_t)-1 & PTE_FRAME;
2691 #else /* XENPV && __x86_64__*/
2692 	object = (vaddr_t)pdir;
2693 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2694 		/* Fetch the physical address of the page directory */
2695 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2696 
2697 		/* Put in recursive PDE to map the PTEs */
2698 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2699 		    pmap_pg_nx;
2700 #ifndef XENPV
2701 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2702 #endif
2703 	}
2704 
2705 	/* Copy the kernel's top level PDE */
2706 	npde = nkptp[PTP_LEVELS - 1];
2707 
2708 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2709 	    npde * sizeof(pd_entry_t));
2710 
2711 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2712 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2713 		pdir[idx] = PDP_BASE[idx];
2714 	}
2715 
2716 #ifdef __HAVE_PCPU_AREA
2717 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2718 #endif
2719 #ifdef __HAVE_DIRECT_MAP
2720 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2721 #endif
2722 #ifdef KASAN
2723 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2724 #endif
2725 #ifdef KMSAN
2726 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2727 #endif
2728 #endif /* XENPV  && __x86_64__*/
2729 
2730 #ifdef XENPV
2731 	s = splvm();
2732 	object = (vaddr_t)pdir;
2733 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2734 	    VM_PROT_READ);
2735 	pmap_update(pmap_kernel());
2736 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2737 		/*
2738 		 * Pin as an L2/L4 page; we have to do the page with the
2739 		 * PDIR_SLOT_PTE entries last.
2740 		 */
2741 #ifdef PAE
2742 		if (i == l2tol3(PDIR_SLOT_PTE))
2743 			continue;
2744 #endif
2745 
2746 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2747 #ifdef __x86_64__
2748 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2749 #else
2750 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2751 #endif
2752 	}
2753 #ifdef PAE
2754 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2755 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2756 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2757 #endif
2758 	splx(s);
2759 #endif /* XENPV */
2760 }
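/*
 * The PDIR_SLOT_PTE entries written above are the classic recursive
 * mapping: the page directory is entered into itself, so once a pmap is
 * loaded, its page-table pages appear at fixed virtual addresses and can
 * be reached through PTE_BASE / L2_BASE / etc. without any extra
 * mappings.
 */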
2761 
2762 /*
2763  * pmap_pdp_fini: destructor for the PDPs.
2764  */
2765 static void
2766 pmap_pdp_fini(pd_entry_t *pdir)
2767 {
2768 #ifdef XENPV
2769 	paddr_t pdirpa = 0;	/* XXX: GCC */
2770 	vaddr_t object = (vaddr_t)pdir;
2771 	int i;
2772 	int s = splvm();
2773 	pt_entry_t *pte;
2774 
2775 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2776 		/* fetch the physical address of the page directory. */
2777 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2778 		/* unpin page table */
2779 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2780 	}
2781 	object = (vaddr_t)pdir;
2782 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2783 		/* Set page RW again */
2784 		pte = kvtopte(object);
2785 		pmap_pte_set(pte, *pte | PTE_W);
2786 		xen_bcast_invlpg((vaddr_t)object);
2787 	}
2788 	splx(s);
2789 #endif  /* XENPV */
2790 }
2791 
2792 #ifdef PAE
2793 static void *
2794 pmap_pdp_alloc(struct pool *pp, int flags)
2795 {
2796 	return (void *)uvm_km_alloc(kernel_map,
2797 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2798 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2799 	    UVM_KMF_WIRED);
2800 }
2801 
2802 static void
2803 pmap_pdp_free(struct pool *pp, void *v)
2804 {
2805 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2806 	    UVM_KMF_WIRED);
2807 }
2808 #endif /* PAE */
2809 
2810 /*
2811  * pmap_ctor: constructor for the pmap cache.
2812  */
2813 static int
2814 pmap_ctor(void *arg, void *obj, int flags)
2815 {
2816 	struct pmap *pmap = obj;
2817 	pt_entry_t p;
2818 	int i;
2819 
2820 	KASSERT((flags & PR_WAITOK) != 0);
2821 
2822 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2823 	rw_init(&pmap->pm_dummy_lock);
2824 	kcpuset_create(&pmap->pm_cpus, true);
2825 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2826 #ifdef XENPV
2827 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2828 #endif
2829 	LIST_INIT(&pmap->pm_gc_ptp);
2830 	pmap->pm_pve = NULL;
2831 	LIST_INIT(&pmap->pm_pvp_full);
2832 	LIST_INIT(&pmap->pm_pvp_part);
2833 	LIST_INIT(&pmap->pm_pvp_empty);
2834 
2835 	/* allocate and init PDP */
2836 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2837 
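	/*
	 * Retry loop: pmap_pdp_init() copies the kernel PDEs without any
	 * lock held.  Re-check the last kernel slot under pmaps_lock; if
	 * it reads as zero, pmap_growkernel() grew the kernel in the
	 * meantime and our copy is stale, so redo it.
	 */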
2838 	for (;;) {
2839 		pmap_pdp_init(pmap->pm_pdir);
2840 		mutex_enter(&pmaps_lock);
2841 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2842 		if (__predict_true(p != 0)) {
2843 			break;
2844 		}
2845 		mutex_exit(&pmaps_lock);
2846 	}
2847 
2848 	for (i = 0; i < PDP_SIZE; i++)
2849 		pmap->pm_pdirpa[i] =
2850 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2851 
2852 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2853 	mutex_exit(&pmaps_lock);
2854 
2855 	return 0;
2856 }
2857 
2858 /*
2859  * pmap_dtor: destructor for the pmap cache.
2860  */
2861 static void
2862 pmap_dtor(void *arg, void *obj)
2863 {
2864 	struct pmap *pmap = obj;
2865 
2866 	mutex_enter(&pmaps_lock);
2867 	LIST_REMOVE(pmap, pm_list);
2868 	mutex_exit(&pmaps_lock);
2869 
2870 	pmap_pdp_fini(pmap->pm_pdir);
2871 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2872 	mutex_destroy(&pmap->pm_lock);
2873 	rw_destroy(&pmap->pm_dummy_lock);
2874 	kcpuset_destroy(pmap->pm_cpus);
2875 	kcpuset_destroy(pmap->pm_kernel_cpus);
2876 #ifdef XENPV
2877 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2878 #endif
2879 }
2880 
2881 /*
2882  * pmap_create: create a pmap object.
2883  */
2884 struct pmap *
2885 pmap_create(void)
2886 {
2887 	struct pmap *pmap;
2888 	int i;
2889 
2890 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2891 
2892 	/* init uvm_object */
2893 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2894 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2895 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2896 		pmap->pm_ptphint[i] = NULL;
2897 	}
2898 	pmap->pm_stats.wired_count = 0;
2899 	/* count the PDP allocd below */
2900 	pmap->pm_stats.resident_count = PDP_SIZE;
2901 #if !defined(__x86_64__)
2902 	pmap->pm_hiexec = 0;
2903 #endif
2904 
2905 	/* Used by NVMM and Xen */
2906 	pmap->pm_enter = NULL;
2907 	pmap->pm_extract = NULL;
2908 	pmap->pm_remove = NULL;
2909 	pmap->pm_sync_pv = NULL;
2910 	pmap->pm_pp_remove_ent = NULL;
2911 	pmap->pm_write_protect = NULL;
2912 	pmap->pm_unwire = NULL;
2913 	pmap->pm_tlb_flush = NULL;
2914 	pmap->pm_data = NULL;
2915 
2916 	/* init the LDT */
2917 	pmap->pm_ldt = NULL;
2918 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2919 
2920 	return pmap;
2921 }
2922 
2923 /*
2924  * pmap_check_ptps: verify that none of the pmap's page table objects
2925  * have any pages allocated to them.
2926  */
2927 static void
2928 pmap_check_ptps(struct pmap *pmap)
2929 {
2930 	int i;
2931 
2932 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2933 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2934 		    "pmap %p level %d still has %d pages",
2935 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2936 	}
2937 }
2938 
2939 static void
2940 pmap_check_inuse(struct pmap *pmap)
2941 {
2942 #ifdef DEBUG
2943 	CPU_INFO_ITERATOR cii;
2944 	struct cpu_info *ci;
2945 
2946 	for (CPU_INFO_FOREACH(cii, ci)) {
2947 		if (ci->ci_pmap == pmap)
2948 			panic("destroying pmap being used");
2949 #if defined(XENPV) && defined(__x86_64__)
2950 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
2951 			if (pmap->pm_pdir[i] != 0 &&
2952 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2953 				printf("pmap_destroy(%p) pmap_kernel %p "
2954 				    "curcpu %d cpu %d ci_pmap %p "
2955 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2956 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2957 				    pmap, pmap_kernel(), curcpu()->ci_index,
2958 				    ci->ci_index, ci->ci_pmap,
2959 				    i, ci->ci_kpm_pdir[i],
2960 				    i, pmap->pm_pdir[i]);
2961 				panic("%s: used pmap", __func__);
2962 			}
2963 		}
2964 #endif
2965 	}
2966 #endif /* DEBUG */
2967 }
2968 
2969 /*
2970  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
2971  * count goes to zero.
2972  *
2973  * => we can be called from pmap_unmap_ptes() with a different, unrelated
2974  *    pmap's lock held.  be careful!
2975  */
2976 void
2977 pmap_destroy(struct pmap *pmap)
2978 {
2979 	int i;
2980 
2981 	/*
2982 	 * drop reference count and verify not in use.
2983 	 */
2984 
2985 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2986 		return;
2987 	}
2988 	pmap_check_inuse(pmap);
2989 
2990 	/*
2991 	 * handle any deferred frees.
2992 	 */
2993 
2994 	mutex_enter(&pmap->pm_lock);
2995 	if (pmap->pm_pve != NULL) {
2996 		pmap_free_pv(pmap, pmap->pm_pve);
2997 		pmap->pm_pve = NULL;
2998 	}
2999 	pmap_drain_pv(pmap);
3000 	mutex_exit(&pmap->pm_lock);
3001 	pmap_update(pmap);
3002 
3003 	/*
3004 	 * Reference count is zero, free pmap resources and then free pmap.
3005 	 */
3006 
3007 	pmap_check_ptps(pmap);
3008 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3009 
3010 #ifdef USER_LDT
3011 	if (pmap->pm_ldt != NULL) {
3012 		/*
3013 		 * No need to switch the LDT; this address space is gone,
3014 		 * nothing is using it.
3015 		 *
3016 		 * No need to lock the pmap for ldt_free (or anything else),
3017 		 * we're the last one to use it.
3018 		 */
3019 		/* XXXAD can't take cpu_lock here - fix soon. */
3020 		mutex_enter(&cpu_lock);
3021 		ldt_free(pmap->pm_ldt_sel);
3022 		mutex_exit(&cpu_lock);
3023 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3024 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3025 	}
3026 #endif
3027 
3028 	for (i = 0; i < PTP_LEVELS - 1; i++) {
3029 		uvm_obj_destroy(&pmap->pm_obj[i], false);
3030 	}
3031 	kcpuset_zero(pmap->pm_cpus);
3032 	kcpuset_zero(pmap->pm_kernel_cpus);
3033 #ifdef XENPV
3034 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3035 #endif
3036 
3037 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3038 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3039 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3040 
3041 	pmap_check_ptps(pmap);
3042 	if (__predict_false(pmap->pm_enter != NULL)) {
3043 		/* XXX make this a different cache */
3044 		pool_cache_destruct_object(&pmap_cache, pmap);
3045 	} else {
3046 		pool_cache_put(&pmap_cache, pmap);
3047 	}
3048 }
3049 
3050 /*
3051  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3052  *
3053  * => caller must hold pmap's lock
3054  * => PTP must be mapped into KVA
3055  * => must be called with kernel preemption disabled
3056  * => does as little work as possible
3057  */
3058 static void
3059 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3060     vaddr_t startva, vaddr_t blkendva)
3061 {
3062 #ifndef XENPV
3063 	struct pv_entry *pve;
3064 	struct vm_page *pg;
3065 	struct pmap_page *pp;
3066 	pt_entry_t opte;
3067 	rb_tree_t *tree;
3068 	vaddr_t va;
3069 	int wired;
3070 	uint8_t oattrs;
3071 	u_int cnt;
3072 
3073 	KASSERT(mutex_owned(&pmap->pm_lock));
3074 	KASSERT(kpreempt_disabled());
3075 	KASSERT(pmap != pmap_kernel());
3076 	KASSERT(ptp->wire_count > 1);
3077 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3078 
3079 	/*
3080 	 * Start at the lowest entered VA, and scan until there are no more
3081 	 * PTEs in the PTPs.
3082 	 */
3083 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3084 	pve = RB_TREE_MIN(tree);
3085 	wired = 0;
3086 	va = (vaddr_t)ptp->uanon;
3087 	pte += ((va - startva) >> PAGE_SHIFT);
3088 
3089 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3090 		/*
3091 		 * No need for an atomic to clear the PTE.  Nothing else can
3092 		 * see the address space any more, and speculative access (if
3093 		 * possible) won't modify the PTEs.  Therefore there's no need to
3094 		 * track the accessed/dirty bits.
3095 		 */
3096 		opte = *pte;
3097 		if (!pmap_valid_entry(opte)) {
3098 			continue;
3099 		}
3100 
3101 		/*
3102 		 * Count the PTE.  If it's not for a managed mapping
3103 		 * there's nothing more to do.
3104 		 */
3105 		cnt--;
3106 		wired -= (opte & PTE_WIRED);
3107 		if ((opte & PTE_PVLIST) == 0) {
3108 #ifndef DOM0OPS
3109 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3110 			    "managed page without PTE_PVLIST for %#"
3111 			    PRIxVADDR, va);
3112 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3113 			    "pv-tracked page without PTE_PVLIST for %#"
3114 			    PRIxVADDR, va);
3115 #endif
3116 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3117 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3118 			    va) == NULL);
3119 			continue;
3120 		}
3121 
3122 		/*
3123 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3124 		 * in the PTP.  If it's for this VA, take advantage of it to
3125 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3126 		 * tree by skipping to the next VA in the tree whenever
3127 		 * there is a match here.  The tree will be cleared out in
3128 		 * one pass before return to pmap_remove_all().
3129 		 */
3130 		oattrs = pmap_pte_to_pp_attrs(opte);
3131 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3132 			pp = pve->pve_pp;
3133 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3134 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3135 			    pp->pp_pte.pte_va != va);
3136 			mutex_spin_enter(&pp->pp_lock);
3137 			pp->pp_attrs |= oattrs;
3138 			LIST_REMOVE(pve, pve_list);
3139 			mutex_spin_exit(&pp->pp_lock);
3140 
3141 			/*
3142 			 * pve won't be touched again until pmap_drain_pv(),
3143 			 * so it's still safe to traverse the tree.
3144 			 */
3145 			pmap_free_pv(pmap, pve);
3146 			pve = RB_TREE_NEXT(tree, pve);
3147 			continue;
3148 		}
3149 
3150 		/*
3151 		 * No entry in the tree so it must be embedded.  Look up the
3152 		 * page and cancel the embedded entry.
3153 		 */
3154 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3155 			pp = VM_PAGE_TO_PP(pg);
3156 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3157 			paddr_t pa = pmap_pte2pa(opte);
3158 			panic("%s: PTE_PVLIST with pv-untracked page"
3159 			    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3160 			    "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3161 		}
3162 		mutex_spin_enter(&pp->pp_lock);
3163 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3164 		KASSERT(pp->pp_pte.pte_va == va);
3165 		pp->pp_attrs |= oattrs;
3166 		pp->pp_pte.pte_ptp = NULL;
3167 		pp->pp_pte.pte_va = 0;
3168 		mutex_spin_exit(&pp->pp_lock);
3169 	}
3170 
3171 	/* PTP now empty - adjust the tree & stats to match. */
3172 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3173 	ptp->wire_count = 1;
3174 #ifdef DIAGNOSTIC
3175 	rb_tree_init(tree, &pmap_rbtree_ops);
3176 #endif
3177 #else	/* !XENPV */
3178 	/*
3179 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3180 	 * I guess the hypervisor keeps track of PTEs too.
3181 	 */
3182 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3183 #endif	/* !XENPV */
3184 }
3185 
3186 /*
3187  * pmap_remove_all: remove all mappings from pmap in bulk.
3188  *
3189  * Ordinarily when removing mappings it's important to hold the UVM object's
3190  * lock, so that pages do not gain a new identity while retaining stale TLB
3191  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3192  * Here it's known that the address space is no longer visible to any user
3193  * process, so we don't need to worry about that.
3194  */
3195 bool
3196 pmap_remove_all(struct pmap *pmap)
3197 {
3198 	struct vm_page *ptps[32];
3199 	vaddr_t va, blkendva;
3200 	struct pmap *pmap2;
3201 	pt_entry_t *ptes;
3202 	pd_entry_t pde __diagused;
3203 	pd_entry_t * const *pdes;
3204 	int lvl __diagused, i, n;
3205 
3206 	/* XXX Can't handle EPT just yet. */
3207 	if (pmap->pm_remove != NULL) {
3208 		return false;
3209 	}
3210 
3211 	for (;;) {
3212 		/* Fetch a block of PTPs from tree. */
3213 		mutex_enter(&pmap->pm_lock);
3214 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3215 		    (void **)ptps, __arraycount(ptps), false);
3216 		if (n == 0) {
3217 			mutex_exit(&pmap->pm_lock);
3218 			break;
3219 		}
3220 
3221 		/* Remove all mappings in the set of PTPs. */
3222 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3223 		for (i = 0; i < n; i++) {
3224 			if (ptps[i]->wire_count == 0) {
3225 				/* It's dead: pmap_update() will expunge. */
3226 				continue;
3227 			}
3228 
3229 			/* Determine range of block. */
3230 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3231 			blkendva = x86_round_pdr(va + 1);
3232 
3233 			/* Make sure everything squares up... */
3234 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3235 			KASSERT(lvl == 1);
3236 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3237 
3238 			/* Zap! */
3239 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3240 			    blkendva);
3241 
3242 			/* PTP should now be unused - free it. */
3243 			KASSERT(ptps[i]->wire_count == 1);
3244 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3245 		}
3246 		pmap_unmap_ptes(pmap, pmap2);
3247 		pmap_drain_pv(pmap);
3248 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3249 		mutex_exit(&pmap->pm_lock);
3250 
3251 		/* Process deferred frees. */
3252 		pmap_update(pmap);
3253 
3254 		/* A breathing point. */
3255 		preempt_point();
3256 	}
3257 
3258 	/* Verify that the pmap is now completely empty. */
3259 	pmap_check_ptps(pmap);
3260 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3261 	    "pmap %p not empty", pmap);
3262 
3263 	return true;
3264 }
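/*
 * On the interpretation of the return value: true means all user
 * mappings have already been thrown away in bulk, so the caller may
 * skip per-range pmap_remove() calls; false (the pm_remove != NULL
 * case above) asks the caller to fall back to removing mappings the
 * ordinary way.
 */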
3265 
3266 #if defined(PMAP_FORK)
3267 /*
3268  * pmap_fork: perform any necessary data structure manipulation when
3269  * a VM space is forked.
3270  */
3271 void
3272 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3273 {
3274 #ifdef USER_LDT
3275 	union descriptor *new_ldt;
3276 	int sel;
3277 
3278 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3279 		return;
3280 	}
3281 
3282 	/*
3283 	 * Copy the LDT into the new process.
3284 	 *
3285 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3286 	 * we'll retry. This will starve if there's a stream of LDT changes
3287 	 * in another thread but that should not happen.
3288 	 */
3289 
3290 retry:
3291 	if (pmap1->pm_ldt != NULL) {
3292 		/* Allocate space for the new process's LDT */
3293 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3294 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3295 		if (new_ldt == NULL) {
3296 			printf("WARNING: %s: unable to allocate LDT space\n",
3297 			    __func__);
3298 			return;
3299 		}
3300 		mutex_enter(&cpu_lock);
3301 		/* Get a GDT slot for it */
3302 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3303 		if (sel == -1) {
3304 			mutex_exit(&cpu_lock);
3305 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3306 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3307 			printf("WARNING: %s: unable to allocate LDT selector\n",
3308 			    __func__);
3309 			return;
3310 		}
3311 	} else {
3312 		/* Wasn't anything there after all. */
3313 		new_ldt = NULL;
3314 		sel = -1;
3315 		mutex_enter(&cpu_lock);
3316 	}
3317 
3318 	/*
3319 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3320 	 */
3321 	if (pmap1->pm_ldt != NULL) {
3322 		if (new_ldt == NULL) {
3323 			/* A wild LDT just appeared. */
3324 			mutex_exit(&cpu_lock);
3325 			goto retry;
3326 		}
3327 
3328 		/* Copy the LDT data and install it in pmap2 */
3329 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3330 		pmap2->pm_ldt = new_ldt;
3331 		pmap2->pm_ldt_sel = sel;
3332 		mutex_exit(&cpu_lock);
3333 	} else {
3334 		if (new_ldt != NULL) {
3335 			/* The LDT disappeared, drop what we did. */
3336 			ldt_free(sel);
3337 			mutex_exit(&cpu_lock);
3338 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3339 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3340 			return;
3341 		}
3342 
3343 		/* We're good, just leave. */
3344 		mutex_exit(&cpu_lock);
3345 	}
3346 #endif /* USER_LDT */
3347 }
3348 #endif /* PMAP_FORK */
3349 
3350 #ifdef USER_LDT
3351 
3352 /*
3353  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3354  * is active, reload LDTR.
3355  */
3356 static void
3357 pmap_ldt_xcall(void *arg1, void *arg2)
3358 {
3359 	struct pmap *pm;
3360 
3361 	kpreempt_disable();
3362 	pm = arg1;
3363 	if (curcpu()->ci_pmap == pm) {
3364 #if defined(SVS)
3365 		if (svs_enabled) {
3366 			svs_ldt_sync(pm);
3367 		} else
3368 #endif
3369 		lldt(pm->pm_ldt_sel);
3370 	}
3371 	kpreempt_enable();
3372 }
3373 
3374 /*
3375  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3376  * in the new selector on all CPUs.
3377  */
3378 void
3379 pmap_ldt_sync(struct pmap *pm)
3380 {
3381 	uint64_t where;
3382 
3383 	KASSERT(mutex_owned(&cpu_lock));
3384 
3385 	pmap_ldt_evcnt.ev_count++;
3386 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3387 	xc_wait(where);
3388 }
3389 
3390 /*
3391  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3392  * restore the default.
3393  */
3394 void
3395 pmap_ldt_cleanup(struct lwp *l)
3396 {
3397 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3398 	union descriptor *ldt;
3399 	int sel;
3400 
3401 	if (__predict_true(pmap->pm_ldt == NULL)) {
3402 		return;
3403 	}
3404 
3405 	mutex_enter(&cpu_lock);
3406 	if (pmap->pm_ldt != NULL) {
3407 		sel = pmap->pm_ldt_sel;
3408 		ldt = pmap->pm_ldt;
3409 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3410 		pmap->pm_ldt = NULL;
3411 		pmap_ldt_sync(pmap);
3412 		ldt_free(sel);
3413 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3414 		    UVM_KMF_WIRED);
3415 	}
3416 	mutex_exit(&cpu_lock);
3417 }
3418 #endif /* USER_LDT */
3419 
3420 /*
3421  * pmap_activate: activate a process' pmap
3422  *
3423  * => must be called with kernel preemption disabled
3424  * => if lwp is the curlwp, then set ci_want_pmapload so that
3425  *    actual MMU context switch will be done by pmap_load() later
3426  */
3427 void
3428 pmap_activate(struct lwp *l)
3429 {
3430 	struct cpu_info *ci;
3431 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3432 
3433 	KASSERT(kpreempt_disabled());
3434 
3435 	ci = curcpu();
3436 
3437 	if (l != ci->ci_curlwp)
3438 		return;
3439 
3440 	KASSERT(ci->ci_want_pmapload == 0);
3441 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3442 
3443 	/*
3444 	 * no need to switch to kernel vmspace because
3445 	 * it's a subset of any vmspace.
3446 	 */
3447 
3448 	if (pmap == pmap_kernel()) {
3449 		ci->ci_want_pmapload = 0;
3450 		return;
3451 	}
3452 
3453 	ci->ci_want_pmapload = 1;
3454 }
3455 
3456 #if defined(XENPV) && defined(__x86_64__)
3457 #define	KASSERT_PDIRPA(pmap) \
3458 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3459 	    pmap == pmap_kernel())
3460 #elif defined(PAE)
3461 #define	KASSERT_PDIRPA(pmap) \
3462 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3463 #elif !defined(XENPV)
3464 #define	KASSERT_PDIRPA(pmap) \
3465 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3466 #else
3467 #define	KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
3468 #endif
3469 
3470 /*
3471  * pmap_reactivate: try to regain reference to the pmap.
3472  *
3473  * => Must be called with kernel preemption disabled.
3474  */
3475 static void
3476 pmap_reactivate(struct pmap *pmap)
3477 {
3478 	struct cpu_info * const ci = curcpu();
3479 	const cpuid_t cid = cpu_index(ci);
3480 
3481 	KASSERT(kpreempt_disabled());
3482 	KASSERT_PDIRPA(pmap);
3483 
3484 	/*
3485 	 * If we still have a lazy reference to this pmap, we can assume
3486 	 * that there was no TLB shootdown for this pmap in the meantime.
3487 	 *
3488 	 * The order of events here is important as we must synchronize
3489 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3490 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3491 	 * change only when the state is TLBSTATE_LAZY.
3492 	 */
3493 
3494 	ci->ci_tlbstate = TLBSTATE_VALID;
3495 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3496 
3497 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3498 		/* We have the reference, state is valid. */
3499 	} else {
3500 		/*
3501 		 * Must reload the TLB, pmap has been changed during
3502 		 * Must reload the TLB: the pmap has been changed while we
3503 		 * were deactivated.
3504 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3505 
3506 		tlbflush();
3507 	}
3508 }
3509 
3510 /*
3511  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3512  * and relevant LDT info.
3513  *
3514  * Ensures that the current process' pmap is loaded on the current CPU's
3515  * MMU and that there are no stale TLB entries.
3516  *
3517  * => The caller should disable kernel preemption or do check-and-retry
3518  *    to prevent a preemption from undoing our efforts.
3519  * => This function may block.
3520  */
3521 void
3522 pmap_load(void)
3523 {
3524 	struct cpu_info *ci;
3525 	struct pmap *pmap, *oldpmap;
3526 	struct lwp *l;
3527 	uint64_t ncsw;
3528 
3529 	kpreempt_disable();
3530  retry:
3531 	ci = curcpu();
3532 	if (!ci->ci_want_pmapload) {
3533 		kpreempt_enable();
3534 		return;
3535 	}
3536 	l = ci->ci_curlwp;
3537 	ncsw = l->l_ncsw;
3538 	__insn_barrier();
3539 
3540 	/* should be able to take ipis. */
3541 	KASSERT(ci->ci_ilevel < IPL_HIGH);
3542 #ifdef XENPV
3543 	/* Check that interrupts are enabled (i.e. no events are masked). */
3544 	KASSERT(x86_read_psl() == 0);
3545 #else
3546 	KASSERT((x86_read_psl() & PSL_I) != 0);
3547 #endif
3548 
3549 	KASSERT(l != NULL);
3550 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3551 	KASSERT(pmap != pmap_kernel());
3552 	oldpmap = ci->ci_pmap;
3553 
3554 	if (pmap == oldpmap) {
3555 		pmap_reactivate(pmap);
3556 		ci->ci_want_pmapload = 0;
3557 		kpreempt_enable();
3558 		return;
3559 	}
3560 
3561 	/*
3562 	 * Acquire a reference to the new pmap and perform the switch.
3563 	 */
3564 
3565 	pmap_reference(pmap);
3566 	pmap_load1(l, pmap, oldpmap);
3567 	ci->ci_want_pmapload = 0;
3568 
3569 	/*
3570 	 * we're now running with the new pmap.  drop the reference
3571 	 * to the old pmap.  if we block, we need to go around again.
3572 	 */
3573 
3574 	pmap_destroy(oldpmap);
3575 	__insn_barrier();
3576 	if (l->l_ncsw != ncsw) {
3577 		goto retry;
3578 	}
3579 
3580 	kpreempt_enable();
3581 }
3582 
3583 /*
3584  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3585  * pmap_load().  It's critically important that this function does not
3586  * block.
3587  */
3588 static void
3589 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3590 {
3591 	struct cpu_info *ci;
3592 	struct pcb *pcb;
3593 	cpuid_t cid;
3594 
3595 	KASSERT(kpreempt_disabled());
3596 
3597 	pcb = lwp_getpcb(l);
3598 	ci = l->l_cpu;
3599 	cid = cpu_index(ci);
3600 
3601 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3602 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3603 
3604 	KASSERT_PDIRPA(oldpmap);
3605 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3606 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3607 
3608 	/*
3609 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3610 	 * with TLB shootdown interrupts, so set the state VALID first,
3611 	 * then register us for shootdown events on this pmap.
3612 	 */
3613 	ci->ci_tlbstate = TLBSTATE_VALID;
3614 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3615 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3616 	ci->ci_pmap = pmap;
3617 
3618 	/*
3619 	 * update tss.  now that we have registered for invalidations
3620 	 * from other CPUs, we're good to load the page tables.
3621 	 */
3622 #ifdef PAE
3623 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3624 #else
3625 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3626 #endif
3627 
3628 #ifdef i386
3629 #ifndef XENPV
3630 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3631 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3632 #endif
3633 #endif
3634 
3635 #if defined(SVS) && defined(USER_LDT)
3636 	if (svs_enabled) {
3637 		svs_ldt_sync(pmap);
3638 	} else
3639 #endif
3640 	lldt(pmap->pm_ldt_sel);
3641 
3642 	cpu_load_pmap(pmap, oldpmap);
3643 }
3644 
3645 /*
3646  * pmap_deactivate: deactivate a process' pmap.
3647  *
3648  * => Must be called with kernel preemption disabled (high IPL is enough).
3649  */
3650 void
3651 pmap_deactivate(struct lwp *l)
3652 {
3653 	struct pmap *pmap;
3654 	struct cpu_info *ci;
3655 
3656 	KASSERT(kpreempt_disabled());
3657 
3658 	if (l != curlwp) {
3659 		return;
3660 	}
3661 
3662 	/*
3663 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3664 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3665 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3666 	 * to kernel preemption or blocking routine in between.
3667 	 */
3668 	pmap_tlb_shootnow();
3669 
3670 	ci = curcpu();
3671 
3672 	if (ci->ci_want_pmapload) {
3673 		/*
3674 		 * ci_want_pmapload means that our pmap is not loaded on
3675 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3676 		 * is always considered loaded.
3677 		 */
3678 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3679 		    != pmap_kernel());
3680 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3681 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3682 
3683 		/*
3684 		 * userspace has not been touched.
3685 		 * nothing to do here.
3686 		 */
3687 
3688 		ci->ci_want_pmapload = 0;
3689 		return;
3690 	}
3691 
3692 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3693 
3694 	if (pmap == pmap_kernel()) {
3695 		return;
3696 	}
3697 
3698 	KASSERT_PDIRPA(pmap);
3699 	KASSERT(ci->ci_pmap == pmap);
3700 
3701 	/*
3702 	 * we aren't interested in TLB invalidations for this pmap,
3703 	 * at least for the time being.
3704 	 */
3705 
3706 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3707 	ci->ci_tlbstate = TLBSTATE_LAZY;
3708 }
3709 
3710 /*
3711  * some misc. functions
3712  */
3713 
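/*
 * pmap_pdes_valid: walk the page directory levels for "va" and check
 * that every PDE needed to reach a PTE is present.
 *
 * => returns false if a PDE is missing; *lastlvl then holds the level
 *    at which the walk stopped
 * => returns true otherwise; *lastlvl is 1 when a normal PTE page is
 *    reachable, or the level at which a large (PTE_PS) page was found,
 *    and *lastpde (if not NULL) holds the last PDE examined
 */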
3714 bool
3715 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3716     int *lastlvl)
3717 {
3718 	unsigned long index;
3719 	pd_entry_t pde;
3720 	int i;
3721 
3722 	for (i = PTP_LEVELS; i > 1; i--) {
3723 		index = pl_i(va, i);
3724 		pde = pdes[i - 2][index];
3725 		if ((pde & PTE_P) == 0) {
3726 			*lastlvl = i;
3727 			return false;
3728 		}
3729 		if (pde & PTE_PS)
3730 			break;
3731 	}
3732 	if (lastpde != NULL)
3733 		*lastpde = pde;
3734 	*lastlvl = i;
3735 	return true;
3736 }
3737 
3738 /*
3739  * pmap_extract: extract a PA for the given VA
3740  */
3741 bool
3742 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3743 {
3744 	pt_entry_t *ptes, pte;
3745 	pd_entry_t pde;
3746 	pd_entry_t * const *pdes;
3747 	struct pmap *pmap2;
3748 	paddr_t pa;
3749 	bool rv;
3750 	int lvl;
3751 
3752 	if (__predict_false(pmap->pm_extract != NULL)) {
3753 		return (*pmap->pm_extract)(pmap, va, pap);
3754 	}
3755 
3756 #ifdef __HAVE_DIRECT_MAP
3757 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3758 		if (pap != NULL) {
3759 			*pap = PMAP_DIRECT_UNMAP(va);
3760 		}
3761 		return true;
3762 	}
3763 #endif
3764 
3765 	rv = false;
3766 	pa = 0;
3767 
3768 	if (pmap != pmap_kernel()) {
3769 		mutex_enter(&pmap->pm_lock);
3770 	}
3771 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3772 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3773 		if (lvl == 2) {
3774 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3775 			rv = true;
3776 		} else {
3777 			KASSERT(lvl == 1);
3778 			pte = ptes[pl1_i(va)];
3779 			if (__predict_true((pte & PTE_P) != 0)) {
3780 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3781 				rv = true;
3782 			}
3783 		}
3784 	}
3785 	pmap_unmap_ptes(pmap, pmap2);
3786 	if (pmap != pmap_kernel()) {
3787 		mutex_exit(&pmap->pm_lock);
3788 	}
3789 	if (pap != NULL) {
3790 		*pap = pa;
3791 	}
3792 
3793 	return rv;
3794 }
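
/*
 * Illustrative use (sketch only, the variables are the caller's):
 * callers should test the return value rather than the output PA,
 * since a PA of 0 can be a valid translation:
 *
 *	paddr_t pa;
 *
 *	if (!pmap_extract(pmap_kernel(), va, &pa))
 *		panic("va %#" PRIxVADDR " not mapped", va);
 */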
3795 
3796 /*
3797  * vtophys: virtual address to physical address.  For use by
3798  * machine-dependent code only.
3799  */
3800 paddr_t
3801 vtophys(vaddr_t va)
3802 {
3803 	paddr_t pa;
3804 
3805 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3806 		return pa;
3807 	return 0;
3808 }
3809 
3810 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3811 
3812 #ifdef XENPV
3813 /*
3814  * vtomach: virtual address to machine address.  For use by
3815  * machine-dependent code only.
3816  */
3817 paddr_t
3818 vtomach(vaddr_t va)
3819 {
3820 	paddr_t pa;
3821 
3822 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3823 		return pa;
3824 	return 0;
3825 }
3826 #endif
3827 
3828 /*
3829  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3830  * determine the bounds of the kernel virtual address space.
3831  */
3832 void
3833 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3834 {
3835 	*startp = virtual_avail;
3836 	*endp = virtual_end;
3837 }
3838 
3839 void
3840 pmap_zero_page(paddr_t pa)
3841 {
3842 #if defined(__HAVE_DIRECT_MAP)
3843 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
3844 #else
3845 #if defined(XENPV)
3846 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3847 		xen_pagezero(pa);
3848 		return;
3849 	}
3850 #endif
3851 	struct cpu_info *ci;
3852 	pt_entry_t *zpte;
3853 	vaddr_t zerova;
3854 
3855 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
3856 
3857 	kpreempt_disable();
3858 
3859 	ci = curcpu();
3860 	zerova = ci->vpage[VPAGE_ZER];
3861 	zpte = ci->vpage_pte[VPAGE_ZER];
3862 
3863 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3864 
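	/*
	 * No direct map: temporarily map the target page at this CPU's
	 * private VPAGE_ZER window, zero it through that window, then
	 * (on XENPV or DIAGNOSTIC kernels) tear the mapping down again.
	 */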
3865 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3866 	pmap_pte_flush();
3867 	pmap_update_pg(zerova);		/* flush TLB */
3868 
3869 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
3870 
3871 #if defined(DIAGNOSTIC) || defined(XENPV)
3872 	pmap_pte_set(zpte, 0);				/* zap ! */
3873 	pmap_pte_flush();
3874 #endif
3875 
3876 	kpreempt_enable();
3877 #endif /* defined(__HAVE_DIRECT_MAP) */
3878 }
3879 
3880 void
3881 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3882 {
3883 #if defined(__HAVE_DIRECT_MAP)
3884 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3885 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3886 
3887 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3888 #else
3889 #if defined(XENPV)
3890 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3891 		xen_copy_page(srcpa, dstpa);
3892 		return;
3893 	}
3894 #endif
3895 	struct cpu_info *ci;
3896 	pt_entry_t *srcpte, *dstpte;
3897 	vaddr_t srcva, dstva;
3898 
3899 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
3900 
3901 	kpreempt_disable();
3902 
3903 	ci = curcpu();
3904 	srcva = ci->vpage[VPAGE_SRC];
3905 	dstva = ci->vpage[VPAGE_DST];
3906 	srcpte = ci->vpage_pte[VPAGE_SRC];
3907 	dstpte = ci->vpage_pte[VPAGE_DST];
3908 
3909 	KASSERT(*srcpte == 0 && *dstpte == 0);
3910 
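	/*
	 * Map both pages at this CPU's private copy windows; PTE_D is
	 * preset on the destination so the hardware need not update the
	 * dirty bit during the copy.
	 */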
3911 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3912 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
3913 	pmap_pte_flush();
3914 	pmap_update_pg(srcva);
3915 	pmap_update_pg(dstva);
3916 
3917 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
3918 
3919 #if defined(DIAGNOSTIC) || defined(XENPV)
3920 	pmap_pte_set(srcpte, 0);
3921 	pmap_pte_set(dstpte, 0);
3922 	pmap_pte_flush();
3923 #endif
3924 
3925 	kpreempt_enable();
3926 #endif /* defined(__HAVE_DIRECT_MAP) */
3927 }
3928 
3929 static pt_entry_t *
3930 pmap_map_ptp(struct vm_page *ptp)
3931 {
3932 #ifdef __HAVE_DIRECT_MAP
3933 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3934 #else
3935 	struct cpu_info *ci;
3936 	pt_entry_t *ptppte;
3937 	vaddr_t ptpva;
3938 
3939 	KASSERT(kpreempt_disabled());
3940 
3941 #ifndef XENPV
3942 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
3943 #else
3944 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
3945 #endif
3946 
3947 	ci = curcpu();
3948 	ptpva = ci->vpage[VPAGE_PTP];
3949 	ptppte = ci->vpage_pte[VPAGE_PTP];
3950 
3951 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3952 
3953 	pmap_pte_flush();
3954 	pmap_update_pg(ptpva);
3955 
3956 	return (pt_entry_t *)ptpva;
3957 #endif
3958 }
3959 
3960 static void
3961 pmap_unmap_ptp(void)
3962 {
3963 #ifndef __HAVE_DIRECT_MAP
3964 #if defined(DIAGNOSTIC) || defined(XENPV)
3965 	struct cpu_info *ci;
3966 	pt_entry_t *pte;
3967 
3968 	KASSERT(kpreempt_disabled());
3969 
3970 	ci = curcpu();
3971 	pte = ci->vpage_pte[VPAGE_PTP];
3972 
3973 	if (*pte != 0) {
3974 		pmap_pte_set(pte, 0);
3975 		pmap_pte_flush();
3976 	}
3977 #endif
3978 #endif
3979 }
3980 
3981 static pt_entry_t *
3982 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3983 {
3984 
3985 	KASSERT(kpreempt_disabled());
3986 	if (pmap_is_curpmap(pmap)) {
3987 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3988 	}
3989 	KASSERT(ptp != NULL);
3990 	return pmap_map_ptp(ptp) + pl1_pi(va);
3991 }
3992 
3993 static void
3994 pmap_unmap_pte(void)
3995 {
3996 
3997 	KASSERT(kpreempt_disabled());
3998 
3999 	pmap_unmap_ptp();
4000 }
4001 
4002 /*
4003  * p m a p   r e m o v e   f u n c t i o n s
4004  *
4005  * functions that remove mappings
4006  */
4007 
4008 /*
4009  * pmap_remove_ptes: remove PTEs from a PTP
4010  *
4011  * => caller must hold pmap's lock
4012  * => PTP must be mapped into KVA
4013  * => PTP should be null if pmap == pmap_kernel()
4014  * => must be called with kernel preemption disabled
4015  * => TLB shootdowns are issued per-PTE by pmap_remove_pte() as needed
4016  */
4017 static void
4018 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4019     vaddr_t startva, vaddr_t endva)
4020 {
4021 	pt_entry_t *pte = (pt_entry_t *)ptpva;
4022 
4023 	KASSERT(mutex_owned(&pmap->pm_lock));
4024 	KASSERT(kpreempt_disabled());
4025 
4026 	/*
4027 	 * mappings are very often sparse, so clip the given range to the
4028 	 * range of PTEs that are known present in the PTP.
4029 	 */
4030 	pmap_ptp_range_clip(ptp, &startva, &pte);
4031 
4032 	/*
4033 	 * note that ptpva points to the PTE that maps startva.   this may
4034 	 * or may not be the first PTE in the PTP.
4035 	 *
4036 	 * we loop through the PTP while there are still PTEs to look at
4037 	 * and the wire_count is greater than 1 (because we use the wire_count
4038 	 * to keep track of the number of real PTEs in the PTP).
4039 	 */
4040 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4041 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4042 		startva += PAGE_SIZE;
4043 		pte++;
4044 	}
4045 }
4046 
4047 /*
4048  * pmap_remove_pte: remove a single PTE from a PTP.
4049  *
4050  * => caller must hold pmap's lock
4051  * => PTP must be mapped into KVA
4052  * => PTP should be null if pmap == pmap_kernel()
4053  * => returns true if we removed a mapping
4054  * => must be called with kernel preemption disabled
4055  */
4056 static bool
4057 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4058     vaddr_t va)
4059 {
4060 	struct pv_entry *pve;
4061 	struct vm_page *pg;
4062 	struct pmap_page *pp;
4063 	pt_entry_t opte;
4064 
4065 	KASSERT(mutex_owned(&pmap->pm_lock));
4066 	KASSERT(kpreempt_disabled());
4067 
4068 	if (!pmap_valid_entry(*pte)) {
4069 		/* VA not mapped. */
4070 		return false;
4071 	}
4072 
4073 	/* Atomically save the old PTE and zap it. */
4074 	opte = pmap_pte_testset(pte, 0);
4075 	if (!pmap_valid_entry(opte)) {
4076 		return false;
4077 	}
4078 
4079 	pmap_exec_account(pmap, va, opte, 0);
4080 	pmap_stats_update_bypte(pmap, 0, opte);
4081 
4082 	if (ptp) {
4083 		/*
4084 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4085 		 */
4086 		ptp->wire_count--;
4087 		if (ptp->wire_count <= 1) {
4088 			opte |= PTE_A;
4089 		}
4090 	}
4091 
4092 	if ((opte & PTE_A) != 0) {
4093 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4094 	}
4095 
4096 	/*
4097 	 * If we are not on a pv list, we are done.
4098 	 */
4099 	if ((opte & PTE_PVLIST) == 0) {
4100 #ifndef DOM0OPS
4101 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4102 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4103 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4104 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4105 #endif
4106 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4107 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4108 		return true;
4109 	}
4110 
4111 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4112 		pp = VM_PAGE_TO_PP(pg);
4113 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4114 		paddr_t pa = pmap_pte2pa(opte);
4115 		panic("%s: PTE_PVLIST with pv-untracked page"
4116 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4117 		    __func__, va, pa, atop(pa));
4118 	}
4119 
4120 	/* Sync R/M bits. */
4121 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4122 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4123 	return true;
4124 }
4125 
4126 static void
4127 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4128 {
4129 	pt_entry_t *ptes;
4130 	pd_entry_t pde;
4131 	pd_entry_t * const *pdes;
4132 	bool result;
4133 	vaddr_t blkendva, va = sva;
4134 	struct vm_page *ptp;
4135 	struct pmap *pmap2;
4136 	int lvl;
4137 
4138 	KASSERT(mutex_owned(&pmap->pm_lock));
4139 
4140 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4141 
4142 	/*
4143 	 * removing one page?  take shortcut function.
4144 	 */
4145 
4146 	if (va + PAGE_SIZE == eva) {
4147 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4148 			KASSERT(lvl == 1);
4149 
4150 			/* Get PTP if non-kernel mapping. */
4151 			if (pmap != pmap_kernel()) {
4152 				ptp = pmap_find_ptp(pmap, va, 1);
4153 				KASSERTMSG(ptp != NULL,
4154 				    "%s: unmanaged PTP detected", __func__);
4155 			} else {
4156 				/* Never free kernel PTPs. */
4157 				ptp = NULL;
4158 			}
4159 
4160 			result = pmap_remove_pte(pmap, ptp,
4161 			    &ptes[pl1_i(va)], va);
4162 
4163 			/*
4164 			 * if mapping removed and the PTP is no longer
4165 			 * being used, free it!
4166 			 */
4167 
4168 			if (result && ptp && ptp->wire_count <= 1)
4169 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4170 		}
4171 	} else for (/* null */ ; va < eva ; va = blkendva) {
4172 		/* determine range of block */
4173 		blkendva = x86_round_pdr(va+1);
4174 		if (blkendva > eva)
4175 			blkendva = eva;
4176 
4177 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4178 			/* Skip a range corresponding to an invalid pde. */
4179 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4180 			continue;
4181 		}
4182 		KASSERT(lvl == 1);
4183 
4184 		/* Get PTP if non-kernel mapping. */
4185 		if (pmap != pmap_kernel()) {
4186 			ptp = pmap_find_ptp(pmap, va, 1);
4187 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4188 			    __func__);
4189 		} else {
4190 			/* Never free kernel PTPs. */
4191 			ptp = NULL;
4192 		}
4193 
4194 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4195 		    blkendva);
4196 
4197 		/* If PTP is no longer being used, free it. */
4198 		if (ptp && ptp->wire_count <= 1) {
4199 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4200 		}
4201 	}
4202 	pmap_unmap_ptes(pmap, pmap2);
4203 	pmap_drain_pv(pmap);
4204 }
4205 
4206 /*
4207  * pmap_remove: mapping removal function.
4208  *
4209  * => caller should not be holding any pmap locks
4210  */
4211 void
4212 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4213 {
4214 	if (__predict_false(pmap->pm_remove != NULL)) {
4215 		(*pmap->pm_remove)(pmap, sva, eva);
4216 		return;
4217 	}
4218 
4219 	mutex_enter(&pmap->pm_lock);
4220 	pmap_remove_locked(pmap, sva, eva);
4221 	mutex_exit(&pmap->pm_lock);
4222 }
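
/*
 * Illustrative caller pattern (sketch only; "map" stands for the
 * caller's struct vm_map): a removal is normally paired with
 * pmap_update() so the deferred shootdowns and PTP frees are processed:
 *
 *	pmap_remove(vm_map_pmap(map), sva, eva);
 *	pmap_update(vm_map_pmap(map));
 */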
4223 
4224 /*
4225  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4226  *
4227  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4228  * => Caller should disable kernel preemption.
4229  * => issues tlb shootdowns if necessary.
4230  */
4231 static int
4232 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4233     pt_entry_t *optep)
4234 {
4235 	struct pmap *pmap;
4236 	struct vm_page *ptp;
4237 	vaddr_t va;
4238 	pt_entry_t *ptep;
4239 	pt_entry_t opte;
4240 	pt_entry_t npte;
4241 	pt_entry_t expect;
4242 	bool need_shootdown;
4243 
4244 	ptp = pvpte->pte_ptp;
4245 	va = pvpte->pte_va;
4246 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4247 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4248 	pmap = ptp_to_pmap(ptp);
4249 	KASSERT(kpreempt_disabled());
4250 
4251 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4252 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4253 		    optep);
4254 	}
4255 
4256 	expect = pmap_pa2pte(pa) | PTE_P;
4257 
4258 	if (clearbits != ~0) {
4259 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4260 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4261 	}
4262 
4263 	ptep = pmap_map_pte(pmap, ptp, va);
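	/* Update the PTE with a CAS loop, retrying if another CPU races us. */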
4264 	do {
4265 		opte = *ptep;
4266 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4267 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4268 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4269 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4270 			/*
4271 			 * We lost a race with a V->P operation like
4272 			 * pmap_remove().  Wait for the competitor to finish
4273 			 * reflecting the pte bits into pp_attrs.
4274 			 */
4275 			pmap_unmap_pte();
4276 			return EAGAIN;
4277 		}
4278 
4279 		/*
4280 		 * Check if there's anything to do on this PTE.
4281 		 */
4282 		if ((opte & clearbits) == 0) {
4283 			need_shootdown = false;
4284 			break;
4285 		}
4286 
4287 		/*
4288 		 * We need a shootdown if the PTE is cached (PTE_A) ...
4289 		 * ... Unless we are clearing only the PTE_W bit and
4290 		 * it isn't cached as RW (PTE_D).
4291 		 */
4292 		need_shootdown = (opte & PTE_A) != 0 &&
4293 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4294 
4295 		npte = opte & ~clearbits;
4296 
4297 		/*
4298 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4299 		 */
4300 		if (need_shootdown) {
4301 			npte &= ~(PTE_A | PTE_D);
4302 		}
4303 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4304 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4305 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4306 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4307 
4308 	if (need_shootdown) {
4309 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4310 	}
4311 	pmap_unmap_pte();
4312 
4313 	*oattrs = pmap_pte_to_pp_attrs(opte);
4314 	if (optep != NULL)
4315 		*optep = opte;
4316 	return 0;
4317 }
4318 
4319 static void
4320 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4321     vaddr_t va)
4322 {
4323 	struct pmap *pmap2;
4324 	pt_entry_t *ptes;
4325 	pd_entry_t * const *pdes;
4326 
4327 	KASSERT(mutex_owned(&pmap->pm_lock));
4328 
4329 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4330 	pmap_stats_update_bypte(pmap, 0, opte);
4331 	ptp->wire_count--;
4332 	if (ptp->wire_count <= 1) {
4333 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4334 	}
4335 	pmap_unmap_ptes(pmap, pmap2);
4336 }
4337 
4338 static void
4339 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4340 {
4341 	struct pv_pte *pvpte;
4342 	struct vm_page *ptp;
4343 	uintptr_t sum;
4344 	uint8_t oattrs;
4345 	bool locked;
4346 
4347 	/*
4348 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4349 	 * pmap_remove_all() was called before amap_wipeout() for a process-
4350 	 * private amap (common).  The page being removed must be on the way
4351 	 * out, so we don't have to worry about concurrent attempts to enter
4352 	 * it (otherwise the caller either doesn't care or has screwed up).
4353 	 */
4354 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4355 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4356 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4357 	if (sum == 0) {
4358 		return;
4359 	}
4360 
4361 	kpreempt_disable();
4362 	for (;;) {
4363 		struct pmap *pmap;
4364 		struct pv_entry *pve;
4365 		pt_entry_t opte;
4366 		vaddr_t va;
4367 
4368 		mutex_spin_enter(&pp->pp_lock);
4369 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4370 			mutex_spin_exit(&pp->pp_lock);
4371 			break;
4372 		}
4373 
4374 		/*
4375 		 * Add a reference to the pmap before clearing the pte.
4376 		 * Otherwise the pmap can disappear behind us.
4377 		 */
4378 		ptp = pvpte->pte_ptp;
4379 		pmap = ptp_to_pmap(ptp);
4380 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4381 		if (ptp != NULL) {
4382 			pmap_reference(pmap);
4383 		}
4384 
4385 		/*
4386 		 * Now try to lock it.  We need a direct handoff between
4387 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4388 		 * and kept associated with this pmap.  If that can't be
4389 		 * had, wait for the pmap's lock to become free and then
4390 		 * retry.
4391 		 */
4392 		locked = mutex_tryenter(&pmap->pm_lock);
4393 		mutex_spin_exit(&pp->pp_lock);
4394 		if (!locked) {
4395 			mutex_enter(&pmap->pm_lock);
4396 			/* nothing, just wait for it */
4397 			mutex_exit(&pmap->pm_lock);
4398 			if (ptp != NULL) {
4399 				pmap_destroy(pmap);
4400 			}
4401 			continue;
4402 		}
4403 		va = pvpte->pte_va;
4404 
4405 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4406 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4407 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4408 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4409 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4410 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4411 
4412 #ifdef DEBUG
4413 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4414 		rb_tree_t *tree = (ptp != NULL ?
4415 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4416 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4417 		if (pve == NULL) {
4418 			KASSERTMSG(&pp->pp_pte == pvpte,
4419 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4420 			    va, pmap, ptp, pvpte, pve);
4421 		} else {
4422 			KASSERTMSG(&pve->pve_pte == pvpte,
4423 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4424 			    va, pmap, ptp, pvpte, pve);
4425 		}
4426 #endif
4427 
4428 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4429 			panic("pmap_pp_remove: mapping not present");
4430 		}
4431 
4432 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4433 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4434 
4435 		/* Update the PTP reference count. Free if last reference. */
4436 		if (ptp != NULL) {
4437 			KASSERT(pmap != pmap_kernel());
4438 			pmap_tlb_shootnow();
4439 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4440 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4441 			} else {
4442 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4443 			}
4444 		} else {
4445 			KASSERT(pmap == pmap_kernel());
4446 			pmap_stats_update_bypte(pmap, 0, opte);
4447 		}
4448 		pmap_tlb_shootnow();
4449 		pmap_drain_pv(pmap);
4450 		mutex_exit(&pmap->pm_lock);
4451 		if (ptp != NULL) {
4452 			pmap_destroy(pmap);
4453 		}
4454 	}
4455 	kpreempt_enable();
4456 }
4457 
4458 /*
4459  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4460  *
4461  * => R/M bits are sync'd back to attrs
4462  */
4463 void
4464 pmap_page_remove(struct vm_page *pg)
4465 {
4466 	struct pmap_page *pp;
4467 	paddr_t pa;
4468 
4469 	pp = VM_PAGE_TO_PP(pg);
4470 	pa = VM_PAGE_TO_PHYS(pg);
4471 	pmap_pp_remove(pp, pa);
4472 }
4473 
4474 /*
4475  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4476  * that map it
4477  */
4478 void
4479 pmap_pv_remove(paddr_t pa)
4480 {
4481 	struct pmap_page *pp;
4482 
4483 	pp = pmap_pv_tracked(pa);
4484 	if (pp == NULL)
4485 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4486 	pmap_pp_remove(pp, pa);
4487 }
4488 
4489 /*
4490  * p m a p   a t t r i b u t e  f u n c t i o n s
4491  * functions that test/change managed page's attributes
4492  * since a page can be mapped multiple times we must check each PTE that
4493  * maps it by going down the pv lists.
4494  */
4495 
4496 /*
4497  * pmap_test_attrs: test a page's attributes
4498  */
4499 bool
4500 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4501 {
4502 	struct pmap_page *pp;
4503 	struct pv_pte *pvpte;
4504 	struct pmap *pmap;
4505 	uint8_t oattrs;
4506 	u_int result;
4507 	paddr_t pa;
4508 
4509 	pp = VM_PAGE_TO_PP(pg);
4510 	if ((pp->pp_attrs & testbits) != 0) {
4511 		return true;
4512 	}
4513 	pa = VM_PAGE_TO_PHYS(pg);
4514  startover:
4515 	mutex_spin_enter(&pp->pp_lock);
4516 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4517 		if ((pp->pp_attrs & testbits) != 0) {
4518 			break;
4519 		}
4520 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4521 			/*
4522 			 * raced with a V->P operation.  wait for the other
4523 			 * side to finish by acquiring pmap's lock.  if we
4524 			 * don't wait, updates to pp_attrs by the other side
4525 			 * may go unseen.
4526 			 */
4527 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4528 			pmap_reference(pmap);
4529 			mutex_spin_exit(&pp->pp_lock);
4530 			mutex_enter(&pmap->pm_lock);
4531 			/* nothing. */
4532 			mutex_exit(&pmap->pm_lock);
4533 			pmap_destroy(pmap);
4534 			goto startover;
4535 		}
4536 		pp->pp_attrs |= oattrs;
4537 	}
4538 	result = pp->pp_attrs & testbits;
4539 	mutex_spin_exit(&pp->pp_lock);
4540 
4541 	/*
4542 	 * note that we exit the for loop early (with a non-NULL pvpte) once
4543 	 * we have found the bits we are testing for.
4544 	 */
4545 
4546 	return result != 0;
4547 }
4548 
4549 static bool
4550 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4551 {
4552 	struct pv_pte *pvpte;
4553 	struct pmap *pmap;
4554 	uint8_t oattrs;
4555 	u_int result;
4556 
4557 startover:
4558 	mutex_spin_enter(&pp->pp_lock);
4559 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4560 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4561 			/*
4562 			 * raced with a V->P operation.  wait for the other
4563 			 * side to finish by acquiring pmap's lock.  it is
4564 			 * probably unmapping the page, and it will be gone
4565 			 * when the loop is restarted.
4566 			 */
4567 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4568 			pmap_reference(pmap);
4569 			mutex_spin_exit(&pp->pp_lock);
4570 			mutex_enter(&pmap->pm_lock);
4571 			/* nothing. */
4572 			mutex_exit(&pmap->pm_lock);
4573 			pmap_destroy(pmap);
4574 			goto startover;
4575 		}
4576 		pp->pp_attrs |= oattrs;
4577 	}
4578 	result = pp->pp_attrs & clearbits;
4579 	pp->pp_attrs &= ~clearbits;
4580 	pmap_tlb_shootnow();
4581 	mutex_spin_exit(&pp->pp_lock);
4582 
4583 	return result != 0;
4584 }
4585 
4586 /*
4587  * pmap_clear_attrs: clear the specified attribute for a page.
4588  *
4589  * => we return true if we cleared one of the bits we were asked to
4590  */
4591 bool
4592 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4593 {
4594 	struct pmap_page *pp;
4595 	paddr_t pa;
4596 
4597 	pp = VM_PAGE_TO_PP(pg);
4598 	pa = VM_PAGE_TO_PHYS(pg);
4599 
4600 	/*
4601 	 * If this is a new page, assert it has no mappings and simply zap
4602 	 * the stored attributes without taking any locks.
4603 	 */
4604 	if ((pg->flags & PG_FAKE) != 0) {
4605 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4606 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4607 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4608 		atomic_store_relaxed(&pp->pp_attrs, 0);
4609 		return false;
4610 	} else {
4611 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4612 	}
4613 }
4614 
4615 /*
4616  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4617  * pv-tracked page.
4618  */
4619 bool
4620 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4621 {
4622 	struct pmap_page *pp;
4623 
4624 	pp = pmap_pv_tracked(pa);
4625 	if (pp == NULL)
4626 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4627 
4628 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4629 }
4630 
4631 /*
4632  * p m a p   p r o t e c t i o n   f u n c t i o n s
4633  */
4634 
4635 /*
4636  * pmap_page_protect: change the protection of all recorded mappings
4637  * of a managed page
4638  *
4639  * => NOTE: this is an inline function in pmap.h
4640  */
4641 
4642 /* see pmap.h */
4643 
4644 /*
4645  * pmap_pv_protect: change the protection of all recorded mappings
4646  * of an unmanaged pv-tracked page
4647  *
4648  * => NOTE: this is an inline function in pmap.h
4649  */
4650 
4651 /* see pmap.h */
4652 
4653 /*
4654  * pmap_protect: set the protection of the pages in a pmap
4655  *
4656  * => NOTE: this is an inline function in pmap.h
4657  */
4658 
4659 /* see pmap.h */
4660 
4661 /*
4662  * pmap_write_protect: write-protect pages in a pmap.
4663  *
4664  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4665  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4666  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4667  * present the page will still be considered as a kernel page, and the privilege
4668  * separation will be enforced correctly.
4669  */
4670 void
4671 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4672 {
4673 	pt_entry_t bit_rem, bit_put;
4674 	pt_entry_t *ptes;
4675 	pt_entry_t * const *pdes;
4676 	struct pmap *pmap2;
4677 	vaddr_t blockend, va;
4678 	int lvl, i;
4679 
4680 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4681 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4682 		return;
4683 	}
4684 
4685 	bit_rem = 0;
4686 	if (!(prot & VM_PROT_WRITE))
4687 		bit_rem = PTE_W;
4688 
4689 	bit_put = 0;
4690 	if (!(prot & VM_PROT_EXECUTE))
4691 		bit_put = pmap_pg_nx;
4692 
4693 	sva &= ~PAGE_MASK;
4694 	eva &= ~PAGE_MASK;
4695 
4696 	/*
4697 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4698 	 * be touching PV entries or stats, and kernel PDEs aren't
4699 	 * freed.
4700 	 */
4701 	if (pmap != pmap_kernel()) {
4702 		mutex_enter(&pmap->pm_lock);
4703 	}
4704 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4705 
4706 	for (va = sva ; va < eva; va = blockend) {
4707 		pt_entry_t *spte, *epte;
4708 
4709 		blockend = x86_round_pdr(va + 1);
4710 		if (blockend > eva)
4711 			blockend = eva;
4712 
4713 		/* Is it a valid block? */
4714 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4715 			continue;
4716 		}
4717 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4718 		KASSERT(lvl == 1);
4719 
4720 		spte = &ptes[pl1_i(va)];
4721 		epte = &ptes[pl1_i(blockend)];
4722 
4723 		for (i = 0; spte < epte; spte++, i++) {
4724 			pt_entry_t opte, npte;
4725 
4726 			do {
4727 				opte = *spte;
4728 				if (!pmap_valid_entry(opte)) {
4729 					goto next;
4730 				}
4731 				npte = (opte & ~bit_rem) | bit_put;
4732 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4733 
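			/*
			 * Only mappings that may be cached writable in the
			 * TLB (PTE_D set) need an immediate shootdown: a
			 * clean entry forces the CPU to re-walk the PTE (to
			 * set PTE_D) on the first write, and it then sees
			 * the updated protection.
			 */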
4734 			if ((opte & PTE_D) != 0) {
4735 				vaddr_t tva = va + x86_ptob(i);
4736 				pmap_tlb_shootdown(pmap, tva, opte,
4737 				    TLBSHOOT_WRITE_PROTECT);
4738 			}
4739 next:;
4740 		}
4741 	}
4742 
4743 	/* Release pmap. */
4744 	pmap_unmap_ptes(pmap, pmap2);
4745 	if (pmap != pmap_kernel()) {
4746 		mutex_exit(&pmap->pm_lock);
4747 	}
4748 }
4749 
4750 /*
4751  * pmap_unwire: clear the wired bit in the PTE.
4752  *
4753  * => Mapping should already be present.
4754  */
4755 void
4756 pmap_unwire(struct pmap *pmap, vaddr_t va)
4757 {
4758 	pt_entry_t *ptes, *ptep, opte;
4759 	pd_entry_t * const *pdes;
4760 	struct pmap *pmap2;
4761 	int lvl;
4762 
4763 	if (__predict_false(pmap->pm_unwire != NULL)) {
4764 		(*pmap->pm_unwire)(pmap, va);
4765 		return;
4766 	}
4767 
4768 	/*
4769 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4770 	 * statistics.
4771 	 */
4772 	mutex_enter(&pmap->pm_lock);
4773 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4774 
4775 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4776 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4777 	}
4778 	KASSERT(lvl == 1);
4779 
4780 	ptep = &ptes[pl1_i(va)];
4781 	opte = *ptep;
4782 	KASSERT(pmap_valid_entry(opte));
4783 
4784 	if (opte & PTE_WIRED) {
4785 		pt_entry_t npte = opte & ~PTE_WIRED;
4786 
4787 		opte = pmap_pte_testset(ptep, npte);
4788 		pmap_stats_update_bypte(pmap, npte, opte);
4789 	} else {
4790 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4791 		    " did not change!\n", __func__, pmap, va);
4792 	}
4793 
4794 	/* Release pmap. */
4795 	pmap_unmap_ptes(pmap, pmap2);
4796 	mutex_exit(&pmap->pm_lock);
4797 }
4798 
4799 /*
4800  * pmap_copy: copy mappings from one pmap to another
4801  *
4802  * => optional function
4803  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4804  */
4805 
4806 /*
4807  * defined as macro in pmap.h
4808  */
4809 
4810 __strict_weak_alias(pmap_enter, pmap_enter_default);
4811 
4812 int
4813 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4814     u_int flags)
4815 {
4816 	if (__predict_false(pmap->pm_enter != NULL)) {
4817 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4818 	}
4819 
4820 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4821 }
4822 
4823 /*
4824  * pmap_enter: enter a mapping into a pmap
4825  *
4826  * => must be done "now" ... no lazy-evaluation
4827  */
4828 int
4829 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4830 	   vm_prot_t prot, u_int flags, int domid)
4831 {
4832 	pt_entry_t *ptes, opte, npte;
4833 	pt_entry_t *ptep;
4834 	pd_entry_t * const *pdes;
4835 	struct vm_page *ptp;
4836 	struct vm_page *new_pg, *old_pg;
4837 	struct pmap_page *new_pp, *old_pp;
4838 	struct pv_entry *old_pve, *new_pve;
4839 	bool wired = (flags & PMAP_WIRED) != 0;
4840 	struct pmap *pmap2;
4841 	struct pmap_ptparray pt;
4842 	int error;
4843 	bool getptp, samepage, new_embedded;
4844 	rb_tree_t *tree;
4845 
4846 	KASSERT(pmap_initialized);
4847 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4848 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4849 	    PRIxVADDR " over PDP!", __func__, va);
4850 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4851 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4852 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4853 
4854 #ifdef XENPV
4855 	KASSERT(domid == DOMID_SELF || pa == 0);
4856 #endif
4857 
4858 	npte = ma | protection_codes[prot] | PTE_P;
4859 	npte |= pmap_pat_flags(flags);
4860 	if (wired)
4861 		npte |= PTE_WIRED;
4862 	if (va < VM_MAXUSER_ADDRESS)
4863 		npte |= PTE_U;
4864 
4865 	if (pmap == pmap_kernel())
4866 		npte |= pmap_pg_g;
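	/*
	 * Preset PTE_A/PTE_D from the access type in "flags" so the CPU
	 * does not have to set them itself on first access.
	 */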
4867 	if (flags & VM_PROT_ALL) {
4868 		npte |= PTE_A;
4869 		if (flags & VM_PROT_WRITE) {
4870 			KASSERT((npte & PTE_W) != 0);
4871 			npte |= PTE_D;
4872 		}
4873 	}
4874 
4875 #ifdef XENPV
4876 	if (domid != DOMID_SELF)
4877 		new_pg = NULL;
4878 	else
4879 #endif
4880 		new_pg = PHYS_TO_VM_PAGE(pa);
4881 
4882 	if (new_pg != NULL) {
4883 		/* This is a managed page */
4884 		npte |= PTE_PVLIST;
4885 		new_pp = VM_PAGE_TO_PP(new_pg);
4886 		PMAP_CHECK_PP(new_pp);
4887 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4888 		/* This is an unmanaged pv-tracked page */
4889 		npte |= PTE_PVLIST;
4890 		PMAP_CHECK_PP(new_pp);
4891 	} else {
4892 		new_pp = NULL;
4893 	}
4894 
4895 	/* Begin by locking the pmap. */
4896 	mutex_enter(&pmap->pm_lock);
4897 
4898 	/* Look up the PTP.  Allocate if none present. */
4899 	ptp = NULL;
4900 	getptp = false;
4901 	if (pmap != pmap_kernel()) {
4902 		ptp = pmap_find_ptp(pmap, va, 1);
4903 		if (ptp == NULL) {
4904 			getptp = true;
4905 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
4906 			if (error != 0) {
4907 				if (flags & PMAP_CANFAIL) {
4908 					mutex_exit(&pmap->pm_lock);
4909 					return error;
4910 				}
4911 				panic("%s: get ptp failed, error=%d", __func__,
4912 				    error);
4913 			}
4914 		}
4915 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
4916 	} else {
4917 		/* Embedded PV entries rely on this. */
4918 		KASSERT(va != 0);
4919 		tree = &pmap_kernel_rb;
4920 	}
4921 
4922 	/*
4923 	 * Look up the old PV entry at this VA (if any), and insert a new PV
4924 	 * entry if required for the new mapping.  Temporarily track the old
4925 	 * and new mappings concurrently.  Only after the old mapping is
4926 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
4927 	 * our picture of modified/accessed state for either page could get
4928 	 * out of sync (we need any P->V operation for either page to stall
4929 	 * on pmap->pm_lock until done here).
4930 	 */
4931 	new_pve = NULL;
4932 	old_pve = NULL;
4933 	samepage = false;
4934 	new_embedded = false;
4935 
4936 	if (new_pp != NULL) {
4937 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
4938 		    &old_pve, &samepage, &new_embedded, tree);
4939 
4940 		/*
4941 		 * If a new pv_entry was needed and none was available, we
4942 		 * can go no further.
4943 		 */
4944 		if (error != 0) {
4945 			if (flags & PMAP_CANFAIL) {
4946 				if (getptp) {
4947 					pmap_unget_ptp(pmap, &pt);
4948 				}
4949 				mutex_exit(&pmap->pm_lock);
4950 				return error;
4951 			}
4952 			panic("%s: alloc pve failed", __func__);
4953 		}
4954 	} else {
4955 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4956 	}
4957 
4958 	/* Map PTEs into address space. */
4959 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4960 
4961 	/* Install any newly allocated PTPs. */
4962 	if (getptp) {
4963 		pmap_install_ptp(pmap, &pt, va, pdes);
4964 	}
4965 
4966 	/* Check if there is an existing mapping. */
4967 	ptep = &ptes[pl1_i(va)];
4968 	opte = *ptep;
4969 	bool have_oldpa = pmap_valid_entry(opte);
4970 	paddr_t oldpa = pmap_pte2pa(opte);
4971 
4972 	/*
4973 	 * Update the pte.
4974 	 */
4975 	do {
4976 		opte = *ptep;
4977 
4978 		/*
4979 		 * if the same page, inherit PTE_A and PTE_D.
4980 		 */
4981 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
4982 			npte |= opte & (PTE_A | PTE_D);
4983 		}
4984 #if defined(XENPV)
4985 		if (domid != DOMID_SELF) {
4986 			/* pmap_pte_cas with error handling */
4987 			int s = splvm();
4988 			if (opte != *ptep) {
4989 				splx(s);
4990 				continue;
4991 			}
4992 			error = xpq_update_foreign(
4993 			    vtomach((vaddr_t)ptep), npte, domid, flags);
4994 			splx(s);
4995 			if (error) {
4996 				/* Undo pv_entry tracking - oof. */
4997 				if (new_pp != NULL) {
4998 					mutex_spin_enter(&new_pp->pp_lock);
4999 					if (new_pve != NULL) {
5000 						LIST_REMOVE(new_pve, pve_list);
5001 						KASSERT(pmap->pm_pve == NULL);
5002 						pmap->pm_pve = new_pve;
5003 					} else if (new_embedded) {
5004 						new_pp->pp_pte.pte_ptp = NULL;
5005 						new_pp->pp_pte.pte_va = 0;
5006 					}
5007 					mutex_spin_exit(&new_pp->pp_lock);
5008 				}
5009 				pmap_unmap_ptes(pmap, pmap2);
5010 				/* Free new PTP. */
5011 				if (ptp != NULL && ptp->wire_count <= 1) {
5012 					pmap_free_ptp(pmap, ptp, va, ptes,
5013 					    pdes);
5014 				}
5015 				mutex_exit(&pmap->pm_lock);
5016 				return error;
5017 			}
5018 			break;
5019 		}
5020 #endif /* defined(XENPV) */
5021 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
5022 
5023 	/*
5024 	 * Done with the PTEs: they can now be unmapped.
5025 	 */
5026 	pmap_unmap_ptes(pmap, pmap2);
5027 
5028 	/*
5029 	 * Update statistics and PTP's reference count.
5030 	 */
5031 	pmap_stats_update_bypte(pmap, npte, opte);
5032 	if (ptp != NULL) {
5033 		if (!have_oldpa) {
5034 			ptp->wire_count++;
5035 		}
5036 		/* Remember minimum VA in PTP. */
5037 		pmap_ptp_range_set(ptp, va);
5038 	}
5039 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5040 
5041 	/*
5042 	 * If the same page, we can skip pv_entry handling.
5043 	 */
5044 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5045 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5046 		if ((npte & PTE_PVLIST) != 0) {
5047 			KASSERT(samepage);
5048 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5049 		}
5050 		goto same_pa;
5051 	} else if ((npte & PTE_PVLIST) != 0) {
5052 		KASSERT(!samepage);
5053 	}
5054 
5055 	/*
5056 	 * If old page is pv-tracked, remove pv_entry from its list.
5057 	 */
5058 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5059 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5060 			old_pp = VM_PAGE_TO_PP(old_pg);
5061 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5062 			panic("%s: PTE_PVLIST with pv-untracked page"
5063 			    " va = %#"PRIxVADDR
5064 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5065 			    __func__, va, oldpa, atop(oldpa));
5066 		}
5067 
5068 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5069 		    pmap_pte_to_pp_attrs(opte));
5070 	} else {
5071 		KASSERT(old_pve == NULL);
5072 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5073 	}
5074 
5075 	/*
5076 	 * If new page is dynamically PV tracked, insert to tree.
5077 	 */
5078 	if (new_pve != NULL) {
5079 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5080 		old_pve = rb_tree_insert_node(tree, new_pve);
5081 		KASSERT(old_pve == new_pve);
5082 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5083 	}
5084 
5085 same_pa:
5086 	/*
5087 	 * shootdown tlb if necessary.
5088 	 */
5089 
5090 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5091 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5092 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5093 	}
5094 	pmap_drain_pv(pmap);
5095 	mutex_exit(&pmap->pm_lock);
5096 	return 0;
5097 }
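
/*
 * Illustrative caller pattern (sketch only; pmap, va and pg are the
 * caller's): UVM enters a managed page, passing the access type in the
 * flags, and later flushes deferred work with pmap_update():
 *
 *	error = pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    PMAP_CANFAIL | VM_PROT_READ | VM_PROT_WRITE);
 *	if (error == 0)
 *		pmap_update(pmap);
 */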
5098 
5099 #if defined(XEN) && defined(DOM0OPS)
5100 
5101 struct pmap_data_gnt {
5102 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5103 	vaddr_t pd_gnt_sva;
5104 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5105 	int pd_gnt_refs; /* ref counter */
5106 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5107 };
5108 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5109 
5110 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5111 
5112 static struct pmap_data_gnt *
5113 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5114 {
5115 	struct pmap_data_gnt_head *headp;
5116 	struct pmap_data_gnt *pgnt;
5117 
5118 	KASSERT(mutex_owned(&pmap->pm_lock));
5119 	headp = pmap->pm_data;
5120 	KASSERT(headp != NULL);
5121 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5122 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5123 			return pgnt;
5124 		/* check that we're not overlapping part of a region */
5125 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5126 	}
5127 	return NULL;
5128 }
5129 
5130 static void
5131 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5132     const struct gnttab_map_grant_ref *ops)
5133 {
5134 	struct pmap_data_gnt_head *headp;
5135 	struct pmap_data_gnt *pgnt;
5136 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5137 	KASSERT(mutex_owned(&pmap->pm_lock));
5138 	KASSERT(nentries >= 1);
5139 	if (pmap->pm_remove == NULL) {
5140 		pmap->pm_remove = pmap_remove_gnt;
5141 		KASSERT(pmap->pm_data == NULL);
5142 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5143 		SLIST_INIT(headp);
5144 		pmap->pm_data = headp;
5145 	} else {
5146 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5147 		KASSERT(pmap->pm_data != NULL);
5148 		headp = pmap->pm_data;
5149 	}
5150 
5151 	pgnt = pmap_find_gnt(pmap, sva, eva);
5152 	if (pgnt != NULL) {
5153 		KASSERT(pgnt->pd_gnt_sva == sva);
5154 		KASSERT(pgnt->pd_gnt_eva == eva);
5155 		return;
5156 	}
5157 
5158 	/* new entry */
5159 	pgnt = kmem_alloc(sizeof(*pgnt) +
5160 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5161 	pgnt->pd_gnt_sva = sva;
5162 	pgnt->pd_gnt_eva = eva;
5163 	pgnt->pd_gnt_refs = 0;
5164 	memcpy(pgnt->pd_gnt_ops, ops,
5165 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5166 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5167 }
5168 
5169 static void
5170 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5171 {
5172 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5173 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5174 	KASSERT(nentries >= 1);
5175 	KASSERT(mutex_owned(&pmap->pm_lock));
5176 	KASSERT(pgnt->pd_gnt_refs == 0);
5177 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5178 	kmem_free(pgnt, sizeof(*pgnt) +
5179 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5180 	if (SLIST_EMPTY(headp)) {
5181 		kmem_free(headp, sizeof(*headp));
5182 		pmap->pm_data = NULL;
5183 		pmap->pm_remove = NULL;
5184 	}
5185 }
5186 
5187 /*
5188  * pmap_enter_gnt: enter a grant entry into a pmap
5189  *
5190  * => must be done "now" ... no lazy-evaluation
5191  */
5192 int
5193 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5194     const struct gnttab_map_grant_ref *oops)
5195 {
5196 	struct pmap_data_gnt *pgnt;
5197 	pt_entry_t *ptes, opte;
5198 	pt_entry_t *ptep;
5199 	pd_entry_t * const *pdes;
5200 	struct vm_page *ptp;
5201 	struct vm_page *old_pg;
5202 	struct pmap_page *old_pp;
5203 	struct pv_entry *old_pve;
5204 	struct pmap *pmap2;
5205 	struct pmap_ptparray pt;
5206 	int error;
5207 	bool getptp;
5208 	rb_tree_t *tree;
5209 	struct gnttab_map_grant_ref *op;
5210 	int ret;
5211 	int idx;
5212 
5213 	KASSERT(pmap_initialized);
5214 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5215 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5216 	    PRIxVADDR " over PDP!", __func__, va);
5217 	KASSERT(pmap != pmap_kernel());
5218 
5219 	/* Begin by locking the pmap. */
5220 	mutex_enter(&pmap->pm_lock);
5221 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5222 
5223 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5224 	KASSERT(pgnt != NULL);
5225 
5226 	/* Look up the PTP.  Allocate if none present. */
5227 	ptp = NULL;
5228 	getptp = false;
5229 	ptp = pmap_find_ptp(pmap, va, 1);
5230 	if (ptp == NULL) {
5231 		getptp = true;
5232 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5233 		if (error != 0) {
5234 			mutex_exit(&pmap->pm_lock);
5235 			return error;
5236 		}
5237 	}
5238 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5239 
5240 	/*
5241 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5242 	 * entry if required for the new mapping.  Temporarily track the old
5243 	 * and new mappings concurrently.  Only after the old mapping is
5244 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5245 	 * our picture of modified/accessed state for either page could get
5246 	 * out of sync (we need any P->V operation for either page to stall
5247 	 * on pmap->pm_lock until done here).
5248 	 */
5249 	old_pve = NULL;
5250 
5251 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5252 
5253 	/* Map PTEs into address space. */
5254 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5255 
5256 	/* Install any newly allocated PTPs. */
5257 	if (getptp) {
5258 		pmap_install_ptp(pmap, &pt, va, pdes);
5259 	}
5260 
5261 	/* Check if there is an existing mapping. */
5262 	ptep = &ptes[pl1_i(va)];
5263 	opte = *ptep;
5264 	bool have_oldpa = pmap_valid_entry(opte);
5265 	paddr_t oldpa = pmap_pte2pa(opte);
5266 
5267 	/*
5268 	 * Update the pte.
5269 	 */
5270 
5271 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5272 	op = &pgnt->pd_gnt_ops[idx];
5273 
5274 #ifdef XENPV /* XXX */
5275 	op->host_addr = xpmap_ptetomach(ptep);
5276 #endif
5277 	op->dev_bus_addr = 0;
5278 	op->status = GNTST_general_error;
5279 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5280 	if (__predict_false(ret)) {
5281 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5282 		    __func__, ret);
5283 		op->status = GNTST_general_error;
5284 	}
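	/*
	 * The hypervisor can transiently return GNTST_eagain; pause
	 * briefly and retry the map a bounded number of times.
	 */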
5285 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5286 		kpause("gntmap", false, mstohz(1), NULL);
5287 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5288 		if (__predict_false(ret)) {
5289 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5290 			    __func__, ret);
5291 			op->status = GNTST_general_error;
5292 		}
5293 	}
5294 	if (__predict_false(op->status != GNTST_okay)) {
5295 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5296 		    __func__, op->status);
5297 		if (have_oldpa) {
5298 			ptp->wire_count--;
5299 		}
5300 	} else {
5301 		pgnt->pd_gnt_refs++;
5302 		if (!have_oldpa) {
5303 			ptp->wire_count++;
5304 		}
5305 		KASSERT(ptp->wire_count > 1);
5306 		/* Remember minimum VA in PTP. */
5307 		pmap_ptp_range_set(ptp, va);
5308 	}
5309 	if (ptp->wire_count <= 1)
5310 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5311 
5312 	/*
5313 	 * Done with the PTEs: they can now be unmapped.
5314 	 */
5315 	pmap_unmap_ptes(pmap, pmap2);
5316 
5317 	/*
5318 	 * Update statistics and PTP's reference count.
5319 	 */
5320 	pmap_stats_update_bypte(pmap, 0, opte);
5321 
5322 	/*
5323 	 * If old page is pv-tracked, remove pv_entry from its list.
5324 	 */
5325 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5326 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5327 			old_pp = VM_PAGE_TO_PP(old_pg);
5328 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5329 			panic("%s: PTE_PVLIST with pv-untracked page"
5330 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5331 			    __func__, va, oldpa);
5332 		}
5333 
5334 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5335 		    pmap_pte_to_pp_attrs(opte));
5336 	} else {
5337 		KASSERT(old_pve == NULL);
5338 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5339 	}
5340 
5341 	pmap_drain_pv(pmap);
5342 	mutex_exit(&pmap->pm_lock);
5343 	return op->status;
5344 }
5345 
5346 /*
5347  * pmap_remove_gnt: grant mapping removal function.
5348  *
5349  * => caller should not be holding any pmap locks
5350  */
5351 static void
5352 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5353 {
5354 	struct pmap_data_gnt *pgnt;
5355 	pt_entry_t *ptes;
5356 	pd_entry_t pde;
5357 	pd_entry_t * const *pdes;
5358 	struct vm_page *ptp;
5359 	struct pmap *pmap2;
5360 	vaddr_t va;
5361 	int lvl;
5362 	int idx;
5363 	struct gnttab_map_grant_ref *op;
5364 	struct gnttab_unmap_grant_ref unmap_op;
5365 	int ret;
5366 
5367 	KASSERT(pmap != pmap_kernel());
5368 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5369 
5370 	mutex_enter(&pmap->pm_lock);
5371 	for (va = sva; va < eva; va += PAGE_SIZE) {
5372 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5373 		if (pgnt == NULL) {
5374 			pmap_remove_locked(pmap, sva, eva);
5375 			continue;
5376 		}
5377 
5378 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5379 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5380 			panic("pmap_remove_gnt pdes not valid");
5381 		}
5382 
5383 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5384 		op = &pgnt->pd_gnt_ops[idx];
5385 		KASSERT(lvl == 1);
5386 		KASSERT(op->status == GNTST_okay);
5387 
5388 		/* Get PTP if non-kernel mapping. */
5389 		ptp = pmap_find_ptp(pmap, va, 1);
5390 		KASSERTMSG(ptp != NULL,
5391 		    "%s: unmanaged PTP detected", __func__);
5392 
5393 		if (op->status == GNTST_okay)  {
5394 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5395 			unmap_op.handle = op->handle;
5396 			unmap_op.dev_bus_addr = 0;
5397 #ifdef XENPV /* XXX */
5398 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5399 #endif
5400 			ret = HYPERVISOR_grant_table_op(
5401 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5402 			if (ret) {
5403 				printf("%s: GNTTABOP_unmap_grant_ref "
5404 				    "failed: %d\n", __func__, ret);
5405 			}
5406 
5407 			ptp->wire_count--;
5408 			pgnt->pd_gnt_refs--;
5409 			if (pgnt->pd_gnt_refs == 0) {
5410 				pmap_free_gnt(pmap, pgnt);
5411 			}
5412 		}
5413 		/*
5414 		 * if mapping removed and the PTP is no longer
5415 		 * being used, free it!
5416 		 */
5417 
5418 		if (ptp->wire_count <= 1)
5419 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5420 		pmap_unmap_ptes(pmap, pmap2);
5421 	}
5422 	mutex_exit(&pmap->pm_lock);
5423 }
5424 #endif /* XEN && DOM0OPS */
5425 
5426 paddr_t
5427 pmap_get_physpage(void)
5428 {
5429 	struct vm_page *ptp;
5430 	struct pmap *kpm = pmap_kernel();
5431 	paddr_t pa;
5432 
5433 	if (!uvm.page_init_done) {
5434 		/*
5435 		 * We're growing the kernel pmap early (from
5436 		 * uvm_pageboot_alloc()). This case must be
5437 		 * handled a little differently.
5438 		 */
5439 
5440 		if (!uvm_page_physget(&pa))
5441 			panic("%s: out of memory", __func__);
5442 #if defined(__HAVE_DIRECT_MAP)
5443 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5444 #else
5445 #if defined(XENPV)
5446 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5447 			xen_pagezero(pa);
5448 			return pa;
5449 		}
5450 #endif
5451 		kpreempt_disable();
5452 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5453 		    PTE_W | pmap_pg_nx);
5454 		pmap_pte_flush();
5455 		pmap_update_pg((vaddr_t)early_zerop);
5456 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5457 #if defined(DIAGNOSTIC) || defined(XENPV)
5458 		pmap_pte_set(early_zero_pte, 0);
5459 		pmap_pte_flush();
5460 #endif /* defined(DIAGNOSTIC) || defined(XENPV) */
5461 		kpreempt_enable();
5462 #endif /* defined(__HAVE_DIRECT_MAP) */
5463 	} else {
5464 		/* XXX */
5465 		ptp = uvm_pagealloc(NULL, 0, NULL,
5466 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5467 		if (ptp == NULL)
5468 			panic("%s: out of memory", __func__);
5469 		ptp->flags &= ~PG_BUSY;
5470 		ptp->wire_count = 1;
5471 		pa = VM_PAGE_TO_PHYS(ptp);
5472 	}
5473 	pmap_stats_update(kpm, 1, 0);
5474 
5475 	return pa;
5476 }
5477 
5478 /*
5479  * Expand the page tree with the specified amount of PTPs, mapping virtual
5480  * addresses starting at kva. We populate all the levels but the last one
5481  * (L1). The nodes of the tree are created as RW, but the pages covered
5482  * will be kentered in L1, with proper permissions.
5483  *
5484  * Used only by pmap_growkernel.
5485  */
5486 static void
5487 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5488 {
5489 	unsigned long i;
5490 	paddr_t pa;
5491 	unsigned long index, endindex;
5492 	int level;
5493 	pd_entry_t *pdep;
5494 #ifdef XENPV
5495 	int s = splvm(); /* protect xpq_* */
5496 #endif
5497 
5498 	for (level = PTP_LEVELS; level > 1; level--) {
5499 		if (level == PTP_LEVELS)
5500 			pdep = cpm->pm_pdir;
5501 		else
5502 			pdep = normal_pdes[level - 2];
5503 		index = pl_i_roundup(kva, level);
5504 		endindex = index + needed_ptps[level - 1] - 1;
5505 
5506 		for (i = index; i <= endindex; i++) {
5507 			pt_entry_t pte;
5508 
5509 			KASSERT(!pmap_valid_entry(pdep[i]));
5510 			pa = pmap_get_physpage();
5511 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5512 #ifdef __x86_64__
5513 			pte |= pmap_pg_nx;
5514 #endif
5515 			pmap_pte_set(&pdep[i], pte);
5516 
5517 #ifdef XENPV
5518 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5519 				if (__predict_true(
5520 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5521 					/* update per-cpu PMDs on all cpus */
5522 					xen_kpm_sync(pmap_kernel(), i);
5523 				} else {
5524 					/*
5525 					 * too early; update primary CPU
5526 					 * PMD only (without locks)
5527 					 */
5528 #ifdef __x86_64__
5529 					pd_entry_t *cpu_pdep =
5530 						&cpu_info_primary.ci_kpm_pdir[i];
5531 #else
5532 					pd_entry_t *cpu_pdep =
5533 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5534 #endif
5535 					pmap_pte_set(cpu_pdep, pte);
5536 				}
5537 			}
5538 #endif
5539 
5540 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5541 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5542 			nkptp[level - 1]++;
5543 		}
5544 		pmap_pte_flush();
5545 	}
5546 #ifdef XENPV
5547 	splx(s);
5548 #endif
5549 }
5550 
5551 /*
5552  * pmap_growkernel: increase usage of KVM space.
5553  *
5554  * => we allocate new PTPs for the kernel and install them in all
5555  *    the pmaps on the system.
5556  */
5557 vaddr_t
5558 pmap_growkernel(vaddr_t maxkvaddr)
5559 {
5560 	struct pmap *kpm = pmap_kernel();
5561 	struct pmap *cpm;
5562 #if !defined(XENPV) || !defined(__x86_64__)
5563 	struct pmap *pm;
5564 	long old;
5565 #endif
5566 	int s, i;
5567 	long needed_kptp[PTP_LEVELS], target_nptp;
5568 	bool invalidate = false;
5569 
5570 	s = splvm();	/* to be safe */
5571 	mutex_enter(&kpm->pm_lock);
5572 
5573 	if (maxkvaddr <= pmap_maxkvaddr) {
5574 		mutex_exit(&kpm->pm_lock);
5575 		splx(s);
5576 		return pmap_maxkvaddr;
5577 	}
5578 
5579 	maxkvaddr = x86_round_pdr(maxkvaddr);
5580 #if !defined(XENPV) || !defined(__x86_64__)
5581 	old = nkptp[PTP_LEVELS - 1];
5582 #endif
5583 
5584 	/* Initialize needed_kptp. */
5585 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5586 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5587 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5588 
5589 		if (target_nptp > nkptpmax[i])
5590 			panic("out of KVA space");
5591 		KASSERT(target_nptp >= nkptp[i]);
5592 		needed_kptp[i] = target_nptp - nkptp[i];
5593 	}
5594 
5595 #ifdef XENPV
5596 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5597 	cpm = kpm;
5598 #else
5599 	/* Get the current pmap */
5600 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5601 		cpm = curcpu()->ci_pmap;
5602 	} else {
5603 		cpm = kpm;
5604 	}
5605 #endif
5606 
5607 	kasan_shadow_map((void *)pmap_maxkvaddr,
5608 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5609 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5610 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5611 
5612 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5613 
5614 	/*
5615 	 * If the number of top level entries changed, update all pmaps.
5616 	 */
5617 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5618 #ifdef XENPV
5619 #ifdef __x86_64__
5620 		/* nothing, kernel entries are never entered in user pmap */
5621 #else
5622 		int pdkidx;
5623 
5624 		mutex_enter(&pmaps_lock);
5625 		LIST_FOREACH(pm, &pmaps, pm_list) {
5626 			for (pdkidx = PDIR_SLOT_KERN + old;
5627 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5628 			    pdkidx++) {
5629 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5630 				    kpm->pm_pdir[pdkidx]);
5631 			}
5632 			pmap_pte_flush();
5633 		}
5634 		mutex_exit(&pmaps_lock);
5635 #endif /* __x86_64__ */
5636 #else /* XENPV */
5637 		size_t newpdes;
5638 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5639 		if (cpm != kpm) {
5640 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5641 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5642 			    newpdes * sizeof(pd_entry_t));
5643 		}
5644 
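		/*
		 * Copy the new top-level kernel entries into every user
		 * pmap so all address spaces see the grown kernel area.
		 */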
5645 		mutex_enter(&pmaps_lock);
5646 		LIST_FOREACH(pm, &pmaps, pm_list) {
5647 			if (__predict_false(pm->pm_enter != NULL)) {
5648 				/*
5649 				 * Not a native pmap, the kernel is not mapped,
5650 				 * so nothing to synchronize.
5651 				 */
5652 				continue;
5653 			}
5654 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5655 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5656 			    newpdes * sizeof(pd_entry_t));
5657 		}
5658 		mutex_exit(&pmaps_lock);
5659 #endif
5660 		invalidate = true;
5661 	}
5662 	pmap_maxkvaddr = maxkvaddr;
5663 	mutex_exit(&kpm->pm_lock);
5664 	splx(s);
5665 
5666 	if (invalidate && pmap_initialized) {
5667 		/* Invalidate the pmap cache. */
5668 		pool_cache_invalidate(&pmap_cache);
5669 	}
5670 
5671 	return maxkvaddr;
5672 }
5673 
5674 #ifdef DEBUG
5675 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5676 
5677 /*
5678  * pmap_dump: dump all the mappings from a pmap
5679  *
5680  * => caller should not be holding any pmap locks
5681  */
5682 void
5683 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5684 {
5685 	pt_entry_t *ptes, *pte;
5686 	pd_entry_t * const *pdes;
5687 	struct pmap *pmap2;
5688 	vaddr_t blkendva;
5689 	int lvl;
5690 
5691 	/*
5692 	 * If the end is out of range, truncate it.
5693 	 * If end <= start, dump up to VM_MAXUSER_ADDRESS.
5694 	 */
5695 
5696 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5697 		eva = VM_MAXUSER_ADDRESS;
5698 
5699 	mutex_enter(&pmap->pm_lock);
5700 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5701 
5702 	/*
5703 	 * dumping a range of pages: we dump in PTP-sized blocks (NBPD_L2 bytes)
5704 	 */
5705 
5706 	for (/* null */ ; sva < eva ; sva = blkendva) {
5707 
5708 		/* determine range of block */
5709 		blkendva = x86_round_pdr(sva+1);
5710 		if (blkendva > eva)
5711 			blkendva = eva;
5712 
5713 		/* valid block? */
5714 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5715 			continue;
5716 		KASSERT(lvl == 1);
5717 
5718 		pte = &ptes[pl1_i(sva)];
5719 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5720 			if (!pmap_valid_entry(*pte))
5721 				continue;
5722 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5723 			    " (pte=%#" PRIxPADDR ")\n",
5724 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5725 		}
5726 	}
5727 	pmap_unmap_ptes(pmap, pmap2);
5728 	mutex_exit(&pmap->pm_lock);
5729 }
5730 #endif
5731 
5732 /*
5733  * pmap_update: process deferred invalidations and frees.
5734  */
5735 void
5736 pmap_update(struct pmap *pmap)
5737 {
5738 	struct pmap_page *pp;
5739 	struct vm_page *ptp;
5740 
5741 	/*
5742 	 * Initiate any pending TLB shootdowns.  Wait for them to
5743 	 * complete before returning control to the caller.
5744 	 */
5745 	kpreempt_disable();
5746 	pmap_tlb_shootnow();
5747 	kpreempt_enable();
5748 
5749 	/*
5750 	 * Now that shootdowns are complete, process deferred frees.  This
5751 	 * is an unlocked check, but is safe as we're only interested in
5752 	 * work done in this LWP - we won't get a false negative.
5753 	 */
5754 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5755 		return;
5756 	}
5757 
5758 	mutex_enter(&pmap->pm_lock);
5759 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5760 		KASSERT(ptp->wire_count == 0);
5761 		KASSERT(ptp->uanon == NULL);
5762 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5763 		pp = VM_PAGE_TO_PP(ptp);
5764 		LIST_INIT(&pp->pp_pvlist);
5765 		pp->pp_attrs = 0;
5766 		pp->pp_pte.pte_ptp = NULL;
5767 		pp->pp_pte.pte_va = 0;
5768 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5769 
5770 		/*
5771 		 * XXX Hack to avoid extra locking, and lock
5772 		 * assertions in uvm_pagefree().  Despite uobject
5773 		 * being set, this isn't a managed page.
5774 		 */
5775 		PMAP_DUMMY_LOCK(pmap);
5776 		uvm_pagerealloc(ptp, NULL, 0);
5777 		PMAP_DUMMY_UNLOCK(pmap);
5778 		uvm_pagefree(ptp);
5779 	}
5780 	mutex_exit(&pmap->pm_lock);
5781 }
5782 
5783 #if PTP_LEVELS > 4
5784 #error "Unsupported number of page table levels"
5785 #endif
5786 
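/*
 * pmap_init_tmp_pgtbl: build a temporary page table tree, in a fixed set
 * of low physical pages, that maps the page at physical address "pg" at
 * the virtual address equal to "pg" while keeping the kernel mapped.
 *
 * => returns the physical address of the root of that tree (for PAE, the
 *    address of the 4-entry L3 stored at the end of the L2 page).
 */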
5787 paddr_t
5788 pmap_init_tmp_pgtbl(paddr_t pg)
5789 {
5790 	static bool maps_loaded;
5791 	static const paddr_t x86_tmp_pml_paddr[] = {
5792 	    4 * PAGE_SIZE,	/* L1 */
5793 	    5 * PAGE_SIZE,	/* L2 */
5794 	    6 * PAGE_SIZE,	/* L3 */
5795 	    7 * PAGE_SIZE	/* L4 */
5796 	};
5797 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5798 
5799 	pd_entry_t *tmp_pml, *kernel_pml;
5800 
5801 	int level;
5802 
5803 	if (!maps_loaded) {
5804 		for (level = 0; level < PTP_LEVELS; ++level) {
5805 			x86_tmp_pml_vaddr[level] =
5806 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5807 			    UVM_KMF_VAONLY);
5808 
5809 			if (x86_tmp_pml_vaddr[level] == 0)
5810 				panic("mapping of real mode PML failed\n");
5811 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5812 			    x86_tmp_pml_paddr[level],
5813 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5814 		}
5815 		pmap_update(pmap_kernel());
5816 		maps_loaded = true;
5817 	}
5818 
5819 	/* Zero levels 1-3 */
5820 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
5821 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5822 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
5823 	}
5824 
5825 	/* Copy PML4 */
5826 	kernel_pml = pmap_kernel()->pm_pdir;
5827 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
5828 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
5829 
5830 #ifdef PAE
5831 	/*
5832 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
5833 	 * last entries are unlikely to be used for temporary mappings.
5834 	 * 508: maps 0->1GB (userland)
5835 	 * 509: unused
5836 	 * 510: unused
5837 	 * 511: maps 3->4GB (kernel)
5838 	 */
5839 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
5840 	tmp_pml[509] = 0;
5841 	tmp_pml[510] = 0;
5842 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
5843 #endif
5844 
5845 	for (level = PTP_LEVELS - 1; level > 0; --level) {
5846 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
5847 
5848 		tmp_pml[pl_i(pg, level + 1)] =
5849 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
5850 	}
5851 
5852 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
5853 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
5854 
5855 #ifdef PAE
5856 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
5857 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
5858 #endif
5859 
5860 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
5861 }
5862 
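/*
 * x86_mmap_flags: translate the machine-dependent mmap(2) flag bits encoded
 * in a device page number into pmap flags (currently only
 * X86_MMAP_FLAG_PREFETCH -> PMAP_WRITE_COMBINE).
 */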
5863 u_int
5864 x86_mmap_flags(paddr_t mdpgno)
5865 {
5866 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
5867 	u_int pflag = 0;
5868 
5869 	if (nflag & X86_MMAP_FLAG_PREFETCH)
5870 		pflag |= PMAP_WRITE_COMBINE;
5871 
5872 	return pflag;
5873 }
5874 
5875 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
5876 
5877 /*
5878  * -----------------------------------------------------------------------------
5879  * *****************************************************************************
5880  * *****************************************************************************
5881  * *****************************************************************************
5882  * *****************************************************************************
5883  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
5884  * *****************************************************************************
5885  * *****************************************************************************
5886  * *****************************************************************************
5887  * *****************************************************************************
5888  * -----------------------------------------------------------------------------
5889  *
5890  * These functions are invoked as callbacks from the code above. Contrary to
5891  * native, EPT does not have a recursive slot; therefore, it is not possible
5892  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
5893  * tree manually.
5894  *
5895  * Apart from that, the logic is mostly the same as native. Once a pmap has
5896  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
5897  * After that we're good, and the callbacks will handle the translations
5898  * for us.
5899  *
5900  * -----------------------------------------------------------------------------
5901  */
5902 
5903 /* Hardware bits. */
5904 #define EPT_R		__BIT(0)	/* read */
5905 #define EPT_W		__BIT(1)	/* write */
5906 #define EPT_X		__BIT(2)	/* execute */
5907 #define EPT_T		__BITS(5,3)	/* type */
5908 #define		TYPE_UC	0
5909 #define		TYPE_WC	1
5910 #define		TYPE_WT	4
5911 #define		TYPE_WP	5
5912 #define		TYPE_WB	6
5913 #define EPT_NOPAT	__BIT(6)
5914 #define EPT_L		__BIT(7)	/* large */
5915 #define EPT_A		__BIT(8)	/* accessed */
5916 #define EPT_D		__BIT(9)	/* dirty */
5917 /* Software bits. */
5918 #define EPT_PVLIST	__BIT(60)
5919 #define EPT_WIRED	__BIT(61)
5920 
5921 #define pmap_ept_valid_entry(pte)	(pte & EPT_R)
5922 
5923 bool pmap_ept_has_ad __read_mostly;
5924 
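/*
 * pmap_ept_stats_update_bypte: update the resident and wired counts from
 * the difference between an old and a new PTE.
 */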
5925 static inline void
5926 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
5927 {
5928 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
5929 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
5930 
5931 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5932 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
5933 
5934 	pmap_stats_update(pmap, resid_diff, wired_diff);
5935 }
5936 
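/*
 * pmap_ept_type: translate the PMAP_* cacheability flags into the EPT
 * memory-type field.
 */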
5937 static pt_entry_t
5938 pmap_ept_type(u_int flags)
5939 {
5940 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
5941 	pt_entry_t ret;
5942 
5943 	switch (cacheflags) {
5944 	case PMAP_NOCACHE:
5945 	case PMAP_NOCACHE_OVR:
5946 		ret = __SHIFTIN(TYPE_UC, EPT_T);
5947 		break;
5948 	case PMAP_WRITE_COMBINE:
5949 		ret = __SHIFTIN(TYPE_WC, EPT_T);
5950 		break;
5951 	case PMAP_WRITE_BACK:
5952 	default:
5953 		ret = __SHIFTIN(TYPE_WB, EPT_T);
5954 		break;
5955 	}
5956 
5957 	ret |= EPT_NOPAT;
5958 	return ret;
5959 }
5960 
5961 static inline pt_entry_t
5962 pmap_ept_prot(vm_prot_t prot)
5963 {
5964 	pt_entry_t res = 0;
5965 
5966 	if (prot & VM_PROT_READ)
5967 		res |= EPT_R;
5968 	if (prot & VM_PROT_WRITE)
5969 		res |= EPT_W;
5970 	if (prot & VM_PROT_EXECUTE)
5971 		res |= EPT_X;
5972 
5973 	return res;
5974 }
5975 
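/*
 * pmap_ept_to_pp_attrs: convert EPT PTE bits into pmap_page attributes.
 * Without hardware A/D support the page is conservatively reported as
 * both referenced and modified.
 */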
5976 static inline uint8_t
5977 pmap_ept_to_pp_attrs(pt_entry_t ept)
5978 {
5979 	uint8_t ret = 0;
5980 	if (pmap_ept_has_ad) {
5981 		if (ept & EPT_D)
5982 			ret |= PP_ATTRS_D;
5983 		if (ept & EPT_A)
5984 			ret |= PP_ATTRS_A;
5985 	} else {
5986 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
5987 	}
5988 	if (ept & EPT_W)
5989 		ret |= PP_ATTRS_W;
5990 	return ret;
5991 }
5992 
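/*
 * pmap_pp_attrs_to_ept: the inverse of pmap_ept_to_pp_attrs().
 */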
5993 static inline pt_entry_t
5994 pmap_pp_attrs_to_ept(uint8_t attrs)
5995 {
5996 	pt_entry_t ept = 0;
5997 	if (attrs & PP_ATTRS_D)
5998 		ept |= EPT_D;
5999 	if (attrs & PP_ATTRS_A)
6000 		ept |= EPT_A;
6001 	if (attrs & PP_ATTRS_W)
6002 		ept |= EPT_W;
6003 	return ept;
6004 }
6005 
6006 /*
6007  * Helper for pmap_ept_free_ptp.
6008  * tree[0] = &L2[L2idx]
6009  * tree[1] = &L3[L3idx]
6010  * tree[2] = &L4[L4idx]
6011  */
6012 static void
6013 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6014 {
6015 	pt_entry_t *pteva;
6016 	paddr_t ptepa;
6017 	int i, index;
6018 
6019 	ptepa = pmap->pm_pdirpa[0];
6020 	for (i = PTP_LEVELS; i > 1; i--) {
6021 		index = pl_pi(va, i);
6022 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6023 		KASSERT(pmap_ept_valid_entry(pteva[index]));
6024 		tree[i - 2] = &pteva[index];
6025 		ptepa = pmap_pte2pa(pteva[index]);
6026 	}
6027 }
6028 
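/*
 * pmap_ept_free_ptp: free the PTP mapping "va", walking up the tree and
 * also freeing any parent PTPs that become empty as a result.
 */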
6029 static void
6030 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6031 {
6032 	pd_entry_t *tree[3];
6033 	int level;
6034 
6035 	KASSERT(pmap != pmap_kernel());
6036 	KASSERT(mutex_owned(&pmap->pm_lock));
6037 	KASSERT(kpreempt_disabled());
6038 
6039 	pmap_ept_get_tree(pmap, va, tree);
6040 
6041 	level = 1;
6042 	do {
6043 		(void)pmap_pte_testset(tree[level - 1], 0);
6044 
6045 		pmap_freepage(pmap, ptp, level);
6046 		if (level < PTP_LEVELS - 1) {
6047 			ptp = pmap_find_ptp(pmap, va, level + 1);
6048 			ptp->wire_count--;
6049 			if (ptp->wire_count > 1)
6050 				break;
6051 		}
6052 	} while (++level < PTP_LEVELS);
6053 	pmap_pte_flush();
6054 }
6055 
6056 /* Install any PTPs newly allocated for va, walking down L4->L3->L2. */
6057 static void
6058 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6059 {
6060 	struct vm_page *ptp;
6061 	unsigned long index;
6062 	pd_entry_t *pteva;
6063 	paddr_t ptepa;
6064 	int i;
6065 
6066 	KASSERT(pmap != pmap_kernel());
6067 	KASSERT(mutex_owned(&pmap->pm_lock));
6068 	KASSERT(kpreempt_disabled());
6069 
6070 	/*
6071 	 * Now that we have all the pages looked up or allocated,
6072 	 * loop through again installing any new ones into the tree.
6073 	 */
6074 	ptepa = pmap->pm_pdirpa[0];
6075 	for (i = PTP_LEVELS; i > 1; i--) {
6076 		index = pl_pi(va, i);
6077 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6078 
6079 		if (pmap_ept_valid_entry(pteva[index])) {
6080 			KASSERT(!pt->alloced[i]);
6081 			ptepa = pmap_pte2pa(pteva[index]);
6082 			continue;
6083 		}
6084 
6085 		ptp = pt->pg[i];
6086 		ptp->flags &= ~PG_BUSY; /* never busy */
6087 		ptp->wire_count = 1;
6088 		pmap->pm_ptphint[i - 2] = ptp;
6089 		ptepa = VM_PAGE_TO_PHYS(ptp);
6090 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6091 
6092 		pmap_pte_flush();
6093 		pmap_stats_update(pmap, 1, 0);
6094 
6095 		/*
6096 		 * If we're not in the top level, increase the
6097 		 * wire count of the parent page.
6098 		 */
6099 		if (i < PTP_LEVELS) {
6100 			pt->pg[i + 1]->wire_count++;
6101 		}
6102 	}
6103 }
6104 
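/*
 * pmap_ept_enter: EPT flavour of pmap_enter(), installed as the pm_enter
 * callback.  Enters a mapping of "va" (for an EPT pmap, a guest-physical
 * address) to the host-physical address "pa".
 */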
6105 static int
6106 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6107     u_int flags)
6108 {
6109 	pt_entry_t *ptes, opte, npte;
6110 	pt_entry_t *ptep;
6111 	struct vm_page *ptp;
6112 	struct vm_page *new_pg, *old_pg;
6113 	struct pmap_page *new_pp, *old_pp;
6114 	struct pv_entry *old_pve, *new_pve;
6115 	bool wired = (flags & PMAP_WIRED) != 0;
6116 	bool accessed;
6117 	struct pmap_ptparray pt;
6118 	int error;
6119 	bool getptp, samepage, new_embedded;
6120 	rb_tree_t *tree;
6121 
6122 	KASSERT(pmap_initialized);
6123 	KASSERT(va < VM_MAXUSER_ADDRESS);
6124 
6125 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6126 
6127 	if (wired)
6128 		npte |= EPT_WIRED;
6129 	if (flags & VM_PROT_ALL) {
6130 		npte |= EPT_A;
6131 		if (flags & VM_PROT_WRITE) {
6132 			KASSERT((npte & EPT_W) != 0);
6133 			npte |= EPT_D;
6134 		}
6135 	}
6136 
6137 	new_pg = PHYS_TO_VM_PAGE(pa);
6138 	if (new_pg != NULL) {
6139 		/* This is a managed page */
6140 		npte |= EPT_PVLIST;
6141 		new_pp = VM_PAGE_TO_PP(new_pg);
6142 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6143 		/* This is an unmanaged pv-tracked page */
6144 		npte |= EPT_PVLIST;
6145 	} else {
6146 		new_pp = NULL;
6147 	}
6148 
6149 	/* Begin by locking the pmap. */
6150 	mutex_enter(&pmap->pm_lock);
6151 
6152 	/* Look up the PTP.  Allocate if none present. */
6153 	ptp = NULL;
6154 	getptp = false;
6155 	if (pmap != pmap_kernel()) {
6156 		ptp = pmap_find_ptp(pmap, va, 1);
6157 		if (ptp == NULL) {
6158 			getptp = true;
6159 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6160 			if (error != 0) {
6161 				if (flags & PMAP_CANFAIL) {
6162 					mutex_exit(&pmap->pm_lock);
6163 					return error;
6164 				}
6165 				panic("%s: get ptp failed, error=%d", __func__,
6166 				    error);
6167 			}
6168 		}
6169 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6170 	} else {
6171 		/* Embedded PV entries rely on this. */
6172 		KASSERT(va != 0);
6173 		tree = &pmap_kernel_rb;
6174 	}
6175 
6176 	/*
6177 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6178 	 * entry if required for the new mapping.  Temporarily track the old
6179 	 * and new mappings concurrently.  Only after the old mapping is
6180 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6181 	 * our picture of modified/accessed state for either page could get
6182 	 * out of sync (we need any P->V operation for either page to stall
6183 	 * on pmap->pm_lock until done here).
6184 	 */
6185 	new_pve = NULL;
6186 	old_pve = NULL;
6187 	samepage = false;
6188 	new_embedded = false;
6189 
6190 	if (new_pp != NULL) {
6191 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6192 		    &old_pve, &samepage, &new_embedded, tree);
6193 
6194 		/*
6195 		 * If a new pv_entry was needed and none was available, we
6196 		 * can go no further.
6197 		 */
6198 		if (error != 0) {
6199 			if (flags & PMAP_CANFAIL) {
6200 				if (getptp) {
6201 					pmap_unget_ptp(pmap, &pt);
6202 				}
6203 				mutex_exit(&pmap->pm_lock);
6204 				return error;
6205 			}
6206 			panic("%s: alloc pve failed", __func__);
6207 		}
6208 	} else {
6209 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6210 	}
6211 
6212 	/* Map PTEs into address space. */
6213 	kpreempt_disable();
6214 
6215 	/* Install any newly allocated PTPs. */
6216 	if (getptp) {
6217 		pmap_ept_install_ptp(pmap, &pt, va);
6218 	}
6219 
6220 	/* Check if there is an existing mapping. */
6221 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6222 	ptep = &ptes[pl1_pi(va)];
6223 	opte = *ptep;
6224 	bool have_oldpa = pmap_ept_valid_entry(opte);
6225 	paddr_t oldpa = pmap_pte2pa(opte);
6226 
6227 	/*
6228 	 * Update the pte.
6229 	 */
6230 	do {
6231 		opte = *ptep;
6232 
6233 		/*
6234 		 * If it is the same page, inherit EPT_A and EPT_D.
6235 		 */
6236 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6237 			npte |= opte & (EPT_A | EPT_D);
6238 		}
6239 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6240 
6241 	/*
6242 	 * Done with the PTEs: they can now be unmapped.
6243 	 */
6244 	kpreempt_enable();
6245 
6246 	/*
6247 	 * Update statistics and PTP's reference count.
6248 	 */
6249 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6250 	if (ptp != NULL) {
6251 		if (!have_oldpa) {
6252 			ptp->wire_count++;
6253 		}
6254 		/* Remember minimum VA in PTP. */
6255 		pmap_ptp_range_set(ptp, va);
6256 	}
6257 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6258 
6259 	/*
6260 	 * If the same page, we can skip pv_entry handling.
6261 	 */
6262 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6263 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6264 		if ((npte & EPT_PVLIST) != 0) {
6265 			KASSERT(samepage);
6266 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6267 		}
6268 		goto same_pa;
6269 	} else if ((npte & EPT_PVLIST) != 0) {
6270 		KASSERT(!samepage);
6271 	}
6272 
6273 	/*
6274 	 * If old page is pv-tracked, remove pv_entry from its list.
6275 	 */
6276 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6277 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6278 			old_pp = VM_PAGE_TO_PP(old_pg);
6279 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6280 			panic("%s: EPT_PVLIST with pv-untracked page"
6281 			    " va = %#"PRIxVADDR
6282 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6283 			    __func__, va, oldpa, atop(oldpa));
6284 		}
6285 
6286 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6287 		    pmap_ept_to_pp_attrs(opte));
6288 	} else {
6289 		KASSERT(old_pve == NULL);
6290 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6291 	}
6292 
6293 	/*
6294 	 * If new page is dynamically PV tracked, insert to tree.
6295 	 */
6296 	if (new_pve != NULL) {
6297 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6298 		old_pve = rb_tree_insert_node(tree, new_pve);
6299 		KASSERT(old_pve == new_pve);
6300 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6301 	}
6302 
6303 same_pa:
6304 	/*
6305 	 * shootdown tlb if necessary.
6306 	 */
6307 
6308 	if (pmap_ept_has_ad) {
6309 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6310 	} else {
6311 		accessed = (opte & EPT_R) != 0;
6312 	}
6313 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6314 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6315 	}
6316 	pmap_drain_pv(pmap);
6317 	mutex_exit(&pmap->pm_lock);
6318 	return 0;
6319 }
6320 
6321 /* Walk the PDEs for va; return 0 if all are valid (storing the L2 entry in *lastpde), else the level of the first invalid PDE. */
6322 static int
6323 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6324 {
6325 	pt_entry_t *pteva;
6326 	paddr_t ptepa;
6327 	int i, index;
6328 
6329 	KASSERT(mutex_owned(&pmap->pm_lock));
6330 
6331 	ptepa = pmap->pm_pdirpa[0];
6332 	for (i = PTP_LEVELS; i > 1; i--) {
6333 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6334 		index = pl_pi(va, i);
6335 		if (!pmap_ept_valid_entry(pteva[index]))
6336 			return i;
6337 		ptepa = pmap_pte2pa(pteva[index]);
6338 	}
6339 	if (lastpde != NULL) {
6340 		*lastpde = pteva[index];
6341 	}
6342 
6343 	return 0;
6344 }
6345 
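/*
 * pmap_ept_extract: EPT flavour of pmap_extract(), installed as the
 * pm_extract callback.
 */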
6346 static bool
6347 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6348 {
6349 	pt_entry_t *ptes, pte;
6350 	pd_entry_t pde;
6351 	paddr_t ptppa, pa;
6352 	bool rv;
6353 
6354 #ifdef __HAVE_DIRECT_MAP
6355 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6356 		if (pap != NULL) {
6357 			*pap = PMAP_DIRECT_UNMAP(va);
6358 		}
6359 		return true;
6360 	}
6361 #endif
6362 
6363 	rv = false;
6364 	pa = 0;
6365 
6366 	mutex_enter(&pmap->pm_lock);
6367 	kpreempt_disable();
6368 
6369 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6370 		ptppa = pmap_pte2pa(pde);
6371 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6372 		pte = ptes[pl1_pi(va)];
6373 		if (__predict_true((pte & EPT_R) != 0)) {
6374 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6375 			rv = true;
6376 		}
6377 	}
6378 
6379 	kpreempt_enable();
6380 	mutex_exit(&pmap->pm_lock);
6381 
6382 	if (pap != NULL) {
6383 		*pap = pa;
6384 	}
6385 	return rv;
6386 }
6387 
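/*
 * pmap_ept_remove_pte: remove a single PTE.  Returns true if a valid
 * mapping was actually removed.
 */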
6388 static bool
6389 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6390     vaddr_t va)
6391 {
6392 	struct pv_entry *pve;
6393 	struct vm_page *pg;
6394 	struct pmap_page *pp;
6395 	pt_entry_t opte;
6396 	bool accessed;
6397 
6398 	KASSERT(pmap != pmap_kernel());
6399 	KASSERT(mutex_owned(&pmap->pm_lock));
6400 	KASSERT(kpreempt_disabled());
6401 
6402 	if (!pmap_ept_valid_entry(*pte)) {
6403 		/* VA not mapped. */
6404 		return false;
6405 	}
6406 
6407 	/* Atomically save the old PTE and zap it. */
6408 	opte = pmap_pte_testset(pte, 0);
6409 	if (!pmap_ept_valid_entry(opte)) {
6410 		return false;
6411 	}
6412 
6413 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6414 
6415 	if (ptp) {
6416 		/*
6417 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6418 		 */
6419 		ptp->wire_count--;
6420 		if (ptp->wire_count <= 1) {
6421 			opte |= EPT_A;
6422 		}
6423 	}
6424 
6425 	if (pmap_ept_has_ad) {
6426 		accessed = (opte & EPT_A) != 0;
6427 	} else {
6428 		accessed = true;
6429 	}
6430 	if (accessed) {
6431 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6432 	}
6433 
6434 	/*
6435 	 * If we are not on a pv list - we are done.
6436 	 */
6437 	if ((opte & EPT_PVLIST) == 0) {
6438 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6439 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6440 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6441 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6442 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6443 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6444 		return true;
6445 	}
6446 
6447 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6448 		pp = VM_PAGE_TO_PP(pg);
6449 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6450 		paddr_t pa = pmap_pte2pa(opte);
6451 		panic("%s: EPT_PVLIST with pv-untracked page"
6452 		    " va = %#"PRIxVADDR" pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6453 		    __func__, va, pa, atop(pa));
6454 	}
6455 
6456 	/* Sync R/M bits. */
6457 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6458 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6459 	return true;
6460 }
6461 
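/*
 * pmap_ept_remove_ptes: remove the PTEs in [startva, endva) that belong to
 * the given PTP.
 */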
6462 static void
6463 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6464     vaddr_t startva, vaddr_t endva)
6465 {
6466 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6467 
6468 	KASSERT(pmap != pmap_kernel());
6469 	KASSERT(mutex_owned(&pmap->pm_lock));
6470 	KASSERT(kpreempt_disabled());
6471 
6472 	/*
6473 	 * mappings are very often sparse, so clip the given range to the
6474 	 * range of PTEs that are known present in the PTP.
6475 	 */
6476 	pmap_ptp_range_clip(ptp, &startva, &pte);
6477 
6478 	/*
6479 	 * note that ptpva points to the PTE that maps startva.   this may
6480 	 * or may not be the first PTE in the PTP.
6481 	 *
6482 	 * we loop through the PTP while there are still PTEs to look at
6483 	 * and the wire_count is greater than 1 (the wire_count is the number
6484 	 * of real PTEs in the PTP, plus one for the PTP itself).
6485 	 */
6486 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6487 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6488 		startva += PAGE_SIZE;
6489 		pte++;
6490 	}
6491 }
6492 
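/*
 * pmap_ept_remove: EPT flavour of pmap_remove(), installed as the
 * pm_remove callback.
 */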
6493 static void
6494 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6495 {
6496 	pt_entry_t *ptes;
6497 	pd_entry_t pde;
6498 	paddr_t ptppa;
6499 	vaddr_t blkendva, va = sva;
6500 	struct vm_page *ptp;
6501 
6502 	mutex_enter(&pmap->pm_lock);
6503 	kpreempt_disable();
6504 
6505 	for (/* null */ ; va < eva ; va = blkendva) {
6506 		int lvl;
6507 
6508 		/* determine range of block */
6509 		blkendva = x86_round_pdr(va+1);
6510 		if (blkendva > eva)
6511 			blkendva = eva;
6512 
6513 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6514 		if (lvl != 0) {
6515 			/* Skip a range corresponding to an invalid pde. */
6516 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6517 			continue;
6518 		}
6519 
6520 		/* PA of the PTP */
6521 		ptppa = pmap_pte2pa(pde);
6522 
6523 		ptp = pmap_find_ptp(pmap, va, 1);
6524 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6525 		    __func__);
6526 
6527 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6528 
6529 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6530 		    blkendva);
6531 
6532 		/* If PTP is no longer being used, free it. */
6533 		if (ptp && ptp->wire_count <= 1) {
6534 			pmap_ept_free_ptp(pmap, ptp, va);
6535 		}
6536 	}
6537 
6538 	kpreempt_enable();
6539 	pmap_drain_pv(pmap);
6540 	mutex_exit(&pmap->pm_lock);
6541 }
6542 
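/*
 * pmap_ept_sync_pv: on behalf of a P->V operation, collect the attributes
 * of the PTE mapping "pa" at "va" and clear the bits in "clearbits" (~0
 * removes the mapping entirely).  Returns EAGAIN if it lost a race with a
 * V->P operation and the caller should retry.
 */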
6543 static int
6544 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6545     uint8_t *oattrs, pt_entry_t *optep)
6546 {
6547 	struct pmap *pmap;
6548 	pt_entry_t *ptep;
6549 	pt_entry_t opte;
6550 	pt_entry_t npte;
6551 	pt_entry_t expect;
6552 	bool need_shootdown;
6553 
6554 	expect = pmap_pa2pte(pa) | EPT_R;
6555 	pmap = ptp_to_pmap(ptp);
6556 
6557 	if (clearbits != ~0) {
6558 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6559 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6560 	}
6561 
6562 	ptep = pmap_map_pte(pmap, ptp, va);
6563 	do {
6564 		opte = *ptep;
6565 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6566 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6567 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6568 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6569 			/*
6570 			 * We lost a race with a V->P operation like
6571 			 * pmap_remove().  Wait for the competitor to
6572 			 * finish reflecting the PTE bits into mp_attrs.
6573 			 */
6574 			pmap_unmap_pte();
6575 			return EAGAIN;
6576 		}
6577 
6578 		/*
6579 		 * Check if there's anything to do on this PTE.
6580 		 */
6581 		if ((opte & clearbits) == 0) {
6582 			need_shootdown = false;
6583 			break;
6584 		}
6585 
6586 		/*
6587 		 * We need a shootdown if the PTE is cached (EPT_A) ...
6588 		 * ... Unless we are clearing only the EPT_W bit and
6589 		 * it isn't cached as RW (EPT_D).
6590 		 */
6591 		if (pmap_ept_has_ad) {
6592 			need_shootdown = (opte & EPT_A) != 0 &&
6593 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6594 		} else {
6595 			need_shootdown = true;
6596 		}
6597 
6598 		npte = opte & ~clearbits;
6599 
6600 		/*
6601 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6602 		 */
6603 		if (need_shootdown) {
6604 			npte &= ~(EPT_A | EPT_D);
6605 		}
6606 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6607 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6608 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6609 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6610 
6611 	if (need_shootdown) {
6612 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6613 	}
6614 	pmap_unmap_pte();
6615 
6616 	*oattrs = pmap_ept_to_pp_attrs(opte);
6617 	if (optep != NULL)
6618 		*optep = opte;
6619 	return 0;
6620 }
6621 
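/*
 * pmap_ept_pp_remove_ent: finish removing a mapping that was torn down by
 * pmap_ept_sync_pv(): update the stats and free the PTP if it is now
 * empty.
 */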
6622 static void
6623 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6624     vaddr_t va)
6625 {
6626 
6627 	KASSERT(mutex_owned(&pmap->pm_lock));
6628 
6629 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6630 	ptp->wire_count--;
6631 	if (ptp->wire_count <= 1) {
6632 		pmap_ept_free_ptp(pmap, ptp, va);
6633 	}
6634 }
6635 
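/*
 * pmap_ept_write_protect: EPT flavour of pmap_write_protect(), installed
 * as the pm_write_protect callback.
 */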
6636 static void
6637 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6638 {
6639 	pt_entry_t bit_rem;
6640 	pt_entry_t *ptes, *spte;
6641 	pt_entry_t opte, npte;
6642 	pd_entry_t pde;
6643 	paddr_t ptppa;
6644 	vaddr_t va;
6645 	bool modified;
6646 
6647 	bit_rem = 0;
6648 	if (!(prot & VM_PROT_WRITE))
6649 		bit_rem = EPT_W;
6650 
6651 	sva &= PTE_FRAME;
6652 	eva &= PTE_FRAME;
6653 
6654 	/* Acquire pmap. */
6655 	mutex_enter(&pmap->pm_lock);
6656 	kpreempt_disable();
6657 
6658 	for (va = sva; va < eva; va += PAGE_SIZE) {
6659 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6660 			continue;
6661 		}
6662 
6663 		ptppa = pmap_pte2pa(pde);
6664 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6665 		spte = &ptes[pl1_pi(va)];
6666 
6667 		do {
6668 			opte = *spte;
6669 			if (!pmap_ept_valid_entry(opte)) {
6670 				goto next;
6671 			}
6672 			npte = (opte & ~bit_rem);
6673 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6674 
6675 		if (pmap_ept_has_ad) {
6676 			modified = (opte & EPT_D) != 0;
6677 		} else {
6678 			modified = true;
6679 		}
6680 		if (modified) {
6681 			/* ptes maps only this L1 page, so shoot down va itself. */
6682 			pmap_tlb_shootdown(pmap, va, 0,
6683 			    TLBSHOOT_WRITE_PROTECT);
6684 		}
6685 next:;
6686 	}
6687 
6688 	kpreempt_enable();
6689 	mutex_exit(&pmap->pm_lock);
6690 }
6691 
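/*
 * pmap_ept_unwire: EPT flavour of pmap_unwire(), installed as the
 * pm_unwire callback.
 */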
6692 static void
6693 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6694 {
6695 	pt_entry_t *ptes, *ptep, opte;
6696 	pd_entry_t pde;
6697 	paddr_t ptppa;
6698 
6699 	/* Acquire pmap. */
6700 	mutex_enter(&pmap->pm_lock);
6701 	kpreempt_disable();
6702 
6703 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6704 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6705 	}
6706 
6707 	ptppa = pmap_pte2pa(pde);
6708 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6709 	ptep = &ptes[pl1_pi(va)];
6710 	opte = *ptep;
6711 	KASSERT(pmap_ept_valid_entry(opte));
6712 
6713 	if (opte & EPT_WIRED) {
6714 		pt_entry_t npte = opte & ~EPT_WIRED;
6715 
6716 		opte = pmap_pte_testset(ptep, npte);
6717 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6718 	} else {
6719 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6720 		    "did not change!\n", __func__, pmap, va);
6721 	}
6722 
6723 	/* Release pmap. */
6724 	kpreempt_enable();
6725 	mutex_exit(&pmap->pm_lock);
6726 }
6727 
6728 /* -------------------------------------------------------------------------- */
6729 
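/*
 * pmap_ept_transform: turn a freshly created native pmap into an EPT pmap.
 * Called by NVMM once per guest: the pmap's method pointers are redirected
 * to the EPT variants above and the page directory is cleared, since it
 * now serves as the root of the EPT tree and must not contain the native
 * recursive or kernel slots.
 */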
6730 void
6731 pmap_ept_transform(struct pmap *pmap)
6732 {
6733 	pmap->pm_enter = pmap_ept_enter;
6734 	pmap->pm_extract = pmap_ept_extract;
6735 	pmap->pm_remove = pmap_ept_remove;
6736 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6737 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6738 	pmap->pm_write_protect = pmap_ept_write_protect;
6739 	pmap->pm_unwire = pmap_ept_unwire;
6740 
6741 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6742 }
6743 
6744 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6745