xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision 0f96383b21a01af55e33ab1c12ca474ffb550a81)
1 /*	$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142 
143 #define	__MUTEX_PRIVATE	/* for assertions */
144 
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159 
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162 
163 #include <dev/isa/isareg.h>
164 
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171 
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175 
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178 
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184 
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188 
189 /*
190  * general info:
191  *
192  *  - for an explanation of how the x86 MMU hardware works see
193  *    the comments in <machine/pte.h>.
194  *
195  *  - for an explanation of the general memory structure used by
196  *    this pmap (including the recursive mapping), see the comments
197  *    in <machine/pmap.h>.
198  *
199  * this file contains the code for the "pmap module."   the module's
200  * job is to manage the hardware's virtual to physical address mappings.
201  * note that there are two levels of mapping in the VM system:
202  *
203  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204  *      to map ranges of virtual address space to objects/files.  for
205  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206  *      to the file /bin/ls starting at offset zero."   note that
207  *      the upper layer mapping is not concerned with how individual
208  *      vm_pages are mapped.
209  *
210  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
211  *      from virtual addresses to physical pages.   it is concerned with
212  *      which vm_page is mapped where.   for example, when you run /bin/ls
213  *      and start at page 0x1000 the fault routine may look up the correct
214  *      page of the /bin/ls file and then ask the pmap layer to establish
215  *      a mapping for it.
216  *
217  * note that information in the lower layer of the VM system can be
218  * thrown away since it can easily be reconstructed from the info
219  * in the upper layer.
220  *
221  * data structures we use include:
222  *
223  *  - struct pmap: describes the address space of one thread
224  *  - struct pmap_page: describes one pv-tracked page, without
225  *    necessarily a corresponding vm_page
226  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228  *    physical memory.   the pp_pvlist points to a list of pv_entry
229  *    structures which describe all the <PMAP,VA> pairs that this
230  *    page is mapped in.    this is critical for page-based operations
231  *    such as pmap_page_protect() [change protection on _all_ mappings
232  *    of a page].
233  */
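
/*
 * Illustrative sketch (not compiled) of how the two layers cooperate for
 * the /bin/ls example above; the call sequence is schematic only:
 *
 *	upper layer:  uvm_fault() finds the vm_map_entry covering the
 *	              faulting VA and obtains the backing vm_page.
 *	lower layer:  pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg), prot, flags)
 *	              installs the VA -> PA translation in the page tables;
 *	              pmap_update(pmap) then pushes out any deferred TLB work.
 */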
234 
235 /*
236  * Locking
237  *
238  * We have the following locks that we must deal with, listed in the order
239  * that they are acquired:
240  *
241  * pg->uobject->vmobjlock, pg->uanon->an_lock
242  *
243  *	For managed pages, these per-object locks are taken by the VM system
244  *	before calling into the pmap module - either a read or write hold.
245  *	The lock hold prevents pages from changing identity while the pmap is
246  *	operating on them.  For example, the same lock is held across a call
247  *	to pmap_remove() and the following call to pmap_update(), so that a
248  *	page does not gain a new identity while its TLB visibility is stale.
249  *
250  * pmap->pm_lock
251  *
252  *	This lock protects the fields in the pmap structure including the
253  *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254  *	structures.  For modifying unmanaged kernel PTEs it is not needed as
255  *	kernel PDEs are never freed, and the kernel is expected to be self
256  *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
257  *	because they can be modified from interrupt context).
258  *
259  * pmaps_lock
260  *
261  *	This lock protects the list of active pmaps (headed by "pmaps").
262  *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263  *
264  * pp_lock
265  *
266  *	This per-page lock protects PV entry lists and the embedded PV entry
267  *	in each vm_page, allowing for concurrent operation on pages by
268  *	different pmaps.  This is a spin mutex at IPL_VM, because at the
269  *	points it is taken context switching is usually not tolerable, and
270  *	spin mutexes must block out interrupts that could take kernel_lock.
271  */
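
/*
 * Schematic lock-ordering sketch (not a real code path; in practice the
 * VM system takes the object lock before calling in here and the pmap
 * takes the rest):
 *
 *	rw_enter(pg->uobject->vmobjlock, RW_WRITER);	(1: per-object)
 *	mutex_enter(&pmap->pm_lock);			(2: per-pmap)
 *	mutex_spin_enter(&pp->pp_lock);			(3: per-page)
 *	... operate on the page's PV list ...
 *	mutex_spin_exit(&pp->pp_lock);
 *	mutex_exit(&pmap->pm_lock);
 *	rw_exit(pg->uobject->vmobjlock);
 */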
272 
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define	PMAP_DUMMY_LOCK(pm)	rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define	PMAP_DUMMY_UNLOCK(pm)	rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define	PMAP_DUMMY_LOCK(pm)
279 #define	PMAP_DUMMY_UNLOCK(pm)
280 #endif
281 
282 static const struct uvm_pagerops pmap_pager = {
283 	/* nothing */
284 };
285 
286 /*
287  * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288  */
289 #define pl_i(va, lvl) \
290         (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291 
292 #define	pl_i_roundup(va, lvl)	pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
293 
294 /*
295  * PTP macros:
296  *   a PTP's index is the PD index of the PDE that points to it
297  *   a PTP's offset is the byte-offset in the PTE space that this PTP is at
298  *   a PTP's VA is the first VA mapped by that PTP
299  */
300 
301 #define ptp_va2o(va, lvl)	(pl_i(va, (lvl)+1) * PAGE_SIZE)
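
/*
 * Worked example (amd64, assuming the usual ptp_shifts[] of 12/21/30/39):
 * pl_i(va, 2) == (VA_SIGN_POS(va) & ptp_frames[1]) >> 21 is the linear
 * 2MB-slot number of va, so ptp_va2o(va, 1) == pl_i(va, 2) * PAGE_SIZE is
 * the byte offset of va's L1 PTP within the recursively mapped PTE area.
 */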
302 
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313 
314 long nkptp[] = NKPTP_INITIALIZER;
315 
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318 
319 struct pcpu_area *pcpuarea __read_mostly;
320 
321 static vaddr_t pmap_maxkvaddr;
322 
323 /*
324  * Misc. event counters.
325  */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328 
329 /*
330  * PAT
331  */
332 static bool cpu_pat_enabled __read_mostly = false;
333 
334 /*
335  * Global data structures
336  */
337 
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341 
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344 
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347 
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350 
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353 
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357 
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
361 #endif
362 
363 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
364 #define	PMAP_CHECK_PP(pp) \
365     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366 
367 #define PAGE_ALIGNED(pp)	\
368 	__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369 
370 /*
371  * Other data structures
372  */
373 
374 static pt_entry_t protection_codes[8] __read_mostly;
375 
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377 
378 /*
379  * The following two vaddr_t's are used during system startup to keep track of
380  * how much of the kernel's VM space we have used. Once the system is started,
381  * the management of the remaining kernel VM space is turned over to the
382  * kernel_map vm_map.
383  */
384 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
386 
387 #ifndef XENPV
388 /*
389  * LAPIC virtual address, and fake physical address.
390  */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394 
395 /*
396  * pool that pmap structures are allocated from
397  */
398 struct pool_cache pmap_cache;
399 static int  pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401 
402 /*
403  * pv_page cache
404  */
405 static struct pool_cache pmap_pvp_cache;
406 
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411 
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414  * Special VAs and the PTEs that map them
415  */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424 
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426 
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431 
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437 	.pa_alloc = pmap_pdp_alloc,
438 	.pa_free = pmap_pdp_free,
439 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442 
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449 
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454 
455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
456 struct pmap_ptparray {
457 	struct vm_page *pg[PTP_LEVELS + 1];
458 	bool alloced[PTP_LEVELS + 1];
459 };
460 
461 /*
462  * PV entries are allocated in page-sized chunks and cached per-pmap to
463  * avoid intense pressure on memory allocators.
464  */
465 
466 struct pv_page {
467 	LIST_HEAD(, pv_entry)	pvp_pves;
468 	LIST_ENTRY(pv_page)	pvp_list;
469 	long			pvp_nfree;
470 	struct pmap		*pvp_pmap;
471 };
472 
473 #define	PVE_PER_PVP	((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
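
/*
 * For example, with PAGE_SIZE of 4096 and a (hypothetical) 64-byte
 * struct pv_entry, PVE_PER_PVP is (4096 / 64) - 1 = 63: one entry-sized
 * slot per page is given up to the struct pv_page header sharing the page.
 */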
474 
475 /*
476  * PV tree prototypes
477  */
478 
479 static int	pmap_compare_key(void *, const void *, const void *);
480 static int	pmap_compare_nodes(void *, const void *, const void *);
481 
482 /* Red-black tree */
483 static const rb_tree_ops_t pmap_rbtree_ops = {
484 	.rbto_compare_nodes = pmap_compare_nodes,
485 	.rbto_compare_key = pmap_compare_key,
486 	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
487 	.rbto_context = NULL
488 };
489 
490 /*
491  * Local prototypes
492  */
493 
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507 
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509     struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512     pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516     pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518     vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520     vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526 
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528 
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531 
532 long
533 pmap_resident_count(struct pmap *pmap)
534 {
535 
536 	return pmap->pm_stats.resident_count;
537 }
538 
539 long
540 pmap_wired_count(struct pmap *pmap)
541 {
542 
543 	return pmap->pm_stats.wired_count;
544 }
545 
546 /*
547  * p m a p   h e l p e r   f u n c t i o n s
548  */
549 
550 static inline void
551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553 
554 	KASSERT(cold || mutex_owned(&pmap->pm_lock));
555 	pmap->pm_stats.resident_count += resid_diff;
556 	pmap->pm_stats.wired_count += wired_diff;
557 }
558 
559 static inline void
560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563 	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564 
565 	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566 	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567 
568 	pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
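
/*
 * Example: zapping a present, wired PTE (opte = PTE_P | PTE_WIRED | ...)
 * down to npte = 0 gives resid_diff = -1 and wired_diff = -1, so both
 * counters in pm_stats drop by one.
 */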
570 
571 /*
572  * ptp_to_pmap: lookup pmap by ptp
573  */
574 static inline struct pmap *
575 ptp_to_pmap(struct vm_page *ptp)
576 {
577 	struct pmap *pmap;
578 
579 	if (ptp == NULL) {
580 		return pmap_kernel();
581 	}
582 	pmap = (struct pmap *)ptp->uobject;
583 	KASSERT(pmap != NULL);
584 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585 	return pmap;
586 }
587 
588 static inline struct pv_pte *
589 pve_to_pvpte(struct pv_entry *pve)
590 {
591 
592 	if (pve == NULL)
593 		return NULL;
594 	KASSERT((void *)&pve->pve_pte == (void *)pve);
595 	return &pve->pve_pte;
596 }
597 
598 static inline struct pv_entry *
599 pvpte_to_pve(struct pv_pte *pvpte)
600 {
601 	struct pv_entry *pve = (void *)pvpte;
602 
603 	KASSERT(pve_to_pvpte(pve) == pvpte);
604 	return pve;
605 }
606 
607 /*
608  * Return true if the pmap page has an embedded PV entry.
609  */
610 static inline bool
611 pv_pte_embedded(struct pmap_page *pp)
612 {
613 
614 	KASSERT(mutex_owned(&pp->pp_lock));
615 	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617 
618 /*
619  * pv_pte_first, pv_pte_next: PV list iterator.
620  */
621 static inline struct pv_pte *
622 pv_pte_first(struct pmap_page *pp)
623 {
624 
625 	KASSERT(mutex_owned(&pp->pp_lock));
626 	if (pv_pte_embedded(pp)) {
627 		return &pp->pp_pte;
628 	}
629 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631 
632 static inline struct pv_pte *
633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635 
636 	KASSERT(mutex_owned(&pp->pp_lock));
637 	KASSERT(pvpte != NULL);
638 	if (pvpte == &pp->pp_pte) {
639 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640 	}
641 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
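
/*
 * Typical walk over every mapping of a page (sketch; pp_lock must be held
 * across the whole iteration, as asserted above):
 *
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	    pvpte = pv_pte_next(pp, pvpte)) {
 *		... examine pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 */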
643 
644 static inline uint8_t
645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647 	uint8_t ret = 0;
648 	if (pte & PTE_D)
649 		ret |= PP_ATTRS_D;
650 	if (pte & PTE_A)
651 		ret |= PP_ATTRS_A;
652 	if (pte & PTE_W)
653 		ret |= PP_ATTRS_W;
654 	return ret;
655 }
656 
657 static inline pt_entry_t
658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660 	pt_entry_t pte = 0;
661 	if (attrs & PP_ATTRS_D)
662 		pte |= PTE_D;
663 	if (attrs & PP_ATTRS_A)
664 		pte |= PTE_A;
665 	if (attrs & PP_ATTRS_W)
666 		pte |= PTE_W;
667 	return pte;
668 }
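
/*
 * The two conversions above are inverses on the D/A/W bits:
 * pmap_pp_attrs_to_pte(pmap_pte_to_pp_attrs(pte)) ==
 * (pte & (PTE_D | PTE_A | PTE_W)).
 */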
669 
670 /*
671  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672  * of course the kernel is always loaded
673  */
674 bool
675 pmap_is_curpmap(struct pmap *pmap)
676 {
677 	return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679 
680 inline void
681 pmap_reference(struct pmap *pmap)
682 {
683 
684 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686 
687 /*
688  * rbtree: compare two nodes.
689  */
690 static int
691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693 	const struct pv_entry *pve1 = n1;
694 	const struct pv_entry *pve2 = n2;
695 
696 	KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697 
698 	if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699 		return -1;
700 	}
701 	if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702 		return 1;
703 	}
704 	return 0;
705 }
706 
707 /*
708  * rbtree: compare a node and a key.
709  */
710 static int
711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713 	const struct pv_entry *pve = n;
714 	const vaddr_t key = (vaddr_t)k;
715 
716 	if (pve->pve_pte.pte_va < key) {
717 		return -1;
718 	}
719 	if (pve->pve_pte.pte_va > key) {
720 		return 1;
721 	}
722 	return 0;
723 }
724 
725 /*
726  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727  */
728 static inline void
729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731 	vaddr_t *min = (vaddr_t *)&ptp->uanon;
732 
733 	if (va < *min) {
734 		*min = va;
735 	}
736 }
737 
738 /*
739  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740  */
741 static inline void
742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744 	vaddr_t sclip;
745 
746 	if (ptp == NULL) {
747 		return;
748 	}
749 
750 	sclip = (vaddr_t)ptp->uanon;
751 	sclip = (*startva < sclip ? sclip : *startva);
752 	*pte += (sclip - *startva) / PAGE_SIZE;
753 	*startva = sclip;
754 }
755 
756 /*
757  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758  *
759  * there are several pmaps involved.  some or all of them might be the same.
760  *
761  *	- the pmap given by the first argument
762  *		our caller wants to access this pmap's PTEs.
763  *
764  *	- pmap_kernel()
765  *		the kernel pmap.  note that it only contains the kernel part
766  *		of the address space which is shared by any pmap.  ie. any
767  *		pmap can be used instead of pmap_kernel() for our purpose.
768  *
769  *	- ci->ci_pmap
770  *		pmap currently loaded on the cpu.
771  *
772  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
773  *		current process' pmap.
774  *
775  * => caller must lock pmap first (if not the kernel pmap)
776  * => must be undone with pmap_unmap_ptes before returning
777  * => disables kernel preemption
778  */
779 void
780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781     pd_entry_t * const **pdeppp)
782 {
783 	struct pmap *curpmap;
784 	struct cpu_info *ci;
785 	lwp_t *l;
786 
787 	kpreempt_disable();
788 
789 	/* The kernel's pmap is always accessible. */
790 	if (pmap == pmap_kernel()) {
791 		*pmap2 = NULL;
792 		*ptepp = PTE_BASE;
793 		*pdeppp = normal_pdes;
794 		return;
795 	}
796 
797 	KASSERT(mutex_owned(&pmap->pm_lock));
798 
799 	l = curlwp;
800 	ci = l->l_cpu;
801 	curpmap = ci->ci_pmap;
802 	if (pmap == curpmap) {
803 		/*
804 		 * Already on the CPU: make it valid.  This is very
805 		 * often the case during exit(), when we have switched
806 		 * to the kernel pmap in order to destroy a user pmap.
807 		 */
808 		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809 			pmap_reactivate(pmap);
810 		}
811 		*pmap2 = NULL;
812 	} else {
813 		/*
814 		 * Toss current pmap from CPU and install new pmap, but keep
815 		 * a reference to the old one.  Dropping the reference can
816 		 * block as it needs to take locks, so defer that to
817 		 * pmap_unmap_ptes().
818 		 */
819 		pmap_reference(pmap);
820 		pmap_load1(l, pmap, curpmap);
821 		*pmap2 = curpmap;
822 	}
823 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825 	pmap->pm_pctr = lwp_pctr();
826 #endif
827 	*ptepp = PTE_BASE;
828 
829 #if defined(XENPV) && defined(__x86_64__)
830 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832 	*pdeppp = ci->ci_normal_pdes;
833 #else
834 	*pdeppp = normal_pdes;
835 #endif
836 }
837 
838 /*
839  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840  *
841  * => we cannot tolerate context switches while mapped in: assert this.
842  * => reenables kernel preemption.
843  * => does not unlock pmap.
844  */
845 void
846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
847 {
848 	struct cpu_info *ci;
849 	struct pmap *mypmap;
850 	struct lwp *l;
851 
852 	KASSERT(kpreempt_disabled());
853 
854 	/* The kernel's pmap is always accessible. */
855 	if (pmap == pmap_kernel()) {
856 		kpreempt_enable();
857 		return;
858 	}
859 
860 	l = curlwp;
861 	ci = l->l_cpu;
862 
863 	KASSERT(mutex_owned(&pmap->pm_lock));
864 	KASSERT(pmap->pm_pctr == lwp_pctr());
865 
866 #if defined(XENPV) && defined(__x86_64__)
867 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870 
871 	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
872 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873 	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874 	if (ci->ci_pmap == mypmap) {
875 		ci->ci_want_pmapload = 0;
876 	} else {
877 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
878 		ci->ci_tlbstate = TLBSTATE_LAZY;
879 	}
880 
881 	/* Now safe to re-enable preemption. */
882 	kpreempt_enable();
883 
884 	/* Toss reference to other pmap taken earlier. */
885 	if (pmap2 != NULL) {
886 		pmap_destroy(pmap2);
887 	}
888 }
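
/*
 * Typical usage sketch for pmap_map_ptes()/pmap_unmap_ptes(); "va" is a
 * hypothetical user VA, the caller owns pm_lock across the sequence
 * (pmap_unmap_ptes() does not drop it), and kernel preemption stays
 * disabled between the two calls:
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	mutex_enter(&pmap->pm_lock);
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... read or modify ptes[pl1_i(va)], consult pdes as needed ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	mutex_exit(&pmap->pm_lock);
 */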
889 
890 inline static void
891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893 
894 #if !defined(__x86_64__)
895 	if (curproc == NULL || curproc->p_vmspace == NULL ||
896 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897 		return;
898 
899 	if ((opte ^ npte) & PTE_X)
900 		pmap_update_pg(va);
901 
902 	/*
903 	 * Executability was removed on the last executable change.
904 	 * Reset the code segment to something conservative and
905 	 * let the trap handler deal with setting the right limit; we
906 	 * can't compute it here due to locking constraints on the vm map.
907 	 */
908 
909 	if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910 		struct trapframe *tf = curlwp->l_md.md_regs;
911 
912 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
914 	}
915 #endif /* !defined(__x86_64__) */
916 }
917 
918 #if !defined(__x86_64__)
919 /*
920  * Fixup the code segment to cover all potential executable mappings.
921  * returns 0 if no changes to the code segment were made.
922  */
923 int
924 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
925 {
926 	struct vm_map_entry *ent;
927 	struct pmap *pm = vm_map_pmap(map);
928 	vaddr_t va = 0;
929 
930 	vm_map_lock_read(map);
931 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
932 		/*
933 		 * This entry has greater va than the entries before.
934 		 * We need to make it point to the last page, not past it.
935 		 */
936 		if (ent->protection & VM_PROT_EXECUTE)
937 			va = trunc_page(ent->end) - PAGE_SIZE;
938 	}
939 	vm_map_unlock_read(map);
940 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
941 		return 0;
942 
943 	pm->pm_hiexec = va;
944 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
945 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
946 	} else {
947 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
948 		return 0;
949 	}
950 	return 1;
951 }
952 #endif /* !defined(__x86_64__) */
953 
954 void
955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958 	uint64_t pat;
959 
960 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
961 		return;
962 
963 	/* We change WT to WC. Leave all other entries the default values. */
964 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968 
969 	wrmsr(MSR_CR_PAT, pat);
970 	cpu_pat_enabled = true;
971 #endif
972 }
973 
974 static pt_entry_t
975 pmap_pat_flags(u_int flags)
976 {
977 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
978 
979 	if (!cpu_pat_enabled) {
980 		switch (cacheflags) {
981 		case PMAP_NOCACHE:
982 		case PMAP_NOCACHE_OVR:
983 			/* results in PGC_UCMINUS on cpus which have
984 			 * the cpuid PAT but PAT "disabled"
985 			 */
986 			return PTE_PCD;
987 		default:
988 			return 0;
989 		}
990 	}
991 
992 	switch (cacheflags) {
993 	case PMAP_NOCACHE:
994 		return PGC_UC;
995 	case PMAP_WRITE_COMBINE:
996 		return PGC_WC;
997 	case PMAP_WRITE_BACK:
998 		return PGC_WB;
999 	case PMAP_NOCACHE_OVR:
1000 		return PGC_UCMINUS;
1001 	}
1002 
1003 	return 0;
1004 }
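
/*
 * For example, a caller passing PMAP_WRITE_COMBINE gets PGC_WC here,
 * which selects one of the PAT entries that pat_init() above reprogrammed
 * from the power-on default of WT to WC (assuming PAT was detected and
 * enabled on this CPU).
 */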
1005 
1006 /*
1007  * p m a p   k e n t e r   f u n c t i o n s
1008  *
1009  * functions to quickly enter/remove pages from the kernel address
1010  * space.   pmap_kremove is exported to MI kernel.  we make use of
1011  * the recursive PTE mappings.
1012  */
1013 
1014 /*
1015  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016  *
1017  * => no need to lock anything, assume va is already allocated
1018  * => should be faster than normal pmap enter function
1019  */
1020 void
1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023 	pt_entry_t *pte, opte, npte;
1024 
1025 	KASSERT(!(prot & ~VM_PROT_ALL));
1026 
1027 	if (va < VM_MIN_KERNEL_ADDRESS)
1028 		pte = vtopte(va);
1029 	else
1030 		pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035 		    " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037 		npte = pa;
1038 	} else
1039 #endif /* XENPV && DOM0OPS */
1040 		npte = pmap_pa2pte(pa);
1041 	npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042 	npte |= pmap_pat_flags(flags);
1043 	opte = pmap_pte_testset(pte, npte); /* zap! */
1044 
1045 	/*
1046 	 * XXX: make sure we are not dealing with a large page, since the only
1047 	 * large pages created are for the kernel image, and they should never
1048 	 * be kentered.
1049 	 */
1050 	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051 
1052 	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053 		/* This should not happen. */
1054 		printf_nolog("%s: mapping already present\n", __func__);
1055 		kpreempt_disable();
1056 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057 		kpreempt_enable();
1058 	}
1059 }
1060 
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062 
1063 #if defined(__x86_64__)
1064 /*
1065  * Change protection for a virtual address. Local for a CPU only, don't
1066  * care about TLB shootdowns.
1067  *
1068  * => must be called with preemption disabled
1069  */
1070 void
1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073 	pt_entry_t *pte, opte, npte;
1074 
1075 	KASSERT(kpreempt_disabled());
1076 
1077 	if (va < VM_MIN_KERNEL_ADDRESS)
1078 		pte = vtopte(va);
1079 	else
1080 		pte = kvtopte(va);
1081 
1082 	npte = opte = *pte;
1083 
1084 	if ((prot & VM_PROT_WRITE) != 0)
1085 		npte |= PTE_W;
1086 	else
1087 		npte &= ~(PTE_W|PTE_D);
1088 
1089 	if (opte != npte) {
1090 		pmap_pte_set(pte, npte);
1091 		pmap_pte_flush();
1092 		invlpg(va);
1093 	}
1094 }
1095 #endif /* defined(__x86_64__) */
1096 
1097 /*
1098  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099  *
1100  * => no need to lock anything
1101  * => caller must dispose of any vm_page mapped in the va range
1102  * => note: not an inline function
1103  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104  * => we assume kernel only unmaps valid addresses and thus don't bother
1105  *    checking the valid bit before doing TLB flushing
1106  * => must be followed by call to pmap_update() before reuse of page
1107  */
1108 static void
1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111 	pt_entry_t *pte, opte;
1112 	vaddr_t va, eva;
1113 
1114 	eva = sva + len;
1115 
1116 	kpreempt_disable();
1117 	for (va = sva; va < eva; va += PAGE_SIZE) {
1118 		pte = kvtopte(va);
1119 		opte = pmap_pte_testset(pte, 0); /* zap! */
1120 		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122 			    TLBSHOOT_KREMOVE);
1123 		}
1124 		KASSERTMSG((opte & PTE_PS) == 0,
1125 		    "va %#" PRIxVADDR " is a large page", va);
1126 		KASSERTMSG((opte & PTE_PVLIST) == 0,
1127 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1128 	}
1129 	if (localonly) {
1130 		tlbflushg();
1131 	}
1132 	kpreempt_enable();
1133 }
1134 
1135 void
1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138 
1139 	pmap_kremove1(sva, len, false);
1140 }
1141 
1142 /*
1143  * pmap_kremove_local: like pmap_kremove(), but only worry about
1144  * TLB invalidations on the current CPU.  this is only intended
1145  * for use while writing kernel crash dumps, either after panic
1146  * or via reboot -d.
1147  */
1148 void
1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151 
1152 	pmap_kremove1(sva, len, true);
1153 }
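
/*
 * Sketch of the usual kernel-only mapping lifecycle ("va" is assumed to
 * come from a KVA allocator such as uvm_km_alloc(9)):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());	(required before the page is reused)
 */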
1154 
1155 /*
1156  * p m a p   i n i t   f u n c t i o n s
1157  *
1158  * pmap_bootstrap and pmap_init are called during system startup
1159  * to init the pmap module.   pmap_bootstrap() does a low level
1160  * init just to get things rolling.   pmap_init() finishes the job.
1161  */
1162 
1163 /*
1164  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165  * This function is to be used before any VM system has been set up.
1166  *
1167  * The va is taken from virtual_avail.
1168  */
1169 static vaddr_t
1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172 	vaddr_t va = virtual_avail;
1173 	virtual_avail += npages * PAGE_SIZE;
1174 	return va;
1175 }
1176 
1177 /*
1178  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179  * This function is to be used before any VM system has been set up.
1180  *
1181  * The pa is taken from avail_start.
1182  */
1183 static paddr_t
1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186 	paddr_t pa = avail_start;
1187 	avail_start += npages * PAGE_SIZE;
1188 	return pa;
1189 }
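
/*
 * The two bootstrap allocators are typically used as a pair, as done for
 * the IDT/GDT/LDT later in pmap_bootstrap():
 *
 *	idt_vaddr = pmap_bootstrap_valloc(1);
 *	idt_paddr = pmap_bootstrap_palloc(1);
 *
 * This only reserves the VA and the PA; the actual mapping between them
 * is established later (e.g. with pmap_kenter_pa()) by machdep code.
 */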
1190 
1191 /*
1192  * pmap_bootstrap: get the system in a state where it can run with VM properly
1193  * enabled (called before main()). The VM system is fully init'd later.
1194  *
1195  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1196  *    kernel, and nkpde PTP's for the kernel.
1197  * => kva_start is the first free virtual address in kernel space.
1198  */
1199 void
1200 pmap_bootstrap(vaddr_t kva_start)
1201 {
1202 	struct pmap *kpm;
1203 	int i;
1204 	vaddr_t kva;
1205 
1206 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1207 
1208 	/*
1209 	 * Set up our local static global vars that keep track of the usage of
1210 	 * KVM before kernel_map is set up.
1211 	 */
1212 	virtual_avail = kva_start;		/* first free KVA */
1213 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1214 
1215 	/*
1216 	 * Set up protection_codes: we need to be able to convert from a MI
1217 	 * protection code (some combo of VM_PROT...) to something we can jam
1218 	 * into a x86 PTE.
1219 	 */
1220 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1221 	protection_codes[VM_PROT_EXECUTE] = PTE_X;
1222 	protection_codes[VM_PROT_READ] = pmap_pg_nx;
1223 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1224 	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1225 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1226 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1227 	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1228 
1229 	/*
1230 	 * Now we init the kernel's pmap.
1231 	 *
1232 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1233 	 * the pm_obj contains the list of active PTPs.
1234 	 */
1235 	kpm = pmap_kernel();
1236 	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1237 	rw_init(&kpm->pm_dummy_lock);
1238 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1239 		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1240 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1241 		kpm->pm_ptphint[i] = NULL;
1242 	}
1243 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1244 
1245 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1246 	for (i = 0; i < PDP_SIZE; i++)
1247 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1248 
1249 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1250 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1251 
1252 	kcpuset_create(&kpm->pm_cpus, true);
1253 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1254 
1255 	kpm->pm_ldt = NULL;
1256 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1257 
1258 	/*
1259 	 * the above is just a rough estimate and not critical to the proper
1260 	 * operation of the system.
1261 	 */
1262 
1263 #if !defined(XENPV)
1264 	/*
1265 	 * Begin to enable global TLB entries if they are supported: add PTE_G
1266 	 * attribute to already mapped kernel pages. Do that only if SVS is
1267 	 * disabled.
1268 	 *
1269 	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1270 	 * happens later in cpu_init().
1271 	 */
1272 #ifdef SVS
1273 	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1274 #else
1275 	if (cpu_feature[0] & CPUID_PGE) {
1276 #endif
1277 		pmap_pg_g = PTE_G;
1278 		pmap_remap_global();
1279 	}
1280 #endif
1281 
1282 #ifndef XENPV
1283 	/*
1284 	 * Enable large pages if they are supported.
1285 	 */
1286 	if (cpu_feature[0] & CPUID_PSE) {
1287 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1288 		pmap_largepages = 1;	/* enable software */
1289 
1290 		/*
1291 		 * The TLB must be flushed after enabling large pages on Pentium
1292 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1293 		 * Software Developer's Manual, Volume 3: System Programming".
1294 		 */
1295 		tlbflushg();
1296 
1297 		/* Remap the kernel. */
1298 		pmap_remap_largepages();
1299 	}
1300 	pmap_init_lapic();
1301 #endif /* !XENPV */
1302 
1303 #ifdef __HAVE_PCPU_AREA
1304 	pmap_init_pcpu();
1305 #endif
1306 
1307 #ifdef __HAVE_DIRECT_MAP
1308 	pmap_init_directmap(kpm);
1309 #else
1310 	pmap_vpage_cpualloc(&cpu_info_primary);
1311 
1312 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1313 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1314 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1315 	} else { /* amd64 */
1316 		/*
1317 		 * zero_pte is stuck at the end of mapped space for the kernel
1318 		 * image (disjunct from kva space). This is done so that it
1319 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1320 		 * when it's called for the first time.
1321 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1322 		 */
1323 #ifdef XENPV
1324 		/* early_zerop initialized in xen_locore() */
1325 #else
1326 		early_zerop = (void *)bootspace.spareva;
1327 #endif
1328 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1329 	}
1330 #endif
1331 
1332 #if defined(XENPV) && defined(__x86_64__)
1333 	extern vaddr_t xen_dummy_page;
1334 	paddr_t xen_dummy_user_pgd;
1335 
1336 	/*
1337 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1338 	 * Xen will still consider it active. So we set user PGD to this one
1339 	 * to lift all protection on the now inactive page tables set.
1340 	 */
1341 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1342 
1343 	/* Zero-fill it; the fewer checks Xen has to make, the better. */
1344 	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1345 	/* Mark read-only */
1346 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1347 	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1348 	    UVMF_INVLPG);
1349 	/* Pin as L4 */
1350 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1351 #endif
1352 
1353 	/*
1354 	 * Allocate space for the Interrupt Descriptor Table (IDT),
1355 	 * Global Descriptor Table (GDT), and Local Descriptor Table
1356 	 * (LDT).
1357 	 *
1358 	 * Currently there is an initial temporary GDT allocated on the
1359 	 * stack by the caller of init386/init_x86_64, which is (among
1360 	 * other things) needed on i386 for %fs-relative addressing for
1361 	 * CPU-local data (CPUVAR(...), curcpu(), curlwp).  This
1362 	 * initial temporary GDT will be popped off the stack before we
1363 	 * can enter main, so we need to make sure there is space for a
1364 	 * second temporary GDT to continue existing when we enter main
1365 	 * before we allocate space for the permanent GDT with
1366 	 * uvm_km(9) in gdt_init via cpu_startup and switch to that.
1367 	 */
1368 	idt_vaddr = pmap_bootstrap_valloc(1);
1369 	idt_paddr = pmap_bootstrap_palloc(1);
1370 
1371 	gdt_vaddr = pmap_bootstrap_valloc(1);
1372 	gdt_paddr = pmap_bootstrap_palloc(1);
1373 
1374 #ifdef __HAVE_PCPU_AREA
1375 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1376 #else
1377 	ldt_vaddr = pmap_bootstrap_valloc(1);
1378 #endif
1379 	ldt_paddr = pmap_bootstrap_palloc(1);
1380 
1381 #if !defined(__x86_64__)
1382 	/* pentium f00f bug stuff */
1383 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1384 #endif
1385 
1386 #if defined(XENPVHVM)
1387 	/* XXX: move to hypervisor.c with appropriate API adjustments */
1388 	extern paddr_t HYPERVISOR_shared_info_pa;
1389 	extern volatile struct xencons_interface *xencons_interface; /* XXX */
1390 	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1391 
1392 	if (vm_guest != VM_GUEST_XENPVH) {
1393 		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1394 		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1395 	}
1396 	xencons_interface = (void *) pmap_bootstrap_valloc(1);
1397 	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1398 #endif
1399 	/*
1400 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1401 	 */
1402 	virtual_avail = reserve_dumppages(virtual_avail);
1403 
1404 	/*
1405 	 * Init the global lock and global list.
1406 	 */
1407 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1408 	LIST_INIT(&pmaps);
1409 
1410 	/*
1411 	 * Ensure the TLB is sync'd with reality by flushing it...
1412 	 */
1413 	tlbflushg();
1414 
1415 	/*
1416 	 * Calculate pmap_maxkvaddr from nkptp[].
1417 	 */
1418 	kva = VM_MIN_KERNEL_ADDRESS;
1419 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1420 		kva += nkptp[i] * nbpd[i];
1421 	}
1422 	pmap_maxkvaddr = kva;
1423 }
1424 
1425 #ifndef XENPV
1426 static void
1427 pmap_init_lapic(void)
1428 {
1429 	/*
1430 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1431 	 * x86 implementation relies heavily on this address being valid; so just
1432 	 * allocate a fake physical page that will be kentered into
1433 	 * local_apic_va by machdep.
1434 	 *
1435 	 * If the LAPIC is present, the va will be remapped somewhere else
1436 	 * later in lapic_map.
1437 	 */
1438 	local_apic_va = pmap_bootstrap_valloc(1);
1439 	local_apic_pa = pmap_bootstrap_palloc(1);
1440 }
1441 #endif
1442 
1443 #ifdef __x86_64__
1444 static size_t
1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1446 {
1447 	size_t npages;
1448 	npages = (roundup(endva, pgsz) / pgsz) -
1449 	    (rounddown(startva, pgsz) / pgsz);
1450 	return npages;
1451 }
1452 #endif
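
/*
 * Example: startva = 0x1ff000, endva = 0x401000 and pgsz = NBPD_L2 (2MB)
 * gives roundup(endva)/pgsz - rounddown(startva)/pgsz = 3 - 0 = 3: the
 * range touches three 2MB-aligned page-tree entries even though it is
 * only slightly longer than 2MB.
 */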
1453 
1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1455 static inline void
1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1457 {
1458 	size_t sslot = slotspace.area[type].sslot;
1459 	size_t nslot = slotspace.area[type].nslot;
1460 
1461 	memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1462 }
1463 #endif
1464 
1465 #ifdef __x86_64__
1466 /*
1467  * Randomize the location of an area. We count the holes in the VM space. We
1468  * randomly select one hole, and then randomly select an area within that hole.
1469  * Finally we update the associated entry in the slotspace structure.
1470  */
1471 vaddr_t
1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1473     vaddr_t randva)
1474 {
1475 	struct {
1476 		int start;
1477 		int end;
1478 	} holes[SLSPACE_NAREAS+1];
1479 	size_t i, nholes, hole;
1480 	size_t startsl, endsl, nslots, winsize;
1481 	vaddr_t startva, va;
1482 
1483 	sz = roundup(sz, align);
1484 
1485 	/*
1486 	 * Take one more slot with +NBPD_L4, because we may end up choosing
1487 	 * an area that crosses slots:
1488 	 *     +------+------+------+
1489 	 *     | Slot | Slot | Slot |
1490 	 *     +------+------+------+
1491 	 *        [Chosen Area]
1492 	 * And in that case we must take into account the additional slot
1493 	 * consumed.
1494 	 */
1495 	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1496 
1497 	/* Get the holes. */
1498 	nholes = 0;
1499 	size_t curslot = 0 + 256; /* end of SLAREA_USER */
1500 	while (1) {
1501 		/*
1502 		 * Find the first occupied slot after the current one.
1503 		 * The area between the two is a hole.
1504 		 */
1505 		size_t minsslot = 512;
1506 		size_t minnslot = 0;
1507 		for (i = 0; i < SLSPACE_NAREAS; i++) {
1508 			if (!slotspace.area[i].active)
1509 				continue;
1510 			if (slotspace.area[i].sslot >= curslot &&
1511 			    slotspace.area[i].sslot < minsslot) {
1512 				minsslot = slotspace.area[i].sslot;
1513 				minnslot = slotspace.area[i].nslot;
1514 			}
1515 		}
1516 
1517 		/* No hole anymore, stop here. */
1518 		if (minsslot == 512) {
1519 			break;
1520 		}
1521 
1522 		/* Register the hole. */
1523 		if (minsslot - curslot >= nslots) {
1524 			holes[nholes].start = curslot;
1525 			holes[nholes].end = minsslot;
1526 			nholes++;
1527 		}
1528 
1529 		/* Skip that hole, and iterate again. */
1530 		curslot = minsslot + minnslot;
1531 	}
1532 
1533 	if (nholes == 0) {
1534 		panic("%s: impossible", __func__);
1535 	}
1536 
1537 	/* Select a hole. */
1538 	hole = randhole;
1539 #ifdef NO_X86_ASLR
1540 	hole = 0;
1541 #endif
1542 	hole %= nholes;
1543 	startsl = holes[hole].start;
1544 	endsl = holes[hole].end;
1545 	startva = VA_SIGN_NEG(startsl * NBPD_L4);
1546 
1547 	/* Select an area within the hole. */
1548 	va = randva;
1549 #ifdef NO_X86_ASLR
1550 	va = 0;
1551 #endif
1552 	winsize = ((endsl - startsl) * NBPD_L4) - sz;
1553 	va %= winsize;
1554 	va = rounddown(va, align);
1555 	va += startva;
1556 
1557 	/* Update the entry. */
1558 	slotspace.area[type].sslot = pl4_i(va);
1559 	slotspace.area[type].nslot =
1560 	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1561 	slotspace.area[type].active = true;
1562 
1563 	return va;
1564 }
1565 #endif
1566 
1567 #ifdef __HAVE_PCPU_AREA
1568 static void
1569 pmap_init_pcpu(void)
1570 {
1571 	const vaddr_t startva = PMAP_PCPU_BASE;
1572 	size_t nL4e, nL3e, nL2e, nL1e;
1573 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1574 	paddr_t pa;
1575 	vaddr_t endva;
1576 	vaddr_t tmpva;
1577 	pt_entry_t *pte;
1578 	size_t size;
1579 	int i;
1580 
1581 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1582 
1583 	size = sizeof(struct pcpu_area);
1584 
1585 	endva = startva + size;
1586 
1587 	/* We will use this temporary va. */
1588 	tmpva = bootspace.spareva;
1589 	pte = PTE_BASE + pl1_i(tmpva);
1590 
1591 	/* Build L4 */
1592 	L4e_idx = pl4_i(startva);
1593 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1594 	KASSERT(nL4e  == 1);
1595 	for (i = 0; i < nL4e; i++) {
1596 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1597 
1598 		pa = pmap_bootstrap_palloc(1);
1599 		*pte = (pa & PTE_FRAME) | pteflags;
1600 		pmap_update_pg(tmpva);
1601 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1602 
1603 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1604 	}
1605 
1606 	/* Build L3 */
1607 	L3e_idx = pl3_i(startva);
1608 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1609 	for (i = 0; i < nL3e; i++) {
1610 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1611 
1612 		pa = pmap_bootstrap_palloc(1);
1613 		*pte = (pa & PTE_FRAME) | pteflags;
1614 		pmap_update_pg(tmpva);
1615 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1616 
1617 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1618 	}
1619 
1620 	/* Build L2 */
1621 	L2e_idx = pl2_i(startva);
1622 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1623 	for (i = 0; i < nL2e; i++) {
1624 
1625 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1626 
1627 		pa = pmap_bootstrap_palloc(1);
1628 		*pte = (pa & PTE_FRAME) | pteflags;
1629 		pmap_update_pg(tmpva);
1630 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1631 
1632 		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1633 	}
1634 
1635 	/* Build L1 */
1636 	L1e_idx = pl1_i(startva);
1637 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1638 	for (i = 0; i < nL1e; i++) {
1639 		/*
1640 		 * Nothing to do, the PTEs will be entered via
1641 		 * pmap_kenter_pa.
1642 		 */
1643 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1644 	}
1645 
1646 	*pte = 0;
1647 	pmap_update_pg(tmpva);
1648 
1649 	pcpuarea = (struct pcpu_area *)startva;
1650 
1651 	tlbflush();
1652 }
1653 #endif
1654 
1655 #ifdef __HAVE_DIRECT_MAP
1656 static void
1657 randomize_hole(size_t *randholep, vaddr_t *randvap)
1658 {
1659 	struct nist_hash_drbg drbg;
1660 	uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1661 	const char p[] = "x86/directmap";
1662 	int error;
1663 
1664 	entropy_extract(seed, sizeof(seed), 0);
1665 
1666 	error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1667 	    /*nonce*/NULL, 0,
1668 	    /*personalization*/p, strlen(p));
1669 	KASSERTMSG(error == 0, "error=%d", error);
1670 
1671 	error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1672 	    /*additional*/NULL, 0);
1673 	KASSERTMSG(error == 0, "error=%d", error);
1674 
1675 	error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1676 	    /*additional*/NULL, 0);
1677 	KASSERTMSG(error == 0, "error=%d", error);
1678 
1679 	explicit_memset(seed, 0, sizeof(seed));
1680 	explicit_memset(&drbg, 0, sizeof(drbg));
1681 }
1682 
1683 /*
1684  * Create the amd64 direct map. Called only once at boot time. We map all of
1685  * the physical memory contiguously using 2MB large pages, with RW permissions.
1686  * However there is a hole: the kernel is mapped with RO permissions.
1687  */
1688 static void
1689 pmap_init_directmap(struct pmap *kpm)
1690 {
1691 	extern phys_ram_seg_t mem_clusters[];
1692 	extern int mem_cluster_cnt;
1693 
1694 	vaddr_t startva;
1695 	size_t nL4e, nL3e, nL2e;
1696 	size_t L4e_idx, L3e_idx, L2e_idx;
1697 	size_t spahole, epahole;
1698 	paddr_t lastpa, pa;
1699 	vaddr_t endva;
1700 	vaddr_t tmpva;
1701 	pt_entry_t *pte;
1702 	phys_ram_seg_t *mc;
1703 	int i;
1704 	size_t randhole;
1705 	vaddr_t randva;
1706 
1707 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1708 	const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1709 
1710 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1711 
1712 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1713 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1714 
1715 	/* Get the last physical address available */
1716 	lastpa = 0;
1717 	for (i = 0; i < mem_cluster_cnt; i++) {
1718 		mc = &mem_clusters[i];
1719 		lastpa = MAX(lastpa, mc->start + mc->size);
1720 	}
1721 
1722 	/*
1723 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1724 	 */
1725 	if (lastpa > MAXPHYSMEM) {
1726 		panic("pmap_init_directmap: lastpa incorrect");
1727 	}
1728 
1729 	randomize_hole(&randhole, &randva);
1730 	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1731 	    randhole, randva);
1732 	endva = startva + lastpa;
1733 
1734 	/* We will use this temporary va. */
1735 	tmpva = bootspace.spareva;
1736 	pte = PTE_BASE + pl1_i(tmpva);
1737 
1738 	/* Build L4 */
1739 	L4e_idx = pl4_i(startva);
1740 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1741 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1742 	for (i = 0; i < nL4e; i++) {
1743 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1744 
1745 		pa = pmap_bootstrap_palloc(1);
1746 		*pte = (pa & PTE_FRAME) | pteflags;
1747 		pmap_update_pg(tmpva);
1748 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1749 
1750 		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1751 	}
1752 
1753 	/* Build L3 */
1754 	L3e_idx = pl3_i(startva);
1755 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1756 	for (i = 0; i < nL3e; i++) {
1757 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1758 
1759 		pa = pmap_bootstrap_palloc(1);
1760 		*pte = (pa & PTE_FRAME) | pteflags;
1761 		pmap_update_pg(tmpva);
1762 		memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1763 
1764 		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1765 	}
1766 
1767 	/* Build L2 */
1768 	L2e_idx = pl2_i(startva);
1769 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1770 	for (i = 0; i < nL2e; i++) {
1771 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1772 
1773 		pa = (paddr_t)(i * NBPD_L2);
1774 
1775 		if (spahole <= pa && pa < epahole) {
1776 			L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1777 			    PTE_PS | pmap_pg_g;
1778 		} else {
1779 			L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1780 			    PTE_PS | pmap_pg_g;
1781 		}
1782 	}
1783 
1784 	*pte = 0;
1785 	pmap_update_pg(tmpva);
1786 
1787 	pmap_direct_base = startva;
1788 	pmap_direct_end = endva;
1789 
1790 	tlbflush();
1791 }
1792 #endif /* __HAVE_DIRECT_MAP */
1793 
1794 #if !defined(XENPV)
1795 /*
1796  * Remap all of the virtual pages created so far with the PTE_G bit.
1797  */
1798 static void
1799 pmap_remap_global(void)
1800 {
1801 	vaddr_t kva, kva_end;
1802 	unsigned long p1i;
1803 	size_t i;
1804 
1805 	/* head */
1806 	kva = bootspace.head.va;
1807 	kva_end = kva + bootspace.head.sz;
1808 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1809 		p1i = pl1_i(kva);
1810 		if (pmap_valid_entry(PTE_BASE[p1i]))
1811 			PTE_BASE[p1i] |= pmap_pg_g;
1812 	}
1813 
1814 	/* kernel segments */
1815 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1816 		if (bootspace.segs[i].type == BTSEG_NONE) {
1817 			continue;
1818 		}
1819 		kva = bootspace.segs[i].va;
1820 		kva_end = kva + bootspace.segs[i].sz;
1821 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1822 			p1i = pl1_i(kva);
1823 			if (pmap_valid_entry(PTE_BASE[p1i]))
1824 				PTE_BASE[p1i] |= pmap_pg_g;
1825 		}
1826 	}
1827 
1828 	/* boot space */
1829 	kva = bootspace.boot.va;
1830 	kva_end = kva + bootspace.boot.sz;
1831 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1832 		p1i = pl1_i(kva);
1833 		if (pmap_valid_entry(PTE_BASE[p1i]))
1834 			PTE_BASE[p1i] |= pmap_pg_g;
1835 	}
1836 }
1837 #endif
1838 
1839 #ifndef XENPV
1840 /*
1841  * Remap several kernel segments with large pages. We cover as many pages as we
1842  * can. Called only once at boot time, if the CPU supports large pages.
1843  */
1844 static void
1845 pmap_remap_largepages(void)
1846 {
1847 	pd_entry_t *pde;
1848 	vaddr_t kva, kva_end;
1849 	paddr_t pa;
1850 	size_t i;
1851 
1852 	/* Remap the kernel text using large pages. */
1853 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1854 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1855 			continue;
1856 		}
1857 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1858 		if (kva < bootspace.segs[i].va) {
1859 			continue;
1860 		}
1861 		kva_end = rounddown(bootspace.segs[i].va +
1862 			bootspace.segs[i].sz, NBPD_L2);
1863 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1864 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1865 			pde = &L2_BASE[pl2_i(kva)];
1866 			*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1867 			tlbflushg();
1868 		}
1869 	}
1870 
1871 	/* Remap the kernel rodata using large pages. */
1872 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1873 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1874 			continue;
1875 		}
1876 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1877 		if (kva < bootspace.segs[i].va) {
1878 			continue;
1879 		}
1880 		kva_end = rounddown(bootspace.segs[i].va +
1881 			bootspace.segs[i].sz, NBPD_L2);
1882 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1883 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1884 			pde = &L2_BASE[pl2_i(kva)];
1885 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1886 			tlbflushg();
1887 		}
1888 	}
1889 
1890 	/* Remap the kernel data+bss using large pages. */
1891 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1892 		if (bootspace.segs[i].type != BTSEG_DATA) {
1893 			continue;
1894 		}
1895 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1896 		if (kva < bootspace.segs[i].va) {
1897 			continue;
1898 		}
1899 		kva_end = rounddown(bootspace.segs[i].va +
1900 			bootspace.segs[i].sz, NBPD_L2);
1901 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1902 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1903 			pde = &L2_BASE[pl2_i(kva)];
1904 			*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1905 			tlbflushg();
1906 		}
1907 	}
1908 }
1909 #endif /* !XENPV */
1910 
1911 /*
1912  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1913  * to manage mappings.
1914  */
1915 void
1916 pmap_init(void)
1917 {
1918 	int flags;
1919 
1920 	/*
1921 	 * initialize caches.
1922 	 */
1923 
1924 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925 	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926 
1927 #ifdef XENPV
1928 	/*
1929 	 * pool_cache(9) should not touch cached objects, since they
1930 	 * are pinned on xen and R/O for the domU
1931 	 */
1932 	flags = PR_NOTOUCH;
1933 #else
1934 	flags = 0;
1935 #endif
1936 
1937 #ifdef PAE
1938 	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939 	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941 	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942 	    "pdppl", NULL, IPL_NONE);
1943 #endif
1944 	pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945 	     0, 0, "pvpage", &pool_allocator_kmem,
1946 	    IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947 
1948 	pmap_tlb_init();
1949 
1950 	/* XXX: cpu_hatch() handles only secondary CPUs, so do the boot CPU here. */
1951 	pmap_tlb_cpu_init(curcpu());
1952 
1953 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954 	    NULL, "x86", "io bitmap copy");
1955 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956 	    NULL, "x86", "ldt sync");
1957 
1958 	/*
1959 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960 	 * to hang a tree of pv_entry records.  Dynamically allocated
1961 	 * pv_entry lists are not heavily used in the kernel's pmap (the
1962 	 * usual case is embedded), so cop out and use a single RB tree
1963 	 * to cover them.
1964 	 */
1965 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966 
1967 	/*
1968 	 * done: pmap module is up (and ready for business)
1969 	 */
1970 
1971 	pmap_initialized = true;
1972 }
1973 
1974 #ifndef XENPV
1975 /*
1976  * pmap_cpu_init_late: perform late per-CPU initialization.
1977  */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981 	/*
1982 	 * The BP already has its own PD page, allocated during early
1983 	 * MD startup.
1984 	 */
1985 	if (ci == &cpu_info_primary)
1986 		return;
1987 #ifdef PAE
1988 	cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992 
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996 
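/*
 * pmap_vpage_cpualloc: allocate this CPU's temporary VA window.  The VAs are
 * aligned so that their PTEs fill whole cache lines (see the CTASSERTs
 * above), which keeps each CPU's vpage PTEs on its own cache line.
 */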
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000 	bool primary = (ci == &cpu_info_primary);
2001 	size_t i, npages;
2002 	vaddr_t vabase;
2003 	vsize_t vrange;
2004 
2005 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006 	KASSERT(npages >= VPAGE_MAX);
2007 	vrange = npages * PAGE_SIZE;
2008 
2009 	if (primary) {
2010 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011 			/* Waste some pages to align properly */
2012 		}
2013 		/* The base is aligned, allocate the rest (contiguous) */
2014 		pmap_bootstrap_valloc(npages - 1);
2015 	} else {
2016 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017 		    UVM_KMF_VAONLY);
2018 		if (vabase == 0) {
2019 			panic("%s: failed to allocate tmp VA for CPU %d\n",
2020 			    __func__, cpu_index(ci));
2021 		}
2022 	}
2023 
2024 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025 
2026 	for (i = 0; i < VPAGE_MAX; i++) {
2027 		ci->vpage[i] = vabase + i * PAGE_SIZE;
2028 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029 	}
2030 }
2031 
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035 	if (ci == &cpu_info_primary) {
2036 		/* cpu0 already taken care of in pmap_bootstrap */
2037 		return;
2038 	}
2039 
2040 	pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043 
2044 /*
2045  * p v _ e n t r y   f u n c t i o n s
2046  */
2047 
2048 /*
2049  * pmap_pvp_ctor: pool_cache constructor for PV pages.
2050  */
2051 static int
2052 pmap_pvp_ctor(void *arg, void *obj, int flags)
2053 {
2054 	struct pv_page *pvp = (struct pv_page *)obj;
2055 	struct pv_entry *pve = (struct pv_entry *)obj + 1;
2056 	struct pv_entry *maxpve = pve + PVE_PER_PVP;
2057 
2058 	KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2059 	KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2060 
2061 	LIST_INIT(&pvp->pvp_pves);
2062 	pvp->pvp_nfree = PVE_PER_PVP;
2063 	pvp->pvp_pmap = NULL;
2064 
2065 	for (; pve < maxpve; pve++) {
2066 		LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2067 	}
2068 
2069 	return 0;
2070 }
2071 
2072 /*
2073  * pmap_pvp_dtor: pool_cache destructor for PV pages.
2074  */
2075 static void
2076 pmap_pvp_dtor(void *arg, void *obj)
2077 {
2078 	struct pv_page *pvp __diagused = obj;
2079 
2080 	KASSERT(pvp->pvp_pmap == NULL);
2081 	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2082 }
2083 
2084 /*
2085  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
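 *
 * PV entries are carved out of per-pmap PV pages.  Each PV page sits on
 * one of three lists according to how many free entries it still holds:
 * pm_pvp_full (all free), pm_pvp_part (some free) or pm_pvp_empty (none
 * free).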
2086  */
2087 static struct pv_entry *
2088 pmap_alloc_pv(struct pmap *pmap)
2089 {
2090 	struct pv_entry *pve;
2091 	struct pv_page *pvp;
2092 
2093 	KASSERT(mutex_owned(&pmap->pm_lock));
2094 
2095 	if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2096 		if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2097 			LIST_REMOVE(pvp, pvp_list);
2098 		} else {
2099 			pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2100 		}
2101 		if (__predict_false(pvp == NULL)) {
2102 			return NULL;
2103 		}
2104 		/* full -> part */
2105 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2106 		pvp->pvp_pmap = pmap;
2107 	}
2108 
2109 	KASSERT(pvp->pvp_pmap == pmap);
2110 	KASSERT(pvp->pvp_nfree > 0);
2111 
2112 	pve = LIST_FIRST(&pvp->pvp_pves);
2113 	LIST_REMOVE(pve, pve_list);
2114 	pvp->pvp_nfree--;
2115 
2116 	if (__predict_false(pvp->pvp_nfree == 0)) {
2117 		/* part -> empty */
2118 		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2119 		LIST_REMOVE(pvp, pvp_list);
2120 		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2121 	} else {
2122 		KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2123 	}
2124 
2125 	return pve;
2126 }
2127 
2128 /*
2129  * pmap_free_pv: delayed free of a PV entry.
2130  */
2131 static void
2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2133 {
2134 	struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2135 
2136 	KASSERT(mutex_owned(&pmap->pm_lock));
2137 	KASSERT(pvp->pvp_pmap == pmap);
2138 	KASSERT(pvp->pvp_nfree >= 0);
2139 
2140 	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2141 	pvp->pvp_nfree++;
2142 
2143 	if (__predict_false(pvp->pvp_nfree == 1)) {
2144 		/* empty -> part */
2145 		LIST_REMOVE(pvp, pvp_list);
2146 		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2147 	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2148 		/* part -> full */
2149 		LIST_REMOVE(pvp, pvp_list);
2150 		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2151 	}
2152 }
2153 
2154 /*
2155  * pmap_drain_pv: free full PV pages.
2156  */
2157 static void
2158 pmap_drain_pv(struct pmap *pmap)
2159 {
2160 	struct pv_page *pvp;
2161 
2162 	KASSERT(mutex_owned(&pmap->pm_lock));
2163 
2164 	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2165 		LIST_REMOVE(pvp, pvp_list);
2166 		KASSERT(pvp->pvp_pmap == pmap);
2167 		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2168 		pvp->pvp_pmap = NULL;
2169 		pool_cache_put(&pmap_pvp_cache, pvp);
2170 	}
2171 }
2172 
2173 /*
2174  * pmap_check_pv: verify that a {PTP, VA} pair is tracked or untracked by the page, as expected
2175  */
2176 static void
2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2178     vaddr_t va, bool tracked)
2179 {
2180 #ifdef DEBUG
2181 	struct pv_pte *pvpte;
2182 
2183 	PMAP_CHECK_PP(pp);
2184 
2185 	mutex_spin_enter(&pp->pp_lock);
2186 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2187 		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2188 			break;
2189 		}
2190 	}
2191 	mutex_spin_exit(&pp->pp_lock);
2192 
2193 	if (pvpte && !tracked) {
2194 		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2195 	} else if (!pvpte && tracked) {
2196 		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2197 	}
2198 #endif
2199 }
2200 
2201 /*
2202  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2203  *
2204  * => pmap must be locked
2205  */
2206 static struct pv_entry *
2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2208     const rb_tree_t *tree, const vaddr_t va)
2209 {
2210 	struct pv_entry *pve;
2211 	rb_node_t *node;
2212 
2213 	/*
2214 	 * Inlined lookup tailored to exactly what's needed here; it is
2215 	 * quite a bit faster than using rb_tree_find_node().
2216 	 */
2217 	for (node = tree->rbt_root;;) {
2218 		if (__predict_false(RB_SENTINEL_P(node))) {
2219 			return NULL;
2220 		}
2221 		pve = (struct pv_entry *)
2222 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2223 		if (pve->pve_pte.pte_va == va) {
2224 			KASSERT(pve->pve_pte.pte_ptp == ptp);
2225 			return pve;
2226 		}
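		/*
		 * Descend: rb_nodes[1] (right) when this node's VA is below
		 * the target, rb_nodes[0] (left) otherwise.
		 */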
2227 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
2228 	}
2229 }
2230 
2231 /*
2232  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2233  *
2234  * => a PV entry must be known present (doesn't check for existence)
2235  * => pmap must be locked
2236  */
2237 static struct pv_entry *
2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2239     const struct pmap_page * const old_pp, const vaddr_t va)
2240 {
2241 	struct pv_entry *pve;
2242 	const rb_tree_t *tree;
2243 
2244 	KASSERT(mutex_owned(&pmap->pm_lock));
2245 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2246 
2247 	/*
2248 	 * [This mostly deals with the case of process-private pages, i.e.
2249 	 * anonymous memory allocations or COW.]
2250 	 *
2251 	 * If the page is tracked with an embedded entry then the tree
2252 	 * lookup can be avoided.  It's safe to check for this specific
2253 	 * set of values without pp_lock because both will only ever be
2254 	 * set together for this pmap.
2255 	 *
2256 	 */
2257 	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2258 	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2259 		return NULL;
2260 	}
2261 
2262 	/*
2263 	 * [This mostly deals with shared mappings, for example shared libs
2264 	 * and executables.]
2265 	 *
2266 	 * Optimise for pmap_remove_ptes() which works by ascending scan:
2267 	 * look at the lowest numbered node in the tree first.  The tree is
2268 	 * known non-empty because of the check above.  For short lived
2269 	 * processes where pmap_remove() isn't used much this gets close to
2270 	 * a 100% hit rate.
2271 	 */
2272 	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273 	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274 	pve = (struct pv_entry *)
2275 	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276 	    offsetof(struct pv_entry, pve_rb));
2277 	if (__predict_true(pve->pve_pte.pte_va == va)) {
2278 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2279 		return pve;
2280 	}
2281 
2282 	/* Search the RB tree for the key (uncommon). */
2283 	return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285 
2286 /*
2287  * pmap_enter_pv: enter a mapping onto a pmap_page list
2288  *
2289  * => pmap must be locked
2290  * => does NOT insert dynamic entries into the tree (pmap_enter() does later)
2291  */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297 	struct pv_entry *pve;
2298 	int error;
2299 
2300 	KASSERT(mutex_owned(&pmap->pm_lock));
2301 	KASSERT(ptp_to_pmap(ptp) == pmap);
2302 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2303 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304 	PMAP_CHECK_PP(pp);
2305 
2306 	/*
2307 	 * If entering the same page and it's already tracked with an
2308 	 * embedded entry, we can avoid the expense below.  It's safe
2309 	 * to check for this very specific set of values without a lock
2310 	 * because both will only ever be set together for this pmap.
2311 	 */
2312 	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313 	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314 		*samepage = true;
2315 		pmap_check_pv(pmap, ptp, pp, va, true);
2316 		return 0;
2317 	}
2318 
2319 	/*
2320 	 * Check for an existing dynamic mapping at this address.  If it's
2321 	 * for the same page, then it will be reused and nothing needs to be
2322 	 * changed.
2323 	 */
2324 	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325 	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326 		*samepage = true;
2327 		pmap_check_pv(pmap, ptp, pp, va, true);
2328 		return 0;
2329 	}
2330 
2331 	/*
2332 	 * Need to put a new mapping in place.  Grab a spare pv_entry in
2333 	 * case it's needed; won't know for sure until the lock is taken.
2334 	 */
2335 	if (pmap->pm_pve == NULL) {
2336 		pmap->pm_pve = pmap_alloc_pv(pmap);
2337 	}
2338 
2339 	error = 0;
2340 	pmap_check_pv(pmap, ptp, pp, va, false);
2341 	mutex_spin_enter(&pp->pp_lock);
2342 	if (!pv_pte_embedded(pp)) {
2343 		/*
2344 		 * Embedded PV tracking available - easy.
2345 		 */
2346 		pp->pp_pte.pte_ptp = ptp;
2347 		pp->pp_pte.pte_va = va;
2348 		*new_embedded = true;
2349 	} else if (__predict_false(pmap->pm_pve == NULL)) {
2350 		/*
2351 		 * No memory.
2352 		 */
2353 		error = ENOMEM;
2354 	} else {
2355 		/*
2356 		 * Install new pv_entry on the page.
2357 		 */
2358 		pve = pmap->pm_pve;
2359 		pmap->pm_pve = NULL;
2360 		*new_pve = pve;
2361 		pve->pve_pte.pte_ptp = ptp;
2362 		pve->pve_pte.pte_va = va;
2363 		pve->pve_pp = pp;
2364 		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2365 	}
2366 	mutex_spin_exit(&pp->pp_lock);
2367 	if (error == 0) {
2368 		pmap_check_pv(pmap, ptp, pp, va, true);
2369 	}
2370 
2371 	return error;
2372 }
2373 
2374 /*
2375  * pmap_remove_pv: try to remove a mapping from a pv_list
2376  *
2377  * => pmap must be locked
2378  * => removes dynamic entries from tree and frees them
2379  * => caller should adjust ptp's wire_count and free PTP if needed
2380  */
2381 static void
2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2383     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2384 {
2385 	rb_tree_t *tree = (ptp != NULL ?
2386 	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2387 
2388 	KASSERT(mutex_owned(&pmap->pm_lock));
2389 	KASSERT(ptp_to_pmap(ptp) == pmap);
2390 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2391 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2392 	KASSERT(ptp != NULL || pmap == pmap_kernel());
2393 
2394 	pmap_check_pv(pmap, ptp, pp, va, true);
2395 
2396 	if (pve == NULL) {
2397 		mutex_spin_enter(&pp->pp_lock);
2398 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2399 		KASSERT(pp->pp_pte.pte_va == va);
2400 		pp->pp_attrs |= oattrs;
2401 		pp->pp_pte.pte_ptp = NULL;
2402 		pp->pp_pte.pte_va = 0;
2403 		mutex_spin_exit(&pp->pp_lock);
2404 	} else {
2405 		mutex_spin_enter(&pp->pp_lock);
2406 		KASSERT(pp->pp_pte.pte_ptp != ptp ||
2407 		    pp->pp_pte.pte_va != va);
2408 		KASSERT(pve->pve_pte.pte_ptp == ptp);
2409 		KASSERT(pve->pve_pte.pte_va == va);
2410 		KASSERT(pve->pve_pp == pp);
2411 		pp->pp_attrs |= oattrs;
2412 		LIST_REMOVE(pve, pve_list);
2413 		mutex_spin_exit(&pp->pp_lock);
2414 
2415 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2416 		rb_tree_remove_node(tree, pve);
2417 #ifdef DIAGNOSTIC
2418 		memset(pve, 0, sizeof(*pve));
2419 #endif
2420 		pmap_free_pv(pmap, pve);
2421 	}
2422 
2423 	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2424 	pmap_check_pv(pmap, ptp, pp, va, false);
2425 }
2426 
2427 /*
2428  * p t p   f u n c t i o n s
2429  */
2430 
2431 static struct vm_page *
2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2433 {
2434 	int lidx = level - 1;
2435 	off_t off = ptp_va2o(va, level);
2436 	struct vm_page *pg;
2437 
2438 	KASSERT(mutex_owned(&pmap->pm_lock));
2439 
2440 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2441 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2442 		pg = pmap->pm_ptphint[lidx];
2443 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2444 		return pg;
2445 	}
2446 	PMAP_DUMMY_LOCK(pmap);
2447 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2448 	PMAP_DUMMY_UNLOCK(pmap);
2449 	if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2450 		/* This page is queued to be freed - ignore. */
2451 		pg = NULL;
2452 	}
2453 	if (pg != NULL) {
2454 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2455 	}
2456 	pmap->pm_ptphint[lidx] = pg;
2457 	return pg;
2458 }
2459 
2460 static inline void
2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2462 {
2463 	int lidx;
2464 
2465 	KASSERT(ptp->wire_count <= 1);
2466 	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2467 
2468 	lidx = level - 1;
2469 	pmap_stats_update(pmap, -ptp->wire_count, 0);
2470 	if (pmap->pm_ptphint[lidx] == ptp)
2471 		pmap->pm_ptphint[lidx] = NULL;
2472 	ptp->wire_count = 0;
2473 	ptp->uanon = NULL;
2474 	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2475 
2476 	/*
2477 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
2478 	 * the page from the uvm_object, as that can take further locks
2479 	 * (intolerable right now because the PTEs are likely mapped in).
2480 	 * Instead mark the PTP as free and if we bump into it again, we'll
2481 	 * either ignore or reuse (depending on what's useful at the time).
2482 	 */
2483 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2484 }
2485 
2486 static void
2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2488 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2489 {
2490 	unsigned long index;
2491 	int level;
2492 	vaddr_t invaladdr;
2493 	pd_entry_t opde;
2494 
2495 	KASSERT(pmap != pmap_kernel());
2496 	KASSERT(mutex_owned(&pmap->pm_lock));
2497 	KASSERT(kpreempt_disabled());
2498 
2499 	level = 1;
2500 	do {
2501 		index = pl_i(va, level + 1);
2502 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2503 
2504 		/*
2505 		 * On Xen-amd64 or SVS, we need to sync the top level page
2506 		 * directory on each CPU.
2507 		 */
2508 #if defined(XENPV) && defined(__x86_64__)
2509 		if (level == PTP_LEVELS - 1) {
2510 			xen_kpm_sync(pmap, index);
2511 		}
2512 #elif defined(SVS)
2513 		if (svs_enabled && level == PTP_LEVELS - 1 &&
2514 		    pmap_is_user(pmap)) {
2515 			svs_pmap_sync(pmap, index);
2516 		}
2517 #endif
2518 
2519 		invaladdr = level == 1 ? (vaddr_t)ptes :
2520 		    (vaddr_t)pdes[level - 2];
2521 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2522 		    opde, TLBSHOOT_FREE_PTP);
2523 
2524 #if defined(XENPV)
2525 		pmap_tlb_shootnow();
2526 #endif
2527 
2528 		pmap_freepage(pmap, ptp, level);
2529 		if (level < PTP_LEVELS - 1) {
2530 			ptp = pmap_find_ptp(pmap, va, level + 1);
2531 			ptp->wire_count--;
2532 			if (ptp->wire_count > 1)
2533 				break;
2534 		}
2535 	} while (++level < PTP_LEVELS);
2536 	pmap_pte_flush();
2537 }
2538 
2539 /*
2540  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2541  *
2542  * => pmap should NOT be pmap_kernel()
2543  * => pmap should be locked
2544  * => we are not touching any PTEs yet, so they need not be mapped in
2545  */
2546 static int
2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2548     int flags, struct vm_page **resultp)
2549 {
2550 	struct vm_page *ptp;
2551 	int i, aflags;
2552 	struct uvm_object *obj;
2553 	voff_t off;
2554 
2555 	KASSERT(pmap != pmap_kernel());
2556 	KASSERT(mutex_owned(&pmap->pm_lock));
2557 
2558 	/*
2559 	 * Loop through all page table levels allocating a page
2560 	 * for any level where we don't already have one.
2561 	 */
2562 	memset(pt, 0, sizeof(*pt));
2563 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2564 		UVM_PGA_ZERO;
2565 	for (i = PTP_LEVELS; i > 1; i--) {
2566 		obj = &pmap->pm_obj[i - 2];
2567 		off = ptp_va2o(va, i - 1);
2568 
2569 		PMAP_DUMMY_LOCK(pmap);
2570 		pt->pg[i] = uvm_pagelookup(obj, off);
2571 
2572 		if (pt->pg[i] == NULL) {
2573 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2574 			pt->alloced[i] = (pt->pg[i] != NULL);
2575 		} else if (pt->pg[i]->wire_count == 0) {
2576 			/* This page was queued to be freed; dequeue it. */
2577 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2578 			pt->alloced[i] = true;
2579 		}
2580 		PMAP_DUMMY_UNLOCK(pmap);
2581 		if (pt->pg[i] == NULL) {
2582 			pmap_unget_ptp(pmap, pt);
2583 			return ENOMEM;
2584 		} else if (pt->alloced[i]) {
2585 			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2586 			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2587 			    &pmap_rbtree_ops);
2588 			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2589 		}
2590 	}
2591 	ptp = pt->pg[2];
2592 	KASSERT(ptp != NULL);
2593 	*resultp = ptp;
2594 	pmap->pm_ptphint[0] = ptp;
2595 	return 0;
2596 }
2597 
2598 /*
2599  * pmap_install_ptp: install any freshly allocated PTPs
2600  *
2601  * => pmap should NOT be pmap_kernel()
2602  * => pmap should be locked
2603  * => PTEs must be mapped
2604  * => preemption must be disabled
2605  */
2606 static void
2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2608     pd_entry_t * const *pdes)
2609 {
2610 	struct vm_page *ptp;
2611 	unsigned long index;
2612 	pd_entry_t *pva;
2613 	paddr_t pa;
2614 	int i;
2615 
2616 	KASSERT(pmap != pmap_kernel());
2617 	KASSERT(mutex_owned(&pmap->pm_lock));
2618 	KASSERT(kpreempt_disabled());
2619 
2620 	/*
2621 	 * Now that we have all the pages looked up or allocated,
2622 	 * loop through again installing any new ones into the tree.
2623 	 */
2624 	for (i = PTP_LEVELS; i > 1; i--) {
2625 		index = pl_i(va, i);
2626 		pva = pdes[i - 2];
2627 
2628 		if (pmap_valid_entry(pva[index])) {
2629 			KASSERT(!pt->alloced[i]);
2630 			continue;
2631 		}
2632 
2633 		ptp = pt->pg[i];
2634 		ptp->flags &= ~PG_BUSY; /* never busy */
2635 		ptp->wire_count = 1;
2636 		pmap->pm_ptphint[i - 2] = ptp;
2637 		pa = VM_PAGE_TO_PHYS(ptp);
2638 		pmap_pte_set(&pva[index], (pd_entry_t)
2639 		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2640 
2641 		/*
2642 		 * On Xen-amd64 or SVS, we need to sync the top level page
2643 		 * directory on each CPU.
2644 		 */
2645 #if defined(XENPV) && defined(__x86_64__)
2646 		if (i == PTP_LEVELS) {
2647 			xen_kpm_sync(pmap, index);
2648 		}
2649 #elif defined(SVS)
2650 		if (svs_enabled && i == PTP_LEVELS &&
2651 		    pmap_is_user(pmap)) {
2652 			svs_pmap_sync(pmap, index);
2653 		}
2654 #endif
2655 
2656 		pmap_pte_flush();
2657 		pmap_stats_update(pmap, 1, 0);
2658 
2659 		/*
2660 		 * If we're not in the top level, increase the
2661 		 * wire count of the parent page.
2662 		 */
2663 		if (i < PTP_LEVELS) {
2664 			pt->pg[i + 1]->wire_count++;
2665 		}
2666 	}
2667 }
2668 
2669 /*
2670  * pmap_unget_ptp: free unused PTPs
2671  *
2672  * => pmap should NOT be pmap_kernel()
2673  * => pmap should be locked
2674  */
2675 static void
2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2677 {
2678 	int i;
2679 
2680 	KASSERT(pmap != pmap_kernel());
2681 	KASSERT(mutex_owned(&pmap->pm_lock));
2682 
2683 	for (i = PTP_LEVELS; i > 1; i--) {
2684 		if (!pt->alloced[i]) {
2685 			continue;
2686 		}
2687 		KASSERT(pt->pg[i]->wire_count == 0);
2688 		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2689 		pmap_freepage(pmap, pt->pg[i], i - 1);
2690 	}
2691 }
2692 
2693 /*
2694  * p m a p   l i f e c y c l e   f u n c t i o n s
2695  */
2696 
2697 /*
2698  * pmap_pdp_init: initialize a new PDP.
2699  */
2700 static void
2701 pmap_pdp_init(pd_entry_t *pdir)
2702 {
2703 	paddr_t pdirpa = 0;
2704 	vaddr_t object;
2705 	int i;
2706 
2707 #if !defined(XENPV) || !defined(__x86_64__)
2708 	int npde;
2709 #endif
2710 #ifdef XENPV
2711 	int s;
2712 #endif
2713 
2714 	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2715 
2716 	/*
2717 	 * NOTE: This is all done unlocked, but we will check afterwards
2718 	 * if we have raced with pmap_growkernel().
2719 	 */
2720 
2721 #if defined(XENPV) && defined(__x86_64__)
2722 	/* Fetch the physical address of the page directory */
2723 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2724 
2725 	/*
2726 	 * This pdir will NEVER be active in kernel mode, so mark
2727 	 * recursive entry invalid.
2728 	 */
2729 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2730 
2731 	/*
2732 	 * PDP constructed this way won't be for the kernel, hence we
2733 	 * don't put kernel mappings on Xen.
2734 	 *
2735 	 * But we need to make pmap_create() happy, so put a dummy
2736 	 * (without PTE_P) value at the right place.
2737 	 */
2738 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2739 	     (pd_entry_t)-1 & PTE_FRAME;
2740 #else /* XENPV && __x86_64__*/
2741 	object = (vaddr_t)pdir;
2742 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2743 		/* Fetch the physical address of the page directory */
2744 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2745 
2746 		/* Put in recursive PDE to map the PTEs */
2747 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2748 		    pmap_pg_nx;
2749 #ifndef XENPV
2750 		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2751 #endif
2752 	}
2753 
2754 	/* Copy the kernel's top level PDE */
2755 	npde = nkptp[PTP_LEVELS - 1];
2756 
2757 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2758 	    npde * sizeof(pd_entry_t));
2759 
2760 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2761 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2762 		pdir[idx] = PDP_BASE[idx];
2763 	}
2764 
2765 #ifdef __HAVE_PCPU_AREA
2766 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2767 #endif
2768 #ifdef __HAVE_DIRECT_MAP
2769 	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2770 #endif
2771 #ifdef KASAN
2772 	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2773 #endif
2774 #ifdef KMSAN
2775 	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2776 #endif
2777 #endif /* XENPV  && __x86_64__*/
2778 
2779 #ifdef XENPV
2780 	s = splvm();
2781 	object = (vaddr_t)pdir;
2782 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2783 	    VM_PROT_READ);
2784 	pmap_update(pmap_kernel());
2785 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2786 		/*
2787 		 * pin as L2/L4 page, we have to do the page with the
2788 		 * PDIR_SLOT_PTE entries last
2789 		 */
2790 #ifdef PAE
2791 		if (i == l2tol3(PDIR_SLOT_PTE))
2792 			continue;
2793 #endif
2794 
2795 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2796 #ifdef __x86_64__
2797 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2798 #else
2799 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2800 #endif
2801 	}
2802 #ifdef PAE
2803 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2804 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2805 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2806 #endif
2807 	splx(s);
2808 #endif /* XENPV */
2809 }
2810 
2811 /*
2812  * pmap_pdp_fini: destructor for the PDPs.
2813  */
2814 static void
2815 pmap_pdp_fini(pd_entry_t *pdir)
2816 {
2817 #ifdef XENPV
2818 	paddr_t pdirpa = 0;	/* XXX: GCC */
2819 	vaddr_t object = (vaddr_t)pdir;
2820 	int i;
2821 	int s = splvm();
2822 	pt_entry_t *pte;
2823 
2824 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2825 		/* fetch the physical address of the page directory. */
2826 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2827 		/* unpin page table */
2828 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2829 	}
2830 	object = (vaddr_t)pdir;
2831 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2832 		/* Set page RW again */
2833 		pte = kvtopte(object);
2834 		pmap_pte_set(pte, *pte | PTE_W);
2835 		xen_bcast_invlpg((vaddr_t)object);
2836 	}
2837 	splx(s);
2838 #endif  /* XENPV */
2839 }
2840 
2841 #ifdef PAE
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845 	return (void *)uvm_km_alloc(kernel_map,
2846 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847 	    ((flags & PR_WAITOK) ? UVM_KMF_WAITVA
2848 		: UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2849 	    UVM_KMF_WIRED);
2850 }
2851 
2852 static void
2853 pmap_pdp_free(struct pool *pp, void *v)
2854 {
2855 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2856 	    UVM_KMF_WIRED);
2857 }
2858 #endif /* PAE */
2859 
2860 /*
2861  * pmap_ctor: constructor for the pmap cache.
2862  */
2863 static int
2864 pmap_ctor(void *arg, void *obj, int flags)
2865 {
2866 	struct pmap *pmap = obj;
2867 	pt_entry_t p;
2868 	int i;
2869 
2870 	KASSERT((flags & PR_WAITOK) != 0);
2871 
2872 	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2873 	rw_init(&pmap->pm_dummy_lock);
2874 	kcpuset_create(&pmap->pm_cpus, true);
2875 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2876 #ifdef XENPV
2877 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2878 #endif
2879 	LIST_INIT(&pmap->pm_gc_ptp);
2880 	pmap->pm_pve = NULL;
2881 	LIST_INIT(&pmap->pm_pvp_full);
2882 	LIST_INIT(&pmap->pm_pvp_part);
2883 	LIST_INIT(&pmap->pm_pvp_empty);
2884 
2885 	/* allocate and init PDP */
2886 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2887 
2888 	for (;;) {
2889 		pmap_pdp_init(pmap->pm_pdir);
2890 		mutex_enter(&pmaps_lock);
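		/*
		 * With pmaps_lock held, check the highest kernel slot that is
		 * currently required.  If pmap_growkernel() ran while the PDP
		 * was being initialized unlocked above, that slot will still
		 * be zero and the initialization must be redone.
		 */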
2891 		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2892 		if (__predict_true(p != 0)) {
2893 			break;
2894 		}
2895 		mutex_exit(&pmaps_lock);
2896 	}
2897 
2898 	for (i = 0; i < PDP_SIZE; i++)
2899 		pmap->pm_pdirpa[i] =
2900 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2901 
2902 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2903 	mutex_exit(&pmaps_lock);
2904 
2905 	return 0;
2906 }
2907 
2908 /*
2909  * pmap_dtor: destructor for the pmap cache.
2910  */
2911 static void
2912 pmap_dtor(void *arg, void *obj)
2913 {
2914 	struct pmap *pmap = obj;
2915 
2916 	mutex_enter(&pmaps_lock);
2917 	LIST_REMOVE(pmap, pm_list);
2918 	mutex_exit(&pmaps_lock);
2919 
2920 	pmap_pdp_fini(pmap->pm_pdir);
2921 	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2922 	mutex_destroy(&pmap->pm_lock);
2923 	rw_destroy(&pmap->pm_dummy_lock);
2924 	kcpuset_destroy(pmap->pm_cpus);
2925 	kcpuset_destroy(pmap->pm_kernel_cpus);
2926 #ifdef XENPV
2927 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2928 #endif
2929 }
2930 
2931 /*
2932  * pmap_create: create a pmap object.
2933  */
2934 struct pmap *
2935 pmap_create(void)
2936 {
2937 	struct pmap *pmap;
2938 	int i;
2939 
2940 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2941 
2942 	/* init uvm_object */
2943 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2944 		uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2945 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2946 		pmap->pm_ptphint[i] = NULL;
2947 	}
2948 	pmap->pm_stats.wired_count = 0;
2949 	/* count the PDP allocd below */
2950 	pmap->pm_stats.resident_count = PDP_SIZE;
2951 #if !defined(__x86_64__)
2952 	pmap->pm_hiexec = 0;
2953 #endif
2954 
2955 	/* Used by NVMM and Xen */
2956 	pmap->pm_enter = NULL;
2957 	pmap->pm_extract = NULL;
2958 	pmap->pm_remove = NULL;
2959 	pmap->pm_sync_pv = NULL;
2960 	pmap->pm_pp_remove_ent = NULL;
2961 	pmap->pm_write_protect = NULL;
2962 	pmap->pm_unwire = NULL;
2963 	pmap->pm_tlb_flush = NULL;
2964 	pmap->pm_data = NULL;
2965 
2966 	/* init the LDT */
2967 	pmap->pm_ldt = NULL;
2968 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2969 
2970 	return pmap;
2971 }
2972 
2973 /*
2974  * pmap_check_ptps: verify that none of the pmap's page table objects
2975  * have any pages allocated to them.
2976  */
2977 static void
2978 pmap_check_ptps(struct pmap *pmap)
2979 {
2980 	int i;
2981 
2982 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2983 		KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2984 		    "pmap %p level %d still has %d pages",
2985 		    pmap, i, (int)pmap->pm_obj[i].uo_npages);
2986 	}
2987 }
2988 
2989 static void
2990 pmap_check_inuse(struct pmap *pmap)
2991 {
2992 #ifdef DEBUG
2993 	CPU_INFO_ITERATOR cii;
2994 	struct cpu_info *ci;
2995 
2996 	for (CPU_INFO_FOREACH(cii, ci)) {
2997 		if (ci->ci_pmap == pmap)
2998 			panic("destroying pmap being used");
2999 #if defined(XENPV) && defined(__x86_64__)
3000 		for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
3001 			if (pmap->pm_pdir[i] != 0 &&
3002 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
3003 				printf("pmap_destroy(%p) pmap_kernel %p "
3004 				    "curcpu %d cpu %d ci_pmap %p "
3005 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
3006 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
3007 				    pmap, pmap_kernel(), curcpu()->ci_index,
3008 				    ci->ci_index, ci->ci_pmap,
3009 				    i, ci->ci_kpm_pdir[i],
3010 				    i, pmap->pm_pdir[i]);
3011 				panic("%s: used pmap", __func__);
3012 			}
3013 		}
3014 #endif
3015 	}
3016 #endif /* DEBUG */
3017 }
3018 
3019 /*
3020  * pmap_destroy:  drop reference count on pmap.  free pmap if reference
3021  * count goes to zero.
3022  *
3023  * => we can be called from pmap_unmap_ptes() with a different, unrelated
3024  *    pmap's lock held.  be careful!
3025  */
3026 void
3027 pmap_destroy(struct pmap *pmap)
3028 {
3029 	int i;
3030 
3031 	/*
3032 	 * drop reference count and verify not in use.
3033 	 */
3034 
3035 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
3036 		return;
3037 	}
3038 	pmap_check_inuse(pmap);
3039 
3040 	/*
3041 	 * handle any deferred frees.
3042 	 */
3043 
3044 	mutex_enter(&pmap->pm_lock);
3045 	if (pmap->pm_pve != NULL) {
3046 		pmap_free_pv(pmap, pmap->pm_pve);
3047 		pmap->pm_pve = NULL;
3048 	}
3049 	pmap_drain_pv(pmap);
3050 	mutex_exit(&pmap->pm_lock);
3051 	pmap_update(pmap);
3052 
3053 	/*
3054 	 * Reference count is zero, free pmap resources and then free pmap.
3055 	 */
3056 
3057 	pmap_check_ptps(pmap);
3058 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3059 
3060 #ifdef USER_LDT
3061 	if (pmap->pm_ldt != NULL) {
3062 		/*
3063 		 * No need to switch the LDT; this address space is gone,
3064 		 * nothing is using it.
3065 		 *
3066 		 * No need to lock the pmap for ldt_free (or anything else),
3067 		 * we're the last one to use it.
3068 		 */
3069 		/* XXXAD can't take cpu_lock here - fix soon. */
3070 		mutex_enter(&cpu_lock);
3071 		ldt_free(pmap->pm_ldt_sel);
3072 		mutex_exit(&cpu_lock);
3073 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3074 		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3075 	}
3076 #endif
3077 
3078 	for (i = 0; i < PTP_LEVELS - 1; i++) {
3079 		uvm_obj_destroy(&pmap->pm_obj[i], false);
3080 	}
3081 	kcpuset_zero(pmap->pm_cpus);
3082 	kcpuset_zero(pmap->pm_kernel_cpus);
3083 #ifdef XENPV
3084 	kcpuset_zero(pmap->pm_xen_ptp_cpus);
3085 #endif
3086 
3087 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3088 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3089 	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3090 
3091 	pmap_check_ptps(pmap);
3092 	if (__predict_false(pmap->pm_enter != NULL)) {
3093 		/* XXX make this a different cache */
3094 		pool_cache_destruct_object(&pmap_cache, pmap);
3095 	} else {
3096 		pool_cache_put(&pmap_cache, pmap);
3097 	}
3098 }
3099 
3100 /*
3101  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3102  *
3103  * => caller must hold pmap's lock
3104  * => PTP must be mapped into KVA
3105  * => must be called with kernel preemption disabled
3106  * => does as little work as possible
3107  */
3108 static void
3109 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3110     vaddr_t startva, vaddr_t blkendva)
3111 {
3112 #ifndef XENPV
3113 	struct pv_entry *pve;
3114 	struct vm_page *pg;
3115 	struct pmap_page *pp;
3116 	pt_entry_t opte;
3117 	rb_tree_t *tree;
3118 	vaddr_t va;
3119 	int wired;
3120 	uint8_t oattrs;
3121 	u_int cnt;
3122 
3123 	KASSERT(mutex_owned(&pmap->pm_lock));
3124 	KASSERT(kpreempt_disabled());
3125 	KASSERT(pmap != pmap_kernel());
3126 	KASSERT(ptp->wire_count > 1);
3127 	KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3128 
3129 	/*
3130 	 * Start at the lowest entered VA, and scan until there are no more
3131 	 * PTEs in the PTPs.
3132 	 */
3133 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3134 	pve = RB_TREE_MIN(tree);
3135 	wired = 0;
3136 	va = (vaddr_t)ptp->uanon;
3137 	pte += ((va - startva) >> PAGE_SHIFT);
3138 
3139 	for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3140 		/*
3141 		 * No need for an atomic to clear the PTE.  Nothing else can
3142 		 * see the address space any more and speculative access (if
3143 		 * possible) won't modify.  Therefore there's no need to
3144 		 * track the accessed/dirty bits.
3145 		 */
3146 		opte = *pte;
3147 		if (!pmap_valid_entry(opte)) {
3148 			continue;
3149 		}
3150 
3151 		/*
3152 		 * Count the PTE.  If it's not for a managed mapping
3153 		 * there's nothing more to do.
3154 		 */
3155 		cnt--;
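		/* Accumulate -PTE_WIRED per wired mapping; divided back out below. */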
3156 		wired -= (opte & PTE_WIRED);
3157 		if ((opte & PTE_PVLIST) == 0) {
3158 #ifndef DOM0OPS
3159 			KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3160 			    "managed page without PTE_PVLIST for %#"
3161 			    PRIxVADDR, va);
3162 			KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3163 			    "pv-tracked page without PTE_PVLIST for %#"
3164 			    PRIxVADDR, va);
3165 #endif
3166 			KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3167 			    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3168 			    va) == NULL);
3169 			continue;
3170 		}
3171 
3172 		/*
3173 		 * "pve" now points to the lowest (by VA) dynamic PV entry
3174 		 * in the PTP.  If it's for this VA, take advantage of it to
3175 		 * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3176 		 * tree by skipping to the next VA in the tree whenever
3177 		 * there is a match here.  The tree will be cleared out in
3178 		 * one pass before return to pmap_remove_all().
3179 		 */
3180 		oattrs = pmap_pte_to_pp_attrs(opte);
3181 		if (pve != NULL && pve->pve_pte.pte_va == va) {
3182 			pp = pve->pve_pp;
3183 			KASSERT(pve->pve_pte.pte_ptp == ptp);
3184 			KASSERT(pp->pp_pte.pte_ptp != ptp ||
3185 			    pp->pp_pte.pte_va != va);
3186 			mutex_spin_enter(&pp->pp_lock);
3187 			pp->pp_attrs |= oattrs;
3188 			LIST_REMOVE(pve, pve_list);
3189 			mutex_spin_exit(&pp->pp_lock);
3190 
3191 			/*
3192 			 * pve won't be touched again until pmap_drain_pv(),
3193 			 * so it's still safe to traverse the tree.
3194 			 */
3195 			pmap_free_pv(pmap, pve);
3196 			pve = RB_TREE_NEXT(tree, pve);
3197 			continue;
3198 		}
3199 
3200 		/*
3201 		 * No entry in the tree so it must be embedded.  Look up the
3202 		 * page and cancel the embedded entry.
3203 		 */
3204 		if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3205 			pp = VM_PAGE_TO_PP(pg);
3206 		} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3207 			paddr_t pa = pmap_pte2pa(opte);
3208 			panic("%s: PTE_PVLIST with pv-untracked page"
3209 			    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3210 			    "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3211 		}
3212 		mutex_spin_enter(&pp->pp_lock);
3213 		KASSERT(pp->pp_pte.pte_ptp == ptp);
3214 		KASSERT(pp->pp_pte.pte_va == va);
3215 		pp->pp_attrs |= oattrs;
3216 		pp->pp_pte.pte_ptp = NULL;
3217 		pp->pp_pte.pte_va = 0;
3218 		mutex_spin_exit(&pp->pp_lock);
3219 	}
3220 
3221 	/* PTP now empty - adjust the tree & stats to match. */
3222 	pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3223 	ptp->wire_count = 1;
3224 #ifdef DIAGNOSTIC
3225 	rb_tree_init(tree, &pmap_rbtree_ops);
3226 #endif
3227 #else	/* !XENPV */
3228 	/*
3229 	 * XXXAD For XEN, it's not clear to me that we can do this, because
3230 	 * I guess the hypervisor keeps track of PTEs too.
3231 	 */
3232 	pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3233 #endif	/* !XENPV */
3234 }
3235 
3236 /*
3237  * pmap_remove_all: remove all mappings from pmap in bulk.
3238  *
3239  * Ordinarily when removing mappings it's important to hold the UVM object's
3240  * lock, so that pages do not gain a new identity while retaining stale TLB
3241  * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3242  * Here it's known that the address space is no longer visible to any user
3243  * process, so we don't need to worry about that.
3244  */
3245 bool
3246 pmap_remove_all(struct pmap *pmap)
3247 {
3248 	struct vm_page *ptps[32];
3249 	vaddr_t va, blkendva;
3250 	struct pmap *pmap2;
3251 	pt_entry_t *ptes;
3252 	pd_entry_t pde __diagused;
3253 	pd_entry_t * const *pdes;
3254 	int lvl __diagused, i, n;
3255 
3256 	/* XXX Can't handle EPT just yet. */
3257 	if (pmap->pm_remove != NULL) {
3258 		return false;
3259 	}
3260 
3261 	for (;;) {
3262 		/* Fetch a block of PTPs from tree. */
3263 		mutex_enter(&pmap->pm_lock);
3264 		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3265 		    (void **)ptps, __arraycount(ptps), false);
3266 		if (n == 0) {
3267 			mutex_exit(&pmap->pm_lock);
3268 			break;
3269 		}
3270 
3271 		/* Remove all mappings in the set of PTPs. */
3272 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3273 		for (i = 0; i < n; i++) {
3274 			if (ptps[i]->wire_count == 0) {
3275 				/* It's dead: pmap_update() will expunge. */
3276 				continue;
3277 			}
3278 
3279 			/* Determine range of block. */
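			/* (ptp_va2o() in reverse: the PTP's offset encodes its base VA.) */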
3280 			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3281 			blkendva = x86_round_pdr(va + 1);
3282 
3283 			/* Make sure everything squares up... */
3284 			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3285 			KASSERT(lvl == 1);
3286 			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3287 
3288 			/* Zap! */
3289 			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3290 			    blkendva);
3291 
3292 			/* PTP should now be unused - free it. */
3293 			KASSERT(ptps[i]->wire_count == 1);
3294 			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3295 		}
3296 		pmap_unmap_ptes(pmap, pmap2);
3297 		pmap_drain_pv(pmap);
3298 		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3299 		mutex_exit(&pmap->pm_lock);
3300 
3301 		/* Process deferred frees. */
3302 		pmap_update(pmap);
3303 
3304 		/* A breathing point. */
3305 		preempt_point();
3306 	}
3307 
3308 	/* Verify that the pmap is now completely empty. */
3309 	pmap_check_ptps(pmap);
3310 	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3311 	    "pmap %p not empty", pmap);
3312 
3313 	return true;
3314 }
3315 
3316 #if defined(PMAP_FORK)
3317 /*
3318  * pmap_fork: perform any necessary data structure manipulation when
3319  * a VM space is forked.
3320  */
3321 void
3322 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
3323 {
3324 #ifdef USER_LDT
3325 	union descriptor *new_ldt;
3326 	int sel;
3327 
3328 	if (__predict_true(pmap1->pm_ldt == NULL)) {
3329 		return;
3330 	}
3331 
3332 	/*
3333 	 * Copy the LDT into the new process.
3334 	 *
3335 	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
3336 	 * we'll retry. This will starve if there's a stream of LDT changes
3337 	 * in another thread but that should not happen.
3338 	 */
3339 
3340 retry:
3341 	if (pmap1->pm_ldt != NULL) {
3342 		/* Allocate space for the new process's LDT */
3343 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
3344 		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
3345 		if (new_ldt == NULL) {
3346 			printf("WARNING: %s: unable to allocate LDT space\n",
3347 			    __func__);
3348 			return;
3349 		}
3350 		mutex_enter(&cpu_lock);
3351 		/* Get a GDT slot for it */
3352 		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
3353 		if (sel == -1) {
3354 			mutex_exit(&cpu_lock);
3355 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3356 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3357 			printf("WARNING: %s: unable to allocate LDT selector\n",
3358 			    __func__);
3359 			return;
3360 		}
3361 	} else {
3362 		/* Wasn't anything there after all. */
3363 		new_ldt = NULL;
3364 		sel = -1;
3365 		mutex_enter(&cpu_lock);
3366 	}
3367 
3368 	/*
3369 	 * Now that we have cpu_lock, ensure the LDT status is the same.
3370 	 */
3371 	if (pmap1->pm_ldt != NULL) {
3372 		if (new_ldt == NULL) {
3373 			/* A wild LDT just appeared. */
3374 			mutex_exit(&cpu_lock);
3375 			goto retry;
3376 		}
3377 
3378 		/* Copy the LDT data and install it in pmap2 */
3379 		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
3380 		pmap2->pm_ldt = new_ldt;
3381 		pmap2->pm_ldt_sel = sel;
3382 		mutex_exit(&cpu_lock);
3383 	} else {
3384 		if (new_ldt != NULL) {
3385 			/* The LDT disappeared, drop what we did. */
3386 			ldt_free(sel);
3387 			mutex_exit(&cpu_lock);
3388 			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
3389 			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3390 			return;
3391 		}
3392 
3393 		/* We're good, just leave. */
3394 		mutex_exit(&cpu_lock);
3395 	}
3396 #endif /* USER_LDT */
3397 }
3398 #endif /* PMAP_FORK */
3399 
3400 #ifdef USER_LDT
3401 
3402 /*
3403  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3404  * is active, reload LDTR.
3405  */
3406 static void
3407 pmap_ldt_xcall(void *arg1, void *arg2)
3408 {
3409 	struct pmap *pm;
3410 
3411 	kpreempt_disable();
3412 	pm = arg1;
3413 	if (curcpu()->ci_pmap == pm) {
3414 #if defined(SVS)
3415 		if (svs_enabled) {
3416 			svs_ldt_sync(pm);
3417 		} else
3418 #endif
3419 		lldt(pm->pm_ldt_sel);
3420 	}
3421 	kpreempt_enable();
3422 }
3423 
3424 /*
3425  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3426  * in the new selector on all CPUs.
3427  */
3428 void
3429 pmap_ldt_sync(struct pmap *pm)
3430 {
3431 	uint64_t where;
3432 
3433 	KASSERT(mutex_owned(&cpu_lock));
3434 
3435 	pmap_ldt_evcnt.ev_count++;
3436 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3437 	xc_wait(where);
3438 }
3439 
3440 /*
3441  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3442  * restore the default.
3443  */
3444 void
3445 pmap_ldt_cleanup(struct lwp *l)
3446 {
3447 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3448 	union descriptor *ldt;
3449 	int sel;
3450 
3451 	if (__predict_true(pmap->pm_ldt == NULL)) {
3452 		return;
3453 	}
3454 
3455 	mutex_enter(&cpu_lock);
3456 	if (pmap->pm_ldt != NULL) {
3457 		sel = pmap->pm_ldt_sel;
3458 		ldt = pmap->pm_ldt;
3459 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3460 		pmap->pm_ldt = NULL;
3461 		pmap_ldt_sync(pmap);
3462 		ldt_free(sel);
3463 		uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3464 		    UVM_KMF_WIRED);
3465 	}
3466 	mutex_exit(&cpu_lock);
3467 }
3468 #endif /* USER_LDT */
3469 
3470 /*
3471  * pmap_activate: activate a process' pmap
3472  *
3473  * => must be called with kernel preemption disabled
3474  * => if lwp is the curlwp, then set ci_want_pmapload so that
3475  *    actual MMU context switch will be done by pmap_load() later
3476  */
3477 void
3478 pmap_activate(struct lwp *l)
3479 {
3480 	struct cpu_info *ci;
3481 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3482 
3483 	KASSERT(kpreempt_disabled());
3484 
3485 	ci = curcpu();
3486 
3487 	if (l != ci->ci_curlwp)
3488 		return;
3489 
3490 	KASSERT(ci->ci_want_pmapload == 0);
3491 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3492 
3493 	/*
3494 	 * no need to switch to kernel vmspace because
3495 	 * it's a subset of any vmspace.
3496 	 */
3497 
3498 	if (pmap == pmap_kernel()) {
3499 		ci->ci_want_pmapload = 0;
3500 		return;
3501 	}
3502 
3503 	ci->ci_want_pmapload = 1;
3504 }
3505 
3506 #if defined(XENPV) && defined(__x86_64__)
3507 #define	KASSERT_PDIRPA(pmap) \
3508 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
3509 	    pmap == pmap_kernel())
3510 #elif defined(PAE)
3511 #define	KASSERT_PDIRPA(pmap) \
3512 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
3513 #elif !defined(XENPV)
3514 #define	KASSERT_PDIRPA(pmap) \
3515 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
3516 #else
3517 #define	KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
3518 #endif
3519 
3520 /*
3521  * pmap_reactivate: try to regain a reference to the pmap.
3522  *
3523  * => Must be called with kernel preemption disabled.
3524  */
3525 static void
3526 pmap_reactivate(struct pmap *pmap)
3527 {
3528 	struct cpu_info * const ci = curcpu();
3529 	const cpuid_t cid = cpu_index(ci);
3530 
3531 	KASSERT(kpreempt_disabled());
3532 	KASSERT_PDIRPA(pmap);
3533 
3534 	/*
3535 	 * If we still have a lazy reference to this pmap, we can assume
3536 	 * that there was no TLB shootdown for this pmap in the meantime.
3537 	 *
3538 	 * The order of events here is important as we must synchronize
3539 	 * with TLB shootdown interrupts.  Declare interest in invalidations
3540 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3541 	 * change only when the state is TLBSTATE_LAZY.
3542 	 */
3543 
3544 	ci->ci_tlbstate = TLBSTATE_VALID;
3545 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3546 
3547 	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3548 		/* We have the reference, state is valid. */
3549 	} else {
3550 		/*
3551 		 * Must reload the TLB: the pmap was changed while it
3552 		 * was deactivated on this CPU.
3553 		 */
3554 		kcpuset_atomic_set(pmap->pm_cpus, cid);
3555 
3556 		tlbflush();
3557 	}
3558 }
3559 
3560 /*
3561  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3562  * and relevant LDT info.
3563  *
3564  * Ensures that the current process' pmap is loaded on the current CPU's
3565  * MMU and that there are no stale TLB entries.
3566  *
3567  * => The caller should disable kernel preemption or do check-and-retry
3568  *    to prevent a preemption from undoing our efforts.
3569  * => This function may block.
3570  */
3571 void
3572 pmap_load(void)
3573 {
3574 	struct cpu_info *ci;
3575 	struct pmap *pmap, *oldpmap;
3576 	struct lwp *l;
3577 	uint64_t pctr;
3578 	int ilevel __diagused;
3579 	u_long psl __diagused;
3580 
3581 	kpreempt_disable();
3582  retry:
3583 	ci = curcpu();
3584 	if (!ci->ci_want_pmapload) {
3585 		kpreempt_enable();
3586 		return;
3587 	}
3588 	l = ci->ci_curlwp;
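	/* Snapshot lwp_pctr(); if it changes below we blocked or were preempted. */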
3589 	pctr = lwp_pctr();
3590 	__insn_barrier();
3591 
3592 	/* should be able to take ipis. */
3593 	KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
3594 #ifdef XENPV
3595 	/* Check that interrupts are enabled (i.e., no events are masked) */
3596 	KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
3597 #else
3598 	KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
3599 #endif
3600 
3601 	KASSERT(l != NULL);
3602 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3603 	KASSERT(pmap != pmap_kernel());
3604 	oldpmap = ci->ci_pmap;
3605 
3606 	if (pmap == oldpmap) {
3607 		pmap_reactivate(pmap);
3608 		ci->ci_want_pmapload = 0;
3609 		kpreempt_enable();
3610 		return;
3611 	}
3612 
3613 	/*
3614 	 * Acquire a reference to the new pmap and perform the switch.
3615 	 */
3616 
3617 	pmap_reference(pmap);
3618 	pmap_load1(l, pmap, oldpmap);
3619 	ci->ci_want_pmapload = 0;
3620 
3621 	/*
3622 	 * we're now running with the new pmap.  drop the reference
3623 	 * to the old pmap.  if we block, we need to go around again.
3624 	 */
3625 
3626 	pmap_destroy(oldpmap);
3627 	__insn_barrier();
3628 	if (lwp_pctr() != pctr) {
3629 		goto retry;
3630 	}
3631 
3632 	kpreempt_enable();
3633 }
3634 
3635 /*
3636  * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3637  * pmap_load().  It's critically important that this function does not
3638  * block.
3639  */
3640 static void
3641 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3642 {
3643 	struct cpu_info *ci;
3644 	struct pcb *pcb;
3645 	cpuid_t cid;
3646 
3647 	KASSERT(kpreempt_disabled());
3648 
3649 	pcb = lwp_getpcb(l);
3650 	ci = l->l_cpu;
3651 	cid = cpu_index(ci);
3652 
3653 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3654 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3655 
3656 	KASSERT_PDIRPA(oldpmap);
3657 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3658 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3659 
3660 	/*
3661 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3662 	 * with TLB shootdown interrupts, so set the state VALID first,
3663 	 * then register us for shootdown events on this pmap.
3664 	 */
3665 	ci->ci_tlbstate = TLBSTATE_VALID;
3666 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3667 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3668 	ci->ci_pmap = pmap;
3669 
3670 	/*
3671 	 * update tss.  now that we have registered for invalidations
3672 	 * from other CPUs, we're good to load the page tables.
3673 	 */
3674 #ifdef PAE
3675 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3676 #else
3677 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3678 #endif
3679 
3680 #ifdef i386
3681 #ifndef XENPV
3682 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3683 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3684 #endif
3685 #endif
3686 
3687 #if defined(SVS) && defined(USER_LDT)
3688 	if (svs_enabled) {
3689 		svs_ldt_sync(pmap);
3690 	} else
3691 #endif
3692 	lldt(pmap->pm_ldt_sel);
3693 
3694 	cpu_load_pmap(pmap, oldpmap);
3695 }
3696 
3697 /*
3698  * pmap_deactivate: deactivate a process' pmap.
3699  *
3700  * => Must be called with kernel preemption disabled (high IPL is enough).
3701  */
3702 void
3703 pmap_deactivate(struct lwp *l)
3704 {
3705 	struct pmap *pmap;
3706 	struct cpu_info *ci;
3707 
3708 	KASSERT(kpreempt_disabled());
3709 
3710 	if (l != curlwp) {
3711 		return;
3712 	}
3713 
3714 	/*
3715 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3716 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3717 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3718 	 * to kernel preemption or a blocking routine in between.
3719 	 */
3720 	pmap_tlb_shootnow();
3721 
3722 	ci = curcpu();
3723 
3724 	if (ci->ci_want_pmapload) {
3725 		/*
3726 		 * ci_want_pmapload means that our pmap is not loaded on
3727 		 * the CPU, or that the TLB might be stale.  note that
3728 		 * pmap_kernel() is always considered loaded.
3729 		 */
3730 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3731 		    != pmap_kernel());
3732 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3733 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3734 
3735 		/*
3736 		 * userspace has not been touched.
3737 		 * nothing to do here.
3738 		 */
3739 
3740 		ci->ci_want_pmapload = 0;
3741 		return;
3742 	}
3743 
3744 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3745 
3746 	if (pmap == pmap_kernel()) {
3747 		return;
3748 	}
3749 
3750 	KASSERT_PDIRPA(pmap);
3751 	KASSERT(ci->ci_pmap == pmap);
3752 
3753 	/*
3754 	 * we aren't interested in TLB invalidations for this pmap,
3755 	 * at least for the time being.
3756 	 */
3757 
3758 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3759 	ci->ci_tlbstate = TLBSTATE_LAZY;
3760 }
3761 
3762 #ifdef EFI_RUNTIME
3763 
3764 extern struct pmap *efi_runtime_pmap;
3765 
3766 /*
3767  * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3768  * for an unprivileged user process
3769  */
3770 bool
3771 pmap_is_user(struct pmap *pmap)
3772 {
3773 
3774 	KASSERT(pmap != pmap_kernel());
3775 	return (pmap != efi_runtime_pmap);
3776 }
3777 
3778 /*
3779  * pmap_activate_sync: synchronously activate specified pmap.
3780  *
3781  * => Must be called with kernel preemption disabled (high IPL is enough).
3782  * => Must not sleep before pmap_deactivate_sync.
3783  */
3784 void *
3785 pmap_activate_sync(struct pmap *pmap)
3786 {
3787 	struct cpu_info *ci = curcpu();
3788 	struct pmap *oldpmap = ci->ci_pmap;
3789 	unsigned cid = cpu_index(ci);
3790 
3791 	KASSERT(kpreempt_disabled());
3792 	KASSERT(pmap != pmap_kernel());
3793 
3794 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3795 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3796 
3797 	if (oldpmap) {
3798 		KASSERT_PDIRPA(oldpmap);
3799 		kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3800 		kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3801 	}
3802 
3803 	ci->ci_tlbstate = TLBSTATE_VALID;
3804 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3805 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3806 	ci->ci_pmap = pmap;
3807 
3808 #if defined(SVS) && defined(USER_LDT)
3809 	if (svs_enabled) {
3810 		svs_ldt_sync(pmap);
3811 	} else
3812 #endif
3813 	lldt(pmap->pm_ldt_sel);
3814 
3815 	cpu_load_pmap(pmap, oldpmap);
3816 
3817 	return oldpmap;
3818 }
3819 
3820 /*
3821  * pmap_deactivate_sync: synchronously deactivate specified pmap and
3822  * restore whatever was active before pmap_activate_sync.
3823  *
3824  * => Must be called with kernel preemption disabled (high IPL is enough).
3825  * => Must not have slept since pmap_activate_sync.
3826  */
3827 void
3828 pmap_deactivate_sync(struct pmap *pmap, void *cookie)
3829 {
3830 	struct cpu_info *ci = curcpu();
3831 	struct pmap *oldpmap = cookie;
3832 	unsigned cid = cpu_index(ci);
3833 
3834 	KASSERT(kpreempt_disabled());
3835 	KASSERT(pmap != pmap_kernel());
3836 	KASSERT(ci->ci_pmap == pmap);
3837 
3838 	KASSERT_PDIRPA(pmap);
3839 
3840 	KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
3841 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3842 
3843 	pmap_tlb_shootnow();
3844 
3845 	kcpuset_atomic_clear(pmap->pm_cpus, cid);
3846 	kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
3847 
3848 	ci->ci_tlbstate = TLBSTATE_VALID;
3849 	ci->ci_pmap = oldpmap;
3850 	if (oldpmap) {
3851 		kcpuset_atomic_set(oldpmap->pm_cpus, cid);
3852 		kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
3853 #if defined(SVS) && defined(USER_LDT)
3854 		if (svs_enabled) {
3855 			svs_ldt_sync(oldpmap);
3856 		} else
3857 #endif
3858 		lldt(oldpmap->pm_ldt_sel);
3859 		cpu_load_pmap(oldpmap, pmap);
3860 	} else {
3861 		lcr3(pmap_pdirpa(pmap_kernel(), 0));
3862 	}
3863 }
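
/*
 * Illustrative sketch, not part of the original source: how a caller might
 * bracket an EFI runtime service call with the two functions above.  The
 * service invocation in the middle is hypothetical.
 *
 *	void *cookie;
 *
 *	kpreempt_disable();
 *	cookie = pmap_activate_sync(efi_runtime_pmap);
 *	... invoke the EFI runtime service, without sleeping ...
 *	pmap_deactivate_sync(efi_runtime_pmap, cookie);
 *	kpreempt_enable();
 */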
3864 
3865 #endif	/* EFI_RUNTIME */
3866 
3867 /*
3868  * some misc. functions
3869  */
3870 
3871 bool
3872 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3873     int *lastlvl)
3874 {
3875 	unsigned long index;
3876 	pd_entry_t pde;
3877 	int i;
3878 
3879 	for (i = PTP_LEVELS; i > 1; i--) {
3880 		index = pl_i(va, i);
3881 		pde = pdes[i - 2][index];
3882 		if ((pde & PTE_P) == 0) {
3883 			*lastlvl = i;
3884 			return false;
3885 		}
3886 		if (pde & PTE_PS)
3887 			break;
3888 	}
3889 	if (lastpde != NULL)
3890 		*lastpde = pde;
3891 	*lastlvl = i;
3892 	return true;
3893 }
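
/*
 * Illustrative sketch, not part of the original source: how the contract
 * above is consumed (compare pmap_extract() below).  A 2MB large page
 * stops the walk at level 2 with PTE_PS set in *lastpde; a normal 4KB
 * mapping reaches level 1 and the caller then reads the L1 PTE itself.
 *
 *	if (pmap_pdes_valid(va, pdes, &pde, &lvl) && lvl == 2)
 *		pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
 */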
3894 
3895 /*
3896  * pmap_extract: extract a PA for the given VA
3897  */
3898 bool
3899 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3900 {
3901 	pt_entry_t *ptes, pte;
3902 	pd_entry_t pde;
3903 	pd_entry_t * const *pdes;
3904 	struct pmap *pmap2;
3905 	paddr_t pa;
3906 	bool rv;
3907 	int lvl;
3908 
3909 	if (__predict_false(pmap->pm_extract != NULL)) {
3910 		return (*pmap->pm_extract)(pmap, va, pap);
3911 	}
3912 
3913 #ifdef __HAVE_DIRECT_MAP
3914 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3915 		if (pap != NULL) {
3916 			*pap = PMAP_DIRECT_UNMAP(va);
3917 		}
3918 		return true;
3919 	}
3920 #endif
3921 
3922 	rv = false;
3923 	pa = 0;
3924 
3925 	if (pmap != pmap_kernel()) {
3926 		mutex_enter(&pmap->pm_lock);
3927 	}
3928 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3929 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3930 		if (lvl == 2) {
3931 			pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3932 			rv = true;
3933 		} else {
3934 			KASSERT(lvl == 1);
3935 			pte = ptes[pl1_i(va)];
3936 			if (__predict_true((pte & PTE_P) != 0)) {
3937 				pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3938 				rv = true;
3939 			}
3940 		}
3941 	}
3942 	pmap_unmap_ptes(pmap, pmap2);
3943 	if (pmap != pmap_kernel()) {
3944 		mutex_exit(&pmap->pm_lock);
3945 	}
3946 	if (pap != NULL) {
3947 		*pap = pa;
3948 	}
3949 
3950 	return rv;
3951 }
3952 
3953 /*
3954  * vtophys: virtual address to physical address.  For use by
3955  * machine-dependent code only.
3956  */
3957 paddr_t
3958 vtophys(vaddr_t va)
3959 {
3960 	paddr_t pa;
3961 
3962 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3963 		return pa;
3964 	return 0;
3965 }
3966 
3967 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3968 
3969 #ifdef XENPV
3970 /*
3971  * vtomach: virtual address to machine address.  For use by
3972  * machine-dependent code only.
3973  */
3974 paddr_t
3975 vtomach(vaddr_t va)
3976 {
3977 	paddr_t pa;
3978 
3979 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3980 		return pa;
3981 	return 0;
3982 }
3983 #endif
3984 
3985 /*
3986  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3987  * determine the bounds of the kernel virtual address space.
3988  */
3989 void
3990 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3991 {
3992 	*startp = virtual_avail;
3993 	*endp = virtual_end;
3994 }
3995 
3996 void
3997 pmap_zero_page(paddr_t pa)
3998 {
3999 #if defined(__HAVE_DIRECT_MAP)
4000 	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
4001 #else
4002 #if defined(XENPV)
4003 	if (XEN_VERSION_SUPPORTED(3, 4)) {
4004 		xen_pagezero(pa);
4005 		return;
4006 	}
4007 #endif
4008 	struct cpu_info *ci;
4009 	pt_entry_t *zpte;
4010 	vaddr_t zerova;
4011 
4012 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
4013 
4014 	kpreempt_disable();
4015 
4016 	ci = curcpu();
4017 	zerova = ci->vpage[VPAGE_ZER];
4018 	zpte = ci->vpage_pte[VPAGE_ZER];
4019 
4020 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
4021 
4022 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
4023 	pmap_pte_flush();
4024 	pmap_update_pg(zerova);		/* flush TLB */
4025 
4026 	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
4027 
4028 #if defined(DIAGNOSTIC) || defined(XENPV)
4029 	pmap_pte_set(zpte, 0);				/* zap ! */
4030 	pmap_pte_flush();
4031 #endif
4032 
4033 	kpreempt_enable();
4034 #endif /* defined(__HAVE_DIRECT_MAP) */
4035 }
4036 
4037 void
4038 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
4039 {
4040 #if defined(__HAVE_DIRECT_MAP)
4041 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
4042 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
4043 
4044 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4045 #else
4046 #if defined(XENPV)
4047 	if (XEN_VERSION_SUPPORTED(3, 4)) {
4048 		xen_copy_page(srcpa, dstpa);
4049 		return;
4050 	}
4051 #endif
4052 	struct cpu_info *ci;
4053 	pt_entry_t *srcpte, *dstpte;
4054 	vaddr_t srcva, dstva;
4055 
4056 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
4057 
4058 	kpreempt_disable();
4059 
4060 	ci = curcpu();
4061 	srcva = ci->vpage[VPAGE_SRC];
4062 	dstva = ci->vpage[VPAGE_DST];
4063 	srcpte = ci->vpage_pte[VPAGE_SRC];
4064 	dstpte = ci->vpage_pte[VPAGE_DST];
4065 
4066 	KASSERT(*srcpte == 0 && *dstpte == 0);
4067 
4068 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
4069 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
4070 	pmap_pte_flush();
4071 	pmap_update_pg(srcva);
4072 	pmap_update_pg(dstva);
4073 
4074 	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4075 
4076 #if defined(DIAGNOSTIC) || defined(XENPV)
4077 	pmap_pte_set(srcpte, 0);
4078 	pmap_pte_set(dstpte, 0);
4079 	pmap_pte_flush();
4080 #endif
4081 
4082 	kpreempt_enable();
4083 #endif /* defined(__HAVE_DIRECT_MAP) */
4084 }
4085 
4086 static pt_entry_t *
4087 pmap_map_ptp(struct vm_page *ptp)
4088 {
4089 #ifdef __HAVE_DIRECT_MAP
4090 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
4091 #else
4092 	struct cpu_info *ci;
4093 	pt_entry_t *ptppte;
4094 	vaddr_t ptpva;
4095 
4096 	KASSERT(kpreempt_disabled());
4097 
4098 #ifndef XENPV
4099 	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
4100 #else
4101 	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
4102 #endif
4103 
4104 	ci = curcpu();
4105 	ptpva = ci->vpage[VPAGE_PTP];
4106 	ptppte = ci->vpage_pte[VPAGE_PTP];
4107 
4108 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
4109 
4110 	pmap_pte_flush();
4111 	pmap_update_pg(ptpva);
4112 
4113 	return (pt_entry_t *)ptpva;
4114 #endif
4115 }
4116 
4117 static void
4118 pmap_unmap_ptp(void)
4119 {
4120 #ifndef __HAVE_DIRECT_MAP
4121 #if defined(DIAGNOSTIC) || defined(XENPV)
4122 	struct cpu_info *ci;
4123 	pt_entry_t *pte;
4124 
4125 	KASSERT(kpreempt_disabled());
4126 
4127 	ci = curcpu();
4128 	pte = ci->vpage_pte[VPAGE_PTP];
4129 
4130 	if (*pte != 0) {
4131 		pmap_pte_set(pte, 0);
4132 		pmap_pte_flush();
4133 	}
4134 #endif
4135 #endif
4136 }
4137 
4138 static pt_entry_t *
4139 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4140 {
4141 
4142 	KASSERT(kpreempt_disabled());
4143 	if (pmap_is_curpmap(pmap)) {
4144 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4145 	}
4146 	KASSERT(ptp != NULL);
4147 	return pmap_map_ptp(ptp) + pl1_pi(va);
4148 }
4149 
4150 static void
4151 pmap_unmap_pte(void)
4152 {
4153 
4154 	KASSERT(kpreempt_disabled());
4155 
4156 	pmap_unmap_ptp();
4157 }
4158 
4159 /*
4160  * p m a p   r e m o v e   f u n c t i o n s
4161  *
4162  * functions that remove mappings
4163  */
4164 
4165 /*
4166  * pmap_remove_ptes: remove PTEs from a PTP
4167  *
4168  * => caller must hold pmap's lock
4169  * => PTP must be mapped into KVA
4170  * => PTP should be null if pmap == pmap_kernel()
4171  * => must be called with kernel preemption disabled
4172  * => TLB shootdowns are issued by pmap_remove_pte() as needed
4173  */
4174 static void
4175 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4176     vaddr_t startva, vaddr_t endva)
4177 {
4178 	pt_entry_t *pte = (pt_entry_t *)ptpva;
4179 
4180 	KASSERT(mutex_owned(&pmap->pm_lock));
4181 	KASSERT(kpreempt_disabled());
4182 
4183 	/*
4184 	 * mappings are very often sparse, so clip the given range to the
4185 	 * range of PTEs that are known present in the PTP.
4186 	 */
4187 	pmap_ptp_range_clip(ptp, &startva, &pte);
4188 
4189 	/*
4190 	 * note that ptpva points to the PTE that maps startva.   this may
4191 	 * or may not be the first PTE in the PTP.
4192 	 *
4193 	 * we loop through the PTP while there are still PTEs to look at
4194 	 * and the wire_count is greater than 1 (because we use the wire_count
4195 	 * to keep track of the number of real PTEs in the PTP).
4196 	 */
4197 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4198 		(void)pmap_remove_pte(pmap, ptp, pte, startva);
4199 		startva += PAGE_SIZE;
4200 		pte++;
4201 	}
4202 }
4203 
4204 /*
4205  * pmap_remove_pte: remove a single PTE from a PTP.
4206  *
4207  * => caller must hold pmap's lock
4208  * => PTP must be mapped into KVA
4209  * => PTP should be null if pmap == pmap_kernel()
4210  * => returns true if we removed a mapping
4211  * => must be called with kernel preemption disabled
4212  */
4213 static bool
4214 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4215     vaddr_t va)
4216 {
4217 	struct pv_entry *pve;
4218 	struct vm_page *pg;
4219 	struct pmap_page *pp;
4220 	pt_entry_t opte;
4221 
4222 	KASSERT(mutex_owned(&pmap->pm_lock));
4223 	KASSERT(kpreempt_disabled());
4224 
4225 	if (!pmap_valid_entry(*pte)) {
4226 		/* VA not mapped. */
4227 		return false;
4228 	}
4229 
4230 	/* Atomically save the old PTE and zap it. */
4231 	opte = pmap_pte_testset(pte, 0);
4232 	if (!pmap_valid_entry(opte)) {
4233 		return false;
4234 	}
4235 
4236 	pmap_exec_account(pmap, va, opte, 0);
4237 	pmap_stats_update_bypte(pmap, 0, opte);
4238 
4239 	if (ptp) {
4240 		/*
4241 		 * Dropping a PTE.  Make sure that the PDE is flushed.
4242 		 */
4243 		ptp->wire_count--;
4244 		if (ptp->wire_count <= 1) {
4245 			opte |= PTE_A;
4246 		}
4247 	}
4248 
4249 	if ((opte & PTE_A) != 0) {
4250 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4251 	}
4252 
4253 	/*
4254 	 * If we are not on a pv list - we are done.
4255 	 */
4256 	if ((opte & PTE_PVLIST) == 0) {
4257 #ifndef DOM0OPS
4258 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4259 		    "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4260 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4261 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4262 #endif
4263 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4264 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4265 		return true;
4266 	}
4267 
4268 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4269 		pp = VM_PAGE_TO_PP(pg);
4270 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4271 		paddr_t pa = pmap_pte2pa(opte);
4272 		panic("%s: PTE_PVLIST with pv-untracked page"
4273 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4274 		    __func__, va, pa, atop(pa));
4275 	}
4276 
4277 	/* Sync R/M bits. */
4278 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
4279 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4280 	return true;
4281 }
4282 
4283 static void
4284 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4285 {
4286 	pt_entry_t *ptes;
4287 	pd_entry_t pde;
4288 	pd_entry_t * const *pdes;
4289 	bool result;
4290 	vaddr_t blkendva, va = sva;
4291 	struct vm_page *ptp;
4292 	struct pmap *pmap2;
4293 	int lvl;
4294 
4295 	KASSERT(mutex_owned(&pmap->pm_lock));
4296 
4297 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4298 
4299 	/*
4300 	 * removing one page?  take shortcut function.
4301 	 */
4302 
4303 	if (va + PAGE_SIZE == eva) {
4304 		if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4305 			KASSERT(lvl == 1);
4306 
4307 			/* Get PTP if non-kernel mapping. */
4308 			if (pmap != pmap_kernel()) {
4309 				ptp = pmap_find_ptp(pmap, va, 1);
4310 				KASSERTMSG(ptp != NULL,
4311 				    "%s: unmanaged PTP detected", __func__);
4312 			} else {
4313 				/* Never free kernel PTPs. */
4314 				ptp = NULL;
4315 			}
4316 
4317 			result = pmap_remove_pte(pmap, ptp,
4318 			    &ptes[pl1_i(va)], va);
4319 
4320 			/*
4321 			 * if mapping removed and the PTP is no longer
4322 			 * being used, free it!
4323 			 */
4324 
4325 			if (result && ptp && ptp->wire_count <= 1)
4326 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4327 		}
4328 	} else for (/* null */ ; va < eva ; va = blkendva) {
4329 		/* determine range of block */
4330 		blkendva = x86_round_pdr(va+1);
4331 		if (blkendva > eva)
4332 			blkendva = eva;
4333 
4334 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4335 			/* Skip a range corresponding to an invalid pde. */
4336 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4337 			continue;
4338 		}
4339 		KASSERT(lvl == 1);
4340 
4341 		/* Get PTP if non-kernel mapping. */
4342 		if (pmap != pmap_kernel()) {
4343 			ptp = pmap_find_ptp(pmap, va, 1);
4344 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4345 			    __func__);
4346 		} else {
4347 			/* Never free kernel PTPs. */
4348 			ptp = NULL;
4349 		}
4350 
4351 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4352 		    blkendva);
4353 
4354 		/* If PTP is no longer being used, free it. */
4355 		if (ptp && ptp->wire_count <= 1) {
4356 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4357 		}
4358 	}
4359 	pmap_unmap_ptes(pmap, pmap2);
4360 	pmap_drain_pv(pmap);
4361 }
4362 
4363 /*
4364  * pmap_remove: mapping removal function.
4365  *
4366  * => caller should not be holding any pmap locks
4367  */
4368 void
4369 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4370 {
4371 	if (__predict_false(pmap->pm_remove != NULL)) {
4372 		(*pmap->pm_remove)(pmap, sva, eva);
4373 		return;
4374 	}
4375 
4376 	mutex_enter(&pmap->pm_lock);
4377 	pmap_remove_locked(pmap, sva, eva);
4378 	mutex_exit(&pmap->pm_lock);
4379 }
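
/*
 * Illustrative sketch, not part of the original source: the usual MI
 * calling pattern.  TLB shootdowns queued by the removal are pushed out
 * by a later pmap_update() call.
 *
 *	pmap_remove(pmap, sva, eva);
 *	pmap_update(pmap);
 */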
4380 
4381 /*
4382  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4383  *
4384  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4385  * => Caller should disable kernel preemption.
4386  * => issues tlb shootdowns if necessary.
4387  */
4388 static int
4389 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4390     pt_entry_t *optep)
4391 {
4392 	struct pmap *pmap;
4393 	struct vm_page *ptp;
4394 	vaddr_t va;
4395 	pt_entry_t *ptep;
4396 	pt_entry_t opte;
4397 	pt_entry_t npte;
4398 	pt_entry_t expect;
4399 	bool need_shootdown;
4400 
4401 	ptp = pvpte->pte_ptp;
4402 	va = pvpte->pte_va;
4403 	KASSERT(ptp == NULL || ptp->uobject != NULL);
4404 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4405 	pmap = ptp_to_pmap(ptp);
4406 	KASSERT(kpreempt_disabled());
4407 
4408 	if (__predict_false(pmap->pm_sync_pv != NULL)) {
4409 		return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4410 		    optep);
4411 	}
4412 
4413 	expect = pmap_pa2pte(pa) | PTE_P;
4414 
4415 	if (clearbits != ~0) {
4416 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4417 		clearbits = pmap_pp_attrs_to_pte(clearbits);
4418 	}
4419 
4420 	ptep = pmap_map_pte(pmap, ptp, va);
4421 	do {
4422 		opte = *ptep;
4423 		KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4424 		KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4425 		KASSERT(opte == 0 || (opte & PTE_P) != 0);
4426 		if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4427 			/*
4428 			 * We lost a race with a V->P operation like
4429 			 * pmap_remove().  Wait for the competitor to
4430 			 * finish reflecting the pte bits into pp_attrs.
4431 			 */
4432 			pmap_unmap_pte();
4433 			return EAGAIN;
4434 		}
4435 
4436 		/*
4437 		 * Check if there's anything to do on this PTE.
4438 		 */
4439 		if ((opte & clearbits) == 0) {
4440 			need_shootdown = false;
4441 			break;
4442 		}
4443 
4444 		/*
4445 		 * We need a shootdown if the PTE may be cached in the TLB
4446 		 * (PTE_A set), unless we are clearing only the PTE_W bit
4447 		 * and the entry isn't cached as writable (PTE_D clear).
4448 		 */
4449 		need_shootdown = (opte & PTE_A) != 0 &&
4450 		    !(clearbits == PTE_W && (opte & PTE_D) == 0);
4451 
4452 		npte = opte & ~clearbits;
4453 
4454 		/*
4455 		 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4456 		 */
4457 		if (need_shootdown) {
4458 			npte &= ~(PTE_A | PTE_D);
4459 		}
4460 		KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4461 		KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4462 		KASSERT(npte == 0 || (opte & PTE_P) != 0);
4463 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4464 
4465 	if (need_shootdown) {
4466 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4467 	}
4468 	pmap_unmap_pte();
4469 
4470 	*oattrs = pmap_pte_to_pp_attrs(opte);
4471 	if (optep != NULL)
4472 		*optep = opte;
4473 	return 0;
4474 }
4475 
4476 static void
4477 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4478     vaddr_t va)
4479 {
4480 	struct pmap *pmap2;
4481 	pt_entry_t *ptes;
4482 	pd_entry_t * const *pdes;
4483 
4484 	KASSERT(mutex_owned(&pmap->pm_lock));
4485 
4486 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4487 	pmap_stats_update_bypte(pmap, 0, opte);
4488 	ptp->wire_count--;
4489 	if (ptp->wire_count <= 1) {
4490 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4491 	}
4492 	pmap_unmap_ptes(pmap, pmap2);
4493 }
4494 
4495 static void
4496 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4497 {
4498 	struct pv_pte *pvpte;
4499 	struct vm_page *ptp;
4500 	uintptr_t sum;
4501 	uint8_t oattrs;
4502 	bool locked;
4503 
4504 	/*
4505 	 * Do an unlocked check to see if the page has no mappings, e.g. when
4506 	 * pmap_remove_all() was called before amap_wipeout() for a process-
4507 	 * private amap - a common case.  The page being removed must be on the
4508 	 * way out, so we don't have to worry about concurrent attempts to enter
4509 	 * it (otherwise the caller either doesn't care or has screwed up).
4510 	 */
4511 	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4512 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4513 	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4514 	if (sum == 0) {
4515 		return;
4516 	}
4517 
4518 	kpreempt_disable();
4519 	for (;;) {
4520 		struct pmap *pmap;
4521 		struct pv_entry *pve;
4522 		pt_entry_t opte;
4523 		vaddr_t va;
4524 
4525 		mutex_spin_enter(&pp->pp_lock);
4526 		if ((pvpte = pv_pte_first(pp)) == NULL) {
4527 			mutex_spin_exit(&pp->pp_lock);
4528 			break;
4529 		}
4530 
4531 		/*
4532 		 * Add a reference to the pmap before clearing the pte.
4533 		 * Otherwise the pmap can disappear behind us.
4534 		 */
4535 		ptp = pvpte->pte_ptp;
4536 		pmap = ptp_to_pmap(ptp);
4537 		KASSERT(pmap->pm_obj[0].uo_refs > 0);
4538 		if (ptp != NULL) {
4539 			pmap_reference(pmap);
4540 		}
4541 
4542 		/*
4543 		 * Now try to lock it.  We need a direct handoff between
4544 		 * pp_lock and pm_lock to know the pv_entry is kept intact
4545 		 * and kept associated with this pmap.  If that can't be
4546 		 * had, wait for the pmap's lock to become free and then
4547 		 * retry.
4548 		 */
4549 		locked = mutex_tryenter(&pmap->pm_lock);
4550 		mutex_spin_exit(&pp->pp_lock);
4551 		if (!locked) {
4552 			mutex_enter(&pmap->pm_lock);
4553 			/* nothing, just wait for it */
4554 			mutex_exit(&pmap->pm_lock);
4555 			if (ptp != NULL) {
4556 				pmap_destroy(pmap);
4557 			}
4558 			continue;
4559 		}
4560 		va = pvpte->pte_va;
4561 
4562 		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4563 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4564 		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4565 		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4566 		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4567 		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4568 
4569 #ifdef DEBUG
4570 		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4571 		rb_tree_t *tree = (ptp != NULL ?
4572 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4573 		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4574 		if (pve == NULL) {
4575 			KASSERTMSG(&pp->pp_pte == pvpte,
4576 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4577 			    va, pmap, ptp, pvpte, pve);
4578 		} else {
4579 			KASSERTMSG(&pve->pve_pte == pvpte,
4580 			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4581 			    va, pmap, ptp, pvpte, pve);
4582 		}
4583 #endif
4584 
4585 		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4586 			panic("pmap_pp_remove: mapping not present");
4587 		}
4588 
4589 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
4590 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4591 
4592 		/* Update the PTP reference count. Free if last reference. */
4593 		if (ptp != NULL) {
4594 			KASSERT(pmap != pmap_kernel());
4595 			pmap_tlb_shootnow();
4596 			if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4597 				(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4598 			} else {
4599 				pmap_pp_remove_ent(pmap, ptp, opte, va);
4600 			}
4601 		} else {
4602 			KASSERT(pmap == pmap_kernel());
4603 			pmap_stats_update_bypte(pmap, 0, opte);
4604 		}
4605 		pmap_tlb_shootnow();
4606 		pmap_drain_pv(pmap);
4607 		mutex_exit(&pmap->pm_lock);
4608 		if (ptp != NULL) {
4609 			pmap_destroy(pmap);
4610 		}
4611 	}
4612 	kpreempt_enable();
4613 }
4614 
4615 /*
4616  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4617  *
4618  * => R/M bits are sync'd back to attrs
4619  */
4620 void
4621 pmap_page_remove(struct vm_page *pg)
4622 {
4623 	struct pmap_page *pp;
4624 	paddr_t pa;
4625 
4626 	pp = VM_PAGE_TO_PP(pg);
4627 	pa = VM_PAGE_TO_PHYS(pg);
4628 	pmap_pp_remove(pp, pa);
4629 }
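
/*
 * Illustrative sketch, not part of the original source: MI code normally
 * reaches pmap_page_remove() through the pmap_page_protect() inline in
 * pmap.h when all access to the page is revoked, which drops every
 * mapping of pg.
 *
 *	pmap_page_protect(pg, VM_PROT_NONE);
 */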
4630 
4631 /*
4632  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4633  * that map it
4634  */
4635 void
4636 pmap_pv_remove(paddr_t pa)
4637 {
4638 	struct pmap_page *pp;
4639 
4640 	pp = pmap_pv_tracked(pa);
4641 	if (pp == NULL)
4642 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4643 	pmap_pp_remove(pp, pa);
4644 }
4645 
4646 /*
4647  * p m a p   a t t r i b u t e  f u n c t i o n s
4648  * functions that test/change managed page's attributes
4649  * since a page can be mapped multiple times we must check each PTE that
4650  * maps it by going down the pv lists.
4651  */
4652 
4653 /*
4654  * pmap_test_attrs: test a page's attributes
4655  */
4656 bool
4657 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4658 {
4659 	struct pmap_page *pp;
4660 	struct pv_pte *pvpte;
4661 	struct pmap *pmap;
4662 	uint8_t oattrs;
4663 	u_int result;
4664 	paddr_t pa;
4665 
4666 	pp = VM_PAGE_TO_PP(pg);
4667 	if ((pp->pp_attrs & testbits) != 0) {
4668 		return true;
4669 	}
4670 	pa = VM_PAGE_TO_PHYS(pg);
4671  startover:
4672 	mutex_spin_enter(&pp->pp_lock);
4673 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4674 		if ((pp->pp_attrs & testbits) != 0) {
4675 			break;
4676 		}
4677 		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4678 			/*
4679 			 * raced with a V->P operation.  wait for the other
4680 			 * side to finish by acquiring pmap's lock.  if we
4681 			 * didn't wait, updates to pp_attrs by the other side
4682 			 * might go unseen.
4683 			 */
4684 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4685 			pmap_reference(pmap);
4686 			mutex_spin_exit(&pp->pp_lock);
4687 			mutex_enter(&pmap->pm_lock);
4688 			/* nothing. */
4689 			mutex_exit(&pmap->pm_lock);
4690 			pmap_destroy(pmap);
4691 			goto startover;
4692 		}
4693 		pp->pp_attrs |= oattrs;
4694 	}
4695 	result = pp->pp_attrs & testbits;
4696 	mutex_spin_exit(&pp->pp_lock);
4697 
4698 	/*
4699 	 * note that we exit the for loop early (with pvpte non-NULL) once
4700 	 * we have found the bits we are testing for.
4701 	 */
4702 
4703 	return result != 0;
4704 }
4705 
4706 static bool
4707 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4708 {
4709 	struct pv_pte *pvpte;
4710 	struct pmap *pmap;
4711 	uint8_t oattrs;
4712 	u_int result;
4713 
4714 startover:
4715 	mutex_spin_enter(&pp->pp_lock);
4716 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4717 		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4718 			/*
4719 			 * raced with a V->P operation.  wait for the other
4720 			 * side to finish by acquiring pmap's lock.  it is
4721 			 * probably unmapping the page, and it will be gone
4722 			 * when the loop is restarted.
4723 			 */
4724 			pmap = ptp_to_pmap(pvpte->pte_ptp);
4725 			pmap_reference(pmap);
4726 			mutex_spin_exit(&pp->pp_lock);
4727 			mutex_enter(&pmap->pm_lock);
4728 			/* nothing. */
4729 			mutex_exit(&pmap->pm_lock);
4730 			pmap_destroy(pmap);
4731 			goto startover;
4732 		}
4733 		pp->pp_attrs |= oattrs;
4734 	}
4735 	result = pp->pp_attrs & clearbits;
4736 	pp->pp_attrs &= ~clearbits;
4737 	pmap_tlb_shootnow();
4738 	mutex_spin_exit(&pp->pp_lock);
4739 
4740 	return result != 0;
4741 }
4742 
4743 /*
4744  * pmap_clear_attrs: clear the specified attribute for a page.
4745  *
4746  * => we return true if we cleared one of the bits we were asked to
4747  */
4748 bool
4749 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4750 {
4751 	struct pmap_page *pp;
4752 	paddr_t pa;
4753 
4754 	pp = VM_PAGE_TO_PP(pg);
4755 	pa = VM_PAGE_TO_PHYS(pg);
4756 
4757 	/*
4758 	 * If this is a new page, assert it has no mappings and simply zap
4759 	 * the stored attributes without taking any locks.
4760 	 */
4761 	if ((pg->flags & PG_FAKE) != 0) {
4762 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4763 		KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4764 		KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4765 		atomic_store_relaxed(&pp->pp_attrs, 0);
4766 		return false;
4767 	} else {
4768 		return pmap_pp_clear_attrs(pp, pa, clearbits);
4769 	}
4770 }
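
/*
 * Illustrative sketch, not part of the original source: the R/M queries
 * made by the MI layer boil down to these attribute calls, expressed here
 * directly in terms of the PP_ATTRS_* bits used in this file.
 *
 *	bool modified   = pmap_test_attrs(pg, PP_ATTRS_D);
 *	bool was_dirty  = pmap_clear_attrs(pg, PP_ATTRS_D);
 *	bool referenced = pmap_test_attrs(pg, PP_ATTRS_A);
 */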
4771 
4772 /*
4773  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4774  * pv-tracked page.
4775  */
4776 bool
4777 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4778 {
4779 	struct pmap_page *pp;
4780 
4781 	pp = pmap_pv_tracked(pa);
4782 	if (pp == NULL)
4783 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4784 
4785 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4786 }
4787 
4788 /*
4789  * p m a p   p r o t e c t i o n   f u n c t i o n s
4790  */
4791 
4792 /*
4793  * pmap_page_protect: change the protection of all recorded mappings
4794  * of a managed page
4795  *
4796  * => NOTE: this is an inline function in pmap.h
4797  */
4798 
4799 /* see pmap.h */
4800 
4801 /*
4802  * pmap_pv_protect: change the protection of all recorded mappings
4803  * of an unmanaged pv-tracked page
4804  *
4805  * => NOTE: this is an inline function in pmap.h
4806  */
4807 
4808 /* see pmap.h */
4809 
4810 /*
4811  * pmap_protect: set the protection of the pages in a pmap
4812  *
4813  * => NOTE: this is an inline function in pmap.h
4814  */
4815 
4816 /* see pmap.h */
4817 
4818 /*
4819  * pmap_write_protect: write-protect pages in a pmap.
4820  *
4821  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4822  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4823  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4824  * present the page will still be considered a kernel page, and the privilege
4825  * separation will be enforced correctly.
4826  */
4827 void
4828 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4829 {
4830 	pt_entry_t bit_rem, bit_put;
4831 	pt_entry_t *ptes;
4832 	pt_entry_t * const *pdes;
4833 	struct pmap *pmap2;
4834 	vaddr_t blockend, va;
4835 	int lvl, i;
4836 
4837 	if (__predict_false(pmap->pm_write_protect != NULL)) {
4838 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
4839 		return;
4840 	}
4841 
4842 	bit_rem = 0;
4843 	if (!(prot & VM_PROT_WRITE))
4844 		bit_rem = PTE_W;
4845 
4846 	bit_put = 0;
4847 	if (!(prot & VM_PROT_EXECUTE))
4848 		bit_put = pmap_pg_nx;
4849 
4850 	sva &= ~PAGE_MASK;
4851 	eva &= ~PAGE_MASK;
4852 
4853 	/*
4854 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
4855 	 * be touching PV entries or stats, and kernel PDEs aren't
4856 	 * freed.
4857 	 */
4858 	if (pmap != pmap_kernel()) {
4859 		mutex_enter(&pmap->pm_lock);
4860 	}
4861 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4862 
4863 	for (va = sva ; va < eva; va = blockend) {
4864 		pt_entry_t *spte, *epte;
4865 
4866 		blockend = x86_round_pdr(va + 1);
4867 		if (blockend > eva)
4868 			blockend = eva;
4869 
4870 		/* Is it a valid block? */
4871 		if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4872 			continue;
4873 		}
4874 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4875 		KASSERT(lvl == 1);
4876 
4877 		spte = &ptes[pl1_i(va)];
4878 		epte = &ptes[pl1_i(blockend)];
4879 
4880 		for (i = 0; spte < epte; spte++, i++) {
4881 			pt_entry_t opte, npte;
4882 
4883 			do {
4884 				opte = *spte;
4885 				if (!pmap_valid_entry(opte)) {
4886 					goto next;
4887 				}
4888 				npte = (opte & ~bit_rem) | bit_put;
4889 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4890 
4891 			if ((opte & PTE_D) != 0) {
4892 				vaddr_t tva = va + x86_ptob(i);
4893 				pmap_tlb_shootdown(pmap, tva, opte,
4894 				    TLBSHOOT_WRITE_PROTECT);
4895 			}
4896 next:;
4897 		}
4898 	}
4899 
4900 	/* Release pmap. */
4901 	pmap_unmap_ptes(pmap, pmap2);
4902 	if (pmap != pmap_kernel()) {
4903 		mutex_exit(&pmap->pm_lock);
4904 	}
4905 }
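
/*
 * Illustrative sketch, not part of the original source: making a range of
 * kernel mappings read-only while keeping execute permission, then
 * flushing the deferred shootdowns.
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva,
 *	    VM_PROT_READ | VM_PROT_EXECUTE);
 *	pmap_update(pmap_kernel());
 */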
4906 
4907 /*
4908  * pmap_unwire: clear the wired bit in the PTE.
4909  *
4910  * => Mapping should already be present.
4911  */
4912 void
4913 pmap_unwire(struct pmap *pmap, vaddr_t va)
4914 {
4915 	pt_entry_t *ptes, *ptep, opte;
4916 	pd_entry_t * const *pdes;
4917 	struct pmap *pmap2;
4918 	int lvl;
4919 
4920 	if (__predict_false(pmap->pm_unwire != NULL)) {
4921 		(*pmap->pm_unwire)(pmap, va);
4922 		return;
4923 	}
4924 
4925 	/*
4926 	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
4927 	 * statistics.
4928 	 */
4929 	mutex_enter(&pmap->pm_lock);
4930 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4931 
4932 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4933 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4934 	}
4935 	KASSERT(lvl == 1);
4936 
4937 	ptep = &ptes[pl1_i(va)];
4938 	opte = *ptep;
4939 	KASSERT(pmap_valid_entry(opte));
4940 
4941 	if (opte & PTE_WIRED) {
4942 		pt_entry_t npte = opte & ~PTE_WIRED;
4943 
4944 		opte = pmap_pte_testset(ptep, npte);
4945 		pmap_stats_update_bypte(pmap, npte, opte);
4946 	} else {
4947 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4948 		    " did not change!\n", __func__, pmap, va);
4949 	}
4950 
4951 	/* Release pmap. */
4952 	pmap_unmap_ptes(pmap, pmap2);
4953 	mutex_exit(&pmap->pm_lock);
4954 }
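
/*
 * Illustrative sketch, not part of the original source: a wired mapping is
 * entered with PMAP_WIRED and later unwired one page at a time.
 *
 *	(void)pmap_enter(pmap, va, pa, prot, flags | PMAP_WIRED);
 *	...
 *	pmap_unwire(pmap, va);
 */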
4955 
4956 /*
4957  * pmap_copy: copy mappings from one pmap to another
4958  *
4959  * => optional function
4960  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4961  */
4962 
4963 /*
4964  * defined as macro in pmap.h
4965  */
4966 
4967 __strict_weak_alias(pmap_enter, pmap_enter_default);
4968 
4969 int
4970 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4971     u_int flags)
4972 {
4973 	if (__predict_false(pmap->pm_enter != NULL)) {
4974 		return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4975 	}
4976 
4977 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4978 }
4979 
4980 /*
4981  * pmap_enter: enter a mapping into a pmap
4982  *
4983  * => must be done "now" ... no lazy-evaluation
4984  */
4985 int
4986 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4987 	   vm_prot_t prot, u_int flags, int domid)
4988 {
4989 	pt_entry_t *ptes, opte, npte;
4990 	pt_entry_t *ptep;
4991 	pd_entry_t * const *pdes;
4992 	struct vm_page *ptp;
4993 	struct vm_page *new_pg, *old_pg;
4994 	struct pmap_page *new_pp, *old_pp;
4995 	struct pv_entry *old_pve, *new_pve;
4996 	bool wired = (flags & PMAP_WIRED) != 0;
4997 	struct pmap *pmap2;
4998 	struct pmap_ptparray pt;
4999 	int error;
5000 	bool getptp, samepage, new_embedded;
5001 	rb_tree_t *tree;
5002 
5003 	KASSERT(pmap_initialized);
5004 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5005 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5006 	    PRIxVADDR " over PDP!", __func__, va);
5007 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
5008 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
5009 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
5010 
5011 #ifdef XENPV
5012 	KASSERT(domid == DOMID_SELF || pa == 0);
5013 #endif
5014 
5015 	npte = ma | protection_codes[prot] | PTE_P;
5016 	npte |= pmap_pat_flags(flags);
5017 	if (wired)
5018 		npte |= PTE_WIRED;
5019 	if (va < VM_MAXUSER_ADDRESS) {
5020 		KASSERTMSG(pmap != pmap_kernel(),
5021 		    "entering user va %#"PRIxVADDR" into kernel pmap",
5022 		    va);
5023 		if (pmap_is_user(pmap))
5024 			npte |= PTE_U;
5025 	}
5026 
5027 	if (pmap == pmap_kernel())
5028 		npte |= pmap_pg_g;
5029 	if (flags & VM_PROT_ALL) {
5030 		npte |= PTE_A;
5031 		if (flags & VM_PROT_WRITE) {
5032 			KASSERT((npte & PTE_W) != 0);
5033 			npte |= PTE_D;
5034 		}
5035 	}
5036 
5037 #ifdef XENPV
5038 	if (domid != DOMID_SELF)
5039 		new_pg = NULL;
5040 	else
5041 #endif
5042 		new_pg = PHYS_TO_VM_PAGE(pa);
5043 
5044 	if (new_pg != NULL) {
5045 		/* This is a managed page */
5046 		npte |= PTE_PVLIST;
5047 		new_pp = VM_PAGE_TO_PP(new_pg);
5048 		PMAP_CHECK_PP(new_pp);
5049 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5050 		/* This is an unmanaged pv-tracked page */
5051 		npte |= PTE_PVLIST;
5052 		PMAP_CHECK_PP(new_pp);
5053 	} else {
5054 		new_pp = NULL;
5055 	}
5056 
5057 	/* Begin by locking the pmap. */
5058 	mutex_enter(&pmap->pm_lock);
5059 
5060 	/* Look up the PTP.  Allocate if none present. */
5061 	ptp = NULL;
5062 	getptp = false;
5063 	if (pmap != pmap_kernel()) {
5064 		ptp = pmap_find_ptp(pmap, va, 1);
5065 		if (ptp == NULL) {
5066 			getptp = true;
5067 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5068 			if (error != 0) {
5069 				if (flags & PMAP_CANFAIL) {
5070 					mutex_exit(&pmap->pm_lock);
5071 					return error;
5072 				}
5073 				panic("%s: get ptp failed, error=%d", __func__,
5074 				    error);
5075 			}
5076 		}
5077 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5078 	} else {
5079 		/* Embedded PV entries rely on this. */
5080 		KASSERT(va != 0);
5081 		tree = &pmap_kernel_rb;
5082 	}
5083 
5084 	/*
5085 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5086 	 * entry if required for the new mapping.  Temporarily track the old
5087 	 * and new mappings concurrently.  Only after the old mapping is
5088 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5089 	 * our picture of modified/accessed state for either page could get
5090 	 * out of sync (we need any P->V operation for either page to stall
5091 	 * on pmap->pm_lock until done here).
5092 	 */
5093 	new_pve = NULL;
5094 	old_pve = NULL;
5095 	samepage = false;
5096 	new_embedded = false;
5097 
5098 	if (new_pp != NULL) {
5099 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5100 		    &old_pve, &samepage, &new_embedded, tree);
5101 
5102 		/*
5103 		 * If a new pv_entry was needed and none was available, we
5104 		 * can go no further.
5105 		 */
5106 		if (error != 0) {
5107 			if (flags & PMAP_CANFAIL) {
5108 				if (getptp) {
5109 					pmap_unget_ptp(pmap, &pt);
5110 				}
5111 				mutex_exit(&pmap->pm_lock);
5112 				return error;
5113 			}
5114 			panic("%s: alloc pve failed", __func__);
5115 		}
5116 	} else {
5117 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5118 	}
5119 
5120 	/* Map PTEs into address space. */
5121 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5122 
5123 	/* Install any newly allocated PTPs. */
5124 	if (getptp) {
5125 		pmap_install_ptp(pmap, &pt, va, pdes);
5126 	}
5127 
5128 	/* Check if there is an existing mapping. */
5129 	ptep = &ptes[pl1_i(va)];
5130 	opte = *ptep;
5131 	bool have_oldpa = pmap_valid_entry(opte);
5132 	paddr_t oldpa = pmap_pte2pa(opte);
5133 
5134 	/*
5135 	 * Update the pte.
5136 	 */
5137 	do {
5138 		opte = *ptep;
5139 
5140 		/*
5141 		 * if the same page, inherit PTE_A and PTE_D.
5142 		 */
5143 		if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5144 			npte |= opte & (PTE_A | PTE_D);
5145 		}
5146 #if defined(XENPV)
5147 		if (domid != DOMID_SELF) {
5148 			/* pmap_pte_cas with error handling */
5149 			int s = splvm();
5150 			if (opte != *ptep) {
5151 				splx(s);
5152 				continue;
5153 			}
5154 			error = xpq_update_foreign(
5155 			    vtomach((vaddr_t)ptep), npte, domid, flags);
5156 			splx(s);
5157 			if (error) {
5158 				/* Undo pv_entry tracking - oof. */
5159 				if (new_pp != NULL) {
5160 					mutex_spin_enter(&new_pp->pp_lock);
5161 					if (new_pve != NULL) {
5162 						LIST_REMOVE(new_pve, pve_list);
5163 						KASSERT(pmap->pm_pve == NULL);
5164 						pmap->pm_pve = new_pve;
5165 					} else if (new_embedded) {
5166 						new_pp->pp_pte.pte_ptp = NULL;
5167 						new_pp->pp_pte.pte_va = 0;
5168 					}
5169 					mutex_spin_exit(&new_pp->pp_lock);
5170 				}
5171 				pmap_unmap_ptes(pmap, pmap2);
5172 				/* Free new PTP. */
5173 				if (ptp != NULL && ptp->wire_count <= 1) {
5174 					pmap_free_ptp(pmap, ptp, va, ptes,
5175 					    pdes);
5176 				}
5177 				mutex_exit(&pmap->pm_lock);
5178 				return error;
5179 			}
5180 			break;
5181 		}
5182 #endif /* defined(XENPV) */
5183 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
5184 
5185 	/*
5186 	 * Done with the PTEs: they can now be unmapped.
5187 	 */
5188 	pmap_unmap_ptes(pmap, pmap2);
5189 
5190 	/*
5191 	 * Update statistics and PTP's reference count.
5192 	 */
5193 	pmap_stats_update_bypte(pmap, npte, opte);
5194 	if (ptp != NULL) {
5195 		if (!have_oldpa) {
5196 			ptp->wire_count++;
5197 		}
5198 		/* Remember minimum VA in PTP. */
5199 		pmap_ptp_range_set(ptp, va);
5200 	}
5201 	KASSERT(ptp == NULL || ptp->wire_count > 1);
5202 
5203 	/*
5204 	 * If the same page, we can skip pv_entry handling.
5205 	 */
5206 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5207 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5208 		if ((npte & PTE_PVLIST) != 0) {
5209 			KASSERT(samepage);
5210 			pmap_check_pv(pmap, ptp, new_pp, va, true);
5211 		}
5212 		goto same_pa;
5213 	} else if ((npte & PTE_PVLIST) != 0) {
5214 		KASSERT(!samepage);
5215 	}
5216 
5217 	/*
5218 	 * If old page is pv-tracked, remove pv_entry from its list.
5219 	 */
5220 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5221 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5222 			old_pp = VM_PAGE_TO_PP(old_pg);
5223 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5224 			panic("%s: PTE_PVLIST with pv-untracked page"
5225 			    " va = %#"PRIxVADDR
5226 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5227 			    __func__, va, oldpa, atop(pa));
5228 		}
5229 
5230 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5231 		    pmap_pte_to_pp_attrs(opte));
5232 	} else {
5233 		KASSERT(old_pve == NULL);
5234 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5235 	}
5236 
5237 	/*
5238 	 * If new page is dynamically PV tracked, insert to tree.
5239 	 */
5240 	if (new_pve != NULL) {
5241 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5242 		old_pve = rb_tree_insert_node(tree, new_pve);
5243 		KASSERT(old_pve == new_pve);
5244 		pmap_check_pv(pmap, ptp, new_pp, va, true);
5245 	}
5246 
5247 same_pa:
5248 	/*
5249 	 * shootdown tlb if necessary.
5250 	 */
5251 
5252 	if ((~opte & (PTE_P | PTE_A)) == 0 &&
5253 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5254 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5255 	}
5256 	pmap_drain_pv(pmap);
5257 	mutex_exit(&pmap->pm_lock);
5258 	return 0;
5259 }
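
/*
 * Illustrative sketch, not part of the original source: a fault-path style
 * caller of pmap_enter() using PMAP_CANFAIL so that resource shortage is
 * reported instead of panicking; the retry policy shown is hypothetical.
 *
 *	error = pmap_enter(pmap, va, pa, prot, flags | PMAP_CANFAIL);
 *	if (error != 0) {
 *		... back out, wait for memory with uvm_wait(), retry ...
 *	}
 */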
5260 
5261 #if defined(XEN) && defined(DOM0OPS)
5262 
5263 struct pmap_data_gnt {
5264 	SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5265 	vaddr_t pd_gnt_sva;
5266 	vaddr_t pd_gnt_eva; /* range covered by this gnt */
5267 	int pd_gnt_refs; /* ref counter */
5268 	struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5269 };
5270 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5271 
5272 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5273 
5274 static struct pmap_data_gnt *
5275 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5276 {
5277 	struct pmap_data_gnt_head *headp;
5278 	struct pmap_data_gnt *pgnt;
5279 
5280 	KASSERT(mutex_owned(&pmap->pm_lock));
5281 	headp = pmap->pm_data;
5282 	KASSERT(headp != NULL);
5283 	SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5284 		if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5285 			return pgnt;
5286 		/* check that we're not overlapping part of a region */
5287 		KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5288 	}
5289 	return NULL;
5290 }
5291 
5292 static void
5293 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5294     const struct gnttab_map_grant_ref *ops)
5295 {
5296 	struct pmap_data_gnt_head *headp;
5297 	struct pmap_data_gnt *pgnt;
5298 	vaddr_t eva = sva + nentries * PAGE_SIZE;
5299 	KASSERT(mutex_owned(&pmap->pm_lock));
5300 	KASSERT(nentries >= 1);
5301 	if (pmap->pm_remove == NULL) {
5302 		pmap->pm_remove = pmap_remove_gnt;
5303 		KASSERT(pmap->pm_data == NULL);
5304 		headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5305 		SLIST_INIT(headp);
5306 		pmap->pm_data = headp;
5307 	} else {
5308 		KASSERT(pmap->pm_remove == pmap_remove_gnt);
5309 		KASSERT(pmap->pm_data != NULL);
5310 		headp = pmap->pm_data;
5311 	}
5312 
5313 	pgnt = pmap_find_gnt(pmap, sva, eva);
5314 	if (pgnt != NULL) {
5315 		KASSERT(pgnt->pd_gnt_sva == sva);
5316 		KASSERT(pgnt->pd_gnt_eva == eva);
5317 		return;
5318 	}
5319 
5320 	/* new entry */
5321 	pgnt = kmem_alloc(sizeof(*pgnt) +
5322 	    (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5323 	pgnt->pd_gnt_sva = sva;
5324 	pgnt->pd_gnt_eva = eva;
5325 	pgnt->pd_gnt_refs = 0;
5326 	memcpy(pgnt->pd_gnt_ops, ops,
5327 	    sizeof(struct gnttab_map_grant_ref) * nentries);
5328 	SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5329 }
5330 
5331 static void
5332 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5333 {
5334 	struct pmap_data_gnt_head *headp = pmap->pm_data;
5335 	int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5336 	KASSERT(nentries >= 1);
5337 	KASSERT(mutex_owned(&pmap->pm_lock));
5338 	KASSERT(pgnt->pd_gnt_refs == 0);
5339 	SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5340 	kmem_free(pgnt, sizeof(*pgnt) +
5341 		    (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5342 	if (SLIST_EMPTY(headp)) {
5343 		kmem_free(headp, sizeof(*headp));
5344 		pmap->pm_data = NULL;
5345 		pmap->pm_remove = NULL;
5346 	}
5347 }
5348 
5349 /*
5350  * pmap_enter_gnt: enter a grant entry into a pmap
5351  *
5352  * => must be done "now" ... no lazy-evaluation
5353  */
5354 int
5355 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5356     const struct gnttab_map_grant_ref *oops)
5357 {
5358 	struct pmap_data_gnt *pgnt;
5359 	pt_entry_t *ptes, opte;
5360 #ifndef XENPV
5361 	pt_entry_t npte;
5362 #endif
5363 	pt_entry_t *ptep;
5364 	pd_entry_t * const *pdes;
5365 	struct vm_page *ptp;
5366 	struct vm_page *old_pg;
5367 	struct pmap_page *old_pp;
5368 	struct pv_entry *old_pve;
5369 	struct pmap *pmap2;
5370 	struct pmap_ptparray pt;
5371 	int error;
5372 	bool getptp;
5373 	rb_tree_t *tree;
5374 	struct gnttab_map_grant_ref *op;
5375 	int ret;
5376 	int idx;
5377 
5378 	KASSERT(pmap_initialized);
5379 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5380 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5381 	    PRIxVADDR " over PDP!", __func__, va);
5382 	KASSERT(pmap != pmap_kernel());
5383 
5384 	/* Begin by locking the pmap. */
5385 	mutex_enter(&pmap->pm_lock);
5386 	pmap_alloc_gnt(pmap, sva, nentries, oops);
5387 
5388 	pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5389 	KASSERT(pgnt != NULL);
5390 
5391 	/* Look up the PTP.  Allocate if none present. */
5392 	ptp = NULL;
5393 	getptp = false;
5394 	ptp = pmap_find_ptp(pmap, va, 1);
5395 	if (ptp == NULL) {
5396 		getptp = true;
5397 		error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5398 		if (error != 0) {
5399 			mutex_exit(&pmap->pm_lock);
5400 			return error;
5401 		}
5402 	}
5403 	tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5404 
5405 	/*
5406 	 * Look up the old PV entry at this VA (if any), and insert a new PV
5407 	 * entry if required for the new mapping.  Temporarily track the old
5408 	 * and new mappings concurrently.  Only after the old mapping is
5409 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
5410 	 * our picture of modified/accessed state for either page could get
5411 	 * out of sync (we need any P->V operation for either page to stall
5412 	 * on pmap->pm_lock until done here).
5413 	 */
5414 	old_pve = NULL;
5415 
5416 	old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5417 
5418 	/* Map PTEs into address space. */
5419 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5420 
5421 	/* Install any newly allocated PTPs. */
5422 	if (getptp) {
5423 		pmap_install_ptp(pmap, &pt, va, pdes);
5424 	}
5425 
5426 	/* Check if there is an existing mapping. */
5427 	ptep = &ptes[pl1_i(va)];
5428 	opte = *ptep;
5429 	bool have_oldpa = pmap_valid_entry(opte);
5430 	paddr_t oldpa = pmap_pte2pa(opte);
5431 
5432 	/*
5433 	 * Update the pte.
5434 	 */
5435 
5436 	idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5437 	op = &pgnt->pd_gnt_ops[idx];
5438 
5439 #ifdef XENPV
5440 	KASSERT(op->flags & GNTMAP_contains_pte);
5441 	op->host_addr = xpmap_ptetomach(ptep);
5442 #else
5443 	KASSERT((op->flags & GNTMAP_contains_pte) == 0);
5444 	KASSERT(op->flags != 0);
5445 	KASSERT(op->host_addr != 0);
5446 #endif
5447 	op->dev_bus_addr = 0;
5448 	op->status = GNTST_general_error;
5449 	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5450 	if (__predict_false(ret)) {
5451 		printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5452 		    __func__, ret);
5453 		op->status = GNTST_general_error;
5454 	}
5455 	for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5456 		kpause("gntmap", false, mstohz(1), NULL);
5457 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5458 		if (__predict_false(ret)) {
5459 			printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5460 			    __func__, ret);
5461 			op->status = GNTST_general_error;
5462 		}
5463 	}
5464 	if (__predict_false(op->status != GNTST_okay)) {
5465 		printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5466 		    __func__, op->status);
5467 		if (have_oldpa) { /* XXX did the pte really change if XENPV? */
5468 			ptp->wire_count--;
5469 		}
5470 	} else {
5471 #ifndef XENPV
5472 		npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
5473 		if ((op->flags & GNTMAP_readonly) == 0)
5474 			npte |= PTE_W;
5475 		do {
5476 			opte = *ptep;
5477 		} while (pmap_pte_cas(ptep, opte, npte) != opte);
5478 #endif
5479 		pgnt->pd_gnt_refs++;
5480 		if (!have_oldpa) {
5481 			ptp->wire_count++;
5482 		}
5483 		KASSERT(ptp->wire_count > 1);
5484 		/* Remember minimum VA in PTP. */
5485 		pmap_ptp_range_set(ptp, va);
5486 	}
5487 	if (ptp->wire_count <= 1)
5488 		pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5489 
5490 	/*
5491 	 * Done with the PTEs: they can now be unmapped.
5492 	 */
5493 	pmap_unmap_ptes(pmap, pmap2);
5494 
5495 	/*
5496 	 * Update statistics and PTP's reference count.
5497 	 */
5498 	pmap_stats_update_bypte(pmap, 0, opte);
5499 
5500 	/*
5501 	 * If old page is pv-tracked, remove pv_entry from its list.
5502 	 */
5503 	if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5504 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5505 			old_pp = VM_PAGE_TO_PP(old_pg);
5506 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5507 			panic("%s: PTE_PVLIST with pv-untracked page"
5508 			    " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5509 			    __func__, va, oldpa);
5510 		}
5511 
5512 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5513 		    pmap_pte_to_pp_attrs(opte));
5514 	} else {
5515 		KASSERT(old_pve == NULL);
5516 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5517 	}
5518 
5519 	pmap_drain_pv(pmap);
5520 	mutex_exit(&pmap->pm_lock);
5521 	return op->status;
5522 }
5523 
5524 /*
5525  * pmap_remove_gnt: grant mapping removal function.
5526  *
5527  * => caller should not be holding any pmap locks
5528  */
5529 static void
5530 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5531 {
5532 	struct pmap_data_gnt *pgnt;
5533 	pt_entry_t *ptes;
5534 	pd_entry_t pde;
5535 	pd_entry_t * const *pdes;
5536 	struct vm_page *ptp;
5537 	struct pmap *pmap2;
5538 	vaddr_t va;
5539 	int lvl;
5540 	int idx;
5541 	struct gnttab_map_grant_ref *op;
5542 	struct gnttab_unmap_grant_ref unmap_op;
5543 	int ret;
5544 
5545 	KASSERT(pmap != pmap_kernel());
5546 	KASSERT(pmap->pm_remove == pmap_remove_gnt);
5547 
5548 	mutex_enter(&pmap->pm_lock);
5549 	for (va = sva; va < eva; va += PAGE_SIZE) {
5550 		pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5551 		if (pgnt == NULL) {
5552 			pmap_remove_locked(pmap, sva, eva);
5553 			continue;
5554 		}
5555 
5556 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5557 		if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5558 			panic("pmap_remove_gnt pdes not valid");
5559 		}
5560 
5561 		idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5562 		op = &pgnt->pd_gnt_ops[idx];
5563 		KASSERT(lvl == 1);
5564 
5565 		/* Get PTP if non-kernel mapping. */
5566 		ptp = pmap_find_ptp(pmap, va, 1);
5567 		KASSERTMSG(ptp != NULL,
5568 		    "%s: unmanaged PTP detected", __func__);
5569 
5570 		if (op->status == GNTST_okay)  {
5571 			KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5572 #ifdef XENPV
5573 			unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5574 #else
5575 			unmap_op.host_addr = op->host_addr;
5576 			pmap_pte_testset(&ptes[pl1_i(va)], 0);
5577 #endif
5578 			unmap_op.handle = op->handle;
5579 			unmap_op.dev_bus_addr = 0;
5580 			ret = HYPERVISOR_grant_table_op(
5581 			    GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5582 			if (ret) {
5583 				printf("%s: GNTTABOP_unmap_grant_ref "
5584 				    "failed: %d\n", __func__, ret);
5585 			}
5586 
5587 			ptp->wire_count--;
5588 			pgnt->pd_gnt_refs--;
5589 		}
5590 		if (pgnt->pd_gnt_refs == 0) {
5591 			pmap_free_gnt(pmap, pgnt);
5592 		}
5593 		/*
5594 		 * if mapping removed and the PTP is no longer
5595 		 * being used, free it!
5596 		 */
5597 
5598 		if (ptp->wire_count <= 1)
5599 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5600 		pmap_unmap_ptes(pmap, pmap2);
5601 	}
5602 	mutex_exit(&pmap->pm_lock);
5603 }
5604 #endif /* XEN && DOM0OPS */
5605 
5606 paddr_t
5607 pmap_get_physpage(void)
5608 {
5609 	struct vm_page *ptp;
5610 	struct pmap *kpm = pmap_kernel();
5611 	paddr_t pa;
5612 
5613 	if (!uvm.page_init_done) {
5614 		/*
5615 		 * We're growing the kernel pmap early (from
5616 		 * uvm_pageboot_alloc()). This case must be
5617 		 * handled a little differently.
5618 		 */
5619 
5620 		if (!uvm_page_physget(&pa))
5621 			panic("%s: out of memory", __func__);
5622 #if defined(__HAVE_DIRECT_MAP)
5623 		memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5624 #else
5625 #if defined(XENPV)
5626 		if (XEN_VERSION_SUPPORTED(3, 4)) {
5627 			xen_pagezero(pa);
5628 			return pa;
5629 		}
5630 #endif
5631 		kpreempt_disable();
5632 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5633 		    PTE_W | pmap_pg_nx);
5634 		pmap_pte_flush();
5635 		pmap_update_pg((vaddr_t)early_zerop);
5636 		memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5637 #if defined(DIAGNOSTIC) || defined(XENPV)
5638 		pmap_pte_set(early_zero_pte, 0);
5639 		pmap_pte_flush();
5640 #endif /* defined(DIAGNOSTIC) */
5641 		kpreempt_enable();
5642 #endif /* defined(__HAVE_DIRECT_MAP) */
5643 	} else {
5644 		/* XXX */
5645 		ptp = uvm_pagealloc(NULL, 0, NULL,
5646 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5647 		if (ptp == NULL)
5648 			panic("%s: out of memory", __func__);
5649 		ptp->flags &= ~PG_BUSY;
5650 		ptp->wire_count = 1;
5651 		pa = VM_PAGE_TO_PHYS(ptp);
5652 	}
5653 	pmap_stats_update(kpm, 1, 0);
5654 
5655 	return pa;
5656 }
5657 
5658 /*
5659  * Expand the page tree with the specified number of PTPs, mapping virtual
5660  * addresses starting at kva. We populate all the levels but the last one
5661  * (L1). The nodes of the tree are created as RW, but the pages covered
5662  * will be kentered into L1 with their proper permissions.
5663  *
5664  * Used only by pmap_growkernel.
5665  */
5666 static void
5667 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5668 {
5669 	unsigned long i;
5670 	paddr_t pa;
5671 	unsigned long index, endindex;
5672 	int level;
5673 	pd_entry_t *pdep;
5674 #ifdef XENPV
5675 	int s = splvm(); /* protect xpq_* */
5676 #endif
5677 
5678 	for (level = PTP_LEVELS; level > 1; level--) {
5679 		if (level == PTP_LEVELS)
5680 			pdep = cpm->pm_pdir;
5681 		else
5682 			pdep = normal_pdes[level - 2];
5683 		index = pl_i_roundup(kva, level);
5684 		endindex = index + needed_ptps[level - 1] - 1;
5685 
5686 		for (i = index; i <= endindex; i++) {
5687 			pt_entry_t pte;
5688 
5689 			KASSERT(!pmap_valid_entry(pdep[i]));
5690 			pa = pmap_get_physpage();
5691 			pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5692 #ifdef __x86_64__
5693 			pte |= pmap_pg_nx;
5694 #endif
5695 			pmap_pte_set(&pdep[i], pte);
5696 
5697 #ifdef XENPV
5698 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5699 				if (__predict_true(
5700 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5701 					/* update per-cpu PMDs on all cpus */
5702 					xen_kpm_sync(pmap_kernel(), i);
5703 				} else {
5704 					/*
5705 					 * too early; update primary CPU
5706 					 * PMD only (without locks)
5707 					 */
5708 #ifdef __x86_64__
5709 					pd_entry_t *cpu_pdep =
5710 						&cpu_info_primary.ci_kpm_pdir[i];
5711 #else
5712 					pd_entry_t *cpu_pdep =
5713 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5714 #endif
5715 					pmap_pte_set(cpu_pdep, pte);
5716 				}
5717 			}
5718 #endif
5719 
5720 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5721 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5722 			nkptp[level - 1]++;
5723 		}
5724 		pmap_pte_flush();
5725 	}
5726 #ifdef XENPV
5727 	splx(s);
5728 #endif
5729 }
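
/*
 * Worked example (amd64 figures; exact counts depend on alignment): growing
 * kernel VA by 1 GiB requires roughly 512 new level-2 entries (each maps
 * NBPD_L2 = 2 MiB and points at a freshly allocated L1 page), about one new
 * level-3 entry (NBPD_L3 = 1 GiB), and normally no new level-4 entry
 * (NBPD_L4 = 512 GiB). Those per-level counts are what pmap_growkernel()
 * passes in via needed_ptps[], and each new entry consumes one page from
 * pmap_get_physpage().
 */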
5730 
5731 /*
5732  * pmap_growkernel: increase usage of KVM space.
5733  *
5734  * => we allocate new PTPs for the kernel and install them in all
5735  *    the pmaps on the system.
5736  */
5737 vaddr_t
5738 pmap_growkernel(vaddr_t maxkvaddr)
5739 {
5740 	struct pmap *kpm = pmap_kernel();
5741 	struct pmap *cpm;
5742 #if !defined(XENPV) || !defined(__x86_64__)
5743 	struct pmap *pm;
5744 	long old;
5745 #endif
5746 	int s, i;
5747 	long needed_kptp[PTP_LEVELS], target_nptp;
5748 	bool invalidate = false;
5749 
5750 	s = splvm();	/* to be safe */
5751 	mutex_enter(&kpm->pm_lock);
5752 
5753 	if (maxkvaddr <= pmap_maxkvaddr) {
5754 		mutex_exit(&kpm->pm_lock);
5755 		splx(s);
5756 		return pmap_maxkvaddr;
5757 	}
5758 
5759 	maxkvaddr = x86_round_pdr(maxkvaddr);
5760 #if !defined(XENPV) || !defined(__x86_64__)
5761 	old = nkptp[PTP_LEVELS - 1];
5762 #endif
5763 
5764 	/* Initialize needed_kptp. */
5765 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
5766 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5767 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5768 
5769 		if (target_nptp > nkptpmax[i])
5770 			panic("out of KVA space");
5771 		KASSERT(target_nptp >= nkptp[i]);
5772 		needed_kptp[i] = target_nptp - nkptp[i];
5773 	}
5774 
5775 #ifdef XENPV
5776 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
5777 	cpm = kpm;
5778 #else
5779 	/* Get the current pmap */
5780 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5781 		cpm = curcpu()->ci_pmap;
5782 	} else {
5783 		cpm = kpm;
5784 	}
5785 #endif
5786 
5787 	kasan_shadow_map((void *)pmap_maxkvaddr,
5788 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5789 	kmsan_shadow_map((void *)pmap_maxkvaddr,
5790 	    (size_t)(maxkvaddr - pmap_maxkvaddr));
5791 
5792 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5793 
5794 	/*
5795 	 * If the number of top level entries changed, update all pmaps.
5796 	 */
5797 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
5798 #ifdef XENPV
5799 #ifdef __x86_64__
5800 		/* Nothing to do: kernel entries are never entered into user pmaps. */
5801 #else
5802 		int pdkidx;
5803 
5804 		mutex_enter(&pmaps_lock);
5805 		LIST_FOREACH(pm, &pmaps, pm_list) {
5806 			for (pdkidx = PDIR_SLOT_KERN + old;
5807 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5808 			    pdkidx++) {
5809 				pmap_pte_set(&pm->pm_pdir[pdkidx],
5810 				    kpm->pm_pdir[pdkidx]);
5811 			}
5812 			pmap_pte_flush();
5813 		}
5814 		mutex_exit(&pmaps_lock);
5815 #endif /* __x86_64__ */
5816 #else /* XENPV */
5817 		size_t newpdes;
5818 		newpdes = nkptp[PTP_LEVELS - 1] - old;
5819 		if (cpm != kpm) {
5820 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5821 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5822 			    newpdes * sizeof(pd_entry_t));
5823 		}
5824 
5825 		mutex_enter(&pmaps_lock);
5826 		LIST_FOREACH(pm, &pmaps, pm_list) {
5827 			if (__predict_false(pm->pm_enter != NULL)) {
5828 				/*
5829 				 * Not a native pmap; the kernel is not mapped
5830 				 * in it, so there is nothing to synchronize.
5831 				 */
5832 				continue;
5833 			}
5834 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5835 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5836 			    newpdes * sizeof(pd_entry_t));
5837 		}
5838 		mutex_exit(&pmaps_lock);
5839 #endif
5840 		invalidate = true;
5841 	}
5842 	pmap_maxkvaddr = maxkvaddr;
5843 	mutex_exit(&kpm->pm_lock);
5844 	splx(s);
5845 
5846 	if (invalidate && pmap_initialized) {
5847 		/* Invalidate the pmap cache. */
5848 		pool_cache_invalidate(&pmap_cache);
5849 	}
5850 
5851 	return maxkvaddr;
5852 }
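
/*
 * Caller sketch (the variable name is illustrative, not taken from UVM):
 * when the kernel map needs VA beyond what the page tables already cover,
 * the tables are extended first and the new PTP-aligned high-water mark is
 * recorded:
 *
 *	if (new_end > kva_high_water)
 *		kva_high_water = pmap_growkernel(new_end);
 */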
5853 
5854 #ifdef DEBUG
5855 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
5856 
5857 /*
5858  * pmap_dump: dump all the mappings from a pmap
5859  *
5860  * => caller should not be holding any pmap locks
5861  */
5862 void
5863 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5864 {
5865 	pt_entry_t *ptes, *pte;
5866 	pd_entry_t * const *pdes;
5867 	struct pmap *pmap2;
5868 	vaddr_t blkendva;
5869 	int lvl;
5870 
5871 	/*
5872 	 * If the end is out of range, truncate it.
5873 	 * If end <= start, dump up to the maximum user address.
5874 	 */
5875 
5876 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
5877 		eva = VM_MAXUSER_ADDRESS;
5878 
5879 	mutex_enter(&pmap->pm_lock);
5880 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5881 
5882 	/*
5883 	 * dumping a range of pages: we dump in PTP-sized blocks
5884 	 */
5885 
5886 	for (/* null */ ; sva < eva ; sva = blkendva) {
5887 
5888 		/* determine range of block */
5889 		blkendva = x86_round_pdr(sva+1);
5890 		if (blkendva > eva)
5891 			blkendva = eva;
5892 
5893 		/* valid block? */
5894 		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
5895 			continue;
5896 		KASSERT(lvl == 1);
5897 
5898 		pte = &ptes[pl1_i(sva)];
5899 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
5900 			if (!pmap_valid_entry(*pte))
5901 				continue;
5902 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
5903 			    " (pte=%#" PRIxPADDR ")\n",
5904 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
5905 		}
5906 	}
5907 	pmap_unmap_ptes(pmap, pmap2);
5908 	mutex_exit(&pmap->pm_lock);
5909 }
5910 #endif
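
/*
 * Usage sketch (DEBUG kernels only): dump every user mapping of the current
 * process, e.g. from ddb or ad-hoc debug code:
 *
 *	pmap_dump(vm_map_pmap(&curproc->p_vmspace->vm_map),
 *	    0, VM_MAXUSER_ADDRESS);
 *
 * Passing eva <= sva likewise dumps up to VM_MAXUSER_ADDRESS.
 */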
5911 
5912 /*
5913  * pmap_update: process deferred invalidations and frees.
5914  */
5915 void
5916 pmap_update(struct pmap *pmap)
5917 {
5918 	struct pmap_page *pp;
5919 	struct vm_page *ptp;
5920 
5921 	/*
5922 	 * Initiate any pending TLB shootdowns.  Wait for them to
5923 	 * complete before returning control to the caller.
5924 	 */
5925 	kpreempt_disable();
5926 	pmap_tlb_shootnow();
5927 	kpreempt_enable();
5928 
5929 	/*
5930 	 * Now that shootdowns are complete, process deferred frees.  This
5931 	 * is an unlocked check, but is safe as we're only interested in
5932 	 * work done in this LWP - we won't get a false negative.
5933 	 */
5934 	if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5935 		return;
5936 	}
5937 
5938 	mutex_enter(&pmap->pm_lock);
5939 	while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5940 		KASSERT(ptp->wire_count == 0);
5941 		KASSERT(ptp->uanon == NULL);
5942 		LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5943 		pp = VM_PAGE_TO_PP(ptp);
5944 		LIST_INIT(&pp->pp_pvlist);
5945 		pp->pp_attrs = 0;
5946 		pp->pp_pte.pte_ptp = NULL;
5947 		pp->pp_pte.pte_va = 0;
5948 		PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5949 
5950 		/*
5951 		 * XXX Hack to avoid extra locking, and lock
5952 		 * assertions in uvm_pagefree().  Despite uobject
5953 		 * being set, this isn't a managed page.
5954 		 */
5955 		PMAP_DUMMY_LOCK(pmap);
5956 		uvm_pagerealloc(ptp, NULL, 0);
5957 		PMAP_DUMMY_UNLOCK(pmap);
5958 		uvm_pagefree(ptp);
5959 	}
5960 	mutex_exit(&pmap->pm_lock);
5961 }
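
/*
 * Usage sketch: per pmap(9), callers batch their mapping updates and call
 * pmap_update() once at the end; only then are the TLB shootdowns and the
 * deferred PTP frees above guaranteed to have completed:
 *
 *	for (va = sva; va < eva; va += PAGE_SIZE, pa += PAGE_SIZE)
 *		pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 */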
5962 
5963 #if PTP_LEVELS > 4
5964 #error "Unsupported number of page table mappings"
5965 #endif
5966 
5967 paddr_t
5968 pmap_init_tmp_pgtbl(paddr_t pg)
5969 {
5970 	static bool maps_loaded;
5971 	static const paddr_t x86_tmp_pml_paddr[] = {
5972 	    4 * PAGE_SIZE,	/* L1 */
5973 	    5 * PAGE_SIZE,	/* L2 */
5974 	    6 * PAGE_SIZE,	/* L3 */
5975 	    7 * PAGE_SIZE	/* L4 */
5976 	};
5977 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5978 
5979 	pd_entry_t *tmp_pml, *kernel_pml;
5980 
5981 	int level;
5982 
5983 	if (!maps_loaded) {
5984 		for (level = 0; level < PTP_LEVELS; ++level) {
5985 			x86_tmp_pml_vaddr[level] =
5986 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5987 			    UVM_KMF_VAONLY);
5988 
5989 			if (x86_tmp_pml_vaddr[level] == 0)
5990 				panic("mapping of real mode PML failed");
5991 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5992 			    x86_tmp_pml_paddr[level],
5993 			    VM_PROT_READ | VM_PROT_WRITE, 0);
5994 		}
5995 		pmap_update(pmap_kernel());
5996 		maps_loaded = true;
5997 	}
5998 
5999 	/* Zero levels 1-3 */
6000 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
6001 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6002 		memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
6003 	}
6004 
6005 	/* Copy PML4 */
6006 	kernel_pml = pmap_kernel()->pm_pdir;
6007 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
6008 	memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
6009 
6010 #ifdef PAE
6011 	/*
6012 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
6013 	 * last entries are unlikely to be used for temporary mappings.
6014 	 * 508: maps 0->1GB (userland)
6015 	 * 509: unused
6016 	 * 510: unused
6017 	 * 511: maps 3->4GB (kernel)
6018 	 */
6019 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6020 	tmp_pml[509] = 0;
6021 	tmp_pml[510] = 0;
6022 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6023 #endif
6024 
6025 	for (level = PTP_LEVELS - 1; level > 0; --level) {
6026 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6027 
6028 		tmp_pml[pl_i(pg, level + 1)] =
6029 		    (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6030 	}
6031 
6032 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6033 	tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6034 
6035 #ifdef PAE
6036 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
6037 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6038 #endif
6039 
6040 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6041 }
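
/*
 * Usage sketch (the caller details are illustrative): code that must enable
 * paging from a low, identity-mapped page (e.g. a resume or CPU startup
 * trampoline) builds its temporary tables here and hands the returned
 * physical address to the trampoline to load into %cr3:
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 */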
6042 
6043 u_int
6044 x86_mmap_flags(paddr_t mdpgno)
6045 {
6046 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6047 	u_int pflag = 0;
6048 
6049 	if (nflag & X86_MMAP_FLAG_PREFETCH)
6050 		pflag |= PMAP_WRITE_COMBINE;
6051 
6052 	return pflag;
6053 }
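
/*
 * Encoding sketch (hypothetical driver code): a device mmap entry point can
 * request write-combining by packing X86_MMAP_FLAG_PREFETCH into the page
 * number it returns; pmap_enter() later recovers it via x86_mmap_flags():
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */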
6054 
6055 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6056 
6057 /*
6058  * -----------------------------------------------------------------------------
6059  * *****************************************************************************
6060  * *****************************************************************************
6061  * *****************************************************************************
6062  * *****************************************************************************
6063  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6064  * *****************************************************************************
6065  * *****************************************************************************
6066  * *****************************************************************************
6067  * *****************************************************************************
6068  * -----------------------------------------------------------------------------
6069  *
6070  * These functions are invoked as callbacks from the code above. Unlike the
6071  * native format, EPT does not have a recursive slot; therefore, it is not
6072  * possible to call pmap_map_ptes(). Instead, we use the direct map and walk
6073  * down the tree manually.
6074  *
6075  * Apart from that, the logic is mostly the same as native. Once a pmap has
6076  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6077  * From then on, the installed callbacks handle the
6078  * translations for us.
6079  *
6080  * -----------------------------------------------------------------------------
6081  */
6082 
6083 /* Hardware bits. */
6084 #define EPT_R		__BIT(0)	/* read */
6085 #define EPT_W		__BIT(1)	/* write */
6086 #define EPT_X		__BIT(2)	/* execute */
6087 #define EPT_T		__BITS(5,3)	/* type */
6088 #define		TYPE_UC	0
6089 #define		TYPE_WC	1
6090 #define		TYPE_WT	4
6091 #define		TYPE_WP	5
6092 #define		TYPE_WB	6
6093 #define EPT_NOPAT	__BIT(6)
6094 #define EPT_L		__BIT(7)	/* large */
6095 #define EPT_A		__BIT(8)	/* accessed */
6096 #define EPT_D		__BIT(9)	/* dirty */
6097 /* Software bits. */
6098 #define EPT_PVLIST	__BIT(60)
6099 #define EPT_WIRED	__BIT(61)
6100 
6101 #define pmap_ept_valid_entry(pte)	((pte) & EPT_R)
6102 
6103 bool pmap_ept_has_ad __read_mostly;
6104 
6105 static inline void
6106 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6107 {
6108 	int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6109 	int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6110 
6111 	KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6112 	KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6113 
6114 	pmap_stats_update(pmap, resid_diff, wired_diff);
6115 }
6116 
6117 static pt_entry_t
6118 pmap_ept_type(u_int flags)
6119 {
6120 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
6121 	pt_entry_t ret;
6122 
6123 	switch (cacheflags) {
6124 	case PMAP_NOCACHE:
6125 	case PMAP_NOCACHE_OVR:
6126 		ret = __SHIFTIN(TYPE_UC, EPT_T);
6127 		break;
6128 	case PMAP_WRITE_COMBINE:
6129 		ret = __SHIFTIN(TYPE_WC, EPT_T);
6130 		break;
6131 	case PMAP_WRITE_BACK:
6132 	default:
6133 		ret = __SHIFTIN(TYPE_WB, EPT_T);
6134 		break;
6135 	}
6136 
6137 	ret |= EPT_NOPAT;
6138 	return ret;
6139 }
6140 
6141 static inline pt_entry_t
6142 pmap_ept_prot(vm_prot_t prot)
6143 {
6144 	pt_entry_t res = 0;
6145 
6146 	if (prot & VM_PROT_READ)
6147 		res |= EPT_R;
6148 	if (prot & VM_PROT_WRITE)
6149 		res |= EPT_W;
6150 	if (prot & VM_PROT_EXECUTE)
6151 		res |= EPT_X;
6152 
6153 	return res;
6154 }
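
/*
 * Worked example: a read/write, write-back mapping entered with
 * prot = VM_PROT_READ|VM_PROT_WRITE and flags = PMAP_WRITE_BACK yields
 *
 *	pmap_ept_prot(prot)  = EPT_R | EPT_W
 *	pmap_ept_type(flags) = __SHIFTIN(TYPE_WB, EPT_T) | EPT_NOPAT
 *
 * i.e. R/W permissions plus memory type 6 (WB) in bits 5:3 and the
 * "ignore PAT" bit, which pmap_ept_enter() then ORs with the physical
 * address to form the new PTE.
 */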
6155 
6156 static inline uint8_t
6157 pmap_ept_to_pp_attrs(pt_entry_t ept)
6158 {
6159 	uint8_t ret = 0;
6160 	if (pmap_ept_has_ad) {
6161 		if (ept & EPT_D)
6162 			ret |= PP_ATTRS_D;
6163 		if (ept & EPT_A)
6164 			ret |= PP_ATTRS_A;
6165 	} else {
6166 		ret |= (PP_ATTRS_D|PP_ATTRS_A);
6167 	}
6168 	if (ept & EPT_W)
6169 		ret |= PP_ATTRS_W;
6170 	return ret;
6171 }
6172 
6173 static inline pt_entry_t
6174 pmap_pp_attrs_to_ept(uint8_t attrs)
6175 {
6176 	pt_entry_t ept = 0;
6177 	if (attrs & PP_ATTRS_D)
6178 		ept |= EPT_D;
6179 	if (attrs & PP_ATTRS_A)
6180 		ept |= EPT_A;
6181 	if (attrs & PP_ATTRS_W)
6182 		ept |= EPT_W;
6183 	return ept;
6184 }
6185 
6186 /*
6187  * Helper for pmap_ept_free_ptp.
6188  * tree[0] = &L2[L2idx]
6189  * tree[1] = &L3[L3idx]
6190  * tree[2] = &L4[L4idx]
6191  */
6192 static void
6193 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6194 {
6195 	pt_entry_t *pteva;
6196 	paddr_t ptepa;
6197 	int i, index;
6198 
6199 	ptepa = pmap->pm_pdirpa[0];
6200 	for (i = PTP_LEVELS; i > 1; i--) {
6201 		index = pl_pi(va, i);
6202 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6203 		KASSERT(pmap_ept_valid_entry(pteva[index]));
6204 		tree[i - 2] = &pteva[index];
6205 		ptepa = pmap_pte2pa(pteva[index]);
6206 	}
6207 }
6208 
6209 static void
6210 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6211 {
6212 	pd_entry_t *tree[3];
6213 	int level;
6214 
6215 	KASSERT(pmap != pmap_kernel());
6216 	KASSERT(mutex_owned(&pmap->pm_lock));
6217 	KASSERT(kpreempt_disabled());
6218 
6219 	pmap_ept_get_tree(pmap, va, tree);
6220 
6221 	level = 1;
6222 	do {
6223 		(void)pmap_pte_testset(tree[level - 1], 0);
6224 
6225 		pmap_freepage(pmap, ptp, level);
6226 		if (level < PTP_LEVELS - 1) {
6227 			ptp = pmap_find_ptp(pmap, va, level + 1);
6228 			ptp->wire_count--;
6229 			if (ptp->wire_count > 1)
6230 				break;
6231 		}
6232 	} while (++level < PTP_LEVELS);
6233 	pmap_pte_flush();
6234 }
6235 
6236 /* Install the newly allocated PTPs at L4->L3->L2. */
6237 static void
6238 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6239 {
6240 	struct vm_page *ptp;
6241 	unsigned long index;
6242 	pd_entry_t *pteva;
6243 	paddr_t ptepa;
6244 	int i;
6245 
6246 	KASSERT(pmap != pmap_kernel());
6247 	KASSERT(mutex_owned(&pmap->pm_lock));
6248 	KASSERT(kpreempt_disabled());
6249 
6250 	/*
6251 	 * Now that we have all the pages looked up or allocated,
6252 	 * loop through again installing any new ones into the tree.
6253 	 */
6254 	ptepa = pmap->pm_pdirpa[0];
6255 	for (i = PTP_LEVELS; i > 1; i--) {
6256 		index = pl_pi(va, i);
6257 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6258 
6259 		if (pmap_ept_valid_entry(pteva[index])) {
6260 			KASSERT(!pt->alloced[i]);
6261 			ptepa = pmap_pte2pa(pteva[index]);
6262 			continue;
6263 		}
6264 
6265 		ptp = pt->pg[i];
6266 		ptp->flags &= ~PG_BUSY; /* never busy */
6267 		ptp->wire_count = 1;
6268 		pmap->pm_ptphint[i - 2] = ptp;
6269 		ptepa = VM_PAGE_TO_PHYS(ptp);
6270 		pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6271 
6272 		pmap_pte_flush();
6273 		pmap_stats_update(pmap, 1, 0);
6274 
6275 		/*
6276 		 * If we're not in the top level, increase the
6277 		 * wire count of the parent page.
6278 		 */
6279 		if (i < PTP_LEVELS) {
6280 			pt->pg[i + 1]->wire_count++;
6281 		}
6282 	}
6283 }
6284 
6285 static int
6286 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6287     u_int flags)
6288 {
6289 	pt_entry_t *ptes, opte, npte;
6290 	pt_entry_t *ptep;
6291 	struct vm_page *ptp;
6292 	struct vm_page *new_pg, *old_pg;
6293 	struct pmap_page *new_pp, *old_pp;
6294 	struct pv_entry *old_pve, *new_pve;
6295 	bool wired = (flags & PMAP_WIRED) != 0;
6296 	bool accessed;
6297 	struct pmap_ptparray pt;
6298 	int error;
6299 	bool getptp, samepage, new_embedded;
6300 	rb_tree_t *tree;
6301 
6302 	KASSERT(pmap_initialized);
6303 	KASSERT(va < VM_MAXUSER_ADDRESS);
6304 
6305 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6306 
6307 	if (wired)
6308 		npte |= EPT_WIRED;
6309 	if (flags & VM_PROT_ALL) {
6310 		npte |= EPT_A;
6311 		if (flags & VM_PROT_WRITE) {
6312 			KASSERT((npte & EPT_W) != 0);
6313 			npte |= EPT_D;
6314 		}
6315 	}
6316 
6317 	new_pg = PHYS_TO_VM_PAGE(pa);
6318 	if (new_pg != NULL) {
6319 		/* This is a managed page */
6320 		npte |= EPT_PVLIST;
6321 		new_pp = VM_PAGE_TO_PP(new_pg);
6322 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6323 		/* This is an unmanaged pv-tracked page */
6324 		npte |= EPT_PVLIST;
6325 	} else {
6326 		new_pp = NULL;
6327 	}
6328 
6329 	/* Begin by locking the pmap. */
6330 	mutex_enter(&pmap->pm_lock);
6331 
6332 	/* Look up the PTP.  Allocate if none present. */
6333 	ptp = NULL;
6334 	getptp = false;
6335 	if (pmap != pmap_kernel()) {
6336 		ptp = pmap_find_ptp(pmap, va, 1);
6337 		if (ptp == NULL) {
6338 			getptp = true;
6339 			error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6340 			if (error != 0) {
6341 				if (flags & PMAP_CANFAIL) {
6342 					mutex_exit(&pmap->pm_lock);
6343 					return error;
6344 				}
6345 				panic("%s: get ptp failed, error=%d", __func__,
6346 				    error);
6347 			}
6348 		}
6349 		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6350 	} else {
6351 		/* Embedded PV entries rely on this. */
6352 		KASSERT(va != 0);
6353 		tree = &pmap_kernel_rb;
6354 	}
6355 
6356 	/*
6357 	 * Look up the old PV entry at this VA (if any), and insert a new PV
6358 	 * entry if required for the new mapping.  Temporarily track the old
6359 	 * and new mappings concurrently.  Only after the old mapping is
6360 	 * evicted from the pmap will we remove its PV entry.  Otherwise,
6361 	 * our picture of modified/accessed state for either page could get
6362 	 * out of sync (we need any P->V operation for either page to stall
6363 	 * on pmap->pm_lock until done here).
6364 	 */
6365 	new_pve = NULL;
6366 	old_pve = NULL;
6367 	samepage = false;
6368 	new_embedded = false;
6369 
6370 	if (new_pp != NULL) {
6371 		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6372 		    &old_pve, &samepage, &new_embedded, tree);
6373 
6374 		/*
6375 		 * If a new pv_entry was needed and none was available, we
6376 		 * can go no further.
6377 		 */
6378 		if (error != 0) {
6379 			if (flags & PMAP_CANFAIL) {
6380 				if (getptp) {
6381 					pmap_unget_ptp(pmap, &pt);
6382 				}
6383 				mutex_exit(&pmap->pm_lock);
6384 				return error;
6385 			}
6386 			panic("%s: alloc pve failed", __func__);
6387 		}
6388 	} else {
6389 		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6390 	}
6391 
6392 	/* Map PTEs into address space. */
6393 	kpreempt_disable();
6394 
6395 	/* Install any newly allocated PTPs. */
6396 	if (getptp) {
6397 		pmap_ept_install_ptp(pmap, &pt, va);
6398 	}
6399 
6400 	/* Check if there is an existing mapping. */
6401 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6402 	ptep = &ptes[pl1_pi(va)];
6403 	opte = *ptep;
6404 	bool have_oldpa = pmap_ept_valid_entry(opte);
6405 	paddr_t oldpa = pmap_pte2pa(opte);
6406 
6407 	/*
6408 	 * Update the pte.
6409 	 */
6410 	do {
6411 		opte = *ptep;
6412 
6413 		/*
6414 		 * if the same page, inherit EPT_A and EPT_D.
6415 		 */
6416 		if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6417 			npte |= opte & (EPT_A | EPT_D);
6418 		}
6419 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6420 
6421 	/*
6422 	 * Done with the PTEs: they can now be unmapped.
6423 	 */
6424 	kpreempt_enable();
6425 
6426 	/*
6427 	 * Update statistics and PTP's reference count.
6428 	 */
6429 	pmap_ept_stats_update_bypte(pmap, npte, opte);
6430 	if (ptp != NULL) {
6431 		if (!have_oldpa) {
6432 			ptp->wire_count++;
6433 		}
6434 		/* Remember minimum VA in PTP. */
6435 		pmap_ptp_range_set(ptp, va);
6436 	}
6437 	KASSERT(ptp == NULL || ptp->wire_count > 1);
6438 
6439 	/*
6440 	 * If the same page, we can skip pv_entry handling.
6441 	 */
6442 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6443 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6444 		if ((npte & EPT_PVLIST) != 0) {
6445 			KASSERT(samepage);
6446 			pmap_check_pv(pmap, ptp, new_pp, va, true);
6447 		}
6448 		goto same_pa;
6449 	} else if ((npte & EPT_PVLIST) != 0) {
6450 		KASSERT(!samepage);
6451 	}
6452 
6453 	/*
6454 	 * If old page is pv-tracked, remove pv_entry from its list.
6455 	 */
6456 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6457 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6458 			old_pp = VM_PAGE_TO_PP(old_pg);
6459 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6460 			panic("%s: EPT_PVLIST with pv-untracked page"
6461 			    " va = %#"PRIxVADDR
6462 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6463 			    __func__, va, oldpa, atop(pa));
6464 		}
6465 
6466 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6467 		    pmap_ept_to_pp_attrs(opte));
6468 	} else {
6469 		KASSERT(old_pve == NULL);
6470 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6471 	}
6472 
6473 	/*
6474 	 * If new page is dynamically PV tracked, insert to tree.
6475 	 */
6476 	if (new_pve != NULL) {
6477 		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6478 		old_pve = rb_tree_insert_node(tree, new_pve);
6479 		KASSERT(old_pve == new_pve);
6480 		pmap_check_pv(pmap, ptp, new_pp, va, true);
6481 	}
6482 
6483 same_pa:
6484 	/*
6485 	 * shootdown tlb if necessary.
6486 	 */
6487 
6488 	if (pmap_ept_has_ad) {
6489 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
6490 	} else {
6491 		accessed = (opte & EPT_R) != 0;
6492 	}
6493 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6494 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6495 	}
6496 	pmap_drain_pv(pmap);
6497 	mutex_exit(&pmap->pm_lock);
6498 	return 0;
6499 }
6500 
6501 /* Return 0 and the L2 PDE via *lastpde if valid, else the first invalid level. */
6502 static int
6503 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6504 {
6505 	pt_entry_t *pteva;
6506 	paddr_t ptepa;
6507 	int i, index;
6508 
6509 	KASSERT(mutex_owned(&pmap->pm_lock));
6510 
6511 	ptepa = pmap->pm_pdirpa[0];
6512 	for (i = PTP_LEVELS; i > 1; i--) {
6513 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6514 		index = pl_pi(va, i);
6515 		if (!pmap_ept_valid_entry(pteva[index]))
6516 			return i;
6517 		ptepa = pmap_pte2pa(pteva[index]);
6518 	}
6519 	if (lastpde != NULL) {
6520 		*lastpde = pteva[index];
6521 	}
6522 
6523 	return 0;
6524 }
6525 
6526 static bool
6527 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6528 {
6529 	pt_entry_t *ptes, pte;
6530 	pd_entry_t pde;
6531 	paddr_t ptppa, pa;
6532 	bool rv;
6533 
6534 #ifdef __HAVE_DIRECT_MAP
6535 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6536 		if (pap != NULL) {
6537 			*pap = PMAP_DIRECT_UNMAP(va);
6538 		}
6539 		return true;
6540 	}
6541 #endif
6542 
6543 	rv = false;
6544 	pa = 0;
6545 
6546 	mutex_enter(&pmap->pm_lock);
6547 	kpreempt_disable();
6548 
6549 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6550 		ptppa = pmap_pte2pa(pde);
6551 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6552 		pte = ptes[pl1_pi(va)];
6553 		if (__predict_true((pte & EPT_R) != 0)) {
6554 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6555 			rv = true;
6556 		}
6557 	}
6558 
6559 	kpreempt_enable();
6560 	mutex_exit(&pmap->pm_lock);
6561 
6562 	if (pap != NULL) {
6563 		*pap = pa;
6564 	}
6565 	return rv;
6566 }
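
/*
 * Caller sketch: consumers do not call this directly; they use the generic
 * pmap_extract(), which reaches here through the pm_extract hook installed
 * by pmap_ept_transform() (names below are illustrative):
 *
 *	paddr_t hpa;
 *	if (pmap_extract(guest_pmap, gpa, &hpa))
 *		... gpa is currently backed by host physical address hpa ...
 */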
6567 
6568 static bool
6569 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6570     vaddr_t va)
6571 {
6572 	struct pv_entry *pve;
6573 	struct vm_page *pg;
6574 	struct pmap_page *pp;
6575 	pt_entry_t opte;
6576 	bool accessed;
6577 
6578 	KASSERT(pmap != pmap_kernel());
6579 	KASSERT(mutex_owned(&pmap->pm_lock));
6580 	KASSERT(kpreempt_disabled());
6581 
6582 	if (!pmap_ept_valid_entry(*pte)) {
6583 		/* VA not mapped. */
6584 		return false;
6585 	}
6586 
6587 	/* Atomically save the old PTE and zap it. */
6588 	opte = pmap_pte_testset(pte, 0);
6589 	if (!pmap_ept_valid_entry(opte)) {
6590 		return false;
6591 	}
6592 
6593 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6594 
6595 	if (ptp) {
6596 		/*
6597 		 * Dropping a PTE.  Make sure that the PDE is flushed.
6598 		 */
6599 		ptp->wire_count--;
6600 		if (ptp->wire_count <= 1) {
6601 			opte |= EPT_A;
6602 		}
6603 	}
6604 
6605 	if (pmap_ept_has_ad) {
6606 		accessed = (opte & EPT_A) != 0;
6607 	} else {
6608 		accessed = true;
6609 	}
6610 	if (accessed) {
6611 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6612 	}
6613 
6614 	/*
6615 	 * If we are not on a pv list - we are done.
6616 	 */
6617 	if ((opte & EPT_PVLIST) == 0) {
6618 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6619 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6620 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6621 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6622 		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6623 		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6624 		return true;
6625 	}
6626 
6627 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6628 		pp = VM_PAGE_TO_PP(pg);
6629 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6630 		paddr_t pa = pmap_pte2pa(opte);
6631 		panic("%s: EPT_PVLIST with pv-untracked page"
6632 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6633 		    __func__, va, pa, atop(pa));
6634 	}
6635 
6636 	/* Sync R/M bits. */
6637 	pve = pmap_lookup_pv(pmap, ptp, pp, va);
6638 	pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6639 	return true;
6640 }
6641 
6642 static void
6643 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6644     vaddr_t startva, vaddr_t endva)
6645 {
6646 	pt_entry_t *pte = (pt_entry_t *)ptpva;
6647 
6648 	KASSERT(pmap != pmap_kernel());
6649 	KASSERT(mutex_owned(&pmap->pm_lock));
6650 	KASSERT(kpreempt_disabled());
6651 
6652 	/*
6653 	 * mappings are very often sparse, so clip the given range to the
6654 	 * range of PTEs that are known present in the PTP.
6655 	 */
6656 	pmap_ptp_range_clip(ptp, &startva, &pte);
6657 
6658 	/*
6659 	 * note that ptpva points to the PTE that maps startva; this may
6660 	 * or may not be the first PTE in the PTP.
6661 	 *
6662 	 * we loop through the PTP while there are still PTEs to look at
6663 	 * and the wire_count is greater than 1 (because we use the wire_count
6664 	 * to keep track of the number of real PTEs in the PTP).
6665 	 */
6666 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6667 		(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6668 		startva += PAGE_SIZE;
6669 		pte++;
6670 	}
6671 }
6672 
6673 static void
6674 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6675 {
6676 	pt_entry_t *ptes;
6677 	pd_entry_t pde;
6678 	paddr_t ptppa;
6679 	vaddr_t blkendva, va = sva;
6680 	struct vm_page *ptp;
6681 
6682 	mutex_enter(&pmap->pm_lock);
6683 	kpreempt_disable();
6684 
6685 	for (/* null */ ; va < eva ; va = blkendva) {
6686 		int lvl;
6687 
6688 		/* determine range of block */
6689 		blkendva = x86_round_pdr(va+1);
6690 		if (blkendva > eva)
6691 			blkendva = eva;
6692 
6693 		lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6694 		if (lvl != 0) {
6695 			/* Skip a range corresponding to an invalid pde. */
6696 			blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6697 			continue;
6698 		}
6699 
6700 		/* PA of the PTP */
6701 		ptppa = pmap_pte2pa(pde);
6702 
6703 		ptp = pmap_find_ptp(pmap, va, 1);
6704 		KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6705 		    __func__);
6706 
6707 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6708 
6709 		pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6710 		    blkendva);
6711 
6712 		/* If PTP is no longer being used, free it. */
6713 		if (ptp && ptp->wire_count <= 1) {
6714 			pmap_ept_free_ptp(pmap, ptp, va);
6715 		}
6716 	}
6717 
6718 	kpreempt_enable();
6719 	pmap_drain_pv(pmap);
6720 	mutex_exit(&pmap->pm_lock);
6721 }
6722 
6723 static int
6724 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6725     uint8_t *oattrs, pt_entry_t *optep)
6726 {
6727 	struct pmap *pmap;
6728 	pt_entry_t *ptep;
6729 	pt_entry_t opte;
6730 	pt_entry_t npte;
6731 	pt_entry_t expect;
6732 	bool need_shootdown;
6733 
6734 	expect = pmap_pa2pte(pa) | EPT_R;
6735 	pmap = ptp_to_pmap(ptp);
6736 
6737 	if (clearbits != ~0) {
6738 		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6739 		clearbits = pmap_pp_attrs_to_ept(clearbits);
6740 	}
6741 
6742 	ptep = pmap_map_pte(pmap, ptp, va);
6743 	do {
6744 		opte = *ptep;
6745 		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6746 		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6747 		KASSERT(opte == 0 || (opte & EPT_R) != 0);
6748 		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6749 			/*
6750 			 * We lost a race with a V->P operation like
6751 			 * pmap_remove().  Back off and let the competitor
6752 			 * finish reflecting the PTE bits into mp_attrs.
6753 			 */
6754 			pmap_unmap_pte();
6755 			return EAGAIN;
6756 		}
6757 
6758 		/*
6759 		 * Check if there's anything to do on this PTE.
6760 		 */
6761 		if ((opte & clearbits) == 0) {
6762 			need_shootdown = false;
6763 			break;
6764 		}
6765 
6766 		/*
6767 		 * We need a shootdown if the PTE may be cached in the TLB
6768 		 * (EPT_A set), unless we are clearing only the EPT_W bit
6769 		 * and the entry was never cached as writable (EPT_D clear).
6770 		 */
6771 		if (pmap_ept_has_ad) {
6772 			need_shootdown = (opte & EPT_A) != 0 &&
6773 			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
6774 		} else {
6775 			need_shootdown = true;
6776 		}
6777 
6778 		npte = opte & ~clearbits;
6779 
6780 		/*
6781 		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
6782 		 */
6783 		if (need_shootdown) {
6784 			npte &= ~(EPT_A | EPT_D);
6785 		}
6786 		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6787 		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6788 		KASSERT(npte == 0 || (opte & EPT_R) != 0);
6789 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
6790 
6791 	if (need_shootdown) {
6792 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6793 	}
6794 	pmap_unmap_pte();
6795 
6796 	*oattrs = pmap_ept_to_pp_attrs(opte);
6797 	if (optep != NULL)
6798 		*optep = opte;
6799 	return 0;
6800 }
6801 
6802 static void
6803 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6804     vaddr_t va)
6805 {
6806 
6807 	KASSERT(mutex_owned(&pmap->pm_lock));
6808 
6809 	pmap_ept_stats_update_bypte(pmap, 0, opte);
6810 	ptp->wire_count--;
6811 	if (ptp->wire_count <= 1) {
6812 		pmap_ept_free_ptp(pmap, ptp, va);
6813 	}
6814 }
6815 
6816 static void
6817 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6818 {
6819 	pt_entry_t bit_rem;
6820 	pt_entry_t *ptes, *spte;
6821 	pt_entry_t opte, npte;
6822 	pd_entry_t pde;
6823 	paddr_t ptppa;
6824 	vaddr_t va;
6825 	bool modified;
6826 
6827 	bit_rem = 0;
6828 	if (!(prot & VM_PROT_WRITE))
6829 		bit_rem = EPT_W;
6830 
6831 	sva &= PTE_FRAME;
6832 	eva &= PTE_FRAME;
6833 
6834 	/* Acquire pmap. */
6835 	mutex_enter(&pmap->pm_lock);
6836 	kpreempt_disable();
6837 
6838 	for (va = sva; va < eva; va += PAGE_SIZE) {
6839 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6840 			continue;
6841 		}
6842 
6843 		ptppa = pmap_pte2pa(pde);
6844 		ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6845 		spte = &ptes[pl1_pi(va)];
6846 
6847 		do {
6848 			opte = *spte;
6849 			if (!pmap_ept_valid_entry(opte)) {
6850 				goto next;
6851 			}
6852 			npte = (opte & ~bit_rem);
6853 		} while (pmap_pte_cas(spte, opte, npte) != opte);
6854 
6855 		if (pmap_ept_has_ad) {
6856 			modified = (opte & EPT_D) != 0;
6857 		} else {
6858 			modified = true;
6859 		}
6860 		if (modified) {
6861 			vaddr_t tva = x86_ptob(spte - ptes);
6862 			pmap_tlb_shootdown(pmap, tva, 0,
6863 			    TLBSHOOT_WRITE_PROTECT);
6864 		}
6865 next:;
6866 	}
6867 
6868 	kpreempt_enable();
6869 	mutex_exit(&pmap->pm_lock);
6870 }
6871 
6872 static void
6873 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6874 {
6875 	pt_entry_t *ptes, *ptep, opte;
6876 	pd_entry_t pde;
6877 	paddr_t ptppa;
6878 
6879 	/* Acquire pmap. */
6880 	mutex_enter(&pmap->pm_lock);
6881 	kpreempt_disable();
6882 
6883 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6884 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6885 	}
6886 
6887 	ptppa = pmap_pte2pa(pde);
6888 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6889 	ptep = &ptes[pl1_pi(va)];
6890 	opte = *ptep;
6891 	KASSERT(pmap_ept_valid_entry(opte));
6892 
6893 	if (opte & EPT_WIRED) {
6894 		pt_entry_t npte = opte & ~EPT_WIRED;
6895 
6896 		opte = pmap_pte_testset(ptep, npte);
6897 		pmap_ept_stats_update_bypte(pmap, npte, opte);
6898 	} else {
6899 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
6900 		    "did not change!\n", __func__, pmap, va);
6901 	}
6902 
6903 	/* Release pmap. */
6904 	kpreempt_enable();
6905 	mutex_exit(&pmap->pm_lock);
6906 }
6907 
6908 /* -------------------------------------------------------------------------- */
6909 
6910 void
6911 pmap_ept_transform(struct pmap *pmap)
6912 {
6913 	pmap->pm_enter = pmap_ept_enter;
6914 	pmap->pm_extract = pmap_ept_extract;
6915 	pmap->pm_remove = pmap_ept_remove;
6916 	pmap->pm_sync_pv = pmap_ept_sync_pv;
6917 	pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6918 	pmap->pm_write_protect = pmap_ept_write_protect;
6919 	pmap->pm_unwire = pmap_ept_unwire;
6920 
6921 	memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6922 }
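
/*
 * Usage sketch (the NVMM-side names are illustrative, not verbatim): a guest
 * physical address space starts life as an ordinary pmap and is converted
 * exactly once, before any mapping is entered; from then on every pmap
 * operation on it is routed through the pm_* callbacks installed above:
 *
 *	struct pmap *gpm = vm_map_pmap(&guest_vmspace->vm_map);
 *	pmap_ept_transform(gpm);
 *	pmap_enter(gpm, gpa, hpa, VM_PROT_READ | VM_PROT_WRITE, PMAP_WIRED);
 */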
6923 
6924 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6925