1 /*	$NetBSD: pmap.c,v 1.291 2018/06/20 11:57:22 maxv Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright (c) 1997 Charles D. Cranor and Washington University.
74  * All rights reserved.
75  *
76  * Redistribution and use in source and binary forms, with or without
77  * modification, are permitted provided that the following conditions
78  * are met:
79  * 1. Redistributions of source code must retain the above copyright
80  *    notice, this list of conditions and the following disclaimer.
81  * 2. Redistributions in binary form must reproduce the above copyright
82  *    notice, this list of conditions and the following disclaimer in the
83  *    documentation and/or other materials provided with the distribution.
84  *
85  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
86  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
87  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
88  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
89  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
90  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
94  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95  */
96 
97 /*
98  * Copyright 2001 (c) Wasabi Systems, Inc.
99  * All rights reserved.
100  *
101  * Written by Frank van der Linden for Wasabi Systems, Inc.
102  *
103  * Redistribution and use in source and binary forms, with or without
104  * modification, are permitted provided that the following conditions
105  * are met:
106  * 1. Redistributions of source code must retain the above copyright
107  *    notice, this list of conditions and the following disclaimer.
108  * 2. Redistributions in binary form must reproduce the above copyright
109  *    notice, this list of conditions and the following disclaimer in the
110  *    documentation and/or other materials provided with the distribution.
111  * 3. All advertising materials mentioning features or use of this software
112  *    must display the following acknowledgement:
113  *      This product includes software developed for the NetBSD Project by
114  *      Wasabi Systems, Inc.
115  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
116  *    or promote products derived from this software without specific prior
117  *    written permission.
118  *
119  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
120  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
121  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
122  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
123  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
124  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
125  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
126  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
127  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
128  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
129  * POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 /*
133  * This is the i386 pmap modified and generalized to support x86-64
134  * as well. The idea is to hide the upper N levels of the page tables
135  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
136  * is mostly untouched, except that it uses some more generalized
137  * macros and interfaces.
138  *
139  * This pmap has been tested on the i386 as well, and it can be easily
140  * adapted to PAE.
141  *
142  * fvdl@wasabisystems.com 18-Jun-2001
143  */
144 
145 /*
146  * pmap.c: i386 pmap module rewrite
147  * Chuck Cranor <chuck@netbsd>
148  * 11-Aug-97
149  *
150  * history of this pmap module: in addition to my own input, i used
151  *    the following references for this rewrite of the i386 pmap:
152  *
153  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
154  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
155  *     it was then ported to the i386 by William Jolitz of UUNET
156  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
157  *     project fixed some bugs and provided some speed ups.
158  *
159  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
160  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
161  *     and David Greenman.
162  *
163  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
164  *     between several processors.   the VAX version was done by
165  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
166  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
167  *     David Golub, and Richard Draves.    the alpha version was
168  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
169  *     (NetBSD/alpha).
170  */
171 
172 #include <sys/cdefs.h>
173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.291 2018/06/20 11:57:22 maxv Exp $");
174 
175 #include "opt_user_ldt.h"
176 #include "opt_lockdebug.h"
177 #include "opt_multiprocessor.h"
178 #include "opt_xen.h"
179 #include "opt_svs.h"
180 
181 #include <sys/param.h>
182 #include <sys/systm.h>
183 #include <sys/proc.h>
184 #include <sys/pool.h>
185 #include <sys/kernel.h>
186 #include <sys/atomic.h>
187 #include <sys/cpu.h>
188 #include <sys/intr.h>
189 #include <sys/xcall.h>
190 #include <sys/kcore.h>
191 
192 #include <uvm/uvm.h>
193 #include <uvm/pmap/pmap_pvt.h>
194 
195 #include <dev/isa/isareg.h>
196 
197 #include <machine/specialreg.h>
198 #include <machine/gdt.h>
199 #include <machine/isa_machdep.h>
200 #include <machine/cpuvar.h>
201 #include <machine/cputypes.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /*
215  * general info:
216  *
217  *  - for an explanation of how the i386 MMU hardware works see
218  *    the comments in <machine/pte.h>.
219  *
220  *  - for an explanation of the general memory structure used by
221  *    this pmap (including the recursive mapping), see the comments
222  *    in <machine/pmap.h>.
223  *
224  * this file contains the code for the "pmap module."   the module's
225  * job is to manage the hardware's virtual to physical address mappings.
226  * note that there are two levels of mapping in the VM system:
227  *
228  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
229  *      to map ranges of virtual address space to objects/files.  for
230  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
231  *      to the file /bin/ls starting at offset zero."   note that
232  *      the upper layer mapping is not concerned with how individual
233  *      vm_pages are mapped.
234  *
235  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
236  *      from virtual addresses to physical pages.   it is concerned with
237  *      which vm_page is mapped where.   for example, when you run /bin/ls
238  *      and start at page 0x1000 the fault routine may look up the correct
239  *      page of the /bin/ls file and then ask the pmap layer to establish
240  *      a mapping for it.
241  *
242  * note that information in the lower layer of the VM system can be
243  * thrown away since it can easily be reconstructed from the info
244  * in the upper layer.
245  *
246  * data structures we use include:
247  *
248  *  - struct pmap: describes the address space of one thread
249  *  - struct pmap_page: describes one pv-tracked page, without
250  *	necessarily a corresponding vm_page
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per pv-tracked page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
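/*
 * as an illustration of the structures above (a sketch only, not meant to
 * be compiled as-is): all <PMAP,VA> mappings of a pv-tracked page can be
 * walked with the iterators defined further down in this file,
 *
 *	struct pv_pte *pvpte;
 *
 *	for (pvpte = pv_pte_first(pp); pvpte != NULL;
 *	     pvpte = pv_pte_next(pp, pvpte)) {
 *		... use pvpte->pte_ptp and pvpte->pte_va ...
 *	}
 *
 * which is the access pattern behind page based operations such as
 * pmap_page_protect().
 */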
259 
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() time; we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low on free physical memory then we sleep in
271  * uvm_km_alloc -- this is ok in this case since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we need to grow
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures, allocated from the pmap_pv_cache pool (below)
289  */
290 
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific pv-tracked PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
314 
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 struct pcpu_area *pcpuarea __read_mostly;
327 
328 static vaddr_t pmap_maxkvaddr;
329 
330 /*
331  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
332  * actual locking is done by pm_lock.
333  */
334 #if defined(DIAGNOSTIC)
335 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
336 	KASSERT(mutex_owned((pm)->pm_lock)); \
337 	if ((idx) != 0) \
338 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
339 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
340 	KASSERT(mutex_owned((pm)->pm_lock)); \
341 	if ((idx) != 0) \
342 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
343 #else /* defined(DIAGNOSTIC) */
344 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
345 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
346 #endif /* defined(DIAGNOSTIC) */
347 
348 /*
349  * Misc. event counters.
350  */
351 struct evcnt pmap_iobmp_evcnt;
352 struct evcnt pmap_ldt_evcnt;
353 
354 /*
355  * PAT
356  */
357 #define	PATENTRY(n, type)	((type) << ((n) * 8))
358 #define	PAT_UC		0x0ULL
359 #define	PAT_WC		0x1ULL
360 #define	PAT_WT		0x4ULL
361 #define	PAT_WP		0x5ULL
362 #define	PAT_WB		0x6ULL
363 #define	PAT_UCMINUS	0x7ULL
364 
365 static bool cpu_pat_enabled __read_mostly = false;
366 
367 /*
368  * Global data structures
369  */
370 
371 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
372 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
373 
374 struct bootspace bootspace __read_mostly;
375 
376 /*
377  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
378  * set pmap_pg_nx to PG_NX (otherwise it is zero).
379  */
380 pd_entry_t pmap_pg_nx __read_mostly = 0;
381 
382 /*
383  * pmap_pg_g: if our processor supports PG_G in the PTE then we
384  * set pmap_pg_g to PG_G (otherwise it is zero).
385  */
386 pd_entry_t pmap_pg_g __read_mostly = 0;
387 
388 /*
389  * pmap_largepages: if our processor supports PG_PS and we are
390  * using it, this is set to true.
391  */
392 int pmap_largepages __read_mostly = 0;
393 
394 /*
395  * i386 physical memory comes in a big contig chunk with a small
396  * hole toward the front of it...  the following paddr_t's
397  * (shared with machdep.c) describe the physical address space
398  * of this machine.
399  */
400 paddr_t lowmem_rsvd __read_mostly;
401 paddr_t avail_start __read_mostly; /* PA of first available physical page */
402 paddr_t avail_end __read_mostly; /* PA of last available physical page */
403 
404 #ifdef XEN
405 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
406 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
407 #endif
408 
409 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
410 
411 #define	PV_HASH_SIZE		32768
412 #define	PV_HASH_LOCK_CNT	32
413 
414 struct pv_hash_lock {
415 	kmutex_t lock;
416 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
417     __aligned(CACHE_LINE_SIZE);
418 
419 struct pv_hash_head {
420 	SLIST_HEAD(, pv_entry) hh_list;
421 } pv_hash_heads[PV_HASH_SIZE];
422 
423 static u_int
424 pvhash_hash(struct vm_page *ptp, vaddr_t va)
425 {
426 
427 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
428 }
429 
430 static struct pv_hash_head *
431 pvhash_head(u_int hash)
432 {
433 
434 	return &pv_hash_heads[hash % PV_HASH_SIZE];
435 }
436 
437 static kmutex_t *
438 pvhash_lock(u_int hash)
439 {
440 
441 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
442 }
443 
444 static struct pv_entry *
445 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
446 {
447 	struct pv_entry *pve;
448 	struct pv_entry *prev;
449 
450 	prev = NULL;
451 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
452 		if (pve->pve_pte.pte_ptp == ptp &&
453 		    pve->pve_pte.pte_va == va) {
454 			if (prev != NULL) {
455 				SLIST_REMOVE_AFTER(prev, pve_hash);
456 			} else {
457 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
458 			}
459 			break;
460 		}
461 		prev = pve;
462 	}
463 	return pve;
464 }
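/*
 * a minimal usage sketch of the pv hash helpers above (illustrative only;
 * the real callers live in the pv_entry code further down).  the bucket
 * mutex returned by pvhash_lock() is expected to be held around any
 * traversal of that bucket's list:
 *
 *	struct pv_entry *pve;
 *	u_int hash = pvhash_hash(ptp, va);
 *	kmutex_t *lock = pvhash_lock(hash);
 *
 *	mutex_enter(lock);
 *	pve = pvhash_remove(pvhash_head(hash), ptp, va);
 *	mutex_exit(lock);
 */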
465 
466 /*
467  * Other data structures
468  */
469 
470 static pt_entry_t protection_codes[8] __read_mostly;
471 
472 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
473 
474 /*
475  * The following two vaddr_t's are used during system startup to keep track of
476  * how much of the kernel's VM space we have used. Once the system is started,
477  * the management of the remaining kernel VM space is turned over to the
478  * kernel_map vm_map.
479  */
480 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
481 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
482 
483 #ifndef XEN
484 /*
485  * LAPIC virtual address, and fake physical address.
486  */
487 volatile vaddr_t local_apic_va __read_mostly;
488 paddr_t local_apic_pa __read_mostly;
489 #endif
490 
491 /*
492  * pool that pmap structures are allocated from
493  */
494 static struct pool_cache pmap_cache;
495 
496 /*
497  * pv_entry cache
498  */
499 static struct pool_cache pmap_pv_cache;
500 
501 #ifdef __HAVE_DIRECT_MAP
502 vaddr_t pmap_direct_base __read_mostly;
503 vaddr_t pmap_direct_end __read_mostly;
504 size_t pmap_direct_pdpe __read_mostly;
505 size_t pmap_direct_npdp __read_mostly;
506 #endif
507 
508 #ifndef __HAVE_DIRECT_MAP
509 /*
510  * Special VAs and the PTEs that map them
511  */
512 static pt_entry_t *early_zero_pte;
513 static void pmap_vpage_cpualloc(struct cpu_info *);
514 #ifdef XEN
515 char *early_zerop; /* also referenced from xen_locore() */
516 #else
517 static char *early_zerop;
518 #endif
519 #endif
520 
521 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
522 
523 /* PDP pool_cache(9) and its callbacks */
524 struct pool_cache pmap_pdp_cache;
525 static int  pmap_pdp_ctor(void *, void *, int);
526 static void pmap_pdp_dtor(void *, void *);
527 #ifdef PAE
528 /* need to allocate items of 4 pages */
529 static void *pmap_pdp_alloc(struct pool *, int);
530 static void pmap_pdp_free(struct pool *, void *);
531 static struct pool_allocator pmap_pdp_allocator = {
532 	.pa_alloc = pmap_pdp_alloc,
533 	.pa_free = pmap_pdp_free,
534 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
535 };
536 #endif /* PAE */
537 
538 extern vaddr_t idt_vaddr;
539 extern paddr_t idt_paddr;
540 extern vaddr_t gdt_vaddr;
541 extern paddr_t gdt_paddr;
542 extern vaddr_t ldt_vaddr;
543 extern paddr_t ldt_paddr;
544 
545 extern int end;
546 
547 #ifdef i386
548 /* stuff to fix the pentium f00f bug */
549 extern vaddr_t pentium_idt_vaddr;
550 #endif
551 
552 /*
553  * Local prototypes
554  */
555 
556 #ifdef __HAVE_PCPU_AREA
557 static void pmap_init_pcpu(void);
558 #endif
559 #ifdef __HAVE_DIRECT_MAP
560 static void pmap_init_directmap(struct pmap *);
561 #endif
562 #if !defined(XEN)
563 static void pmap_remap_global(void);
564 #endif
565 #ifndef XEN
566 static void pmap_init_lapic(void);
567 static void pmap_remap_largepages(void);
568 #endif
569 
570 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
571     pd_entry_t * const *, int);
572 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
573 static void pmap_freepage(struct pmap *, struct vm_page *, int);
574 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
575     pt_entry_t *, pd_entry_t * const *);
576 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
577     vaddr_t, struct pv_entry **);
578 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
579     vaddr_t, struct pv_entry **);
580 
581 static paddr_t pmap_get_physpage(void);
582 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
583 
584 static void pmap_reactivate(struct pmap *);
585 
586 /*
587  * p m a p   h e l p e r   f u n c t i o n s
588  */
589 
590 static inline void
591 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
592 {
593 
594 	if (pmap == pmap_kernel()) {
595 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
596 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
597 	} else {
598 		KASSERT(mutex_owned(pmap->pm_lock));
599 		pmap->pm_stats.resident_count += resid_diff;
600 		pmap->pm_stats.wired_count += wired_diff;
601 	}
602 }
603 
604 static inline void
605 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
606 {
607 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
608 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
609 
610 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
611 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
612 
613 	pmap_stats_update(pmap, resid_diff, wired_diff);
614 }
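/*
 * for example (illustrative): installing a valid, wired PTE over an
 * invalid one (opte = 0, npte = PG_V | PG_W | ...) gives resid_diff = +1
 * and wired_diff = +1; zapping a valid, unwired PTE gives -1 and 0.
 */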
615 
616 /*
617  * ptp_to_pmap: lookup pmap by ptp
618  */
619 
620 static struct pmap *
621 ptp_to_pmap(struct vm_page *ptp)
622 {
623 	struct pmap *pmap;
624 
625 	if (ptp == NULL) {
626 		return pmap_kernel();
627 	}
628 	pmap = (struct pmap *)ptp->uobject;
629 	KASSERT(pmap != NULL);
630 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
631 	return pmap;
632 }
633 
634 static inline struct pv_pte *
635 pve_to_pvpte(struct pv_entry *pve)
636 {
637 
638 	KASSERT((void *)&pve->pve_pte == (void *)pve);
639 	return &pve->pve_pte;
640 }
641 
642 static inline struct pv_entry *
643 pvpte_to_pve(struct pv_pte *pvpte)
644 {
645 	struct pv_entry *pve = (void *)pvpte;
646 
647 	KASSERT(pve_to_pvpte(pve) == pvpte);
648 	return pve;
649 }
650 
651 /*
652  * pv_pte_first, pv_pte_next: PV list iterator.
653  */
654 
655 static struct pv_pte *
656 pv_pte_first(struct pmap_page *pp)
657 {
658 
659 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
660 		return &pp->pp_pte;
661 	}
662 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
663 }
664 
665 static struct pv_pte *
666 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
667 {
668 
669 	KASSERT(pvpte != NULL);
670 	if (pvpte == &pp->pp_pte) {
671 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
672 		return NULL;
673 	}
674 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
675 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
676 }
677 
678 /*
679  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
680  *		of course the kernel is always loaded
681  */
682 
683 bool
684 pmap_is_curpmap(struct pmap *pmap)
685 {
686 	return((pmap == pmap_kernel()) ||
687 	       (pmap == curcpu()->ci_pmap));
688 }
689 
690 /*
691  *	Add a reference to the specified pmap.
692  */
693 
694 void
695 pmap_reference(struct pmap *pmap)
696 {
697 
698 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
699 }
700 
701 /*
702  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
703  *
704  * there are several pmaps involved.  some or all of them might be the same.
705  *
706  *	- the pmap given by the first argument
707  *		our caller wants to access this pmap's PTEs.
708  *
709  *	- pmap_kernel()
710  *		the kernel pmap.  note that it only contains the kernel part
711  *		of the address space, which is shared by all pmaps.  i.e. any
712  *		pmap can be used instead of pmap_kernel() for our purpose.
713  *
714  *	- ci->ci_pmap
715  *		pmap currently loaded on the cpu.
716  *
717  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
718  *		current process' pmap.
719  *
720  * => we take enough locks to keep the returned PTE mapping valid and stable
721  * => must be undone with pmap_unmap_ptes before returning
722  */
723 
724 void
725 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
726 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
727 {
728 	struct pmap *curpmap;
729 	struct cpu_info *ci;
730 	lwp_t *l;
731 
732 	/* The kernel's pmap is always accessible. */
733 	if (pmap == pmap_kernel()) {
734 		*pmap2 = NULL;
735 		*ptepp = PTE_BASE;
736 		*pdeppp = normal_pdes;
737 		return;
738 	}
739 	KASSERT(kpreempt_disabled());
740 
741 	l = curlwp;
742  retry:
743 	mutex_enter(pmap->pm_lock);
744 	ci = curcpu();
745 	curpmap = ci->ci_pmap;
746 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
747 		/* Our own pmap so just load it: easy. */
748 		if (__predict_false(ci->ci_want_pmapload)) {
749 			mutex_exit(pmap->pm_lock);
750 			pmap_load();
751 			goto retry;
752 		}
753 		KASSERT(pmap == curpmap);
754 	} else if (pmap == curpmap) {
755 		/*
756 		 * Already on the CPU: make it valid.  This is very
757 		 * often the case during exit(), when we have switched
758 		 * to the kernel pmap in order to destroy a user pmap.
759 		 */
760 		pmap_reactivate(pmap);
761 	} else {
762 		/*
763 		 * Toss current pmap from CPU, but keep a reference to it.
764 		 * The reference will be dropped by pmap_unmap_ptes().
765 		 * Can happen if we block during exit().
766 		 */
767 		const cpuid_t cid = cpu_index(ci);
768 
769 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
770 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
771 		ci->ci_pmap = pmap;
772 		ci->ci_tlbstate = TLBSTATE_VALID;
773 		kcpuset_atomic_set(pmap->pm_cpus, cid);
774 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
775 		cpu_load_pmap(pmap, curpmap);
776 	}
777 	pmap->pm_ncsw = l->l_ncsw;
778 	*pmap2 = curpmap;
779 	*ptepp = PTE_BASE;
780 
781 #if defined(XEN) && defined(__x86_64__)
782 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
783 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
784 	*pdeppp = ci->ci_normal_pdes;
785 #else
786 	*pdeppp = normal_pdes;
787 #endif
788 }
789 
790 /*
791  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
792  */
793 
794 void
795 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
796 {
797 	struct cpu_info *ci;
798 	struct pmap *mypmap;
799 
800 	KASSERT(kpreempt_disabled());
801 
802 	/* The kernel's pmap is always accessible. */
803 	if (pmap == pmap_kernel()) {
804 		return;
805 	}
806 
807 	ci = curcpu();
808 
809 #if defined(XEN) && defined(__x86_64__)
810 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
811 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
812 #endif
813 
814 	/*
815 	 * We cannot tolerate context switches while mapped in.
816 	 * If it is our own pmap all we have to do is unlock.
817 	 */
818 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
819 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
820 	if (pmap == mypmap) {
821 		mutex_exit(pmap->pm_lock);
822 		return;
823 	}
824 
825 	/*
826 	 * Mark whatever's on the CPU now as lazy and unlock.
827 	 * If the pmap was already installed, we are done.
828 	 */
829 	ci->ci_tlbstate = TLBSTATE_LAZY;
830 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
831 	mutex_exit(pmap->pm_lock);
832 	if (pmap == pmap2) {
833 		return;
834 	}
835 
836 	/*
837 	 * We installed another pmap on the CPU.  Grab a reference to
838 	 * it and leave in place.  Toss the evicted pmap (can block).
839 	 */
840 	pmap_reference(pmap);
841 	pmap_destroy(pmap2);
842 }
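/*
 * a minimal usage sketch of the pair above (illustrative only), following
 * the constraints asserted in both functions (preemption disabled, the
 * mapping undone before returning):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... inspect or modify ptes[pl1_i(va)] ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */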
843 
844 
845 inline static void
846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
847 {
848 
849 #if !defined(__x86_64__)
850 	if (curproc == NULL || curproc->p_vmspace == NULL ||
851 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
852 		return;
853 
854 	if ((opte ^ npte) & PG_X)
855 		pmap_update_pg(va);
856 
857 	/*
858 	 * Executability was removed on the last executable change.
859 	 * Reset the code segment to something conservative and
860 	 * let the trap handler deal with setting the right limit.
861 	 * We can't do that because of locking constraints on the vm map.
862 	 */
863 
864 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
865 		struct trapframe *tf = curlwp->l_md.md_regs;
866 
867 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
868 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
869 	}
870 #endif /* !defined(__x86_64__) */
871 }
872 
873 #if !defined(__x86_64__)
874 /*
875  * Fixup the code segment to cover all potential executable mappings.
876  * returns 0 if no changes to the code segment were made.
877  */
878 
879 int
880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
881 {
882 	struct vm_map_entry *ent;
883 	struct pmap *pm = vm_map_pmap(map);
884 	vaddr_t va = 0;
885 
886 	vm_map_lock_read(map);
887 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
888 
889 		/*
890 		 * This entry has greater va than the entries before.
891 		 * We need to make it point to the last page, not past it.
892 		 */
893 
894 		if (ent->protection & VM_PROT_EXECUTE)
895 			va = trunc_page(ent->end) - PAGE_SIZE;
896 	}
897 	vm_map_unlock_read(map);
898 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
899 		return (0);
900 
901 	pm->pm_hiexec = va;
902 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
903 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
904 	} else {
905 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
906 		return (0);
907 	}
908 	return (1);
909 }
910 #endif /* !defined(__x86_64__) */
911 
912 void
913 pat_init(struct cpu_info *ci)
914 {
915 	uint64_t pat;
916 
917 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
918 		return;
919 
920 	/* We change WT to WC. Leave all other entries at their default values. */
921 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
922 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
923 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
924 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
925 
926 	wrmsr(MSR_CR_PAT, pat);
927 	cpu_pat_enabled = true;
928 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
929 }
930 
931 static pt_entry_t
932 pmap_pat_flags(u_int flags)
933 {
934 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
935 
936 	if (!cpu_pat_enabled) {
937 		switch (cacheflags) {
938 		case PMAP_NOCACHE:
939 		case PMAP_NOCACHE_OVR:
940 			/* results in PGC_UCMINUS on cpus which have
941 			 * the cpuid PAT but PAT "disabled"
942 			 */
943 			return PG_N;
944 		default:
945 			return 0;
946 		}
947 	}
948 
949 	switch (cacheflags) {
950 	case PMAP_NOCACHE:
951 		return PGC_UC;
952 	case PMAP_WRITE_COMBINE:
953 		return PGC_WC;
954 	case PMAP_WRITE_BACK:
955 		return PGC_WB;
956 	case PMAP_NOCACHE_OVR:
957 		return PGC_UCMINUS;
958 	}
959 
960 	return 0;
961 }
962 
963 /*
964  * p m a p   k e n t e r   f u n c t i o n s
965  *
966  * functions to quickly enter/remove pages from the kernel address
967  * space.   pmap_kremove is exported to MI kernel.  we make use of
968  * the recursive PTE mappings.
969  */
970 
971 /*
972  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
973  *
974  * => no need to lock anything, assume va is already allocated
975  * => should be faster than normal pmap enter function
976  */
977 
978 void
979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
980 {
981 	pt_entry_t *pte, opte, npte;
982 
983 	KASSERT(!(prot & ~VM_PROT_ALL));
984 
985 	if (va < VM_MIN_KERNEL_ADDRESS)
986 		pte = vtopte(va);
987 	else
988 		pte = kvtopte(va);
989 #ifdef DOM0OPS
990 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
991 #ifdef DEBUG
992 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
993 		    " outside range\n", __func__, pa, va);
994 #endif /* DEBUG */
995 		npte = pa;
996 	} else
997 #endif /* DOM0OPS */
998 		npte = pmap_pa2pte(pa);
999 	npte |= protection_codes[prot] | PG_V | pmap_pg_g;
1000 	npte |= pmap_pat_flags(flags);
1001 	opte = pmap_pte_testset(pte, npte); /* zap! */
1002 
1003 	/*
1004 	 * XXX: make sure we are not dealing with a large page, since the only
1005 	 * large pages created are for the kernel image, and they should never
1006 	 * be kentered.
1007 	 */
1008 	KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va);
1009 
1010 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1011 		/* This should not happen. */
1012 		printf_nolog("%s: mapping already present\n", __func__);
1013 		kpreempt_disable();
1014 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1015 		kpreempt_enable();
1016 	}
1017 }
1018 
1019 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1020 
1021 #if defined(__x86_64__)
1022 /*
1023  * Change protection for a virtual address. Local for a CPU only, don't
1024  * care about TLB shootdowns.
1025  *
1026  * => must be called with preemption disabled
1027  */
1028 void
1029 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1030 {
1031 	pt_entry_t *pte, opte, npte;
1032 
1033 	KASSERT(kpreempt_disabled());
1034 
1035 	if (va < VM_MIN_KERNEL_ADDRESS)
1036 		pte = vtopte(va);
1037 	else
1038 		pte = kvtopte(va);
1039 
1040 	npte = opte = *pte;
1041 
1042 	if ((prot & VM_PROT_WRITE) != 0)
1043 		npte |= PG_RW;
1044 	else
1045 		npte &= ~PG_RW;
1046 
1047 	if (opte != npte) {
1048 		pmap_pte_set(pte, npte);
1049 		pmap_pte_flush();
1050 		invlpg(va);
1051 	}
1052 }
1053 #endif /* defined(__x86_64__) */
1054 
1055 /*
1056  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1057  *
1058  * => no need to lock anything
1059  * => caller must dispose of any vm_page mapped in the va range
1060  * => note: not an inline function
1061  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1062  * => we assume kernel only unmaps valid addresses and thus don't bother
1063  *    checking the valid bit before doing TLB flushing
1064  * => must be followed by call to pmap_update() before reuse of page
1065  */
1066 
1067 static inline void
1068 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1069 {
1070 	pt_entry_t *pte, opte;
1071 	vaddr_t va, eva;
1072 
1073 	eva = sva + len;
1074 
1075 	kpreempt_disable();
1076 	for (va = sva; va < eva; va += PAGE_SIZE) {
1077 		pte = kvtopte(va);
1078 		opte = pmap_pte_testset(pte, 0); /* zap! */
1079 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1080 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1081 			    TLBSHOOT_KREMOVE);
1082 		}
1083 		KASSERTMSG((opte & PG_PS) == 0,
1084 		    "va %#" PRIxVADDR " is a large page", va);
1085 		KASSERTMSG((opte & PG_PVLIST) == 0,
1086 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1087 	}
1088 	if (localonly) {
1089 		tlbflushg();
1090 	}
1091 	kpreempt_enable();
1092 }
1093 
1094 void
1095 pmap_kremove(vaddr_t sva, vsize_t len)
1096 {
1097 
1098 	pmap_kremove1(sva, len, false);
1099 }
1100 
1101 /*
1102  * pmap_kremove_local: like pmap_kremove(), but only worry about
1103  * TLB invalidations on the current CPU.  this is only intended
1104  * for use while writing kernel crash dumps, either after panic
1105  * or via reboot -d.
1106  */
1107 
1108 void
1109 pmap_kremove_local(vaddr_t sva, vsize_t len)
1110 {
1111 
1112 	pmap_kremove1(sva, len, true);
1113 }
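/*
 * a minimal usage sketch of the kenter/kremove pair (illustrative only),
 * following the rules documented above (va already allocated and page
 * aligned, pmap_update() required before the underlying page is reused):
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... use the mapping ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */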
1114 
1115 /*
1116  * p m a p   i n i t   f u n c t i o n s
1117  *
1118  * pmap_bootstrap and pmap_init are called during system startup
1119  * to init the pmap module.   pmap_bootstrap() does a low level
1120  * init just to get things rolling.   pmap_init() finishes the job.
1121  */
1122 
1123 /*
1124  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1125  * This function is to be used before any VM system has been set up.
1126  *
1127  * The va is taken from virtual_avail.
1128  */
1129 static vaddr_t
1130 pmap_bootstrap_valloc(size_t npages)
1131 {
1132 	vaddr_t va = virtual_avail;
1133 	virtual_avail += npages * PAGE_SIZE;
1134 	return va;
1135 }
1136 
1137 /*
1138  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1139  * This function is to be used before any VM system has been set up.
1140  *
1141  * The pa is taken from avail_start.
1142  */
1143 static paddr_t
1144 pmap_bootstrap_palloc(size_t npages)
1145 {
1146 	paddr_t pa = avail_start;
1147 	avail_start += npages * PAGE_SIZE;
1148 	return pa;
1149 }
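/*
 * the two bootstrap allocators are normally used as a pair: a VA from
 * pmap_bootstrap_valloc() is backed by a PA from pmap_bootstrap_palloc()
 * and wired up by MD code, as done below for the LAPIC and the descriptor
 * tables.  a sketch of that pattern:
 *
 *	vaddr_t va = pmap_bootstrap_valloc(1);
 *	paddr_t pa = pmap_bootstrap_palloc(1);
 */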
1150 
1151 /*
1152  * pmap_bootstrap: get the system in a state where it can run with VM properly
1153  * enabled (called before main()). The VM system is fully init'd later.
1154  *
1155  * => on i386, locore.S has already enabled the MMU by allocating a PDP and
1156  *    nkpde PTPs for the kernel.
1157  * => kva_start is the first free virtual address in kernel space.
1158  */
1159 void
1160 pmap_bootstrap(vaddr_t kva_start)
1161 {
1162 	struct pmap *kpm;
1163 	int i;
1164 	vaddr_t kva;
1165 
1166 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1167 
1168 	/*
1169 	 * Set up our local static global vars that keep track of the usage of
1170 	 * KVM before kernel_map is set up.
1171 	 */
1172 	virtual_avail = kva_start;		/* first free KVA */
1173 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1174 
1175 	/*
1176 	 * Set up protection_codes: we need to be able to convert from a MI
1177 	 * protection code (some combo of VM_PROT...) to something we can jam
1178 	 * into a x86 PTE.
1179 	 */
1180 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1181 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1182 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1183 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1184 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1185 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1186 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1187 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
1188 
1189 	/*
1190 	 * Now we init the kernel's pmap.
1191 	 *
1192 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1193 	 * the pm_obj contains the list of active PTPs.
1194 	 *
1195 	 * The pm_obj currently does not have a pager. It might be possible to
1196 	 * add a pager that would allow a process to read-only mmap its own page
1197 	 * tables (fast user-level vtophys?). This may or may not be useful.
1198 	 */
1199 	kpm = pmap_kernel();
1200 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1201 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1202 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1203 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1204 		kpm->pm_ptphint[i] = NULL;
1205 	}
1206 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1207 
1208 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1209 	for (i = 0; i < PDP_SIZE; i++)
1210 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1211 
1212 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1213 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1214 
1215 	kcpuset_create(&kpm->pm_cpus, true);
1216 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1217 
1218 	kpm->pm_ldt = NULL;
1219 	kpm->pm_ldt_len = 0;
1220 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1221 
1222 	/*
1223 	 * the above is just a rough estimate and not critical to the proper
1224 	 * operation of the system.
1225 	 */
1226 
1227 #if !defined(XEN)
1228 	/*
1229 	 * Begin to enable global TLB entries if they are supported.
1230 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1231 	 * which happens in cpu_init(), which is run on each cpu
1232 	 * (and happens later)
1233 	 */
1234 	if (cpu_feature[0] & CPUID_PGE) {
1235 		pmap_pg_g = PG_G;		/* enable software */
1236 
1237 		/* add PG_G attribute to already mapped kernel pages */
1238 		pmap_remap_global();
1239 	}
1240 #endif
1241 
1242 #ifndef XEN
1243 	/*
1244 	 * Enable large pages if they are supported.
1245 	 */
1246 	if (cpu_feature[0] & CPUID_PSE) {
1247 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1248 		pmap_largepages = 1;	/* enable software */
1249 
1250 		/*
1251 		 * The TLB must be flushed after enabling large pages on Pentium
1252 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1253 		 * Software Developer's Manual, Volume 3: System Programming".
1254 		 */
1255 		tlbflushg();
1256 
1257 		/* Remap the kernel. */
1258 		pmap_remap_largepages();
1259 	}
1260 	pmap_init_lapic();
1261 #endif /* !XEN */
1262 
1263 #ifdef __HAVE_PCPU_AREA
1264 	pmap_init_pcpu();
1265 #endif
1266 
1267 #ifdef __HAVE_DIRECT_MAP
1268 	pmap_init_directmap(kpm);
1269 #else
1270 	pmap_vpage_cpualloc(&cpu_info_primary);
1271 
1272 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1273 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1274 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1275 	} else { /* amd64 */
1276 		/*
1277 		 * zero_pte is stuck at the end of mapped space for the kernel
1278 		 * image (disjunct from kva space). This is done so that it
1279 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1280 		 * when it's called for the first time.
1281 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1282 		 */
1283 #ifdef XEN
1284 		/* early_zerop initialized in xen_locore() */
1285 #else
1286 		early_zerop = (void *)bootspace.spareva;
1287 #endif
1288 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1289 	}
1290 #endif
1291 
1292 #if defined(XEN) && defined(__x86_64__)
1293 	extern vaddr_t xen_dummy_page;
1294 	paddr_t xen_dummy_user_pgd;
1295 
1296 	/*
1297 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1298 	 * Xen will still consider it active. So we set the user PGD to this
1299 	 * one, to lift all protection on the now-inactive page table set.
1300 	 */
1301 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1302 
1303 	/* Zero fill it; the fewer checks Xen has to make, the better */
1304 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1305 	/* Mark read-only */
1306 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1307 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx,
1308 	    UVMF_INVLPG);
1309 	/* Pin as L4 */
1310 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1311 #endif
1312 
1313 	/*
1314 	 * Allocate space for the IDT, GDT and LDT.
1315 	 */
1316 #ifdef __HAVE_PCPU_AREA
1317 	idt_vaddr = (vaddr_t)&pcpuarea->idt;
1318 #else
1319 	idt_vaddr = pmap_bootstrap_valloc(1);
1320 #endif
1321 	idt_paddr = pmap_bootstrap_palloc(1);
1322 
1323 	gdt_vaddr = pmap_bootstrap_valloc(1);
1324 	gdt_paddr = pmap_bootstrap_palloc(1);
1325 
1326 #ifdef __HAVE_PCPU_AREA
1327 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1328 #else
1329 	ldt_vaddr = pmap_bootstrap_valloc(1);
1330 #endif
1331 	ldt_paddr = pmap_bootstrap_palloc(1);
1332 
1333 #if !defined(__x86_64__) && !defined(XEN)
1334 	/* pentium f00f bug stuff */
1335 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1336 #endif
1337 
1338 	/*
1339 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1340 	 */
1341 	virtual_avail = reserve_dumppages(virtual_avail);
1342 
1343 	/*
1344 	 * Init the static-global locks and global lists.
1345 	 *
1346 	 * => pventry::pvh_lock (initialized elsewhere) must be a spin
1347 	 *      lock at IPL_VM to prevent deadlock, and is never taken
1348 	 *	from interrupt context.
1349 	 */
1350 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1351 	LIST_INIT(&pmaps);
1352 
1353 	/*
1354 	 * Ensure the TLB is sync'd with reality by flushing it...
1355 	 */
1356 	tlbflushg();
1357 
1358 	/*
1359 	 * Calculate pmap_maxkvaddr from nkptp[].
1360 	 */
1361 	kva = VM_MIN_KERNEL_ADDRESS;
1362 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1363 		kva += nkptp[i] * nbpd[i];
1364 	}
1365 	pmap_maxkvaddr = kva;
1366 }
1367 
1368 #ifndef XEN
1369 static void
1370 pmap_init_lapic(void)
1371 {
1372 	/*
1373 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1374 	 * x86 implementation relies a lot on this address being valid; so just
1375 	 * allocate a fake physical page that will be kentered into
1376 	 * local_apic_va by machdep.
1377 	 *
1378 	 * If the LAPIC is present, the va will be remapped somewhere else
1379 	 * later in lapic_map.
1380 	 */
1381 	local_apic_va = pmap_bootstrap_valloc(1);
1382 	local_apic_pa = pmap_bootstrap_palloc(1);
1383 }
1384 #endif
1385 
1386 #if defined(__HAVE_PCPU_AREA) || defined(__HAVE_DIRECT_MAP)
1387 static size_t
1388 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1389 {
1390 	size_t npages;
1391 	npages = (roundup(endva, pgsz) / pgsz) -
1392 	    (rounddown(startva, pgsz) / pgsz);
1393 	return npages;
1394 }
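/*
 * a worked example for pmap_pagetree_nentries_range() (illustrative
 * numbers): startva = 0x1800, endva = 0x2800 and pgsz = 0x1000 gives
 * roundup(0x2800, 0x1000) / 0x1000 = 3 minus
 * rounddown(0x1800, 0x1000) / 0x1000 = 1, i.e. 2 entries, one for each
 * page-size slot the range touches.
 */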
1395 #endif
1396 
1397 #ifdef __HAVE_PCPU_AREA
1398 static void
1399 pmap_init_pcpu(void)
1400 {
1401 	const vaddr_t startva = PMAP_PCPU_BASE;
1402 	size_t nL4e, nL3e, nL2e, nL1e;
1403 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1404 	paddr_t pa;
1405 	vaddr_t endva;
1406 	vaddr_t tmpva;
1407 	pt_entry_t *pte;
1408 	size_t size;
1409 	int i;
1410 
1411 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1412 
1413 	size = sizeof(struct pcpu_area);
1414 
1415 	endva = startva + size;
1416 
1417 	/* We will use this temporary va. */
1418 	tmpva = bootspace.spareva;
1419 	pte = PTE_BASE + pl1_i(tmpva);
1420 
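	/*
	 * Note on the loops below: each page-table page is obtained with
	 * pmap_bootstrap_palloc(), temporarily mapped at tmpva through the
	 * PTE set up above so that it can be zeroed, and then linked into
	 * the paging tree; the temporary mapping is torn down at the end.
	 */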
1421 	/* Build L4 */
1422 	L4e_idx = pl4_i(startva);
1423 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1424 	KASSERT(nL4e == 1);
1425 	for (i = 0; i < nL4e; i++) {
1426 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1427 
1428 		pa = pmap_bootstrap_palloc(1);
1429 		*pte = (pa & PG_FRAME) | pteflags;
1430 		pmap_update_pg(tmpva);
1431 		memset((void *)tmpva, 0, PAGE_SIZE);
1432 
1433 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
1434 	}
1435 
1436 	/* Build L3 */
1437 	L3e_idx = pl3_i(startva);
1438 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1439 	for (i = 0; i < nL3e; i++) {
1440 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1441 
1442 		pa = pmap_bootstrap_palloc(1);
1443 		*pte = (pa & PG_FRAME) | pteflags;
1444 		pmap_update_pg(tmpva);
1445 		memset((void *)tmpva, 0, PAGE_SIZE);
1446 
1447 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
1448 	}
1449 
1450 	/* Build L2 */
1451 	L2e_idx = pl2_i(startva);
1452 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1453 	for (i = 0; i < nL2e; i++) {
1454 
1455 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1456 
1457 		pa = pmap_bootstrap_palloc(1);
1458 		*pte = (pa & PG_FRAME) | pteflags;
1459 		pmap_update_pg(tmpva);
1460 		memset((void *)tmpva, 0, PAGE_SIZE);
1461 
1462 		L2_BASE[L2e_idx+i] = pa | pteflags | PG_U;
1463 	}
1464 
1465 	/* Build L1 */
1466 	L1e_idx = pl1_i(startva);
1467 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1468 	for (i = 0; i < nL1e; i++) {
1469 		/*
1470 		 * Nothing to do, the PTEs will be entered via
1471 		 * pmap_kenter_pa.
1472 		 */
1473 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1474 	}
1475 
1476 	*pte = 0;
1477 	pmap_update_pg(tmpva);
1478 
1479 	pcpuarea = (struct pcpu_area *)startva;
1480 
1481 	tlbflush();
1482 }
1483 #endif
1484 
1485 #ifdef __HAVE_DIRECT_MAP
1486 /*
1487  * Create the amd64 direct map. Called only once at boot time. We map all of
1488  * the physical memory contiguously using 2MB large pages, with RW permissions.
1489  * However there is a hole: the kernel is mapped with RO permissions.
1490  */
1491 static void
1492 pmap_init_directmap(struct pmap *kpm)
1493 {
1494 	extern phys_ram_seg_t mem_clusters[];
1495 	extern int mem_cluster_cnt;
1496 
1497 	const vaddr_t startva = PMAP_DIRECT_DEFAULT_BASE;
1498 	size_t nL4e, nL3e, nL2e;
1499 	size_t L4e_idx, L3e_idx, L2e_idx;
1500 	size_t spahole, epahole;
1501 	paddr_t lastpa, pa;
1502 	vaddr_t endva;
1503 	vaddr_t tmpva;
1504 	pt_entry_t *pte;
1505 	phys_ram_seg_t *mc;
1506 	int i;
1507 
1508 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1509 	const pd_entry_t holepteflags = PG_V | pmap_pg_nx;
1510 
1511 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1512 
1513 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1514 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1515 
1516 	/* Get the last physical address available */
1517 	lastpa = 0;
1518 	for (i = 0; i < mem_cluster_cnt; i++) {
1519 		mc = &mem_clusters[i];
1520 		lastpa = MAX(lastpa, mc->start + mc->size);
1521 	}
1522 
1523 	/*
1524 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1525 	 */
1526 	if (lastpa > MAXPHYSMEM) {
1527 		panic("pmap_init_directmap: lastpa incorrect");
1528 	}
1529 	endva = startva + lastpa;
1530 
1531 	/* We will use this temporary va. */
1532 	tmpva = bootspace.spareva;
1533 	pte = PTE_BASE + pl1_i(tmpva);
1534 
1535 	/* Build L4 */
1536 	L4e_idx = pl4_i(startva);
1537 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1538 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1539 	for (i = 0; i < nL4e; i++) {
1540 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1541 
1542 		pa = pmap_bootstrap_palloc(1);
1543 		*pte = (pa & PG_FRAME) | pteflags;
1544 		pmap_update_pg(tmpva);
1545 		memset((void *)tmpva, 0, PAGE_SIZE);
1546 
1547 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
1548 	}
1549 
1550 	/* Build L3 */
1551 	L3e_idx = pl3_i(startva);
1552 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1553 	for (i = 0; i < nL3e; i++) {
1554 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1555 
1556 		pa = pmap_bootstrap_palloc(1);
1557 		*pte = (pa & PG_FRAME) | pteflags;
1558 		pmap_update_pg(tmpva);
1559 		memset((void *)tmpva, 0, PAGE_SIZE);
1560 
1561 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
1562 	}
1563 
1564 	/* Build L2 */
1565 	L2e_idx = pl2_i(startva);
1566 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1567 	for (i = 0; i < nL2e; i++) {
1568 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1569 
1570 		pa = (paddr_t)(i * NBPD_L2);
1571 
1572 		if (spahole <= pa && pa < epahole) {
1573 			L2_BASE[L2e_idx+i] = pa | holepteflags | PG_U |
1574 			    PG_PS | pmap_pg_g;
1575 		} else {
1576 			L2_BASE[L2e_idx+i] = pa | pteflags | PG_U |
1577 			    PG_PS | pmap_pg_g;
1578 		}
1579 	}
1580 
1581 	*pte = 0;
1582 	pmap_update_pg(tmpva);
1583 
1584 	pmap_direct_base = startva;
1585 	pmap_direct_end = endva;
1586 	pmap_direct_pdpe = L4e_idx;
1587 	pmap_direct_npdp = nL4e;
1588 
1589 	tlbflush();
1590 }
1591 #endif /* __HAVE_DIRECT_MAP */
1592 
1593 #if !defined(XEN)
1594 /*
1595  * Remap all of the virtual pages created so far with the PG_G bit.
1596  */
1597 static void
1598 pmap_remap_global(void)
1599 {
1600 	vaddr_t kva, kva_end;
1601 	unsigned long p1i;
1602 	size_t i;
1603 
1604 	/* head */
1605 	kva = bootspace.head.va;
1606 	kva_end = kva + bootspace.head.sz;
1607 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1608 		p1i = pl1_i(kva);
1609 		if (pmap_valid_entry(PTE_BASE[p1i]))
1610 			PTE_BASE[p1i] |= pmap_pg_g;
1611 	}
1612 
1613 	/* kernel segments */
1614 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1615 		if (bootspace.segs[i].type == BTSEG_NONE) {
1616 			continue;
1617 		}
1618 		kva = bootspace.segs[i].va;
1619 		kva_end = kva + bootspace.segs[i].sz;
1620 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1621 			p1i = pl1_i(kva);
1622 			if (pmap_valid_entry(PTE_BASE[p1i]))
1623 				PTE_BASE[p1i] |= pmap_pg_g;
1624 		}
1625 	}
1626 
1627 	/* boot space */
1628 	kva = bootspace.boot.va;
1629 	kva_end = kva + bootspace.boot.sz;
1630 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1631 		p1i = pl1_i(kva);
1632 		if (pmap_valid_entry(PTE_BASE[p1i]))
1633 			PTE_BASE[p1i] |= pmap_pg_g;
1634 	}
1635 }
1636 #endif
1637 
1638 #ifndef XEN
1639 /*
1640  * Remap several kernel segments with large pages. We cover as many pages as we
1641  * can. Called only once at boot time, if the CPU supports large pages.
1642  */
1643 static void
1644 pmap_remap_largepages(void)
1645 {
1646 	pd_entry_t *pde;
1647 	vaddr_t kva, kva_end;
1648 	paddr_t pa;
1649 	size_t i;
1650 
1651 	/* Remap the kernel text using large pages. */
1652 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1653 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1654 			continue;
1655 		}
1656 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1657 		if (kva < bootspace.segs[i].va) {
1658 			continue;
1659 		}
1660 		kva_end = rounddown(bootspace.segs[i].va +
1661 			bootspace.segs[i].sz, NBPD_L2);
1662 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1663 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1664 			pde = &L2_BASE[pl2_i(kva)];
1665 			*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1666 			tlbflushg();
1667 		}
1668 	}
1669 
1670 	/* Remap the kernel rodata using large pages. */
1671 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1672 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1673 			continue;
1674 		}
1675 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1676 		if (kva < bootspace.segs[i].va) {
1677 			continue;
1678 		}
1679 		kva_end = rounddown(bootspace.segs[i].va +
1680 			bootspace.segs[i].sz, NBPD_L2);
1681 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1682 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1683 			pde = &L2_BASE[pl2_i(kva)];
1684 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1685 			tlbflushg();
1686 		}
1687 	}
1688 
1689 	/* Remap the kernel data+bss using large pages. */
1690 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1691 		if (bootspace.segs[i].type != BTSEG_DATA) {
1692 			continue;
1693 		}
1694 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1695 		if (kva < bootspace.segs[i].va) {
1696 			continue;
1697 		}
1698 		kva_end = rounddown(bootspace.segs[i].va +
1699 			bootspace.segs[i].sz, NBPD_L2);
1700 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1701 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1702 			pde = &L2_BASE[pl2_i(kva)];
1703 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1704 			tlbflushg();
1705 		}
1706 	}
1707 }
1708 #endif /* !XEN */
1709 
1710 /*
1711  * pmap_init: called from uvm_init, our job is to get the pmap
1712  * system ready to manage mappings...
1713  */
1714 
1715 void
1716 pmap_init(void)
1717 {
1718 	int i, flags;
1719 
1720 	for (i = 0; i < PV_HASH_SIZE; i++) {
1721 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1722 	}
1723 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1724 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1725 	}
1726 
1727 	/*
1728 	 * initialize caches.
1729 	 */
1730 
1731 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1732 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1733 
1734 #ifdef XEN
1735 	/*
1736 	 * pool_cache(9) should not touch cached objects, since they
1737 	 * are pinned on xen and R/O for the domU
1738 	 */
1739 	flags = PR_NOTOUCH;
1740 #else /* XEN */
1741 	flags = 0;
1742 #endif /* XEN */
1743 #ifdef PAE
1744 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1745 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1746 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1747 #else /* PAE */
1748 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1749 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1750 #endif /* PAE */
1751 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1752 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1753 	    NULL, NULL);
1754 
1755 	pmap_tlb_init();
1756 
1757 	/* XXX: done here since cpu_hatch() only runs on secondary CPUs. */
1758 	pmap_tlb_cpu_init(curcpu());
1759 
1760 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1761 	    NULL, "x86", "io bitmap copy");
1762 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1763 	    NULL, "x86", "ldt sync");
1764 
1765 	/*
1766 	 * done: pmap module is up (and ready for business)
1767 	 */
1768 
1769 	pmap_initialized = true;
1770 }
1771 
1772 /*
1773  * pmap_cpu_init_late: perform late per-CPU initialization.
1774  */
1775 
1776 #ifndef XEN
1777 void
1778 pmap_cpu_init_late(struct cpu_info *ci)
1779 {
1780 	/*
1781 	 * The BP already has its own PD page, allocated during early
1782 	 * MD startup.
1783 	 */
1784 	if (ci == &cpu_info_primary)
1785 		return;
1786 
1787 #ifdef PAE
1788 	cpu_alloc_l3_page(ci);
1789 #endif
1790 }
1791 #endif
1792 
1793 #ifndef __HAVE_DIRECT_MAP
1794 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1795 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1796 
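/*
 * pmap_vpage_cpualloc: allocate this CPU's window of temporary VAs
 * (VPAGE_MAX of them).  The window is aligned to its own size so that
 * all of its PTEs fall within a single cache line, which the CTASSERTs
 * above guarantee is possible.  The boot CPU takes its VAs from the
 * bootstrap allocator, the others from kernel_map.
 */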
1797 static void
1798 pmap_vpage_cpualloc(struct cpu_info *ci)
1799 {
1800 	bool primary = (ci == &cpu_info_primary);
1801 	size_t i, npages;
1802 	vaddr_t vabase;
1803 	vsize_t vrange;
1804 
1805 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1806 	KASSERT(npages >= VPAGE_MAX);
1807 	vrange = npages * PAGE_SIZE;
1808 
1809 	if (primary) {
1810 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1811 			/* Waste some pages to align properly */
1812 		}
1813 		/* The base is aligned, allocate the rest (contiguous) */
1814 		pmap_bootstrap_valloc(npages - 1);
1815 	} else {
1816 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1817 		    UVM_KMF_VAONLY);
1818 		if (vabase == 0) {
1819 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1820 			    __func__, cpu_index(ci));
1821 		}
1822 	}
1823 
1824 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1825 
1826 	for (i = 0; i < VPAGE_MAX; i++) {
1827 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1828 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1829 	}
1830 }
1831 
1832 void
1833 pmap_vpage_cpu_init(struct cpu_info *ci)
1834 {
1835 	if (ci == &cpu_info_primary) {
1836 		/* cpu0 already taken care of in pmap_bootstrap */
1837 		return;
1838 	}
1839 
1840 	pmap_vpage_cpualloc(ci);
1841 }
1842 #endif
1843 
1844 /*
1845  * p v _ e n t r y   f u n c t i o n s
1846  */
1847 
1848 static bool
1849 pmap_pp_needs_pve(struct pmap_page *pp)
1850 {
1851 
1852 	/*
1853 	 * Adding a pv entry for this page only needs to allocate a pv_entry
1854 	 * structure if the page already has at least one pv entry,
1855 	 * since the first pv entry is stored in the pmap_page.
1856 	 */
1857 
1858 	return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 ||
1859 	    !LIST_EMPTY(&pp->pp_head.pvh_list));
1860 }
1861 
1862 /*
1863  * pmap_free_pvs: free a list of pv_entry structures
1864  */
1865 
1866 static void
1867 pmap_free_pvs(struct pv_entry *pve)
1868 {
1869 	struct pv_entry *next;
1870 
1871 	for ( /* null */ ; pve != NULL ; pve = next) {
1872 		next = pve->pve_next;
1873 		pool_cache_put(&pmap_pv_cache, pve);
1874 	}
1875 }
1876 
1877 /*
1878  * main pv_entry manipulation functions:
1879  *   pmap_enter_pv: enter a mapping onto a pv_head list
1880  *   pmap_remove_pv: remove a mapping from a pv_head list
1881  *
1882  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1883  *       the pvh before calling
1884  */
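/*
 * The first mapping of a page is stored directly in its pmap_page
 * (PP_EMBEDDED) and needs no pv_entry.  Each additional mapping gets a
 * pv_entry, linked both on the page's pvh_list and into the global pv
 * hash so that it can be found again by (ptp, va).
 */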
1885 
1886 /*
1887  * insert_pv: a helper of pmap_enter_pv
1888  */
1889 
1890 static void
1891 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1892 {
1893 	struct pv_hash_head *hh;
1894 	kmutex_t *lock;
1895 	u_int hash;
1896 
1897 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1898 	lock = pvhash_lock(hash);
1899 	hh = pvhash_head(hash);
1900 	mutex_spin_enter(lock);
1901 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1902 	mutex_spin_exit(lock);
1903 
1904 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1905 }
1906 
1907 /*
1908  * pmap_enter_pv: enter a mapping onto a pv_head list
1909  *
1910  * => caller should adjust ptp's wire_count before calling
1911  * => caller has preallocated pve and *sparepve for us
1912  */
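/*
 * pmap_enter_pv returns the caller's pve if it was not needed (the
 * mapping could be embedded in the pmap_page), or NULL if pve was
 * consumed.  *sparepve is consumed when an embedded entry must first be
 * converted into a list entry.
 */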
1913 
1914 static struct pv_entry *
1915 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1916     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1917 {
1918 
1919 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1920 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1921 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1922 
1923 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1924 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1925 			pp->pp_flags |= PP_EMBEDDED;
1926 			pp->pp_pte.pte_ptp = ptp;
1927 			pp->pp_pte.pte_va = va;
1928 
1929 			return pve;
1930 		}
1931 	} else {
1932 		struct pv_entry *pve2;
1933 
1934 		pve2 = *sparepve;
1935 		*sparepve = NULL;
1936 
1937 		pve2->pve_pte = pp->pp_pte;
1938 		pp->pp_flags &= ~PP_EMBEDDED;
1939 		LIST_INIT(&pp->pp_head.pvh_list);
1940 		insert_pv(pp, pve2);
1941 	}
1942 
1943 	pve->pve_pte.pte_ptp = ptp;
1944 	pve->pve_pte.pte_va = va;
1945 	insert_pv(pp, pve);
1946 
1947 	return NULL;
1948 }
1949 
1950 /*
1951  * pmap_remove_pv: try to remove a mapping from a pv_list
1952  *
1953  * => caller should adjust ptp's wire_count and free PTP if needed
1954  * => we return the removed pve
1955  */
1956 
1957 static struct pv_entry *
1958 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
1959 {
1960 	struct pv_hash_head *hh;
1961 	struct pv_entry *pve;
1962 	kmutex_t *lock;
1963 	u_int hash;
1964 
1965 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1966 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1967 
1968 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
1969 		KASSERT(pp->pp_pte.pte_ptp == ptp);
1970 		KASSERT(pp->pp_pte.pte_va == va);
1971 
1972 		pp->pp_flags &= ~PP_EMBEDDED;
1973 		LIST_INIT(&pp->pp_head.pvh_list);
1974 
1975 		return NULL;
1976 	}
1977 
1978 	hash = pvhash_hash(ptp, va);
1979 	lock = pvhash_lock(hash);
1980 	hh = pvhash_head(hash);
1981 	mutex_spin_enter(lock);
1982 	pve = pvhash_remove(hh, ptp, va);
1983 	mutex_spin_exit(lock);
1984 
1985 	LIST_REMOVE(pve, pve_list);
1986 
1987 	return pve;
1988 }
1989 
1990 /*
1991  * p t p   f u n c t i o n s
1992  */
1993 
1994 static inline struct vm_page *
1995 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
1996 {
1997 	int lidx = level - 1;
1998 	struct vm_page *pg;
1999 
2000 	KASSERT(mutex_owned(pmap->pm_lock));
2001 
2002 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
2003 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
2004 		return (pmap->pm_ptphint[lidx]);
2005 	}
2006 	PMAP_SUBOBJ_LOCK(pmap, lidx);
2007 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
2008 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
2009 
2010 	KASSERT(pg == NULL || pg->wire_count >= 1);
2011 	return pg;
2012 }
2013 
2014 static inline void
2015 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2016 {
2017 	lwp_t *l;
2018 	int lidx;
2019 	struct uvm_object *obj;
2020 
2021 	KASSERT(ptp->wire_count == 1);
2022 
2023 	lidx = level - 1;
2024 
2025 	obj = &pmap->pm_obj[lidx];
2026 	pmap_stats_update(pmap, -1, 0);
2027 	if (lidx != 0)
2028 		mutex_enter(obj->vmobjlock);
2029 	if (pmap->pm_ptphint[lidx] == ptp)
2030 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
2031 	ptp->wire_count = 0;
2032 	uvm_pagerealloc(ptp, NULL, 0);
2033 	l = curlwp;
2034 	KASSERT((l->l_pflag & LP_INTR) == 0);
2035 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
2036 	l->l_md.md_gc_ptp = ptp;
2037 	if (lidx != 0)
2038 		mutex_exit(obj->vmobjlock);
2039 }
2040 
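/*
 * pmap_free_ptp: free the level 1 PTP backing 'va', then walk up and
 * free any parent PTPs that become empty.  At each level the PDE is
 * zapped, the recursive-mapping address of the freed page table is shot
 * down, and on Xen-amd64 or SVS the top-level directory is re-synced.
 */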
2041 static void
2042 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2043 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2044 {
2045 	unsigned long index;
2046 	int level;
2047 	vaddr_t invaladdr;
2048 	pd_entry_t opde;
2049 
2050 	KASSERT(pmap != pmap_kernel());
2051 	KASSERT(mutex_owned(pmap->pm_lock));
2052 	KASSERT(kpreempt_disabled());
2053 
2054 	level = 1;
2055 	do {
2056 		index = pl_i(va, level + 1);
2057 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2058 
2059 		/*
2060 		 * On Xen-amd64 or SVS, we need to sync the top level page
2061 		 * directory on each CPU.
2062 		 */
2063 #if defined(XEN) && defined(__x86_64__)
2064 		if (level == PTP_LEVELS - 1) {
2065 			xen_kpm_sync(pmap, index);
2066 		}
2067 #elif defined(SVS)
2068 		if (svs_enabled && level == PTP_LEVELS - 1) {
2069 			svs_pmap_sync(pmap, index);
2070 		}
2071 #endif
2072 
2073 		invaladdr = level == 1 ? (vaddr_t)ptes :
2074 		    (vaddr_t)pdes[level - 2];
2075 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2076 		    opde, TLBSHOOT_FREE_PTP1);
2077 
2078 #if defined(XEN)
2079 		pmap_tlb_shootnow();
2080 #endif
2081 
2082 		pmap_freepage(pmap, ptp, level);
2083 		if (level < PTP_LEVELS - 1) {
2084 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
2085 			ptp->wire_count--;
2086 			if (ptp->wire_count > 1)
2087 				break;
2088 		}
2089 	} while (++level < PTP_LEVELS);
2090 	pmap_pte_flush();
2091 }
2092 
2093 /*
2094  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2095  *
2096  * => pmap should NOT be pmap_kernel()
2097  * => pmap should be locked
2098  * => preemption should be disabled
2099  */
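/*
 * Two passes: the first looks up or allocates a page for every level
 * that needs one, so a mid-way allocation failure can be undone cleanly
 * (see 'fail' below); the second installs the new PDEs top-down and
 * bumps the parent PTP's wire_count for each new child.
 */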
2100 
2101 static struct vm_page *
2102 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags)
2103 {
2104 	struct vm_page *ptp;
2105 	struct {
2106 		struct vm_page *pg;
2107 		bool new;
2108 	} pt[PTP_LEVELS + 1];
2109 	int i, aflags;
2110 	unsigned long index;
2111 	pd_entry_t *pva;
2112 	paddr_t pa;
2113 	struct uvm_object *obj;
2114 	voff_t off;
2115 
2116 	KASSERT(pmap != pmap_kernel());
2117 	KASSERT(mutex_owned(pmap->pm_lock));
2118 	KASSERT(kpreempt_disabled());
2119 
2120 	/*
2121 	 * Loop through all page table levels allocating a page
2122 	 * for any level where we don't already have one.
2123 	 */
2124 	memset(pt, 0, sizeof(pt));
2125 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2126 		UVM_PGA_ZERO;
2127 	for (i = PTP_LEVELS; i > 1; i--) {
2128 		obj = &pmap->pm_obj[i - 2];
2129 		off = ptp_va2o(va, i - 1);
2130 
2131 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2132 		pt[i].pg = uvm_pagelookup(obj, off);
2133 		if (pt[i].pg == NULL) {
2134 			pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags);
2135 			pt[i].new = true;
2136 		}
2137 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2138 
2139 		if (pt[i].pg == NULL)
2140 			goto fail;
2141 	}
2142 
2143 	/*
2144 	 * Now that we have all the pages looked up or allocated,
2145 	 * loop through again installing any new ones into the tree.
2146 	 */
2147 	for (i = PTP_LEVELS; i > 1; i--) {
2148 		index = pl_i(va, i);
2149 		pva = pdes[i - 2];
2150 
2151 		if (pmap_valid_entry(pva[index])) {
2152 			KASSERT(!pt[i].new);
2153 			continue;
2154 		}
2155 
2156 		ptp = pt[i].pg;
2157 		ptp->flags &= ~PG_BUSY; /* never busy */
2158 		ptp->wire_count = 1;
2159 		pmap->pm_ptphint[i - 2] = ptp;
2160 		pa = VM_PAGE_TO_PHYS(ptp);
2161 		pmap_pte_set(&pva[index], (pd_entry_t)
2162 		    (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2163 
2164 		/*
2165 		 * On Xen-amd64 or SVS, we need to sync the top level page
2166 		 * directory on each CPU.
2167 		 */
2168 #if defined(XEN) && defined(__x86_64__)
2169 		if (i == PTP_LEVELS) {
2170 			xen_kpm_sync(pmap, index);
2171 		}
2172 #elif defined(SVS)
2173 		if (svs_enabled && i == PTP_LEVELS) {
2174 			svs_pmap_sync(pmap, index);
2175 		}
2176 #endif
2177 
2178 		pmap_pte_flush();
2179 		pmap_stats_update(pmap, 1, 0);
2180 
2181 		/*
2182 		 * If we're not in the top level, increase the
2183 		 * wire count of the parent page.
2184 		 */
2185 		if (i < PTP_LEVELS) {
2186 			pt[i + 1].pg->wire_count++;
2187 		}
2188 	}
2189 	ptp = pt[2].pg;
2190 	KASSERT(ptp != NULL);
2191 	pmap->pm_ptphint[0] = ptp;
2192 	return ptp;
2193 
2194 	/*
2195 	 * Allocation of a ptp failed, free any others that we just allocated.
2196 	 */
2197 fail:
2198 	for (i = PTP_LEVELS; i > 1; i--) {
2199 		if (pt[i].pg == NULL) {
2200 			break;
2201 		}
2202 		if (!pt[i].new) {
2203 			continue;
2204 		}
2205 		obj = &pmap->pm_obj[i - 2];
2206 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2207 		uvm_pagefree(pt[i].pg);
2208 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2209 	}
2210 	return NULL;
2211 }
2212 
2213 /*
2214  * p m a p   l i f e c y c l e   f u n c t i o n s
2215  */
2216 
2217 /*
2218  * pmap_pdp_ctor: constructor for the PDP cache.
2219  */
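/*
 * Except on Xen-amd64, the user area is zeroed, recursive PDEs are
 * installed at PDIR_SLOT_PTE, and the kernel's top-level PDEs (plus the
 * per-CPU and direct-map slots when configured) are copied in.  On
 * Xen-amd64 only a dummy, non-valid entry is placed in the last kernel
 * slot.  On Xen, the PDP pages are then made read-only and pinned.
 */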
2220 static int
2221 pmap_pdp_ctor(void *arg, void *v, int flags)
2222 {
2223 	pd_entry_t *pdir = v;
2224 	paddr_t pdirpa = 0;
2225 	vaddr_t object;
2226 	int i;
2227 
2228 #if !defined(XEN) || !defined(__x86_64__)
2229 	int npde;
2230 #endif
2231 #ifdef XEN
2232 	int s;
2233 #endif
2234 
2235 	/*
2236 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2237 	 */
2238 
2239 #if defined(XEN) && defined(__x86_64__)
2240 	/* Fetch the physical address of the page directory */
2241 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2242 
2243 	/* Zero the area */
2244 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2245 
2246 	/*
2247 	 * This pdir will NEVER be active in kernel mode, so mark
2248 	 * recursive entry invalid.
2249 	 */
2250 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2251 
2252 	/*
2253 	 * PDP constructed this way won't be for the kernel, hence we
2254 	 * don't put kernel mappings on Xen.
2255 	 *
2256 	 * But we need to make pmap_create() happy, so put a dummy
2257 	 * (without PG_V) value at the right place.
2258 	 */
2259 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2260 	     (pd_entry_t)-1 & PG_FRAME;
2261 #else /* XEN && __x86_64__*/
2262 	/* Zero the area */
2263 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2264 
2265 	object = (vaddr_t)v;
2266 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2267 		/* Fetch the physical address of the page directory */
2268 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2269 
2270 		/* Put in recursive PDE to map the PTEs */
2271 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2272 		    pmap_pg_nx;
2273 #ifndef XEN
2274 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2275 #endif
2276 	}
2277 
2278 	/* Copy the kernel's top level PDE */
2279 	npde = nkptp[PTP_LEVELS - 1];
2280 
2281 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2282 	    npde * sizeof(pd_entry_t));
2283 
2284 	/* Zero the rest */
2285 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2286 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2287 
2288 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2289 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2290 		pdir[idx] = PDP_BASE[idx];
2291 	}
2292 
2293 #ifdef __HAVE_PCPU_AREA
2294 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2295 #endif
2296 #ifdef __HAVE_DIRECT_MAP
2297 	memcpy(&pdir[pmap_direct_pdpe], &PDP_BASE[pmap_direct_pdpe],
2298 	    pmap_direct_npdp * sizeof(pd_entry_t));
2299 #endif
2300 #endif /* XEN  && __x86_64__*/
2301 
2302 #ifdef XEN
2303 	s = splvm();
2304 	object = (vaddr_t)v;
2305 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2306 	    VM_PROT_READ);
2307 	pmap_update(pmap_kernel());
2308 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2309 		/*
2310 		 * pin as L2/L4 page, we have to do the page with the
2311 		 * PDIR_SLOT_PTE entries last
2312 		 */
2313 #ifdef PAE
2314 		if (i == l2tol3(PDIR_SLOT_PTE))
2315 			continue;
2316 #endif
2317 
2318 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2319 #ifdef __x86_64__
2320 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2321 #else
2322 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2323 #endif
2324 	}
2325 #ifdef PAE
2326 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2327 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2328 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2329 #endif
2330 	splx(s);
2331 #endif /* XEN */
2332 
2333 	return (0);
2334 }
2335 
2336 /*
2337  * pmap_pdp_dtor: destructor for the PDP cache.
2338  */
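/*
 * On Xen this unpins every page of the PDP and makes it writable again;
 * elsewhere there is nothing to undo.
 */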
2339 
2340 static void
2341 pmap_pdp_dtor(void *arg, void *v)
2342 {
2343 #ifdef XEN
2344 	paddr_t pdirpa = 0;	/* XXX: GCC */
2345 	vaddr_t object = (vaddr_t)v;
2346 	int i;
2347 	int s = splvm();
2348 	pt_entry_t *pte;
2349 
2350 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2351 		/* fetch the physical address of the page directory. */
2352 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2353 		/* unpin page table */
2354 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2355 	}
2356 	object = (vaddr_t)v;
2357 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2358 		/* Set page RW again */
2359 		pte = kvtopte(object);
2360 		pmap_pte_set(pte, *pte | PG_RW);
2361 		xen_bcast_invlpg((vaddr_t)object);
2362 	}
2363 	splx(s);
2364 #endif  /* XEN */
2365 }
2366 
2367 #ifdef PAE
2368 
2369 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2370 
2371 static void *
2372 pmap_pdp_alloc(struct pool *pp, int flags)
2373 {
2374 	return (void *)uvm_km_alloc(kernel_map,
2375 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2376 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2377 	    | UVM_KMF_WIRED);
2378 }
2379 
2380 /*
2381  * pmap_pdp_free: free a PDP
2382  */
2383 
2384 static void
2385 pmap_pdp_free(struct pool *pp, void *v)
2386 {
2387 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2388 	    UVM_KMF_WIRED);
2389 }
2390 #endif /* PAE */
2391 
2392 /*
2393  * pmap_create: create a pmap object.
2394  */
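/*
 * The PDP is taken from pmap_pdp_cache.  A cached PDP may have been
 * constructed before the kernel page tables grew, which is detected by
 * an empty last kernel slot; in that case the object is destructed and
 * a fresh one is fetched (see try_again below).
 */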
2395 struct pmap *
2396 pmap_create(void)
2397 {
2398 	struct pmap *pmap;
2399 	int i;
2400 
2401 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2402 
2403 	/* init uvm_object */
2404 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2405 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2406 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2407 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2408 		pmap->pm_ptphint[i] = NULL;
2409 	}
2410 	pmap->pm_stats.wired_count = 0;
2411 	/* count the PDP allocd below */
2412 	pmap->pm_stats.resident_count = PDP_SIZE;
2413 #if !defined(__x86_64__)
2414 	pmap->pm_hiexec = 0;
2415 #endif /* !defined(__x86_64__) */
2416 	pmap->pm_flags = 0;
2417 	pmap->pm_gc_ptp = NULL;
2418 
2419 	kcpuset_create(&pmap->pm_cpus, true);
2420 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2421 #ifdef XEN
2422 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2423 #endif
2424 	/* init the LDT */
2425 	pmap->pm_ldt = NULL;
2426 	pmap->pm_ldt_len = 0;
2427 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2428 
2429 	/* allocate PDP */
2430  try_again:
2431 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2432 
2433 	mutex_enter(&pmaps_lock);
2434 
2435 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2436 		mutex_exit(&pmaps_lock);
2437 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2438 		goto try_again;
2439 	}
2440 
2441 	for (i = 0; i < PDP_SIZE; i++)
2442 		pmap->pm_pdirpa[i] =
2443 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2444 
2445 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2446 
2447 	mutex_exit(&pmaps_lock);
2448 
2449 	return (pmap);
2450 }
2451 
2452 /*
2453  * pmap_free_ptps: put a list of ptps back to the freelist.
2454  */
2455 
2456 void
2457 pmap_free_ptps(struct vm_page *empty_ptps)
2458 {
2459 	struct vm_page *ptp;
2460 	struct pmap_page *pp;
2461 
2462 	while ((ptp = empty_ptps) != NULL) {
2463 		pp = VM_PAGE_TO_PP(ptp);
2464 		empty_ptps = pp->pp_link;
2465 		LIST_INIT(&pp->pp_head.pvh_list);
2466 		uvm_pagefree(ptp);
2467 	}
2468 }
2469 
2470 /*
2471  * pmap_check_ptps: verify that none of the pmap's page table objects
2472  * have any pages allocated to them.
2473  */
2474 
2475 static inline void
2476 pmap_check_ptps(struct pmap *pmap)
2477 {
2478 	int i;
2479 
2480 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2481 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2482 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2483 	}
2484 }
2485 
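/*
 * pmap_check_inuse: DIAGNOSTIC-only check that the pmap being destroyed
 * is not loaded on any CPU.
 */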
2486 static inline void
2487 pmap_check_inuse(struct pmap *pmap)
2488 {
2489 #ifdef DIAGNOSTIC
2490 	CPU_INFO_ITERATOR cii;
2491 	struct cpu_info *ci;
2492 
2493 	for (CPU_INFO_FOREACH(cii, ci)) {
2494 		if (ci->ci_pmap == pmap)
2495 			panic("destroying pmap being used");
2496 #if defined(XEN) && defined(__x86_64__)
2497 		for (int i = 0; i < PDIR_SLOT_PTE; i++) {
2498 			if (pmap->pm_pdir[i] != 0 &&
2499 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2500 				printf("pmap_destroy(%p) pmap_kernel %p "
2501 				    "curcpu %d cpu %d ci_pmap %p "
2502 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2503 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2504 				    pmap, pmap_kernel(), curcpu()->ci_index,
2505 				    ci->ci_index, ci->ci_pmap,
2506 				    i, ci->ci_kpm_pdir[i],
2507 				    i, pmap->pm_pdir[i]);
2508 				panic("%s: used pmap", __func__);
2509 			}
2510 		}
2511 #endif
2512 	}
2513 #endif /* DIAGNOSTIC */
2514 }
2515 
2516 /*
2517  * pmap_destroy: drop reference count on pmap.   free pmap if
2518  *	reference count goes to zero.
2519  */
2520 
2521 void
2522 pmap_destroy(struct pmap *pmap)
2523 {
2524 	lwp_t *l;
2525 	int i;
2526 
2527 	/*
2528 	 * If we have torn down this pmap, process deferred frees and
2529 	 * invalidations.  Free now if the system is low on memory.
2530 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2531 	 * TLB shootdown.
2532 	 */
2533 	l = curlwp;
2534 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2535 		pmap_check_ptps(pmap);
2536 		if (uvmexp.free < uvmexp.freetarg) {
2537 			pmap_update(pmap);
2538 		} else {
2539 			KASSERT(pmap->pm_gc_ptp == NULL);
2540 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2541 			l->l_md.md_gc_ptp = NULL;
2542 			l->l_md.md_gc_pmap = NULL;
2543 		}
2544 	}
2545 
2546 	/*
2547 	 * drop reference count
2548 	 */
2549 
2550 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2551 		return;
2552 	}
2553 
2554 	pmap_check_inuse(pmap);
2555 
2556 	/*
2557 	 * Reference count is zero, free pmap resources and then free pmap.
2558 	 * First, remove it from global list of pmaps.
2559 	 */
2560 
2561 	mutex_enter(&pmaps_lock);
2562 	LIST_REMOVE(pmap, pm_list);
2563 	mutex_exit(&pmaps_lock);
2564 
2565 	/*
2566 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2567 	 * PTP pages are no longer visible to any CPU.
2568 	 */
2569 
2570 	pmap_free_ptps(pmap->pm_gc_ptp);
2571 
2572 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2573 
2574 #ifdef USER_LDT
2575 	if (pmap->pm_ldt != NULL) {
2576 		/*
2577 		 * no need to switch the LDT; this address space is gone,
2578 		 * nothing is using it.
2579 		 *
2580 		 * No need to lock the pmap for ldt_free (or anything else),
2581 		 * we're the last one to use it.
2582 		 */
2583 		mutex_enter(&cpu_lock);
2584 		ldt_free(pmap->pm_ldt_sel);
2585 		mutex_exit(&cpu_lock);
2586 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2587 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2588 	}
2589 #endif
2590 
2591 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2592 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2593 		mutex_destroy(&pmap->pm_obj_lock[i]);
2594 	}
2595 	kcpuset_destroy(pmap->pm_cpus);
2596 	kcpuset_destroy(pmap->pm_kernel_cpus);
2597 #ifdef XEN
2598 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2599 #endif
2600 
2601 	pmap_check_ptps(pmap);
2602 	pool_cache_put(&pmap_cache, pmap);
2603 }
2604 
2605 /*
2606  * pmap_remove_all: pmap is being torn down by the current thread.
2607  * avoid unnecessary invalidations.
2608  */
2609 
2610 void
2611 pmap_remove_all(struct pmap *pmap)
2612 {
2613 	lwp_t *l = curlwp;
2614 
2615 	KASSERT(l->l_md.md_gc_pmap == NULL);
2616 
2617 	l->l_md.md_gc_pmap = pmap;
2618 }
2619 
2620 #if defined(PMAP_FORK)
2621 /*
2622  * pmap_fork: perform any necessary data structure manipulation when
2623  * a VM space is forked.
2624  */
2625 
2626 void
2627 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2628 {
2629 #ifdef USER_LDT
2630 	union descriptor *new_ldt;
2631 	size_t len;
2632 	int sel;
2633 
2634 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2635 		return;
2636 	}
2637 
2638 	/*
2639 	 * Copy the LDT into the new process.
2640 	 *
2641 	 * Read pmap1's ldt pointer and length unlocked; if it changes
2642 	 * behind our back we'll retry. This will starve if there's a
2643 	 * stream of LDT changes in another thread but that should not
2644 	 * happen.
2645 	 */
2646 
2647  retry:
2648 	if (pmap1->pm_ldt != NULL) {
2649 		len = pmap1->pm_ldt_len;
2650 		/* Allocate space for the new process's LDT */
2651 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2652 		    UVM_KMF_WIRED);
2653 		if (new_ldt == NULL) {
2654 			printf("WARNING: %s: unable to allocate LDT space\n",
2655 			    __func__);
2656 			return;
2657 		}
2658 		mutex_enter(&cpu_lock);
2659 		/* Get a GDT slot for it */
2660 		sel = ldt_alloc(new_ldt, len);
2661 		if (sel == -1) {
2662 			mutex_exit(&cpu_lock);
2663 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2664 			    UVM_KMF_WIRED);
2665 			printf("WARNING: %s: unable to allocate LDT selector\n",
2666 			    __func__);
2667 			return;
2668 		}
2669 	} else {
2670 		/* Wasn't anything there after all. */
2671 		len = -1;
2672 		new_ldt = NULL;
2673 		sel = -1;
2674 		mutex_enter(&cpu_lock);
2675 	}
2676 
2677 	/* If there's still something there now that we have cpu_lock... */
2678 	if (pmap1->pm_ldt != NULL) {
2679 		if (len != pmap1->pm_ldt_len) {
2680 			/* Oops, it changed. Drop what we did and try again */
2681 			if (len != -1) {
2682 				ldt_free(sel);
2683 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2684 				    len, UVM_KMF_WIRED);
2685 			}
2686 			mutex_exit(&cpu_lock);
2687 			goto retry;
2688 		}
2689 
2690 		/* Copy the LDT data and install it in pmap2 */
2691 		memcpy(new_ldt, pmap1->pm_ldt, len);
2692 		pmap2->pm_ldt = new_ldt;
2693 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2694 		pmap2->pm_ldt_sel = sel;
2695 		len = -1;
2696 	}
2697 
2698 	if (len != -1) {
2699 		/* There still wasn't anything there, so mop up. */
2700 		ldt_free(sel);
2701 		mutex_exit(&cpu_lock);
2702 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2703 		    UVM_KMF_WIRED);
2704 	} else {
2705 		mutex_exit(&cpu_lock);
2706 	}
2707 #endif /* USER_LDT */
2708 }
2709 #endif /* PMAP_FORK */
2710 
2711 #ifdef USER_LDT
2712 
2713 /*
2714  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2715  * is active, reload LDTR.
2716  */
2717 static void
2718 pmap_ldt_xcall(void *arg1, void *arg2)
2719 {
2720 	struct pmap *pm;
2721 
2722 	kpreempt_disable();
2723 	pm = arg1;
2724 	if (curcpu()->ci_pmap == pm) {
2725 		lldt(pm->pm_ldt_sel);
2726 	}
2727 	kpreempt_enable();
2728 }
2729 
2730 /*
2731  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2732  * in the new selector on all CPUs.
2733  */
2734 void
2735 pmap_ldt_sync(struct pmap *pm)
2736 {
2737 	uint64_t where;
2738 
2739 	KASSERT(mutex_owned(&cpu_lock));
2740 
2741 	pmap_ldt_evcnt.ev_count++;
2742 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2743 	xc_wait(where);
2744 }
2745 
2746 /*
2747  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2748  * restore the default.
2749  */
2750 
2751 void
2752 pmap_ldt_cleanup(struct lwp *l)
2753 {
2754 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2755 	union descriptor *dp = NULL;
2756 	size_t len = 0;
2757 	int sel = -1;
2758 
2759 	if (__predict_true(pmap->pm_ldt == NULL)) {
2760 		return;
2761 	}
2762 
2763 	mutex_enter(&cpu_lock);
2764 	if (pmap->pm_ldt != NULL) {
2765 		sel = pmap->pm_ldt_sel;
2766 		dp = pmap->pm_ldt;
2767 		len = pmap->pm_ldt_len;
2768 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2769 		pmap->pm_ldt = NULL;
2770 		pmap->pm_ldt_len = 0;
2771 		pmap_ldt_sync(pmap);
2772 		ldt_free(sel);
2773 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2774 	}
2775 	mutex_exit(&cpu_lock);
2776 }
2777 #endif /* USER_LDT */
2778 
2779 /*
2780  * pmap_activate: activate a process' pmap
2781  *
2782  * => must be called with kernel preemption disabled
2783  * => if lwp is the curlwp, then set ci_want_pmapload so that
2784  *    actual MMU context switch will be done by pmap_load() later
2785  */
2786 
2787 void
2788 pmap_activate(struct lwp *l)
2789 {
2790 	struct cpu_info *ci;
2791 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2792 
2793 	KASSERT(kpreempt_disabled());
2794 
2795 	ci = curcpu();
2796 
2797 	if (l != ci->ci_curlwp)
2798 		return;
2799 
2800 	KASSERT(ci->ci_want_pmapload == 0);
2801 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2802 
2803 	/*
2804 	 * no need to switch to kernel vmspace because
2805 	 * it's a subset of any vmspace.
2806 	 */
2807 
2808 	if (pmap == pmap_kernel()) {
2809 		ci->ci_want_pmapload = 0;
2810 		return;
2811 	}
2812 
2813 	ci->ci_want_pmapload = 1;
2814 }
2815 
2816 #if defined(XEN) && defined(__x86_64__)
2817 #define	KASSERT_PDIRPA(pmap) \
2818 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
2819 	    pmap == pmap_kernel())
2820 #elif defined(PAE)
2821 #define	KASSERT_PDIRPA(pmap) \
2822 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
2823 #elif !defined(XEN)
2824 #define	KASSERT_PDIRPA(pmap) \
2825 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
2826 #else
2827 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
2828 #endif
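/*
 * KASSERT_PDIRPA(pmap) asserts that the pmap's page directory is the
 * one currently loaded: checked against the user pgd on Xen-amd64, the
 * first PAE L3 slot on PAE, and %cr3 on native non-PAE.  On other Xen
 * configurations it is a no-op.
 */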
2829 
2830 /*
2831  * pmap_reactivate: try to regain reference to the pmap.
2832  *
2833  * => Must be called with kernel preemption disabled.
2834  */
2835 
2836 static void
2837 pmap_reactivate(struct pmap *pmap)
2838 {
2839 	struct cpu_info * const ci = curcpu();
2840 	const cpuid_t cid = cpu_index(ci);
2841 
2842 	KASSERT(kpreempt_disabled());
2843 	KASSERT_PDIRPA(pmap);
2844 
2845 	/*
2846 	 * If we still have a lazy reference to this pmap, we can assume
2847 	 * that there was no TLB shootdown for this pmap in the meantime.
2848 	 *
2849 	 * The order of events here is important as we must synchronize
2850 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2851 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2852 	 * change only when the state is TLBSTATE_LAZY.
2853 	 */
2854 
2855 	ci->ci_tlbstate = TLBSTATE_VALID;
2856 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2857 
2858 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2859 		/* We have the reference, state is valid. */
2860 	} else {
2861 		/*
2862 		 * Must reload the TLB: the pmap has been changed while it
2863 		 * was deactivated.
2864 		 */
2865 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2866 
2867 		tlbflush();
2868 	}
2869 }
2870 
2871 /*
2872  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2873  * and relevant LDT info.
2874  *
2875  * Ensures that the current process' pmap is loaded on the current CPU's
2876  * MMU and that there are no stale TLB entries.
2877  *
2878  * => The caller should disable kernel preemption or do check-and-retry
2879  *    to prevent a preemption from undoing our efforts.
2880  * => This function may block.
2881  */
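/*
 * l_ncsw is sampled on entry; pmap_destroy() of the old pmap may block,
 * and if the LWP was context-switched in the meantime the whole
 * sequence is retried from the top.
 */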
2882 void
2883 pmap_load(void)
2884 {
2885 	struct cpu_info *ci;
2886 	struct pmap *pmap, *oldpmap;
2887 	struct lwp *l;
2888 	struct pcb *pcb;
2889 	cpuid_t cid;
2890 	uint64_t ncsw;
2891 
2892 	kpreempt_disable();
2893  retry:
2894 	ci = curcpu();
2895 	if (!ci->ci_want_pmapload) {
2896 		kpreempt_enable();
2897 		return;
2898 	}
2899 	l = ci->ci_curlwp;
2900 	ncsw = l->l_ncsw;
2901 
2902 	/* should be able to take ipis. */
2903 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2904 #ifdef XEN
2905 	/* Check that interrupts are enabled (i.e. no events are masked) */
2906 	KASSERT(x86_read_psl() == 0);
2907 #else
2908 	KASSERT((x86_read_psl() & PSL_I) != 0);
2909 #endif
2910 
2911 	KASSERT(l != NULL);
2912 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2913 	KASSERT(pmap != pmap_kernel());
2914 	oldpmap = ci->ci_pmap;
2915 	pcb = lwp_getpcb(l);
2916 
2917 	if (pmap == oldpmap) {
2918 		pmap_reactivate(pmap);
2919 		ci->ci_want_pmapload = 0;
2920 		kpreempt_enable();
2921 		return;
2922 	}
2923 
2924 	/*
2925 	 * Acquire a reference to the new pmap and perform the switch.
2926 	 */
2927 
2928 	pmap_reference(pmap);
2929 
2930 	cid = cpu_index(ci);
2931 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2932 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2933 
2934 	KASSERT_PDIRPA(oldpmap);
2935 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2936 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
2937 
2938 	/*
2939 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
2940 	 * with TLB shootdown interrupts, so set the state VALID first,
2941 	 * then register us for shootdown events on this pmap.
2942 	 */
2943 	ci->ci_tlbstate = TLBSTATE_VALID;
2944 	kcpuset_atomic_set(pmap->pm_cpus, cid);
2945 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
2946 	ci->ci_pmap = pmap;
2947 
2948 	/*
2949 	 * update tss.  now that we have registered for invalidations
2950 	 * from other CPUs, we're good to load the page tables.
2951 	 */
2952 #ifdef PAE
2953 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
2954 #else
2955 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
2956 #endif
2957 
2958 #ifdef i386
2959 #ifndef XEN
2960 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
2961 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
2962 #endif /* !XEN */
2963 #endif /* i386 */
2964 
2965 	lldt(pmap->pm_ldt_sel);
2966 
2967 	cpu_load_pmap(pmap, oldpmap);
2968 
2969 	ci->ci_want_pmapload = 0;
2970 
2971 	/*
2972 	 * we're now running with the new pmap.  drop the reference
2973 	 * to the old pmap.  if we block, we need to go around again.
2974 	 */
2975 
2976 	pmap_destroy(oldpmap);
2977 	if (l->l_ncsw != ncsw) {
2978 		goto retry;
2979 	}
2980 
2981 	kpreempt_enable();
2982 }
2983 
2984 /*
2985  * pmap_deactivate: deactivate a process' pmap.
2986  *
2987  * => Must be called with kernel preemption disabled (high IPL is enough).
2988  */
2989 void
2990 pmap_deactivate(struct lwp *l)
2991 {
2992 	struct pmap *pmap;
2993 	struct cpu_info *ci;
2994 
2995 	KASSERT(kpreempt_disabled());
2996 
2997 	if (l != curlwp) {
2998 		return;
2999 	}
3000 
3001 	/*
3002 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3003 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3004 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3005 	 * to kernel preemption or blocking routine in between.
3006 	 */
3007 	pmap_tlb_shootnow();
3008 
3009 	ci = curcpu();
3010 
3011 	if (ci->ci_want_pmapload) {
3012 		/*
3013 		 * ci_want_pmapload means that our pmap is not loaded on
3014 		 * the CPU or TLB might be stale.  note that pmap_kernel()
3015 		 * is always considered loaded.
3016 		 */
3017 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3018 		    != pmap_kernel());
3019 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3020 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3021 
3022 		/*
3023 		 * userspace has not been touched.
3024 		 * nothing to do here.
3025 		 */
3026 
3027 		ci->ci_want_pmapload = 0;
3028 		return;
3029 	}
3030 
3031 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3032 
3033 	if (pmap == pmap_kernel()) {
3034 		return;
3035 	}
3036 
3037 	KASSERT_PDIRPA(pmap);
3038 	KASSERT(ci->ci_pmap == pmap);
3039 
3040 	/*
3041 	 * we aren't interested in TLB invalidations for this pmap,
3042 	 * at least for the time being.
3043 	 */
3044 
3045 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3046 	ci->ci_tlbstate = TLBSTATE_LAZY;
3047 }
3048 
3049 /*
3050  * end of lifecycle functions
3051  */
3052 
3053 /*
3054  * some misc. functions
3055  */
3056 
3057 int
3058 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
3059 {
3060 	int i;
3061 	unsigned long index;
3062 	pd_entry_t pde;
3063 
3064 	for (i = PTP_LEVELS; i > 1; i--) {
3065 		index = pl_i(va, i);
3066 		pde = pdes[i - 2][index];
3067 		if ((pde & PG_V) == 0)
3068 			return i;
3069 	}
3070 	if (lastpde != NULL)
3071 		*lastpde = pde;
3072 	return 0;
3073 }
3074 
3075 /*
3076  * pmap_extract: extract a PA for the given VA
3077  */
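/*
 * Direct-mapped addresses are translated arithmetically.  Otherwise, if
 * the pmap is the kernel pmap or the one currently loaded, the
 * recursive PTE window is used in place; for any other pmap the page
 * tables are temporarily mapped with pmap_map_ptes().  Large (PG_PS)
 * mappings are resolved at the PDE level.  See vtophys() below for a
 * minimal caller.
 */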
3078 
3079 bool
3080 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3081 {
3082 	pt_entry_t *ptes, pte;
3083 	pd_entry_t pde;
3084 	pd_entry_t * const *pdes;
3085 	struct pmap *pmap2;
3086 	struct cpu_info *ci;
3087 	paddr_t pa;
3088 	lwp_t *l;
3089 	bool hard, rv;
3090 
3091 #ifdef __HAVE_DIRECT_MAP
3092 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3093 		if (pap != NULL) {
3094 			*pap = PMAP_DIRECT_UNMAP(va);
3095 		}
3096 		return true;
3097 	}
3098 #endif
3099 
3100 	rv = false;
3101 	pa = 0;
3102 	l = curlwp;
3103 
3104 	kpreempt_disable();
3105 	ci = l->l_cpu;
3106 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
3107 	    pmap == pmap_kernel()) {
3108 		/*
3109 		 * no need to lock, because it's pmap_kernel() or our
3110 		 * own pmap and is active.  if a user pmap, the caller
3111 		 * will hold the vm_map write/read locked and so prevent
3112 		 * entries from disappearing while we are here.  ptps
3113 		 * can disappear via pmap_remove() and pmap_protect(),
3114 		 * but they are called with the vm_map write locked.
3115 		 */
3116 		hard = false;
3117 		ptes = PTE_BASE;
3118 		pdes = normal_pdes;
3119 	} else {
3120 		/* we lose, do it the hard way. */
3121 		hard = true;
3122 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3123 	}
3124 	if (pmap_pdes_valid(va, pdes, &pde)) {
3125 		pte = ptes[pl1_i(va)];
3126 		if (pde & PG_PS) {
3127 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
3128 			rv = true;
3129 		} else if (__predict_true((pte & PG_V) != 0)) {
3130 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3131 			rv = true;
3132 		}
3133 	}
3134 	if (__predict_false(hard)) {
3135 		pmap_unmap_ptes(pmap, pmap2);
3136 	}
3137 	kpreempt_enable();
3138 	if (pap != NULL) {
3139 		*pap = pa;
3140 	}
3141 	return rv;
3142 }
3143 
3144 
3145 /*
3146  * vtophys: virtual address to physical address.  For use by
3147  * machine-dependent code only.
3148  */
3149 
3150 paddr_t
3151 vtophys(vaddr_t va)
3152 {
3153 	paddr_t pa;
3154 
3155 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3156 		return (pa);
3157 	return (0);
3158 }
3159 
3160 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3161 
3162 #ifdef XEN
3163 
3164 /*
3165  * vtomach: virtual address to machine address.  For use by
3166  * machine-dependent code only.
3167  */
3168 
3169 paddr_t
3170 vtomach(vaddr_t va)
3171 {
3172 	paddr_t pa;
3173 
3174 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3175 		return (pa);
3176 	return (0);
3177 }
3178 
3179 #endif /* XEN */
3180 
3181 /*
3182  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3183  *	determine the bounds of the kernel virtual address space.
3184  */
3185 
3186 void
3187 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3188 {
3189 	*startp = virtual_avail;
3190 	*endp = virtual_end;
3191 }
3192 
3193 /*
3194  * pmap_zero_page: zero a page
3195  */
3196 
3197 void
3198 pmap_zero_page(paddr_t pa)
3199 {
3200 #if defined(__HAVE_DIRECT_MAP)
3201 	pagezero(PMAP_DIRECT_MAP(pa));
3202 #else
3203 #if defined(XEN)
3204 	if (XEN_VERSION_SUPPORTED(3, 4))
3205 		xen_pagezero(pa);
3206 #endif
3207 	struct cpu_info *ci;
3208 	pt_entry_t *zpte;
3209 	vaddr_t zerova;
3210 
3211 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3212 
3213 	kpreempt_disable();
3214 
3215 	ci = curcpu();
3216 	zerova = ci->vpage[VPAGE_ZER];
3217 	zpte = ci->vpage_pte[VPAGE_ZER];
3218 
3219 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3220 
3221 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3222 	pmap_pte_flush();
3223 	pmap_update_pg(zerova);		/* flush TLB */
3224 
3225 	memset((void *)zerova, 0, PAGE_SIZE);
3226 
3227 #if defined(DIAGNOSTIC) || defined(XEN)
3228 	pmap_pte_set(zpte, 0);				/* zap ! */
3229 	pmap_pte_flush();
3230 #endif
3231 
3232 	kpreempt_enable();
3233 #endif /* defined(__HAVE_DIRECT_MAP) */
3234 }
3235 
3236 /*
3237  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3238  * Returns true if the page was zero'd, false if we aborted for
3239  * some reason.
3240  */
3241 
3242 bool
3243 pmap_pageidlezero(paddr_t pa)
3244 {
3245 #ifdef __HAVE_DIRECT_MAP
3246 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3247 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3248 #else
3249 	struct cpu_info *ci;
3250 	pt_entry_t *zpte;
3251 	vaddr_t zerova;
3252 	bool rv;
3253 
3254 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3255 
3256 	ci = curcpu();
3257 	zerova = ci->vpage[VPAGE_ZER];
3258 	zpte = ci->vpage_pte[VPAGE_ZER];
3259 
3260 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3261 	KASSERT(*zpte == 0);
3262 
3263 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3264 	pmap_pte_flush();
3265 	pmap_update_pg(zerova);		/* flush TLB */
3266 
3267 	rv = sse2_idlezero_page((void *)zerova);
3268 
3269 #if defined(DIAGNOSTIC) || defined(XEN)
3270 	pmap_pte_set(zpte, 0);				/* zap ! */
3271 	pmap_pte_flush();
3272 #endif
3273 
3274 	return rv;
3275 #endif
3276 }
3277 
3278 /*
3279  * pmap_copy_page: copy a page
3280  */
3281 
3282 void
3283 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3284 {
3285 #if defined(__HAVE_DIRECT_MAP)
3286 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3287 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3288 
3289 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3290 #else
3291 #if defined(XEN)
3292 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3293 		xen_copy_page(srcpa, dstpa);
3294 		return;
3295 	}
3296 #endif
3297 	struct cpu_info *ci;
3298 	pt_entry_t *srcpte, *dstpte;
3299 	vaddr_t srcva, dstva;
3300 
3301 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U;
3302 
3303 	kpreempt_disable();
3304 
3305 	ci = curcpu();
3306 	srcva = ci->vpage[VPAGE_SRC];
3307 	dstva = ci->vpage[VPAGE_DST];
3308 	srcpte = ci->vpage_pte[VPAGE_SRC];
3309 	dstpte = ci->vpage_pte[VPAGE_DST];
3310 
3311 	KASSERT(*srcpte == 0 && *dstpte == 0);
3312 
3313 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3314 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3315 	pmap_pte_flush();
3316 	pmap_update_pg(srcva);
3317 	pmap_update_pg(dstva);
3318 
3319 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3320 
3321 #if defined(DIAGNOSTIC) || defined(XEN)
3322 	pmap_pte_set(srcpte, 0);
3323 	pmap_pte_set(dstpte, 0);
3324 	pmap_pte_flush();
3325 #endif
3326 
3327 	kpreempt_enable();
3328 #endif /* defined(__HAVE_DIRECT_MAP) */
3329 }
3330 
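/*
 * pmap_map_ptp: make the given PTP addressable, either through the
 * direct map or by temporarily entering it into this CPU's VPAGE_PTP
 * slot.  Used by pmap_map_pte() when the target pmap is not the one
 * currently loaded.
 */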
3331 static pt_entry_t *
3332 pmap_map_ptp(struct vm_page *ptp)
3333 {
3334 #ifdef __HAVE_DIRECT_MAP
3335 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3336 #else
3337 	struct cpu_info *ci;
3338 	pt_entry_t *ptppte;
3339 	vaddr_t ptpva;
3340 
3341 	KASSERT(kpreempt_disabled());
3342 
3343 #ifndef XEN
3344 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M;
3345 #else
3346 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M;
3347 #endif
3348 
3349 	ci = curcpu();
3350 	ptpva = ci->vpage[VPAGE_PTP];
3351 	ptppte = ci->vpage_pte[VPAGE_PTP];
3352 
3353 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3354 
3355 	pmap_pte_flush();
3356 	pmap_update_pg(ptpva);
3357 
3358 	return (pt_entry_t *)ptpva;
3359 #endif
3360 }
3361 
3362 static void
3363 pmap_unmap_ptp(void)
3364 {
3365 #ifndef __HAVE_DIRECT_MAP
3366 #if defined(DIAGNOSTIC) || defined(XEN)
3367 	struct cpu_info *ci;
3368 	pt_entry_t *pte;
3369 
3370 	KASSERT(kpreempt_disabled());
3371 
3372 	ci = curcpu();
3373 	pte = ci->vpage_pte[VPAGE_PTP];
3374 
3375 	if (*pte != 0) {
3376 		pmap_pte_set(pte, 0);
3377 		pmap_pte_flush();
3378 	}
3379 #endif
3380 #endif
3381 }
3382 
3383 static pt_entry_t *
3384 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3385 {
3386 
3387 	KASSERT(kpreempt_disabled());
3388 	if (pmap_is_curpmap(pmap)) {
3389 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3390 	}
3391 	KASSERT(ptp != NULL);
3392 	return pmap_map_ptp(ptp) + pl1_pi(va);
3393 }
3394 
3395 static void
3396 pmap_unmap_pte(void)
3397 {
3398 
3399 	KASSERT(kpreempt_disabled());
3400 
3401 	pmap_unmap_ptp();
3402 }
3403 
3404 /*
3405  * p m a p   r e m o v e   f u n c t i o n s
3406  *
3407  * functions that remove mappings
3408  */
3409 
3410 /*
3411  * pmap_remove_ptes: remove PTEs from a PTP
3412  *
3413  * => caller must hold pmap's lock
3414  * => PTP must be mapped into KVA
3415  * => PTP should be null if pmap == pmap_kernel()
3416  * => must be called with kernel preemption disabled
3417  * => TLB shootdowns for removed mappings are issued by pmap_remove_pte()
3418  */
3419 
3420 static void
3421 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3422 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3423 {
3424 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3425 
3426 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3427 	KASSERT(kpreempt_disabled());
3428 
3429 	/*
3430 	 * note that ptpva points to the PTE that maps startva.   this may
3431 	 * or may not be the first PTE in the PTP.
3432 	 *
3433 	 * we loop through the PTP while there are still PTEs to look at
3434 	 * and the wire_count is greater than 1 (because we use the wire_count
3435 	 * to keep track of the number of real PTEs in the PTP).
3436 	 */
3437 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3438 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3439 		startva += PAGE_SIZE;
3440 		pte++;
3441 	}
3442 }
3443 
3444 
3445 /*
3446  * pmap_remove_pte: remove a single PTE from a PTP.
3447  *
3448  * => caller must hold pmap's lock
3449  * => PTP must be mapped into KVA
3450  * => PTP should be null if pmap == pmap_kernel()
3451  * => returns true if we removed a mapping
3452  * => must be called with kernel preemption disabled
3453  */
3454 static bool
3455 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3456 		vaddr_t va, struct pv_entry **pv_tofree)
3457 {
3458 	struct pv_entry *pve;
3459 	struct vm_page *pg;
3460 	struct pmap_page *pp;
3461 	pt_entry_t opte;
3462 
3463 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3464 	KASSERT(kpreempt_disabled());
3465 
3466 	if (!pmap_valid_entry(*pte)) {
3467 		/* VA not mapped. */
3468 		return false;
3469 	}
3470 
3471 	/* Atomically save the old PTE and zap it. */
3472 	opte = pmap_pte_testset(pte, 0);
3473 	if (!pmap_valid_entry(opte)) {
3474 		return false;
3475 	}
3476 
3477 	pmap_exec_account(pmap, va, opte, 0);
3478 	pmap_stats_update_bypte(pmap, 0, opte);
3479 
3480 	if (ptp) {
3481 		/*
3482 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3483 		 */
3484 		ptp->wire_count--;
3485 		if (ptp->wire_count <= 1) {
3486 			opte |= PG_U;
3487 		}
3488 	}
3489 
3490 	if ((opte & PG_U) != 0) {
3491 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3492 	}
3493 
3494 	/*
3495 	 * If we are not on a pv_head list, we are done.
3496 	 */
3497 	if ((opte & PG_PVLIST) == 0) {
3498 #ifndef DOM0OPS
3499 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3500 		    "managed page without PG_PVLIST for %#"PRIxVADDR, va);
3501 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3502 		    "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va);
3503 #endif
3504 		return true;
3505 	}
3506 
3507 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3508 		KASSERT(uvm_page_locked_p(pg));
3509 		pp = VM_PAGE_TO_PP(pg);
3510 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3511 		paddr_t pa = pmap_pte2pa(opte);
3512 		panic("%s: PG_PVLIST with pv-untracked page"
3513 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
3514 		    __func__, va, pa, atop(pa));
3515 	}
3516 
3517 	/* Sync R/M bits. */
3518 	pp->pp_attrs |= opte;
3519 	pve = pmap_remove_pv(pp, ptp, va);
3520 
3521 	if (pve) {
3522 		pve->pve_next = *pv_tofree;
3523 		*pv_tofree = pve;
3524 	}
3525 	return true;
3526 }
3527 
3528 /*
3529  * pmap_remove: mapping removal function.
3530  *
3531  * => caller should not be holding any pmap locks
3532  */
3533 
3534 void
3535 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3536 {
3537 	pt_entry_t *ptes;
3538 	pd_entry_t pde;
3539 	pd_entry_t * const *pdes;
3540 	struct pv_entry *pv_tofree = NULL;
3541 	bool result;
3542 	int i;
3543 	paddr_t ptppa;
3544 	vaddr_t blkendva, va = sva;
3545 	struct vm_page *ptp;
3546 	struct pmap *pmap2;
3547 
3548 	kpreempt_disable();
3549 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3550 
3551 	/*
3552 	 * removing one page?  take shortcut function.
3553 	 */
3554 
3555 	if (va + PAGE_SIZE == eva) {
3556 		if (pmap_pdes_valid(va, pdes, &pde)) {
3557 
3558 			/* PA of the PTP */
3559 			ptppa = pmap_pte2pa(pde);
3560 
3561 			/* Get PTP if non-kernel mapping. */
3562 			if (pmap != pmap_kernel()) {
3563 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3564 				KASSERTMSG(ptp != NULL,
3565 				    "%s: unmanaged PTP detected", __func__);
3566 			} else {
3567 				/* Never free kernel PTPs. */
3568 				ptp = NULL;
3569 			}
3570 
3571 			result = pmap_remove_pte(pmap, ptp,
3572 			    &ptes[pl1_i(va)], va, &pv_tofree);
3573 
3574 			/*
3575 			 * if mapping removed and the PTP is no longer
3576 			 * being used, free it!
3577 			 */
3578 
3579 			if (result && ptp && ptp->wire_count <= 1)
3580 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3581 		}
3582 	} else for (/* null */ ; va < eva ; va = blkendva) {
3583 		int lvl;
3584 
3585 		/* determine range of block */
3586 		blkendva = x86_round_pdr(va+1);
3587 		if (blkendva > eva)
3588 			blkendva = eva;
3589 
3590 		/*
3591 		 * Our PTE mappings should never be removed with pmap_remove.
3592 		 *
3593 		 * XXXmaxv: still needed?
3594 		 *
3595 		 * A long term solution is to move the PTEs out of user address
3596 		 * space, and into kernel address space. Then we can set
3597 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3598 		 */
3599 		for (i = 0; i < PDP_SIZE; i++) {
3600 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3601 				panic("PTE space accessed");
3602 		}
3603 
3604 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3605 		if (lvl != 0) {
3606 			/*
3607 			 * skip a range corresponding to an invalid pde.
3608 			 */
3609 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3610 			continue;
3611 		}
3612 
3613 		/* PA of the PTP */
3614 		ptppa = pmap_pte2pa(pde);
3615 
3616 		/* Get PTP if non-kernel mapping. */
3617 		if (pmap != pmap_kernel()) {
3618 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3619 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
3620 			    __func__);
3621 		} else {
3622 			/* Never free kernel PTPs. */
3623 			ptp = NULL;
3624 		}
3625 
3626 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3627 		    blkendva, &pv_tofree);
3628 
3629 		/* if PTP is no longer being used, free it! */
3630 		if (ptp && ptp->wire_count <= 1) {
3631 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3632 		}
3633 	}
3634 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3635 	kpreempt_enable();
3636 
3637 	/* Now we free unused PVs */
3638 	if (pv_tofree)
3639 		pmap_free_pvs(pv_tofree);
3640 }
3641 
3642 /*
3643  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3644  *
3645  * => Caller should disable kernel preemption.
3646  * => issues tlb shootdowns if necessary.
3647  */
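/*
 * The update is done with a compare-and-swap loop: the PTE is re-read
 * and the new value recomputed until pmap_pte_cas() succeeds.  EAGAIN
 * is returned when the PTE no longer matches 'expect', i.e. a V->P
 * operation removed the mapping while we were looking at it.
 */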
3648 
3649 static int
3650 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3651     pt_entry_t *optep)
3652 {
3653 	struct pmap *pmap;
3654 	struct vm_page *ptp;
3655 	vaddr_t va;
3656 	pt_entry_t *ptep;
3657 	pt_entry_t opte;
3658 	pt_entry_t npte;
3659 	bool need_shootdown;
3660 
3661 	ptp = pvpte->pte_ptp;
3662 	va = pvpte->pte_va;
3663 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3664 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3665 	pmap = ptp_to_pmap(ptp);
3666 
3667 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3668 	KASSERT((expect & PG_V) != 0);
3669 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3670 	KASSERT(kpreempt_disabled());
3671 
3672 	ptep = pmap_map_pte(pmap, ptp, va);
3673 	do {
3674 		opte = *ptep;
3675 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3676 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3677 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3678 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3679 
3680 			/*
3681 			 * we lost a race with a V->P operation like
3682 			 * pmap_remove().  wait for the competitor to reflect
3683 			 * the pte bits into pp_attrs.
3684 			 *
3685 			 * issue a redundant TLB shootdown so that
3686 			 * we can wait for its completion.
3687 			 */
3688 
3689 			pmap_unmap_pte();
3690 			if (clearbits != 0) {
3691 				pmap_tlb_shootdown(pmap, va,
3692 				    (pmap == pmap_kernel() ? PG_G : 0),
3693 				    TLBSHOOT_SYNC_PV1);
3694 			}
3695 			return EAGAIN;
3696 		}
3697 
3698 		/*
3699 		 * check if there's anything to do on this pte.
3700 		 */
3701 
3702 		if ((opte & clearbits) == 0) {
3703 			need_shootdown = false;
3704 			break;
3705 		}
3706 
3707 		/*
3708 		 * we need a shootdown if the pte is cached. (PG_U)
3709 		 *
3710 		 * ...unless we are clearing only the PG_RW bit and
3711 		 * it isn't cached as RW. (PG_M)
3712 		 */
3713 
3714 		need_shootdown = (opte & PG_U) != 0 &&
3715 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3716 
3717 		npte = opte & ~clearbits;
3718 
3719 		/*
3720 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3721 		 */
3722 
3723 		if (need_shootdown) {
3724 			npte &= ~(PG_U | PG_M);
3725 		}
3726 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3727 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3728 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3729 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3730 
3731 	if (need_shootdown) {
3732 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3733 	}
3734 	pmap_unmap_pte();
3735 
3736 	*optep = opte;
3737 	return 0;
3738 }
3739 
3740 static void
3741 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3742 {
3743 	struct pv_pte *pvpte;
3744 	struct pv_entry *killlist = NULL;
3745 	struct vm_page *ptp;
3746 	pt_entry_t expect;
3747 	int count;
3748 
3749 	expect = pmap_pa2pte(pa) | PG_V;
3750 	count = SPINLOCK_BACKOFF_MIN;
3751 	kpreempt_disable();
3752 startover:
3753 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3754 		struct pmap *pmap;
3755 		struct pv_entry *pve;
3756 		pt_entry_t opte;
3757 		vaddr_t va;
3758 		int error;
3759 
3760 		/*
3761 		 * add a reference to the pmap before clearing the pte.
3762 		 * otherwise the pmap can disappear behind us.
3763 		 */
3764 
3765 		ptp = pvpte->pte_ptp;
3766 		pmap = ptp_to_pmap(ptp);
3767 		if (ptp != NULL) {
3768 			pmap_reference(pmap);
3769 		}
3770 
3771 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
3772 		if (error == EAGAIN) {
3773 			int hold_count;
3774 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3775 			if (ptp != NULL) {
3776 				pmap_destroy(pmap);
3777 			}
3778 			SPINLOCK_BACKOFF(count);
3779 			KERNEL_LOCK(hold_count, curlwp);
3780 			goto startover;
3781 		}
3782 
3783 		pp->pp_attrs |= opte;
3784 		va = pvpte->pte_va;
3785 		pve = pmap_remove_pv(pp, ptp, va);
3786 
3787 		/* update the PTP reference count.  free if last reference. */
3788 		if (ptp != NULL) {
3789 			struct pmap *pmap2;
3790 			pt_entry_t *ptes;
3791 			pd_entry_t * const *pdes;
3792 
3793 			KASSERT(pmap != pmap_kernel());
3794 
3795 			pmap_tlb_shootnow();
3796 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3797 			pmap_stats_update_bypte(pmap, 0, opte);
3798 			ptp->wire_count--;
3799 			if (ptp->wire_count <= 1) {
3800 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3801 			}
3802 			pmap_unmap_ptes(pmap, pmap2);
3803 			pmap_destroy(pmap);
3804 		} else {
3805 			KASSERT(pmap == pmap_kernel());
3806 			pmap_stats_update_bypte(pmap, 0, opte);
3807 		}
3808 
3809 		if (pve != NULL) {
3810 			pve->pve_next = killlist;	/* mark it for death */
3811 			killlist = pve;
3812 		}
3813 	}
3814 	pmap_tlb_shootnow();
3815 	kpreempt_enable();
3816 
3817 	/* Now free unused pvs. */
3818 	pmap_free_pvs(killlist);
3819 }
3820 
3821 /*
3822  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3823  *
3824  * => R/M bits are sync'd back to attrs
3825  */
3826 
3827 void
3828 pmap_page_remove(struct vm_page *pg)
3829 {
3830 	struct pmap_page *pp;
3831 	paddr_t pa;
3832 
3833 	KASSERT(uvm_page_locked_p(pg));
3834 
3835 	pp = VM_PAGE_TO_PP(pg);
3836 	pa = VM_PAGE_TO_PHYS(pg);
3837 	pmap_pp_remove(pp, pa);
3838 }
3839 
3840 /*
3841  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3842  *	that map it
3843  */
3844 
3845 void
3846 pmap_pv_remove(paddr_t pa)
3847 {
3848 	struct pmap_page *pp;
3849 
3850 	pp = pmap_pv_tracked(pa);
3851 	if (pp == NULL)
3852 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
3853 	pmap_pp_remove(pp, pa);
3854 }
3855 
3856 /*
3857  * p m a p   a t t r i b u t e  f u n c t i o n s
3858  * functions that test/change managed page's attributes
3859  * since a page can be mapped multiple times we must check each PTE that
3860  * maps it by going down the pv lists.
3861  */
3862 
3863 /*
3864  * pmap_test_attrs: test a page's attributes
3865  */
3866 
3867 bool
3868 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3869 {
3870 	struct pmap_page *pp;
3871 	struct pv_pte *pvpte;
3872 	pt_entry_t expect;
3873 	u_int result;
3874 
3875 	KASSERT(uvm_page_locked_p(pg));
3876 
3877 	pp = VM_PAGE_TO_PP(pg);
3878 	if ((pp->pp_attrs & testbits) != 0) {
3879 		return true;
3880 	}
3881 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3882 	kpreempt_disable();
3883 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3884 		pt_entry_t opte;
3885 		int error;
3886 
3887 		if ((pp->pp_attrs & testbits) != 0) {
3888 			break;
3889 		}
3890 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3891 		if (error == 0) {
3892 			pp->pp_attrs |= opte;
3893 		}
3894 	}
3895 	result = pp->pp_attrs & testbits;
3896 	kpreempt_enable();
3897 
3898 	/*
3899 	 * note that the loop above exits early once the bits being
3900 	 * tested for have been accumulated in pp_attrs.
3901 	 */
3902 
3903 	return result != 0;
3904 }
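
/*
 * Illustrative sketch (hypothetical caller): querying a managed page's
 * referenced/modified state with the page lock held, using the PG_U
 * and PG_M attribute bits handled above:
 *
 *	bool referenced = pmap_test_attrs(pg, PG_U);
 *	bool modified = pmap_test_attrs(pg, PG_M);
 */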
3905 
3906 static bool
3907 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3908 {
3909 	struct pv_pte *pvpte;
3910 	u_int result;
3911 	pt_entry_t expect;
3912 	int count;
3913 
3914 	expect = pmap_pa2pte(pa) | PG_V;
3915 	count = SPINLOCK_BACKOFF_MIN;
3916 	kpreempt_disable();
3917 startover:
3918 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3919 		pt_entry_t opte;
3920 		int error;
3921 
3922 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3923 		if (error == EAGAIN) {
3924 			int hold_count;
3925 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3926 			SPINLOCK_BACKOFF(count);
3927 			KERNEL_LOCK(hold_count, curlwp);
3928 			goto startover;
3929 		}
3930 		pp->pp_attrs |= opte;
3931 	}
3932 	result = pp->pp_attrs & clearbits;
3933 	pp->pp_attrs &= ~clearbits;
3934 	pmap_tlb_shootnow();
3935 	kpreempt_enable();
3936 
3937 	return result != 0;
3938 }
3939 
3940 /*
3941  * pmap_clear_attrs: clear the specified attribute for a page.
3942  *
3943  * => we return true if we cleared one of the bits we were asked to
3944  */
3945 
3946 bool
3947 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
3948 {
3949 	struct pmap_page *pp;
3950 	paddr_t pa;
3951 
3952 	KASSERT(uvm_page_locked_p(pg));
3953 
3954 	pp = VM_PAGE_TO_PP(pg);
3955 	pa = VM_PAGE_TO_PHYS(pg);
3956 
3957 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3958 }
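
/*
 * Illustrative sketch (hypothetical caller): the "clear and re-detect"
 * pattern used when aging pages, clearing the modified/referenced bits
 * so that future accesses are noticed again:
 *
 *	bool was_modified = pmap_clear_attrs(pg, PG_M);
 *	(void)pmap_clear_attrs(pg, PG_U);
 */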
3959 
3960 /*
3961  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
3962  *	pv-tracked page.
3963  */
3964 
3965 bool
3966 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
3967 {
3968 	struct pmap_page *pp;
3969 
3970 	pp = pmap_pv_tracked(pa);
3971 	if (pp == NULL)
3972 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
3973 
3974 	return pmap_pp_clear_attrs(pp, pa, clearbits);
3975 }
3976 
3977 /*
3978  * p m a p   p r o t e c t i o n   f u n c t i o n s
3979  */
3980 
3981 /*
3982  * pmap_page_protect: change the protection of all recorded mappings
3983  *	of a managed page
3984  *
3985  * => NOTE: this is an inline function in pmap.h
3986  */
3987 
3988 /* see pmap.h */
3989 
3990 /*
3991  * pmap_pv_protect: change the protection of all recorded mappings
3992  *	of an unmanaged pv-tracked page
3993  *
3994  * => NOTE: this is an inline function in pmap.h
3995  */
3996 
3997 /* see pmap.h */
3998 
3999 /*
4000  * pmap_protect: set the protection of the pages in a pmap
4001  *
4002  * => NOTE: this is an inline function in pmap.h
4003  */
4004 
4005 /* see pmap.h */
4006 
4007 /*
4008  * pmap_write_protect: write-protect pages in a pmap.
4009  *
4010  * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we
4011  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4012  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is
4013  * present the page will still be considered as a kernel page, and the privilege
4014  * separation will be enforced correctly.
4015  */
4016 void
4017 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4018 {
4019 	pt_entry_t bit_rem, bit_put;
4020 	pt_entry_t *ptes;
4021 	pd_entry_t * const *pdes;
4022 	struct pmap *pmap2;
4023 	vaddr_t blockend, va;
4024 
4025 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4026 
4027 	bit_rem = 0;
4028 	if (!(prot & VM_PROT_WRITE))
4029 		bit_rem = PG_RW;
4030 
4031 	bit_put = 0;
4032 	if (!(prot & VM_PROT_EXECUTE))
4033 		bit_put = pmap_pg_nx;
4034 
4035 	sva &= PG_FRAME;
4036 	eva &= PG_FRAME;
4037 
4038 	/* Acquire pmap. */
4039 	kpreempt_disable();
4040 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4041 
4042 	for (va = sva ; va < eva; va = blockend) {
4043 		pt_entry_t *spte, *epte;
4044 		int i;
4045 
4046 		blockend = x86_round_pdr(va + 1);
4047 		if (blockend > eva)
4048 			blockend = eva;
4049 
4050 		/*
4051 		 * Our PTE mappings should never be write-protected.
4052 		 *
4053 		 * XXXmaxv: still needed?
4054 		 *
4055 		 * A long term solution is to move the PTEs out of user address
4056 		 * space, and into kernel address space. Then we can set
4057 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
4058 		 */
4059 		for (i = 0; i < PDP_SIZE; i++) {
4060 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
4061 				panic("PTE space accessed");
4062 		}
4063 
4064 		/* Is it a valid block? */
4065 		if (!pmap_pdes_valid(va, pdes, NULL)) {
4066 			continue;
4067 		}
4068 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4069 
4070 		spte = &ptes[pl1_i(va)];
4071 		epte = &ptes[pl1_i(blockend)];
4072 
4073 		for (/* */; spte < epte; spte++) {
4074 			pt_entry_t opte, npte;
4075 
4076 			do {
4077 				opte = *spte;
4078 				if (!pmap_valid_entry(opte)) {
4079 					goto next;
4080 				}
4081 				npte = (opte & ~bit_rem) | bit_put;
4082 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4083 
4084 			if ((opte & PG_M) != 0) {
4085 				vaddr_t tva = x86_ptob(spte - ptes);
4086 				pmap_tlb_shootdown(pmap, tva, opte,
4087 				    TLBSHOOT_WRITE_PROTECT);
4088 			}
4089 next:;
4090 		}
4091 	}
4092 
4093 	/* Release pmap. */
4094 	pmap_unmap_ptes(pmap, pmap2);
4095 	kpreempt_enable();
4096 }
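
/*
 * Illustrative sketch (hypothetical caller): revoking write access to a
 * range of kernel mappings, then flushing the deferred shootdowns:
 *
 *	pmap_write_protect(pmap_kernel(), sva, eva, VM_PROT_READ);
 *	pmap_update(pmap_kernel());
 *
 * sva and eva are truncated to page boundaries by the PG_FRAME masking
 * above.
 */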
4097 
4098 /*
4099  * pmap_unwire: clear the wired bit in the PTE.
4100  *
4101  * => Mapping should already be present.
4102  */
4103 void
4104 pmap_unwire(struct pmap *pmap, vaddr_t va)
4105 {
4106 	pt_entry_t *ptes, *ptep, opte;
4107 	pd_entry_t * const *pdes;
4108 	struct pmap *pmap2;
4109 
4110 	/* Acquire pmap. */
4111 	kpreempt_disable();
4112 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4113 
4114 	if (!pmap_pdes_valid(va, pdes, NULL)) {
4115 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4116 	}
4117 
4118 	ptep = &ptes[pl1_i(va)];
4119 	opte = *ptep;
4120 	KASSERT(pmap_valid_entry(opte));
4121 
4122 	if (opte & PG_W) {
4123 		pt_entry_t npte = opte & ~PG_W;
4124 
4125 		opte = pmap_pte_testset(ptep, npte);
4126 		pmap_stats_update_bypte(pmap, npte, opte);
4127 	} else {
4128 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4129 		    " did not change!\n", __func__, pmap, va);
4130 	}
4131 
4132 	/* Release pmap. */
4133 	pmap_unmap_ptes(pmap, pmap2);
4134 	kpreempt_enable();
4135 }
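
/*
 * Illustrative sketch (hypothetical caller): a mapping entered with
 * PMAP_WIRED can later be unwired so the pagedaemon may evict it:
 *
 *	error = pmap_enter(pmap, va, pa, prot, flags | PMAP_WIRED);
 *	...
 *	pmap_unwire(pmap, va);
 */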
4136 
4137 /*
4138  * pmap_copy: copy mappings from one pmap to another
4139  *
4140  * => optional function
4141  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4142  */
4143 
4144 /*
4145  * defined as macro in pmap.h
4146  */
4147 
4148 __strict_weak_alias(pmap_enter, pmap_enter_default);
4149 
4150 int
4151 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4152     u_int flags)
4153 {
4154 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4155 }
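
/*
 * Illustrative sketch (hypothetical caller) of the pmap_enter() entry
 * point aliased above.  PMAP_CANFAIL makes resource shortage return
 * ENOMEM instead of panicking, and the access type in "flags" seeds
 * the PG_U/PG_M bits as done in pmap_enter_ma() below:
 *
 *	error = pmap_enter(pmap, va, pa, VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_WRITE | PMAP_CANFAIL);
 *	if (error != 0)
 *		return error;	// caller backs off and retries
 *	pmap_update(pmap);
 */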
4156 
4157 /*
4158  * pmap_enter: enter a mapping into a pmap
4159  *
4160  * => must be done "now" ... no lazy-evaluation
4161  * => we set pmap => pv_head locking
4162  */
4163 int
4164 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4165 	   vm_prot_t prot, u_int flags, int domid)
4166 {
4167 	pt_entry_t *ptes, opte, npte;
4168 	pt_entry_t *ptep;
4169 	pd_entry_t * const *pdes;
4170 	struct vm_page *ptp;
4171 	struct vm_page *new_pg, *old_pg;
4172 	struct pmap_page *new_pp, *old_pp;
4173 	struct pv_entry *old_pve = NULL;
4174 	struct pv_entry *new_pve;
4175 	struct pv_entry *new_sparepve;
4176 	int error;
4177 	bool wired = (flags & PMAP_WIRED) != 0;
4178 	struct pmap *pmap2;
4179 
4180 	KASSERT(pmap_initialized);
4181 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4182 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4183 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4184 	    PRIxVADDR " over PDP!", __func__, va);
4185 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4186 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4187 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4188 
4189 #ifdef XEN
4190 	KASSERT(domid == DOMID_SELF || pa == 0);
4191 #endif /* XEN */
4192 
4193 	npte = ma | protection_codes[prot] | PG_V;
4194 	npte |= pmap_pat_flags(flags);
4195 	if (wired)
4196 		npte |= PG_W;
4197 	if (va < VM_MAXUSER_ADDRESS)
4198 		npte |= PG_u;
4199 	else if (va < VM_MAX_ADDRESS)
4200 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4201 
4202 	if (pmap == pmap_kernel())
4203 		npte |= pmap_pg_g;
4204 	if (flags & VM_PROT_ALL) {
4205 		npte |= PG_U;
4206 		if (flags & VM_PROT_WRITE) {
4207 			KASSERT((npte & PG_RW) != 0);
4208 			npte |= PG_M;
4209 		}
4210 	}
4211 
4212 #ifdef XEN
4213 	if (domid != DOMID_SELF)
4214 		new_pg = NULL;
4215 	else
4216 #endif
4217 		new_pg = PHYS_TO_VM_PAGE(pa);
4218 	if (new_pg != NULL) {
4219 		/* This is a managed page */
4220 		npte |= PG_PVLIST;
4221 		new_pp = VM_PAGE_TO_PP(new_pg);
4222 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4223 		/* This is an unmanaged pv-tracked page */
4224 		npte |= PG_PVLIST;
4225 	} else {
4226 		new_pp = NULL;
4227 	}
4228 
4229 	/*
4230 	 * Try to get pves now if we might need them.
4231 	 * Keep going even if we fail, since we will not actually need them
4232 	 * if we are just changing the permissions on an existing mapping,
4233 	 * but we won't know if that's the case until later.
4234 	 */
4235 
4236 	bool needpves = pmap_pp_needs_pve(new_pp);
4237 	if (needpves) {
4238 		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4239 		new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4240 	} else {
4241 		new_pve = NULL;
4242 		new_sparepve = NULL;
4243 	}
4244 
4245 	kpreempt_disable();
4246 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4247 	if (pmap == pmap_kernel()) {
4248 		ptp = NULL;
4249 	} else {
4250 		ptp = pmap_get_ptp(pmap, va, pdes, flags);
4251 		if (ptp == NULL) {
4252 			pmap_unmap_ptes(pmap, pmap2);
4253 			if (flags & PMAP_CANFAIL) {
4254 				error = ENOMEM;
4255 				goto out;
4256 			}
4257 			panic("%s: get ptp failed", __func__);
4258 		}
4259 	}
4260 
4261 	/*
4262 	 * Check if there is an existing mapping.  If we are now sure that
4263 	 * we need pves and we failed to allocate them earlier, handle that.
4264 	 * Caching the value of oldpa here is safe because only the mod/ref bits
4265 	 * can change while the pmap is locked.
4266 	 */
4267 
4268 	ptep = &ptes[pl1_i(va)];
4269 	opte = *ptep;
4270 	bool have_oldpa = pmap_valid_entry(opte);
4271 	paddr_t oldpa = pmap_pte2pa(opte);
4272 
4273 	if (needpves && (!have_oldpa || oldpa != pa) &&
4274 	    (new_pve == NULL || new_sparepve == NULL)) {
4275 		pmap_unmap_ptes(pmap, pmap2);
4276 		if (flags & PMAP_CANFAIL) {
4277 			error = ENOMEM;
4278 			goto out;
4279 		}
4280 		panic("%s: pve allocation failed", __func__);
4281 	}
4282 
4283 	/*
4284 	 * update the pte.
4285 	 */
4286 
4287 	do {
4288 		opte = *ptep;
4289 
4290 		/*
4291 		 * if the same page, inherit PG_U and PG_M.
4292 		 */
4293 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4294 			npte |= opte & (PG_U | PG_M);
4295 		}
4296 #if defined(XEN)
4297 		if (domid != DOMID_SELF) {
4298 			/* pmap_pte_cas with error handling */
4299 			int s = splvm();
4300 			if (opte != *ptep) {
4301 				splx(s);
4302 				continue;
4303 			}
4304 			error = xpq_update_foreign(
4305 			    vtomach((vaddr_t)ptep), npte, domid);
4306 			splx(s);
4307 			if (error) {
4308 				if (ptp != NULL && ptp->wire_count <= 1) {
4309 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4310 				}
4311 				pmap_unmap_ptes(pmap, pmap2);
4312 				goto out;
4313 			}
4314 			break;
4315 		}
4316 #endif /* defined(XEN) */
4317 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4318 
4319 	/*
4320 	 * update statistics and PTP's reference count.
4321 	 */
4322 
4323 	pmap_stats_update_bypte(pmap, npte, opte);
4324 	if (ptp != NULL && !have_oldpa) {
4325 		ptp->wire_count++;
4326 	}
4327 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4328 
4329 	/*
4330 	 * if the same page, we can skip pv_entry handling.
4331 	 */
4332 
4333 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4334 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4335 		goto same_pa;
4336 	}
4337 
4338 	/*
4339 	 * if old page is pv-tracked, remove pv_entry from its list.
4340 	 */
4341 
4342 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4343 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
4344 			KASSERT(uvm_page_locked_p(old_pg));
4345 			old_pp = VM_PAGE_TO_PP(old_pg);
4346 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
4347 			panic("%s: PG_PVLIST with pv-untracked page"
4348 			    " va = %#"PRIxVADDR
4349 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
4350 			    __func__, va, oldpa, atop(pa));
4351 		}
4352 
4353 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4354 		old_pp->pp_attrs |= opte;
4355 	}
4356 
4357 	/*
4358 	 * if new page is pv-tracked, insert pv_entry into its list.
4359 	 */
4360 
4361 	if (new_pp) {
4362 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4363 	}
4364 
4365 same_pa:
4366 	pmap_unmap_ptes(pmap, pmap2);
4367 
4368 	/*
4369 	 * shootdown tlb if necessary.
4370 	 */
4371 
4372 	if ((~opte & (PG_V | PG_U)) == 0 &&
4373 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4374 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4375 	}
4376 
4377 	error = 0;
4378 out:
4379 	kpreempt_enable();
4380 	if (old_pve != NULL) {
4381 		pool_cache_put(&pmap_pv_cache, old_pve);
4382 	}
4383 	if (new_pve != NULL) {
4384 		pool_cache_put(&pmap_pv_cache, new_pve);
4385 	}
4386 	if (new_sparepve != NULL) {
4387 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4388 	}
4389 
4390 	return error;
4391 }
4392 
4393 static paddr_t
4394 pmap_get_physpage(void)
4395 {
4396 	struct vm_page *ptp;
4397 	struct pmap *kpm = pmap_kernel();
4398 	paddr_t pa;
4399 
4400 	if (!uvm.page_init_done) {
4401 		/*
4402 		 * We're growing the kernel pmap early (from
4403 		 * uvm_pageboot_alloc()). This case must be
4404 		 * handled a little differently.
4405 		 */
4406 
4407 		if (!uvm_page_physget(&pa))
4408 			panic("%s: out of memory", __func__);
4409 #if defined(__HAVE_DIRECT_MAP)
4410 		pagezero(PMAP_DIRECT_MAP(pa));
4411 #else
4412 #if defined(XEN)
4413 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4414 			xen_pagezero(pa);
4415 			return pa;
4416 		}
4417 #endif
4418 		kpreempt_disable();
4419 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4420 		    PG_RW | pmap_pg_nx);
4421 		pmap_pte_flush();
4422 		pmap_update_pg((vaddr_t)early_zerop);
4423 		memset(early_zerop, 0, PAGE_SIZE);
4424 #if defined(DIAGNOSTIC) || defined(XEN)
4425 		pmap_pte_set(early_zero_pte, 0);
4426 		pmap_pte_flush();
4427 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4428 		kpreempt_enable();
4429 #endif /* defined(__HAVE_DIRECT_MAP) */
4430 	} else {
4431 		/* XXX */
4432 		ptp = uvm_pagealloc(NULL, 0, NULL,
4433 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4434 		if (ptp == NULL)
4435 			panic("%s: out of memory", __func__);
4436 		ptp->flags &= ~PG_BUSY;
4437 		ptp->wire_count = 1;
4438 		pa = VM_PAGE_TO_PHYS(ptp);
4439 	}
4440 	pmap_stats_update(kpm, 1, 0);
4441 
4442 	return pa;
4443 }
4444 
4445 /*
4446  * Expand the page tree with the specified amount of PTPs, mapping virtual
4447  * addresses starting at kva. We populate all the levels but the last one
4448  * (L1). The nodes of the tree are created as RWX, but the pages covered
4449  * will be kentered in L1, with proper permissions.
4450  *
4451  * Used only by pmap_growkernel.
4452  */
4453 static void
4454 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
4455 {
4456 	unsigned long i;
4457 	paddr_t pa;
4458 	unsigned long index, endindex;
4459 	int level;
4460 	pd_entry_t *pdep;
4461 #ifdef XEN
4462 	int s = splvm(); /* protect xpq_* */
4463 #endif
4464 
4465 	for (level = PTP_LEVELS; level > 1; level--) {
4466 		if (level == PTP_LEVELS)
4467 			pdep = cpm->pm_pdir;
4468 		else
4469 			pdep = normal_pdes[level - 2];
4470 		index = pl_i_roundup(kva, level);
4471 		endindex = index + needed_ptps[level - 1] - 1;
4472 
4473 		for (i = index; i <= endindex; i++) {
4474 			pt_entry_t pte;
4475 
4476 			KASSERT(!pmap_valid_entry(pdep[i]));
4477 			pa = pmap_get_physpage();
4478 			pte = pmap_pa2pte(pa) | PG_V | PG_RW;
4479 			pmap_pte_set(&pdep[i], pte);
4480 
4481 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4482 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4483 				if (__predict_true(
4484 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4485 					/* update per-cpu PMDs on all cpus */
4486 					xen_kpm_sync(pmap_kernel(), i);
4487 				} else {
4488 					/*
4489 					 * too early; update primary CPU
4490 					 * PMD only (without locks)
4491 					 */
4492 #ifdef PAE
4493 					pd_entry_t *cpu_pdep =
4494 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4495 #endif
4496 #ifdef __x86_64__
4497 					pd_entry_t *cpu_pdep =
4498 						&cpu_info_primary.ci_kpm_pdir[i];
4499 #endif
4500 					pmap_pte_set(cpu_pdep, pte);
4501 				}
4502 			}
4503 #endif /* XEN && (PAE || __x86_64__) */
4504 
4505 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4506 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4507 			nkptp[level - 1]++;
4508 		}
4509 		pmap_pte_flush();
4510 	}
4511 #ifdef XEN
4512 	splx(s);
4513 #endif
4514 }
4515 
4516 /*
4517  * pmap_growkernel: increase usage of KVM space.
4518  *
4519  * => we allocate new PTPs for the kernel and install them in all
4520  *    the pmaps on the system.
4521  */
4522 
4523 vaddr_t
4524 pmap_growkernel(vaddr_t maxkvaddr)
4525 {
4526 	struct pmap *kpm = pmap_kernel();
4527 	struct pmap *cpm;
4528 #if !defined(XEN) || !defined(__x86_64__)
4529 	struct pmap *pm;
4530 	long old;
4531 #endif
4532 	int s, i;
4533 	long needed_kptp[PTP_LEVELS], target_nptp;
4534 	bool invalidate = false;
4535 
4536 	s = splvm();	/* to be safe */
4537 	mutex_enter(kpm->pm_lock);
4538 
4539 	if (maxkvaddr <= pmap_maxkvaddr) {
4540 		mutex_exit(kpm->pm_lock);
4541 		splx(s);
4542 		return pmap_maxkvaddr;
4543 	}
4544 
4545 	maxkvaddr = x86_round_pdr(maxkvaddr);
4546 #if !defined(XEN) || !defined(__x86_64__)
4547 	old = nkptp[PTP_LEVELS - 1];
4548 #endif
4549 
4550 	/* Initialize needed_kptp. */
4551 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4552 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4553 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4554 
4555 		if (target_nptp > nkptpmax[i])
4556 			panic("out of KVA space");
4557 		KASSERT(target_nptp >= nkptp[i]);
4558 		needed_kptp[i] = target_nptp - nkptp[i];
4559 	}
4560 
4561 #if defined(XEN) && (defined(__x86_64__) || defined(PAE))
4562 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
4563 	cpm = kpm;
4564 #else
4565 	/* Get the current pmap */
4566 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4567 		cpm = curcpu()->ci_pmap;
4568 	} else {
4569 		cpm = kpm;
4570 	}
4571 #endif
4572 
4573 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
4574 
4575 	/*
4576 	 * If the number of top level entries changed, update all pmaps.
4577 	 */
4578 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4579 #ifdef XEN
4580 #ifdef __x86_64__
4581 		/* nothing, kernel entries are never entered in user pmap */
4582 #else /* __x86_64__ */
4583 		int pdkidx;
4584 #ifndef PAE
4585 		/*
4586 		 * for PAE this is not needed, because pmap_alloc_level()
4587 		 * has already updated the per-CPU tables
4588 		 */
4589 		if (cpm != kpm) {
4590 			for (pdkidx = PDIR_SLOT_KERN + old;
4591 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4592 			    pdkidx++) {
4593 				pmap_pte_set(&kpm->pm_pdir[pdkidx],
4594 				    cpm->pm_pdir[pdkidx]);
4595 			}
4596 			pmap_pte_flush();
4597 		}
4598 #endif /* !PAE */
4599 
4600 		mutex_enter(&pmaps_lock);
4601 		LIST_FOREACH(pm, &pmaps, pm_list) {
4602 			for (pdkidx = PDIR_SLOT_KERN + old;
4603 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4604 			    pdkidx++) {
4605 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4606 				    kpm->pm_pdir[pdkidx]);
4607 			}
4608 			pmap_pte_flush();
4609 		}
4610 		mutex_exit(&pmaps_lock);
4611 #endif /* __x86_64__ */
4612 #else /* XEN */
4613 		size_t newpdes;
4614 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4615 		if (cpm != kpm) {
4616 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
4617 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
4618 			    newpdes * sizeof(pd_entry_t));
4619 		}
4620 
4621 		mutex_enter(&pmaps_lock);
4622 		LIST_FOREACH(pm, &pmaps, pm_list) {
4623 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4624 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4625 			    newpdes * sizeof (pd_entry_t));
4626 		}
4627 		mutex_exit(&pmaps_lock);
4628 #endif
4629 		invalidate = true;
4630 	}
4631 	pmap_maxkvaddr = maxkvaddr;
4632 	mutex_exit(kpm->pm_lock);
4633 	splx(s);
4634 
4635 	if (invalidate && pmap_initialized) {
4636 		/* Invalidate the PDP cache. */
4637 		pool_cache_invalidate(&pmap_pdp_cache);
4638 	}
4639 
4640 	return maxkvaddr;
4641 }
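
/*
 * Illustrative sketch (hypothetical caller, mirroring how the VM system
 * is expected to use this): make sure the kernel page tables cover a
 * new end of kernel VA before handing out mappings there:
 *
 *	if (new_kva_end > pmap_maxkvaddr)
 *		(void)pmap_growkernel(new_kva_end);
 */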
4642 
4643 #ifdef DEBUG
4644 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4645 
4646 /*
4647  * pmap_dump: dump all the mappings from a pmap
4648  *
4649  * => caller should not be holding any pmap locks
4650  */
4651 
4652 void
4653 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4654 {
4655 	pt_entry_t *ptes, *pte;
4656 	pd_entry_t * const *pdes;
4657 	struct pmap *pmap2;
4658 	vaddr_t blkendva;
4659 
4660 	/*
4661 	 * if end is out of range, truncate it.
4662 	 * if end <= start, dump up to the top of the user address space.
4663 	 */
4664 
4665 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4666 		eva = VM_MAXUSER_ADDRESS;
4667 
4668 	/*
4669 	 * we lock in the pmap => pv_head direction
4670 	 */
4671 
4672 	kpreempt_disable();
4673 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4674 
4675 	/*
4676 	 * dumping a range of pages: we dump in PTP-sized blocks (2MB or 4MB,
4677 	 * depending on the paging mode)
4677 	 */
4678 
4679 	for (/* null */ ; sva < eva ; sva = blkendva) {
4680 
4681 		/* determine range of block */
4682 		blkendva = x86_round_pdr(sva+1);
4683 		if (blkendva > eva)
4684 			blkendva = eva;
4685 
4686 		/* valid block? */
4687 		if (!pmap_pdes_valid(sva, pdes, NULL))
4688 			continue;
4689 
4690 		pte = &ptes[pl1_i(sva)];
4691 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4692 			if (!pmap_valid_entry(*pte))
4693 				continue;
4694 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4695 			    " (pte=%#" PRIxPADDR ")\n",
4696 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4697 		}
4698 	}
4699 	pmap_unmap_ptes(pmap, pmap2);
4700 	kpreempt_enable();
4701 }
4702 #endif
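
/*
 * Illustrative use of pmap_dump() (DEBUG kernels only; hypothetical
 * call site, e.g. a debugger hook): print every valid user mapping of
 * a pmap of interest:
 *
 *	pmap_dump(pmap, 0, VM_MAXUSER_ADDRESS);
 */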
4703 
4704 /*
4705  * pmap_update: process deferred invalidations and frees.
4706  */
4707 
4708 void
4709 pmap_update(struct pmap *pmap)
4710 {
4711 	struct vm_page *empty_ptps;
4712 	lwp_t *l = curlwp;
4713 
4714 	/*
4715 	 * If we have torn down this pmap, invalidate non-global TLB
4716 	 * entries on any processors using it.
4717 	 */
4718 	kpreempt_disable();
4719 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4720 		l->l_md.md_gc_pmap = NULL;
4721 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4722 	}
4723 
4724 	/*
4725 	 * Initiate any pending TLB shootdowns.  Wait for them to
4726 	 * complete before returning control to the caller.
4727 	 */
4728 	pmap_tlb_shootnow();
4729 	kpreempt_enable();
4730 
4731 	/*
4732 	 * Now that shootdowns are complete, process deferred frees,
4733 	 * but not from interrupt context.
4734 	 */
4735 	if (l->l_md.md_gc_ptp != NULL) {
4736 		KASSERT((l->l_pflag & LP_INTR) == 0);
4737 		if (cpu_intr_p()) {
4738 			return;
4739 		}
4740 		empty_ptps = l->l_md.md_gc_ptp;
4741 		l->l_md.md_gc_ptp = NULL;
4742 		pmap_free_ptps(empty_ptps);
4743 	}
4744 }
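
/*
 * Illustrative sketch (hypothetical caller): batch several mapping
 * changes and flush them with a single pmap_update(), as the comments
 * above describe:
 *
 *	pmap_kenter_pa(va1, pa1, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_kenter_pa(va2, pa2, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	pmap_update(pmap_kernel());
 */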
4745 
4746 #if PTP_LEVELS > 4
4747 #error "Unsupported number of page table mappings"
4748 #endif
4749 
4750 paddr_t
4751 pmap_init_tmp_pgtbl(paddr_t pg)
4752 {
4753 	static bool maps_loaded;
4754 	static const paddr_t x86_tmp_pml_paddr[] = {
4755 	    4 * PAGE_SIZE,	/* L1 */
4756 	    5 * PAGE_SIZE,	/* L2 */
4757 	    6 * PAGE_SIZE,	/* L3 */
4758 	    7 * PAGE_SIZE	/* L4 */
4759 	};
4760 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4761 
4762 	pd_entry_t *tmp_pml, *kernel_pml;
4763 
4764 	int level;
4765 
4766 	if (!maps_loaded) {
4767 		for (level = 0; level < PTP_LEVELS; ++level) {
4768 			x86_tmp_pml_vaddr[level] =
4769 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4770 			    UVM_KMF_VAONLY);
4771 
4772 			if (x86_tmp_pml_vaddr[level] == 0)
4773 				panic("mapping of real mode PML failed\n");
4774 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4775 			    x86_tmp_pml_paddr[level],
4776 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4777 		}
4778 		pmap_update(pmap_kernel());
4779 		maps_loaded = true;
4780 	}
4781 
4782 	/* Zero levels 1-3 */
4783 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4784 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4785 		memset(tmp_pml, 0, PAGE_SIZE);
4786 	}
4787 
4788 	/* Copy PML4 */
4789 	kernel_pml = pmap_kernel()->pm_pdir;
4790 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4791 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4792 
4793 #ifdef PAE
4794 	/*
4795 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4796 	 * last entries are unlikely to be used for temporary mappings.
4797 	 * 508: maps 0->1GB (userland)
4798 	 * 509: unused
4799 	 * 510: unused
4800 	 * 511: maps 3->4GB (kernel)
4801 	 */
4802 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4803 	tmp_pml[509] = 0;
4804 	tmp_pml[510] = 0;
4805 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4806 #endif
4807 
4808 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4809 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4810 
4811 		tmp_pml[pl_i(pg, level + 1)] =
4812 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4813 	}
4814 
4815 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4816 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4817 
4818 #ifdef PAE
4819 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4820 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4821 #endif
4822 
4823 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4824 }
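
/*
 * Illustrative sketch (hypothetical caller, e.g. CPU bootstrap or ACPI
 * wakeup code; the surrounding details are assumptions): build a
 * throwaway page table so a low-memory trampoline page at
 * "trampoline_pa" is mapped, and hand the returned physical address to
 * the trampoline as its initial %cr3:
 *
 *	paddr_t tmp_cr3 = pmap_init_tmp_pgtbl(trampoline_pa);
 *	// ... stash tmp_cr3 where the trampoline expects to find its
 *	// page directory address ...
 */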
4825 
4826 u_int
4827 x86_mmap_flags(paddr_t mdpgno)
4828 {
4829 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4830 	u_int pflag = 0;
4831 
4832 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4833 		pflag |= PMAP_WRITE_COMBINE;
4834 
4835 	return pflag;
4836 }
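
/*
 * Illustrative sketch (hypothetical driver mmap routine; x86_btop() and
 * the encoding below are assumptions, only the decoding above is from
 * this file): request a write-combining mapping by setting
 * X86_MMAP_FLAG_PREFETCH in the bits that x86_mmap_flags() extracts:
 *
 *	return x86_btop(pa) |
 *	    ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
 */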
4837