xref: /netbsd-src/sys/arch/x86/x86/pmap.c (revision c38e7cc395b1472a774ff828e46123de44c628e9)
1 /*	$NetBSD: pmap.c,v 1.289 2018/03/04 23:25:35 jdolecek Exp $	*/
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright (c) 1997 Charles D. Cranor and Washington University.
74  * All rights reserved.
75  *
76  * Redistribution and use in source and binary forms, with or without
77  * modification, are permitted provided that the following conditions
78  * are met:
79  * 1. Redistributions of source code must retain the above copyright
80  *    notice, this list of conditions and the following disclaimer.
81  * 2. Redistributions in binary form must reproduce the above copyright
82  *    notice, this list of conditions and the following disclaimer in the
83  *    documentation and/or other materials provided with the distribution.
84  *
85  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
86  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
87  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
88  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
89  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
90  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
94  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95  */
96 
97 /*
98  * Copyright 2001 (c) Wasabi Systems, Inc.
99  * All rights reserved.
100  *
101  * Written by Frank van der Linden for Wasabi Systems, Inc.
102  *
103  * Redistribution and use in source and binary forms, with or without
104  * modification, are permitted provided that the following conditions
105  * are met:
106  * 1. Redistributions of source code must retain the above copyright
107  *    notice, this list of conditions and the following disclaimer.
108  * 2. Redistributions in binary form must reproduce the above copyright
109  *    notice, this list of conditions and the following disclaimer in the
110  *    documentation and/or other materials provided with the distribution.
111  * 3. All advertising materials mentioning features or use of this software
112  *    must display the following acknowledgement:
113  *      This product includes software developed for the NetBSD Project by
114  *      Wasabi Systems, Inc.
115  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
116  *    or promote products derived from this software without specific prior
117  *    written permission.
118  *
119  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
120  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
121  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
122  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
123  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
124  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
125  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
126  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
127  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
128  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
129  * POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 /*
133  * This is the i386 pmap modified and generalized to support x86-64
134  * as well. The idea is to hide the upper N levels of the page tables
135  * inside pmap_get_ptp, pmap_free_ptp and pmap_growkernel. The rest
136  * is mostly untouched, except that it uses some more generalized
137  * macros and interfaces.
138  *
139  * This pmap has been tested on the i386 as well, and it can be easily
140  * adapted to PAE.
141  *
142  * fvdl@wasabisystems.com 18-Jun-2001
143  */
144 
145 /*
146  * pmap.c: i386 pmap module rewrite
147  * Chuck Cranor <chuck@netbsd>
148  * 11-Aug-97
149  *
150  * history of this pmap module: in addition to my own input, i used
151  *    the following references for this rewrite of the i386 pmap:
152  *
153  * [1] the NetBSD i386 pmap.   this pmap appears to be based on the
154  *     BSD hp300 pmap done by Mike Hibler at University of Utah.
155  *     it was then ported to the i386 by William Jolitz of UUNET
156  *     Technologies, Inc.   Then Charles M. Hannum of the NetBSD
157  *     project fixed some bugs and provided some speed ups.
158  *
159  * [2] the FreeBSD i386 pmap.   this pmap seems to be the
160  *     Hibler/Jolitz pmap, as modified for FreeBSD by John S. Dyson
161  *     and David Greenman.
162  *
163  * [3] the Mach pmap.   this pmap, from CMU, seems to have migrated
164  *     between several processors.   the VAX version was done by
165  *     Avadis Tevanian, Jr., and Michael Wayne Young.    the i386
166  *     version was done by Lance Berc, Mike Kupfer, Bob Baron,
167  *     David Golub, and Richard Draves.    the alpha version was
168  *     done by Alessandro Forin (CMU/Mach) and Chris Demetriou
169  *     (NetBSD/alpha).
170  */
171 
172 #include <sys/cdefs.h>
173 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.289 2018/03/04 23:25:35 jdolecek Exp $");
174 
175 #include "opt_user_ldt.h"
176 #include "opt_lockdebug.h"
177 #include "opt_multiprocessor.h"
178 #include "opt_xen.h"
179 #include "opt_svs.h"
180 
181 #include <sys/param.h>
182 #include <sys/systm.h>
183 #include <sys/proc.h>
184 #include <sys/pool.h>
185 #include <sys/kernel.h>
186 #include <sys/atomic.h>
187 #include <sys/cpu.h>
188 #include <sys/intr.h>
189 #include <sys/xcall.h>
190 #include <sys/kcore.h>
191 
192 #include <uvm/uvm.h>
193 #include <uvm/pmap/pmap_pvt.h>
194 
195 #include <dev/isa/isareg.h>
196 
197 #include <machine/specialreg.h>
198 #include <machine/gdt.h>
199 #include <machine/isa_machdep.h>
200 #include <machine/cpuvar.h>
201 #include <machine/cputypes.h>
202 
203 #include <x86/pmap.h>
204 #include <x86/pmap_pv.h>
205 
206 #include <x86/i82489reg.h>
207 #include <x86/i82489var.h>
208 
209 #ifdef XEN
210 #include <xen/xen-public/xen.h>
211 #include <xen/hypervisor.h>
212 #endif
213 
214 /*
215  * general info:
216  *
217  *  - for an explanation of how the i386 MMU hardware works see
218  *    the comments in <machine/pte.h>.
219  *
220  *  - for an explanation of the general memory structure used by
221  *    this pmap (including the recursive mapping), see the comments
222  *    in <machine/pmap.h>.
223  *
224  * this file contains the code for the "pmap module."   the module's
225  * job is to manage the hardware's virtual to physical address mappings.
226  * note that there are two levels of mapping in the VM system:
227  *
228  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
229  *      to map ranges of virtual address space to objects/files.  for
230  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
231  *      to the file /bin/ls starting at offset zero."   note that
232  *      the upper layer mapping is not concerned with how individual
233  *      vm_pages are mapped.
234  *
235  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
236  *      from virtual addresses.   it is concerned with which vm_page is
237  *      mapped where.   for example, when you run /bin/ls and start
238  *      at page 0x1000 the fault routine may lookup the correct page
239  *      of the /bin/ls file and then ask the pmap layer to establish
240  *      a mapping for it.
241  *
242  * note that information in the lower layer of the VM system can be
243  * thrown away since it can easily be reconstructed from the info
244  * in the upper layer.
245  *
246  * data structures we use include:
247  *
248  *  - struct pmap: describes the address space of one thread
249  *  - struct pmap_page: describes one pv-tracked page, without
250  *	necessarily a corresponding vm_page
251  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
252  *  - struct pv_head: there is one pv_head per pv-tracked page of
253  *	physical memory.   the pv_head points to a list of pv_entry
254  *	structures which describe all the <PMAP,VA> pairs that this
255  *      page is mapped in.    this is critical for page based operations
256  *      such as pmap_page_protect() [change protection on _all_ mappings
257  *      of a page]
258  */
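/*
 * an informal sketch (example values only) of the pv tracking described
 * above, for one physical page mapped by two different pmaps:
 *
 *	pv_head --> pv_entry <pmap A, va 0x1000>
 *	        --> pv_entry <pmap B, va 0xb000>
 *
 * pmap_page_protect() and friends walk this list to reach every
 * <PMAP,VA> pair that maps the page.
 */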
259 
260 /*
261  * memory allocation
262  *
263  *  - there are three data structures that we must dynamically allocate:
264  *
265  * [A] new process' page directory page (PDP)
266  *	- plan 1: done at pmap_create() we use
267  *	  uvm_km_alloc(kernel_map, PAGE_SIZE)  [fka kmem_alloc] to do this
268  *	  allocation.
269  *
270  * if we are low in free physical memory then we sleep in
271  * uvm_km_alloc -- in this case this is ok since we are creating
272  * a new pmap and should not be holding any locks.
273  *
274  * if the kernel is totally out of virtual space
275  * (i.e. uvm_km_alloc returns NULL), then we panic.
276  *
277  * [B] new page tables pages (PTP)
278  * 	- call uvm_pagealloc()
279  * 		=> success: zero page, add to pm_pdir
280  * 		=> failure: we are out of free vm_pages, let pmap_enter()
281  *		   tell UVM about it.
282  *
283  * note: for kernel PTPs, we start with NKPTP of them.   as we map
284  * kernel memory (at uvm_map time) we check to see if we've grown
285  * the kernel pmap.   if so, we call the optional function
286  * pmap_growkernel() to grow the kernel PTPs in advance.
287  *
288  * [C] pv_entry structures
289  */
290 
291 /*
292  * locking
293  *
294  * we have the following locks that we must contend with:
295  *
296  * mutexes:
297  *
298  * - pmap lock (per pmap, part of uvm_object)
299  *   this lock protects the fields in the pmap structure including
300  *   the non-kernel PDEs in the PDP, and the PTEs.  it also locks
301  *   in the alternate PTE space (since that is determined by the
302  *   entry in the PDP).
303  *
304  * - pvh_lock (per pv_head)
305  *   this lock protects the pv_entry list which is chained off the
306  *   pv_head structure for a specific pv-tracked PA.   it is locked
307  *   when traversing the list (e.g. adding/removing mappings,
308  *   syncing R/M bits, etc.)
309  *
310  * - pmaps_lock
311  *   this lock protects the list of active pmaps (headed by "pmaps").
312  *   we lock it when adding or removing pmaps from this list.
313  */
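/*
 * a minimal sketch of the pmaps_lock protocol described above, as used
 * when a new pmap is put on the global list (see pmap_create() later in
 * this file):
 *
 *	mutex_enter(&pmaps_lock);
 *	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 *	mutex_exit(&pmaps_lock);
 */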
314 
315 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
316 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
317 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
318 const long nbpd[] = NBPD_INITIALIZER;
319 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
320 
321 long nkptp[] = NKPTP_INITIALIZER;
322 
323 struct pmap_head pmaps;
324 kmutex_t pmaps_lock;
325 
326 struct pcpu_area *pcpuarea __read_mostly;
327 
328 static vaddr_t pmap_maxkvaddr;
329 
330 /*
331  * XXX kludge: dummy locking to make KASSERTs in uvm_page.c comfortable.
332  * actual locking is done by pm_lock.
333  */
334 #if defined(DIAGNOSTIC)
335 #define	PMAP_SUBOBJ_LOCK(pm, idx) \
336 	KASSERT(mutex_owned((pm)->pm_lock)); \
337 	if ((idx) != 0) \
338 		mutex_enter((pm)->pm_obj[(idx)].vmobjlock)
339 #define	PMAP_SUBOBJ_UNLOCK(pm, idx) \
340 	KASSERT(mutex_owned((pm)->pm_lock)); \
341 	if ((idx) != 0) \
342 		mutex_exit((pm)->pm_obj[(idx)].vmobjlock)
343 #else /* defined(DIAGNOSTIC) */
344 #define	PMAP_SUBOBJ_LOCK(pm, idx)	/* nothing */
345 #define	PMAP_SUBOBJ_UNLOCK(pm, idx)	/* nothing */
346 #endif /* defined(DIAGNOSTIC) */
347 
348 /*
349  * Misc. event counters.
350  */
351 struct evcnt pmap_iobmp_evcnt;
352 struct evcnt pmap_ldt_evcnt;
353 
354 /*
355  * PAT
356  */
357 #define	PATENTRY(n, type)	(type << ((n) * 8))
358 #define	PAT_UC		0x0ULL
359 #define	PAT_WC		0x1ULL
360 #define	PAT_WT		0x4ULL
361 #define	PAT_WP		0x5ULL
362 #define	PAT_WB		0x6ULL
363 #define	PAT_UCMINUS	0x7ULL
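/*
 * example (illustration only): PATENTRY(1, PAT_WC) expands to
 * 0x1ULL << 8 = 0x100, i.e. it places the WC type in byte 1 of the
 * 64-bit PAT MSR image assembled in pat_init() below.
 */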
364 
365 static bool cpu_pat_enabled __read_mostly = false;
366 
367 /*
368  * Global data structures
369  */
370 
371 static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
372 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
373 
374 struct bootspace bootspace __read_mostly;
375 
376 /*
377  * pmap_pg_nx: if our processor supports PG_NX in the PTE then we
378  * set pmap_pg_nx to PG_NX (otherwise it is zero).
379  */
380 pd_entry_t pmap_pg_nx __read_mostly = 0;
381 
382 /*
383  * pmap_pg_g: if our processor supports PG_G in the PTE then we
384  * set pmap_pg_g to PG_G (otherwise it is zero).
385  */
386 pd_entry_t pmap_pg_g __read_mostly = 0;
387 
388 /*
389  * pmap_largepages: if our processor supports PG_PS and we are
390  * using it, this is set to true.
391  */
392 int pmap_largepages __read_mostly = 0;
393 
394 /*
395  * i386 physical memory comes in a big contig chunk with a small
396  * hole toward the front of it...  the following two paddr_t's
397  * (shared with machdep.c) describe the physical address space
398  * of this machine.
399  */
400 paddr_t lowmem_rsvd __read_mostly;
401 paddr_t avail_start __read_mostly; /* PA of first available physical page */
402 paddr_t avail_end __read_mostly; /* PA of last available physical page */
403 
404 #ifdef XEN
405 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
406 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
407 #endif
408 
409 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
410 
411 #define	PV_HASH_SIZE		32768
412 #define	PV_HASH_LOCK_CNT	32
413 
414 struct pv_hash_lock {
415 	kmutex_t lock;
416 } __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
417     __aligned(CACHE_LINE_SIZE);
418 
419 struct pv_hash_head {
420 	SLIST_HEAD(, pv_entry) hh_list;
421 } pv_hash_heads[PV_HASH_SIZE];
422 
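/*
 * pvhash_hash: hash a <PTP,VA> pair into the pv hash table.  the PTP
 * pointer is scaled down by sizeof(struct vm_page) so consecutive PTPs
 * tend to land in different buckets, and the page number of the VA is
 * added on top.
 */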
423 static u_int
424 pvhash_hash(struct vm_page *ptp, vaddr_t va)
425 {
426 
427 	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
428 }
429 
430 static struct pv_hash_head *
431 pvhash_head(u_int hash)
432 {
433 
434 	return &pv_hash_heads[hash % PV_HASH_SIZE];
435 }
436 
437 static kmutex_t *
438 pvhash_lock(u_int hash)
439 {
440 
441 	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
442 }
443 
444 static struct pv_entry *
445 pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
446 {
447 	struct pv_entry *pve;
448 	struct pv_entry *prev;
449 
450 	prev = NULL;
451 	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
452 		if (pve->pve_pte.pte_ptp == ptp &&
453 		    pve->pve_pte.pte_va == va) {
454 			if (prev != NULL) {
455 				SLIST_REMOVE_AFTER(prev, pve_hash);
456 			} else {
457 				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
458 			}
459 			break;
460 		}
461 		prev = pve;
462 	}
463 	return pve;
464 }
465 
466 /*
467  * Other data structures
468  */
469 
470 static pt_entry_t protection_codes[8] __read_mostly;
471 
472 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
473 
474 /*
475  * The following two vaddr_t's are used during system startup to keep track of
476  * how much of the kernel's VM space we have used. Once the system is started,
477  * the management of the remaining kernel VM space is turned over to the
478  * kernel_map vm_map.
479  */
480 static vaddr_t virtual_avail __read_mostly;	/* VA of first free KVA */
481 static vaddr_t virtual_end __read_mostly;	/* VA of last free KVA */
482 
483 #ifndef XEN
484 /*
485  * LAPIC virtual address, and fake physical address.
486  */
487 volatile vaddr_t local_apic_va __read_mostly;
488 paddr_t local_apic_pa __read_mostly;
489 #endif
490 
491 /*
492  * pool that pmap structures are allocated from
493  */
494 static struct pool_cache pmap_cache;
495 
496 /*
497  * pv_entry cache
498  */
499 static struct pool_cache pmap_pv_cache;
500 
501 #ifdef __HAVE_DIRECT_MAP
502 vaddr_t pmap_direct_base __read_mostly;
503 vaddr_t pmap_direct_end __read_mostly;
504 size_t pmap_direct_pdpe __read_mostly;
505 size_t pmap_direct_npdp __read_mostly;
506 #endif
507 
508 #ifndef __HAVE_DIRECT_MAP
509 /*
510  * Special VAs and the PTEs that map them
511  */
512 static pt_entry_t *early_zero_pte;
513 static void pmap_vpage_cpualloc(struct cpu_info *);
514 #ifdef XEN
515 char *early_zerop; /* also referenced from xen_locore() */
516 #else
517 static char *early_zerop;
518 #endif
519 #endif
520 
521 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
522 
523 /* PDP pool_cache(9) and its callbacks */
524 struct pool_cache pmap_pdp_cache;
525 static int  pmap_pdp_ctor(void *, void *, int);
526 static void pmap_pdp_dtor(void *, void *);
527 #ifdef PAE
528 /* need to allocate items of 4 pages */
529 static void *pmap_pdp_alloc(struct pool *, int);
530 static void pmap_pdp_free(struct pool *, void *);
531 static struct pool_allocator pmap_pdp_allocator = {
532 	.pa_alloc = pmap_pdp_alloc,
533 	.pa_free = pmap_pdp_free,
534 	.pa_pagesz = PAGE_SIZE * PDP_SIZE,
535 };
536 #endif /* PAE */
537 
538 extern vaddr_t idt_vaddr;
539 extern paddr_t idt_paddr;
540 extern vaddr_t gdt_vaddr;
541 extern paddr_t gdt_paddr;
542 extern vaddr_t ldt_vaddr;
543 extern paddr_t ldt_paddr;
544 
545 extern int end;
546 
547 #ifdef i386
548 /* stuff to fix the pentium f00f bug */
549 extern vaddr_t pentium_idt_vaddr;
550 #endif
551 
552 /*
553  * Local prototypes
554  */
555 
556 #ifdef __HAVE_PCPU_AREA
557 static void pmap_init_pcpu(void);
558 #endif
559 #ifdef __HAVE_DIRECT_MAP
560 static void pmap_init_directmap(struct pmap *);
561 #endif
562 #if !defined(XEN)
563 static void pmap_remap_global(void);
564 #endif
565 #ifndef XEN
566 static void pmap_init_lapic(void);
567 static void pmap_remap_largepages(void);
568 #endif
569 
570 static struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t,
571     pd_entry_t * const *, int);
572 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
573 static void pmap_freepage(struct pmap *, struct vm_page *, int);
574 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
575     pt_entry_t *, pd_entry_t * const *);
576 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
577     vaddr_t, struct pv_entry **);
578 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
579     vaddr_t, struct pv_entry **);
580 
581 static paddr_t pmap_get_physpage(void);
582 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
583 
584 static void pmap_reactivate(struct pmap *);
585 
586 /*
587  * p m a p   h e l p e r   f u n c t i o n s
588  */
589 
590 static inline void
591 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
592 {
593 
594 	if (pmap == pmap_kernel()) {
595 		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
596 		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
597 	} else {
598 		KASSERT(mutex_owned(pmap->pm_lock));
599 		pmap->pm_stats.resident_count += resid_diff;
600 		pmap->pm_stats.wired_count += wired_diff;
601 	}
602 }
603 
604 static inline void
605 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
606 {
607 	int resid_diff = ((npte & PG_V) ? 1 : 0) - ((opte & PG_V) ? 1 : 0);
608 	int wired_diff = ((npte & PG_W) ? 1 : 0) - ((opte & PG_W) ? 1 : 0);
609 
610 	KASSERT((npte & (PG_V | PG_W)) != PG_W);
611 	KASSERT((opte & (PG_V | PG_W)) != PG_W);
612 
613 	pmap_stats_update(pmap, resid_diff, wired_diff);
614 }
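/*
 * example (illustration only): replacing an invalid PTE with a valid,
 * wired one gives resid_diff = 1 - 0 = 1 and wired_diff = 1 - 0 = 1;
 * merely clearing PG_W on a valid PTE gives resid_diff = 0 and
 * wired_diff = -1.
 */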
615 
616 /*
617  * ptp_to_pmap: lookup pmap by ptp
618  */
619 
620 static struct pmap *
621 ptp_to_pmap(struct vm_page *ptp)
622 {
623 	struct pmap *pmap;
624 
625 	if (ptp == NULL) {
626 		return pmap_kernel();
627 	}
628 	pmap = (struct pmap *)ptp->uobject;
629 	KASSERT(pmap != NULL);
630 	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
631 	return pmap;
632 }
633 
634 static inline struct pv_pte *
635 pve_to_pvpte(struct pv_entry *pve)
636 {
637 
638 	KASSERT((void *)&pve->pve_pte == (void *)pve);
639 	return &pve->pve_pte;
640 }
641 
642 static inline struct pv_entry *
643 pvpte_to_pve(struct pv_pte *pvpte)
644 {
645 	struct pv_entry *pve = (void *)pvpte;
646 
647 	KASSERT(pve_to_pvpte(pve) == pvpte);
648 	return pve;
649 }
650 
651 /*
652  * pv_pte_first, pv_pte_next: PV list iterator.
653  */
654 
655 static struct pv_pte *
656 pv_pte_first(struct pmap_page *pp)
657 {
658 
659 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
660 		return &pp->pp_pte;
661 	}
662 	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
663 }
664 
665 static struct pv_pte *
666 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
667 {
668 
669 	KASSERT(pvpte != NULL);
670 	if (pvpte == &pp->pp_pte) {
671 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
672 		return NULL;
673 	}
674 	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
675 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
676 }
677 
678 /*
679  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
680  *		of course the kernel is always loaded
681  */
682 
683 bool
684 pmap_is_curpmap(struct pmap *pmap)
685 {
686 	return((pmap == pmap_kernel()) ||
687 	       (pmap == curcpu()->ci_pmap));
688 }
689 
690 /*
691  *	Add a reference to the specified pmap.
692  */
693 
694 void
695 pmap_reference(struct pmap *pmap)
696 {
697 
698 	atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
699 }
700 
701 /*
702  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
703  *
704  * there are several pmaps involved.  some or all of them might be the same.
705  *
706  *	- the pmap given by the first argument
707  *		our caller wants to access this pmap's PTEs.
708  *
709  *	- pmap_kernel()
710  *		the kernel pmap.  note that it only contains the kernel part
711  *		of the address space which is shared by any pmap.  ie. any
712  *		pmap can be used instead of pmap_kernel() for our purpose.
713  *
714  *	- ci->ci_pmap
715  *		pmap currently loaded on the cpu.
716  *
717  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
718  *		current process' pmap.
719  *
720  * => we lock enough pmaps to keep things locked in
721  * => must be undone with pmap_unmap_ptes before returning
722  */
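/*
 * a minimal caller-side sketch (informal, based on how this pair is
 * used elsewhere in this file):
 *
 *	struct pmap *pmap2;
 *	pd_entry_t *ptes;
 *	pd_entry_t * const *pdes;
 *
 *	kpreempt_disable();
 *	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 *	... read or modify the PTEs via ptes/pdes ...
 *	pmap_unmap_ptes(pmap, pmap2);
 *	kpreempt_enable();
 */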
723 
724 void
725 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
726 	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
727 {
728 	struct pmap *curpmap;
729 	struct cpu_info *ci;
730 	lwp_t *l;
731 
732 	/* The kernel's pmap is always accessible. */
733 	if (pmap == pmap_kernel()) {
734 		*pmap2 = NULL;
735 		*ptepp = PTE_BASE;
736 		*pdeppp = normal_pdes;
737 		return;
738 	}
739 	KASSERT(kpreempt_disabled());
740 
741 	l = curlwp;
742  retry:
743 	mutex_enter(pmap->pm_lock);
744 	ci = curcpu();
745 	curpmap = ci->ci_pmap;
746 	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
747 		/* Our own pmap so just load it: easy. */
748 		if (__predict_false(ci->ci_want_pmapload)) {
749 			mutex_exit(pmap->pm_lock);
750 			pmap_load();
751 			goto retry;
752 		}
753 		KASSERT(pmap == curpmap);
754 	} else if (pmap == curpmap) {
755 		/*
756 		 * Already on the CPU: make it valid.  This is very
757 		 * often the case during exit(), when we have switched
758 		 * to the kernel pmap in order to destroy a user pmap.
759 		 */
760 		pmap_reactivate(pmap);
761 	} else {
762 		/*
763 		 * Toss current pmap from CPU, but keep a reference to it.
764 		 * The reference will be dropped by pmap_unmap_ptes().
765 		 * Can happen if we block during exit().
766 		 */
767 		const cpuid_t cid = cpu_index(ci);
768 
769 		kcpuset_atomic_clear(curpmap->pm_cpus, cid);
770 		kcpuset_atomic_clear(curpmap->pm_kernel_cpus, cid);
771 		ci->ci_pmap = pmap;
772 		ci->ci_tlbstate = TLBSTATE_VALID;
773 		kcpuset_atomic_set(pmap->pm_cpus, cid);
774 		kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
775 		cpu_load_pmap(pmap, curpmap);
776 	}
777 	pmap->pm_ncsw = l->l_ncsw;
778 	*pmap2 = curpmap;
779 	*ptepp = PTE_BASE;
780 
781 #if defined(XEN) && defined(__x86_64__)
782 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
783 	ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
784 	*pdeppp = ci->ci_normal_pdes;
785 #else
786 	*pdeppp = normal_pdes;
787 #endif
788 }
789 
790 /*
791  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
792  */
793 
794 void
795 pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
796 {
797 	struct cpu_info *ci;
798 	struct pmap *mypmap;
799 
800 	KASSERT(kpreempt_disabled());
801 
802 	/* The kernel's pmap is always accessible. */
803 	if (pmap == pmap_kernel()) {
804 		return;
805 	}
806 
807 	ci = curcpu();
808 
809 #if defined(XEN) && defined(__x86_64__)
810 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
811 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
812 #endif
813 
814 	/*
815 	 * We cannot tolerate context switches while mapped in.
816 	 * If it is our own pmap all we have to do is unlock.
817 	 */
818 	KASSERT(pmap->pm_ncsw == curlwp->l_ncsw);
819 	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
820 	if (pmap == mypmap) {
821 		mutex_exit(pmap->pm_lock);
822 		return;
823 	}
824 
825 	/*
826 	 * Mark whatever's on the CPU now as lazy and unlock.
827 	 * If the pmap was already installed, we are done.
828 	 */
829 	ci->ci_tlbstate = TLBSTATE_LAZY;
830 	ci->ci_want_pmapload = (mypmap != pmap_kernel());
831 	mutex_exit(pmap->pm_lock);
832 	if (pmap == pmap2) {
833 		return;
834 	}
835 
836 	/*
837 	 * We installed another pmap on the CPU.  Grab a reference to
838 	 * it and leave in place.  Toss the evicted pmap (can block).
839 	 */
840 	pmap_reference(pmap);
841 	pmap_destroy(pmap2);
842 }
843 
844 
845 inline static void
846 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
847 {
848 
849 #if !defined(__x86_64__)
850 	if (curproc == NULL || curproc->p_vmspace == NULL ||
851 	    pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
852 		return;
853 
854 	if ((opte ^ npte) & PG_X)
855 		pmap_update_pg(va);
856 
857 	/*
858 	 * Executability was removed on the last executable change.
859 	 * Reset the code segment to something conservative and
860 	 * let the trap handler deal with setting the right limit.
861 	 * We can't do that because of locking constraints on the vm map.
862 	 */
863 
864 	if ((opte & PG_X) && (npte & PG_X) == 0 && va == pm->pm_hiexec) {
865 		struct trapframe *tf = curlwp->l_md.md_regs;
866 
867 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
868 		pm->pm_hiexec = I386_MAX_EXE_ADDR;
869 	}
870 #endif /* !defined(__x86_64__) */
871 }
872 
873 #if !defined(__x86_64__)
874 /*
875  * Fixup the code segment to cover all potential executable mappings.
876  * returns 0 if no changes to the code segment were made.
877  */
878 
879 int
880 pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
881 {
882 	struct vm_map_entry *ent;
883 	struct pmap *pm = vm_map_pmap(map);
884 	vaddr_t va = 0;
885 
886 	vm_map_lock_read(map);
887 	for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
888 
889 		/*
890 		 * This entry has greater va than the entries before.
891 		 * We need to make it point to the last page, not past it.
892 		 */
893 
894 		if (ent->protection & VM_PROT_EXECUTE)
895 			va = trunc_page(ent->end) - PAGE_SIZE;
896 	}
897 	vm_map_unlock_read(map);
898 	if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
899 		return (0);
900 
901 	pm->pm_hiexec = va;
902 	if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
903 		tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
904 	} else {
905 		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
906 		return (0);
907 	}
908 	return (1);
909 }
910 #endif /* !defined(__x86_64__) */
911 
912 void
913 pat_init(struct cpu_info *ci)
914 {
915 	uint64_t pat;
916 
917 	if (!(ci->ci_feat_val[0] & CPUID_PAT))
918 		return;
919 
920 	/* We change WT to WC. Leave all other entries the default values. */
921 	pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
922 	      PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
923 	      PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
924 	      PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
925 
926 	wrmsr(MSR_CR_PAT, pat);
927 	cpu_pat_enabled = true;
928 	aprint_debug_dev(ci->ci_dev, "PAT enabled\n");
929 }
930 
931 static pt_entry_t
932 pmap_pat_flags(u_int flags)
933 {
934 	u_int cacheflags = (flags & PMAP_CACHE_MASK);
935 
936 	if (!cpu_pat_enabled) {
937 		switch (cacheflags) {
938 		case PMAP_NOCACHE:
939 		case PMAP_NOCACHE_OVR:
940 			/* results in PGC_UCMINUS on cpus which have
941 			 * the cpuid PAT but PAT "disabled"
942 			 */
943 			return PG_N;
944 		default:
945 			return 0;
946 		}
947 	}
948 
949 	switch (cacheflags) {
950 	case PMAP_NOCACHE:
951 		return PGC_UC;
952 	case PMAP_WRITE_COMBINE:
953 		return PGC_WC;
954 	case PMAP_WRITE_BACK:
955 		return PGC_WB;
956 	case PMAP_NOCACHE_OVR:
957 		return PGC_UCMINUS;
958 	}
959 
960 	return 0;
961 }
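/*
 * example (illustration only): a pmap_kenter_pa() call carrying
 * PMAP_WRITE_COMBINE in its flags gets PGC_WC here when the PAT is
 * enabled, and falls back to 0 (plain write-back) when it is not.
 */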
962 
963 /*
964  * p m a p   k e n t e r   f u n c t i o n s
965  *
966  * functions to quickly enter/remove pages from the kernel address
967  * space.   pmap_kremove is exported to MI kernel.  we make use of
968  * the recursive PTE mappings.
969  */
970 
971 /*
972  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
973  *
974  * => no need to lock anything, assume va is already allocated
975  * => should be faster than normal pmap enter function
976  */
977 
978 void
979 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
980 {
981 	pt_entry_t *pte, opte, npte;
982 
983 	KASSERT(!(prot & ~VM_PROT_ALL));
984 
985 	if (va < VM_MIN_KERNEL_ADDRESS)
986 		pte = vtopte(va);
987 	else
988 		pte = kvtopte(va);
989 #ifdef DOM0OPS
990 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
991 #ifdef DEBUG
992 		printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
993 		    " outside range\n", __func__, pa, va);
994 #endif /* DEBUG */
995 		npte = pa;
996 	} else
997 #endif /* DOM0OPS */
998 		npte = pmap_pa2pte(pa);
999 	npte |= protection_codes[prot] | PG_V | pmap_pg_g;
1000 	npte |= pmap_pat_flags(flags);
1001 	opte = pmap_pte_testset(pte, npte); /* zap! */
1002 
1003 	/*
1004 	 * XXX: make sure we are not dealing with a large page, since the only
1005 	 * large pages created are for the kernel image, and they should never
1006 	 * be kentered.
1007 	 */
1008 	KASSERTMSG(!(opte & PG_PS), "PG_PS va=%#"PRIxVADDR, va);
1009 
1010 	if ((opte & (PG_V | PG_U)) == (PG_V | PG_U)) {
1011 		/* This should not happen. */
1012 		printf_nolog("%s: mapping already present\n", __func__);
1013 		kpreempt_disable();
1014 		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1015 		kpreempt_enable();
1016 	}
1017 }
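/*
 * typical usage sketch (illustrative only): map one page into the
 * kernel, use it, then tear the mapping down again.
 *
 *	pmap_kenter_pa(va, pa, VM_PROT_READ | VM_PROT_WRITE, 0);
 *	... access the page at va ...
 *	pmap_kremove(va, PAGE_SIZE);
 *	pmap_update(pmap_kernel());
 */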
1018 
1019 void
1020 pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
1021 {
1022 	pt_entry_t *pte, npte;
1023 
1024 	KASSERT((prot & ~VM_PROT_ALL) == 0);
1025 	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1026 
1027 #ifdef DOM0OPS
1028 	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1029 		npte = pa;
1030 	} else
1031 #endif
1032 		npte = pmap_pa2pte(pa);
1033 
1035 	npte |= protection_codes[prot] | PG_V;
1036 	pmap_pte_set(pte, npte);
1037 	pmap_pte_flush();
1038 }
1039 
1040 /*
1041  * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
1042  */
1043 void
1044 pmap_emap_sync(bool canload)
1045 {
1046 	struct cpu_info *ci = curcpu();
1047 	struct pmap *pmap;
1048 
1049 	KASSERT(kpreempt_disabled());
1050 	if (__predict_true(ci->ci_want_pmapload && canload)) {
1051 		/*
1052 		 * XXX: Hint for pmap_reactivate(), which might suggest to
1053 		 * not perform TLB flush, if state has not changed.
1054 		 */
1055 		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
1056 		if (__predict_false(pmap == ci->ci_pmap)) {
1057 			kcpuset_atomic_clear(pmap->pm_cpus, cpu_index(ci));
1058 		}
1059 		pmap_load();
1060 		KASSERT(ci->ci_want_pmapload == 0);
1061 	} else {
1062 		tlbflush();
1063 	}
1064 }
1065 
1066 void
1067 pmap_emap_remove(vaddr_t sva, vsize_t len)
1068 {
1069 	pt_entry_t *pte;
1070 	vaddr_t va, eva = sva + len;
1071 
1072 	for (va = sva; va < eva; va += PAGE_SIZE) {
1073 		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
1074 		pmap_pte_set(pte, 0);
1075 	}
1076 
1077 	pmap_pte_flush();
1078 }
1079 
1080 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1081 
1082 #if defined(__x86_64__)
1083 /*
1084  * Change protection for a virtual address. Local for a CPU only, don't
1085  * care about TLB shootdowns.
1086  *
1087  * => must be called with preemption disabled
1088  */
1089 void
1090 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1091 {
1092 	pt_entry_t *pte, opte, npte;
1093 
1094 	KASSERT(kpreempt_disabled());
1095 
1096 	if (va < VM_MIN_KERNEL_ADDRESS)
1097 		pte = vtopte(va);
1098 	else
1099 		pte = kvtopte(va);
1100 
1101 	npte = opte = *pte;
1102 
1103 	if ((prot & VM_PROT_WRITE) != 0)
1104 		npte |= PG_RW;
1105 	else
1106 		npte &= ~PG_RW;
1107 
1108 	if (opte != npte) {
1109 		pmap_pte_set(pte, npte);
1110 		pmap_pte_flush();
1111 		invlpg(va);
1112 	}
1113 }
1114 #endif /* defined(__x86_64__) */
1115 
1116 /*
1117  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1118  *
1119  * => no need to lock anything
1120  * => caller must dispose of any vm_page mapped in the va range
1121  * => note: not an inline function
1122  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1123  * => we assume kernel only unmaps valid addresses and thus don't bother
1124  *    checking the valid bit before doing TLB flushing
1125  * => must be followed by call to pmap_update() before reuse of page
1126  */
1127 
1128 static inline void
1129 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1130 {
1131 	pt_entry_t *pte, opte;
1132 	vaddr_t va, eva;
1133 
1134 	eva = sva + len;
1135 
1136 	kpreempt_disable();
1137 	for (va = sva; va < eva; va += PAGE_SIZE) {
1138 		pte = kvtopte(va);
1139 		opte = pmap_pte_testset(pte, 0); /* zap! */
1140 		if ((opte & (PG_V | PG_U)) == (PG_V | PG_U) && !localonly) {
1141 			pmap_tlb_shootdown(pmap_kernel(), va, opte,
1142 			    TLBSHOOT_KREMOVE);
1143 		}
1144 		KASSERTMSG((opte & PG_PS) == 0,
1145 		    "va %#" PRIxVADDR " is a large page", va);
1146 		KASSERTMSG((opte & PG_PVLIST) == 0,
1147 		    "va %#" PRIxVADDR " is a pv tracked page", va);
1148 	}
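	/*
	 * in the localonly case no shootdowns were queued above, so make
	 * up for it below with a full local flush; tlbflushg() also drops
	 * global (PG_G) kernel entries.
	 */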
1149 	if (localonly) {
1150 		tlbflushg();
1151 	}
1152 	kpreempt_enable();
1153 }
1154 
1155 void
1156 pmap_kremove(vaddr_t sva, vsize_t len)
1157 {
1158 
1159 	pmap_kremove1(sva, len, false);
1160 }
1161 
1162 /*
1163  * pmap_kremove_local: like pmap_kremove(), but only worry about
1164  * TLB invalidations on the current CPU.  this is only intended
1165  * for use while writing kernel crash dumps, either after panic
1166  * or via reboot -d.
1167  */
1168 
1169 void
1170 pmap_kremove_local(vaddr_t sva, vsize_t len)
1171 {
1172 
1173 	pmap_kremove1(sva, len, true);
1174 }
1175 
1176 /*
1177  * p m a p   i n i t   f u n c t i o n s
1178  *
1179  * pmap_bootstrap and pmap_init are called during system startup
1180  * to init the pmap module.   pmap_bootstrap() does a low level
1181  * init just to get things rolling.   pmap_init() finishes the job.
1182  */
1183 
1184 /*
1185  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1186  * This function is to be used before any VM system has been set up.
1187  *
1188  * The va is taken from virtual_avail.
1189  */
1190 static vaddr_t
1191 pmap_bootstrap_valloc(size_t npages)
1192 {
1193 	vaddr_t va = virtual_avail;
1194 	virtual_avail += npages * PAGE_SIZE;
1195 	return va;
1196 }
1197 
1198 /*
1199  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1200  * This function is to be used before any VM system has been set up.
1201  *
1202  * The pa is taken from avail_start.
1203  */
1204 static paddr_t
1205 pmap_bootstrap_palloc(size_t npages)
1206 {
1207 	paddr_t pa = avail_start;
1208 	avail_start += npages * PAGE_SIZE;
1209 	return pa;
1210 }
1211 
1212 /*
1213  * pmap_bootstrap: get the system in a state where it can run with VM properly
1214  * enabled (called before main()). The VM system is fully init'd later.
1215  *
1216  * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1217  *    kernel, and nkpde PTP's for the kernel.
1218  * => kva_start is the first free virtual address in kernel space.
1219  */
1220 void
1221 pmap_bootstrap(vaddr_t kva_start)
1222 {
1223 	struct pmap *kpm;
1224 	int i;
1225 	vaddr_t kva;
1226 
1227 	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PG_NX : 0);
1228 
1229 	/*
1230 	 * Set up our local static global vars that keep track of the usage of
1231 	 * KVM before kernel_map is set up.
1232 	 */
1233 	virtual_avail = kva_start;		/* first free KVA */
1234 	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */
1235 
1236 	/*
1237 	 * Set up protection_codes: we need to be able to convert from a MI
1238 	 * protection code (some combo of VM_PROT...) to something we can jam
1239 	 * into a x86 PTE.
1240 	 */
1241 	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1242 	protection_codes[VM_PROT_EXECUTE] = PG_RO | PG_X;
1243 	protection_codes[VM_PROT_READ] = PG_RO | pmap_pg_nx;
1244 	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PG_RO | PG_X;
1245 	protection_codes[VM_PROT_WRITE] = PG_RW | pmap_pg_nx;
1246 	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PG_RW | PG_X;
1247 	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PG_RW | pmap_pg_nx;
1248 	protection_codes[VM_PROT_ALL] = PG_RW | PG_X;
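	/*
	 * example (illustration only): on a CPU with NX support a
	 * VM_PROT_READ mapping therefore gets PG_RO | PG_NX, so an
	 * instruction fetch from it will fault.
	 */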
1249 
1250 	/*
1251 	 * Now we init the kernel's pmap.
1252 	 *
1253 	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1254 	 * the pm_obj contains the list of active PTPs.
1255 	 *
1256 	 * The pm_obj currently does not have a pager. It might be possible to
1257 	 * add a pager that would allow a process to read-only mmap its own page
1258 	 * tables (fast user-level vtophys?). This may or may not be useful.
1259 	 */
1260 	kpm = pmap_kernel();
1261 	for (i = 0; i < PTP_LEVELS - 1; i++) {
1262 		mutex_init(&kpm->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
1263 		uvm_obj_init(&kpm->pm_obj[i], NULL, false, 1);
1264 		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_obj_lock[i]);
1265 		kpm->pm_ptphint[i] = NULL;
1266 	}
1267 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
1268 
1269 	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1270 	for (i = 0; i < PDP_SIZE; i++)
1271 		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1272 
1273 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1274 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1275 
1276 	kcpuset_create(&kpm->pm_cpus, true);
1277 	kcpuset_create(&kpm->pm_kernel_cpus, true);
1278 
1279 	kpm->pm_ldt = NULL;
1280 	kpm->pm_ldt_len = 0;
1281 	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1282 
1283 	/*
1284 	 * the above is just a rough estimate and not critical to the proper
1285 	 * operation of the system.
1286 	 */
1287 
1288 #if !defined(XEN)
1289 	/*
1290 	 * Begin to enable global TLB entries if they are supported.
1291 	 * The G bit has no effect until the CR4_PGE bit is set in CR4,
1292 	 * which happens in cpu_init(), which is run on each cpu
1293 	 * (and happens later)
1294 	 */
1295 	if (cpu_feature[0] & CPUID_PGE) {
1296 		pmap_pg_g = PG_G;		/* enable software */
1297 
1298 		/* add PG_G attribute to already mapped kernel pages */
1299 		pmap_remap_global();
1300 	}
1301 #endif
1302 
1303 #ifndef XEN
1304 	/*
1305 	 * Enable large pages if they are supported.
1306 	 */
1307 	if (cpu_feature[0] & CPUID_PSE) {
1308 		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
1309 		pmap_largepages = 1;	/* enable software */
1310 
1311 		/*
1312 		 * The TLB must be flushed after enabling large pages on Pentium
1313 		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1314 		 * Software Developer's Manual, Volume 3: System Programming".
1315 		 */
1316 		tlbflushg();
1317 
1318 		/* Remap the kernel. */
1319 		pmap_remap_largepages();
1320 	}
1321 	pmap_init_lapic();
1322 #endif /* !XEN */
1323 
1324 #ifdef __HAVE_PCPU_AREA
1325 	pmap_init_pcpu();
1326 #endif
1327 
1328 #ifdef __HAVE_DIRECT_MAP
1329 	pmap_init_directmap(kpm);
1330 #else
1331 	pmap_vpage_cpualloc(&cpu_info_primary);
1332 
1333 	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1334 		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1335 		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1336 	} else { /* amd64 */
1337 		/*
1338 		 * zero_pte is stuck at the end of mapped space for the kernel
1339 		 * image (disjunct from kva space). This is done so that it
1340 		 * can safely be used in pmap_growkernel (pmap_get_physpage),
1341 		 * when it's called for the first time.
1342 		 * XXXfvdl fix this for MULTIPROCESSOR later.
1343 		 */
1344 #ifdef XEN
1345 		/* early_zerop initialized in xen_locore() */
1346 #else
1347 		early_zerop = (void *)bootspace.spareva;
1348 #endif
1349 		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1350 	}
1351 #endif
1352 
1353 #if defined(XEN) && defined(__x86_64__)
1354 	extern vaddr_t xen_dummy_page;
1355 	paddr_t xen_dummy_user_pgd;
1356 
1357 	/*
1358 	 * We want a dummy page directory for Xen: when deactivating a pmap,
1359 	 * Xen will still consider it active. So we set user PGD to this one
1360 	 * to lift all protection on the now inactive page tables set.
1361 	 */
1362 	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1363 
1364 	/* Zero fill it, the less checks in Xen it requires the better */
1365 	memset((void *)(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1366 	/* Mark read-only */
1367 	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1368 	    pmap_pa2pte(xen_dummy_user_pgd) | PG_V | pmap_pg_nx,
1369 	    UVMF_INVLPG);
1370 	/* Pin as L4 */
1371 	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1372 #endif
1373 
1374 	/*
1375 	 * Allocate space for the IDT, GDT and LDT.
1376 	 */
1377 #ifdef __HAVE_PCPU_AREA
1378 	idt_vaddr = (vaddr_t)&pcpuarea->idt;
1379 #else
1380 	idt_vaddr = pmap_bootstrap_valloc(1);
1381 #endif
1382 	idt_paddr = pmap_bootstrap_palloc(1);
1383 
1384 	gdt_vaddr = pmap_bootstrap_valloc(1);
1385 	gdt_paddr = pmap_bootstrap_palloc(1);
1386 
1387 #ifdef __HAVE_PCPU_AREA
1388 	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1389 #else
1390 	ldt_vaddr = pmap_bootstrap_valloc(1);
1391 #endif
1392 	ldt_paddr = pmap_bootstrap_palloc(1);
1393 
1394 #if !defined(__x86_64__) && !defined(XEN)
1395 	/* pentium f00f bug stuff */
1396 	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1397 #endif
1398 
1399 	/*
1400 	 * Now we reserve some VM for mapping pages when doing a crash dump.
1401 	 */
1402 	virtual_avail = reserve_dumppages(virtual_avail);
1403 
1404 	/*
1405 	 * Init the static-global locks and global lists.
1406 	 *
1407 	 * => pventry::pvh_lock (initialized elsewhere) must also be
1408 	 *      a spin lock, again at IPL_VM to prevent deadlock, and
1409 	 *	again is never taken from interrupt context.
1410 	 */
1411 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1412 	LIST_INIT(&pmaps);
1413 
1414 	/*
1415 	 * Ensure the TLB is sync'd with reality by flushing it...
1416 	 */
1417 	tlbflushg();
1418 
1419 	/*
1420 	 * Calculate pmap_maxkvaddr from nkptp[].
1421 	 */
1422 	kva = VM_MIN_KERNEL_ADDRESS;
1423 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
1424 		kva += nkptp[i] * nbpd[i];
1425 	}
1426 	pmap_maxkvaddr = kva;
1427 }
1428 
1429 #ifndef XEN
1430 static void
1431 pmap_init_lapic(void)
1432 {
1433 	/*
1434 	 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1435 	 * x86 implementation relies a lot on this address to be valid; so just
1436 	 * allocate a fake physical page that will be kentered into
1437 	 * local_apic_va by machdep.
1438 	 *
1439 	 * If the LAPIC is present, the va will be remapped somewhere else
1440 	 * later in lapic_map.
1441 	 */
1442 	local_apic_va = pmap_bootstrap_valloc(1);
1443 	local_apic_pa = pmap_bootstrap_palloc(1);
1444 }
1445 #endif
1446 
1447 #if defined(__HAVE_PCPU_AREA) || defined(__HAVE_DIRECT_MAP)
1448 static size_t
1449 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1450 {
1451 	size_t npages;
1452 	npages = (roundup(endva, pgsz) / pgsz) -
1453 	    (rounddown(startva, pgsz) / pgsz);
1454 	return npages;
1455 }
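/*
 * example (illustration only): with pgsz = NBPD_L2 (2MB), the range
 * [0x1ff000, 0x401000) rounds out to [0, 0x600000) and so needs 3 L2
 * entries, even though it spans barely more than one large page.
 */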
1456 #endif
1457 
1458 #ifdef __HAVE_PCPU_AREA
1459 static void
1460 pmap_init_pcpu(void)
1461 {
1462 	const vaddr_t startva = PMAP_PCPU_BASE;
1463 	size_t nL4e, nL3e, nL2e, nL1e;
1464 	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1465 	paddr_t pa;
1466 	vaddr_t endva;
1467 	vaddr_t tmpva;
1468 	pt_entry_t *pte;
1469 	size_t size;
1470 	int i;
1471 
1472 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1473 
1474 	size = sizeof(struct pcpu_area);
1475 
1476 	endva = startva + size;
1477 
1478 	/* We will use this temporary va. */
1479 	tmpva = bootspace.spareva;
1480 	pte = PTE_BASE + pl1_i(tmpva);
1481 
1482 	/* Build L4 */
1483 	L4e_idx = pl4_i(startva);
1484 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1485 	KASSERT(nL4e  == 1);
1486 	for (i = 0; i < nL4e; i++) {
1487 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1488 
1489 		pa = pmap_bootstrap_palloc(1);
1490 		*pte = (pa & PG_FRAME) | pteflags;
1491 		pmap_update_pg(tmpva);
1492 		memset((void *)tmpva, 0, PAGE_SIZE);
1493 
1494 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
1495 	}
1496 
1497 	/* Build L3 */
1498 	L3e_idx = pl3_i(startva);
1499 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1500 	for (i = 0; i < nL3e; i++) {
1501 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1502 
1503 		pa = pmap_bootstrap_palloc(1);
1504 		*pte = (pa & PG_FRAME) | pteflags;
1505 		pmap_update_pg(tmpva);
1506 		memset((void *)tmpva, 0, PAGE_SIZE);
1507 
1508 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
1509 	}
1510 
1511 	/* Build L2 */
1512 	L2e_idx = pl2_i(startva);
1513 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1514 	for (i = 0; i < nL2e; i++) {
1515 
1516 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1517 
1518 		pa = pmap_bootstrap_palloc(1);
1519 		*pte = (pa & PG_FRAME) | pteflags;
1520 		pmap_update_pg(tmpva);
1521 		memset((void *)tmpva, 0, PAGE_SIZE);
1522 
1523 		L2_BASE[L2e_idx+i] = pa | pteflags | PG_U;
1524 	}
1525 
1526 	/* Build L1 */
1527 	L1e_idx = pl1_i(startva);
1528 	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1529 	for (i = 0; i < nL1e; i++) {
1530 		/*
1531 		 * Nothing to do, the PTEs will be entered via
1532 		 * pmap_kenter_pa.
1533 		 */
1534 		KASSERT(L1_BASE[L1e_idx+i] == 0);
1535 	}
1536 
1537 	*pte = 0;
1538 	pmap_update_pg(tmpva);
1539 
1540 	pcpuarea = (struct pcpu_area *)startva;
1541 
1542 	tlbflush();
1543 }
1544 #endif
1545 
1546 #ifdef __HAVE_DIRECT_MAP
1547 /*
1548  * Create the amd64 direct map. Called only once at boot time. We map all of
1549  * the physical memory contiguously using 2MB large pages, with RW permissions.
1550  * However there is a hole: the kernel is mapped with RO permissions.
1551  */
1552 static void
1553 pmap_init_directmap(struct pmap *kpm)
1554 {
1555 	extern phys_ram_seg_t mem_clusters[];
1556 	extern int mem_cluster_cnt;
1557 
1558 	const vaddr_t startva = PMAP_DIRECT_DEFAULT_BASE;
1559 	size_t nL4e, nL3e, nL2e;
1560 	size_t L4e_idx, L3e_idx, L2e_idx;
1561 	size_t spahole, epahole;
1562 	paddr_t lastpa, pa;
1563 	vaddr_t endva;
1564 	vaddr_t tmpva;
1565 	pt_entry_t *pte;
1566 	phys_ram_seg_t *mc;
1567 	int i;
1568 
1569 	const pd_entry_t pteflags = PG_V | PG_KW | pmap_pg_nx;
1570 	const pd_entry_t holepteflags = PG_V | pmap_pg_nx;
1571 
1572 	CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1573 
1574 	spahole = roundup(bootspace.head.pa, NBPD_L2);
1575 	epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1576 
1577 	/* Get the last physical address available */
1578 	lastpa = 0;
1579 	for (i = 0; i < mem_cluster_cnt; i++) {
1580 		mc = &mem_clusters[i];
1581 		lastpa = MAX(lastpa, mc->start + mc->size);
1582 	}
1583 
1584 	/*
1585 	 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1586 	 */
1587 	if (lastpa > MAXPHYSMEM) {
1588 		panic("pmap_init_directmap: lastpa incorrect");
1589 	}
1590 	endva = startva + lastpa;
1591 
1592 	/* We will use this temporary va. */
1593 	tmpva = bootspace.spareva;
1594 	pte = PTE_BASE + pl1_i(tmpva);
1595 
1596 	/* Build L4 */
1597 	L4e_idx = pl4_i(startva);
1598 	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1599 	KASSERT(nL4e <= NL4_SLOT_DIRECT);
1600 	for (i = 0; i < nL4e; i++) {
1601 		KASSERT(L4_BASE[L4e_idx+i] == 0);
1602 
1603 		pa = pmap_bootstrap_palloc(1);
1604 		*pte = (pa & PG_FRAME) | pteflags;
1605 		pmap_update_pg(tmpva);
1606 		memset((void *)tmpva, 0, PAGE_SIZE);
1607 
1608 		L4_BASE[L4e_idx+i] = pa | pteflags | PG_U;
1609 	}
1610 
1611 	/* Build L3 */
1612 	L3e_idx = pl3_i(startva);
1613 	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1614 	for (i = 0; i < nL3e; i++) {
1615 		KASSERT(L3_BASE[L3e_idx+i] == 0);
1616 
1617 		pa = pmap_bootstrap_palloc(1);
1618 		*pte = (pa & PG_FRAME) | pteflags;
1619 		pmap_update_pg(tmpva);
1620 		memset((void *)tmpva, 0, PAGE_SIZE);
1621 
1622 		L3_BASE[L3e_idx+i] = pa | pteflags | PG_U;
1623 	}
1624 
1625 	/* Build L2 */
1626 	L2e_idx = pl2_i(startva);
1627 	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1628 	for (i = 0; i < nL2e; i++) {
1629 		KASSERT(L2_BASE[L2e_idx+i] == 0);
1630 
1631 		pa = (paddr_t)(i * NBPD_L2);
1632 
1633 		if (spahole <= pa && pa < epahole) {
1634 			L2_BASE[L2e_idx+i] = pa | holepteflags | PG_U |
1635 			    PG_PS | pmap_pg_g;
1636 		} else {
1637 			L2_BASE[L2e_idx+i] = pa | pteflags | PG_U |
1638 			    PG_PS | pmap_pg_g;
1639 		}
1640 	}
1641 
1642 	*pte = 0;
1643 	pmap_update_pg(tmpva);
1644 
1645 	pmap_direct_base = startva;
1646 	pmap_direct_end = endva;
1647 	pmap_direct_pdpe = L4e_idx;
1648 	pmap_direct_npdp = nL4e;
1649 
1650 	tlbflush();
1651 }
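/*
 * note (informal): given the tables built above, translating between a
 * physical address and its direct-map virtual address is a plain offset,
 * va = pmap_direct_base + pa, since each 2MB chunk of physical memory is
 * mapped 2MB-aligned starting at pmap_direct_base.
 */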
1652 #endif /* __HAVE_DIRECT_MAP */
1653 
1654 #if !defined(XEN)
1655 /*
1656  * Remap all of the virtual pages created so far with the PG_G bit.
1657  */
1658 static void
1659 pmap_remap_global(void)
1660 {
1661 	vaddr_t kva, kva_end;
1662 	unsigned long p1i;
1663 	size_t i;
1664 
1665 	/* head */
1666 	kva = bootspace.head.va;
1667 	kva_end = kva + bootspace.head.sz;
1668 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1669 		p1i = pl1_i(kva);
1670 		if (pmap_valid_entry(PTE_BASE[p1i]))
1671 			PTE_BASE[p1i] |= pmap_pg_g;
1672 	}
1673 
1674 	/* kernel segments */
1675 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1676 		if (bootspace.segs[i].type == BTSEG_NONE) {
1677 			continue;
1678 		}
1679 		kva = bootspace.segs[i].va;
1680 		kva_end = kva + bootspace.segs[i].sz;
1681 		for ( ; kva < kva_end; kva += PAGE_SIZE) {
1682 			p1i = pl1_i(kva);
1683 			if (pmap_valid_entry(PTE_BASE[p1i]))
1684 				PTE_BASE[p1i] |= pmap_pg_g;
1685 		}
1686 	}
1687 
1688 	/* boot space */
1689 	kva = bootspace.boot.va;
1690 	kva_end = kva + bootspace.boot.sz;
1691 	for ( ; kva < kva_end; kva += PAGE_SIZE) {
1692 		p1i = pl1_i(kva);
1693 		if (pmap_valid_entry(PTE_BASE[p1i]))
1694 			PTE_BASE[p1i] |= pmap_pg_g;
1695 	}
1696 }
1697 #endif
1698 
1699 #ifndef XEN
1700 /*
1701  * Remap several kernel segments with large pages. We cover as many pages as we
1702  * can. Called only once at boot time, if the CPU supports large pages.
1703  */
1704 static void
1705 pmap_remap_largepages(void)
1706 {
1707 	pd_entry_t *pde;
1708 	vaddr_t kva, kva_end;
1709 	paddr_t pa;
1710 	size_t i;
1711 
1712 	/* Remap the kernel text using large pages. */
1713 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1714 		if (bootspace.segs[i].type != BTSEG_TEXT) {
1715 			continue;
1716 		}
1717 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1718 		if (kva < bootspace.segs[i].va) {
1719 			continue;
1720 		}
1721 		kva_end = rounddown(bootspace.segs[i].va +
1722 			bootspace.segs[i].sz, NBPD_L2);
1723 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1724 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1725 			pde = &L2_BASE[pl2_i(kva)];
1726 			*pde = pa | pmap_pg_g | PG_PS | PG_KR | PG_V;
1727 			tlbflushg();
1728 		}
1729 	}
1730 
1731 	/* Remap the kernel rodata using large pages. */
1732 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1733 		if (bootspace.segs[i].type != BTSEG_RODATA) {
1734 			continue;
1735 		}
1736 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1737 		if (kva < bootspace.segs[i].va) {
1738 			continue;
1739 		}
1740 		kva_end = rounddown(bootspace.segs[i].va +
1741 			bootspace.segs[i].sz, NBPD_L2);
1742 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1743 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1744 			pde = &L2_BASE[pl2_i(kva)];
1745 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KR | PG_V;
1746 			tlbflushg();
1747 		}
1748 	}
1749 
1750 	/* Remap the kernel data+bss using large pages. */
1751 	for (i = 0; i < BTSPACE_NSEGS; i++) {
1752 		if (bootspace.segs[i].type != BTSEG_DATA) {
1753 			continue;
1754 		}
1755 		kva = roundup(bootspace.segs[i].va, NBPD_L2);
1756 		if (kva < bootspace.segs[i].va) {
1757 			continue;
1758 		}
1759 		kva_end = rounddown(bootspace.segs[i].va +
1760 			bootspace.segs[i].sz, NBPD_L2);
1761 		pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1762 		for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1763 			pde = &L2_BASE[pl2_i(kva)];
1764 			*pde = pa | pmap_pg_g | PG_PS | pmap_pg_nx | PG_KW | PG_V;
1765 			tlbflushg();
1766 		}
1767 	}
1768 }
1769 #endif /* !XEN */
1770 
1771 /*
1772  * pmap_init: called from uvm_init, our job is to get the pmap
1773  * system ready to manage mappings...
1774  */
1775 
1776 void
1777 pmap_init(void)
1778 {
1779 	int i, flags;
1780 
1781 	for (i = 0; i < PV_HASH_SIZE; i++) {
1782 		SLIST_INIT(&pv_hash_heads[i].hh_list);
1783 	}
1784 	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
1785 		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
1786 	}
1787 
1788 	/*
1789 	 * initialize caches.
1790 	 */
1791 
1792 	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
1793 	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
1794 
1795 #ifdef XEN
1796 	/*
1797 	 * pool_cache(9) should not touch cached objects, since they
1798 	 * are pinned on xen and R/O for the domU
1799 	 */
1800 	flags = PR_NOTOUCH;
1801 #else /* XEN */
1802 	flags = 0;
1803 #endif /* XEN */
1804 #ifdef PAE
1805 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1806 	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
1807 	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1808 #else /* PAE */
1809 	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
1810 	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
1811 #endif /* PAE */
1812 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
1813 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
1814 	    NULL, NULL);
1815 
1816 	pmap_tlb_init();
1817 
1818 	/* XXX: Since cpu_hatch() is only for secondary CPUs. */
1819 	pmap_tlb_cpu_init(curcpu());
1820 
1821 	evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1822 	    NULL, "x86", "io bitmap copy");
1823 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1824 	    NULL, "x86", "ldt sync");
1825 
1826 	/*
1827 	 * done: pmap module is up (and ready for business)
1828 	 */
1829 
1830 	pmap_initialized = true;
1831 }
1832 
1833 /*
1834  * pmap_cpu_init_late: perform late per-CPU initialization.
1835  */
1836 
1837 #ifndef XEN
1838 void
1839 pmap_cpu_init_late(struct cpu_info *ci)
1840 {
1841 	/*
1842 	 * The BP already has its own PD page, allocated during early
1843 	 * MD startup.
1844 	 */
1845 	if (ci == &cpu_info_primary)
1846 		return;
1847 
1848 #ifdef PAE
1849 	cpu_alloc_l3_page(ci);
1850 #endif
1851 }
1852 #endif
1853 
1854 #ifndef __HAVE_DIRECT_MAP
1855 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1856 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1857 
1858 static void
1859 pmap_vpage_cpualloc(struct cpu_info *ci)
1860 {
1861 	bool primary = (ci == &cpu_info_primary);
1862 	size_t i, npages;
1863 	vaddr_t vabase;
1864 	vsize_t vrange;
1865 
1866 	npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
1867 	KASSERT(npages >= VPAGE_MAX);
1868 	vrange = npages * PAGE_SIZE;
1869 
1870 	if (primary) {
1871 		while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
1872 			/* Waste some pages to align properly */
1873 		}
1874 		/* The base is aligned, allocate the rest (contiguous) */
1875 		pmap_bootstrap_valloc(npages - 1);
1876 	} else {
1877 		vabase = uvm_km_alloc(kernel_map, vrange, vrange,
1878 		    UVM_KMF_VAONLY);
1879 		if (vabase == 0) {
1880 			panic("%s: failed to allocate tmp VA for CPU %d\n",
1881 			    __func__, cpu_index(ci));
1882 		}
1883 	}
1884 
1885 	KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
1886 
1887 	for (i = 0; i < VPAGE_MAX; i++) {
1888 		ci->vpage[i] = vabase + i * PAGE_SIZE;
1889 		ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
1890 	}
1891 }
1892 
1893 void
1894 pmap_vpage_cpu_init(struct cpu_info *ci)
1895 {
1896 	if (ci == &cpu_info_primary) {
1897 		/* cpu0 already taken care of in pmap_bootstrap */
1898 		return;
1899 	}
1900 
1901 	pmap_vpage_cpualloc(ci);
1902 }
1903 #endif
1904 
1905 /*
1906  * p v _ e n t r y   f u n c t i o n s
1907  */
1908 
1909 static bool
1910 pmap_pp_needs_pve(struct pmap_page *pp)
1911 {
1912 
1913 	/*
1914 	 * Adding a pv entry for this page only needs to allocate a pv_entry
1915 	 * structure if the page already has at least one pv entry,
1916 	 * since the first pv entry is stored in the pmap_page.
1917 	 */
1918 
1919 	return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 ||
1920 	    !LIST_EMPTY(&pp->pp_head.pvh_list));
1921 }
1922 
1923 /*
1924  * pmap_free_pvs: free a list of pv_entrys
1925  */
1926 
1927 static void
1928 pmap_free_pvs(struct pv_entry *pve)
1929 {
1930 	struct pv_entry *next;
1931 
1932 	for ( /* null */ ; pve != NULL ; pve = next) {
1933 		next = pve->pve_next;
1934 		pool_cache_put(&pmap_pv_cache, pve);
1935 	}
1936 }
1937 
1938 /*
1939  * main pv_entry manipulation functions:
1940  *   pmap_enter_pv: enter a mapping onto a pv_head list
1941  *   pmap_remove_pv: remove a mapping from a pv_head list
1942  *
1943  * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
1944  *       the pvh before calling
1945  */
1946 
1947 /*
1948  * insert_pv: a helper of pmap_enter_pv
1949  */
1950 
1951 static void
1952 insert_pv(struct pmap_page *pp, struct pv_entry *pve)
1953 {
1954 	struct pv_hash_head *hh;
1955 	kmutex_t *lock;
1956 	u_int hash;
1957 
1958 	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
1959 	lock = pvhash_lock(hash);
1960 	hh = pvhash_head(hash);
1961 	mutex_spin_enter(lock);
1962 	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
1963 	mutex_spin_exit(lock);
1964 
1965 	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
1966 }
1967 
1968 /*
1969  * pmap_enter_pv: enter a mapping onto a pv_head list
1970  *
1971  * => caller should adjust ptp's wire_count before calling
1972  * => caller has preallocated pve and *sparepve for us
1973  */
1974 
1975 static struct pv_entry *
1976 pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
1977     struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
1978 {
1979 
1980 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
1981 	KASSERT(ptp == NULL || ptp->uobject != NULL);
1982 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
1983 
1984 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
1985 		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
1986 			pp->pp_flags |= PP_EMBEDDED;
1987 			pp->pp_pte.pte_ptp = ptp;
1988 			pp->pp_pte.pte_va = va;
1989 
1990 			return pve;
1991 		}
1992 	} else {
1993 		struct pv_entry *pve2;
1994 
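		/*
		 * The first mapping is embedded in the pmap_page; move it
		 * out into the preallocated spare pv_entry so that both
		 * mappings can live on the pv hash and pv list.
		 */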
1995 		pve2 = *sparepve;
1996 		*sparepve = NULL;
1997 
1998 		pve2->pve_pte = pp->pp_pte;
1999 		pp->pp_flags &= ~PP_EMBEDDED;
2000 		LIST_INIT(&pp->pp_head.pvh_list);
2001 		insert_pv(pp, pve2);
2002 	}
2003 
2004 	pve->pve_pte.pte_ptp = ptp;
2005 	pve->pve_pte.pte_va = va;
2006 	insert_pv(pp, pve);
2007 
2008 	return NULL;
2009 }
2010 
2011 /*
2012  * pmap_remove_pv: try to remove a mapping from a pv_list
2013  *
2014  * => caller should adjust ptp's wire_count and free PTP if needed
2015  * => we return the removed pve
2016  */
2017 
2018 static struct pv_entry *
2019 pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
2020 {
2021 	struct pv_hash_head *hh;
2022 	struct pv_entry *pve;
2023 	kmutex_t *lock;
2024 	u_int hash;
2025 
2026 	KASSERT(ptp == NULL || ptp->uobject != NULL);
2027 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2028 
2029 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
2030 		KASSERT(pp->pp_pte.pte_ptp == ptp);
2031 		KASSERT(pp->pp_pte.pte_va == va);
2032 
2033 		pp->pp_flags &= ~PP_EMBEDDED;
2034 		LIST_INIT(&pp->pp_head.pvh_list);
2035 
2036 		return NULL;
2037 	}
2038 
2039 	hash = pvhash_hash(ptp, va);
2040 	lock = pvhash_lock(hash);
2041 	hh = pvhash_head(hash);
2042 	mutex_spin_enter(lock);
2043 	pve = pvhash_remove(hh, ptp, va);
2044 	mutex_spin_exit(lock);
2045 
2046 	LIST_REMOVE(pve, pve_list);
2047 
2048 	return pve;
2049 }
2050 
2051 /*
2052  * p t p   f u n c t i o n s
2053  */
2054 
2055 static inline struct vm_page *
2056 pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
2057 {
2058 	int lidx = level - 1;
2059 	struct vm_page *pg;
2060 
2061 	KASSERT(mutex_owned(pmap->pm_lock));
2062 
2063 	if (pa != (paddr_t)-1 && pmap->pm_ptphint[lidx] &&
2064 	    pa == VM_PAGE_TO_PHYS(pmap->pm_ptphint[lidx])) {
2065 		return (pmap->pm_ptphint[lidx]);
2066 	}
2067 	PMAP_SUBOBJ_LOCK(pmap, lidx);
2068 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], ptp_va2o(va, level));
2069 	PMAP_SUBOBJ_UNLOCK(pmap, lidx);
2070 
2071 	KASSERT(pg == NULL || pg->wire_count >= 1);
2072 	return pg;
2073 }
2074 
2075 static inline void
2076 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2077 {
2078 	lwp_t *l;
2079 	int lidx;
2080 	struct uvm_object *obj;
2081 
2082 	KASSERT(ptp->wire_count == 1);
2083 
2084 	lidx = level - 1;
2085 
2086 	obj = &pmap->pm_obj[lidx];
2087 	pmap_stats_update(pmap, -1, 0);
2088 	if (lidx != 0)
2089 		mutex_enter(obj->vmobjlock);
2090 	if (pmap->pm_ptphint[lidx] == ptp)
2091 		pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
2092 	ptp->wire_count = 0;
2093 	uvm_pagerealloc(ptp, NULL, 0);
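	/*
	 * Defer the actual free: chain the page onto curlwp's gc list;
	 * it is released later via pmap_free_ptps().
	 */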
2094 	l = curlwp;
2095 	KASSERT((l->l_pflag & LP_INTR) == 0);
2096 	VM_PAGE_TO_PP(ptp)->pp_link = l->l_md.md_gc_ptp;
2097 	l->l_md.md_gc_ptp = ptp;
2098 	if (lidx != 0)
2099 		mutex_exit(obj->vmobjlock);
2100 }
2101 
2102 static void
2103 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2104 	      pt_entry_t *ptes, pd_entry_t * const *pdes)
2105 {
2106 	unsigned long index;
2107 	int level;
2108 	vaddr_t invaladdr;
2109 	pd_entry_t opde;
2110 
2111 	KASSERT(pmap != pmap_kernel());
2112 	KASSERT(mutex_owned(pmap->pm_lock));
2113 	KASSERT(kpreempt_disabled());
2114 
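	/*
	 * Starting at the lowest level, zap the PDE that points to this
	 * PTP, shoot down its recursive mapping, and free the page; then
	 * drop the parent PTP's wire count and keep walking up while the
	 * parents themselves become empty.
	 */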
2115 	level = 1;
2116 	do {
2117 		index = pl_i(va, level + 1);
2118 		opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2119 
2120 		/*
2121 		 * On Xen-amd64 or SVS, we need to sync the top level page
2122 		 * directory on each CPU.
2123 		 */
2124 #if defined(XEN) && defined(__x86_64__)
2125 		if (level == PTP_LEVELS - 1) {
2126 			xen_kpm_sync(pmap, index);
2127 		}
2128 #elif defined(SVS)
2129 		if (svs_enabled && level == PTP_LEVELS - 1) {
2130 			svs_pmap_sync(pmap, index);
2131 		}
2132 #endif
2133 
2134 		invaladdr = level == 1 ? (vaddr_t)ptes :
2135 		    (vaddr_t)pdes[level - 2];
2136 		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2137 		    opde, TLBSHOOT_FREE_PTP1);
2138 
2139 #if defined(XEN)
2140 		pmap_tlb_shootnow();
2141 #endif
2142 
2143 		pmap_freepage(pmap, ptp, level);
2144 		if (level < PTP_LEVELS - 1) {
2145 			ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
2146 			ptp->wire_count--;
2147 			if (ptp->wire_count > 1)
2148 				break;
2149 		}
2150 	} while (++level < PTP_LEVELS);
2151 	pmap_pte_flush();
2152 }
2153 
2154 /*
2155  * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2156  *
2157  * => pmap should NOT be pmap_kernel()
2158  * => pmap should be locked
2159  * => preemption should be disabled
2160  */
2161 
2162 static struct vm_page *
2163 pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags)
2164 {
2165 	struct vm_page *ptp;
2166 	struct {
2167 		struct vm_page *pg;
2168 		bool new;
2169 	} pt[PTP_LEVELS + 1];
2170 	int i, aflags;
2171 	unsigned long index;
2172 	pd_entry_t *pva;
2173 	paddr_t pa;
2174 	struct uvm_object *obj;
2175 	voff_t off;
2176 
2177 	KASSERT(pmap != pmap_kernel());
2178 	KASSERT(mutex_owned(pmap->pm_lock));
2179 	KASSERT(kpreempt_disabled());
2180 
2181 	/*
2182 	 * Loop through all page table levels allocating a page
2183 	 * for any level where we don't already have one.
2184 	 */
2185 	memset(pt, 0, sizeof(pt));
2186 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2187 		UVM_PGA_ZERO;
2188 	for (i = PTP_LEVELS; i > 1; i--) {
2189 		obj = &pmap->pm_obj[i - 2];
2190 		off = ptp_va2o(va, i - 1);
2191 
2192 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2193 		pt[i].pg = uvm_pagelookup(obj, off);
2194 		if (pt[i].pg == NULL) {
2195 			pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags);
2196 			pt[i].new = true;
2197 		}
2198 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2199 
2200 		if (pt[i].pg == NULL)
2201 			goto fail;
2202 	}
2203 
2204 	/*
2205 	 * Now that we have all the pages looked up or allocated,
2206 	 * loop through again installing any new ones into the tree.
2207 	 */
2208 	for (i = PTP_LEVELS; i > 1; i--) {
2209 		index = pl_i(va, i);
2210 		pva = pdes[i - 2];
2211 
2212 		if (pmap_valid_entry(pva[index])) {
2213 			KASSERT(!pt[i].new);
2214 			continue;
2215 		}
2216 
2217 		ptp = pt[i].pg;
2218 		ptp->flags &= ~PG_BUSY; /* never busy */
2219 		ptp->wire_count = 1;
2220 		pmap->pm_ptphint[i - 2] = ptp;
2221 		pa = VM_PAGE_TO_PHYS(ptp);
2222 		pmap_pte_set(&pva[index], (pd_entry_t)
2223 		    (pmap_pa2pte(pa) | PG_u | PG_RW | PG_V));
2224 
2225 		/*
2226 		 * On Xen-amd64 or SVS, we need to sync the top level page
2227 		 * directory on each CPU.
2228 		 */
2229 #if defined(XEN) && defined(__x86_64__)
2230 		if (i == PTP_LEVELS) {
2231 			xen_kpm_sync(pmap, index);
2232 		}
2233 #elif defined(SVS)
2234 		if (svs_enabled && i == PTP_LEVELS) {
2235 			svs_pmap_sync(pmap, index);
2236 		}
2237 #endif
2238 
2239 		pmap_pte_flush();
2240 		pmap_stats_update(pmap, 1, 0);
2241 
2242 		/*
2243 		 * If we're not in the top level, increase the
2244 		 * wire count of the parent page.
2245 		 */
2246 		if (i < PTP_LEVELS) {
2247 			pt[i + 1].pg->wire_count++;
2248 		}
2249 	}
2250 	ptp = pt[2].pg;
2251 	KASSERT(ptp != NULL);
2252 	pmap->pm_ptphint[0] = ptp;
2253 	return ptp;
2254 
2255 	/*
2256 	 * Allocation of a ptp failed, free any others that we just allocated.
2257 	 */
2258 fail:
2259 	for (i = PTP_LEVELS; i > 1; i--) {
2260 		if (pt[i].pg == NULL) {
2261 			break;
2262 		}
2263 		if (!pt[i].new) {
2264 			continue;
2265 		}
2266 		obj = &pmap->pm_obj[i - 2];
2267 		PMAP_SUBOBJ_LOCK(pmap, i - 2);
2268 		uvm_pagefree(pt[i].pg);
2269 		PMAP_SUBOBJ_UNLOCK(pmap, i - 2);
2270 	}
2271 	return NULL;
2272 }
2273 
2274 /*
2275  * p m a p   l i f e c y c l e   f u n c t i o n s
2276  */
2277 
2278 /*
2279  * pmap_pdp_ctor: constructor for the PDP cache.
2280  */
2281 static int
2282 pmap_pdp_ctor(void *arg, void *v, int flags)
2283 {
2284 	pd_entry_t *pdir = v;
2285 	paddr_t pdirpa = 0;
2286 	vaddr_t object;
2287 	int i;
2288 
2289 #if !defined(XEN) || !defined(__x86_64__)
2290 	int npde;
2291 #endif
2292 #ifdef XEN
2293 	int s;
2294 #endif
2295 
2296 	/*
2297 	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
2298 	 */
2299 
2300 #if defined(XEN) && defined(__x86_64__)
2301 	/* Fetch the physical address of the page directory */
2302 	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2303 
2304 	/* Zero the area */
2305 	memset(pdir, 0, PAGE_SIZE); /* Xen wants a clean page */
2306 
2307 	/*
2308 	 * This pdir will NEVER be active in kernel mode, so mark
2309 	 * recursive entry invalid.
2310 	 */
2311 	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2312 
2313 	/*
2314 	 * A PDP constructed this way will never be used for the kernel,
2315 	 * hence we don't install kernel mappings on Xen.
2316 	 *
2317 	 * But we need to make pmap_create() happy, so put a dummy
2318 	 * (without PG_V) value at the right place.
2319 	 */
2320 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2321 	     (pd_entry_t)-1 & PG_FRAME;
2322 #else /* XEN && __x86_64__*/
2323 	/* Zero the area */
2324 	memset(pdir, 0, PDIR_SLOT_PTE * sizeof(pd_entry_t));
2325 
2326 	object = (vaddr_t)v;
2327 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2328 		/* Fetch the physical address of the page directory */
2329 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2330 
2331 		/* Put in recursive PDE to map the PTEs */
2332 		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PG_V |
2333 		    pmap_pg_nx;
2334 #ifndef XEN
2335 		pdir[PDIR_SLOT_PTE + i] |= PG_KW;
2336 #endif
2337 	}
2338 
2339 	/* Copy the kernel's top level PDE */
2340 	npde = nkptp[PTP_LEVELS - 1];
2341 
2342 	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2343 	    npde * sizeof(pd_entry_t));
2344 
2345 	/* Zero the rest */
2346 	memset(&pdir[PDIR_SLOT_KERN + npde], 0, (PAGE_SIZE * PDP_SIZE) -
2347 	    (PDIR_SLOT_KERN + npde) * sizeof(pd_entry_t));
2348 
2349 	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2350 		int idx = pl_i(KERNBASE, PTP_LEVELS);
2351 		pdir[idx] = PDP_BASE[idx];
2352 	}
2353 
2354 #ifdef __HAVE_PCPU_AREA
2355 	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2356 #endif
2357 #ifdef __HAVE_DIRECT_MAP
2358 	memcpy(&pdir[pmap_direct_pdpe], &PDP_BASE[pmap_direct_pdpe],
2359 	    pmap_direct_npdp * sizeof(pd_entry_t));
2360 #endif
2361 #endif /* XEN  && __x86_64__*/
2362 
2363 #ifdef XEN
2364 	s = splvm();
2365 	object = (vaddr_t)v;
2366 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2367 	    VM_PROT_READ);
2368 	pmap_update(pmap_kernel());
2369 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2370 		/*
2371 		 * Pin as an L2/L4 page; the page holding the
2372 		 * PDIR_SLOT_PTE entries must be done last.
2373 		 */
2374 #ifdef PAE
2375 		if (i == l2tol3(PDIR_SLOT_PTE))
2376 			continue;
2377 #endif
2378 
2379 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2380 #ifdef __x86_64__
2381 		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2382 #else
2383 		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2384 #endif
2385 	}
2386 #ifdef PAE
2387 	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
2388 	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
2389 	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2390 #endif
2391 	splx(s);
2392 #endif /* XEN */
2393 
2394 	return (0);
2395 }
2396 
2397 /*
2398  * pmap_pdp_dtor: destructor for the PDP cache.
2399  */
2400 
2401 static void
2402 pmap_pdp_dtor(void *arg, void *v)
2403 {
2404 #ifdef XEN
2405 	paddr_t pdirpa = 0;	/* XXX: GCC */
2406 	vaddr_t object = (vaddr_t)v;
2407 	int i;
2408 	int s = splvm();
2409 	pt_entry_t *pte;
2410 
2411 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2412 		/* fetch the physical address of the page directory. */
2413 		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
2414 		/* unpin page table */
2415 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2416 	}
2417 	object = (vaddr_t)v;
2418 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2419 		/* Set page RW again */
2420 		pte = kvtopte(object);
2421 		pmap_pte_set(pte, *pte | PG_RW);
2422 		xen_bcast_invlpg((vaddr_t)object);
2423 	}
2424 	splx(s);
2425 #endif  /* XEN */
2426 }
2427 
2428 #ifdef PAE
2429 
2430 /* pmap_pdp_alloc: Allocate a page for the pdp memory pool. */
2431 
2432 static void *
2433 pmap_pdp_alloc(struct pool *pp, int flags)
2434 {
2435 	return (void *)uvm_km_alloc(kernel_map,
2436 	    PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2437 	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
2438 	    | UVM_KMF_WIRED);
2439 }
2440 
2441 /*
2442  * pmap_pdp_free: free a PDP
2443  */
2444 
2445 static void
2446 pmap_pdp_free(struct pool *pp, void *v)
2447 {
2448 	uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2449 	    UVM_KMF_WIRED);
2450 }
2451 #endif /* PAE */
2452 
2453 /*
2454  * pmap_create: create a pmap object.
2455  */
2456 struct pmap *
2457 pmap_create(void)
2458 {
2459 	struct pmap *pmap;
2460 	int i;
2461 
2462 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2463 
2464 	/* init uvm_object */
2465 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2466 		mutex_init(&pmap->pm_obj_lock[i], MUTEX_DEFAULT, IPL_NONE);
2467 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
2468 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_obj_lock[i]);
2469 		pmap->pm_ptphint[i] = NULL;
2470 	}
2471 	pmap->pm_stats.wired_count = 0;
2472 	/* count the PDP allocd below */
2473 	pmap->pm_stats.resident_count = PDP_SIZE;
2474 #if !defined(__x86_64__)
2475 	pmap->pm_hiexec = 0;
2476 #endif /* !defined(__x86_64__) */
2477 	pmap->pm_flags = 0;
2478 	pmap->pm_gc_ptp = NULL;
2479 
2480 	kcpuset_create(&pmap->pm_cpus, true);
2481 	kcpuset_create(&pmap->pm_kernel_cpus, true);
2482 #ifdef XEN
2483 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2484 #endif
2485 	/* init the LDT */
2486 	pmap->pm_ldt = NULL;
2487 	pmap->pm_ldt_len = 0;
2488 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2489 
2490 	/* allocate PDP */
2491  try_again:
2492 	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
2493 
2494 	mutex_enter(&pmaps_lock);
2495 
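	/*
	 * The PDP we got from the cache may predate a kernel PD growth,
	 * in which case its last kernel slot (for the current nkptp) is
	 * still zero.  Destruct it and fetch a freshly constructed one.
	 */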
2496 	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
2497 		mutex_exit(&pmaps_lock);
2498 		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
2499 		goto try_again;
2500 	}
2501 
2502 	for (i = 0; i < PDP_SIZE; i++)
2503 		pmap->pm_pdirpa[i] =
2504 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2505 
2506 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2507 
2508 	mutex_exit(&pmaps_lock);
2509 
2510 	return (pmap);
2511 }
2512 
2513 /*
2514  * pmap_free_ptps: put a list of ptps back to the freelist.
2515  */
2516 
2517 void
2518 pmap_free_ptps(struct vm_page *empty_ptps)
2519 {
2520 	struct vm_page *ptp;
2521 	struct pmap_page *pp;
2522 
2523 	while ((ptp = empty_ptps) != NULL) {
2524 		pp = VM_PAGE_TO_PP(ptp);
2525 		empty_ptps = pp->pp_link;
2526 		LIST_INIT(&pp->pp_head.pvh_list);
2527 		uvm_pagefree(ptp);
2528 	}
2529 }
2530 
2531 /*
2532  * pmap_check_ptps: verify that none of the pmap's page table objects
2533  * have any pages allocated to them.
2534  */
2535 
2536 static inline void
2537 pmap_check_ptps(struct pmap *pmap)
2538 {
2539 	int i;
2540 
2541 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2542 		KASSERT(pmap->pm_obj[i].uo_npages == 0);
2543 		KASSERT(TAILQ_EMPTY(&pmap->pm_obj[i].memq));
2544 	}
2545 }
2546 
2547 static inline void
2548 pmap_check_inuse(struct pmap *pmap)
2549 {
2550 #ifdef DIAGNOSTIC
2551 	CPU_INFO_ITERATOR cii;
2552 	struct cpu_info *ci;
2553 
2554 	for (CPU_INFO_FOREACH(cii, ci)) {
2555 		if (ci->ci_pmap == pmap)
2556 			panic("destroying pmap being used");
2557 #if defined(XEN) && defined(__x86_64__)
2558 		for (int i = 0; i < PDIR_SLOT_PTE; i++) {
2559 			if (pmap->pm_pdir[i] != 0 &&
2560 			    ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
2561 				printf("pmap_destroy(%p) pmap_kernel %p "
2562 				    "curcpu %d cpu %d ci_pmap %p "
2563 				    "ci->ci_kpm_pdir[%d]=%" PRIx64
2564 				    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
2565 				    pmap, pmap_kernel(), curcpu()->ci_index,
2566 				    ci->ci_index, ci->ci_pmap,
2567 				    i, ci->ci_kpm_pdir[i],
2568 				    i, pmap->pm_pdir[i]);
2569 				panic("%s: used pmap", __func__);
2570 			}
2571 		}
2572 #endif
2573 	}
2574 #endif /* DIAGNOSTIC */
2575 }
2576 
2577 /*
2578  * pmap_destroy: drop reference count on pmap.   free pmap if
2579  *	reference count goes to zero.
2580  */
2581 
2582 void
2583 pmap_destroy(struct pmap *pmap)
2584 {
2585 	lwp_t *l;
2586 	int i;
2587 
2588 	/*
2589 	 * If we have torn down this pmap, process deferred frees and
2590 	 * invalidations.  Free now if the system is low on memory.
2591 	 * Otherwise, free when the pmap is destroyed thus avoiding a
2592 	 * TLB shootdown.
2593 	 */
2594 	l = curlwp;
2595 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
2596 		pmap_check_ptps(pmap);
2597 		if (uvmexp.free < uvmexp.freetarg) {
2598 			pmap_update(pmap);
2599 		} else {
2600 			KASSERT(pmap->pm_gc_ptp == NULL);
2601 			pmap->pm_gc_ptp = l->l_md.md_gc_ptp;
2602 			l->l_md.md_gc_ptp = NULL;
2603 			l->l_md.md_gc_pmap = NULL;
2604 		}
2605 	}
2606 
2607 	/*
2608 	 * drop reference count
2609 	 */
2610 
2611 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
2612 		return;
2613 	}
2614 
2615 	pmap_check_inuse(pmap);
2616 
2617 	/*
2618 	 * Reference count is zero, free pmap resources and then free pmap.
2619 	 * First, remove it from global list of pmaps.
2620 	 */
2621 
2622 	mutex_enter(&pmaps_lock);
2623 	LIST_REMOVE(pmap, pm_list);
2624 	mutex_exit(&pmaps_lock);
2625 
2626 	/*
2627 	 * Process deferred PTP frees.  No TLB shootdown required, as the
2628 	 * PTP pages are no longer visible to any CPU.
2629 	 */
2630 
2631 	pmap_free_ptps(pmap->pm_gc_ptp);
2632 
2633 	pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
2634 
2635 #ifdef USER_LDT
2636 	if (pmap->pm_ldt != NULL) {
2637 		/*
2638 		 * no need to switch the LDT; this address space is gone,
2639 		 * nothing is using it.
2640 		 *
2641 		 * No need to lock the pmap for ldt_free (or anything else),
2642 		 * we're the last one to use it.
2643 		 */
2644 		mutex_enter(&cpu_lock);
2645 		ldt_free(pmap->pm_ldt_sel);
2646 		mutex_exit(&cpu_lock);
2647 		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
2648 		    pmap->pm_ldt_len, UVM_KMF_WIRED);
2649 	}
2650 #endif
2651 
2652 	for (i = 0; i < PTP_LEVELS - 1; i++) {
2653 		uvm_obj_destroy(&pmap->pm_obj[i], false);
2654 		mutex_destroy(&pmap->pm_obj_lock[i]);
2655 	}
2656 	kcpuset_destroy(pmap->pm_cpus);
2657 	kcpuset_destroy(pmap->pm_kernel_cpus);
2658 #ifdef XEN
2659 	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2660 #endif
2661 
2662 	pmap_check_ptps(pmap);
2663 	pool_cache_put(&pmap_cache, pmap);
2664 }
2665 
2666 /*
2667  * pmap_remove_all: pmap is being torn down by the current thread.
2668  * avoid unnecessary invalidations.
2669  */
2670 
2671 void
2672 pmap_remove_all(struct pmap *pmap)
2673 {
2674 	lwp_t *l = curlwp;
2675 
2676 	KASSERT(l->l_md.md_gc_pmap == NULL);
2677 
2678 	l->l_md.md_gc_pmap = pmap;
2679 }
2680 
2681 #if defined(PMAP_FORK)
2682 /*
2683  * pmap_fork: perform any necessary data structure manipulation when
2684  * a VM space is forked.
2685  */
2686 
2687 void
2688 pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
2689 {
2690 #ifdef USER_LDT
2691 	union descriptor *new_ldt;
2692 	size_t len;
2693 	int sel;
2694 
2695 	if (__predict_true(pmap1->pm_ldt == NULL)) {
2696 		return;
2697 	}
2698 
2699 	/*
2700 	 * Copy the LDT into the new process.
2701 	 *
2702 	 * Read pmap1's ldt pointer and length unlocked; if it changes
2703 	 * behind our back we'll retry. This will starve if there's a
2704 	 * stream of LDT changes in another thread but that should not
2705 	 * happen.
2706 	 */
2707 
2708  retry:
2709 	if (pmap1->pm_ldt != NULL) {
2710 		len = pmap1->pm_ldt_len;
2711 		/* Allocate space for the new process's LDT */
2712 		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map, len, 0,
2713 		    UVM_KMF_WIRED);
2714 		if (new_ldt == NULL) {
2715 			printf("WARNING: %s: unable to allocate LDT space\n",
2716 			    __func__);
2717 			return;
2718 		}
2719 		mutex_enter(&cpu_lock);
2720 		/* Get a GDT slot for it */
2721 		sel = ldt_alloc(new_ldt, len);
2722 		if (sel == -1) {
2723 			mutex_exit(&cpu_lock);
2724 			uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2725 			    UVM_KMF_WIRED);
2726 			printf("WARNING: %s: unable to allocate LDT selector\n",
2727 			    __func__);
2728 			return;
2729 		}
2730 	} else {
2731 		/* Wasn't anything there after all. */
2732 		len = -1;
2733 		new_ldt = NULL;
2734 		sel = -1;
2735 		mutex_enter(&cpu_lock);
2736 	}
2737 
2738  	/* If there's still something there now that we have cpu_lock... */
2739  	if (pmap1->pm_ldt != NULL) {
2740 		if (len != pmap1->pm_ldt_len) {
2741 			/* Oops, it changed. Drop what we did and try again */
2742 			if (len != -1) {
2743 				ldt_free(sel);
2744 				uvm_km_free(kernel_map, (vaddr_t)new_ldt,
2745 				    len, UVM_KMF_WIRED);
2746 			}
2747 			mutex_exit(&cpu_lock);
2748 			goto retry;
2749 		}
2750 
2751 		/* Copy the LDT data and install it in pmap2 */
2752 		memcpy(new_ldt, pmap1->pm_ldt, len);
2753 		pmap2->pm_ldt = new_ldt;
2754 		pmap2->pm_ldt_len = pmap1->pm_ldt_len;
2755 		pmap2->pm_ldt_sel = sel;
2756 		len = -1;
2757 	}
2758 
2759 	if (len != -1) {
2760 		/* The LDT vanished in the meantime, so mop up */
2761 		ldt_free(sel);
2762 		mutex_exit(&cpu_lock);
2763 		uvm_km_free(kernel_map, (vaddr_t)new_ldt, len,
2764 		    UVM_KMF_WIRED);
2765 	} else {
2766 		mutex_exit(&cpu_lock);
2767 	}
2768 #endif /* USER_LDT */
2769 }
2770 #endif /* PMAP_FORK */
2771 
2772 #ifdef USER_LDT
2773 
2774 /*
2775  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
2776  * is active, reload LDTR.
2777  */
2778 static void
2779 pmap_ldt_xcall(void *arg1, void *arg2)
2780 {
2781 	struct pmap *pm;
2782 
2783 	kpreempt_disable();
2784 	pm = arg1;
2785 	if (curcpu()->ci_pmap == pm) {
2786 		lldt(pm->pm_ldt_sel);
2787 	}
2788 	kpreempt_enable();
2789 }
2790 
2791 /*
2792  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
2793  * in the new selector on all CPUs.
2794  */
2795 void
2796 pmap_ldt_sync(struct pmap *pm)
2797 {
2798 	uint64_t where;
2799 
2800 	KASSERT(mutex_owned(&cpu_lock));
2801 
2802 	pmap_ldt_evcnt.ev_count++;
2803 	where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
2804 	xc_wait(where);
2805 }
2806 
2807 /*
2808  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
2809  * restore the default.
2810  */
2811 
2812 void
2813 pmap_ldt_cleanup(struct lwp *l)
2814 {
2815 	pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
2816 	union descriptor *dp = NULL;
2817 	size_t len = 0;
2818 	int sel = -1;
2819 
2820 	if (__predict_true(pmap->pm_ldt == NULL)) {
2821 		return;
2822 	}
2823 
2824 	mutex_enter(&cpu_lock);
2825 	if (pmap->pm_ldt != NULL) {
2826 		sel = pmap->pm_ldt_sel;
2827 		dp = pmap->pm_ldt;
2828 		len = pmap->pm_ldt_len;
2829 		pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2830 		pmap->pm_ldt = NULL;
2831 		pmap->pm_ldt_len = 0;
2832 		pmap_ldt_sync(pmap);
2833 		ldt_free(sel);
2834 		uvm_km_free(kernel_map, (vaddr_t)dp, len, UVM_KMF_WIRED);
2835 	}
2836 	mutex_exit(&cpu_lock);
2837 }
2838 #endif /* USER_LDT */
2839 
2840 /*
2841  * pmap_activate: activate a process' pmap
2842  *
2843  * => must be called with kernel preemption disabled
2844  * => if lwp is the curlwp, then set ci_want_pmapload so that
2845  *    actual MMU context switch will be done by pmap_load() later
2846  */
2847 
2848 void
2849 pmap_activate(struct lwp *l)
2850 {
2851 	struct cpu_info *ci;
2852 	struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2853 
2854 	KASSERT(kpreempt_disabled());
2855 
2856 	ci = curcpu();
2857 
2858 	if (l != ci->ci_curlwp)
2859 		return;
2860 
2861 	KASSERT(ci->ci_want_pmapload == 0);
2862 	KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
2863 
2864 	/*
2865 	 * no need to load pmap_kernel(): its mappings are
2866 	 * a subset of every vmspace.
2867 	 */
2868 
2869 	if (pmap == pmap_kernel()) {
2870 		ci->ci_want_pmapload = 0;
2871 		return;
2872 	}
2873 
2874 	ci->ci_want_pmapload = 1;
2875 }
2876 
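/*
 * KASSERT_PDIRPA(pmap): assert that the pmap's top-level page directory
 * is the one this CPU currently has loaded (%cr3 on native, the first
 * PAE L3 entry, or the Xen per-CPU user pgd).
 */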
2877 #if defined(XEN) && defined(__x86_64__)
2878 #define	KASSERT_PDIRPA(pmap) \
2879 	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
2880 	    pmap == pmap_kernel())
2881 #elif defined(PAE)
2882 #define	KASSERT_PDIRPA(pmap) \
2883 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
2884 #elif !defined(XEN)
2885 #define	KASSERT_PDIRPA(pmap) \
2886 	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
2887 #else
2888 #define	KASSERT_PDIRPA(pmap) 	KASSERT(true)	/* nothing to do */
2889 #endif
2890 
2891 /*
2892  * pmap_reactivate: try to regain reference to the pmap.
2893  *
2894  * => Must be called with kernel preemption disabled.
2895  */
2896 
2897 static void
2898 pmap_reactivate(struct pmap *pmap)
2899 {
2900 	struct cpu_info * const ci = curcpu();
2901 	const cpuid_t cid = cpu_index(ci);
2902 
2903 	KASSERT(kpreempt_disabled());
2904 	KASSERT_PDIRPA(pmap);
2905 
2906 	/*
2907 	 * If we still have a lazy reference to this pmap, we can assume
2908 	 * that there was no TLB shootdown for this pmap in the meantime.
2909 	 *
2910 	 * The order of events here is important as we must synchronize
2911 	 * with TLB shootdown interrupts.  Declare interest in invalidations
2912 	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
2913 	 * change only when the state is TLBSTATE_LAZY.
2914 	 */
2915 
2916 	ci->ci_tlbstate = TLBSTATE_VALID;
2917 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
2918 
2919 	if (kcpuset_isset(pmap->pm_cpus, cid)) {
2920 		/* We have the reference, state is valid. */
2921 	} else {
2922 		/*
2923 		 * We must reload the TLB: the pmap's mappings changed while
2924 		 * it was deactivated, so this CPU's TLB may be stale.
2925 		 */
2926 		kcpuset_atomic_set(pmap->pm_cpus, cid);
2927 
2928 		u_int gen = uvm_emap_gen_return();
2929 		tlbflush();
2930 		uvm_emap_update(gen);
2931 	}
2932 }
2933 
2934 /*
2935  * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
2936  * and relevant LDT info.
2937  *
2938  * Ensures that the current process' pmap is loaded on the current CPU's
2939  * MMU and that there are no stale TLB entries.
2940  *
2941  * => The caller should disable kernel preemption or do check-and-retry
2942  *    to prevent a preemption from undoing our efforts.
2943  * => This function may block.
2944  */
2945 void
2946 pmap_load(void)
2947 {
2948 	struct cpu_info *ci;
2949 	struct pmap *pmap, *oldpmap;
2950 	struct lwp *l;
2951 	struct pcb *pcb;
2952 	cpuid_t cid;
2953 	uint64_t ncsw;
2954 
2955 	kpreempt_disable();
2956  retry:
2957 	ci = curcpu();
2958 	if (!ci->ci_want_pmapload) {
2959 		kpreempt_enable();
2960 		return;
2961 	}
2962 	l = ci->ci_curlwp;
2963 	ncsw = l->l_ncsw;
2964 
2965 	/* should be able to take ipis. */
2966 	KASSERT(ci->ci_ilevel < IPL_HIGH);
2967 #ifdef XEN
2968 	/* Check to see if interrupts are enabled (i.e., no events are masked) */
2969 	KASSERT(x86_read_psl() == 0);
2970 #else
2971 	KASSERT((x86_read_psl() & PSL_I) != 0);
2972 #endif
2973 
2974 	KASSERT(l != NULL);
2975 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
2976 	KASSERT(pmap != pmap_kernel());
2977 	oldpmap = ci->ci_pmap;
2978 	pcb = lwp_getpcb(l);
2979 
2980 	if (pmap == oldpmap) {
2981 		pmap_reactivate(pmap);
2982 		ci->ci_want_pmapload = 0;
2983 		kpreempt_enable();
2984 		return;
2985 	}
2986 
2987 	/*
2988 	 * Acquire a reference to the new pmap and perform the switch.
2989 	 */
2990 
2991 	pmap_reference(pmap);
2992 
2993 	cid = cpu_index(ci);
2994 	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
2995 	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
2996 
2997 	KASSERT_PDIRPA(oldpmap);
2998 	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
2999 	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3000 
3001 	/*
3002 	 * Mark the pmap in use by this CPU.  Again, we must synchronize
3003 	 * with TLB shootdown interrupts, so set the state VALID first,
3004 	 * then register us for shootdown events on this pmap.
3005 	 */
3006 	ci->ci_tlbstate = TLBSTATE_VALID;
3007 	kcpuset_atomic_set(pmap->pm_cpus, cid);
3008 	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3009 	ci->ci_pmap = pmap;
3010 
3011 	/*
3012 	 * update tss.  now that we have registered for invalidations
3013 	 * from other CPUs, we're good to load the page tables.
3014 	 */
3015 #ifdef PAE
3016 	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3017 #else
3018 	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3019 #endif
3020 
3021 #ifdef i386
3022 #ifndef XEN
3023 	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3024 	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3025 #endif /* !XEN */
3026 #endif /* i386 */
3027 
3028 	lldt(pmap->pm_ldt_sel);
3029 
3030 	u_int gen = uvm_emap_gen_return();
3031 	cpu_load_pmap(pmap, oldpmap);
3032 	uvm_emap_update(gen);
3033 
3034 	ci->ci_want_pmapload = 0;
3035 
3036 	/*
3037 	 * we're now running with the new pmap.  drop the reference
3038 	 * to the old pmap.  if we block, we need to go around again.
3039 	 */
3040 
3041 	pmap_destroy(oldpmap);
3042 	if (l->l_ncsw != ncsw) {
3043 		goto retry;
3044 	}
3045 
3046 	kpreempt_enable();
3047 }
3048 
3049 /*
3050  * pmap_deactivate: deactivate a process' pmap.
3051  *
3052  * => Must be called with kernel preemption disabled (high IPL is enough).
3053  */
3054 void
3055 pmap_deactivate(struct lwp *l)
3056 {
3057 	struct pmap *pmap;
3058 	struct cpu_info *ci;
3059 
3060 	KASSERT(kpreempt_disabled());
3061 
3062 	if (l != curlwp) {
3063 		return;
3064 	}
3065 
3066 	/*
3067 	 * Wait for pending TLB shootdowns to complete.  Necessary because
3068 	 * TLB shootdown state is per-CPU, and the LWP may be coming off
3069 	 * the CPU before it has a chance to call pmap_update(), e.g. due
3070 	 * to kernel preemption or blocking routine in between.
3071 	 */
3072 	pmap_tlb_shootnow();
3073 
3074 	ci = curcpu();
3075 
3076 	if (ci->ci_want_pmapload) {
3077 		/*
3078 		 * ci_want_pmapload means that our pmap is not loaded on
3079 		 * the CPU, or the TLB might be stale.  note that pmap_kernel()
3080 		 * is always considered loaded.
3081 		 */
3082 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3083 		    != pmap_kernel());
3084 		KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3085 		    != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3086 
3087 		/*
3088 		 * userspace has not been touched.
3089 		 * nothing to do here.
3090 		 */
3091 
3092 		ci->ci_want_pmapload = 0;
3093 		return;
3094 	}
3095 
3096 	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3097 
3098 	if (pmap == pmap_kernel()) {
3099 		return;
3100 	}
3101 
3102 	KASSERT_PDIRPA(pmap);
3103 	KASSERT(ci->ci_pmap == pmap);
3104 
3105 	/*
3106 	 * we aren't interested in TLB invalidations for this pmap,
3107 	 * at least for the time being.
3108 	 */
3109 
3110 	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3111 	ci->ci_tlbstate = TLBSTATE_LAZY;
3112 }
3113 
3114 /*
3115  * end of lifecycle functions
3116  */
3117 
3118 /*
3119  * some misc. functions
3120  */
3121 
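/*
 * pmap_pdes_invalid: walk the page directory hierarchy for 'va'.
 * Returns 0 if every level is valid, storing the lowest-level PDE
 * through 'lastpde'; otherwise returns the level at which the walk
 * hit an invalid entry.
 */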
3122 int
3123 pmap_pdes_invalid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde)
3124 {
3125 	int i;
3126 	unsigned long index;
3127 	pd_entry_t pde;
3128 
3129 	for (i = PTP_LEVELS; i > 1; i--) {
3130 		index = pl_i(va, i);
3131 		pde = pdes[i - 2][index];
3132 		if ((pde & PG_V) == 0)
3133 			return i;
3134 	}
3135 	if (lastpde != NULL)
3136 		*lastpde = pde;
3137 	return 0;
3138 }
3139 
3140 /*
3141  * pmap_extract: extract a PA for the given VA
3142  */
3143 
3144 bool
3145 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3146 {
3147 	pt_entry_t *ptes, pte;
3148 	pd_entry_t pde;
3149 	pd_entry_t * const *pdes;
3150 	struct pmap *pmap2;
3151 	struct cpu_info *ci;
3152 	paddr_t pa;
3153 	lwp_t *l;
3154 	bool hard, rv;
3155 
3156 #ifdef __HAVE_DIRECT_MAP
3157 	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3158 		if (pap != NULL) {
3159 			*pap = va - PMAP_DIRECT_BASE;
3160 		}
3161 		return true;
3162 	}
3163 #endif
3164 
3165 	rv = false;
3166 	pa = 0;
3167 	l = curlwp;
3168 
3169 	kpreempt_disable();
3170 	ci = l->l_cpu;
3171 	if (__predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap) ||
3172 	    pmap == pmap_kernel()) {
3173 		/*
3174 		 * no need to lock, because it's pmap_kernel() or our
3175 		 * own pmap and is active.  if a user pmap, the caller
3176 		 * will hold the vm_map write/read locked and so prevent
3177 		 * entries from disappearing while we are here.  ptps
3178 		 * can disappear via pmap_remove() and pmap_protect(),
3179 		 * but they are called with the vm_map write locked.
3180 		 */
3181 		hard = false;
3182 		ptes = PTE_BASE;
3183 		pdes = normal_pdes;
3184 	} else {
3185 		/* we lose, do it the hard way. */
3186 		hard = true;
3187 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3188 	}
3189 	if (pmap_pdes_valid(va, pdes, &pde)) {
3190 		pte = ptes[pl1_i(va)];
3191 		if (pde & PG_PS) {
3192 			pa = (pde & PG_LGFRAME) | (va & (NBPD_L2 - 1));
3193 			rv = true;
3194 		} else if (__predict_true((pte & PG_V) != 0)) {
3195 			pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3196 			rv = true;
3197 		}
3198 	}
3199 	if (__predict_false(hard)) {
3200 		pmap_unmap_ptes(pmap, pmap2);
3201 	}
3202 	kpreempt_enable();
3203 	if (pap != NULL) {
3204 		*pap = pa;
3205 	}
3206 	return rv;
3207 }
3208 
3209 
3210 /*
3211  * vtophys: virtual address to physical address.  For use by
3212  * machine-dependent code only.
3213  */
3214 
3215 paddr_t
3216 vtophys(vaddr_t va)
3217 {
3218 	paddr_t pa;
3219 
3220 	if (pmap_extract(pmap_kernel(), va, &pa) == true)
3221 		return (pa);
3222 	return (0);
3223 }
3224 
3225 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3226 
3227 #ifdef XEN
3228 
3229 /*
3230  * vtomach: virtual address to machine address.  For use by
3231  * machine-dependent code only.
3232  */
3233 
3234 paddr_t
3235 vtomach(vaddr_t va)
3236 {
3237 	paddr_t pa;
3238 
3239 	if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3240 		return (pa);
3241 	return (0);
3242 }
3243 
3244 #endif /* XEN */
3245 
3246 /*
3247  * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3248  *	determine the bounds of the kernel virtual address space.
3249  */
3250 
3251 void
3252 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3253 {
3254 	*startp = virtual_avail;
3255 	*endp = virtual_end;
3256 }
3257 
3258 /*
3259  * pmap_zero_page: zero a page
3260  */
3261 
3262 void
3263 pmap_zero_page(paddr_t pa)
3264 {
3265 #if defined(__HAVE_DIRECT_MAP)
3266 	pagezero(PMAP_DIRECT_MAP(pa));
3267 #else
3268 #if defined(XEN)
3269 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3270 		xen_pagezero(pa);
		return;
	}
3271 #endif
3272 	struct cpu_info *ci;
3273 	pt_entry_t *zpte;
3274 	vaddr_t zerova;
3275 
3276 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3277 
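	/*
	 * No direct map: temporarily map the page at this CPU's reserved
	 * zeroing VA (VPAGE_ZER).  Preemption stays disabled so we keep
	 * both the CPU and its vpage slot for the duration.
	 */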
3278 	kpreempt_disable();
3279 
3280 	ci = curcpu();
3281 	zerova = ci->vpage[VPAGE_ZER];
3282 	zpte = ci->vpage_pte[VPAGE_ZER];
3283 
3284 	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
3285 
3286 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3287 	pmap_pte_flush();
3288 	pmap_update_pg(zerova);		/* flush TLB */
3289 
3290 	memset((void *)zerova, 0, PAGE_SIZE);
3291 
3292 #if defined(DIAGNOSTIC) || defined(XEN)
3293 	pmap_pte_set(zpte, 0);				/* zap ! */
3294 	pmap_pte_flush();
3295 #endif
3296 
3297 	kpreempt_enable();
3298 #endif /* defined(__HAVE_DIRECT_MAP) */
3299 }
3300 
3301 /*
3302  * pmap_pageidlezero: the same, for the idle loop page zero'er.
3303  * Returns true if the page was zero'd, false if we aborted for
3304  * some reason.
3305  */
3306 
3307 bool
3308 pmap_pageidlezero(paddr_t pa)
3309 {
3310 #ifdef __HAVE_DIRECT_MAP
3311 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3312 	return sse2_idlezero_page((void *)PMAP_DIRECT_MAP(pa));
3313 #else
3314 	struct cpu_info *ci;
3315 	pt_entry_t *zpte;
3316 	vaddr_t zerova;
3317 	bool rv;
3318 
3319 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_M | PG_U;
3320 
3321 	ci = curcpu();
3322 	zerova = ci->vpage[VPAGE_ZER];
3323 	zpte = ci->vpage_pte[VPAGE_ZER];
3324 
3325 	KASSERT(cpu_feature[0] & CPUID_SSE2);
3326 	KASSERT(*zpte == 0);
3327 
3328 	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
3329 	pmap_pte_flush();
3330 	pmap_update_pg(zerova);		/* flush TLB */
3331 
3332 	rv = sse2_idlezero_page((void *)zerova);
3333 
3334 #if defined(DIAGNOSTIC) || defined(XEN)
3335 	pmap_pte_set(zpte, 0);				/* zap ! */
3336 	pmap_pte_flush();
3337 #endif
3338 
3339 	return rv;
3340 #endif
3341 }
3342 
3343 /*
3344  * pmap_copy_page: copy a page
3345  */
3346 
3347 void
3348 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
3349 {
3350 #if defined(__HAVE_DIRECT_MAP)
3351 	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
3352 	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
3353 
3354 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3355 #else
3356 #if defined(XEN)
3357 	if (XEN_VERSION_SUPPORTED(3, 4)) {
3358 		xen_copy_page(srcpa, dstpa);
3359 		return;
3360 	}
3361 #endif
3362 	struct cpu_info *ci;
3363 	pt_entry_t *srcpte, *dstpte;
3364 	vaddr_t srcva, dstva;
3365 
3366 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U;
3367 
3368 	kpreempt_disable();
3369 
3370 	ci = curcpu();
3371 	srcva = ci->vpage[VPAGE_SRC];
3372 	dstva = ci->vpage[VPAGE_DST];
3373 	srcpte = ci->vpage_pte[VPAGE_SRC];
3374 	dstpte = ci->vpage_pte[VPAGE_DST];
3375 
3376 	KASSERT(*srcpte == 0 && *dstpte == 0);
3377 
3378 	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
3379 	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PG_M);
3380 	pmap_pte_flush();
3381 	pmap_update_pg(srcva);
3382 	pmap_update_pg(dstva);
3383 
3384 	memcpy((void *)dstva, (void *)srcva, PAGE_SIZE);
3385 
3386 #if defined(DIAGNOSTIC) || defined(XEN)
3387 	pmap_pte_set(srcpte, 0);
3388 	pmap_pte_set(dstpte, 0);
3389 	pmap_pte_flush();
3390 #endif
3391 
3392 	kpreempt_enable();
3393 #endif /* defined(__HAVE_DIRECT_MAP) */
3394 }
3395 
3396 static pt_entry_t *
3397 pmap_map_ptp(struct vm_page *ptp)
3398 {
3399 #ifdef __HAVE_DIRECT_MAP
3400 	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
3401 #else
3402 	struct cpu_info *ci;
3403 	pt_entry_t *ptppte;
3404 	vaddr_t ptpva;
3405 
3406 	KASSERT(kpreempt_disabled());
3407 
3408 #ifndef XEN
3409 	const pd_entry_t pteflags = PG_V | PG_RW | pmap_pg_nx | PG_U | PG_M;
3410 #else
3411 	const pd_entry_t pteflags = PG_V | pmap_pg_nx | PG_U | PG_M;
3412 #endif
3413 
3414 	ci = curcpu();
3415 	ptpva = ci->vpage[VPAGE_PTP];
3416 	ptppte = ci->vpage_pte[VPAGE_PTP];
3417 
3418 	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
3419 
3420 	pmap_pte_flush();
3421 	pmap_update_pg(ptpva);
3422 
3423 	return (pt_entry_t *)ptpva;
3424 #endif
3425 }
3426 
3427 static void
3428 pmap_unmap_ptp(void)
3429 {
3430 #ifndef __HAVE_DIRECT_MAP
3431 #if defined(DIAGNOSTIC) || defined(XEN)
3432 	struct cpu_info *ci;
3433 	pt_entry_t *pte;
3434 
3435 	KASSERT(kpreempt_disabled());
3436 
3437 	ci = curcpu();
3438 	pte = ci->vpage_pte[VPAGE_PTP];
3439 
3440 	if (*pte != 0) {
3441 		pmap_pte_set(pte, 0);
3442 		pmap_pte_flush();
3443 	}
3444 #endif
3445 #endif
3446 }
3447 
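/*
 * pmap_map_pte: return a pointer to the PTE that maps 'va'.  For the
 * currently loaded pmap this is just the recursive PTE window; for any
 * other pmap the PTP is mapped temporarily (direct map or VPAGE_PTP).
 */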
3448 static pt_entry_t *
3449 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
3450 {
3451 
3452 	KASSERT(kpreempt_disabled());
3453 	if (pmap_is_curpmap(pmap)) {
3454 		return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
3455 	}
3456 	KASSERT(ptp != NULL);
3457 	return pmap_map_ptp(ptp) + pl1_pi(va);
3458 }
3459 
3460 static void
3461 pmap_unmap_pte(void)
3462 {
3463 
3464 	KASSERT(kpreempt_disabled());
3465 
3466 	pmap_unmap_ptp();
3467 }
3468 
3469 /*
3470  * p m a p   r e m o v e   f u n c t i o n s
3471  *
3472  * functions that remove mappings
3473  */
3474 
3475 /*
3476  * pmap_remove_ptes: remove PTEs from a PTP
3477  *
3478  * => caller must hold pmap's lock
3479  * => PTP must be mapped into KVA
3480  * => PTP should be null if pmap == pmap_kernel()
3481  * => must be called with kernel preemption disabled
3482  * => issues TLB shootdowns (via pmap_remove_pte) for the mappings it removes
3483  */
3484 
3485 static void
3486 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
3487 		 vaddr_t startva, vaddr_t endva, struct pv_entry **pv_tofree)
3488 {
3489 	pt_entry_t *pte = (pt_entry_t *)ptpva;
3490 
3491 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3492 	KASSERT(kpreempt_disabled());
3493 
3494 	/*
3495 	 * note that ptpva points to the PTE that maps startva.   this may
3496 	 * or may not be the first PTE in the PTP.
3497 	 *
3498 	 * we loop through the PTP while there are still PTEs to look at
3499 	 * and the wire_count is greater than 1 (because we use the wire_count
3500 	 * to keep track of the number of real PTEs in the PTP).
3501 	 */
3502 	while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
3503 		(void)pmap_remove_pte(pmap, ptp, pte, startva, pv_tofree);
3504 		startva += PAGE_SIZE;
3505 		pte++;
3506 	}
3507 }
3508 
3509 
3510 /*
3511  * pmap_remove_pte: remove a single PTE from a PTP.
3512  *
3513  * => caller must hold pmap's lock
3514  * => PTP must be mapped into KVA
3515  * => PTP should be null if pmap == pmap_kernel()
3516  * => returns true if we removed a mapping
3517  * => must be called with kernel preemption disabled
3518  */
3519 static bool
3520 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3521 		vaddr_t va, struct pv_entry **pv_tofree)
3522 {
3523 	struct pv_entry *pve;
3524 	struct vm_page *pg;
3525 	struct pmap_page *pp;
3526 	pt_entry_t opte;
3527 
3528 	KASSERT(pmap == pmap_kernel() || mutex_owned(pmap->pm_lock));
3529 	KASSERT(kpreempt_disabled());
3530 
3531 	if (!pmap_valid_entry(*pte)) {
3532 		/* VA not mapped. */
3533 		return false;
3534 	}
3535 
3536 	/* Atomically save the old PTE and zap it. */
3537 	opte = pmap_pte_testset(pte, 0);
3538 	if (!pmap_valid_entry(opte)) {
3539 		return false;
3540 	}
3541 
3542 	pmap_exec_account(pmap, va, opte, 0);
3543 	pmap_stats_update_bypte(pmap, 0, opte);
3544 
3545 	if (ptp) {
3546 		/*
3547 		 * Dropping a PTE.  Make sure that the PDE is flushed.
3548 		 */
3549 		ptp->wire_count--;
3550 		if (ptp->wire_count <= 1) {
3551 			opte |= PG_U;
3552 		}
3553 	}
3554 
3555 	if ((opte & PG_U) != 0) {
3556 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
3557 	}
3558 
3559 	/*
3560 	 * If this mapping is not on a pv_head list, we are done.
3561 	 */
3562 	if ((opte & PG_PVLIST) == 0) {
3563 #ifndef DOM0OPS
3564 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3565 		    "managed page without PG_PVLIST for %#"PRIxVADDR, va);
3566 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3567 		    "pv-tracked page without PG_PVLIST for %#"PRIxVADDR, va);
3568 #endif
3569 		return true;
3570 	}
3571 
3572 	if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3573 		KASSERT(uvm_page_locked_p(pg));
3574 		pp = VM_PAGE_TO_PP(pg);
3575 	} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3576 		paddr_t pa = pmap_pte2pa(opte);
3577 		panic("%s: PG_PVLIST with pv-untracked page"
3578 		    " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
3579 		    __func__, va, pa, atop(pa));
3580 	}
3581 
3582 	/* Sync R/M bits. */
3583 	pp->pp_attrs |= opte;
3584 	pve = pmap_remove_pv(pp, ptp, va);
3585 
3586 	if (pve) {
3587 		pve->pve_next = *pv_tofree;
3588 		*pv_tofree = pve;
3589 	}
3590 	return true;
3591 }
3592 
3593 /*
3594  * pmap_remove: mapping removal function.
3595  *
3596  * => caller should not be holding any pmap locks
3597  */
3598 
3599 void
3600 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
3601 {
3602 	pt_entry_t *ptes;
3603 	pd_entry_t pde;
3604 	pd_entry_t * const *pdes;
3605 	struct pv_entry *pv_tofree = NULL;
3606 	bool result;
3607 	int i;
3608 	paddr_t ptppa;
3609 	vaddr_t blkendva, va = sva;
3610 	struct vm_page *ptp;
3611 	struct pmap *pmap2;
3612 
3613 	kpreempt_disable();
3614 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
3615 
3616 	/*
3617 	 * removing one page?  take shortcut function.
3618 	 */
3619 
3620 	if (va + PAGE_SIZE == eva) {
3621 		if (pmap_pdes_valid(va, pdes, &pde)) {
3622 
3623 			/* PA of the PTP */
3624 			ptppa = pmap_pte2pa(pde);
3625 
3626 			/* Get PTP if non-kernel mapping. */
3627 			if (pmap != pmap_kernel()) {
3628 				ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3629 				KASSERTMSG(ptp != NULL,
3630 				    "%s: unmanaged PTP detected", __func__);
3631 			} else {
3632 				/* Never free kernel PTPs. */
3633 				ptp = NULL;
3634 			}
3635 
3636 			result = pmap_remove_pte(pmap, ptp,
3637 			    &ptes[pl1_i(va)], va, &pv_tofree);
3638 
3639 			/*
3640 			 * if mapping removed and the PTP is no longer
3641 			 * being used, free it!
3642 			 */
3643 
3644 			if (result && ptp && ptp->wire_count <= 1)
3645 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3646 		}
3647 	} else for (/* null */ ; va < eva ; va = blkendva) {
3648 		int lvl;
3649 
3650 		/* determine range of block */
3651 		blkendva = x86_round_pdr(va+1);
3652 		if (blkendva > eva)
3653 			blkendva = eva;
3654 
3655 		/*
3656 		 * Our PTE mappings should never be removed with pmap_remove.
3657 		 *
3658 		 * XXXmaxv: still needed?
3659 		 *
3660 		 * A long term solution is to move the PTEs out of user address
3661 		 * space, and into kernel address space. Then we can set
3662 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
3663 		 */
3664 		for (i = 0; i < PDP_SIZE; i++) {
3665 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
3666 				panic("PTE space accessed");
3667 		}
3668 
3669 		lvl = pmap_pdes_invalid(va, pdes, &pde);
3670 		if (lvl != 0) {
3671 			/*
3672 			 * skip a range corresponding to an invalid pde.
3673 			 */
3674 			blkendva = (va & ptp_masks[lvl - 1]) + nbpd[lvl - 1];
3675  			continue;
3676 		}
3677 
3678 		/* PA of the PTP */
3679 		ptppa = pmap_pte2pa(pde);
3680 
3681 		/* Get PTP if non-kernel mapping. */
3682 		if (pmap != pmap_kernel()) {
3683 			ptp = pmap_find_ptp(pmap, va, ptppa, 1);
3684 			KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
3685 			    __func__);
3686 		} else {
3687 			/* Never free kernel PTPs. */
3688 			ptp = NULL;
3689 		}
3690 
3691 		pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
3692 		    blkendva, &pv_tofree);
3693 
3694 		/* if PTP is no longer being used, free it! */
3695 		if (ptp && ptp->wire_count <= 1) {
3696 			pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3697 		}
3698 	}
3699 	pmap_unmap_ptes(pmap, pmap2);		/* unlock pmap */
3700 	kpreempt_enable();
3701 
3702 	/* Now we free unused PVs */
3703 	if (pv_tofree)
3704 		pmap_free_pvs(pv_tofree);
3705 }
3706 
3707 /*
3708  * pmap_sync_pv: clear pte bits and return the old value of the pte.
3709  *
3710  * => Caller should disable kernel preemption.
3711  * => issues tlb shootdowns if necessary.
3712  */
3713 
3714 static int
3715 pmap_sync_pv(struct pv_pte *pvpte, pt_entry_t expect, int clearbits,
3716     pt_entry_t *optep)
3717 {
3718 	struct pmap *pmap;
3719 	struct vm_page *ptp;
3720 	vaddr_t va;
3721 	pt_entry_t *ptep;
3722 	pt_entry_t opte;
3723 	pt_entry_t npte;
3724 	bool need_shootdown;
3725 
3726 	ptp = pvpte->pte_ptp;
3727 	va = pvpte->pte_va;
3728 	KASSERT(ptp == NULL || ptp->uobject != NULL);
3729 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
3730 	pmap = ptp_to_pmap(ptp);
3731 
3732 	KASSERT((expect & ~(PG_FRAME | PG_V)) == 0);
3733 	KASSERT((expect & PG_V) != 0);
3734 	KASSERT(clearbits == ~0 || (clearbits & ~(PG_M | PG_U | PG_RW)) == 0);
3735 	KASSERT(kpreempt_disabled());
3736 
3737 	ptep = pmap_map_pte(pmap, ptp, va);
3738 	do {
3739 		opte = *ptep;
3740 		KASSERT((opte & (PG_M | PG_U)) != PG_M);
3741 		KASSERT((opte & (PG_U | PG_V)) != PG_U);
3742 		KASSERT(opte == 0 || (opte & PG_V) != 0);
3743 		if ((opte & (PG_FRAME | PG_V)) != expect) {
3744 
3745 			/*
3746 			 * we lost a race with a V->P operation like
3747 			 * pmap_remove().  wait for the competitor to finish
3748 			 * reflecting the pte bits into pp_attrs.
3749 			 *
3750 			 * issue a redundant TLB shootdown so that
3751 			 * we can wait for its completion.
3752 			 */
3753 
3754 			pmap_unmap_pte();
3755 			if (clearbits != 0) {
3756 				pmap_tlb_shootdown(pmap, va,
3757 				    (pmap == pmap_kernel() ? PG_G : 0),
3758 				    TLBSHOOT_SYNC_PV1);
3759 			}
3760 			return EAGAIN;
3761 		}
3762 
3763 		/*
3764 		 * check if there's anything to do on this pte.
3765 		 */
3766 
3767 		if ((opte & clearbits) == 0) {
3768 			need_shootdown = false;
3769 			break;
3770 		}
3771 
3772 		/*
3773 		 * we need a shootdown if the pte is cached. (PG_U)
3774 		 *
3775 		 * ...unless we are clearing only the PG_RW bit and
3776 		 * it isn't cached as RW. (PG_M)
3777 		 */
3778 
3779 		need_shootdown = (opte & PG_U) != 0 &&
3780 		    !(clearbits == PG_RW && (opte & PG_M) == 0);
3781 
3782 		npte = opte & ~clearbits;
3783 
3784 		/*
3785 		 * if we need a shootdown anyway, clear PG_U and PG_M.
3786 		 */
3787 
3788 		if (need_shootdown) {
3789 			npte &= ~(PG_U | PG_M);
3790 		}
3791 		KASSERT((npte & (PG_M | PG_U)) != PG_M);
3792 		KASSERT((npte & (PG_U | PG_V)) != PG_U);
3793 		KASSERT(npte == 0 || (opte & PG_V) != 0);
3794 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
3795 
3796 	if (need_shootdown) {
3797 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV2);
3798 	}
3799 	pmap_unmap_pte();
3800 
3801 	*optep = opte;
3802 	return 0;
3803 }
3804 
3805 static void
3806 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
3807 {
3808 	struct pv_pte *pvpte;
3809 	struct pv_entry *killlist = NULL;
3810 	struct vm_page *ptp;
3811 	pt_entry_t expect;
3812 	int count;
3813 
3814 	expect = pmap_pa2pte(pa) | PG_V;
3815 	count = SPINLOCK_BACKOFF_MIN;
3816 	kpreempt_disable();
3817 startover:
3818 	while ((pvpte = pv_pte_first(pp)) != NULL) {
3819 		struct pmap *pmap;
3820 		struct pv_entry *pve;
3821 		pt_entry_t opte;
3822 		vaddr_t va;
3823 		int error;
3824 
3825 		/*
3826 		 * add a reference to the pmap before clearing the pte.
3827 		 * otherwise the pmap can disappear behind us.
3828 		 */
3829 
3830 		ptp = pvpte->pte_ptp;
3831 		pmap = ptp_to_pmap(ptp);
3832 		if (ptp != NULL) {
3833 			pmap_reference(pmap);
3834 		}
3835 
3836 		error = pmap_sync_pv(pvpte, expect, ~0, &opte);
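		/*
		 * EAGAIN means pmap_sync_pv() lost a race with a V->P
		 * operation: drop the pmap reference and the kernel lock,
		 * back off a little, and start over from the head of the
		 * pv list.
		 */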
3837 		if (error == EAGAIN) {
3838 			int hold_count;
3839 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3840 			if (ptp != NULL) {
3841 				pmap_destroy(pmap);
3842 			}
3843 			SPINLOCK_BACKOFF(count);
3844 			KERNEL_LOCK(hold_count, curlwp);
3845 			goto startover;
3846 		}
3847 
3848 		pp->pp_attrs |= opte;
3849 		va = pvpte->pte_va;
3850 		pve = pmap_remove_pv(pp, ptp, va);
3851 
3852 		/* update the PTP reference count.  free if last reference. */
3853 		if (ptp != NULL) {
3854 			struct pmap *pmap2;
3855 			pt_entry_t *ptes;
3856 			pd_entry_t * const *pdes;
3857 
3858 			KASSERT(pmap != pmap_kernel());
3859 
3860 			pmap_tlb_shootnow();
3861 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3862 			pmap_stats_update_bypte(pmap, 0, opte);
3863 			ptp->wire_count--;
3864 			if (ptp->wire_count <= 1) {
3865 				pmap_free_ptp(pmap, ptp, va, ptes, pdes);
3866 			}
3867 			pmap_unmap_ptes(pmap, pmap2);
3868 			pmap_destroy(pmap);
3869 		} else {
3870 			KASSERT(pmap == pmap_kernel());
3871 			pmap_stats_update_bypte(pmap, 0, opte);
3872 		}
3873 
3874 		if (pve != NULL) {
3875 			pve->pve_next = killlist;	/* mark it for death */
3876 			killlist = pve;
3877 		}
3878 	}
3879 	pmap_tlb_shootnow();
3880 	kpreempt_enable();
3881 
3882 	/* Now free unused pvs. */
3883 	pmap_free_pvs(killlist);
3884 }
3885 
3886 /*
3887  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
3888  *
3889  * => R/M bits are sync'd back to attrs
3890  */
3891 
3892 void
3893 pmap_page_remove(struct vm_page *pg)
3894 {
3895 	struct pmap_page *pp;
3896 	paddr_t pa;
3897 
3898 	KASSERT(uvm_page_locked_p(pg));
3899 
3900 	pp = VM_PAGE_TO_PP(pg);
3901 	pa = VM_PAGE_TO_PHYS(pg);
3902 	pmap_pp_remove(pp, pa);
3903 }
3904 
3905 /*
3906  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
3907  *	that map it
3908  */
3909 
3910 void
3911 pmap_pv_remove(paddr_t pa)
3912 {
3913 	struct pmap_page *pp;
3914 
3915 	pp = pmap_pv_tracked(pa);
3916 	if (pp == NULL)
3917 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
3918 	pmap_pp_remove(pp, pa);
3919 }
3920 
3921 /*
3922  * p m a p   a t t r i b u t e  f u n c t i o n s
3923  * functions that test/change managed page's attributes
3924  * since a page can be mapped multiple times we must check each PTE that
3925  * maps it by going down the pv lists.
3926  */
3927 
3928 /*
3929  * pmap_test_attrs: test a page's attributes
3930  */
3931 
3932 bool
3933 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
3934 {
3935 	struct pmap_page *pp;
3936 	struct pv_pte *pvpte;
3937 	pt_entry_t expect;
3938 	u_int result;
3939 
3940 	KASSERT(uvm_page_locked_p(pg));
3941 
3942 	pp = VM_PAGE_TO_PP(pg);
3943 	if ((pp->pp_attrs & testbits) != 0) {
3944 		return true;
3945 	}
3946 	expect = pmap_pa2pte(VM_PAGE_TO_PHYS(pg)) | PG_V;
3947 	kpreempt_disable();
3948 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3949 		pt_entry_t opte;
3950 		int error;
3951 
3952 		if ((pp->pp_attrs & testbits) != 0) {
3953 			break;
3954 		}
3955 		error = pmap_sync_pv(pvpte, expect, 0, &opte);
3956 		if (error == 0) {
3957 			pp->pp_attrs |= opte;
3958 		}
3959 	}
3960 	result = pp->pp_attrs & testbits;
3961 	kpreempt_enable();
3962 
3963 	/*
3964 	 * note that we will exit the for loop with a non-NULL pvpte if
3965 	 * we have found the bits we are testing for.
3966 	 */
3967 
3968 	return result != 0;
3969 }
3970 
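/*
 * pmap_pp_clear_attrs: clear the specified attribute bits on every
 *	mapping of the page described by pp/pa.
 *
 * => common back-end for pmap_clear_attrs() and pmap_pv_clear_attrs()
 * => returns true if any of the requested bits was previously set
 */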
3971 static bool
3972 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
3973 {
3974 	struct pv_pte *pvpte;
3975 	u_int result;
3976 	pt_entry_t expect;
3977 	int count;
3978 
3979 	expect = pmap_pa2pte(pa) | PG_V;
3980 	count = SPINLOCK_BACKOFF_MIN;
3981 	kpreempt_disable();
3982 startover:
3983 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
3984 		pt_entry_t opte;
3985 		int error;
3986 
3987 		error = pmap_sync_pv(pvpte, expect, clearbits, &opte);
3988 		if (error == EAGAIN) {
3989 			int hold_count;
3990 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
3991 			SPINLOCK_BACKOFF(count);
3992 			KERNEL_LOCK(hold_count, curlwp);
3993 			goto startover;
3994 		}
3995 		pp->pp_attrs |= opte;
3996 	}
3997 	result = pp->pp_attrs & clearbits;
3998 	pp->pp_attrs &= ~clearbits;
3999 	pmap_tlb_shootnow();
4000 	kpreempt_enable();
4001 
4002 	return result != 0;
4003 }
4004 
4005 /*
4006  * pmap_clear_attrs: clear the specified attribute for a page.
4007  *
4008  * => we return true if we cleared one of the bits we were asked to
4009  */
4010 
4011 bool
4012 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4013 {
4014 	struct pmap_page *pp;
4015 	paddr_t pa;
4016 
4017 	KASSERT(uvm_page_locked_p(pg));
4018 
4019 	pp = VM_PAGE_TO_PP(pg);
4020 	pa = VM_PAGE_TO_PHYS(pg);
4021 
4022 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4023 }
4024 
4025 /*
4026  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4027  *	pv-tracked page.
4028  */
4029 
4030 bool
4031 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4032 {
4033 	struct pmap_page *pp;
4034 
4035 	pp = pmap_pv_tracked(pa);
4036 	if (pp == NULL)
4037 		panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4038 
4039 	return pmap_pp_clear_attrs(pp, pa, clearbits);
4040 }
4041 
4042 /*
4043  * p m a p   p r o t e c t i o n   f u n c t i o n s
4044  */
4045 
4046 /*
4047  * pmap_page_protect: change the protection of all recorded mappings
4048  *	of a managed page
4049  *
4050  * => NOTE: this is an inline function in pmap.h
4051  */
4052 
4053 /* see pmap.h */
4054 
4055 /*
4056  * pmap_pv_protect: change the protection of all recorded mappings
4057  *	of an unmanaged pv-tracked page
4058  *
4059  * => NOTE: this is an inline function in pmap.h
4060  */
4061 
4062 /* see pmap.h */
4063 
4064 /*
4065  * pmap_protect: set the protection of the pages in a pmap
4066  *
4067  * => NOTE: this is an inline function in pmap.h
4068  */
4069 
4070 /* see pmap.h */
4071 
4072 /*
4073  * pmap_write_protect: write-protect pages in a pmap.
4074  *
4075  * Note for Xen-amd64. Xen automatically adds PG_u to the kernel pages, but we
4076  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4077  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PG_u is
4078  * present the page will still be considered as a kernel page, and the privilege
4079  * present the page will still be considered a kernel page, and the privilege
4080  */
4081 void
4082 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4083 {
4084 	pt_entry_t bit_rem, bit_put;
4085 	pt_entry_t *ptes;
4086 	pd_entry_t * const *pdes;
4087 	struct pmap *pmap2;
4088 	vaddr_t blockend, va;
4089 
4090 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4091 
4092 	bit_rem = 0;
4093 	if (!(prot & VM_PROT_WRITE))
4094 		bit_rem = PG_RW;
4095 
4096 	bit_put = 0;
4097 	if (!(prot & VM_PROT_EXECUTE))
4098 		bit_put = pmap_pg_nx;
4099 
4100 	sva &= PG_FRAME;
4101 	eva &= PG_FRAME;
4102 
4103 	/* Acquire pmap. */
4104 	kpreempt_disable();
4105 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4106 
4107 	for (va = sva ; va < eva; va = blockend) {
4108 		pt_entry_t *spte, *epte;
4109 		int i;
4110 
4111 		blockend = x86_round_pdr(va + 1);
4112 		if (blockend > eva)
4113 			blockend = eva;
4114 
4115 		/*
4116 		 * Our PTE mappings should never be write-protected.
4117 		 *
4118 		 * XXXmaxv: still needed?
4119 		 *
4120 		 * A long term solution is to move the PTEs out of user address
4121 		 * space, and into kernel address space. Then we can set
4122 		 * VM_MAXUSER_ADDRESS to be VM_MAX_ADDRESS.
4123 		 */
4124 		for (i = 0; i < PDP_SIZE; i++) {
4125 			if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE+i)
4126 				panic("PTE space accessed");
4127 		}
4128 
4129 		/* Is it a valid block? */
4130 		if (!pmap_pdes_valid(va, pdes, NULL)) {
4131 			continue;
4132 		}
4133 		KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4134 
4135 		spte = &ptes[pl1_i(va)];
4136 		epte = &ptes[pl1_i(blockend)];
4137 
4138 		for (/* */; spte < epte; spte++) {
4139 			pt_entry_t opte, npte;
4140 
4141 			do {
4142 				opte = *spte;
4143 				if (!pmap_valid_entry(opte)) {
4144 					goto next;
4145 				}
4146 				npte = (opte & ~bit_rem) | bit_put;
4147 			} while (pmap_pte_cas(spte, opte, npte) != opte);
4148 
4149 			if ((opte & PG_M) != 0) {
4150 				vaddr_t tva = x86_ptob(spte - ptes);
4151 				pmap_tlb_shootdown(pmap, tva, opte,
4152 				    TLBSHOOT_WRITE_PROTECT);
4153 			}
4154 next:;
4155 		}
4156 	}
4157 
4158 	/* Release pmap. */
4159 	pmap_unmap_ptes(pmap, pmap2);
4160 	kpreempt_enable();
4161 }
4162 
4163 /*
4164  * pmap_unwire: clear the wired bit in the PTE.
4165  *
4166  * => Mapping should already be present.
4167  */
4168 void
4169 pmap_unwire(struct pmap *pmap, vaddr_t va)
4170 {
4171 	pt_entry_t *ptes, *ptep, opte;
4172 	pd_entry_t * const *pdes;
4173 	struct pmap *pmap2;
4174 
4175 	/* Acquire pmap. */
4176 	kpreempt_disable();
4177 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4178 
4179 	if (!pmap_pdes_valid(va, pdes, NULL)) {
4180 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4181 	}
4182 
4183 	ptep = &ptes[pl1_i(va)];
4184 	opte = *ptep;
4185 	KASSERT(pmap_valid_entry(opte));
4186 
4187 	if (opte & PG_W) {
4188 		pt_entry_t npte = opte & ~PG_W;
4189 
4190 		opte = pmap_pte_testset(ptep, npte);
4191 		pmap_stats_update_bypte(pmap, npte, opte);
4192 	} else {
4193 		printf("%s: wiring for pmap %p va %#" PRIxVADDR
4194 	    " did not change!\n", __func__, pmap, va);
4195 	}
4196 
4197 	/* Release pmap. */
4198 	pmap_unmap_ptes(pmap, pmap2);
4199 	kpreempt_enable();
4200 }
4201 
4202 /*
4203  * pmap_copy: copy mappings from one pmap to another
4204  *
4205  * => optional function
4206  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4207  */
4208 
4209 /*
4210  * defined as macro in pmap.h
4211  */
4212 
4213 __strict_weak_alias(pmap_enter, pmap_enter_default);
4214 
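/*
 * pmap_enter_default: the default pmap_enter(), used when no machine
 *	dependent override is in place: the machine address is simply the
 *	physical address, and no foreign domain is involved.
 */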
4215 int
4216 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4217     u_int flags)
4218 {
4219 	return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4220 }
4221 
4222 /*
4223  * pmap_enter: enter a mapping into a pmap
4224  *
4225  * => must be done "now" ... no lazy-evaluation
4226  * => we set pmap => pv_head locking
4227  */
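/*
 * pmap_enter_ma: the machine-address flavour of the above: 'ma' is what
 *	goes into the PTE while 'pa' identifies the backing page for pv
 *	tracking; 'domid' names the Xen domain for foreign mappings.
 */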
4228 int
4229 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4230 	   vm_prot_t prot, u_int flags, int domid)
4231 {
4232 	pt_entry_t *ptes, opte, npte;
4233 	pt_entry_t *ptep;
4234 	pd_entry_t * const *pdes;
4235 	struct vm_page *ptp;
4236 	struct vm_page *new_pg, *old_pg;
4237 	struct pmap_page *new_pp, *old_pp;
4238 	struct pv_entry *old_pve = NULL;
4239 	struct pv_entry *new_pve;
4240 	struct pv_entry *new_sparepve;
4241 	int error;
4242 	bool wired = (flags & PMAP_WIRED) != 0;
4243 	struct pmap *pmap2;
4244 
4245 	KASSERT(pmap_initialized);
4246 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
4247 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
4248 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
4249 	    PRIxVADDR " over PDP!", __func__, va);
4250 	KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
4251 	    pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
4252 	    "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
4253 
4254 #ifdef XEN
4255 	KASSERT(domid == DOMID_SELF || pa == 0);
4256 #endif /* XEN */
4257 
4258 	npte = ma | protection_codes[prot] | PG_V;
4259 	npte |= pmap_pat_flags(flags);
4260 	if (wired)
4261 	        npte |= PG_W;
4262 	if (va < VM_MAXUSER_ADDRESS)
4263 		npte |= PG_u;
4264 	else if (va < VM_MAX_ADDRESS)
4265 		panic("PTE space accessed");	/* XXXmaxv: no longer needed? */
4266 
4267 	if (pmap == pmap_kernel())
4268 		npte |= pmap_pg_g;
4269 	if (flags & VM_PROT_ALL) {
4270 		npte |= PG_U;
4271 		if (flags & VM_PROT_WRITE) {
4272 			KASSERT((npte & PG_RW) != 0);
4273 			npte |= PG_M;
4274 		}
4275 	}
4276 
4277 #ifdef XEN
4278 	if (domid != DOMID_SELF)
4279 		new_pg = NULL;
4280 	else
4281 #endif
4282 		new_pg = PHYS_TO_VM_PAGE(pa);
4283 	if (new_pg != NULL) {
4284 		/* This is a managed page */
4285 		npte |= PG_PVLIST;
4286 		new_pp = VM_PAGE_TO_PP(new_pg);
4287 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
4288 		/* This is an unmanaged pv-tracked page */
4289 		npte |= PG_PVLIST;
4290 	} else {
4291 		new_pp = NULL;
4292 	}
4293 
4294 	/*
4295 	 * Try to get pves now if we might need them.
4296 	 * Keep going even if we fail, since we will not actually need them
4297 	 * if we are just changing the permissions on an existing mapping,
4298 	 * but we won't know if that's the case until later.
4299 	 */
4300 
4301 	bool needpves = pmap_pp_needs_pve(new_pp);
4302 	if (needpves) {
4303 		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4304 		new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
4305 	} else {
4306 		new_pve = NULL;
4307 		new_sparepve = NULL;
4308 	}
4309 
4310 	kpreempt_disable();
4311 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4312 	if (pmap == pmap_kernel()) {
4313 		ptp = NULL;
4314 	} else {
4315 		ptp = pmap_get_ptp(pmap, va, pdes, flags);
4316 		if (ptp == NULL) {
4317 			pmap_unmap_ptes(pmap, pmap2);
4318 			if (flags & PMAP_CANFAIL) {
4319 				error = ENOMEM;
4320 				goto out;
4321 			}
4322 			panic("%s: get ptp failed", __func__);
4323 		}
4324 	}
4325 
4326 	/*
4327 	 * Check if there is an existing mapping.  If we are now sure that
4328 	 * we need pves and we failed to allocate them earlier, handle that.
4329 	 * Caching the value of oldpa here is safe because only the mod/ref bits
4330 	 * can change while the pmap is locked.
4331 	 */
4332 
4333 	ptep = &ptes[pl1_i(va)];
4334 	opte = *ptep;
4335 	bool have_oldpa = pmap_valid_entry(opte);
4336 	paddr_t oldpa = pmap_pte2pa(opte);
4337 
4338 	if (needpves && (!have_oldpa || oldpa != pa) &&
4339 	    (new_pve == NULL || new_sparepve == NULL)) {
4340 		pmap_unmap_ptes(pmap, pmap2);
4341 		if (flags & PMAP_CANFAIL) {
4342 			error = ENOMEM;
4343 			goto out;
4344 		}
4345 		panic("%s: pve allocation failed", __func__);
4346 	}
4347 
4348 	/*
4349 	 * update the pte.
4350 	 */
4351 
4352 	do {
4353 		opte = *ptep;
4354 
4355 		/*
4356 		 * if the same page, inherit PG_U and PG_M.
4357 		 */
4358 		if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4359 			npte |= opte & (PG_U | PG_M);
4360 		}
4361 #if defined(XEN)
4362 		if (domid != DOMID_SELF) {
4363 			/* pmap_pte_cas with error handling */
4364 			int s = splvm();
4365 			if (opte != *ptep) {
4366 				splx(s);
4367 				continue;
4368 			}
4369 			error = xpq_update_foreign(
4370 			    vtomach((vaddr_t)ptep), npte, domid);
4371 			splx(s);
4372 			if (error) {
4373 				if (ptp != NULL && ptp->wire_count <= 1) {
4374 					pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4375 				}
4376 				pmap_unmap_ptes(pmap, pmap2);
4377 				goto out;
4378 			}
4379 			break;
4380 		}
4381 #endif /* defined(XEN) */
4382 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
4383 
4384 	/*
4385 	 * update statistics and PTP's reference count.
4386 	 */
4387 
4388 	pmap_stats_update_bypte(pmap, npte, opte);
4389 	if (ptp != NULL && !have_oldpa) {
4390 		ptp->wire_count++;
4391 	}
4392 	KASSERT(ptp == NULL || ptp->wire_count > 1);
4393 
4394 	/*
4395 	 * if the same page, we can skip pv_entry handling.
4396 	 */
4397 
4398 	if (((opte ^ npte) & (PG_FRAME | PG_V)) == 0) {
4399 		KASSERT(((opte ^ npte) & PG_PVLIST) == 0);
4400 		goto same_pa;
4401 	}
4402 
4403 	/*
4404 	 * if the old page is managed or pv-tracked, remove its pv_entry.
4405 	 */
4406 
4407 	if ((~opte & (PG_V | PG_PVLIST)) == 0) {
4408 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
4409 			KASSERT(uvm_page_locked_p(old_pg));
4410 			old_pp = VM_PAGE_TO_PP(old_pg);
4411 		} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
4412 			panic("%s: PG_PVLIST with pv-untracked page"
4413 			    " va = %#"PRIxVADDR
4414 			    " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
4415 			    __func__, va, oldpa, atop(pa));
4416 		}
4417 
4418 		old_pve = pmap_remove_pv(old_pp, ptp, va);
4419 		old_pp->pp_attrs |= opte;
4420 	}
4421 
4422 	/*
4423 	 * if the new page is managed or pv-tracked, insert a pv_entry for it.
4424 	 */
4425 
4426 	if (new_pp) {
4427 		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
4428 	}
4429 
4430 same_pa:
4431 	pmap_unmap_ptes(pmap, pmap2);
4432 
4433 	/*
4434 	 * shootdown tlb if necessary.
4435 	 */
4436 
4437 	if ((~opte & (PG_V | PG_U)) == 0 &&
4438 	    ((opte ^ npte) & (PG_FRAME | PG_RW)) != 0) {
4439 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
4440 	}
4441 
4442 	error = 0;
4443 out:
4444 	kpreempt_enable();
4445 	if (old_pve != NULL) {
4446 		pool_cache_put(&pmap_pv_cache, old_pve);
4447 	}
4448 	if (new_pve != NULL) {
4449 		pool_cache_put(&pmap_pv_cache, new_pve);
4450 	}
4451 	if (new_sparepve != NULL) {
4452 		pool_cache_put(&pmap_pv_cache, new_sparepve);
4453 	}
4454 
4455 	return error;
4456 }
4457 
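/*
 * pmap_get_physpage: allocate a zeroed physical page for use as a PTP
 *	while growing the kernel pmap.
 *
 * => before uvm_page_init(), steals a page with uvm_page_physget() and
 *    zeroes it via the direct map, a Xen hypercall, or early_zero_pte
 * => afterwards, allocates a zeroed page from uvm
 */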
4458 static paddr_t
4459 pmap_get_physpage(void)
4460 {
4461 	struct vm_page *ptp;
4462 	struct pmap *kpm = pmap_kernel();
4463 	paddr_t pa;
4464 
4465 	if (!uvm.page_init_done) {
4466 		/*
4467 		 * We're growing the kernel pmap early (from
4468 		 * uvm_pageboot_alloc()). This case must be
4469 		 * handled a little differently.
4470 		 */
4471 
4472 		if (!uvm_page_physget(&pa))
4473 			panic("%s: out of memory", __func__);
4474 #if defined(__HAVE_DIRECT_MAP)
4475 		pagezero(PMAP_DIRECT_MAP(pa));
4476 #else
4477 #if defined(XEN)
4478 		if (XEN_VERSION_SUPPORTED(3, 4)) {
4479 			xen_pagezero(pa);
4480 			return pa;
4481 		}
4482 #endif
4483 		kpreempt_disable();
4484 		pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PG_V |
4485 		    PG_RW | pmap_pg_nx);
4486 		pmap_pte_flush();
4487 		pmap_update_pg((vaddr_t)early_zerop);
4488 		memset(early_zerop, 0, PAGE_SIZE);
4489 #if defined(DIAGNOSTIC) || defined(XEN)
4490 		pmap_pte_set(early_zero_pte, 0);
4491 		pmap_pte_flush();
4492 #endif /* defined(DIAGNOSTIC) || defined(XEN) */
4493 		kpreempt_enable();
4494 #endif /* defined(__HAVE_DIRECT_MAP) */
4495 	} else {
4496 		/* XXX */
4497 		ptp = uvm_pagealloc(NULL, 0, NULL,
4498 				    UVM_PGA_USERESERVE|UVM_PGA_ZERO);
4499 		if (ptp == NULL)
4500 			panic("%s: out of memory", __func__);
4501 		ptp->flags &= ~PG_BUSY;
4502 		ptp->wire_count = 1;
4503 		pa = VM_PAGE_TO_PHYS(ptp);
4504 	}
4505 	pmap_stats_update(kpm, 1, 0);
4506 
4507 	return pa;
4508 }
4509 
4510 /*
4511  * Expand the page tree with the specified number of PTPs, mapping virtual
4512  * addresses starting at kva. We populate all the levels but the last one
4513  * (L1). The nodes of the tree are created as RWX, but the pages covered
4514  * will be kentered in L1, with proper permissions.
4515  *
4516  * Used only by pmap_growkernel.
4517  */
4518 static void
4519 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
4520 {
4521 	unsigned long i;
4522 	paddr_t pa;
4523 	unsigned long index, endindex;
4524 	int level;
4525 	pd_entry_t *pdep;
4526 #ifdef XEN
4527 	int s = splvm(); /* protect xpq_* */
4528 #endif
4529 
4530 	for (level = PTP_LEVELS; level > 1; level--) {
4531 		if (level == PTP_LEVELS)
4532 			pdep = cpm->pm_pdir;
4533 		else
4534 			pdep = normal_pdes[level - 2];
4535 		index = pl_i_roundup(kva, level);
4536 		endindex = index + needed_ptps[level - 1] - 1;
4537 
4538 		for (i = index; i <= endindex; i++) {
4539 			pt_entry_t pte;
4540 
4541 			KASSERT(!pmap_valid_entry(pdep[i]));
4542 			pa = pmap_get_physpage();
4543 			pte = pmap_pa2pte(pa) | PG_V | PG_RW;
4544 			pmap_pte_set(&pdep[i], pte);
4545 
4546 #if defined(XEN) && (defined(PAE) || defined(__x86_64__))
4547 			if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
4548 				if (__predict_true(
4549 				    cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4550 					/* update per-cpu PMDs on all cpus */
4551 					xen_kpm_sync(pmap_kernel(), i);
4552 				} else {
4553 					/*
4554 					 * too early; update primary CPU
4555 					 * PMD only (without locks)
4556 					 */
4557 #ifdef PAE
4558 					pd_entry_t *cpu_pdep =
4559 					    &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
4560 #endif
4561 #ifdef __x86_64__
4562 					pd_entry_t *cpu_pdep =
4563 						&cpu_info_primary.ci_kpm_pdir[i];
4564 #endif
4565 					pmap_pte_set(cpu_pdep, pte);
4566 				}
4567 			}
4568 #endif /* XEN && (PAE || __x86_64__) */
4569 
4570 			KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
4571 			    pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
4572 			nkptp[level - 1]++;
4573 		}
4574 		pmap_pte_flush();
4575 	}
4576 #ifdef XEN
4577 	splx(s);
4578 #endif
4579 }
4580 
4581 /*
4582  * pmap_growkernel: increase usage of KVM space.
4583  *
4584  * => we allocate new PTPs for the kernel and install them in all
4585  *    the pmaps on the system.
4586  */
4587 
4588 vaddr_t
4589 pmap_growkernel(vaddr_t maxkvaddr)
4590 {
4591 	struct pmap *kpm = pmap_kernel();
4592 	struct pmap *cpm;
4593 #if !defined(XEN) || !defined(__x86_64__)
4594 	struct pmap *pm;
4595 	long old;
4596 #endif
4597 	int s, i;
4598 	long needed_kptp[PTP_LEVELS], target_nptp;
4599 	bool invalidate = false;
4600 
4601 	s = splvm();	/* to be safe */
4602 	mutex_enter(kpm->pm_lock);
4603 
4604 	if (maxkvaddr <= pmap_maxkvaddr) {
4605 		mutex_exit(kpm->pm_lock);
4606 		splx(s);
4607 		return pmap_maxkvaddr;
4608 	}
4609 
4610 	maxkvaddr = x86_round_pdr(maxkvaddr);
4611 #if !defined(XEN) || !defined(__x86_64__)
4612 	old = nkptp[PTP_LEVELS - 1];
4613 #endif
4614 
4615 	/* Initialize needed_kptp. */
4616 	for (i = PTP_LEVELS - 1; i >= 1; i--) {
4617 		target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
4618 		    pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
4619 
4620 		if (target_nptp > nkptpmax[i])
4621 			panic("out of KVA space");
4622 		KASSERT(target_nptp >= nkptp[i]);
4623 		needed_kptp[i] = target_nptp - nkptp[i];
4624 	}
4625 
4626 #if defined(XEN) && (defined(__x86_64__) || defined(PAE))
4627 	/* only pmap_kernel(), or the per-cpu map, has kernel entries */
4628 	cpm = kpm;
4629 #else
4630 	/* Get the current pmap */
4631 	if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
4632 		cpm = curcpu()->ci_pmap;
4633 	} else {
4634 		cpm = kpm;
4635 	}
4636 #endif
4637 
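	/* Allocate and enter the needed PTPs at every level above L1. */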
4638 	pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
4639 
4640 	/*
4641 	 * If the number of top level entries changed, update all pmaps.
4642 	 */
4643 	if (needed_kptp[PTP_LEVELS - 1] != 0) {
4644 #ifdef XEN
4645 #ifdef __x86_64__
4646 		/* nothing, kernel entries are never entered in user pmap */
4647 #else /* __x86_64__ */
4648 		int pdkidx;
4649 #ifndef PAE
4650 		/*
4651 		 * for PAE this is not needed, because pmap_alloc_level()
4652 		 * already did update the per-CPU tables
4653 		 * has already updated the per-CPU tables
4654 		if (cpm != kpm) {
4655 			for (pdkidx = PDIR_SLOT_KERN + old;
4656 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4657 			    pdkidx++) {
4658 				pmap_pte_set(&kpm->pm_pdir[pdkidx],
4659 				    cpm->pm_pdir[pdkidx]);
4660 			}
4661 			pmap_pte_flush();
4662 		}
4663 #endif /* !PAE */
4664 
4665 		mutex_enter(&pmaps_lock);
4666 		LIST_FOREACH(pm, &pmaps, pm_list) {
4667 			for (pdkidx = PDIR_SLOT_KERN + old;
4668 			    pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
4669 			    pdkidx++) {
4670 				pmap_pte_set(&pm->pm_pdir[pdkidx],
4671 				    kpm->pm_pdir[pdkidx]);
4672 			}
4673 			pmap_pte_flush();
4674 		}
4675 		mutex_exit(&pmaps_lock);
4676 #endif /* __x86_64__ */
4677 #else /* XEN */
4678 		size_t newpdes;
4679 		newpdes = nkptp[PTP_LEVELS - 1] - old;
4680 		if (cpm != kpm) {
4681 			memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
4682 			    &cpm->pm_pdir[PDIR_SLOT_KERN + old],
4683 			    newpdes * sizeof(pd_entry_t));
4684 		}
4685 
4686 		mutex_enter(&pmaps_lock);
4687 		LIST_FOREACH(pm, &pmaps, pm_list) {
4688 			memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
4689 			    &kpm->pm_pdir[PDIR_SLOT_KERN + old],
4690 			    newpdes * sizeof (pd_entry_t));
4691 		}
4692 		mutex_exit(&pmaps_lock);
4693 #endif
4694 		invalidate = true;
4695 	}
4696 	pmap_maxkvaddr = maxkvaddr;
4697 	mutex_exit(kpm->pm_lock);
4698 	splx(s);
4699 
4700 	if (invalidate && pmap_initialized) {
4701 		/* Invalidate the PDP cache. */
4702 		pool_cache_invalidate(&pmap_pdp_cache);
4703 	}
4704 
4705 	return maxkvaddr;
4706 }
4707 
4708 #ifdef DEBUG
4709 void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
4710 
4711 /*
4712  * pmap_dump: dump all the mappings from a pmap
4713  *
4714  * => caller should not be holding any pmap locks
4715  */
4716 
4717 void
4718 pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4719 {
4720 	pt_entry_t *ptes, *pte;
4721 	pd_entry_t * const *pdes;
4722 	struct pmap *pmap2;
4723 	vaddr_t blkendva;
4724 
4725 	/*
4726 	 * if end is out of range truncate.
4727 	 * if the end is out of range, truncate it.
4728 	 * if end <= start, dump up to VM_MAXUSER_ADDRESS.
4729 
4730 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
4731 		eva = VM_MAXUSER_ADDRESS;
4732 
4733 	/*
4734 	 * we lock in the pmap => pv_head direction
4735 	 */
4736 
4737 	kpreempt_disable();
4738 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
4739 
4740 	/*
4741 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
4742 	 * dumping a range of pages: we dump in PTP-sized blocks (one L2 entry)
4743 
4744 	for (/* null */ ; sva < eva ; sva = blkendva) {
4745 
4746 		/* determine range of block */
4747 		blkendva = x86_round_pdr(sva+1);
4748 		if (blkendva > eva)
4749 			blkendva = eva;
4750 
4751 		/* valid block? */
4752 		if (!pmap_pdes_valid(sva, pdes, NULL))
4753 			continue;
4754 
4755 		pte = &ptes[pl1_i(sva)];
4756 		for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
4757 			if (!pmap_valid_entry(*pte))
4758 				continue;
4759 			printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
4760 			    " (pte=%#" PRIxPADDR ")\n",
4761 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
4762 		}
4763 	}
4764 	pmap_unmap_ptes(pmap, pmap2);
4765 	kpreempt_enable();
4766 }
4767 #endif
4768 
4769 /*
4770  * pmap_update: process deferred invalidations and frees.
4771  */
4772 
4773 void
4774 pmap_update(struct pmap *pmap)
4775 {
4776 	struct vm_page *empty_ptps;
4777 	lwp_t *l = curlwp;
4778 
4779 	/*
4780 	 * If we have torn down this pmap, invalidate non-global TLB
4781 	 * entries on any processors using it.
4782 	 */
4783 	kpreempt_disable();
4784 	if (__predict_false(l->l_md.md_gc_pmap == pmap)) {
4785 		l->l_md.md_gc_pmap = NULL;
4786 		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
4787 	}
4788 
4789 	/*
4790 	 * Initiate any pending TLB shootdowns.  Wait for them to
4791 	 * complete before returning control to the caller.
4792 	 */
4793 	pmap_tlb_shootnow();
4794 	kpreempt_enable();
4795 
4796 	/*
4797 	 * Now that shootdowns are complete, process deferred frees,
4798 	 * but not from interrupt context.
4799 	 */
4800 	if (l->l_md.md_gc_ptp != NULL) {
4801 		KASSERT((l->l_pflag & LP_INTR) == 0);
4802 		if (cpu_intr_p()) {
4803 			return;
4804 		}
4805 		empty_ptps = l->l_md.md_gc_ptp;
4806 		l->l_md.md_gc_ptp = NULL;
4807 		pmap_free_ptps(empty_ptps);
4808 	}
4809 }
4810 
4811 #if PTP_LEVELS > 4
4812 #error "Unsupported number of page table mappings"
4813 #endif
4814 
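/*
 * pmap_init_tmp_pgtbl: set up a throw-away page table hierarchy in the
 *	fixed low-memory pages listed in x86_tmp_pml_paddr[], identity
 *	mapping the page 'pg' on top of a copy of the kernel's top-level
 *	directory.
 *
 * => returns the physical address of the resulting top-level table (for
 *    PAE, the L3 entries placed at the end of the L2 page)
 */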
4815 paddr_t
4816 pmap_init_tmp_pgtbl(paddr_t pg)
4817 {
4818 	static bool maps_loaded;
4819 	static const paddr_t x86_tmp_pml_paddr[] = {
4820 	    4 * PAGE_SIZE,	/* L1 */
4821 	    5 * PAGE_SIZE,	/* L2 */
4822 	    6 * PAGE_SIZE,	/* L3 */
4823 	    7 * PAGE_SIZE	/* L4 */
4824 	};
4825 	static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
4826 
4827 	pd_entry_t *tmp_pml, *kernel_pml;
4828 
4829 	int level;
4830 
4831 	if (!maps_loaded) {
4832 		for (level = 0; level < PTP_LEVELS; ++level) {
4833 			x86_tmp_pml_vaddr[level] =
4834 			    uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
4835 			    UVM_KMF_VAONLY);
4836 
4837 			if (x86_tmp_pml_vaddr[level] == 0)
4838 				panic("mapping of real mode PML failed\n");
4839 			pmap_kenter_pa(x86_tmp_pml_vaddr[level],
4840 			    x86_tmp_pml_paddr[level],
4841 			    VM_PROT_READ | VM_PROT_WRITE, 0);
4842 		}
4843 		pmap_update(pmap_kernel());
4844 		maps_loaded = true;
4845 	}
4846 
4847 	/* Zero all levels below the top one */
4848 	for (level = 0; level < PTP_LEVELS - 1; ++level) {
4849 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4850 		memset(tmp_pml, 0, PAGE_SIZE);
4851 	}
4852 
4853 	/* Copy PML4 */
4854 	kernel_pml = pmap_kernel()->pm_pdir;
4855 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
4856 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
4857 
4858 #ifdef PAE
4859 	/*
4860 	 * Use the last 4 entries of the L2 page as L3 PD entries. These
4861 	 * last entries are unlikely to be used for temporary mappings.
4862 	 * 508: maps 0->1GB (userland)
4863 	 * 509: unused
4864 	 * 510: unused
4865 	 * 511: maps 3->4GB (kernel)
4866 	 */
4867 	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
4868 	tmp_pml[509] = 0;
4869 	tmp_pml[510] = 0;
4870 	tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PG_V;
4871 #endif
4872 
4873 	for (level = PTP_LEVELS - 1; level > 0; --level) {
4874 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
4875 
4876 		tmp_pml[pl_i(pg, level + 1)] =
4877 		    (x86_tmp_pml_paddr[level - 1] & PG_FRAME) | PG_RW | PG_V;
4878 	}
4879 
4880 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
4881 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
4882 
4883 #ifdef PAE
4884 	/* Return the PA of the L3 page (entry 508 of the L2 page) */
4885 	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
4886 #endif
4887 
4888 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
4889 }
4890 
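/*
 * x86_mmap_flags: translate the machine-dependent mmap flags encoded in
 *	the device page number into pmap flags.
 *
 * => currently only X86_MMAP_FLAG_PREFETCH -> PMAP_WRITE_COMBINE
 */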
4891 u_int
4892 x86_mmap_flags(paddr_t mdpgno)
4893 {
4894 	u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
4895 	u_int pflag = 0;
4896 
4897 	if (nflag & X86_MMAP_FLAG_PREFETCH)
4898 		pflag |= PMAP_WRITE_COMBINE;
4899 
4900 	return pflag;
4901 }
4902